In [None]:
import re
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt
import matplotlib.ticker
import numpy as np
import pandas as pd
# Apply the 'ggplot' style sheet to every figure drawn after this point.
plt.style.use('ggplot')
# 2016 survey responses, read from a path relative to the notebook's working directory.
# Later cells read the `tech_do`, `education` and `salary_midpoint` columns of this frame.
survey2016 = pd.read_csv('Survey Data/2016 Stack Overflow Survey Responses.csv')

# Does Stack Overflow lower wages?

## Outline

* Exponential growth of Stack Overflow
* Initial look at wages and information availability
* Survey's own education data
* Second look at wages, controlling for geography
* A simple regression

## To be continued...
* Country map of Stack Overflow growth
* Can we specify a causal regression using time series data?
* Other fields in the survey and the Stack Overflow data dump
* Deeper exploration of Stack Overflow posts: what does the number of posts indicate about the quality of information?
* Deeper exploration of Stack Overflow posts: are there other indicators of quality?
* Scraping developer profiles on Stack Overflow
* Data on MOOCs


### Graph 1, growth of Cross Validated

How quickly have the number of users and the volume of posts on Cross Validated grown over time? Is that growth an important economic force?

In [None]:
## Create a timeseries graph of users and posts over time on Cross Validated
dusers = pd.read_csv("stats.stackexchange.com/Users.csv")
dposts = pd.read_csv("stats.stackexchange.com/Posts.csv")

# Parse the creation timestamps and keep only the calendar day for daily aggregation.
timestamp_format = '%Y-%m-%dT%H:%M:%S.%f'
dposts['CreationDate'] = pd.to_datetime(dposts['CreationDate'], format=timestamp_format)
dposts['Date'] = dposts['CreationDate'].dt.date

dusers['CreationDate'] = pd.to_datetime(dusers['CreationDate'], format=timestamp_format)
dusers['Date'] = dusers['CreationDate'].dt.date

usercounts = dusers.groupby('Date').size()  # accounts created per day
postcounts = dposts.groupby('Date').size()  # posts created per day

# Build the running user total over every day that has a sign-up OR a post, counting
# zero sign-ups where none happened. Aligning the raw cumulative sum straight onto the
# post dates would leave NaN on days with posts but no sign-ups, and each NaN blanks
# out the rolling mean of every window that contains it.
all_days = usercounts.index.union(postcounts.index)
cumulative_users = usercounts.reindex(all_days, fill_value=0).sort_index().cumsum()

posts_users = pd.DataFrame(data={"users": cumulative_users,
                                 "posts": postcounts},
                           index=postcounts.index)

# The window is 30 rows, i.e. 30 days on which at least one post was made.
# NOTE(review): this equals 30 calendar days only if every day has a post — confirm.
posts_users['users30dayMA'] = posts_users['users'].rolling(window=30).mean()
posts_users['posts30dayMA'] = posts_users['posts'].rolling(window=30).mean()


In [None]:
def update_ylabels(ax):
    """Show the y tick labels of ``ax`` as whole numbers with thousands separators.

    A tick formatter is installed rather than overwriting the label text with
    ``set_yticklabels``: fixed label strings are not updated when matplotlib
    recomputes the tick positions (for example during ``tight_layout``), so they
    can end up attached to the wrong values.

    Parameters
    ----------
    ax : matplotlib Axes whose y axis should be reformatted.
    """
    ax.yaxis.set_major_formatter(
        matplotlib.ticker.FuncFormatter(lambda value, pos: format(value, ',.0f')))


# Two series with very different scales share one date axis: the user curve on the
# left axis (blue) and the posts-per-day curve on a twinned right axis (red).
fig, ax1 = plt.subplots()
ax1.plot(posts_users.index, posts_users['users30dayMA'], color='b', label="Number of Users")
ax1.set_xlabel('Date')

# Make the y-axis label, ticks and tick labels match the line color.
ax1.set_ylabel('Number of Users', color='b')
ax1.tick_params('y', colors='b')

ax2 = ax1.twinx()
ax2.plot(posts_users.index, posts_users['posts30dayMA'], color='r', label="Posts per Day (30 day MA)")
ax2.set_ylabel('Posts per day', color='r')
ax2.tick_params('y', colors='r')

ax1.set(title="Growth of Cross Validated Over Time")
update_ylabels(ax1)
fig.tight_layout()

## Graph two, number of posts vs. salary

Does making things easier to learn decrease salary?

In [None]:
def _to_numeric_where_possible(column):
    """Return ``column`` converted to a numeric dtype, or unchanged if conversion fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Only tags with more than this many posts are compared against salaries.
MIN_TAG_POSTS = 10000

# Every child element of the XML root becomes one row, built from its attributes.
tags_root = ET.parse("stackoverflow/Tags.xml").getroot()
tags = pd.DataFrame([element.attrib for element in tags_root])
# XML attribute values are strings; convert the columns that are fully numeric
# (the filter below needs a numeric `Count`) and leave the rest untouched.
tags = tags.apply(_to_numeric_where_possible)
unique_tags = tags.TagName.unique()

survey2016['tags'] = survey2016.tech_do.str.lower()

# .copy() so the new column is written to an independent frame, not a slice of `tags`.
plotting_tags = tags.loc[tags.Count > MIN_TAG_POSTS, :].copy().reset_index()

# Split each answer into individual technologies and match tags on the whole token.
# A substring search would credit the tags "c" and "java" with the salary of every
# respondent who listed "javascript".
# NOTE(review): assumes `tech_do` separates technologies with ';' — confirm against the survey file.
# TODO: survey names still differ from some tag names (spelling/punctuation); clean them before matching.
tech_tokens = survey2016['tags'].str.split(';', expand=True).stack().str.strip()
respondent_rows = tech_tokens.index.get_level_values(0)
tech_salary = pd.DataFrame({
    'tag': tech_tokens.values,
    'salary_midpoint': survey2016.loc[respondent_rows, 'salary_midpoint'].values,
})
avg_salary_by_tag = tech_salary.groupby('tag')['salary_midpoint'].mean()

# Tags that no respondent listed get NaN and are dropped before plotting.
plotting_tags['avg_salary'] = plotting_tags['TagName'].map(avg_salary_by_tag)


In [None]:

def _label_points(axes, frame):
    """Annotate ``axes`` with each row's TagName at its (Count, avg_salary) position."""
    for row_label in frame.index:
        axes.annotate(frame.loc[row_label, 'TagName'],
                      frame.loc[row_label, ['Count', 'avg_salary']])


# Only tags that received an average salary can be placed on the chart.
plt_tags = plotting_tags.loc[plotting_tags.avg_salary.notnull()]

# First figure: scatter drawn through the pandas plotting wrapper.
fig, ax = plt.subplots()
plt_tags.plot(x='Count', y='avg_salary', kind='scatter', ax=ax)
_label_points(ax, plt_tags)
plt.show()

# Second figure: the same data drawn with matplotlib directly, with readable axis
# labels and a title. (The styling is basic and could be polished further.)
fig, ax = plt.subplots()
ax.scatter(plt_tags.Count, plt_tags.avg_salary)
ax.set_xlabel('Number of Posts')
ax.set_ylabel('Average Salary in 2016')
ax.set_title("Stack Overflow Posts And Salary")
_label_points(ax, plt_tags)


## What the surveys already contain on education

The 2015 and 2016 surveys collect data on developer education, including whether respondents are self-taught.

In [None]:
# A single answer can name several education types, separated by '; '.
# Split each answer into columns, then stack the columns into one long Series so
# every entry holds one education type, indexed by (survey row, position in answer).
education_columns = survey2016['education'].str.split('; ', expand=True)
education = education_columns.stack()

In [None]:
# Horizontal bar chart of how many times each education type was reported.
fig, ax = plt.subplots(figsize=(8, 6))
education_counts = education.value_counts()
education_counts.plot(kind='barh', rot=0, ax=ax)
ax.set_xlabel("Number of individuals")
fig.tight_layout()

In [None]:
# Heatmap of how often two education categories are reported by the same respondent.
# Cell (row, column) = share of respondents reporting `row` who also report `column`.
degrees = ["I'm self-taught",
           'On-the-job training',
           'Online class (e.g. Coursera, Codecademy, Khan Academy, etc.)',
           'Full-time, intensive program (e.g. "boot-camp")',
           'Part-time program (e.g. night school)',
           'Some college coursework in Computer Science (or related field)',
           'B.A. in Computer Science (or related field)',
           'B.S. in Computer Science (or related field)',
           'Masters Degree in Computer Science (or related field)',
           'PhD in Computer Science (or related field)']

education_grid = pd.DataFrame(index=degrees, columns=degrees)

# IndexSlice selects every (respondent, position) entry for a set of respondents.
index_slice = pd.IndexSlice
# NOTE(review): education values not listed in `degrees` look like they would be
# appended to the grid as extra rows/columns by the .loc assignment — confirm.
for row_degree in education.unique():
    holders = education[education == row_degree].index.get_level_values(0)
    holder_count = len(holders)
    answers_of_holders = education.loc[index_slice[holders, :]]
    for col_degree in answers_of_holders.unique():
        # Every cell in a row shares one denominator: respondents holding `row_degree`.
        reported_together = (answers_of_holders == col_degree).sum()
        education_grid.loc[row_degree, col_degree] = reported_together / holder_count

# Pairs that never occur together were never assigned and stay NaN.
education_grid = education_grid.astype(float)

plt.close()
fig, ax = plt.subplots(figsize=(10, 10))
plt.pcolor(education_grid)
# Ticks sit at cell centres (0.5, 1.5, ...) so each label lines up with its row/column.
row_centres = np.arange(0.5, len(education_grid.index), 1)
col_centres = np.arange(0.5, len(education_grid.columns), 1)
plt.yticks(row_centres, education_grid.index)
plt.xticks(col_centres, education_grid.columns, rotation=45, ha='right')
ax.set_xlabel("Rows define denominator")
# Leave room for the long category names on both axes.
fig.subplots_adjust(bottom=.5, left=0.5)
plt.colorbar()

In [None]:
# education vs incomes
# Tag every respondent with the most advanced entry of `degrees` found anywhere in
# their education answer; respondents matching none keep the placeholder "None".
survey2016['highest_degree'] = "None"

# `degrees` runs from "I'm self-taught" up to the PhD, so a later match overwrites an
# earlier one. A literal substring search is used because an answer can list several
# entries: str.match only tests the start of the string and would see just the first.
for deg in degrees:
    has_degree = survey2016.education.str.contains(deg, regex=False, na=False)
    survey2016.loc[has_degree, 'highest_degree'] = deg

# reindex keeps the bars in `degrees` order and yields NaN (an empty bar) rather than
# a KeyError when no respondent falls into a category.
salary_by_degree = (survey2016.groupby('highest_degree')['salary_midpoint']
                    .mean()
                    .reindex(degrees))

plt.close()
fig, ax = plt.subplots(figsize=(8, 8))
# Leave room for the long, rotated category labels.
fig.subplots_adjust(bottom=.4, left=0.4)
salary_by_degree.plot(kind='bar', ax=ax)
plt.xticks(np.arange(0, len(degrees), 1), degrees, rotation=30, ha='right')
ax.set(title="Salary by Highest Education")
ax.set_xlabel("Highest Educational Achievement")
ax.set_ylabel("Income")
# Tick values arrive as floats; format them as whole amounts, e.g. $50,000.
ax.get_yaxis().set_major_formatter(
    matplotlib.ticker.FuncFormatter(lambda x, p: '${:,.0f}'.format(x)))


In [None]:
## Geography: distribution of visitors to the main Stack Overflow website

In [None]:
# FIXME: unfinished placeholder. pd.concat() is called without its required `objs`
# argument, so this cell raises a TypeError as written. The objects to combine have
# not been specified yet.
salary = pd.concat()

## Controlling for geography, developer salaries compared to their peers

## A simple regression