In [105]:
#getting necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
%matplotlib inline




In [114]:
#stackoverflow 2019 data
survey_2019 = pd.read_csv('./survey_results_public_2019.csv')


Index(['Respondent', 'MainBranch', 'Hobbyist', 'OpenSourcer', 'OpenSource',
       'Employment', 'Country', 'Student', 'EdLevel', 'UndergradMajor',
       'EduOther', 'OrgSize', 'DevType', 'YearsCode', 'Age1stCode',
       'YearsCodePro', 'CareerSat', 'JobSat', 'MgrIdiot', 'MgrMoney',
       'MgrWant', 'JobSeek', 'LastHireDate', 'LastInt', 'FizzBuzz',
       'JobFactors', 'ResumeUpdate', 'CurrencySymbol', 'CurrencyDesc',
       'CompTotal', 'CompFreq', 'ConvertedComp', 'WorkWeekHrs', 'WorkPlan',
       'WorkChallenge', 'WorkRemote', 'WorkLoc', 'ImpSyn', 'CodeRev',
       'CodeRevHrs', 'UnitTests', 'PurchaseHow', 'PurchaseWhat',
       'LanguageWorkedWith', 'LanguageDesireNextYear', 'DatabaseWorkedWith',
       'DatabaseDesireNextYear', 'PlatformWorkedWith',
       'PlatformDesireNextYear', 'WebFrameWorkedWith',
       'WebFrameDesireNextYear', 'MiscTechWorkedWith',
       'MiscTechDesireNextYear', 'DevEnviron', 'OpSys', 'Containers',
       'BlockchainOrg', 'BlockchainIs', 'BetterLife'

In [116]:
survey_2019['CareerSat'].unique()

array([nan, 'Slightly satisfied', 'Very satisfied', 'Very dissatisfied',
       'Slightly dissatisfied', 'Neither satisfied nor dissatisfied'],
      dtype=object)

In [109]:
#missing check for columns, decision was to let all columns stay as is because they all have at most 24% missing
missing={}
for x in survey_2019:
    missing[x] = survey_2019[x].isna().sum()/len(survey_2019.index)
sorted(missing.items(), key=lambda x: x[1], reverse=True)

[('BlockchainOrg', 0.4579953421914202),
 ('CodeRevHrs', 0.4398253884319836),
 ('ConvertedComp', 0.3719496416637602),
 ('CompTotal', 0.37057705072961084),
 ('MiscTechWorkedWith', 0.3296130868670049),
 ('BlockchainIs', 0.3230989053024763),
 ('PurchaseHow', 0.3124894524262232),
 ('MgrMoney', 0.3119381659034911),
 ('MgrIdiot', 0.31191566441276736),
 ('MgrWant', 0.3110943600013501),
 ('PurchaseWhat', 0.3021275159479315),
 ('UnitTests', 0.2949382896616901),
 ('WebFrameDesireNextYear', 0.2918330839418111),
 ('CompFreq', 0.28818784244456197),
 ('WorkWeekHrs', 0.27429317192263986),
 ('MiscTechDesireNextYear', 0.2742031659597448),
 ('WebFrameWorkedWith', 0.268454035079824),
 ('LastInt', 0.24445619522293352),
 ('WorkChallenge', 0.2333629602961196),
 ('SOHowMuchTime', 0.230696533645354),
 ('WorkPlan', 0.2246661341313862),
 ('DatabaseDesireNextYear', 0.2220447104620681),
 ('SONewContent', 0.21739815262761158),
 ('WorkLoc', 0.21182903367348085),
 ('WorkRemote', 0.2092526129856103),
 ('CodeRev', 0.20

In [118]:
#above 25% empty columns are eliminated
for k,v in missing.items():
    if v > 0.25:
        print(k)
        survey_2019 = survey_2019.drop(k, axis=1)

MgrIdiot
MgrMoney
MgrWant
CompTotal
CompFreq
ConvertedComp
WorkWeekHrs
CodeRevHrs
UnitTests
PurchaseHow
PurchaseWhat
WebFrameWorkedWith
WebFrameDesireNextYear
MiscTechWorkedWith
MiscTechDesireNextYear
BlockchainOrg
BlockchainIs


In [119]:
#missing analysis
#missing rows, users that did not participate enough will be removed
survey_2019['missing_row_rate'] = len(survey_2019.columns) - (survey_2019.apply(lambda x: x.count(), axis=1))/len(survey_2019.columns)
survey_2019.sort_values(by=['missing_row_rate'], ascending = 0)


Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase,missing_row_rate
88389,12779,,No,Never,,,,,,,...,,,,,,,,,,67.955882
88595,42460,,No,Never,,,,,,,...,,,,,,,,,,67.955882
88790,73472,,No,Never,,,,,,,...,,,,,,,,,,67.955882
88789,73408,,No,Never,,,,,,,...,,,,,,,,,,67.955882
88666,52784,,No,Never,,,,,,,...,,,,,,,,,,67.955882
88502,29513,,No,Never,,,,,,,...,,,,,,,,,,67.955882
88847,83648,,No,Never,,,,,,,...,,,,,,,,,,67.955882
88610,44453,,No,Never,,,,,,,...,,,,,,,,,,67.955882
88403,14848,,No,Never,,,,,,,...,,,,,,,,,,67.955882
88458,22994,,No,Never,,,,,,,...,,,,,,,,,,67.955882


In [89]:
#decision, continue with users answering at least 50%
df.groupby(['missing_row_rate']).size()
#df_user_elim = df.loc[df['missing_row_rate'] <= 0.5]
#df_user_elim = df_user_elim.drop('missing_row_rate', axis=1)
print(df_user_elim.shape)

(166449, 18)


[('DatabaseDesireNextYear', 0.239598916184537),
 ('CurrencySymbol', 0.2302206681926596),
 ('PlatformDesireNextYear', 0.16695804720965582),
 ('DatabaseWorkedWith', 0.14608678934688704),
 ('UndergradMajor', 0.14549201256841435),
 ('Age', 0.13676261197123443),
 ('Dependents', 0.12581631610883814),
 ('PlatformWorkedWith', 0.1198445169391225),
 ('Gender', 0.10086573064422136),
 ('LanguageDesireNextYear', 0.05836021844528955),
 ('DevType', 0.0502976887815487),
 ('Student', 0.016058973018762503),
 ('Employment', 0.013998281755973301),
 ('OpenSource', 0.011384868638441806),
 ('LanguageWorkedWith', 0.008597227979741542),
 ('Respondent', 0.0),
 ('Country', 0.0),
 ('Year', 0.0)]

First question, does the population in stackoverflow change drastically? This question will be tried to answered through country distribution