In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
def train_model(X, y):
    lm_model = LinearRegression(normalize=True) # Instantiate
    lm_model.fit(X, y) #Fit
    return lm_model

In [3]:
def import_data(data_path, drop_col=[]):
    df = pd.read_csv(data_path)
    print("Imported dataframe shape: ", df.shape)
    
    # drop specified columns
    df = df.drop(drop_col, axis=1)
    
    # drop column with all nan values
    df = df.dropna(how='all', axis=1)
    print(f"Df now has shape of {df.shape}")
    print(f"Importing df with shape of {df.shape} is complete!!!")
    return df

In [4]:
def test_with_numeric_columns(df, label_to_predict):
    #Only use quant variables and drop any rows with missing values
    df = df.select_dtypes(include=['int', 'float'])
    print("Dataframe shape with only numeric value columns: ", df.shape)

    df = df.dropna(subset=[label_to_predict], axis=0)
    print(f"Dataframe shape when nan value is dropped in label({label_to_predict}): ", df.shape)

    # replace nan value with mean of column
    df = fill_mean(df)

    #Split into explanatory and response variables
    X = df.drop(label_to_predict, axis=1)
    y = df[label_to_predict]
    
    #Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42) 
    
    model = train_model(X_train, y_train)

    #Predict and score the model
    y_test_preds = model.predict(X_test) 
    print("The r-squared score for the model using only quantitative variables was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test)))

In [5]:
def get_numeric_type_columns(df):
    df_num = df.select_dtypes(include=['int', 'float'])
    df_num_cols = df_num.columns
    print("Numeric columns from this df: ", df_num.shape[1])
    return df_num_cols

In [6]:
def fill_mean(series):
    # Mean function
    mean = series.mean()
    # Fill the mean
    series = series.fillna(mean, axis=0)
    return series

In [7]:
def get_object_type_columns(df):
    df_obj = df.select_dtypes(include=['object'])
    df_obj_cols = df_obj.columns
    print("Object columns from this df: ", df_obj.shape[1])
    return df_obj_cols

In [8]:
def get_multiple_value_columns(df):
    contain_semicolons = df.apply(lambda col: 
                                  col.str.contains(';').any()
                                  if col.dtypes==object
                                  else False)
    df_contain_semicolons = df.loc[:, contain_semicolons]
    multiple_value_columns = df_contain_semicolons.columns
    return multiple_value_columns

In [54]:
def get_dummy_simple(series, dummy_na=False):
    df = pd.DataFrame()
    try:
        df = pd.get_dummies(series, prefix=series.name, prefix_sep='_', drop_first=False, dummy_na=dummy_na)
    except:
        print(f"Error with get_dummy_simple(), column: {series.name}")
    return df

In [10]:
def categorize_feature(series):
    series = series.apply(lambda value: list(value.split(';'))
                       if type(value)!=float
                       else float('nan'))
    return pd.get_dummies(series.apply(pd.Series).stack(dropna=False), prefix=series.name, prefix_sep='_').sum(level=0)    

In [11]:
def get_numeric_dict(df):
    df_num_cols = get_numeric_type_columns(df)
    numeric_df_dict = {}
    numeric_df_names_list = []
    for col in df_num_cols:
        df_filled = fill_mean(df[col])
        numeric_df_dict[col] = df_filled
        numeric_df_names_list.append(col)
    return numeric_df_dict, numeric_df_names_list

In [12]:
def get_dummy_dict(df, multiple_value_columns=[]):
    df_obj_cols = get_object_type_columns(df)
    categorized_df_dict = {}
    categorized_df_names_list = []
    for col in df_obj_cols:
        if col in multiple_value_columns:
            df_dummy = categorize_feature(df[col]) # my method
        else:
            df_dummy = get_dummy_simple(df[col]) # simple method
        categorized_df_dict[col] = df_dummy
        categorized_df_names_list.append(col)
    return categorized_df_dict, categorized_df_names_list

## Use simple get dummies function

In [174]:
# import 2017 dataset
df = import_data('./survey_results_public_2017.csv', ['Respondent', 'ExpectedSalary'])
label_to_predict = 'Salary'
test_with_numeric_columns(df, label_to_predict)

Imported dataframe shape:  (51392, 154)
Df now has shape of (51392, 152)
Importing df with shape of (51392, 152) is complete!!!
Dataframe shape with only numeric value columns:  (51392, 5)
Dataframe shape when nan value is dropped in label(Salary):  (12891, 5)
The r-squared score for the model using only quantitative variables was 0.04072431792894726 on 3868 values.


In [175]:
df = df.dropna(subset=[label_to_predict], axis=0)
Xs = df.drop(label_to_predict, axis=1)
Xs = Xs.dropna(how='all', axis=1)
y = df[label_to_predict]
print(f"df shape: {df.shape}")
print(f"Xs shape: {Xs.shape}")
print(f"y shape: {y.shape}")

df shape: (12891, 152)
Xs shape: (12891, 145)
y shape: (12891,)


In [178]:
numeric_df_dict, numeric_df_names_list = get_numeric_dict(Xs)
multiple_value_columns = get_multiple_value_columns(Xs) # if use my method to get 
categorized_df_dict, categorized_df_names_list = get_dummy_dict(Xs)

Numeric columns from this df:  4
Object columns from this df:  141


In [179]:
assert Xs.shape[1] == len(numeric_df_dict)+len(categorized_df_dict)
assert Xs.shape[1] == len(categorized_df_names_list)+len(numeric_df_names_list)

In [180]:
def check_result(col, df, y):
    print(col)
    print(f"shape: {df.shape}")
    
    X = df      
    #Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42) 
    
    model = train_model(X_train, y_train)

    #Predict and score the model
    y_test_preds = model.predict(X_test) 
    y_train_preds = model.predict(X_train)
    test_score = r2_score(y_test, y_test_preds)
    train_score = r2_score(y_train, y_train_preds)
    print(f"test_score: {test_score}, train_score: {train_score}")
    print('-'*40)
    
    return test_score, train_score

In [181]:
# rank every columns prediction r2score
rank_test = []
rank_train = []

for col, df_num in numeric_df_dict.items():
    test_score, train_score = check_result(col, df_num.values.reshape(-1,1), y)
    
    rank_test.append([test_score, col])
    rank_train.append([train_score, col])

for col, df_obj in categorized_df_dict.items():   
    test_score, train_score = check_result(col, df_obj, y)
    
    rank_test.append([test_score, col])
    rank_train.append([train_score, col])

CareerSatisfaction
shape: (12891, 1)
test_score: 0.02440744353670876, train_score: 0.02511874806277581
----------------------------------------
JobSatisfaction
shape: (12891, 1)
test_score: 0.015714880014184307, train_score: 0.013990287390995837
----------------------------------------
HoursPerWeek
shape: (12891, 1)
test_score: 0.008116429443099316, train_score: 0.005353632681996512
----------------------------------------
StackOverflowSatisfaction
shape: (12891, 1)
test_score: 0.005130504975913497, train_score: 0.005559666137063934
----------------------------------------
Professional
shape: (12891, 1)
test_score: -1.858952728239771e-05, train_score: 0.0
----------------------------------------
ProgramHobby
shape: (12891, 4)
test_score: 0.0005247655454900801, train_score: 0.0057765601272673495
----------------------------------------
Country
shape: (12891, 136)
test_score: -7.709702341110363e+25, train_score: 0.570935821010838
----------------------------------------
University
shape:

test_score: -3.038669656748752e+29, train_score: -0.45432784884518873
----------------------------------------
ClickyKeys
shape: (12891, 2)
test_score: 0.0009173493809646605, train_score: 0.0009201719340772296
----------------------------------------
JobProfile
shape: (12891, 280)
test_score: -1.756481845222238e+28, train_score: 0.07184415657926957
----------------------------------------
ResumePrompted
shape: (12891, 8)
test_score: 0.007348218600445988, train_score: 0.009252590135821137
----------------------------------------
LearnedHiring
shape: (12891, 8)
test_score: 0.010394993173299616, train_score: 0.013904729404609073
----------------------------------------
ImportantHiringAlgorithms
shape: (12891, 5)
test_score: 0.006151073898474579, train_score: 0.00920158875427457
----------------------------------------
ImportantHiringTechExp
shape: (12891, 5)
test_score: 0.016662468584125234, train_score: 0.01812199910832213
----------------------------------------
ImportantHiringCommunica

test_score: 0.006610164436013477, train_score: 0.012066898161880668
----------------------------------------
StackOverflowCommunity
shape: (12891, 5)
test_score: 0.02016983637725922, train_score: 0.014356307232091803
----------------------------------------
StackOverflowHelpful
shape: (12891, 5)
test_score: 0.01396088313618915, train_score: 0.012978908134480882
----------------------------------------
StackOverflowBetter
shape: (12891, 5)
test_score: 0.009390536736195942, train_score: 0.010785354820559978
----------------------------------------
StackOverflowWhatDo
shape: (12891, 5)
test_score: 0.006180239226589834, train_score: 0.01149144330948737
----------------------------------------
StackOverflowMakeMoney
shape: (12891, 5)
test_score: 0.00617275047883703, train_score: 0.010522676755073346
----------------------------------------
Gender
shape: (12891, 19)
test_score: 1.3587170536011328e-05, train_score: 0.008618156069942318
----------------------------------------
HighestEducation

In [182]:
rank_test.sort(reverse=True)
rank_train.sort(reverse=True)

In [183]:
rank_test[:10]

[[0.4167738538485196, 'Currency'],
 [0.20301667135459334, 'YearsCodedJob'],
 [0.17681542205634249, 'YearsProgram'],
 [0.10317553831646353, 'CompanyType'],
 [0.06349849905100025, 'Race'],
 [0.059509865901402725, 'University'],
 [0.05663789325191737, 'Overpaid'],
 [0.05192326010693504, 'CompanySize'],
 [0.038759220445147125, 'ImportantHiringPMExp'],
 [0.03699140020104186, 'StackOverflowCopiedCode']]

In [148]:
rank_train[:10]

[[0.570935821010838, 'Country'],
 [0.4339837071598497, 'Currency'],
 [0.2311629385849997, 'YearsCodedJob'],
 [0.20201034255966033, 'YearsProgram'],
 [0.10236362926073217, 'CompanyType'],
 [0.08636347969307256, 'EducationTypes'],
 [0.07458423130903169, 'Methodology'],
 [0.07184415657926957, 'JobProfile'],
 [0.06561597617228043, 'Race'],
 [0.05739833744743217, 'WantWorkDatabase']]

In [184]:
num_features = [1, 2, 3, 4, 5, 10, 20, 50, 100, len(rank_test)]

In [185]:
test_scores = []
train_scores = []
for num in num_features:
    features = []
    for i in range(num):
        features.append(rank_test[i][1])
    X = pd.DataFrame()
    for feature in features:
        if feature in numeric_df_dict:
            X = pd.concat([X, numeric_df_dict[feature]], axis=1)
        elif feature in categorized_df_dict:
            X = pd.concat([X, categorized_df_dict[feature]], axis=1)
        else:
            print(f"{feature} not in numeric_df_dict or categorized_df_dict!!!")
            
    test_score, train_score = check_result(num, X, y)
    test_scores.append([test_score, num])
    train_scores.append([train_score, num])

1
shape: (12891, 17)
test_score: 0.4167738538485196, train_score: 0.4339837071598497
----------------------------------------
2
shape: (12891, 38)
test_score: 0.5533608651193768, train_score: 0.5717896815799879
----------------------------------------
3
shape: (12891, 59)
test_score: 0.5579702570388851, train_score: 0.5791724605692948
----------------------------------------
4
shape: (12891, 70)
test_score: 0.5907981786641967, train_score: 0.6093640430709086
----------------------------------------
5
shape: (12891, 125)
test_score: -2.8933110678915737e+24, train_score: 0.6156692302749206
----------------------------------------
10
shape: (12891, 154)
test_score: -1.3860895637801805e+25, train_score: 0.655538856055528
----------------------------------------
20
shape: (12891, 211)
test_score: -6.550066498594873e+23, train_score: 0.6745769563524515
----------------------------------------
50
shape: (12891, 409)
test_score: -2.447397800897439e+26, train_score: 0.6949489154160645
---------

In [186]:
num_features = list(range(len(rank_test), len(rank_test)-10, -1))

In [None]:
test_scores = []
train_scores = []
for num in num_features:
    features = []
    for i in range(num):
        features.append(rank_test[i][1])
    X = pd.DataFrame()
    for feature in features:
        if feature in numeric_df_dict:
            X = pd.concat([X, numeric_df_dict[feature]], axis=1)
        elif feature in categorized_df_dict:
            X = pd.concat([X, categorized_df_dict[feature]], axis=1)
        else:
            print(f"{feature} not in numeric_df_dict or categorized_df_dict!!!")
            
    test_score, train_score = check_result(num, X, y)
    test_scores.append([test_score, num])
    train_scores.append([train_score, num])

145
shape: (12891, 21248)


## Use my get dummies function

In [154]:
# import 2017 dataset
df = import_data('./survey_results_public_2017.csv', ['Respondent', 'ExpectedSalary'])
label_to_predict = 'Salary'
test_with_numeric_columns(df, label_to_predict)

Imported dataframe shape:  (51392, 154)
Df now has shape of (51392, 152)
Importing df with shape of (51392, 152) is complete!!!
Dataframe shape with only numeric value columns:  (51392, 5)
Dataframe shape when nan value is dropped in label(Salary):  (12891, 5)
The r-squared score for the model using only quantitative variables was 0.04072431792894726 on 3868 values.


In [155]:
df = df.dropna(subset=[label_to_predict], axis=0)
Xs = df.drop(label_to_predict, axis=1)
Xs = Xs.dropna(how='all', axis=1)
y = df[label_to_predict]
print(f"df shape: {df.shape}")
print(f"Xs shape: {Xs.shape}")
print(f"y shape: {y.shape}")

df shape: (12891, 152)
Xs shape: (12891, 145)
y shape: (12891,)


In [156]:
numeric_df_dict, numeric_df_names_list = get_numeric_dict(Xs)
multiple_value_columns = get_multiple_value_columns(Xs) # if use my method to get 
categorized_df_dict, categorized_df_names_list = get_dummy_dict(Xs, multiple_value_columns)

Numeric columns from this df:  4
Object columns from this df:  141


In [157]:
assert Xs.shape[1] == len(numeric_df_dict)+len(categorized_df_dict)
assert Xs.shape[1] == len(categorized_df_names_list)+len(numeric_df_names_list)

In [158]:
def check_result(col, df, y):
    print(col)
    print(f"shape: {df.shape}")
    
    X = df      
    #Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42) 
    
    model = train_model(X_train, y_train)

    #Predict and score the model
    y_test_preds = model.predict(X_test) 
    y_train_preds = model.predict(X_train)
    test_score = r2_score(y_test, y_test_preds)
    train_score = r2_score(y_train, y_train_preds)
    print(f"test_score: {test_score}, train_score: {train_score}")
    print('-'*40)
    
    return test_score, train_score

In [159]:
# rank every columns prediction r2score
rank_test = []
rank_train = []

for col, df_num in numeric_df_dict.items():
    test_score, train_score = check_result(col, df_num.values.reshape(-1,1), y)
    
    rank_test.append([test_score, col])
    rank_train.append([train_score, col])

for col, df_obj in categorized_df_dict.items():   
    test_score, train_score = check_result(col, df_obj, y)
    
    rank_test.append([test_score, col])
    rank_train.append([train_score, col])

CareerSatisfaction
shape: (12891, 1)
test_score: 0.02440744353670876, train_score: 0.02511874806277581
----------------------------------------
JobSatisfaction
shape: (12891, 1)
test_score: 0.015714880014184307, train_score: 0.013990287390995837
----------------------------------------
HoursPerWeek
shape: (12891, 1)
test_score: 0.008116429443099316, train_score: 0.005353632681996512
----------------------------------------
StackOverflowSatisfaction
shape: (12891, 1)
test_score: 0.005130504975913497, train_score: 0.005559666137063934
----------------------------------------
Professional
shape: (12891, 1)
test_score: -1.858952728239771e-05, train_score: 0.0
----------------------------------------
ProgramHobby
shape: (12891, 4)
test_score: 0.0005247655454900801, train_score: 0.0057765601272673495
----------------------------------------
Country
shape: (12891, 136)
test_score: -7.709702341110363e+25, train_score: 0.570935821010838
----------------------------------------
University
shape:

test_score: 0.06473857270436212, train_score: 0.07301978977600854
----------------------------------------
WantWorkLanguage
shape: (12891, 69)
test_score: 0.04218761137502414, train_score: 0.04930963860940918
----------------------------------------
HaveWorkedFramework
shape: (12891, 17)
test_score: 0.0231102684478387, train_score: 0.019071483304514247
----------------------------------------
WantWorkFramework
shape: (12891, 17)
test_score: 0.0227705471130778, train_score: 0.022730693709255845
----------------------------------------
HaveWorkedDatabase
shape: (12891, 15)
test_score: 0.03421260902954826, train_score: 0.028820407785287605
----------------------------------------
WantWorkDatabase
shape: (12891, 15)
test_score: 0.031630608863732035, train_score: 0.040513914172822796
----------------------------------------
HaveWorkedPlatform
shape: (12891, 29)
test_score: 0.046886041425607705, train_score: 0.06489831403472701
----------------------------------------
WantWorkPlatform
shape:

In [160]:
rank_test.sort(reverse=True)
rank_train.sort(reverse=True)

In [161]:
rank_test[:10]

[[0.4167738538485196, 'Currency'],
 [0.20301667135459334, 'YearsCodedJob'],
 [0.17681542205634249, 'YearsProgram'],
 [0.10317553831646353, 'CompanyType'],
 [0.07708971568958778, 'IDE'],
 [0.07310429022915554, 'JobProfile'],
 [0.0709689802747091, 'ImportantBenefits'],
 [0.06621110184471446, 'Race'],
 [0.06473857270436212, 'HaveWorkedLanguage'],
 [0.059509865901402725, 'University']]

In [162]:
rank_train[:10]

[[0.570935821010838, 'Country'],
 [0.4339837071598497, 'Currency'],
 [0.2311629385849997, 'YearsCodedJob'],
 [0.20201034255966033, 'YearsProgram'],
 [0.10236362926073217, 'CompanyType'],
 [0.10011009144103578, 'ImportantBenefits'],
 [0.08282596256489372, 'IDE'],
 [0.07301978977600854, 'HaveWorkedLanguage'],
 [0.06828447343954602, 'JobProfile'],
 [0.06489831403472701, 'HaveWorkedPlatform']]

In [172]:
# num_features = [2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 50, 100, len(rank_test)]
num_features = list(range(1, len(rank_test)+1))

In [173]:
test_scores = []
train_scores = []
for num in num_features:
    features = []
    for i in range(num):
        features.append(rank_test[i][1])
    X = pd.DataFrame()
    for feature in features:
        if feature in numeric_df_dict:
            X = pd.concat([X, numeric_df_dict[feature]], axis=1)
        elif feature in categorized_df_dict:
            X = pd.concat([X, categorized_df_dict[feature]], axis=1)
        else:
            print(f"{feature} not in numeric_df_dict or categorized_df_dict!!!")
            
    test_score, train_score = check_result(num, X, y)
    test_scores.append([test_score, num])
    train_scores.append([train_score, num])

1
shape: (12891, 17)
test_score: 0.4167738538485196, train_score: 0.4339837071598497
----------------------------------------
2
shape: (12891, 38)
test_score: 0.5533608651193768, train_score: 0.5717896815799879
----------------------------------------
3
shape: (12891, 59)
test_score: 0.5579702570388851, train_score: 0.5791724605692948
----------------------------------------
4
shape: (12891, 70)
test_score: 0.5907981786641967, train_score: 0.6093640430709086
----------------------------------------
5
shape: (12891, 113)
test_score: 0.6057809978127665, train_score: 0.6242068087506347
----------------------------------------
6
shape: (12891, 150)
test_score: 0.6094893088177588, train_score: 0.6282150322209696
----------------------------------------
7
shape: (12891, 183)
test_score: -5.430516754220984e+23, train_score: 0.6370699425305056
----------------------------------------
8
shape: (12891, 200)
test_score: -5.73748797031855e+24, train_score: 0.6406628475894767
----------------------

test_score: -5.493449263793631e+24, train_score: 0.7313414938873333
----------------------------------------
65
shape: (12891, 843)
test_score: -5.696360287838561e+23, train_score: 0.7317537852318439
----------------------------------------
66
shape: (12891, 851)
test_score: -6.519887836391103e+22, train_score: 0.7330347733432283
----------------------------------------
67
shape: (12891, 856)
test_score: -9.067296563770792e+23, train_score: 0.7331841273819442
----------------------------------------
68
shape: (12891, 866)
test_score: -2.576298082948149e+24, train_score: 0.7340312844638066
----------------------------------------
69
shape: (12891, 871)
test_score: -2.3629257816254556e+24, train_score: 0.7341267302266304
----------------------------------------
70
shape: (12891, 876)
test_score: -6.046584888726268e+24, train_score: 0.734293654401261
----------------------------------------
71
shape: (12891, 881)
test_score: -1.6851775122111667e+25, train_score: 0.7346624930437075
-------

126
shape: (12891, 1159)
test_score: -5.933959810802754e+24, train_score: 0.7466323892713793
----------------------------------------
127
shape: (12891, 1164)
test_score: -6.465419011375839e+24, train_score: 0.7468486133151441
----------------------------------------
128
shape: (12891, 1169)
test_score: -2.441974238749599e+24, train_score: 0.7470376572346011
----------------------------------------
129
shape: (12891, 1174)
test_score: -1.7799900605194805e+24, train_score: 0.7470988521896766
----------------------------------------
130
shape: (12891, 1176)
test_score: -5.624609011823753e+24, train_score: 0.7471266151268299
----------------------------------------
131
shape: (12891, 1181)
test_score: -2.538669470245428e+23, train_score: 0.7473542443444361
----------------------------------------
132
shape: (12891, 1185)
test_score: -3.170779845873687e+23, train_score: 0.7474923237103365
----------------------------------------
133
shape: (12891, 1191)
test_score: -2.3419106629503682e+23,

# drop rows where 