In [19]:
# import 2017 dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2

df = pd.read_csv('./survey_results_public_2017.csv')
print("Imported dataframe shape: ", df.shape)

#Only use quant variables and drop any rows with missing values
num_vars = df.select_dtypes(include=['int', 'float'])
num_vars = num_vars.drop('ExpectedSalary', axis=1)
print("Dataframe shape with only numeric value columns: ", num_vars.shape)

label_to_predict = 'Salary'

num_vars = num_vars.dropna(subset=[label_to_predict], axis=0)
print("Dataframe shape when nan value is dropped in label(Salary): ", num_vars.shape)

# Mean function
fill_mean = lambda col: col.fillna(col.mean())
# Fill the mean
num_vars = num_vars.apply(fill_mean, axis=0)

#Split into explanatory and response variables
X = num_vars.drop(label_to_predict, axis=1)
y = num_vars[label_to_predict]

#Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42) 

lm_model = LinearRegression(normalize=True) # Instantiate
lm_model.fit(X_train, y_train) #Fit
        
#Predict and score the model
y_test_preds = lm_model.predict(X_test) 
"The r-squared score for the model using only quantitative variables was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Imported dataframe shape:  (51392, 154)
Dataframe shape with only numeric value columns:  (51392, 6)
Dataframe shape when nan value is dropped in label(Salary):  (12891, 6)


'The r-squared score for the model using only quantitative variables was 0.04113420903416998 on 3868 values.'

In [20]:
# import 2018 dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
%matplotlib inline
%load_ext autoreload
%autoreload 2

df = pd.read_csv('./survey_results_public_2018.csv')
print("Imported dataframe shape: ", df.shape)

num_vars = df.select_dtypes(include=['int', 'float'])
print("Dataframe shape with only numeric value columns: ", num_vars.shape)

label_to_predict = 'ConvertedSalary'

num_vars = num_vars.dropna(subset=[label_to_predict], axis=0)
print("Dataframe shape when nan value is dropped in label(ConvertedSalary): ", num_vars.shape)

# Mean function
fill_mean = lambda col: col.fillna(col.mean())
# Fill the mean
num_vars = num_vars.apply(fill_mean, axis=0)

#Split into explanatory and response variables
X = num_vars.drop(label_to_predict, axis=1)
y = num_vars[label_to_predict]

#Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42) 

lm_model = LinearRegression(normalize=True) # Instantiate
lm_model.fit(X_train, y_train) #Fit
        
#Predict and score the model
y_test_preds = lm_model.predict(X_test) 
"The r-squared score for the model using only quantitative variables was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Imported dataframe shape:  (98855, 129)
Dataframe shape with only numeric value columns:  (98855, 42)
Dataframe shape when nan value is dropped in label(ConvertedSalary):  (47702, 42)


'The r-squared score for the model using only quantitative variables was 0.014745829369589014 on 14311 values.'

In [21]:
cat_df = df.select_dtypes(include=['object'])
cat_cols_lst = cat_df.columns
cat_df.shape[1]

87

In [22]:
contain_semicolons = df.apply(lambda col: 
                                  col.str.contains(';').any()
                                  if col.dtypes==object
                                  else False)
df_contain_semicolons = df.loc[:, contain_semicolons]
columns_with_multiple_values = df_contain_semicolons.columns
columns_with_multiple_values

Index(['DevType', 'CommunicationTools', 'EducationTypes', 'SelfTaughtTypes',
       'HackathonReasons', 'LanguageWorkedWith', 'LanguageDesireNextYear',
       'DatabaseWorkedWith', 'DatabaseDesireNextYear', 'PlatformWorkedWith',
       'PlatformDesireNextYear', 'FrameworkWorkedWith',
       'FrameworkDesireNextYear', 'IDE', 'Methodology', 'VersionControl',
       'AdBlockerReasons', 'AdsActions', 'ErgonomicDevices', 'Gender',
       'SexualOrientation', 'RaceEthnicity'],
      dtype='object')

In [23]:
def create_dummy_lecture_method(df, cat_cols, dummy_na):
    for col in  cat_cols:
        try:
            # for each cat add dummy var, drop original column
            df = pd.concat([df.drop(col, axis=1), pd.get_dummies(df[col], prefix=col, prefix_sep='_', drop_first=True, dummy_na=dummy_na)], axis=1)
        except:
            continue
    return df


# def create_dummy_my_method(df, cat_cols, _):
#     df_tmp = pd.DataFrame()
#     for col in cat_cols:
#         df1 = df[col]
#         df1 = categorize_feature(df1)
#         df_tmp = pd.concat([df_tmp, df1], axis=1)
#     return df_new
def create_dummy_my_method(df, cat_cols, _):
    for col in cat_cols:
        try:
            df1 = df[col]
        except:
            continue
        df1 = categorize_feature(df1)
        df.drop(col, axis=1, inplace=True)
        df = pd.concat([df, df1], axis=1)
    return df

def categorize_feature(single_column):
    if single_column.name in columns_with_multiple_values:
        single_column = single_column.apply(lambda value: list(value.split(';'))
                           if type(value)!=float
                           else float('nan'))
        return pd.get_dummies(single_column.apply(pd.Series).stack(dropna=False), prefix=single_column.name, prefix_sep='_').sum(level=0)
    
    else:
        return pd.get_dummies(single_column, prefix=single_column.name, prefix_sep='_')

In [24]:
# df = df.dropna(subset=[label_to_predict], axis=0)
# print(df.shape)
# cat_df = df.select_dtypes(include=['object'])
# cat_cols_lst = cat_df.columns

# df_lecture = create_dummy_lecture_method(df, cat_cols_lst, dummy_na=False)
# df_my = create_dummy_my_method(df, cat_cols_lst, False)

# print(df_lecture.shape)
# print(df_my.shape)

In [25]:
def clean_fit_linear_mod(df, response_col, cat_cols, func_dummy, dummy_na, test_size=.3, rand_state=42):
    #Drop the rows with missing response values
    df  = df.dropna(subset=[response_col], axis=0)
    
    
#     # Fill the mean
#     columns_numeric = df.select_dtypes(include=["int", "float"]).columns
#     for col in columns_numeric:
#         df[col] = df[col].fillna(df[col].mean())

    print("Shape of df: ", df.shape)

    #Drop columns with all NaN values
    df = df.dropna(how='all', axis=1)
    
    #Dummy categorical variables
    df = func_dummy(df, cat_cols, dummy_na)
    
    print("Shape of dummy df: ", df.shape)
    
    
    
    # Mean function
    fill_mean = lambda col: col.fillna(col.mean())
    # Fill the mean
    df = df.apply(fill_mean, axis=0)

#     # Fill the mean
#     columns_numeric = df.select_dtypes(include=["int", "float"]).columns
#     for col in columns_numeric:
#         df[col] = df.fillna(df[col])
        
    #Split into explanatory and response variables
    X = df.drop(response_col, axis=1)
    y = df[response_col]
    
    print("Shape of X: ", X.shape)
    
    #Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=rand_state)

    lm_model = LinearRegression(normalize=True) # Instantiate
    lm_model.fit(X_train, y_train) #Fit

    #Predict using your model
    y_test_preds = lm_model.predict(X_test)
    y_train_preds = lm_model.predict(X_train)

    #Score using your model
    test_score = r2_score(y_test, y_test_preds)
    train_score = r2_score(y_train, y_train_preds)

    return test_score, train_score, lm_model, X_train, X_test, y_train, y_test

In [26]:
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = clean_fit_linear_mod(df, label_to_predict, cat_cols_lst, create_dummy_lecture_method, dummy_na=False)
print("The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(train_score, test_score))

Shape of df:  (47702, 129)
Shape of dummy df:  (47702, 86961)
Shape of X:  (47702, 86960)
Shape of X:  (47702, 86960)


MemoryError: Unable to allocate 21.6 GiB for an array with shape (33391, 86960) and data type float64

In [27]:
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = clean_fit_linear_mod(df, label_to_predict, cat_cols_lst, create_dummy_my_method, dummy_na=False)
print("The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(train_score, test_score))

Shape of df:  (47702, 129)


MemoryError: Unable to allocate 204. MiB for an array with shape (4486, 47702) and data type uint8