In [96]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
# import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')
# %matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [97]:
def import_and_test(data_path, label_to_predict, drop_col=None):
    """Load a CSV and report a baseline linear-regression score on its numeric columns.

    The function reads ``data_path``, keeps only the int/float columns, drops the
    rows whose label is missing, mean-imputes the remaining gaps, fits an ordinary
    least-squares model on a 70/30 split (``random_state=42``) and prints the
    r-squared score on the held-out part.

    Parameters
    ----------
    data_path : str or path-like
        Location of the CSV file passed to ``pd.read_csv``.
    label_to_predict : str
        Name of the numeric column used as the regression target.
    drop_col : label or list of labels, optional
        Numeric column(s) removed before fitting. ``None`` (default) drops nothing.

    Returns
    -------
    pandas.DataFrame
        The full, unmodified frame as read from ``data_path`` (not the numeric subset).

    Raises
    ------
    KeyError
        If ``drop_col`` or ``label_to_predict`` is not among the numeric columns.
    """
    df = pd.read_csv(data_path)
    print("Imported dataframe shape: ", df.shape)

    # Only use quantitative variables.
    num_vars = df.select_dtypes(include=['int', 'float'])
    if drop_col is not None:
        num_vars = num_vars.drop(drop_col, axis=1)
    print("Dataframe shape with only numeric value columns: ", num_vars.shape)

    # Rows without a label cannot be used for training or scoring.
    num_vars = num_vars.dropna(subset=[label_to_predict], axis=0)
    print(f"Dataframe shape when nan value is dropped in label({label_to_predict}): ", num_vars.shape)

    # Fill every remaining gap with the mean of its own column.
    num_vars = num_vars.fillna(num_vars.mean())

    # Split into explanatory and response variables.
    X = num_vars.drop(label_to_predict, axis=1)
    y = num_vars[label_to_predict]

    # Split into train and test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)

    try:
        lm_model = LinearRegression(normalize=True)
    except TypeError:
        # scikit-learn >= 1.2 removed the `normalize` argument. Scaling the
        # features in a pipeline is the documented replacement.
        from sklearn.pipeline import make_pipeline
        from sklearn.preprocessing import StandardScaler
        lm_model = make_pipeline(StandardScaler(), LinearRegression())
    lm_model.fit(X_train, y_train)

    # Predict and score the model.
    y_test_preds = lm_model.predict(X_test)
    print("The r-squared score for the model using only quantitative variables was {} on {} values.".format(r2_score(y_test, y_test_preds), len(y_test)))

    return df

In [107]:
def get_object_cols(df):
    """Return the labels of the object-dtype columns of ``df``.

    As a side effect, prints how many such columns were found.
    """
    object_columns = df.select_dtypes(include=['object']).columns
    print("Object columns from this df: ", len(object_columns))
    return object_columns

In [100]:
def get_multiple_value_cols(df):
    """Return the labels of the text columns in which at least one value contains ';'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.Index
        Labels of the object/string columns holding at least one ';', in their
        original order. Non-text columns are never reported.
    """
    def _has_semicolon(col):
        # Only text-like columns can hold ';'-separated answers.
        if not (col.dtype == object or pd.api.types.is_string_dtype(col.dtype)):
            return False
        try:
            # na=False keeps missing values out of the result so .any() sees
            # plain booleans; regex=False matches the literal character.
            return bool(col.str.contains(';', regex=False, na=False).any())
        except AttributeError:
            # Object column without any string values: no .str accessor.
            return False

    contain_semicolons = np.array(
        [_has_semicolon(col) for _, col in df.items()], dtype=bool)
    return df.columns[contain_semicolons]

In [None]:
def get_dummy_dict(columns_with_multiple_values, df=None):
    """Encode each listed column with ``categorize_feature`` and collect the results.

    Parameters
    ----------
    columns_with_multiple_values : iterable of labels
        Columns of ``df`` to encode.
    df : pandas.DataFrame, optional
        Frame holding the columns. When omitted, the notebook-level variable
        ``df`` is used, which is what this function always read before the
        parameter existed.

    Returns
    -------
    tuple (dict, list)
        ``categorized_df_dict`` maps each column label to its encoded frame;
        ``categorized_df_names_list`` lists the processed labels in input order.

    Raises
    ------
    NameError
        If ``df`` is not passed and no notebook-level ``df`` exists.
    """
    if df is None:
        df = globals().get('df')
        if df is None:
            raise NameError(
                "name 'df' is not defined; pass the DataFrame explicitly via df=")

    categorized_df_dict = {}
    categorized_df_names_list = []
    for col in columns_with_multiple_values:
        categorized_df_dict[col] = categorize_feature(df[col])
        categorized_df_names_list.append(col)
    # The collected results were previously discarded; hand them back to the caller.
    return categorized_df_dict, categorized_df_names_list

In [126]:
def create_dummy_lecture_method(df, cat_cols, dummy_na):
    """Replace each categorical column with ``pd.get_dummies`` indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    cat_cols : iterable of labels
        Columns to encode. A column that cannot be encoded (for example because
        it is missing from ``df``) is reported on stdout and skipped.
    dummy_na : bool
        Forwarded to ``pd.get_dummies``; adds an indicator column for NaN.

    Returns
    -------
    pandas.DataFrame
        The untouched columns of ``df`` followed by the indicator columns, in
        ``cat_cols`` order. The first level of every column is dropped
        (``drop_first=True``). ``df`` itself is returned when nothing was encoded.
    """
    dummy_frames = []
    encoded_cols = []
    for col in cat_cols:
        if col in encoded_cols:
            # Already replaced by its dummies; nothing left to encode.
            continue
        try:
            dummies = pd.get_dummies(df[col], prefix=col, prefix_sep='_',
                                     drop_first=True, dummy_na=dummy_na)
        except Exception as exc:
            # Best effort, as before, but show the cause and let
            # KeyboardInterrupt / SystemExit propagate.
            print(col, 'Exception occur!!', repr(exc))
            continue
        dummy_frames.append(dummies)
        encoded_cols.append(col)

    if not dummy_frames:
        return df
    # One concat at the end instead of rebuilding the whole frame per column.
    return pd.concat([df.drop(columns=encoded_cols)] + dummy_frames, axis=1)

def create_dummy_my_method(df, cat_cols, _):
    """Replace each categorical column with the encoding from ``categorize_feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    cat_cols : iterable of labels
        Columns to encode. Labels missing from ``df`` are skipped.
    _ : any
        Unused; present so the signature matches ``create_dummy_lecture_method``.

    Returns
    -------
    pandas.DataFrame
        The untouched columns of ``df`` followed by the encoded columns, in
        ``cat_cols`` order. ``df`` itself is returned when nothing was encoded.
    """
    encoded_frames = []
    encoded_cols = []
    for col in cat_cols:
        # Skip labels that are absent or were already encoded.
        if col in encoded_cols or col not in df.columns:
            continue
        encoded_frames.append(categorize_feature(df[col]))
        encoded_cols.append(col)

    if not encoded_frames:
        return df
    # Drop the originals without inplace=True so the caller's frame is left
    # alone, and concatenate once rather than once per column.
    return pd.concat([df.drop(columns=encoded_cols)] + encoded_frames, axis=1)

def categorize_feature(single_column, multi_value_cols=None):
    """One-hot encode a column, splitting ';'-separated multi-value answers.

    Parameters
    ----------
    single_column : pandas.Series
        Column to encode; its ``name`` is used as the prefix of the new columns.
    multi_value_cols : collection of labels, optional
        Columns whose cells hold several answers separated by ';'. When omitted,
        the notebook-level ``columns_with_multiple_values`` is used if it exists;
        otherwise the column counts as multi-valued when any of its values
        contains ';'.

    Returns
    -------
    pandas.DataFrame
        Columns named ``<name>_<value>``. For a multi-valued column there is one
        row per original index label, holding how often each value occurs in
        that cell; missing cells give all-zero rows. Other columns get the plain
        ``pd.get_dummies`` encoding.
    """
    if multi_value_cols is None:
        # Backward compatible with the old reliance on a notebook global.
        multi_value_cols = globals().get('columns_with_multiple_values')

    if multi_value_cols is not None:
        is_multi_valued = single_column.name in multi_value_cols
    else:
        try:
            is_multi_valued = bool(
                single_column.str.contains(';', regex=False, na=False).any())
        except AttributeError:
            # Non-text column: there is nothing to split.
            is_multi_valued = False

    if is_multi_valued:
        # One row per individual answer; the original index label is repeated
        # and missing cells stay NaN.
        answers = single_column.str.split(';').explode()
        dummies = pd.get_dummies(answers, prefix=single_column.name, prefix_sep='_')
        # Collapse back to one row per original label. This replaces
        # .sum(level=0), which was removed in pandas 2.0.
        return dummies.groupby(level=0, sort=False).sum()

    return pd.get_dummies(single_column, prefix=single_column.name, prefix_sep='_')

In [116]:
def clean_fit_linear_mod(df, response_col, func_dummy, dummy_na, test_size=.3, rand_state=42):
    """Clean ``df``, encode its categorical columns and fit a linear regression.

    Steps: drop rows with a missing response, drop all-NaN columns, drop the
    'Respondent' column when present, mean-impute the int/float columns, encode
    the object columns with ``func_dummy``, split into train/test, then fit and
    score an ordinary least-squares model.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is not modified.
    response_col : str
        Name of the column to predict.
    func_dummy : callable
        Called as ``func_dummy(df, cat_cols, dummy_na)``; must return the frame
        with the categorical columns replaced by numeric ones.
    dummy_na : bool
        Forwarded unchanged to ``func_dummy``.
    test_size : float, default .3
        Share of rows held out for testing.
    rand_state : int, default 42
        Seed for the train/test split.

    Returns
    -------
    tuple
        ``(test_score, train_score, lm_model, X_train, X_test, y_train, y_test)``
        where the scores are r-squared values. On scikit-learn >= 1.2
        ``lm_model`` is a scaler + regression pipeline, not a bare
        ``LinearRegression``.
    """
    # Drop the rows with missing response values. A list is used because older
    # pandas versions reject a scalar `subset`.
    df = df.dropna(subset=[response_col], axis=0)
    print("Shape of df: ", df.shape)

    # Drop columns with all NaN values.
    df = df.dropna(how='all', axis=1)

    # Drop the Respondent column (presumably a row identifier - TODO confirm).
    # errors='ignore' lets frames without that column pass through.
    df = df.drop(columns='Respondent', errors='ignore')

    # Mean-impute every numeric column in one step.
    num_cols = df.select_dtypes(include=['int', 'float']).columns
    if len(num_cols):
        df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

    cat_cols = get_object_cols(df)

    # Dummy categorical variables.
    df = func_dummy(df, cat_cols, dummy_na)
    print("Shape of dummy df: ", df.shape)

    # Split into explanatory and response variables.
    X = df.drop(response_col, axis=1)
    y = df[response_col]

    print("Shape of X: ", X.shape)

    # Split into train and test.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=rand_state)

    try:
        lm_model = LinearRegression(normalize=True)
    except TypeError:
        # scikit-learn >= 1.2 removed the `normalize` argument. Scaling the
        # features in a pipeline is the documented replacement.
        from sklearn.pipeline import make_pipeline
        from sklearn.preprocessing import StandardScaler
        lm_model = make_pipeline(StandardScaler(), LinearRegression())
    lm_model.fit(X_train, y_train)

    # Predict using the model.
    y_test_preds = lm_model.predict(X_test)
    y_train_preds = lm_model.predict(X_train)

    # Score using the model.
    test_score = r2_score(y_test, y_test_preds)
    train_score = r2_score(y_train, y_train_preds)

    print("The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(train_score, test_score))

    return test_score, train_score, lm_model, X_train, X_test, y_train, y_test

In [103]:
# 2017 survey: numeric-only baseline, then the full model with lecture-style dummies.
df_2017 = import_and_test('./survey_results_public_2017.csv', 'Salary', 'ExpectedSalary')
print('Import complete', '-'*40)
label_to_predict = 'Salary'
# clean_fit_linear_mod returns seven values; unpacking them into two names
# raises "ValueError: too many values to unpack".
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = clean_fit_linear_mod(
    df_2017, label_to_predict, create_dummy_lecture_method, dummy_na=False)
# Rebuild the full design matrix and response so X and y stay available to
# later cells. Rows come back in train-then-test order, not the original order.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

Imported dataframe shape:  (51392, 154)
Dataframe shape with only numeric value columns:  (51392, 6)
Dataframe shape when nan value is dropped in label(Salary):  (12891, 6)
The r-squared score for the model using only quantitative variables was 0.04113420903416998 on 3868 values.
Shape of df:  (12891, 154)
Object column size from this df:  141
Shape of dummy df:  (12891, 21108)
Shape of X:  (12891, 21107)


In [104]:
# Is there a column named exactly 'ExCoderNotForMe' in the design matrix X?
# NOTE(review): presumably checks that the raw categorical column was replaced
# by its prefixed dummy columns - confirm the intent.
'ExCoderNotForMe' in X.columns

False

In [106]:
# import 2018 dataset
df_2018 = import_and_test('./survey_results_public_2018.csv', 'ConvertedSalary')
print('Import complete', '-'*40)
label_to_predict = 'ConvertedSalary'
# clean_fit_linear_mod returns seven values; unpacking them into two names
# raises "ValueError: too many values to unpack".
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = clean_fit_linear_mod(
    df_2018, label_to_predict, create_dummy_lecture_method, dummy_na=False)
# Rebuild the full design matrix and response so X and y stay available to
# later cells. Rows come back in train-then-test order, not the original order.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

Imported dataframe shape:  (98855, 129)
Dataframe shape with only numeric value columns:  (98855, 42)
Dataframe shape when nan value is dropped in label(ConvertedSalary):  (47702, 42)
The r-squared score for the model using only quantitative variables was 0.019991430883123606 on 14311 values.
Shape of df:  (47702, 129)
Object column size from this df:  87
Shape of dummy df:  (47702, 86960)
Shape of X:  (47702, 86959)


In [119]:
# Load the 2017 survey and print the numeric-only baseline score;
# 'ExpectedSalary' is left out of the baseline features.
df_2017 = import_and_test(
    data_path='./survey_results_public_2017.csv',
    label_to_predict='Salary',
    drop_col='ExpectedSalary',
)
# Target column used by the modelling cells that follow.
label_to_predict = 'Salary'

Imported dataframe shape:  (51392, 154)
Dataframe shape with only numeric value columns:  (51392, 6)
Dataframe shape when nan value is dropped in label(Salary):  (12891, 6)
The r-squared score for the model using only quantitative variables was 0.04113420903416998 on 3868 values.


In [120]:
# Fit the 2017 model once per encoding strategy. The scores of every run are
# kept in scores_2017. The unpacked names hold the results of the last run,
# as they did when the two calls were written out one after the other.
scores_2017 = {}
dummy_methods = (
    ('from lectures code', create_dummy_lecture_method),
    ('my method', create_dummy_my_method),
)
for run_number, (method_label, func_dummy) in enumerate(dummy_methods):
    if run_number:
        print('-'*100)
    print(f'2017, get dummies method: {method_label}')
    # Release the previous run's model and matrices before fitting again;
    # they are large and are not needed for the next fit.
    lm_model = X_train = X_test = y_train = y_test = None
    test_score, train_score, lm_model, X_train, X_test, y_train, y_test = clean_fit_linear_mod(
        df_2017, label_to_predict, func_dummy, dummy_na=False)
    scores_2017[method_label] = {'train': train_score, 'test': test_score}

2017, get dummies method: from lectures code
Shape of df:  (12891, 154)
Object columns from this df:  141
Shape of dummy df:  (12891, 21108)
Shape of X:  (12891, 21107)
The rsquared on the training data was 1.0.  The rsquared on the test data was -0.6116611493680051.
----------------------------------------------------------------------------------------------------
2017, get dummies method: my method
Shape of df:  (12891, 154)
Object columns from this df:  141
Shape of dummy df:  (12891, 1366)
Shape of X:  (12891, 1365)
The rsquared on the training data was 0.8118050907109072.  The rsquared on the test data was -9.556389674176387e+25.


In [127]:
# Load the 2018 survey and print the numeric-only baseline score.
df_2018 = import_and_test(
    data_path='./survey_results_public_2018.csv',
    label_to_predict='ConvertedSalary',
)
# Target column used by the modelling cells that follow.
label_to_predict = 'ConvertedSalary'

Imported dataframe shape:  (98855, 129)
Dataframe shape with only numeric value columns:  (98855, 42)
Dataframe shape when nan value is dropped in label(ConvertedSalary):  (47702, 42)
The r-squared score for the model using only quantitative variables was 0.019991430883123606 on 14311 values.


In [128]:
# Fit the 2018 model once per encoding strategy. The scores of every run are
# kept in scores_2018. The unpacked names hold the results of the last run,
# as they did when the two calls were written out one after the other.
scores_2018 = {}
dummy_methods = (
    ('from lectures code', create_dummy_lecture_method),
    ('my method', create_dummy_my_method),
)
for run_number, (method_label, func_dummy) in enumerate(dummy_methods):
    if run_number:
        print('-'*100)
    print(f'2018, get dummies method: {method_label}')
    # Release the previous run's model and matrices before fitting again;
    # holding them adds to memory pressure during the next fit.
    lm_model = X_train = X_test = y_train = y_test = None
    test_score, train_score, lm_model, X_train, X_test, y_train, y_test = clean_fit_linear_mod(
        df_2018, label_to_predict, func_dummy, dummy_na=False)
    scores_2018[method_label] = {'train': train_score, 'test': test_score}

2018, get dummies method: my method
Shape of df:  (47702, 129)
Object columns from this df:  87


MemoryError: Unable to allocate 467. MiB for an array with shape (10265, 47702) and data type uint8