In [105]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

%matplotlib inline

In [106]:
# convert categorical variables to dummy variables and concatenate with the dataframe
def create_dummy_var(df, cat_cols_lst, dummy_na):
    """Replace categorical columns with dummy (one-hot) columns.

    For every label in ``cat_cols_lst`` that is present in ``df``, the
    original column is dropped and its dummy columns (named
    ``<col>_<level>``) are appended on the right.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    cat_cols_lst : iterable of column labels
        Columns to encode. Labels that are not (or no longer) present in the
        frame are skipped, so a repeated label is harmless.
    dummy_na : bool
        Forwarded to ``pd.get_dummies``; when true an extra indicator column
        for missing values is produced for each encoded column.

    Returns
    -------
    pandas.DataFrame
        The frame with the listed columns replaced by their dummy columns.
    """
    for col in cat_cols_lst:
        # Skip only the expected "nothing to encode" case. Any other failure
        # is a real error and must surface instead of being swallowed.
        if col not in df.columns:
            continue

        dummies = pd.get_dummies(df[col], prefix=col, prefix_sep='_', dummy_na=dummy_na)
        # Drop the original column and append its dummy columns.
        df = pd.concat([df.drop(col, axis=1), dummies], axis=1)
    return df

In [107]:
# A linear model to estimate expected salary after graduation. 
def pred_model(df, response_col, cat_cols, dummy_na, test_size=.2, rand_state=32):
    """Fit a linear regression on ``df`` and report train/test r-squared.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the predictors and the response column.
    response_col : column label
        Column of ``df`` used as the response; every other column is a predictor.
    cat_cols, dummy_na
        Accepted for interface compatibility; not used by this function.
    test_size : float
        Passed to ``train_test_split`` as ``test_size``.
    rand_state : int
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(test_r2, train_r2, lm)`` where ``lm`` is the fitted model.
    """
    # Separate predictors from the response.
    predictors = df.drop(response_col, axis=1)
    response = df[response_col]

    # Hold out part of the rows for evaluation.
    split = train_test_split(predictors, response, test_size=test_size, random_state=rand_state)
    X_train, X_test, y_train, y_test = split

    # Fit on the training portion only.
    lm = LinearRegression().fit(X_train, y_train)

    # Score the model on both portions.
    test_r2 = r2_score(y_test, lm.predict(X_test))
    train_r2 = r2_score(y_train, lm.predict(X_train))

    return test_r2, train_r2, lm


In [110]:
def main(data_path='./dataset/stackoverflow_data/survey_results_public.csv'):
    """Load the survey data, fit the expected-salary model and print its r-squared.

    Parameters
    ----------
    data_path : str
        Location of the survey CSV. Defaults to the path used by this notebook.
    """
    df = pd.read_csv(data_path)

    # Each column is selected exactly once; repeating a label would create
    # duplicate column names in the working frame.
    selected_cols = ['University', 'YearsCodedJob', 'YearsProgram', 'MajorUndergrad',
                     'YearsCodedJobPast', 'Salary', 'ExpectedSalary']
    working_df = df[selected_cols]

    drop_sal_df = working_df.dropna(subset=['ExpectedSalary'], axis=0)  # drop rows with NaN response variable
    drop_sal_df = drop_sal_df.dropna(how='all', axis=1)  # drop columns with all NaNs

    # Object-dtype columns are treated as categorical and dummy-encoded.
    cat_cols_lst = list(drop_sal_df.select_dtypes(include=['object']).columns)
    df_new = create_dummy_var(drop_sal_df, cat_cols_lst, dummy_na=False)

    test_r2, train_r2, lm = pred_model(df_new, 'ExpectedSalary', cat_cols_lst, dummy_na=False)
    print("The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(train_r2, test_r2))
    print("The model has underfitting problem")

In [111]:
# Run the analysis only when this code is executed directly (as a script or
# in the notebook kernel), not when it is imported as a module.
if __name__ == "__main__":
    main()

The rsquared on the training data was 0.047292496746298096.  The rsquared on the test data was 0.005812837468330723.
The model has underfitting problem
