In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
%matplotlib inline

df = pd.read_csv('./survey_results_public.csv')
df.head()

Unnamed: 0,Respondent,Professional,ProgramHobby,Country,University,EmploymentStatus,FormalEducation,MajorUndergrad,HomeRemote,CompanySize,...,StackOverflowMakeMoney,Gender,HighestEducationParents,Race,SurveyLong,QuestionsInteresting,QuestionsConfusing,InterestedAnswers,Salary,ExpectedSalary
0,1,Student,"Yes, both",United States,No,"Not employed, and not looking for work",Secondary school,,,,...,Strongly disagree,Male,High school,White or of European descent,Strongly disagree,Strongly agree,Disagree,Strongly agree,,
1,2,Student,"Yes, both",United Kingdom,"Yes, full-time",Employed part-time,Some college/university study without earning ...,Computer science or software engineering,"More than half, but not all, the time",20 to 99 employees,...,Strongly disagree,Male,A master's degree,White or of European descent,Somewhat agree,Somewhat agree,Disagree,Strongly agree,,37500.0
2,3,Professional developer,"Yes, both",United Kingdom,No,Employed full-time,Bachelor's degree,Computer science or software engineering,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A professional degree,White or of European descent,Somewhat agree,Agree,Disagree,Agree,113750.0,
3,4,Professional non-developer who sometimes write...,"Yes, both",United States,No,Employed full-time,Doctoral degree,A non-computer-focused engineering discipline,"Less than half the time, but at least one day ...","10,000 or more employees",...,Disagree,Male,A doctoral degree,White or of European descent,Agree,Agree,Somewhat agree,Strongly agree,,
4,5,Professional developer,"Yes, I program as a hobby",Switzerland,No,Employed full-time,Master's degree,Computer science or software engineering,Never,10 to 19 employees,...,,,,,,,,,,


In [4]:
df['Salary']

0             NaN
1             NaN
2        113750.0
3             NaN
4             NaN
           ...   
19097         NaN
19098         NaN
19099         NaN
19100    110000.0
19101         NaN
Name: Salary, Length: 19102, dtype: float64

In [6]:
df = df.dropna(subset=['Salary'], axis=0)
y = df['Salary']
y

2        113750.00000
14       100000.00000
17       130000.00000
18        82500.00000
22       100764.00000
             ...     
19079     65000.00000
19086     80645.16129
19088     41250.00000
19089     50500.00000
19100    110000.00000
Name: Salary, Length: 5009, dtype: float64

In [9]:
def clean_data(df):
    '''
    INPUT
    df - pandas dataframe 
    
    OUTPUT
    X - A matrix holding all of the variables you want to consider when predicting the response
    y - the corresponding response vector
    
    This function cleans df using the following steps to produce X and y:
    1. Drop all the rows with no salaries
    2. Create X as all the columns that are not the Salary column
    3. Create y as the Salary column
    4. Drop the Salary, Respondent, and the ExpectedSalary columns from X
    5. For each numeric variable in X, fill the column with the mean value of the column.
    6. Create dummy columns for all the categorical variables in X, drop the original columns
    '''
    # Drop rows with missing salary values
    df = df.dropna(subset=['Salary'], axis=0)
    y = df['Salary']
    
    # Drop respondent and expected salary columns
    df = df.drop(['Respondent', 'ExpectedSalary', 'Salary'], axis=1)
    
    # fill numerict columns with the mean
    num_vars = df.select_dtypes(include=['float', 'int']).columns
    for col in num_vars:
        df[col].fillna((df[col].mean()), inplace=True)
        
    # Dummy the categorical variables
    cat_vars = df.select_dtypes(include=['object']).copy().columns
    for var in cat_vars:
        # for each cat add dummy_var, drop original column
        df = pd.concat([df.drop(var, axis=1), pd.get_dummies(df[var], prefix=var, prefix_sep='_', drop_first=True)], axis=1)
    
    X = df
    return X, y

#use the function to create X and y
X, y = clean_data(df)

In [10]:
X

Unnamed: 0,CareerSatisfaction,JobSatisfaction,HoursPerWeek,StackOverflowSatisfaction,"ProgramHobby_Yes, I contribute to open source projects","ProgramHobby_Yes, I program as a hobby","ProgramHobby_Yes, both",Country_Albania,Country_Argentina,Country_Armenia,...,QuestionsInteresting_Strongly agree,QuestionsInteresting_Strongly disagree,QuestionsConfusing_Disagree,QuestionsConfusing_Somewhat agree,QuestionsConfusing_Strongly agree,QuestionsConfusing_Strongly disagree,InterestedAnswers_Disagree,InterestedAnswers_Somewhat agree,InterestedAnswers_Strongly agree,InterestedAnswers_Strongly disagree
2,8.0,9.0,2.447415,8.000000,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
14,8.0,8.0,2.447415,8.000000,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
17,9.0,8.0,2.447415,8.000000,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
18,5.0,3.0,2.447415,8.442686,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,8.0,9.0,2.447415,8.000000,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19079,10.0,8.0,0.000000,10.000000,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
19086,10.0,8.0,2.447415,10.000000,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
19088,8.0,8.0,2.447415,9.000000,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
19089,7.0,6.0,30.000000,9.000000,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [11]:
y

2        113750.00000
14       100000.00000
17       130000.00000
18        82500.00000
22       100764.00000
             ...     
19079     65000.00000
19086     80645.16129
19088     41250.00000
19089     50500.00000
19100    110000.00000
Name: Salary, Length: 5009, dtype: float64

In [12]:
def coef_weights(coefficients, X_train):
    '''
    INPUT:
    coefficients - the coefficients of the linear model 
    X_train - the training data, so the column names can be used
    OUTPUT:
    coefs_df - a dataframe holding the coefficient, estimate, and abs(estimate)
    
    Provides a dataframe that can be used to understand the most influential coefficients
    in a linear model by providing the coefficient estimates along with the name of the 
    variable attached to the coefficient.
    '''
    coefs_df = pd.DataFrame()
    coefs_df['est_int'] = X_train.columns
    coefs_df['coefs'] = lm_model.coef_
    coefs_df['abs_coefs'] = np.abs(lm_model.coef_)
    coefs_df = coefs_df.sort_values('abs_coefs', ascending=False)
    return coefs_df

#Use the function
coef_df = coef_weights(lm_model.coef_, X_train)

# A quick look at the top results
coef_df.head(20)

NameError: name 'lm_model' is not defined