In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.metrics import adjusted_rand_score

In [6]:
# verifying CSV file location
!ls ../indeed_data_science_exercise/

Indeed_DS_Exercise.zip        train_features_2013-03-07.csv
salary_data.csv               train_salaries_2013-03-07.csv
test_features_2013-03-07.csv


In [12]:
df = pd.read_csv('../indeed_data_science_exercise/salary_data.csv', index_col=0)

In [29]:
df.head()

Unnamed: 0,jobId,salary,companyId,jobType,degree,major,industry,yearsExperience,milesFromMetropolis,companyId_COMP0,...,major_MATH,major_NONE,major_PHYSICS,industry_AUTO,industry_EDUCATION,industry_FINANCE,industry_HEALTH,industry_OIL,industry_SERVICE,industry_WEB
0,JOB1362684407687,130,COMP37,CFO,MASTERS,MATH,HEALTH,10,83,0,...,1,0,0,0,0,0,1,0,0,0
1,JOB1362684407688,101,COMP19,CEO,HIGH_SCHOOL,NONE,WEB,3,73,0,...,0,1,0,0,0,0,0,0,0,1
2,JOB1362684407689,137,COMP52,VICE_PRESIDENT,DOCTORAL,PHYSICS,HEALTH,10,38,0,...,0,0,1,0,0,0,1,0,0,0
3,JOB1362684407690,142,COMP38,MANAGER,DOCTORAL,CHEMISTRY,AUTO,8,17,0,...,0,0,0,1,0,0,0,0,0,0
4,JOB1362684407691,163,COMP7,VICE_PRESIDENT,BACHELORS,PHYSICS,FINANCE,8,16,0,...,0,0,1,0,0,1,0,0,0,0


In [32]:
# X not including target, companyId and all encoded columns
X = df.drop(['jobId', 'companyId', 'salary', 'jobType', 'degree', 'major', 
                    'industry'], axis=1)
# salary is the target
y = df['salary']

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [43]:
# Standardize X_train and X_test to transform yearsExperience and milesFromMetropolis to the same scale
ss = StandardScaler()
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.fit_transform(X_test)

In [75]:
lr = LinearRegression(n_jobs=2)
lr = lr.fit(X_train_ss, y_train)
print('Linear Regression Model R2 = {:.3} +/- {:.3} 3 std ='.format(cross_val_score(lr, 
                                                                                X_test_ss, y_test, 
                                                                                cv = 10).mean(),
                                                               (cross_val_score(lr, X_test_ss, y_test, 
                                                                                cv = 10).std()*3)))

Linear Regression Model R2 = 0.744 +/- 0.00468 3 std =


In [57]:
features = pd.Series(X_train.columns.tolist())
coef = pd.Series(lr.coef_)

In [69]:
fi = pd.concat([features, coef], axis=1, keys=['feature', 'beta']).set_index('feature')

In [79]:
fi['abs_beta'] = abs(fi['beta'])

In [83]:
fi['abs_beta']

feature
yearsExperience           4.147472e+13
milesFromMetropolis       4.146661e+13
companyId_COMP0           3.712323e+13
companyId_COMP1           3.711040e+13
companyId_COMP10          3.707851e+13
companyId_COMP11          9.402836e+12
companyId_COMP12          4.822366e+12
companyId_COMP13          4.821786e+12
companyId_COMP14          4.819682e+12
companyId_COMP15          4.816472e+12
companyId_COMP16          4.815810e+12
companyId_COMP17          4.812012e+12
companyId_COMP18          4.809256e+12
companyId_COMP19          4.434845e+12
companyId_COMP2           4.428076e+12
companyId_COMP20          4.426660e+12
companyId_COMP21          4.421345e+12
companyId_COMP22          4.420179e+12
companyId_COMP23          4.419571e+12
companyId_COMP24          4.418659e+12
companyId_COMP25          4.410334e+12
companyId_COMP26          3.971615e+12
companyId_COMP27          3.968523e+12
companyId_COMP28          3.962504e+12
companyId_COMP29          3.960610e+12
companyId_COMP3  

In [82]:
fi['abs_beta'].sort_values('abs_beta', ascending=False)[:10].plot(kind='bar')

ValueError: No axis named abs_beta for object type <class 'pandas.core.series.Series'>

In [56]:
lr.coef_

array([-4.14747187e+13, -4.14666089e+13, -3.71232285e+13, -3.71103973e+13,
       -3.70785113e+13, -9.40283646e+12, -4.82236636e+12, -4.82178559e+12,
       -4.81968165e+12, -4.81647217e+12, -4.81580969e+12, -4.81201206e+12,
       -4.80925561e+12, -4.43484467e+12, -4.42807600e+12, -4.42665998e+12,
       -4.42134469e+12, -4.42017930e+12, -4.41957111e+12, -4.41865862e+12,
       -4.41033373e+12, -3.97161477e+12, -3.96852302e+12, -3.96250389e+12,
       -3.96061015e+12, -3.95613018e+12, -3.95492311e+12, -3.94922733e+12,
       -3.94888185e+12, -3.94784522e+12, -3.94611684e+12, -3.94300373e+12,
       -3.94075375e+12, -3.93971483e+12, -3.93954165e+12, -3.93832915e+12,
       -3.93746283e+12, -3.93676963e+12, -3.93538285e+12, -3.93538285e+12,
       -3.93520946e+12, -3.93486267e+12, -3.93434242e+12, -3.93399554e+12,
       -3.93278122e+12, -3.92965683e+12, -3.92930952e+12, -3.92913584e+12,
       -3.92861478e+12, -3.92826737e+12, -3.92757244e+12, -3.92722492e+12,
       -3.92722492e+12, -