In [28]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn_pandas import DataFrameMapper
import sklearn.preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import string

In [29]:
jobs = pd.read_csv('./datasets/jobsclean.csv')
jobs.drop([jobs.columns[0]], axis=1, inplace=True)

In [30]:
jobs.head()

Unnamed: 0,company,details,title,state,experience_,salary_
0,eliassen group,other great industries information technology ...,data scientist,NC,,
1,rang technologies,computer software information technology roles...,data scientist,IL,,
2,austin fraser,computer software information technology austi...,data scientist,TX,,132500.0
3,frg technology consulting,computer software information technology data ...,data scientist,MA,,195000.0
4,roc search inc,computer software information technology data ...,data scientist,TX,5.0,115000.0


In [31]:
jobs.drop(jobs[jobs['salary_'].isnull()].index,inplace=True)
jobs['salary_']= jobs['salary_'].astype(float)

In [32]:
jobs[jobs['experience_'].isnull()].count()

company        493
details        493
title          493
state          493
experience_      0
salary_        493
dtype: int64

In [33]:
jobs.loc[jobs['experience_'].isnull(), 'experience_'] = jobs.experience_.median()

In [34]:
jobs.head()

Unnamed: 0,company,details,title,state,experience_,salary_
2,austin fraser,computer software information technology austi...,data scientist,TX,5.0,132500.0
3,frg technology consulting,computer software information technology data ...,data scientist,MA,5.0,195000.0
4,roc search inc,computer software information technology data ...,data scientist,TX,5.0,115000.0
5,synergy business consulting inc,travel information technology data scientist p...,data scientist,FL,2.0,100000.0
6,catapult staffing,energy utilities gas electric information t...,data scientist,CA,5.0,152482.5


In [35]:
jobs.shape

(1251, 6)

In [36]:
dummy = pd.get_dummies(jobs['state'], drop_first=True, prefix = 'state')
jobs.drop(['state'],axis = 1,inplace=True)
jobs = pd.concat([jobs, dummy],axis=1)

In [37]:
dummy = pd.get_dummies(jobs['company'], drop_first=True, prefix='coy')
jobs.drop('company',axis=1,inplace=True)
jobs = pd.concat([jobs, dummy],axis=1)

In [38]:
dummy = pd.get_dummies(jobs['title'], drop_first=True, prefix='titl')
jobs.drop('title',axis=1,inplace=True)
jobs = pd.concat([jobs, dummy],axis=1)

In [39]:
# To pick out columns with missing values and their numbers

for i,e in enumerate(jobs.columns):
    if jobs[e].isnull().sum() >0:
        print ((jobs.columns[i]),(jobs[e].isnull().sum()))

In [40]:
stop = stopwords.words('english')
jobs['details'] = jobs['details'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [41]:
vectorizer = TfidfVectorizer(ngram_range=(1, 3),min_df=5)
X=vectorizer.fit_transform(list(jobs['details']))
counts=pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
jobs.drop('details',axis=1,inplace=True)

In [42]:
counts.shape

(1251, 14580)

In [43]:
counts.transpose().sort_values(0, ascending=False).transpose().head(10)

Unnamed: 0,data,econometrics,statistics,austin,austin fraser,fraser,operations research,language processing,natural language processing,mathematical,...,experience ability,experience accounting,experience active,experience advanced,experience agile,experience agile development,experience amazon,experience amazon web,experience analyzing,zookeeper plus proven
0,0.262578,0.205829,0.172667,0.16221,0.13722,0.13722,0.118883,0.114682,0.114682,0.113445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.238871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.17948,0.0,0.055323,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.235082,0.0,0.05797,0.0,0.0,0.0,0.039913,0.038502,0.038502,0.076174,...,0.0,0.0,0.0,0.0,0.035141,0.0,0.0,0.0,0.0,0.0
4,0.092274,0.0,0.075848,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.124961,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.24436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.141453,0.0,0.058136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.148396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.072398,0.0,0.0,0.0,0.0,0.0,0.0,0.177863,0.177863,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
word_counts = counts.sum(axis=0)
word_counts.sort_values(ascending = False).head(20)

data            64.525599
experience      62.561814
business        43.836764
technology      36.197893
software        34.250591
work            32.053455
development     29.932651
skills          28.240042
years           25.463195
financial       25.293660
team            24.913106
systems         24.796351
management      24.380949
job             23.946325
ability         23.533470
design          23.068600
requirements    22.989832
information     22.668541
support         22.631913
sql             22.508394
dtype: float64

In [45]:
jobs.reset_index(drop=True, inplace=True) 

In [46]:
jobs = pd.concat([jobs, counts],axis=1)

In [47]:
jobs.shape

(1251, 14967)

In [48]:
from sklearn.model_selection import train_test_split

predictors = list(jobs.columns)
predictors.remove('salary_')

# X is a matrix, hence we use [] to access the features we want in feature_cols
X = jobs[predictors]

# y is a vector, hence we use dot to access 'label'
y = jobs.salary_.values

# 80-20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)

In [49]:
# Standard Scaling
from sklearn.preprocessing import StandardScaler

# Initialize the scaler.
ss = StandardScaler()

# Fit the data using the scaler (scale the data).
Xs = ss.fit_transform(X_train.values)
Xst = ss.fit_transform(X_test.values)

In [50]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LinearRegression

lm = LinearRegression()

scores = cross_val_score(lm, Xs, y_train, cv=5)
print scores
print np.mean(scores)

[ 0.4819592   0.3006032   0.42570226  0.550452    0.49170158]
0.450083648461


In [51]:
from sklearn.metrics import mean_squared_error, r2_score

model = lm.fit(X_train,y_train)
predictions = model.predict(X_test)

RMSE = mean_squared_error(y_test, predictions) # If y_test exisits
r2 = r2_score(y_test, predictions) # If y_test exisits

In [52]:
RMSE

1084351738.6667855

In [53]:
r2

0.53054748904159232

In [633]:
ar = AdaBoostRegressor(n_estimators=50,
                         learning_rate=1,
                         random_state=0)

scores = cross_val_score(ar, Xs, y_train, cv=5)
print scores
print np.mean(scores)

[ 0.32756932  0.24257611  0.16956321  0.31781049  0.22311612]
0.256127051097


In [634]:
model = ar.fit(X_train,y_train)
predictions = model.predict(Xst)

RMSE = mean_squared_error(y_test, predictions) # If y_test exisits
r2 = r2_score(y_test, predictions) # If y_test exisits

In [635]:
RMSE

1499028662.2236264

In [636]:
r2

0.35101983573638751