# Project:
# Web Scraping Job Postings and Predicting Salary and Job Categories



In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
#from HTMLParser import HTMLParser
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
#from nltk.corpus import stopwords
#from nltk.tag import pos_tag
#from nltk.tokenize import WordPunctTokenizer
from textblob import TextBlob
from textacy.preprocess import preprocess_text
from sklearn.feature_extraction.text import TfidfVectorizer

import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style('whitegrid')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import patsy
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from scipy.spatial.distance import pdist
import os
from gensim import corpora, models, matutils
from collections import defaultdict

In [2]:
df = pd.read_csv('./formatted_job_listings.csv', sep=',', encoding='utf-8', index_col=None)

df.shape

(1960, 39)

In [3]:
df.head()

Unnamed: 0,city,company,jobkey,jobtitle,snippet,url,search_term,company_abbrev,jobtitle_length,snippet_clean,jobtitle_clean,jobtitle_noun_phrases,snippet_noun_phrases,jobtitle_words,snippet_words,jobtitle_stem,snippet_stem,snippet_stem_string,jobtitle_stem_string,data,advanc,experienc,role,team,look,princip,level,play,scienc,lead,statist,scientist,maintain,develop,JT_princip,JT_scientist,JT_data,salary_estimate,salary_categories
0,Redmond,Microsoft,8f2960110f91289e,Principal Data Scientist,We are looking for an experienced principal le...,http://www.indeed.com/viewjob?jk=8f2960110f912...,data science,Microsoft,3,we are looking for an experienced principal le...,principal data scientist,[data scientist],"[principal level data scientist, lead role, da...","[principal, data, scientist]","[we, are, looking, for, an, experienced, princ...","[princip, data, scientist]","[we, are, look, for, an, experienc, princip, l...",we are look for an experienc princip level dat...,princip data scientist,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,132071,2
1,Seattle,Facebook,c822b64a1008e72a,"Data Scientist, Analytics",The <b>Data</b> <b>Scientist</b> Analytics rol...,http://www.indeed.com/viewjob?jk=c822b64a1008e...,data science,Facebook,3,the data scientist analytics role has work acr...,data scientist analytics,"[data scientist, analytics]","[data scientist analytics role, areas building...","[data, scientist, analytics]","[the, data, scientist, analytics, role, has, w...","[data, scientist, analyt]","[the, data, scientist, analyt, role, has, work...",the data scientist analyt role has work across...,data scientist analyt,2,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,132071,2
2,Seattle,Amazon Corporate LLC,088d06342a3c0613,Data Scientist,Amazon is seeking a <b>Data</b> <b>Scientist</...,http://www.indeed.com/viewjob?jk=088d06342a3c0...,data science,Amazon,2,amazon is seeking a data scientist to join the...,data scientist,[data scientist],"[data scientist, previous experience, data sci...","[data, scientist]","[amazon, is, seeking, a, data, scientist, to, ...","[data, scientist]","[amazon, is, seek, a, data, scientist, to, joi...",amazon is seek a data scientist to join the co...,data scientist,2,0,0,1,1,0,0,0,0,0,0,0,2,0,0,0,1,1,132071,2
3,Seattle,Amazon Corporate LLC,219629e50af0388b,Data Engineer,3+ years in relevant experience as <b>data</b>...,http://www.indeed.com/viewjob?jk=219629e50af03...,data science,Amazon,2,3 years in relevant experience as data enginee...,data engineer,[data engineer],"[relevant experience, data engineer data scien...","[data, engineer]","[3, years, in, relevant, experience, as, data,...","[data, engin]","[3, year, in, relev, experi, as, data, engin, ...",3 year in relev experi as data engin data scie...,data engin,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,132071,2
4,Renton,XDuce,020aaa78301c721d,Associate Technical Architect,Senior programmer or <b>Data</b> <b>scientist<...,http://www.indeed.com/viewjob?jk=020aaa78301c7...,data science,XDuce,3,senior programmer or data scientist experience...,associate technical architect,[associate technical architect],"[senior programmer, data scientist experience,...","[associate, technical, architect]","[senior, programmer, or, data, scientist, expe...","[associ, technic, architect]","[senior, programm, or, data, scientist, experi...",senior programm or data scientist experi with ...,associ technic architect,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,132071,2


# Modeling

## Is a job at Amazon or not?

#### Since almost 40% of the job listings I obtained are for Amazon, I thought it would be interesting to see if I can predict whether a given listing is from Amazon or not.

In [107]:
# Creating a target column

df['Amazon'] = df['company_abbrev'].map(lambda x: 1 if x == 'Amazon' else 0)

df.head()

Unnamed: 0,city,company,jobkey,jobtitle,snippet,url,search_term,company_abbrev,jobtitle_length,snippet_clean,jobtitle_clean,jobtitle_noun_phrases,snippet_noun_phrases,jobtitle_words,snippet_words,jobtitle_stem,snippet_stem,snippet_stem_string,jobtitle_stem_string,data,advanc,experienc,role,team,look,princip,level,play,scienc,lead,statist,scientist,maintain,develop,JT_princip,JT_scientist,JT_data,salary_estimate,salary_categories,Amazon
0,Redmond,Microsoft,8f2960110f91289e,Principal Data Scientist,We are looking for an experienced principal le...,http://www.indeed.com/viewjob?jk=8f2960110f912...,data science,Microsoft,3,we are looking for an experienced principal le...,principal data scientist,[data scientist],"[principal level data scientist, lead role, da...","[principal, data, scientist]","[we, are, looking, for, an, experienced, princ...","[princip, data, scientist]","[we, are, look, for, an, experienc, princip, l...",we are look for an experienc princip level dat...,princip data scientist,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,132071,2,0
1,Seattle,Facebook,c822b64a1008e72a,"Data Scientist, Analytics",The <b>Data</b> <b>Scientist</b> Analytics rol...,http://www.indeed.com/viewjob?jk=c822b64a1008e...,data science,Facebook,3,the data scientist analytics role has work acr...,data scientist analytics,"[data scientist, analytics]","[data scientist analytics role, areas building...","[data, scientist, analytics]","[the, data, scientist, analytics, role, has, w...","[data, scientist, analyt]","[the, data, scientist, analyt, role, has, work...",the data scientist analyt role has work across...,data scientist analyt,2,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,132071,2,0
2,Seattle,Amazon Corporate LLC,088d06342a3c0613,Data Scientist,Amazon is seeking a <b>Data</b> <b>Scientist</...,http://www.indeed.com/viewjob?jk=088d06342a3c0...,data science,Amazon,2,amazon is seeking a data scientist to join the...,data scientist,[data scientist],"[data scientist, previous experience, data sci...","[data, scientist]","[amazon, is, seeking, a, data, scientist, to, ...","[data, scientist]","[amazon, is, seek, a, data, scientist, to, joi...",amazon is seek a data scientist to join the co...,data scientist,2,0,0,1,1,0,0,0,0,0,0,0,2,0,0,0,1,1,132071,2,1
3,Seattle,Amazon Corporate LLC,219629e50af0388b,Data Engineer,3+ years in relevant experience as <b>data</b>...,http://www.indeed.com/viewjob?jk=219629e50af03...,data science,Amazon,2,3 years in relevant experience as data enginee...,data engineer,[data engineer],"[relevant experience, data engineer data scien...","[data, engineer]","[3, years, in, relevant, experience, as, data,...","[data, engin]","[3, year, in, relev, experi, as, data, engin, ...",3 year in relev experi as data engin data scie...,data engin,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,132071,2,1
4,Renton,XDuce,020aaa78301c721d,Associate Technical Architect,Senior programmer or <b>Data</b> <b>scientist<...,http://www.indeed.com/viewjob?jk=020aaa78301c7...,data science,XDuce,3,senior programmer or data scientist experience...,associate technical architect,[associate technical architect],"[senior programmer, data scientist experience,...","[associate, technical, architect]","[senior, programmer, or, data, scientist, expe...","[associ, technic, architect]","[senior, programm, or, data, scientist, experi...",senior programm or data scientist experi with ...,associ technic architect,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,132071,2,0


In [108]:
y_amzn = df['Amazon'].values
X_amzn = cvec_sn_df

### Let's try a naive Bayes classifier using BernoulliNB since it's a binary classification problem.

In [109]:
bnb_amzn = MultinomialNB().fit(X_amzn, y_amzn)

In [110]:
bnb_amzn_scores = cross_val_score(bnb_amzn, X_amzn, y_amzn, cv=4)

print bnb_amzn_scores, np.mean(bnb_amzn_scores)


[ 0.59063136  0.73319756  0.73824131  0.50715746] 0.642306923394


In [111]:
# The model does significantly better than the 39-40% baseline, but let's see if we can improve on it with a decision
# tree classifier.

In [112]:
dtc_amzn_params = {
    'max_depth':[None,1,2,3,4],
    'max_features':[None,'log2','sqrt',2,3,4,5],
    'min_samples_split':[2,3,4,5,10,15,20,25,30,40,50]
}

dtc_amzn_gs = GridSearchCV(DecisionTreeClassifier(), dtc_amzn_params, cv=5, verbose=1)

In [113]:
dtc_amzn_gs.fit(X_amzn, y_amzn)

Fitting 5 folds for each of 385 candidates, totalling 1925 fits


[Parallel(n_jobs=1)]: Done 1925 out of 1925 | elapsed:   43.9s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': [None, 'log2', 'sqrt', 2, 3, 4, 5], 'min_samples_split': [2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50], 'max_depth': [None, 1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [114]:
dtc_amzn_best = dtc_amzn_gs.best_estimator_
print dtc_amzn_gs.best_params_
print dtc_amzn_gs.best_score_

{'max_features': None, 'min_samples_split': 25, 'max_depth': 3}
0.705612244898


In [115]:
# That is better. Let's see the feature importances:

fi_amzn = pd.DataFrame({
        'feature':X_amzn.columns,
        'importance':dtc_amzn_best.feature_importances_
    })

fi_amzn.sort_values('importance', ascending=False, inplace=True)

fi_amzn.head(10)

Unnamed: 0,feature,importance
44,amazon,0.63022
887,success,0.174819
954,use,0.148403
676,portal,0.023471
745,recommend,0.023088
0,10,0.0
666,place,0.0
668,platform,0.0
667,plan,0.0
665,pipelin,0.0


In [116]:
# Not surprisingly, having the word 'Amazon' in a snippet is the best indicator that a job listing is for Amazon.
# 'Success' and 'use,' and to a lesser extent, 'portal' and 'recommend' also determine whether a job is at Amazon.