# Project:
# Web Scraping Job Postings and Predicting Salary and Job Categories



In [None]:
### The outputs make this notebook too large to upload to GitHub, so the version here does not contain the outputs.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
#from HTMLParser import HTMLParser
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
#from nltk.corpus import stopwords
#from nltk.tag import pos_tag
#from nltk.tokenize import WordPunctTokenizer
from textblob import TextBlob
from textacy.preprocess import preprocess_text
from sklearn.feature_extraction.text import TfidfVectorizer

import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's 'whitegrid' style for all plots.
sns.set_style('whitegrid')

# Ask the inline backend for 'retina' figure output and render matplotlib
# figures inline in the notebook.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# Raise the pandas display limits so up to 999 rows / 999 columns are shown
# before a frame is truncated.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import patsy
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from scipy.spatial.distance import pdist
import os
from gensim import corpora, models, matutils
from collections import defaultdict

In [None]:
# Load the formatted job listings: comma-separated, UTF-8 encoded, with no
# column promoted to the index.
df = pd.read_csv(
    './formatted_job_listings.csv',
    encoding='utf-8',
    index_col=None,
    sep=',',
)

# Display (rows, columns) as the cell output.
df.shape

In [None]:
# Preview the first five rows of the listings.
df.head(n=5)

# Modeling

## Salary Prediction: Which factors predict salary estimate classes?

#### Since I'm looking for feature importances, I'll try a DecisionTreeClassifier first.

In [None]:
# Build the target (y) and predictor (X) frames with patsy. The formula marks
# city and company_abbrev as categorical via C(...), and adds jobtitle_length
# plus the binary columns from the vector counts on the snippets and on the
# job titles (the JT_-prefixed terms).

y, X = patsy.dmatrices(
    'salary_categories ~ C(city) + C(company_abbrev) + jobtitle_length'
    ' + data + experienc + statist + maintain + scienc + princip + scientist'
    ' + look + level + play + team + lead + develop + advanc + role'
    ' + JT_princip + JT_data + JT_scientist',
    data=df,
    return_type='dataframe',
)

In [None]:
# Preview the design matrix. Only the first rows are shown: with the display
# limit raised to 999 rows, rendering the whole frame bloats the saved notebook.
X.head()

In [None]:
# Flatten the target into a 1-D array. np.ravel accepts both the patsy
# DataFrame and an already-flattened ndarray, so re-running this cell does not
# raise AttributeError the way y.values.ravel() does once y is an ndarray.

y = np.ravel(y)

In [None]:
# Candidate DecisionTreeClassifiers with max_depth of 1, 2, 3 and None, all
# built with random_state=1. Their scores inform the parameter choices for the
# GridSearch further down.
tree_cl1, tree_cl2, tree_cl3, tree_clN = (
    DecisionTreeClassifier(max_depth=depth_limit, random_state=1)
    for depth_limit in (1, 2, 3, None)
)

In [None]:
# Fit every candidate tree on the full predictor matrix and target.
for _candidate_tree in (tree_cl1, tree_cl2, tree_cl3):
    _candidate_tree.fit(X, y)

# Kept as the cell's final expression so the fitted estimator is still displayed.
tree_clN.fit(X, y)

In [None]:
# Cross-validated scores (cv=4) for each candidate tree.
tree_cl1_scores = cross_val_score(tree_cl1, X, y, cv=4)
tree_cl2_scores = cross_val_score(tree_cl2, X, y, cv=4)
tree_cl3_scores = cross_val_score(tree_cl3, X, y, cv=4)
tree_clN_scores = cross_val_score(tree_clN, X, y, cv=4)

# Report the per-fold scores followed by their mean. The parenthesised,
# %-formatted form prints the same "scores mean" line under Python 2 and
# Python 3, whereas the bare print statement is a SyntaxError on Python 3.
print('%s %s' % (tree_cl1_scores, np.mean(tree_cl1_scores)))
print('%s %s' % (tree_cl2_scores, np.mean(tree_cl2_scores)))
print('%s %s' % (tree_cl3_scores, np.mean(tree_cl3_scores)))
print('%s %s' % (tree_clN_scores, np.mean(tree_clN_scores)))

In [None]:
# Parameter grid searched for the decision tree.
dtc_params = {
    'max_depth':[None,1,2,3,4],
    'max_features':[None,'log2','sqrt',2,3,4,5],
    'min_samples_split':[2,3,4,5,10,15,20,25,30,40,50]
}

# Seed the base estimator (same seed as the candidate trees). When max_features
# restricts the features considered per split, the subset is drawn at random,
# so an unseeded tree makes the selected parameters and score change from run
# to run.
dtc_gs = GridSearchCV(DecisionTreeClassifier(random_state=1), dtc_params, cv=5, verbose=1)

In [None]:
# Run the grid search over the predictor matrix and target; the fitted search
# object is displayed as the cell output.
dtc_gs.fit(X, y=y)

In [None]:
# Keep the best DecisionTreeClassifier found by the grid search and report its
# parameters and score. Single-argument print(...) behaves identically under
# Python 2 and Python 3; the bare print statement is a SyntaxError on Python 3.

dtc_best = dtc_gs.best_estimator_
print(dtc_gs.best_params_)
print(dtc_gs.best_score_)

In [None]:
# Draw the best decision tree: export it as Graphviz DOT text into an in-memory
# buffer, then turn that text into a PNG and display it inline.

dot_data = StringIO()

export_graphviz(
    dtc_best,
    out_file=dot_data,
    feature_names=X.columns,
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [None]:
# Tabulate the best tree's feature importances, largest first, and display the
# resulting table.

fi = pd.DataFrame({
    'feature': X.columns,
    'importance': dtc_best.feature_importances_,
}).sort_values('importance', ascending=False)

fi

In [None]:
# The number of times that the word 'data' appears in the job snippet is the most important feature in predicting
# salary. If 'data' appears 0-1 times, then the salary is most likely in one of the highest two categories. In this
# case, if the hiring company is Tableau, the salary is likely to be in the second highest rather than the highest
# category.

# If the word 'data' appears two or more times in the snippet, then the next most important feature is whether
# the word 'scientist' appears in the job title. If it does, and the hiring organization is the University of
# Washington, then the salary is likely in the lowest category. If 'scientist' is in the job title but
# the employer is not UW, then the salary is likely in the middle category if the job title has eight words or fewer
# or the highest category if the title has more than eight words. And so on.

# If 'data' appears in the listing at least twice but 'scientist' is not in the title, then having a job title
# containing 'data' suggests a higher salary, in the second lowest category if the job is at Amazon and potentially
# higher if it is not. If 'data' does not appear in the job title, then the salary is likely in the lowest category.

#### Next let's try Naive Bayes with MultinomialNB, since we're classifying into discrete categories.

In [None]:
# Fit a multinomial Naive Bayes classifier on the same predictors and target.
mnb = MultinomialNB()
mnb = mnb.fit(X, y)

In [None]:
# Cross-validated scores (cv=4) for the Naive Bayes classifier.
mnb_scores = cross_val_score(mnb, X, y, cv=4)

# Print the per-fold scores and their mean. The parenthesised, %-formatted form
# gives the same line under Python 2 and Python 3; the bare print statement is
# a SyntaxError on Python 3.
print('%s %s' % (mnb_scores, np.mean(mnb_scores)))


In [None]:
# The MultinomialNB did not score quite as well as the optimal decision tree.