 Project 4
---

---

**Importing modules**

---

In [3]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import requests
import pprint

from bs4 import BeautifulSoup

import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

---

**Scraping**

Unfortunately since I'm not too familiar with scrapy and jupyter notebook, I dont know how to integrate the scrapy spider into the notebook. However, the below cell contains the code for spider scraper

---

In [4]:
import scrapy, re
from bs4 import BeautifulSoup
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class IndeedSpider(CrawlSpider):
    name = 'indeed'
    allowed_domains = ['au.indeed.com']

    pay = 60000

    urls = ['https://au.indeed.com/jobs?q=data+scientist+%24{}+-+%24{}'.format(x-10000, x) for x in range(50000,210000,10000)]

    start_urls = urls

    rules = (
        Rule(LinkExtractor("start\=.*$"), callback='parse_item', follow = True),
    )    

    def parse_item(self, response):
        self.log('------------------------------------')
        self.log('I just visited: ' + response.url)
        self.log('------------------------------------')

        soup = BeautifulSoup(response.text, 'lxml')
        divs = soup.find_all("div", class_ = ["row", "result"])


        for job in divs:
            
            item = {}

            item["id"] = job.get("id")

            try:
                item["company"] = job.find("span", class_ = "company").text.strip("\n").strip()
            except AttributeError:
                item["company"] = None
                
            item["pay"] = re.search('([0-9]*)&start', response.url).group(1)

            item["title"] = job.a.get("title")

            item["loc"]= job.find("span", class_ ="location").text

            job_summ_page = job.get("data-jk")
            next_page_url = "https://au.indeed.com/viewjob?jk={}&from=tp-serp&tk=1c2i82l0a102q7vk".format(job_summ_page)
            yield scrapy.Request(url = next_page_url, callback = self.parse_summ, meta=dict(item=item))

    def parse_summ(self, response):
        
        item = response.meta['item']
        
        soup = BeautifulSoup(response.text, 'lxml')
        item["summ"] = soup.find("span", class_ = "summary").text.replace("\n", " ")
        yield item

---

**Data**

For the purpose of this project, I will be working on data that has already been scraped into a .json file from the spider. 

---

In [5]:
# Initiates file name: file_name
file_name = "test.json"

In [6]:
# Imports file as dataframe: data
data = pd.read_json(file_name)

# Displays the head of the dataframe
data.head(3)

Unnamed: 0,company,id,loc,pay,summ,title
0,Opus Recruitment Solutions,p_d4996fe3bbcdf3f3,Sydney NSW,120000,My client is a rapidly growing start-up within...,DATA SCIENTIST
1,QBE Insurance,pj_112015665e5a2e01,Sydney NSW,120000,QBE is one of the top 20 global general insure...,Senior Data Scientist
2,Xpand,p_95299eb857a58426,Australia,120000,Senior Data Scientist An exponentially growi...,Senior Data Scientist


In [7]:
# Shape of the data
data.shape

(200, 6)

In [8]:
# Data types
data.dtypes

company    object
id         object
loc        object
pay         int64
summ       object
title      object
dtype: object

In [9]:
# Missing Values
data.isnull().sum()

company    0
id         0
loc        0
pay        0
summ       0
title      0
dtype: int64

In [1]:
#Distribution of pay scale for jobs
data.pay.value_counts()

NameError: name 'data' is not defined

---

**Cleaning Job Titles and Job Summaries**

Do to the inconsistencies with the job titles and job summaries, we are going to clean it up a bit. This involves removing all non-alphanumeric characters and making everything lowercase. 

---

In [11]:
#Import Regex to clean
import re
from nltk.corpus import stopwords

def clean_text(text):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the outputr is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(text).get_text()
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Conbert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4.In Python, searching a set is much faster than searching a list, soconvert the stop words to a set
    stops = set(stopwords.words("english"))
    #
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    #
    # 6. Join the words back into one string seperated by space, 
    # and return the result.
    return( " ".join(meaningful_words))

In [12]:
data.summ.map(clean_text);
data.title.map(clean_text);

---

**Bag of Words model**

Use CountVectorizer to create bag of words for:

- Job titles
- Job summarys

---

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

#Setting the vectorizer just like we would set a model
cvec = CountVectorizer(stop_words = "english", ngram_range=(2,3))

#Fitting the vectorizer on our training data
cvec.fit(data["summary"])

NameError: name 'data' is not defined

In [14]:
print(len(cvec.get_feature_names()))

5893


In [15]:
X_train = pd.DataFrame(cvec.transform(data['summ']).todense(),
                      columns = cvec.get_feature_names())

In [16]:
X_train.shape

(200, 5893)

In [17]:
words_counts = X_train.sum(axis=0)
words_counts.sort_values(ascending = False).head(20)

data           897
experience     624
team           459
work           414
role           359
skills         332
business       313
research       304
working        261
development    250
management     224
science        204
analytics      202
new            201
ability        199
strong         188
analysis       186
apply          186
australia      185
projects       176
dtype: int64

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

vectorizer = TfidfVectorizer(stop_words='english',use_idf=True)
model = vectorizer.fit_transform(data['summ'].str.upper())
km = KMeans(n_clusters=5,init='k-means++',max_iter=200,n_init=1)

k=km.fit(model)
terms = vectorizer.get_feature_names()
order_centroids = km.cluster_centers_.argsort()[:,::-1]
for i in range(5):
    print("cluster of words %d:" %i)
    for ind in order_centroids[i,:10]:
        print(' %s' % terms[ind])
    print() 

cluster of words 0:
 cardiac
 heart
 tasmania
 clinic
 launceston
 echo
 charles
 imaging
 north
 coronary

cluster of words 1:
 data
 analytics
 learning
 experience
 business
 machine
 statistical
 team
 science
 models

cluster of words 2:
 environmental
 research
 experience
 team
 work
 australia
 role
 data
 csiro
 position

cluster of words 3:
 risk
 financial
 quantitative
 markets
 macquarie
 trading
 research
 analysis
 marketing
 credit

cluster of words 4:
 investor
 project
 management
 wholesale
 property
 business
 research
 macquarie
 presentations
 investment



---

### 2. Create a predictor matrix of words from the quotes with CountVectorizer

It is up to you what ngram range you want to select. **Make sure that `binary=True`**

---

In [19]:
cv = CountVectorizer(ngram_range=(1,2), max_features=2500, binary=True, stop_words='english')
words = cv.fit_transform(data.summ)
words = pd.DataFrame(words.todense(), columns=cv.get_feature_names())

In [20]:
words.head()

Unnamed: 0,000,02,03,10,100,103k,109k,109k 128k,11,11 59pm,...,www macquarie,year,years,years ago,years experience,years professional,years related,years relevant,york,zealand
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [21]:
words.shape

(200, 2500)

---

### 3. Split pay into high - low.

Lets split pay into high - low and see the differences in the words used between the two categories. We're going to split it by median which is ~ $110,000

---

In [22]:
data["pay_hilo"] = data["pay"].map(lambda x: 0 if x < 110000 else 1)

---

### 3. Split data into training and testing splits

You should keep 25% of the data in the test set.

---

In [23]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(words.values, data.pay_hilo, test_size=0.25)
print(Xtrain.shape, Xtest.shape)

(150, 2500) (50, 2500)


---

### 4. Build a `BernoulliNB` model predicting high vs low salaries from the word appearances

The model should only be built (and cross-validated) on the training data.

Cross-validate the score and compare it to baseline.

---

In [24]:
from sklearn.naive_bayes import BernoulliNB
nb = BernoulliNB()

In [25]:
from sklearn.model_selection import cross_val_score
nb.fit(Xtrain, ytrain)

nb_scores = cross_val_score(BernoulliNB(), Xtrain, ytrain, cv = 5)

In [26]:
print(nb_scores)
print("----------------------------------------------------------------------")
print("The average score of a BernoulliNB model is {:.2f}".format(np.mean(nb_scores)))
print("The baseline score is {:.2f}".format(np.mean(np.mean(ytrain))))

[ 0.77419355  0.73333333  0.66666667  0.73333333  0.65517241]
----------------------------------------------------------------------
The average score of a BernoulliNB model is 0.71
The baseline score is 0.51


---

### 5. Pull out the probability of words given "high salary"

The `.feature_log_prob_` attribute of the naive bayes model contains the log probabilities of a feature appearing given a target class.

The rows correspond to the class of the target, and the columns correpsond to the features. The first row is the 0 "low salary" class, and the second is the 1 "high salary" class.

#### 5.1 Pull out the log probabilities and convert them to probabilities (for high and low salaries).

In [27]:
feat_lp = nb.feature_log_prob_

In [28]:
high_p = np.exp(feat_lp[1])

In [29]:
low_p = np.exp(feat_lp[0])

#### 5.2 Make a dataframe with the probabilities and features

In [30]:
feat_probs = pd.DataFrame({'high_p':high_p, 'low_p':low_p, 'feature':words.columns.values})

In [31]:
feat_probs.head()

Unnamed: 0,feature,high_p,low_p
0,0,0.064103,0.052632
1,2,0.217949,0.105263
2,3,0.089744,0.105263
3,10,0.025641,0.052632
4,100,0.089744,0.065789


#### 5.3 Create a column that is the difference between fresh probability of appearance and rotten

In [32]:
feat_probs['sal_diff'] = feat_probs.high_p - feat_probs.low_p

#### 5.4 Look at the most likely words for fresh and rotten reviews

feat_probs.sort_values('sal_diff', ascending=False, inplace=True)
feat_probs.head(20)

In [33]:
feat_probs.sort_values('sal_diff', ascending=True, inplace=True)
feat_probs.head(20)

Unnamed: 0,feature,high_p,low_p,sal_diff
253,based,0.24359,0.447368,-0.203779
2291,time,0.269231,0.460526,-0.191296
1648,position,0.24359,0.434211,-0.190621
1379,management,0.410256,0.578947,-0.168691
1743,programs,0.076923,0.236842,-0.159919
638,description,0.115385,0.263158,-0.147773
368,click,0.102564,0.25,-0.147436
205,assist,0.089744,0.236842,-0.147099
19,2018,0.064103,0.210526,-0.146424
1055,health,0.076923,0.210526,-0.133603


---

### 6. Examine how your model performs on the test set

In [34]:
print (nb.score(Xtest, ytest))
print (np.mean(ytest))

0.64
0.48


---

### 4. Trees Cart Model

---

In [59]:
cvec = CountVectorizer(analyzer='word')

In [65]:
from sklearn.tree import DecisionTreeRegressor
title_clean = data.title.map(clean_text)
word_matrix = cvec.fit_transform(title_clean)
word_matrix.shape

(200, 295)

In [72]:
columns = [x.encode('utf8') for x in cvec.get_feature_names()]
words_df = pd.DataFrame(columns=columns, data=word_matrix.todense())
words_df.shape

(200, 295)

In [69]:
word_df.head()

Unnamed: 0,b'account',b'acoustic',b'adelaide',b'advertising',b'adviser',b'agency',b'agronomist',b'ai',b'air',b'akqa',...,b'validation',b'value',b'vibration',b'view',b'water',b'web',b'wheat',b'women',b'workforce',b'years'
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [76]:
pay_bool = data["pay_hilo"]

In [84]:
from sklearn.linear_model import LogisticRegression
Xc = words_df
yc = pay_bool

cls_scores = cross_val_score(LogisticRegression(), Xc, yc, cv=4)
print (cls_scores)
print(np.mean(cls_scores))

[ 0.54  0.64  0.62  0.58]
0.595


In [88]:
from sklearn.tree import DecisionTreeClassifier
dtc1 = DecisionTreeClassifier(max_depth=1)
dtc2 = DecisionTreeClassifier(max_depth=2)
dtc3 = DecisionTreeClassifier(max_depth=3)
dtcN = DecisionTreeClassifier(max_depth=None)

In [90]:
dtc1.fit(Xc, yc)
dtc2.fit(Xc, yc)
dtc3.fit(Xc, yc)
dtcN.fit(Xc, yc)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [93]:
dtc1_scores = cross_val_score(dtc1, Xc, yc, cv=4)
dtc2_scores = cross_val_score(dtc2, Xc, yc, cv=4)
dtc3_scores = cross_val_score(dtc3, Xc, yc, cv=4)
dtcN_scores = cross_val_score(dtcN, Xc, yc, cv=4)

print (dtc1_scores, np.mean(dtc1_scores))
print (dtc2_scores, np.mean(dtc2_scores))
print (dtc3_scores, np.mean(dtc3_scores))
print (dtcN_scores, np.mean(dtcN_scores))

[ 0.54  0.58  0.54  0.52] 0.545
[ 0.58  0.6   0.58  0.5 ] 0.565
[ 0.58  0.58  0.58  0.5 ] 0.56
[ 0.64  0.56  0.58  0.54] 0.58


In [104]:
logreg = LogisticRegression().fit(Xc, yc)
logreg.score(Xc, yc)

0.89000000000000001

---

** Grid Search CV **

---

In [106]:
X = words_df
y = data.pay_hilo

In [99]:
dtc_params = {
    'max_depth':[None,1,2,3,4],
    'max_features':[None,'log2','sqrt',2,3,4,5],
    'min_samples_split':[2,3,4,5,10,15,20,25,30,40,50]
}

In [102]:
from sklearn.model_selection import GridSearchCV
dtc_gs = GridSearchCV(DecisionTreeClassifier(), dtc_params, cv=5, verbose=1)

In [108]:
dtc_gs.fit(X, y)

Fitting 5 folds for each of 385 candidates, totalling 1925 fits


[Parallel(n_jobs=1)]: Done 1925 out of 1925 | elapsed:    3.1s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [None, 1, 2, 3, 4], 'max_features': [None, 'log2', 'sqrt', 2, 3, 4, 5], 'min_samples_split': [2, 3, 4, 5, 10, 15, 20, 25, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [109]:
dtc_best = dtc_gs.best_estimator_
print (dtc_gs.best_params_)
print (dtc_gs.best_score_)

{'max_depth': None, 'max_features': 4, 'min_samples_split': 15}
0.625


In [113]:
fi = pd.DataFrame({
        'feature':X.columns,
        'importance':dtc_best.feature_importances_
    })

fi.sort_values('importance', ascending=False, inplace=True)
fi.head(20)

Unnamed: 0,feature,importance
251,b'senior',0.04729
63,b'consulting',0.031311
31,b'backend',0.025893
15,b'angular',0.024194
223,b'quality',0.023221
219,b'project',0.023091
158,b'lead',0.021368
93,b'engineer',0.019336
258,b'stack',0.019108
11,b'analysis',0.018086
