 Project 4
---

---

**Importing modules**

---

In [77]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import requests
import pprint

from bs4 import BeautifulSoup

import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn #ignore annoying warning (from sklearn and seaborn)

plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

---

**Scraping**

Unfortunately since I'm not too familiar with scrapy and jupyter notebook, I dont know how to integrate the scrapy spider into the notebook. However, the below cell contains the code for spider scraper

---

In [51]:
import scrapy, re
from bs4 import BeautifulSoup
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class IndeedSpider(CrawlSpider):
    name = 'indeed'
    allowed_domains = ['au.indeed.com']

    pay = 60000

    urls = ['https://au.indeed.com/jobs?q=data+scientist+%24{}+-+%24{}'.format(x-10000, x) for x in range(50000,210000,10000)]

    start_urls = urls

    rules = (
        Rule(LinkExtractor("start\=.*$"), callback='parse_item', follow = True),
    )    

    def parse_item(self, response):
        self.log('------------------------------------')
        self.log('I just visited: ' + response.url)
        self.log('------------------------------------')

        soup = BeautifulSoup(response.text, 'lxml')
        divs = soup.find_all("div", class_ = ["row", "result"])


        for job in divs:
            
            item = {}

            item["id"] = job.get("id")

            try:
                item["company"] = job.find("span", class_ = "company").text.strip("\n").strip()
            except AttributeError:
                item["company"] = None
                
            item["pay"] = re.search('([0-9]*)&start', response.url).group(1)

            item["title"] = job.a.get("title")

            item["loc"]= job.find("span", class_ ="location").text

            job_summ_page = job.get("data-jk")
            next_page_url = "https://au.indeed.com/viewjob?jk={}&from=tp-serp&tk=1c2i82l0a102q7vk".format(job_summ_page)
            yield scrapy.Request(url = next_page_url, callback = self.parse_summ, meta=dict(item=item))

    def parse_summ(self, response):
        
        item = response.meta['item']
        
        soup = BeautifulSoup(response.text, 'lxml')
        item["summ"] = soup.find("span", class_ = "summary").text.replace("\n", " ")
        yield item

SyntaxError: invalid syntax (<ipython-input-51-f13e8b891917>, line 1)

---

**Data**

For the purpose of this project, I will be working on data that has already been scraped into a .json file from the spider. 

---

In [48]:
# Initiates file name: file_name
file_name = "test.json"

In [49]:
# Imports file as dataframe: data
data = pd.read_json(file_name)

# Displays the head of the dataframe
data.head(3)

Unnamed: 0,company,id,loc,pay,summ,title
0,Opus Recruitment Solutions,p_d4996fe3bbcdf3f3,Sydney NSW,120000,My client is a rapidly growing start-up within...,DATA SCIENTIST
1,QBE Insurance,pj_112015665e5a2e01,Sydney NSW,120000,QBE is one of the top 20 global general insure...,Senior Data Scientist
2,Xpand,p_95299eb857a58426,Australia,120000,Senior Data Scientist An exponentially growi...,Senior Data Scientist


In [53]:
# Shape of the data
data.shape

(200, 6)

In [55]:
# Data types
data.dtypes

company    object
id         object
loc        object
pay         int64
summ       object
title      object
dtype: object

In [57]:
# Missing Values
data.isnull().sum()

company    0
id         0
loc        0
pay        0
summ       0
title      0
dtype: int64

In [71]:
#Distribution of pay scale for jobs
data.pay.value_counts()

100000    59
110000    55
90000     32
120000    23
130000    21
80000      6
70000      3
150000     1
Name: pay, dtype: int64

---

**Cleaning Job Titles and Job Summaries**

Do to the inconsistencies with the job titles and job summaries, we are going to clean it up a bit. This involves removing all non-alphanumeric characters and making everything lowercase. 

---

In [81]:
#Import Regex to clean
import re
from nltk.corpus import stopwords

def clean_text(text):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the outputr is a single string (a preprocessed movie review)
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(text).get_text()
    #
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Conbert to lower case, split into individual words
    words = letters_only.lower().split()
    #
    # 4.In Python, searching a set is much faster than searching a list, soconvert the stop words to a set
    stops = set(stopwords.words("english"))
    #
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]
    #
    # 6. Join the words back into one string seperated by space, 
    # and return the result.
    return( " ".join(meaningful_words))

In [84]:
data.summ.map(clean_text);
data.title.map(clean_text);

---

**Bag of Words model**

Use CountVectorizer to create bag of words for:

- Job titles
- Job summarys

---

In [92]:
from sklearn.feature_extraction.text import CountVectorizer

#Setting the vectorizer just like we would set a model
cvec = CountVectorizer(stop_words = "english")

#Fitting the vectorizer on our training data
cvec.fit(data["summ"])

5893


In [None]:
print(len(cvec.get_feature_names()))

In [94]:
X_train = pd.DataFrame(cvec.transform(data['summ']).todense(),
                      columns = cvec.get_feature_names())

In [95]:
X_train.shape

(200, 5893)

In [99]:
words_counts = X_train.sum(axis=0)
words_counts.sort_values(ascending = False).head(20)

data           897
experience     624
team           459
work           414
role           359
skills         332
business       313
research       304
working        261
development    250
management     224
science        204
analytics      202
new            201
ability        199
strong         188
analysis       186
apply          186
australia      185
projects       176
dtype: int64

In [101]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

vectorizer = TfidfVectorizer(stop_words='english',use_idf=True)
model = vectorizer.fit_transform(data['summ'].str.upper())
km = KMeans(n_clusters=5,init='k-means++',max_iter=200,n_init=1)

k=km.fit(model)
terms = vectorizer.get_feature_names()
order_centroids = km.cluster_centers_.argsort()[:,::-1]
for i in range(5):
    print("cluster of words %d:" %i)
    for ind in order_centroids[i,:10]:
        print(' %s' % terms[ind])
    print() 

cluster of words 0:
 data
 risk
 marketing
 experience
 team
 work
 financial
 macquarie
 business
 customer

cluster of words 1:
 stack
 backend
 developer
 experience
 jvm
 team
 engineers
 aws
 kotlin
 js

cluster of words 2:
 data
 analytics
 learning
 experience
 business
 machine
 science
 team
 big
 statistical

cluster of words 3:
 environmental
 research
 role
 management
 project
 experience
 australia
 csiro
 position
 work

cluster of words 4:
 cardiac
 clinic
 heart
 tasmania
 launceston
 echo
 care
 imaging
 service
 uq



---

### 2. Create a predictor matrix of words from the quotes with CountVectorizer

It is up to you what ngram range you want to select. **Make sure that `binary=True`**

---

In [104]:
cv = CountVectorizer(ngram_range=(1,2), max_features=2500, binary=True, stop_words='english')
words = cv.fit_transform(data.summ)
words = pd.DataFrame(words.todense(), columns=cv.get_feature_names())

In [105]:
words.head()

Unnamed: 0,000,02,03,10,100,103k,109k,109k 128k,11,11 59pm,...,www macquarie,year,years,years ago,years experience,years professional,years related,years relevant,york,zealand
0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0


In [112]:
words.shape

(200, 2500)

---

### 3. Split pay into high - low.

Lets split pay into high - low and see the differences in the words used between the two categories. We're going to split it by median which is ~ $11,0000

---

In [129]:
data["pay"].map(lambda x: 1 for x in data["pay"])

TypeError: 'generator' object is not callable

---

### 3. Split data into training and testing splits

You should keep 25% of the data in the test set.

---

In [107]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(words.values, data.pay, test_size=0.25)
print(Xtrain.shape, Xtest.shape)

(150, 2500) (50, 2500)


---

### 4. Build a `BernoulliNB` model predicting high vs low salaries from the word appearances

The model should only be built (and cross-validated) on the training data.

Cross-validate the score and compare it to baseline.

---

In [109]:
from sklearn.naive_bayes import BernoulliNB
nb = BernoulliNB()

In [110]:
from sklearn.model_selection import cross_val_score
nb.fit(Xtrain, ytrain)

nb_scores = cross_val_score(BernoulliNB(), Xtrain, ytrain, cv = 5)

In [111]:
print(nb_scores)
print("----------------------------------------------------------------------")
print("The average score of a BernoulliNB model is {:.2f}".format(np.mean(nb_scores)))
print("The baseline score is {:.2f}".format(np.mean(np.mean(ytrain))))

[ 0.42424242  0.375       0.4         0.17857143  0.18518519]
----------------------------------------------------------------------
The average score of a BernoulliNB model is 0.31
The baseline score is 106200.00
