Project 4
---

---

Importing the libraries used in this notebook

---

In [None]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup
import requests
import pprint as pprint
from scrapy.selector import Selector
from scrapy.http import HtmlResponse

import warnings


def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and returns None."""
    return None


# Swap the no-op in for warnings.warn so the annoying warnings
# (from sklearn and seaborn) are never emitted.
warnings.warn = ignore_warn

plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

---

We are going to scrape Indeed. It is not going to be fun.

**Let's test things out first**

---

In [None]:
# First results page of the "data scientist" search in the
# $30,000 - $100,000 salary band, 50 listings per page.
url = (
    "https://au.indeed.com/jobs"
    "?as_and=data+scientist"
    "&salary=$30,000+-+$100,000"
    "&limit=50"
    "&start=00"
)

In [None]:
import re

# Fetch the first results page; its "searchCount" element carries the total
# number of matching jobs.
r = requests.get(url)
r.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
soup = BeautifulSoup(r.text, "lxml")

search_count = soup.find("div", {"id": "searchCount"})
if search_count is None:
    raise ValueError("No 'searchCount' element found on the results page")

# The total is the last number in the element's text. Extracting it with a
# regex (rather than slicing the final three characters) keeps the count right
# for totals that are not exactly three digits or that contain a thousands
# separator such as "1,234".
count_numbers = re.findall(r"\d[\d,]*", search_count.text)
if not count_numbers:
    raise ValueError("Could not read a job count from: {!r}".format(search_count.text))
total_jobs = int(count_numbers[-1].replace(",", ""))

# Number of full 50-listing pages; the scraping loops add one more page to
# pick up the remainder.
pages = total_jobs // 50

In [None]:
def scrape_indeedpage(page_soup, pay):
    """Yield one dict per job card found in a parsed Indeed results page.

    Parameters
    ----------
    page_soup : BeautifulSoup
        Parsed HTML of a single search-results page.
    pay : str
        Salary band the page was searched with. ``"low"`` is encoded as 0,
        any other value as 1.

    Yields
    ------
    dict
        Keys ``id``, ``company``, ``pay``, ``title``, ``loc`` and ``summary``.
        A field whose element is missing from the card is ``None``.
    """
    def span_text(card, css_class):
        # find() returns None when the card has no span with this class.
        span = card.find("span", class_=css_class)
        return None if span is None else span.text

    pay_label = 0 if pay == "low" else 1

    # Read the cards from the page that was passed in, not from a
    # notebook-level `soup` variable.
    for job_card in page_soup.find_all("div", class_=["row", "result"]):
        company = span_text(job_card, "company")
        summary = span_text(job_card, "summary")
        link = job_card.a  # first anchor in the card, or None if there is none

        yield {
            "id": job_card.get("id"),
            "company": None if company is None else company.strip(),
            "pay": pay_label,
            "title": None if link is None else link.get("title"),
            "loc": span_text(job_card, "location"),
            "summary": None if summary is None else summary.strip(),
        }

In [None]:
# Collect every listing from the low-salary ($30,000 - $100,000) search.
all_jobs = []
low_pay_url = "https://au.indeed.com/jobs?as_and=data+scientist&salary=$30,000+-+$100,000&limit=50&start={}"

# Results are paged 50 at a time, so step the `start` offset in 50s.
for job_listings in range(0, (pages + 1) * 50, 50):
    r = requests.get(low_pay_url.format(job_listings))
    soup = BeautifulSoup(r.text, "lxml")
    all_jobs.extend(scrape_indeedpage(soup, "low"))

In [None]:
# Append every listing from the high-salary ($100,000 - $500,000) search.
# NOTE(review): `pages` was computed from the low-salary search; the
# high-salary search may span a different number of pages -- confirm.
high_pay_url = "https://au.indeed.com/jobs?as_and=data+scientist&salary=$100,000+-+$500,000&limit=50&start={}"

for job_listings in range(0, (pages + 1) * 50, 50):
    r = requests.get(high_pay_url.format(job_listings))
    soup = BeautifulSoup(r.text, "lxml")
    all_jobs.extend(scrape_indeedpage(soup, "high"))

In [None]:
# One row per scraped job card; the dict keys become the columns.
data = pd.DataFrame(all_jobs)
data.shape

In [None]:
# Peek at the first rows of the scraped listings.
data.head()

In [None]:
print("As we can see there are a few duplicates as defined by the job_id")
# Five most frequent ids; a count above 1 means the same card was scraped more than once.
id_counts = data["id"].value_counts()
print(id_counts.head(5))

In [None]:
# Drop rows that are identical in every column (not just `id`), in place.
data.drop_duplicates(inplace=True)

In [None]:
# Class balance: number of listings per pay label (0 = low, 1 = high).
print(data["pay"].value_counts())

In [None]:
import re

def clean_summary(summary):
    """Normalise a raw job summary into a space-separated string of lower-case words.

    Steps: strip any HTML markup, replace every non-letter with a space,
    lower-case and split into words, drop English stop words, re-join.

    Parameters
    ----------
    summary : str
        Raw summary text scraped from a job card.

    Returns
    -------
    str
        The remaining words joined by single spaces (may be empty).
    """
    # 1. Remove HTML. Name the parser explicitly ("lxml", the same one used for
    #    the scraped pages) instead of letting BeautifulSoup guess one, which
    #    emits a warning and can differ between environments.
    summary_text = BeautifulSoup(summary, "lxml").get_text()

    # 2. Replace everything that is not a letter with a space.
    letters_only = re.sub("[^a-zA-Z]", " ", summary_text)

    # 3. Lower-case and split into individual words.
    words = letters_only.lower().split()

    # 4. Membership tests on a set are much faster than on a list.
    #    NOTE(review): `stopwords` is not imported in any visible cell --
    #    presumably nltk.corpus.stopwords; confirm it is in scope.
    stops = set(stopwords.words("english"))

    # 5. Drop the stop words and 6. join what is left with single spaces.
    return " ".join(w for w in words if w not in stops)

In [None]:
# Add a cleaned copy of every summary alongside the raw text.
data["clean_summary"] = data["summary"].map(clean_summary)

In [None]:
# Persist the scraped and cleaned listings next to the notebook.
data.to_csv("indeed_datascientist.csv")

**3. Bag of Words model**
Use CountVectorizer to create bag of words for:

- Job titles
- Job summaries

In [None]:
#Setting the vectorizer just like we would set a model
cvec = CountVectorizer(stop_words = "english")

#Fitting the vectorizer on our training data
cvec.fit(data["summary"])

In [None]:
# Number of distinct terms learned by the vectorizer. `vocabulary_` is used
# because get_feature_names() was removed in scikit-learn 1.2; the fitted
# vocabulary has exactly one entry per feature.
print(len(cvec.vocabulary_))

In [None]:
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# get_feature_names_out() when available and fall back on older versions.
if hasattr(cvec, "get_feature_names_out"):
    summary_terms = cvec.get_feature_names_out()
else:
    summary_terms = cvec.get_feature_names()

# Dense document-term count matrix: one row per summary, one column per term.
X_train = pd.DataFrame(cvec.transform(data['summary']).toarray(),
                       columns=summary_terms)

In [None]:
# (number of summaries, number of vocabulary terms)
X_train.shape

In [None]:
# Total occurrences of each term across all summaries (column sums).
words_counts = X_train.sum()
# The twenty most frequent terms.
words_counts.sort_values(ascending=False).iloc[:20]

In [None]:
# Displays the whole DataFrame, including the clean_summary column added above.
data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

N_CLUSTERS = 5       # number of clusters to fit and to report
TOP_TERMS = 10       # terms printed per cluster
KMEANS_SEED = 42     # fixed seed so the clusters are the same on every run

# TfidfVectorizer lower-cases its input by default, so no case conversion is
# needed beforehand.
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)
model = vectorizer.fit_transform(data['summary'])
km = KMeans(n_clusters=N_CLUSTERS, init='k-means++', max_iter=200, n_init=1,
            random_state=KMEANS_SEED)

k = km.fit(model)

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# get_feature_names_out() when available and fall back on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# For every cluster, term indices ordered from highest to lowest centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(N_CLUSTERS):
    print("cluster of words %d:" % i)
    for ind in order_centroids[i, :TOP_TERMS]:
        print(' %s' % terms[ind])
    print()

---

### 2. Create a predictor matrix of words from the job summaries with CountVectorizer

It is up to you what ngram range you want to select. **Make sure that `binary=True`**

In [None]:
# Binary presence/absence of unigrams and bigrams in the cleaned summaries,
# limited to 2500 features.
cv = CountVectorizer(
    stop_words='english',
    binary=True,
    ngram_range=(1, 2),
    max_features=2500,
)
words = cv.fit_transform(data["clean_summary"])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# get_feature_names_out() when available and fall back on older versions.
if hasattr(cv, "get_feature_names_out"):
    ngram_terms = cv.get_feature_names_out()
else:
    ngram_terms = cv.get_feature_names()

# Replace the sparse matrix with a dense DataFrame: one column per n-gram.
words = pd.DataFrame(words.toarray(), columns=ngram_terms)

In [None]:
# First rows of the binary n-gram indicator matrix.
words.head()

In [None]:
# (number of listings, number of n-gram features)
words.shape

---

### 3. Split data into training and testing splits

You should keep 25% of the data in the test set.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the listings for testing. A fixed random_state makes the
# split, and every score computed from it, reproducible between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    words.values, data.pay, test_size=0.25, random_state=42
)
print(Xtrain.shape, Xtest.shape)

---

### 4. Build a `BernoulliNB` model predicting high vs low salaries from the word appearances

The model should only be built (and cross-validated) on the training data.

Cross-validate the score and compare it to baseline.

In [None]:
from sklearn.naive_bayes import BernoulliNB
# Naive Bayes model for binary features, left at its default settings.
nb = BernoulliNB()

In [None]:
from sklearn.model_selection import cross_val_score

# Fit the notebook-level model on the full training split; later cells read
# its learned probabilities and score it on the test set.
nb.fit(Xtrain, ytrain)

# Separately, 5-fold cross-validate a fresh BernoulliNB on the training data only.
nb_scores = cross_val_score(estimator=BernoulliNB(), X=Xtrain, y=ytrain, cv=5)

In [None]:
print(nb_scores)
print("----------------------------------------------------------------------")
print("The average score of a BernoulliNB model is {:.2f}".format(np.mean(nb_scores)))

# Baseline = accuracy of always predicting the majority class. np.mean(ytrain)
# alone is only the share of the positive (1) class, which understates the
# baseline whenever class 0 is the majority.
train_positive_share = np.mean(ytrain)
print("The baseline score is {:.2f}".format(max(train_positive_share, 1 - train_positive_share)))

---

### 5. Pull out the probability of words given "high salary"

The `.feature_log_prob_` attribute of the naive bayes model contains the log probabilities of a feature appearing given a target class.

The rows correspond to the class of the target, and the columns correspond to the features. The first row is the 0 "low salary" class, and the second is the 1 "high salary" class.

#### 5.1 Pull out the log probabilities and convert them to probabilities (for high and low salaries).

In [None]:
# Log-probability of each feature given each class: one row per class, one column per feature.
feat_lp = nb.feature_log_prob_

In [None]:
# Row 1 is the pay == 1 (high salary) class; exp() turns log-probabilities into probabilities.
high_p = np.exp(feat_lp[1])

In [None]:
# Row 0 is the pay == 0 (low salary) class; exp() turns log-probabilities into probabilities.
low_p = np.exp(feat_lp[0])

#### 5.2 Make a dataframe with the probabilities and features

In [None]:
# One row per n-gram: its probability of appearing under each salary class.
feat_probs = pd.DataFrame(
    {
        'high_p': high_p,
        'low_p': low_p,
        'feature': words.columns.values,
    }
)

In [None]:
# First rows of the per-feature probability table.
feat_probs.head()

#### 5.3 Create a column that is the difference between the high-salary and low-salary probability of appearance

In [None]:
# Positive values: the term is more likely in high-salary listings; negative: low-salary.
feat_probs['sal_diff'] = feat_probs['high_p'] - feat_probs['low_p']

#### 5.4 Look at the most likely words for high-salary and low-salary jobs

In [None]:
# Re-order the table in place, largest high-minus-low difference first,
# then show the twenty terms most associated with high salaries.
feat_probs.sort_values(by='sal_diff', ascending=False, inplace=True)
feat_probs.iloc[:20]

In [None]:
# Re-order the table in place, smallest (most negative) difference first,
# then show the twenty terms most associated with low salaries.
feat_probs.sort_values(by='sal_diff', ascending=True, inplace=True)
feat_probs.iloc[:20]

---

### 6. Examine how your model performs on the test set

In [None]:
# Accuracy of the fitted model on the held-out test set.
print(nb.score(Xtest, ytest))

# Majority-class baseline on the test set. np.mean(ytest) alone is only the
# share of the positive (1) class, which understates the baseline whenever
# class 0 is the majority.
test_positive_share = np.mean(ytest)
print(max(test_positive_share, 1 - test_positive_share))