# Job Web-Scraping
Note: This is intented for practice to extract real time job descriptions. Please adhere to the Robots.txt

In [59]:
#If you are using Selenium for the first time, please download the webdriver and note the filepath
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 100)  # increase the string length before truncating

# Monitor the status of the webscraping to get an idea of how long it will take 
import time
from tqdm import tqdm
tqdm.monitor_interval = 0

# Used as a import for dateing the CSV
import datetime

# NLP libaries
import spacy
from spacy.lang.en import stop_words

# Punctuation attribute will be used for removing punctuation
import string

# Import nltk for removing stopwords and lemmatizing
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

import gensim
import pyLDAvis.gensim
from gensim.models import LdaModel
import gensim.corpora as corpora  # *****
from gensim.utils import simple_preprocess
from gensim.models import coherencemodel
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import datetime as dt

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/guillermogonzalez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
path_to_chromedriver = "/Users/guillermogonzalez/projects/webscraping/chromedriver"

In [12]:
!ls /Users/guillermogonzalez/projects/webscraping/

[31mchromedriver[m[m           chromedriver_mac64.zip [34mjob_topic_modeling[m[m


In [20]:
#adds the data in URL format by removing any white space and adding the data. 
def Keyword_to_url(kw):
    '''
    Stips any leading and trailing white space from a string and replaces spaces with'%20'.
    '''
    kw = kw.strip().replace(" ","%20")
    return kw

### URL Example
```https://www.careeronestop.org/Toolkit/Jobs/find-jobs.aspx?keyword=data%20analyst&ajax=0&location=austin,%20texas&radius=5&source=IN&pagesize=100&sortcolumns=accquisitiondate&sortdirections=DSC```
- occupation: keyword=data%20analyst  
- location: location=austin,%20texas  
- radius: radius=5  
- page_size: pagesize=100  
- page_sort: sortcolumns=accquisitiondate&sortdirections=DSC  
- source: source=IN  


<img width="983" alt="screen shot 2018-06-24 at 10 16 01 pm" src="https://user-images.githubusercontent.com/7989686/41828679-4cf946ce-77fc-11e8-8b80-6479d6865873.png">

# occupation = 'data analyst'
> #### Scrubbing dataframe to identify titles more closely matched to data science and data analyst roles

In [79]:
#setup browser window
browser = webdriver.Chrome(executable_path= path_to_chromedriver)

#adds the data in URL format by removing any white space and adding the data. 
def Keyword_to_url(kw):
    '''
    Stips any leading and trailing white space from a string and replaces spaces with'%20'.
    '''
    kw = kw.strip().replace(" ","%20")
    return kw



# this will be used as the URL input for multiple job searches. 
occupation = 'data analyst' # str(input("Job you are looking for: "))
location = 'seattle, wa' # str(input("Where do you want to look: "))
radius = str(5)
page_sort = 'DSC' # ASC (ascending) or DSC (descending) dates 
page_size = "100" # how many pages would you like to get. 
source = 'IN' # stands for indeed

#give it the site URL
url = 'https://www.careeronestop.org/Toolkit/Jobs/find-jobs.aspx?'+\
    'keyword='+Keyword_to_url(occupation)+\
    '&ajax=0&location='+Keyword_to_url(location)+\
    '&radius='+radius+\
    '&source='+ source +\
    '&pagesize='+page_size+\
    '&sortcolumns=accquisitiondate&sortdirections='+page_sort
browser.get(url)
browser.implicitly_wait(30)

# Default is 100 job listings for this page and this verifies that thera are 100 job titles
listing_num = len(browser.find_elements_by_xpath('//td[@data-title="Job Title"]'))

job_titles = browser.find_elements_by_xpath('//td[@data-title="Job Title"]')
company_titles = browser.find_elements_by_xpath('//td[@data-title="Company"]')
locations = browser.find_elements_by_xpath('//td[@data-title="Location"]')
dates = browser.find_elements_by_xpath('//td[@data-title="Date Posted"]')

data = {}
data['company'] = []
data['date_posted'] = []
data['location'] = []
data['position'] = []
data['pos_link'] = []
data['pos_description'] = []

for job in tqdm(range(listing_num)):
    data['position'].append(job_titles[job].text) # position title
    data['company'].append(company_titles[job].text) # comapany title
    data['location'].append(locations[job].text) # job location
    data['date_posted'].append(dates[job].text) # date job posted
    # link to job description
    data['pos_link'].append(browser.find_element_by_link_text(job_titles[job].text).get_attribute('href')) 
    
for link in tqdm(data['pos_link']):
    browser.get(link) # open link to job description in the same window
    # append job description
    data['pos_description'].append(browser.find_element_by_xpath('//td[@class="snip"]').text.replace('\n', ' '))
    browser.back() # click the back button to return to original page

# transform the dictionary into a DataFrame    
df = pd.DataFrame(data)

100%|██████████| 100/100 [00:10<00:00,  9.71it/s]
100%|██████████| 100/100 [01:51<00:00,  1.11s/it]


In [22]:
# identifying all positions specific to data science or analysis
positions_da = df['position']
da_positions = set()
for position in positions_da:
    if position.lower().find('data scientist') > -1 \
        or position.lower().find('analyst') > -1\
        or position.lower().find('machine learning') > -1\
        or position.lower().find('ai') > -1\
        or position.lower().find('data science') > -1\
        or position.lower().find('analytics') > -1:
        da_positions.add(position)    

In [23]:
# Need to come back to, but was making a function to drop rows that were not relavent job posts
def ds_or_da(dataframe):
    index_to_drop = []
    for position in dataframe['position']:
        if position.lower().find('data scientist') == -1 \
        and position.lower().find('analyst') == -1\
        and position.lower().find('machine learning') == -1\
        and position.lower().find('ai') == -1\
        and position.lower().find('data science') == -1\
        and position.lower().find('analytics') == -1:
            index_to_drop.append(position.index())  
        
    return index_to_drop
            

In [24]:
# data science/analyst positions in this list
df[ df['position'].isin(da_positions) ].shape

(98, 6)

In [25]:
# Sample of ru
df[ df['position'].isin(da_positions) ]['position'][:10]

0                               Business Analyst
1       Business Analyst (Operations Excellence)
2          Reporting Analyst - CORP - Austin, TX
3         Analyst, Investment Analytics and Data
4                               Business Analyst
5                        Accounting Data Analyst
6                                 Data Analyst I
7                               Business Analyst
8      Business Intelligence Development/Analyst
9    Payment Intelligence and Cybercrime Analyst
Name: position, dtype: object

In [26]:
# identifying non-data science or analysis positions
non_da_positions = set()
for position in positions_da:
    if position.lower().find('data scientist') == -1 \
        and position.lower().find('analyst') == -1\
        and position.lower().find('machine learning') == -1\
        and position.lower().find('ai') == -1\
        and position.lower().find('data science') == -1\
        and position.lower().find('analytics') == -1:
        non_da_positions.add(position)    

In [27]:
# there are 11 non-data science/analyst positions in this list
df[ df['position'].isin(non_da_positions) ].shape

(2, 6)

In [28]:
df[ df['position'].isin(non_da_positions) ]['position']

39      Data Engineer
90    Data Strategist
Name: position, dtype: object

In [30]:
print('{:.3}% of job titles are non-data science/anlaysis titles'.format((df[ df['position'].isin(non_da_positions) ]\
                                                            .shape[0]/len(df))*100))

2.0% of job titles are non-data science/anlaysis titles


In [65]:
df.head()

Unnamed: 0,company,date_posted,location,pos_description,pos_link,position
0,Avani Technology Solutions Inc,07/18/2018,"Austin, TX","Job Summary Position: Business Analyst 3 Location: Austin, Texas Duration: 160 Hours Responsibil...",https://www.indeed.com/viewjob?jk=4bbfad1784f0ee33&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6...,Business Analyst
1,HireBetter,07/18/2018,"Austin, TX","The Role: Treasury Analyst (Contract) Location: Austin, TX Position Summary: Our client, a globa...",https://www.indeed.com/viewjob?jk=e999cdbb789f9d09&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6...,Treasury Analyst (Contract)
2,Soal Technologies,07/18/2018,"Austin, TX","Job Summary Position: Business Analyst 3 Location: Austin, Texas Duration: 160 Hours Responsibil...",https://www.indeed.com/viewjob?jk=4bbfad1784f0ee33&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6...,Business Analyst
3,Virtus Partners,07/18/2018,"Austin, TX",Prepares daily reconciliation of cash transactions Manages large volumes of transactions and mon...,https://www.indeed.com/viewjob?jk=ce50923ac74cd092&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6...,Data Analyst II
4,"PMCS Services, Inc",07/18/2018,"Austin, TX",PMCS looking for a Full time/contract employee for Data Analytics Consultant. Essential Job Task...,https://www.indeed.com/viewjob?jk=76f60dc0f5f3767f&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6...,Data Analytics Consultant


# Text Preprocessing

## Create function to REMOVE PUCTUATION

In [560]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

#### Function DOES NOT WORK with this dataset (see below why)

In [403]:
# sring of all punctuations
string.punctuation

def remove_punct(text):
    # only returns char not in the punctuation list then join the characters to look like the original document                                                                                  #  punctuation list
    text_nopunct = ''.join([char for char in text if char not in string.punctuation]) 
    return text_nopunct

# make a new colum of the descriptions with punctuation removed
df['docs_nopunct'] = df['pos_description'].apply(lambda x: remove_punct(x))

Noticed that there are still apostrophies in the new column???

In [404]:
# the apostrophe from the document does not match the puncutation string's apostophe; need to find another solution
print('You can see that \"{}\" from the punctuation string does not look like \"{}\" from the document and\
 this is what it like\n manually typed \"{}\"'.format(string.punctuation[6], df['docs_nopunct'][4][5],'\''))

You can see that "'" from the punctuation string does not look like "’" from the document and this is what it like
 manually typed "'"


#### Function WORKS correctly utilizing regex

In [405]:
import re

In [406]:
# Using regular expressions to remove punctuation
def remove_punct(text):
    '''
    Finds all word elements only and joins all the stings on ' '
    '''
    text_nopunct = ' '.join(re.findall('\w+', text)) # \w+ selects word elements only 
    return text_nopunct

df['docs_nopunct'] = df['pos_description'].apply(lambda x: remove_punct(x))

## Create a function to TOKENIZE the text

In [408]:
def tokenizer(text):
    docs_tokenized = re.split('\W+', text)
    return docs_tokenized   

df['docs_tokenized'] = df['docs_nopunct'].apply(lambda x: tokenizer(x.lower()))

## Create a function to REMOVE STOPWORDS

In [410]:
stopwords = nltk.corpus.stopwords.words('english')

In [411]:
def remove_stopwords(tokenized_text):
    docs_no_stopwords = [word for word in tokenized_text if word not in stopwords]
    return docs_no_stopwords     

df['docs_tokenized_nostopwords'] = df['docs_tokenized'].apply(lambda x: remove_stopwords(x))

## Create a function to LEMMATIZE the tokens

In [369]:
def word_lemmatizer(word):   
    lemma = wn.morphy(word)
    if lemma is None:
        return  word
    else:
        return lemma

In [370]:
def doc_lemmatizer(text):
    docs_lemmatized = [word_lemmatizer(word) for word in text]
    return docs_lemmatized

df['docs_lemmatized'] = df['docs_tokenized_nostopwords'].apply(lambda x: doc_lemmatizer(x))

## Create a function to REMOVE CRAP WORDS

In [None]:
crap_words = ['hour', 'hours', 'day', 'days', 'minutes', 'austin', 'tx', 'ago']

In [None]:
def remove_crap(text):
    docs_no_crap = [word for word in text if word not in crap_words]
    return docs_no_crap

df['docs_no_crap'] = df['docs_tokenized'].apply(lambda x: remove_crap(x))

## Job recommendations at bottom of webpages
 - At the end of every job description there is a "save job" link.  A function was made to call out the index of "save" in every job post in order to index the output of the text_processing function to only include everything before it.  
<img width="406" alt="screen shot 2018-07-12 at 1 16 00 am" src="https://user-images.githubusercontent.com/7989686/42616824-2ebf21c0-8575-11e8-9cb9-e2c0a6c037b1.png">

# Combining into a one TEXT PREPROCESSING FUNCTION 

In [32]:
import re

In [80]:
stopwords = nltk.corpus.stopwords.words('english')
crap_words = ['hour', 'hours', 'day', 'days', 'minutes', 'austin', 'tx', 'ago', 'experience', 'team',
              'work', 'skill', 'understanding', 'ability', 'member', 'years', 'requirement', 'job'
              ,'knowledge', 'require', 'include', 'provide']

# function for wordnet lemmatizing a word returns the word if it's not in wordnet
#  this allows there to be a combination of wordnet lemmatized words and non-wordnet words in the doc
def word_lemmatizer(word):   
    lemma = wn.morphy(word) 
    if lemma is None:  # return the word if there is no lemma in WordNet for that word
        return  word
    else:
        return lemma
# Bottom of web page has job recommendations. This function identifies the index of 'save' at 
# the bottom of every job description
def save_index(text):
    save = text.index('save')
    return save

def text_preprocess(text):
    text_nopunct = ' '.join(re.findall('\w+', text))  # remove puncuation by only selecting word elements '\w+'
    docs_tokenized = re.split('\W+', text_nopunct)  # tokenize splitting on non-word elements '\W+'
    docs_no_stopwords = [word for word in docs_tokenized if word not in stopwords]  # remove stopwords
    docs_lemmatized = [word_lemmatizer(word) for word in docs_no_stopwords]  # wordnet lemmatizing
    docs_no_crapwords = [word for word in docs_lemmatized if word not in crap_words]  # remove crap words
    return docs_no_crapwords[ : save_index(docs_no_crapwords)]  # return the list up to 'save'

In [35]:
# revert the dataframe to teh pre-application of the text_preprocess function to tweak some optimizations
def revert(dataframe):
    try:
        return dataframe.drop('pos_description_processed', axis=1, inplace=True)
    except:
        pass
revert(df)

In [81]:
# Adding a new column of the processed text
df['pos_description_processed'] = df['pos_description'].apply(lambda x: text_preprocess(x.lower()))

In [82]:
df.head()

Unnamed: 0,company,date_posted,location,pos_description,pos_link,position,pos_description_processed
0,Amperity,07/18/2018,"Seattle, WA","Today, there is a chronic and pervasive customer data problem -- one that’s preventing some of t...",https://www.indeed.com/viewjob?jk=0a9a257678dbf2aa&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6...,"Data Analyst, Machine Learning","[today, chronic, pervasive, customer, data, problem, one, prevent, world, love, brand, engage, c..."
1,Qualis Health,07/18/2018,"Seattle, WA","In April 2018, HealthInsight and Qualis Health announced a formal merger, combining the two orga...",https://www.indeed.com/viewjob?jk=45194a5079b3ceff&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6...,Health Data Analyst,"[april, 2018, healthinsight, qualis, health, announce, formal, merger, combining, two, organizat..."
2,"AssuredPartners, Inc.",07/18/2018,"Seattle, WA",Overview Job Title: EB Data Analyst Department: Employee Benefits Reports To: President of Emplo...,https://www.indeed.com/viewjob?jk=22cd09f5dede6437&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6...,Employee Benefits Data Analyst,"[overview, title, eb, data, analyst, department, employee, benefit, report, president, employee,..."
3,Drift,07/18/2018,"Seattle, WA",Drift is a tech startup located in Seattle. We’ve raised $10 million from top investors to fund ...,https://www.indeed.com/viewjob?jk=7f0a46e24a46e345&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6...,Finance Operations Analyst,"[drift, tech, startup, locate, seattle, raise, 10, million, top, investor, fund, vision, bring, ..."
4,King County,07/17/2018,"Seattle, WA",Summary SUMMARY About King County: As the only county in the United States named after Martin L...,https://www.indeed.com/viewjob?jk=062003e0be267ffa&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6...,Business Analyst,"[summary, summary, king, county, county, unite, state, name, martin, luther, king, jr, one, infl..."


In [83]:
df.to_csv('../data/{}_{}_{}.csv'.format( dt.date.today(), ''.join(re.findall('\w', occupation)), ''.join(re.findall('\w', location)) ))

### After reviewing the results using "data scientist" and "data analyst" there were more positive titles directly related to Data Science or Data Analytics using the "data analyst" keywords.

## Time for LDA

In [37]:
# Text data
texts = df['pos_description_processed']

In [38]:
# validating that all crap words are removed
count = 0
words = []
for word in texts[0]:
    if word in crap_words:
        count += 1
        words.append(word)
print(count)
print(words)

0
[]


In [39]:
dictionary = corpora.Dictionary(texts)

In [40]:
# Illustration of what doc2bow is doing. "data" appears 5 tims in this position
pd.concat([pd.Series(dictionary.doc2bow(texts[0])[:10]),pd.Series(texts[0][:10])], axis=1)

Unnamed: 0,0,1
0,"(0, 5)",peak
1,"(1, 5)",performer
2,"(2, 1)",nonprofit
3,"(3, 2)",recruit
4,"(4, 1)",firm
5,"(5, 1)",specialize
6,"(6, 1)",contract
7,"(7, 4)",role
8,"(8, 3)",state
9,"(9, 3)",texas


In [41]:
# Validation that there are 5 instances of 'data' in the first job description
def word_count(word):
    count = 0
    for text in texts[0]:
        if text == word:
            count += 1
    return print('There are {} instances of \'{}\''.format(count, word))

In [42]:
word_count('recruit')
word_count('state')
word_count('nonprofit')

There are 2 instances of 'recruit'
There are 3 instances of 'state'
There are 1 instances of 'nonprofit'


In [43]:
# defining the variable for the corpus
corpus = [dictionary.doc2bow(text) for text in texts]

In [44]:
# pickling the corpus object and dicitonary to plug into different LDA topic count models 
import pickle
pickle.dump(corpus, open('corpus_da.pkl', 'wb'))
dictionary.save('dictionary_da.gensim')

## LDA with Gensim

### Training a 2 topic model

In [45]:
topic_count = 2
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                           num_topics=topic_count,
                                           id2word=dictionary,
                                           passes=15,
                                           random_state=42)
ldamodel.save('model2_da.gensim')


topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.025*"business" + 0.022*"system" + 0.016*"data" + 0.014*"process"')
(1, '0.018*"data" + 0.010*"business" + 0.007*"customer" + 0.006*"analytics"')


### Training a 3 topic model

In [46]:
topic_count = 3
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                           num_topics=topic_count,
                                           id2word=dictionary,
                                           passes=15,
                                           random_state=42)
ldamodel.save('model3_da.gensim')


topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.026*"data" + 0.015*"business" + 0.007*"information" + 0.007*"process"')
(1, '0.020*"data" + 0.008*"analytics" + 0.007*"business" + 0.006*"customer"')
(2, '0.031*"system" + 0.030*"business" + 0.016*"process" + 0.011*"project"')


### Training a 4 topic model

In [47]:
topic_count = 4
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                           num_topics=topic_count,
                                           id2word=dictionary,
                                           passes=15,
                                           random_state=42)
ldamodel.save('model4_da.gensim')


topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.029*"data" + 0.015*"business" + 0.007*"system" + 0.007*"information"')
(1, '0.016*"data" + 0.009*"analytics" + 0.008*"business" + 0.008*"customer"')
(2, '0.033*"system" + 0.032*"business" + 0.018*"process" + 0.012*"project"')
(3, '0.015*"data" + 0.012*"business" + 0.008*"management" + 0.007*"project"')


### Training a 5 topic model

In [48]:
topic_count = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                           num_topics=topic_count,
                                           id2word=dictionary,
                                           passes=15,
                                           random_state=42)
ldamodel.save('model5_da.gensim')


topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.030*"data" + 0.016*"business" + 0.008*"information" + 0.008*"report"')
(1, '0.015*"data" + 0.010*"business" + 0.009*"customer" + 0.006*"analytics"')
(2, '0.034*"system" + 0.033*"business" + 0.018*"process" + 0.012*"project"')
(3, '0.017*"data" + 0.013*"business" + 0.009*"project" + 0.008*"management"')
(4, '0.019*"data" + 0.011*"company" + 0.010*"analytics" + 0.009*"business"')


### Training an 8 topic model

In [49]:
topic_count = 8
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                           num_topics=topic_count,
                                           id2word=dictionary,
                                           passes=15,
                                           random_state=42)
ldamodel.save('model8_da.gensim')


topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.026*"data" + 0.013*"business" + 0.010*"information" + 0.008*"report"')
(1, '0.011*"customer" + 0.010*"marketing" + 0.008*"data" + 0.007*"analysis"')
(2, '0.028*"data" + 0.014*"analysis" + 0.013*"business" + 0.009*"database"')
(3, '0.017*"business" + 0.010*"project" + 0.008*"data" + 0.007*"customer"')
(4, '0.022*"data" + 0.011*"business" + 0.010*"company" + 0.009*"product"')
(5, '0.043*"data" + 0.014*"business" + 0.011*"management" + 0.010*"analysis"')
(6, '0.023*"analytics" + 0.017*"fraud" + 0.010*"digital" + 0.010*"risk"')
(7, '0.038*"system" + 0.035*"business" + 0.018*"process" + 0.012*"project"')


# Visualizing the LDA models

In [50]:
import pyLDAvis.gensim

### 3 Topic Visalization

In [51]:
dictionary = gensim.corpora.Dictionary.load('dictionary_da.gensim')
corpus = pickle.load(open(file='corpus_da.pkl', mode='rb'))  # 'rb' --> Read Bytes
lda = gensim.models.ldamodel.LdaModel.load('model3_da.gensim')

In [52]:
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(lda_display)

### 4 Topic Visualization

In [53]:
dictionary = gensim.corpora.Dictionary.load('dictionary_da.gensim')
corpus = pickle.load(open(file='corpus_da.pkl', mode='rb'))  # 'rb' --> Read Bytes
lda = gensim.models.ldamodel.LdaModel.load('model4_da.gensim')

In [54]:
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(lda_display)

## 5 Topic Visualization

In [55]:
dictionary = gensim.corpora.Dictionary.load('dictionary_da.gensim')
corpus = pickle.load(open(file='corpus_da.pkl', mode='rb'))  # 'rb' --> Read Bytes
lda = gensim.models.ldamodel.LdaModel.load('model5_da.gensim')

In [56]:
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(lda_display)

# ------------------THIS IS WHERE EDDIE IS UP TO --------------------------------

### MacOS online advice for clicking links
element.send_keys(Keys.CONTROL + Keys.RETURN)  
driver.switch_to.window(self.driver.window_handles[1])  
#### do stuff  
driver.close()  
driver.switch_to.window(self.driver.window_handles[0])  

## Using SpaCy for NLP - this is a Work in Progress (Eddie has not started this part yet)
Objective: Setup for topic modeling and use LDA to determine feature importance

In [324]:
parser = spacy.lang.en.English()
nlp = spacy.load('en')

In [325]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(pos_description),deacc=True))

In [326]:
data_words = list(sent_to_words(pos_description))

In [328]:
#Building bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)

trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

In [329]:
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [331]:
def remove_stopwords(text):
    return [[word for word in simple_preprocess(str(doc)) if word not in STOP_WORDS] for doc in text]

def make_bigrams(text):
    return [bigram_mod[doc] for doc in text]

def make_trigrams(text):
    return [trigram_mod[bigram_mod[doc]] for doc in text]

def lemmatization(text, allowed_postags = ['NOUN','ADJ','VERB','ADV']):
    text_rem = []
    for sent in text:
        doc = nlp(" ".join(sent))
        text_rem.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return text_rem 

In [332]:
words_no_stops = remove_stopwords(data_words)

word_bigrams = make_bigrams(words_no_stops)

data_lemma =  lemmatization(word_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [335]:
id2word = corpora.Dictionary(data_lemma)

In [336]:
texts = data_lemma

In [337]:
corpus = [id2word.doc2bow(text) for text in texts]

In [338]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word,num_topics=20,
                               random_state=42,
                               update_every=1,
                               chunksize=100,
                               passes=10,
                               alpha='auto',
                               per_word_topics=True)

In [339]:
doc_lda = lda_model[corpus]

In [340]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = coherencemodel.CoherenceModel(model=lda_model, texts=data_lemma, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.106306833828683

Coherence Score:  0.26450302724652314


In [341]:
pyLDAvis.gensim.prepare(lda_model,corpus, id2word)