# Job Web-Scraping
Note: This is intented for practice to extract real time job descriptions. Please adhere to the Robots.txt

In [134]:
#If you are using Selenium for the first time, please download the webdriver and note the filepath
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 100)  # increase the string length before truncating

# Monitor the status of the webscraping to get an idea of how long it will take 
import time
from tqdm import tqdm
tqdm.monitor_interval = 0

# Used as a import for dateing the CSV
import datetime

# NLP libaries
import spacy
from spacy.lang.en import stop_words

# Punctuation attribute will be used for removing punctuation
import string

# Import nltk for removing stopwords and lemmatizing
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

import gensim
import pyLDAvis.gensim
from gensim.models import LdaModel
import gensim.corpora as corpora  # *****
from gensim.utils import simple_preprocess
from gensim.models import coherencemodel
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import datetime as dt

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/guillermogonzalez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
path_to_chromedriver = "/Users/guillermogonzalez/projects/webscraping/chromedriver"

In [3]:
!ls /Users/guillermogonzalez/projects/webscraping/

[31mchromedriver[m[m           chromedriver_mac64.zip [34mjob_topic_modeling[m[m


### URL Example
```https://www.careeronestop.org/Toolkit/Jobs/find-jobs.aspx?keyword=data%20analyst&ajax=0&location=austin,%20texas&radius=5&source=IN&pagesize=100&sortcolumns=accquisitiondate&sortdirections=DSC```
- occupation: keyword=data%20analyst  
- location: location=austin,%20texas  
- radius: radius=5  
- page_size: pagesize=100  
- page_sort: sortcolumns=accquisitiondate&sortdirections=DSC  
- source: source=IN  


<img width="983" alt="screen shot 2018-06-24 at 10 16 01 pm" src="https://user-images.githubusercontent.com/7989686/41828679-4cf946ce-77fc-11e8-8b80-6479d6865873.png">

In [174]:
#setup browser window
browser = webdriver.Chrome(executable_path= path_to_chromedriver)

#adds the data in URL format by removing any white space and adding the data. 
def Keyword_to_url(kw):
    '''
    Stips any leading and trailing white space from a string and replaces spaces with'%20'.
    '''
    kw = kw.strip().replace(" ","%20")
    return kw

# this will be used as the URL input for multiple job searches. 
occupation = 'data scientist' # str(input("Job you are looking for: "))
location = 'seattle, wa' # str(input("Where do you want to look: "))
radius = str(5)
page_sort = 'DSC' # ASC (ascending) or DSC (descending) dates 
page_size = "100" # how many pages would you like to get. 
source = 'IN' # stands for indeed

#give it the site URL
url = 'https://www.careeronestop.org/Toolkit/Jobs/find-jobs.aspx?'+\
    'keyword='+Keyword_to_url(occupation)+\
    '&ajax=0&location='+Keyword_to_url(location)+\
    '&radius='+radius+\
    '&source='+ source +\
    '&pagesize='+page_size+\
    '&sortcolumns=accquisitiondate&sortdirections='+page_sort
browser.get(url)
browser.implicitly_wait(30) # tells WebDriver to poll the DOM (Document Object Model) 30s 
                            # for life of WebDriver object

# Default it 100 job listings for this page and this verifies that thera are 100 job titles
listing_num = len(browser.find_elements_by_xpath('//td[@data-title="Job Title"]'))

job_titles = browser.find_elements_by_xpath('//td[@data-title="Job Title"]') # list of job titles
company_titles = browser.find_elements_by_xpath('//td[@data-title="Company"]') # list of company titles 
locations = browser.find_elements_by_xpath('//td[@data-title="Location"]') # list of locations
dates = browser.find_elements_by_xpath('//td[@data-title="Date Posted"]') # list of dates posted

data = {}
data['company'] = []
data['date_posted'] = []
data['location'] = []
data['position'] = []
data['pos_link'] = []
data['pos_description'] = []

for job in tqdm(range(listing_num)):
    data['position'].append(job_titles[job].text) # position title
    data['company'].append(company_titles[job].text) # comapany title
    data['location'].append(locations[job].text) # job location
    data['date_posted'].append(dates[job].text) # date job posted
    # link to job description
    data['pos_link'].append(browser.find_element_by_link_text(job_titles[job].text).get_attribute('href')) 
    
for link in tqdm(data['pos_link']):
    browser.get(link) # open link to job description in the same window
    # append job description
    data['pos_description'].append(browser.find_element_by_xpath('//td[@class="snip"]').text.replace('\n', ' '))
    browser.back() # click the back button to return to original page

# transform the dictionary into a DataFrame    
df = pd.DataFrame(data)



100%|██████████| 100/100 [00:11<00:00,  8.60it/s]
100%|██████████| 100/100 [01:55<00:00,  1.16s/it]


In [169]:
df.shape

(100, 6)

# occupation = 'data scientist'  
> #### Scrubbing dataframe to identify titles more closely matched to data science and data analyst roles

In [14]:
# identifying all positions specific to data science or analysis
positions_ds = df['position']
ds_positions = set()
for position in positions_ds:
    if position.lower().find('data scientist') > -1 \
        or position.lower().find('analyst') > -1\
        or position.lower().find('machine learning') > -1\
        or position.lower().find('ai') > -1\
        or position.lower().find('data science') > -1\
        or position.lower().find('analytics') > -1:
        ds_positions.add(position)    

In [20]:
df.columns.tolist()

['company',
 'date_posted',
 'location',
 'pos_description',
 'pos_link',
 'position']

In [15]:
# data science/analyst positions in this list
df[ df['position'].isin(ds_positions) ].shape

(46, 6)

In [22]:
df[ df['position'].isin(ds_positions) ][['company','position']][:5]

Unnamed: 0,company,position
0,Keller Williams,Machine Learning Engineer
2,Avanade,Sr. AI Lead/Data Scientist
3,General Motors,AI/ML Scientist
4,"Advanced Micro Devices, Inc.",Data Scientist / Operations Research Engineer
10,Zynga,"Data Scientist 4, Poker"


In [23]:
# identifying non-data science or analysis positions
non_ds_positions = set()
for position in positions_ds:
    if position.lower().find('data scientist') == -1 \
        and position.lower().find('analyst') == -1\
        and position.lower().find('machine learning') == -1\
        and position.lower().find('ai') == -1\
        and position.lower().find('data science') == -1\
        and position.lower().find('analytics') == -1:
        non_ds_positions.add(position)    

In [25]:
# non-data science/analyst positions in this list
df[ df['position'].isin(non_ds_positions) ].shape

(54, 6)

In [26]:
df[ df['position'].isin(non_ds_positions) ][['company','position']][:5]

Unnamed: 0,company,position
1,ICF,Social Scientist
5,City of Austin,Environmental Scientist Senior (Temporary)
6,Cyber Warrior Network,Front End Web Developer
7,Siemens AG,"Engineering and Development Manager - Autonomous Buildings - Building Technologies - Austin, TX"
8,Black & Veatch,Water Resources Planner / Scientist - Water Business


In [30]:
print('{:.3}% of job titles are non-data science/analysis type titles'.format(
                                (df[ df['position'].isin(non_ds_positions) ].shape[0]/len(df))*100))

54.0% of job titles are non-data science/analysis type titles


# Text Preprocessing

## Create function to REMOVE PUCTUATION

In [560]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

#### Function DOES NOT WORK with this dataset (see below why)

In [84]:
# sring of all punctuations
string.punctuation

def remove_punct(text):
    # only returns char not in the punctuation list then join the characters to look like the original document                                                                                  #  punctuation list
    text_nopunct = ''.join([char for char in text if char not in string.punctuation]) 
    return text_nopunct

# make a new colum of the descriptions with punctuation removed
df['docs_nopunct'] = df['pos_description'].apply(lambda x: remove_punct(x))

Noticed that there are still apostrophies in the new column???

In [85]:
# the apostrophe from the document does not match the puncutation string's apostophe; need to find another solution
print('You can see that \"{}\" from the punctuation string does not look like \"{}\" from the document and\
 this is what it like\n manually typed \"{}\"'.format(string.punctuation[6], df['docs_nopunct'][4][5],'\''))

You can see that "'" from the punctuation string does not look like "y" from the document and this is what it like
 manually typed "'"


In [89]:
df['docs_nopunct'][0][8]

'’'

In [93]:
print('Typed apostrophe: [{}]\nString.puncuation apostrophe: [{}]\nDocument apostrophe: [{}]'.\
      format('\'',string.punctuation[6], df['docs_nopunct'][0][8]))

Typed apostrophe: [']
String.puncuation apostrophe: [']
Document apostrophe: [’]


#### Function WORKS correctly utilizing regex

In [405]:
import re

In [406]:
# Using regular expressions to remove punctuation
def remove_punct(text):
    '''
    Finds all word elements only and joins all the stings on ' '
    '''
    text_nopunct = ' '.join(re.findall('\w+', text)) # \w+ selects word elements only 
    return text_nopunct

df['docs_nopunct'] = df['pos_description'].apply(lambda x: remove_punct(x))

## Create a function to TOKENIZE the text

In [408]:
def tokenizer(text):
    docs_tokenized = re.split('\W+', text)
    return docs_tokenized   

df['docs_tokenized'] = df['docs_nopunct'].apply(lambda x: tokenizer(x.lower()))

## Create a function to REMOVE STOPWORDS

In [410]:
stopwords = nltk.corpus.stopwords.words('english')

In [411]:
def remove_stopwords(tokenized_text):
    docs_no_stopwords = [word for word in tokenized_text if word not in stopwords]
    return docs_no_stopwords     

df['docs_tokenized_nostopwords'] = df['docs_tokenized'].apply(lambda x: remove_stopwords(x))

## Create a function to LEMMATIZE the tokens

In [369]:
def word_lemmatizer(word):   
    lemma = wn.morphy(word)
    if lemma is None:
        return  word
    else:
        return lemma

In [370]:
def doc_lemmatizer(text):
    docs_lemmatized = [word_lemmatizer(word) for word in text]
    return docs_lemmatized

df['docs_lemmatized'] = df['docs_tokenized_nostopwords'].apply(lambda x: doc_lemmatizer(x))

## Create a function to REMOVE CRAP WORDS

In [None]:
crap_words = ['hour', 'hours', 'day', 'days', 'minutes', 'austin', 'tx', 'ago']

In [None]:
def remove_crap(text):
    docs_no_crap = [word for word in text if word not in crap_words]
    return docs_no_crap

df['docs_no_crap'] = df['docs_tokenized'].apply(lambda x: remove_crap(x))

## Job recommendations at bottom of webpages
 - At the end of every job description there is a "save job" link.  A function was made to call out the index of "save" in every job post in order to index the output of the text_processing function to only include everything before it.  
<img width="406" alt="screen shot 2018-07-12 at 1 16 00 am" src="https://user-images.githubusercontent.com/7989686/42616824-2ebf21c0-8575-11e8-9cb9-e2c0a6c037b1.png">

# Combining into a one TEXT PREPROCESSING FUNCTION 

In [34]:
import re

In [175]:
stopwords = nltk.corpus.stopwords.words('english')
crap_words = ['hour', 'hours', 'day', 'days', 'minutes', 'austin', 'tx', 'ago', 'experience', 'team',
              'work', 'skill', 'understanding', 'ability', 'member', 'years', 'requirement', 'job', 
              'knowledge', 'require', 'include', 'provide']

# function for wordnet lemmatizing a word returns the word if it's not in wordnet
#  this allows there to be a combination of wordnet lemmatized words and non-wordnet words in the doc
def word_lemmatizer(word):   
    lemma = wn.morphy(word) 
    if lemma is None:  # return the word if there is no lemma in WordNet for that word
        return  word
    else:
        return lemma
# Bottom of web page has job recommendations. This function identifies the index of 'save' at 
# the bottom of every job description
def save_index(text):
    save = text.index('save')
    return save

def text_preprocess(text):
    text_nopunct = ' '.join(re.findall('\w+', text))  # remove puncuation by only selecting word elements '\w+'
    docs_tokenized = re.split('\W+', text_nopunct)  # tokenize splitting on non-word elements '\W+'
    docs_no_stopwords = [word for word in docs_tokenized if word not in stopwords]  # remove stopwords
    docs_lemmatized = [word_lemmatizer(word) for word in docs_no_stopwords]  # wordnet lemmatizing
    docs_no_crapwords = [word for word in docs_lemmatized if word not in crap_words]  # remove crap words
    return docs_no_crapwords[ : save_index(docs_no_crapwords)]  # return the list up to 'save'

In [61]:
# revert the dataframe to teh pre-application of the text_preprocess function to tweak some optimizations
def revert(dataframe):
    try:
        return dataframe.drop('pos_description_processed', axis=1, inplace=True)
    except:
        pass
revert(df)

In [176]:
# Adding a new column of the processed text
df['pos_description_processed'] = df['pos_description'].apply(lambda x: text_preprocess(x.lower()))

In [177]:
df.head()

Unnamed: 0,company,date_posted,location,pos_description,pos_link,position,pos_description_processed
0,Projectline,07/18/2018,"Seattle, WA",Are you passionate about learning? Do you have a passion for mixing Bioinformatics with Technolo...,https://www.indeed.com/viewjob?jk=357cb05690d1e35e&qd=gMF-077E7aWVyicXRrUZA9egzeZ3T2dBgeDfNnaoLk...,Bioinformatics Scientist / Data Scientist,"[passionate, learning, passion, mixing, bioinformatics, technology, hand, learner, position, foc..."
1,"Ocean Associates ,Inc.",07/18/2018,"Seattle, WA","Ocean Associates, Inc. (OAI) is seeking a Scientist to provide support to the NOAA National Mari...",https://www.indeed.com/viewjob?jk=b0aa5070b8912bae&qd=gMF-077E7aWVyicXRrUZA9egzeZ3T2dBgeDfNnaoLk...,Scientist - Integration of Chinook Salmon Catch Data,"[ocean, associate, inc, oai, seeking, scientist, support, noaa, national, marine, fishery, servi..."
2,Amazon.com,07/18/2018,"Seattle, WA",Job Description Amazon Go is a new kind of physical store with no lines and no checkout—you just...,https://www.indeed.com/viewjob?jk=2d83956af9659715&qd=gMF-077E7aWVyicXRrUZA9egzeZ3T2dBgeDfNnaoLk...,"Data Engineering Manager, Amazon Go","[description, amazon, go, new, kind, physical, store, line, checkout, grab, go, customer, simply..."
3,Leafly Holdings,07/18/2018,"Seattle, WA",HELP US MAKE HISTORY. Millions trust Leafly to help them find unbiased information about nearby ...,https://www.indeed.com/viewjob?jk=8111eed9aadaea12&qd=gMF-077E7aWVyicXRrUZA9egzeZ3T2dBgeDfNnaoLk...,Lead Data Engineer,"[help, us, make, history, million, trust, leafly, help, find, unbiased, information, nearby, str..."
4,Zume Inc.,07/17/2018,"Seattle, WA",WHO WE ARE Zume is on a quest to be the most powerful source of health and well-being on the pl...,https://www.indeed.com/viewjob?jk=542d3a666a7572b8&qd=gMF-077E7aWVyicXRrUZA9egzeZ3T2dBgeDfNnaoLk...,Data Scientist,"[zume, quest, powerful, source, health, well, planet, start, partner, people, technology, create..."


In [178]:
df.to_csv('../data/{}_{}_{}.csv'.format( dt.date.today(), ''.join(re.findall('\w', occupation)), ''.join(re.findall('\w', location)) ))

### After reviewing the results using "data scientist" and "data analyst" there were more positive titles directly related to Data Science or Data Analytics using the "data analyst" keywords.

## Time for LDA

In [64]:
# Text data
texts = df['pos_description_processed']

In [65]:
# validating that all crap words are removed
count = 0
words = []
for word in texts[0]:
    if word in crap_words:
        count += 1
        words.append(word)
print(count)
print(words)

0
[]


In [66]:
dictionary = corpora.Dictionary(texts)

In [67]:
# Illustration of what doc2bow is doing. "data" appears 5 tims in this position
pd.concat([pd.Series(dictionary.doc2bow(texts[0])[:10]),pd.Series(texts[0][:10])], axis=1)

Unnamed: 0,0,1
0,"(0, 1)",call
1,"(1, 7)",machine
2,"(2, 8)",learning
3,"(3, 6)",engineer
4,"(4, 3)",kwri
5,"(5, 2)",headquarters
6,"(6, 1)",remote
7,"(7, 1)",vote
8,"(8, 2)",one
9,"(9, 1)",best


In [68]:
# Validation that there are 5 instances of 'data' in the first job description
def word_count(word):
    count = 0
    for text in texts[0]:
        if text == word:
            count += 1
    return print('There are {} instances of \'{}\''.format(count, word))

In [69]:
word_count('call')
word_count('machine')
word_count('engineer')

There are 1 instances of 'call'
There are 7 instances of 'machine'
There are 6 instances of 'engineer'


In [70]:
# defining the corpus variable
corpus = [dictionary.doc2bow(text) for text in texts]

In [71]:
# pickling the corpus object and save the dicitonary for plugging back into training different topic count models
import pickle
pickle.dump(corpus, open('corpus_ds.pkl', 'wb'))
dictionary.save('dictionary_ds.gensim')

## LDA with Gensim

### Training a 2 topic model

In [72]:
topic_count = 2
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                           num_topics=topic_count,
                                           id2word=dictionary,
                                           passes=15,
                                           random_state=42)
ldamodel.save('model2_ds.gensim')


topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.014*"data" + 0.006*"business" + 0.006*"project" + 0.006*"system"')
(1, '0.024*"data" + 0.008*"business" + 0.006*"learning" + 0.005*"client"')


### Training a 3 topic model

In [73]:
topic_count = 3
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                           num_topics=topic_count,
                                           id2word=dictionary,
                                           passes=15,
                                           random_state=42)
ldamodel.save('model3_ds.gensim')


topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.009*"data" + 0.008*"project" + 0.007*"system" + 0.007*"position"')
(1, '0.019*"data" + 0.006*"icf" + 0.005*"business" + 0.005*"development"')
(2, '0.026*"data" + 0.013*"business" + 0.006*"science" + 0.006*"drive"')


### Training a 4 topic model

In [74]:
topic_count = 4
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                           num_topics=topic_count,
                                           id2word=dictionary,
                                           passes=15,
                                           random_state=42)
ldamodel.save('model4_ds.gensim')


topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.008*"project" + 0.008*"system" + 0.008*"position" + 0.008*"data"')
(1, '0.019*"data" + 0.007*"customer" + 0.006*"business" + 0.005*"client"')
(2, '0.025*"data" + 0.014*"business" + 0.007*"product" + 0.006*"drive"')
(3, '0.022*"data" + 0.008*"icf" + 0.007*"development" + 0.006*"business"')


### Training a 5 topic model

In [75]:
topic_count = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                           num_topics=topic_count,
                                           id2word=dictionary,
                                           passes=15,
                                           random_state=42)
ldamodel.save('model5_ds.gensim')


topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.008*"position" + 0.008*"system" + 0.008*"data" + 0.007*"project"')
(1, '0.012*"water" + 0.011*"customer" + 0.010*"software" + 0.006*"user"')
(2, '0.016*"data" + 0.012*"business" + 0.011*"client" + 0.010*"product"')
(3, '0.016*"data" + 0.010*"icf" + 0.006*"business" + 0.006*"development"')
(4, '0.032*"data" + 0.011*"business" + 0.008*"learning" + 0.007*"analytics"')


### Training an 8 topic model

In [76]:
topic_count = 8
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                           num_topics=topic_count,
                                           id2word=dictionary,
                                           passes=15,
                                           random_state=42)
ldamodel.save('model8_ds.gensim')


topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.011*"system" + 0.010*"position" + 0.008*"project" + 0.007*"research"')
(1, '0.017*"software" + 0.009*"customer" + 0.008*"development" + 0.007*"application"')
(2, '0.012*"client" + 0.012*"product" + 0.009*"business" + 0.008*"digital"')
(3, '0.016*"data" + 0.014*"icf" + 0.007*"opportunity" + 0.007*"research"')
(4, '0.032*"data" + 0.012*"business" + 0.007*"analysis" + 0.007*"learning"')
(5, '0.006*"support" + 0.006*"real" + 0.006*"dell" + 0.006*"product"')
(6, '0.019*"data" + 0.009*"project" + 0.006*"company" + 0.005*"field"')
(7, '0.021*"data" + 0.011*"development" + 0.010*"design" + 0.009*"software"')


# Visualizing the LDA models

In [77]:
import pyLDAvis.gensim

### 3 Topic Visalization

In [78]:
dictionary = gensim.corpora.Dictionary.load('dictionary_ds.gensim')
corpus = pickle.load(open(file='corpus_ds.pkl', mode='rb'))  # 'rb' --> Read Bytes
lda = gensim.models.ldamodel.LdaModel.load('model3_ds.gensim')

In [79]:
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(lda_display)

### 4 Topic Visualization

In [80]:
dictionary = gensim.corpora.Dictionary.load('dictionary_ds.gensim')
corpus = pickle.load(open(file='corpus_ds.pkl', mode='rb'))  # 'rb' --> Read Bytes
lda = gensim.models.ldamodel.LdaModel.load('model4_ds.gensim')

In [81]:
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(lda_display)

## 5 Topic Visualization

In [82]:
dictionary = gensim.corpora.Dictionary.load('dictionary_ds.gensim')
corpus = pickle.load(open(file='corpus_ds.pkl', mode='rb'))  # 'rb' --> Read Bytes
lda = gensim.models.ldamodel.LdaModel.load('model5_ds.gensim')

In [83]:
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(lda_display)

# ------------------THIS IS WHERE EDDIE IS UP TO --------------------------------

### MacOS online advice for clicking links
element.send_keys(Keys.CONTROL + Keys.RETURN)  
driver.switch_to.window(self.driver.window_handles[1])  
#### do stuff  
driver.close()  
driver.switch_to.window(self.driver.window_handles[0])  

## Using SpaCy for NLP - this is a Work in Progress (Eddie has not started this part yet)
Objective: Setup for topic modeling and use LDA to determine feature importance

In [324]:
parser = spacy.lang.en.English()
nlp = spacy.load('en')

In [325]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(pos_description),deacc=True))

In [326]:
data_words = list(sent_to_words(pos_description))

In [328]:
#Building bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)

trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

In [329]:
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [331]:
def remove_stopwords(text):
    return [[word for word in simple_preprocess(str(doc)) if word not in STOP_WORDS] for doc in text]

def make_bigrams(text):
    return [bigram_mod[doc] for doc in text]

def make_trigrams(text):
    return [trigram_mod[bigram_mod[doc]] for doc in text]

def lemmatization(text, allowed_postags = ['NOUN','ADJ','VERB','ADV']):
    text_rem = []
    for sent in text:
        doc = nlp(" ".join(sent))
        text_rem.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return text_rem 

In [332]:
words_no_stops = remove_stopwords(data_words)

word_bigrams = make_bigrams(words_no_stops)

data_lemma =  lemmatization(word_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [335]:
id2word = corpora.Dictionary(data_lemma)

In [336]:
texts = data_lemma

In [337]:
corpus = [id2word.doc2bow(text) for text in texts]

In [338]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word,num_topics=20,
                               random_state=42,
                               update_every=1,
                               chunksize=100,
                               passes=10,
                               alpha='auto',
                               per_word_topics=True)

In [339]:
doc_lda = lda_model[corpus]

In [340]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = coherencemodel.CoherenceModel(model=lda_model, texts=data_lemma, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.106306833828683

Coherence Score:  0.26450302724652314


In [341]:
pyLDAvis.gensim.prepare(lda_model,corpus, id2word)