## Job Web-Scraping
Note: This is intented for practice to extract real time job descriptions. Please adhere to the Robots.txt

In [180]:
#If you are using Selenium for the first time, please download the webdriver and note the filepath
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

import pandas as pd
import numpy as np
pd.set_option('display.max_colwidth', 100)  # increase the string length before truncating

# Monitor the status of the webscraping to get an idea of how long it will take 
import time
from tqdm import tqdm
tqdm.monitor_interval = 0

# Used as a import for dateing the CSV
import datetime

# NLP libaries
import spacy
from spacy.lang.en import stop_words

# Punctuation attribute will be used for removing punctuation
import string

# Import nltk for removing stopwords and lemmatizing
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn

import gensim
import pyLDAvis.gensim
from gensim.models import LdaModel
import gensim.corpora as corpora  # *****
from gensim.utils import simple_preprocess
from gensim.models import coherencemodel
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/guillermogonzalez/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
path_to_chromedriver = "/Users/guillermogonzalez/projects/webscraping/chromedriver"

In [3]:
!ls /Users/guillermogonzalez/projects/webscraping/

Webscraping_job_posts.ipynb chromedriver_mac64.zip
[31mchromedriver[m[m                [34mjob_topic_modeling[m[m


In [342]:
#setup browser window
browser = webdriver.Chrome(executable_path= path_to_chromedriver)

In [340]:
#adds the data in URL format by removing any white space and adding the data. 
def Keyword_to_url(kw):
    '''
    Stips any leading and trailing white space from a string and replaces spaces with'%20'.
    '''
    kw = kw.strip().replace(" ","%20")
    return kw

### URL Example
```https://www.careeronestop.org/Toolkit/Jobs/find-jobs.aspx?keyword=data%20analyst&ajax=0&location=austin,%20texas&radius=5&source=IN&pagesize=100&sortcolumns=accquisitiondate&sortdirections=DSC```
- occupation: keyword=data%20analyst  
- location: location=austin,%20texas  
- radius: radius=5  
- page_size: pagesize=100  
- page_sort: sortcolumns=accquisitiondate&sortdirections=DSC  
- source: source=IN  


<img width="983" alt="screen shot 2018-06-24 at 10 16 01 pm" src="https://user-images.githubusercontent.com/7989686/41828679-4cf946ce-77fc-11e8-8b80-6479d6865873.png">

In [217]:
# this will be used as the URL input for multiple job searches. 
occupation = 'data scientist' # str(input("Job you are looking for: "))
location = 'austin, texas' # str(input("Where do you want to look: "))
radius = str(5)
page_sort = 'DSC' # ASC (ascending) or DSC (descending) dates 
page_size = "100" # how many pages would you like to get. 
source = 'IN' # stands for indeed

#give it the site URL
url = 'https://www.careeronestop.org/Toolkit/Jobs/find-jobs.aspx?'+\
    'keyword='+Keyword_to_url(occupation)+\
    '&ajax=0&location='+Keyword_to_url(location)+\
    '&radius='+radius+\
    '&source='+ source +\
    '&pagesize='+page_size+\
    '&sortcolumns=accquisitiondate&sortdirections='+page_sort
browser.get(url)
browser.implicitly_wait(30) # tells WebDriver to poll the DOM (Document Object Model) 30s 
                            # for life of WebDriver object

# Default it 100 job listings for this page and this verifies that thera are 100 job titles
listing_num = len(browser.find_elements_by_xpath('//td[@data-title="Job Title"]'))

job_titles = browser.find_elements_by_xpath('//td[@data-title="Job Title"]') # list of job titles
company_titles = browser.find_elements_by_xpath('//td[@data-title="Company"]') # list of company titles 
locations = browser.find_elements_by_xpath('//td[@data-title="Location"]') # list of locations
dates = browser.find_elements_by_xpath('//td[@data-title="Date Posted"]') # list of dates posted

data = {}
data['company'] = []
data['date_posted'] = []
data['location'] = []
data['position'] = []
data['pos_link'] = []
data['pos_description'] = []

for job in tqdm(range(listing_num)):
    data['position'].append(job_titles[job].text) # position title
    data['company'].append(company_titles[job].text) # comapany title
    data['location'].append(locations[job].text) # job location
    data['date_posted'].append(dates[job].text) # date job posted
    # link to job description
    data['pos_link'].append(browser.find_element_by_link_text(job_titles[job].text).get_attribute('href')) 
    
for link in tqdm(data['pos_link']):
    browser.get(link) # open link to job description in the same window
    # append job description
    data['pos_description'].append(browser.find_element_by_xpath('//td[@class="snip"]').text.replace('\n', ' '))
    browser.back() # click the back button to return to original page

# transform the dictionary into a DataFrame    
df_ds = pd.DataFrame(data)
       

100%|██████████| 100/100 [00:11<00:00,  9.09it/s]
100%|██████████| 100/100 [02:19<00:00,  1.40s/it]


In [220]:
df_ds.shape

(100, 6)

# occupation = 'data scientist'  
> #### Scrubbing dataframe to identify titles more closely matched to data science and data analyst roles

In [242]:
# identifying all positions specific to data science or analysis
positions_ds = df_ds['position']
ds_positions = set()
for position in positions_ds:
    if position.lower().find('data scientist') > -1 \
        or position.lower().find('analyst') > -1\
        or position.lower().find('machine learning') > -1\
        or position.lower().find('ai') > -1\
        or position.lower().find('data science') > -1\
        or position.lower().find('analytics') > -1:
        ds_positions.add(position)    

In [243]:
# data science/analyst positions in this list
df_ds[ df_ds['position'].isin(ds_positions) ].shape

(45, 6)

In [244]:
df_ds[ df_ds['position'].isin(ds_positions) ]['position'].value_counts().index

Index(['Data Scientist', 'Senior Data Scientist',
       'Part Time Data Science Instructor',
       'Data Scientist/Modeling & Simulation Engineer',
       'Data Scientist – IoT Cloud Services',
       'Manager, Decision Analytics Services',
       'Air Quality Staff Engineer/Scientist', 'Data Scientist II',
       'Digital Analytics Consultant, Decision Analytics (Risk Analsyt/Modeler)',
       'Data Scientist, IdentityAI', 'Sr Data Scientist',
       'DATA SCIENTIST BL LAB',
       'Sr. Data Scientist/Modeling & Simulation Engineer Expert',
       'Associate Data Scientist',
       'Senior Business Analyst, Decision Analytics Services',
       'Digital Analytics Consultant, Decision Analytics (Senior Modeler)',
       'Senior Director, Data Science',
       'Data Analyst Disaster Recovery Implementation',
       'Senior Scientist, Data Science Architect',
       'Quantitative Marketing Analyst', 'Business Analyst',
       'Data Scientist/Cognitive Software Engineer',
       'Digital

In [245]:
# identifying non-data science or analysis positions
non_ds_positions = set()
for position in positions_ds:
    if position.lower().find('data scientist') == -1 \
        and position.lower().find('analyst') == -1\
        and position.lower().find('machine learning') == -1\
        and position.lower().find('ai') == -1\
        and position.lower().find('data science') == -1\
        and position.lower().find('analytics') == -1:
        non_ds_positions.add(position)    

In [246]:
# non-data science/analyst positions in this list
df_ds[ df_ds['position'].isin(non_ds_positions) ].shape

(55, 6)

In [247]:
df_ds[ df_ds['position'].isin(non_ds_positions) ]['position'].value_counts().index

Index(['Test Technician 1', 'Product Owner',
       'Billing Call Center Representative', 'Power BI Developer',
       'Medical Technologist', 'Software Developer', 'Research Scientist III',
       'Engineering Scientist - Sensor Systems Engineering Scientist',
       'Clinical Data Review Assistant', 'QA Lead',
       'Research Scientist - Sedimentary Petrographer',
       'Epigenetics Specialist', 'Director of Product Development',
       'Sr Big Data Engineer', 'Technical Account Manager',
       'Nokia IoT IMPACT platform summer intern',
       'Research Engineering/ Scientist Associate IV - 3D Geomodeler',
       'Data Engineer', 'Research Associate II',
       'Senior Scientist, Development',
       'Research Engineering/ Scientist Assistant',
       'Research Engineering/ Scientist Associate III',
       'Senior Environmental Project Manager',
       'Senior Environmental Scientist', 'Account Manager',
       'Security Specialist', 'SOFTWARE ENGINEER III (BLACK LOCUS)',
       '

In [269]:
print('{:.3}% of data are non-data science titles'.format(
                                (df_ds[ df_ds['position'].isin(non_ds_positions) ].shape[0]/len(df_ds))*100))

55.0% of data are non-data science titles


# occupation = 'data analyst'
> #### Scrubbing dataframe to identify titles more closely matched to data science and data analyst roles

In [343]:
# this will be used as the URL input for multiple job searches. 
occupation = 'data analyst' # str(input("Job you are looking for: "))
location = 'austin, texas' # str(input("Where do you want to look: "))
radius = str(5)
page_sort = 'DSC' # ASC (ascending) or DSC (descending) dates 
page_size = "100" # how many pages would you like to get. 
source = 'IN' # stands for indeed

#give it the site URL
url = 'https://www.careeronestop.org/Toolkit/Jobs/find-jobs.aspx?'+\
    'keyword='+Keyword_to_url(occupation)+\
    '&ajax=0&location='+Keyword_to_url(location)+\
    '&radius='+radius+\
    '&source='+ source +\
    '&pagesize='+page_size+\
    '&sortcolumns=accquisitiondate&sortdirections='+page_sort
browser.get(url)
browser.implicitly_wait(30)

# Default is 100 job listings for this page and this verifies that thera are 100 job titles
listing_num = len(browser.find_elements_by_xpath('//td[@data-title="Job Title"]'))

job_titles = browser.find_elements_by_xpath('//td[@data-title="Job Title"]')
company_titles = browser.find_elements_by_xpath('//td[@data-title="Company"]')
locations = browser.find_elements_by_xpath('//td[@data-title="Location"]')
dates = browser.find_elements_by_xpath('//td[@data-title="Date Posted"]')

data = {}
data['company'] = []
data['date_posted'] = []
data['location'] = []
data['position'] = []
data['pos_link'] = []
data['pos_description'] = []

for job in tqdm(range(listing_num)):
    data['position'].append(job_titles[job].text) # position title
    data['company'].append(company_titles[job].text) # comapany title
    data['location'].append(locations[job].text) # job location
    data['date_posted'].append(dates[job].text) # date job posted
    # link to job description
    data['pos_link'].append(browser.find_element_by_link_text(job_titles[job].text).get_attribute('href')) 
    
for link in tqdm(data['pos_link']):
    browser.get(link) # open link to job description in the same window
    # append job description
    data['pos_description'].append(browser.find_element_by_xpath('//td[@class="snip"]').text.replace('\n', ' '))
    browser.back() # click the back button to return to original page

# transform the dictionary into a DataFrame    
df = pd.DataFrame(data)

100%|██████████| 100/100 [00:09<00:00, 10.01it/s]
100%|██████████| 100/100 [02:08<00:00,  1.29s/it]


In [344]:
# identifying all positions specific to data science or analysis
positions_da = df['position']
da_positions = set()
for position in positions_da:
    if position.lower().find('data scientist') > -1 \
        or position.lower().find('analyst') > -1\
        or position.lower().find('machine learning') > -1\
        or position.lower().find('ai') > -1\
        or position.lower().find('data science') > -1\
        or position.lower().find('analytics') > -1:
        da_positions.add(position)    

In [150]:
# Need to come back to, but was making a function to drop rows that were not relavent job posts
def ds_or_da(dataframe):
    index_to_drop = []
    for position in dataframe['position']:
        if position.lower().find('data scientist') == -1 \
        and position.lower().find('analyst') == -1\
        and position.lower().find('machine learning') == -1\
        and position.lower().find('ai') == -1\
        and position.lower().find('data science') == -1\
        and position.lower().find('analytics') == -1:
            index_to_drop.append(position.index())  
        
    return index_to_drop
            

In [345]:
# data science/analyst positions in this list
df[ df['position'].isin(da_positions) ].shape

(98, 6)

In [351]:
# Sample of ru
df[ df['position'].isin(da_positions) ]['position'][:10]

0                                     Inventory Data Analyst
1                                      Regional Data Analyst
2                             Business Analyst - GIS Analyst
3                                      Food Curation Analyst
4                                    Data Scientist 4, Poker
5                                     Business Analyst 3-Ops
6    Data Analyst (Consumer Lending) - Analytic Consultant 4
7                                      Data Analyst with MDM
8                      Business Intelligence Analyst - Sales
9                      Business Intelligence Analyst - Sales
Name: position, dtype: object

In [347]:
# identifying non-data science or analysis positions
non_da_positions = set()
for position in positions_da:
    if position.lower().find('data scientist') == -1 \
        and position.lower().find('analyst') == -1\
        and position.lower().find('machine learning') == -1\
        and position.lower().find('ai') == -1\
        and position.lower().find('data science') == -1\
        and position.lower().find('analytics') == -1:
        non_da_positions.add(position)    

In [348]:
# there are 11 non-data science/analyst positions in this list
df[ df['position'].isin(non_da_positions) ].shape

(2, 6)

In [349]:
df[ df['position'].isin(non_da_positions) ]['position']

27      Data Engineer
87    Data Strategist
Name: position, dtype: object

In [352]:
print('{:.3}% of data are non-data science titles'.format((df[ df['position'].isin(non_da_positions) ]\
                                                            .shape[0]/len(df))*100))

2.0% of data are non-data science titles


In [401]:
df.head()

Unnamed: 0,company,date_posted,location,pos_description,pos_link,position
0,Four Hands,07/11/2018,"Austin, TX",Our Four Hands team is looking for an Inventory Data Analyst to join the Inventory Control team....,http://www.indeed.com/viewjob?jk=96ebe26a988ddb57&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6b...,Inventory Data Analyst
1,"GateHouse Media, Inc",07/11/2018,"Austin, TX",Summary Media is seeking an experienced Data Analyst to support the regional Consumer Revenue te...,http://www.indeed.com/viewjob?jk=fe39960c0ac7f34d&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6b...,Regional Data Analyst
2,University of Texas at Austin,07/10/2018,"Austin, TX","Purpose To maintain current, accurate and complete geolocation data for the built environment at...",http://www.indeed.com/viewjob?jk=f72ff2930f74a3d5&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6b...,Business Analyst - GIS Analyst
3,ZeroCater,07/10/2018,"Austin, TX","At ZeroCater, we believe food is one of the easiest, and most effective, ways to build culture i...",http://www.indeed.com/viewjob?jk=09a97835efc5e42f&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6b...,Food Curation Analyst
4,Zynga,07/10/2018,"Austin, TX","Zynga’s data science team uses our unique and expansive data to model and predict user behavior,...",http://www.indeed.com/viewjob?jk=cdc3c8a3a140f0c9&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6b...,"Data Scientist 4, Poker"


# Text Preprocessing

## Create function to REMOVE PUCTUATION

In [560]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

#### Function DOES NOT WORK with this dataset (see below why)

In [403]:
# sring of all punctuations
string.punctuation

def remove_punct(text):
    # only returns char not in the punctuation list then join the characters to look like the original document                                                                                  #  punctuation list
    text_nopunct = ''.join([char for char in text if char not in string.punctuation]) 
    return text_nopunct

# make a new colum of the descriptions with punctuation removed
df['docs_nopunct'] = df['pos_description'].apply(lambda x: remove_punct(x))

Noticed that there are still apostrophies in the new column???

In [404]:
# the apostrophe from the document does not match the puncutation string's apostophe; need to find another solution
print('You can see that \"{}\" from the punctuation string does not look like \"{}\" from the document and\
 this is what it like\n manually typed \"{}\"'.format(string.punctuation[6], df['docs_nopunct'][4][5],'\''))

You can see that "'" from the punctuation string does not look like "’" from the document and this is what it like
 manually typed "'"


#### Function WORKS correctly utilizing regex

In [405]:
import re

In [406]:
# Using regular expressions to remove punctuation
def remove_punct(text):
    '''
    Finds all word elements only and joins all the stings on ' '
    '''
    text_nopunct = ' '.join(re.findall('\w+', text)) # \w+ selects word elements only 
    return text_nopunct

df['docs_nopunct'] = df['pos_description'].apply(lambda x: remove_punct(x))

## Create a function to TOKENIZE the text

In [408]:
def tokenizer(text):
    docs_tokenized = re.split('\W+', text)
    return docs_tokenized   

df['docs_tokenized'] = df['docs_nopunct'].apply(lambda x: tokenizer(x.lower()))

## Create a function to REMOVE STOPWORDS

In [410]:
stopwords = nltk.corpus.stopwords.words('english')

In [411]:
def remove_stopwords(tokenized_text):
    docs_no_stopwords = [word for word in tokenized_text if word not in stopwords]
    return docs_no_stopwords     

df['docs_tokenized_nostopwords'] = df['docs_tokenized'].apply(lambda x: remove_stopwords(x))

## Create a function to REMOVE CRAP WORDS

In [421]:
crap_words = ['hour', 'hours', 'day', 'days', 'minutes', 'austin', 'tx', 'ago']

In [422]:
def remove_crap(text):
    docs_no_crap = [word for word in text if word not in crap_words]
    return docs_no_crap

df['docs_no_crap'] = df['docs_tokenized'].apply(lambda x: remove_crap(x))

## Create a function to LEMMATIZE the tokens

In [369]:
def word_lemmatizer(word):   
    lemma = wn.morphy(word)
    if lemma is None:
        return  word
    else:
        return lemma

In [370]:
def doc_lemmatizer(text):
    docs_lemmatized = [word_lemmatizer(word) for word in text]
    return docs_lemmatized

df['docs_lemmatized'] = df['docs_tokenized_nostopwords'].apply(lambda x: doc_lemmatizer(x))

## Job recommendations at bottom of webpages
 - At the end of every job description there is a "save job" link.  A function was made to call out the index of "save" in every job post in order to index the output of the text_processing function to only include everything before it.  
<img width="406" alt="screen shot 2018-07-12 at 1 16 00 am" src="https://user-images.githubusercontent.com/7989686/42616824-2ebf21c0-8575-11e8-9cb9-e2c0a6c037b1.png">

# Combining into a one TEXT PREPROCESSING FUNCTION 

In [618]:
stopwords = nltk.corpus.stopwords.words('english')
crap_words = ['hour', 'hours', 'day', 'days', 'minutes', 'austin', 'tx', 'ago', 'experience', 'team',
              'work', 'skill', 'understanding', 'ability', 'member', 'years', 'requirement', 'job', 'data',
              'analyst']

# function for wordnet lemmatizing a word returns the word if it's not in wordnet
#  this allows there to be a combination of wordnet lemmatized words and non-wordnet words in the doc
def word_lemmatizer(word):   
    lemma = wn.morphy(word) 
    if lemma is None:  # return the word if there is no lemma in WordNet for that word
        return  word
    else:
        return lemma
# Bottom of web page has job recommendations. This function identifies the index of 'save' at 
# the bottom of every job description
def save_index(text):
    save = text.index('save')
    return save

def text_preprocess(text):
    text_nopunct = ' '.join(re.findall('\w+', text))  # remove puncuation by only selecting word elements '\w+'
    docs_tokenized = re.split('\W+', text_nopunct)  # tokenize splitting on non-word elements '\W+'
    docs_no_stopwords = [word for word in docs_tokenized if word not in stopwords]  # remove stopwords
    docs_lemmatized = [word_lemmatizer(word) for word in docs_no_stopwords]  # wordnet lemmatizing
    docs_no_crapwords = [word for word in docs_lemmatized if word not in crap_words]
    return docs_no_crapwords[ : save_index(docs_no_crapwords)]  # return the list up to 'save'

In [None]:
def revert(df):
    try:
        return df.drop('pos_description_processed', axis=1, inplace=True)
    except:
        pass
revert(df)

In [620]:
# Adding a new column of the processed text
df['pos_description_processed'] = df['pos_description'].apply(lambda x: text_preprocess(x.lower()))

In [621]:
df.head()

Unnamed: 0,company,date_posted,location,pos_description,pos_link,position,pos_description_processed
0,Four Hands,07/11/2018,"Austin, TX",Our Four Hands team is looking for an Inventory Data Analyst to join the Inventory Control team....,http://www.indeed.com/viewjob?jk=96ebe26a988ddb57&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6b...,Inventory Data Analyst,"[four, hands, looking, inventory, join, inventory, control, role, ensure, location, properly, ma..."
1,"GateHouse Media, Inc",07/11/2018,"Austin, TX",Summary Media is seeking an experienced Data Analyst to support the regional Consumer Revenue te...,http://www.indeed.com/viewjob?jk=fe39960c0ac7f34d&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6b...,Regional Data Analyst,"[summary, medium, seeking, support, regional, consumer, revenue, role, responsible, gathering, a..."
2,University of Texas at Austin,07/10/2018,"Austin, TX","Purpose To maintain current, accurate and complete geolocation data for the built environment at...",http://www.indeed.com/viewjob?jk=f72ff2930f74a3d5&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6b...,Business Analyst - GIS Analyst,"[purpose, maintain, current, accurate, complete, geolocation, build, environment, university, te..."
3,ZeroCater,07/10/2018,"Austin, TX","At ZeroCater, we believe food is one of the easiest, and most effective, ways to build culture i...",http://www.indeed.com/viewjob?jk=09a97835efc5e42f&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6b...,Food Curation Analyst,"[zerocater, believe, food, one, easy, effective, ways, build, culture, office, make, easy, compa..."
4,Zynga,07/10/2018,"Austin, TX","Zynga’s data science team uses our unique and expansive data to model and predict user behavior,...",http://www.indeed.com/viewjob?jk=cdc3c8a3a140f0c9&qd=gMF-077E7aWVyicXRrUZA568_ZMafrHmjPUuefcLv6b...,"Data Scientist 4, Poker","[zynga, science, use, unique, expansive, model, predict, user, behavior, making, game, personali..."


### After reviewing the results using "data scientist" and "data analyst" there were more positive titles directly related to Data Science or Data Analytics using the "data analyst" keywords.

## Time for LDA

In [622]:
# Text data
texts = df['pos_description_processed']

In [623]:
# validate there that crap words are removed
count = 0
words = []
for word in texts[0]:
    if word in crap_words:
        count += 1
        words.append(word)
print(count)
print(words)

0
[]


In [624]:
dictionary = corpora.Dictionary(texts)

In [625]:
# Illustration of what doc2bow is doing. "data" appears 5 tims in this position
pd.concat([pd.Series(dictionary.doc2bow(texts[0])[:10]),pd.Series(texts[0][:10])], axis=1)

Unnamed: 0,0,1
0,"(0, 2)",four
1,"(1, 2)",hands
2,"(2, 1)",looking
3,"(3, 12)",inventory
4,"(4, 1)",join
5,"(5, 2)",inventory
6,"(6, 1)",control
7,"(7, 1)",role
8,"(8, 1)",ensure
9,"(9, 1)",location


In [626]:
# Validation that there are 5 instances of 'data' in the first job description
count = 0
for word in texts[0]:
    if word == 'data':
        count += 1
print('There are {} instances of \'{}\''.format(count, 'data'))

There are 0 instances of 'data'


In [627]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [628]:
# pickling the corpus object for 
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

## LDA with Gensim

### Training a 2 topic model

In [629]:
topic_count = 2
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                           num_topics=topic_count,
                                           id2word=dictionary,
                                           passes=15,
                                           random_state=42)
ldamodel.save('model2.gensim')


topics = ldamodel.print_topics(num_words=3)
for topic in topics:
    print(topic)

(0, '0.009*"analysis" + 0.008*"business" + 0.007*"marketing"')
(1, '0.020*"business" + 0.010*"process" + 0.008*"management"')


### Training a 5 topic model

In [630]:
topic_count = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                           num_topics=topic_count,
                                           id2word=dictionary,
                                           passes=15,
                                           random_state=42)
ldamodel.save('model5.gensim')


topics = ldamodel.print_topics(num_words=3)
for topic in topics:
    print(topic)

(0, '0.011*"analysis" + 0.010*"marketing" + 0.008*"customer"')
(1, '0.013*"business" + 0.008*"sales" + 0.007*"process"')
(2, '0.023*"business" + 0.014*"project" + 0.012*"process"')
(3, '0.021*"business" + 0.008*"software" + 0.006*"product"')
(4, '0.015*"sales" + 0.013*"business" + 0.007*"management"')


### Traing an 8 topic model

In [631]:
topic_count = 8
ldamodel = gensim.models.ldamodel.LdaModel(corpus=corpus, 
                                           num_topics=topic_count,
                                           id2word=dictionary,
                                           passes=15,
                                           random_state=42)
ldamodel.save('model8.gensim')


topics = ldamodel.print_topics(num_words=3)
for topic in topics:
    print(topic)

(0, '0.011*"marketing" + 0.009*"analysis" + 0.007*"customer"')
(1, '0.011*"sales" + 0.008*"business" + 0.008*"analysis"')
(2, '0.015*"business" + 0.013*"system" + 0.010*"knowledge"')
(3, '0.020*"business" + 0.009*"software" + 0.006*"drive"')
(4, '0.010*"text" + 0.010*"survey" + 0.007*"clustering"')
(5, '0.018*"business" + 0.010*"sales" + 0.010*"reporting"')
(6, '0.016*"investment" + 0.016*"market" + 0.009*"estate"')
(7, '0.027*"business" + 0.017*"process" + 0.014*"project"')


### MacOS online advice for clicking links
element.send_keys(Keys.CONTROL + Keys.RETURN)  
driver.switch_to.window(self.driver.window_handles[1])  
#### do stuff  
driver.close()  
driver.switch_to.window(self.driver.window_handles[0])  

## Using SpaCy for NLP - this is a Work in Progress (Eddie has not started this part yet)
Objective: Setup for topic modeling and use LDA to determine feature importance

In [324]:
parser = spacy.lang.en.English()
nlp = spacy.load('en')

In [325]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(pos_description),deacc=True))

In [326]:
data_words = list(sent_to_words(pos_description))

In [328]:
#Building bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)

trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

In [329]:
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [331]:
def remove_stopwords(text):
    return [[word for word in simple_preprocess(str(doc)) if word not in STOP_WORDS] for doc in text]

def make_bigrams(text):
    return [bigram_mod[doc] for doc in text]

def make_trigrams(text):
    return [trigram_mod[bigram_mod[doc]] for doc in text]

def lemmatization(text, allowed_postags = ['NOUN','ADJ','VERB','ADV']):
    text_rem = []
    for sent in text:
        doc = nlp(" ".join(sent))
        text_rem.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return text_rem 

In [332]:
words_no_stops = remove_stopwords(data_words)

word_bigrams = make_bigrams(words_no_stops)

data_lemma =  lemmatization(word_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [335]:
id2word = corpora.Dictionary(data_lemma)

In [336]:
texts = data_lemma

In [337]:
corpus = [id2word.doc2bow(text) for text in texts]

In [338]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word,num_topics=20,
                               random_state=42,
                               update_every=1,
                               chunksize=100,
                               passes=10,
                               alpha='auto',
                               per_word_topics=True)

In [339]:
doc_lda = lda_model[corpus]

In [340]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = coherencemodel.CoherenceModel(model=lda_model, texts=data_lemma, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.106306833828683

Coherence Score:  0.26450302724652314


In [341]:
pyLDAvis.gensim.prepare(lda_model,corpus, id2word)