In [25]:
import pandas as pd
import numpy as np

#Used to allow the browser to load. 
import time
from tqdm import tqdm

#used as a import for dateing the CSV
import datetime

#NLP libaries
import spacy
from spacy.lang.en import STOP_WORDS

import gensim
import pyLDAvis.gensim
from gensim.models import LdaModel
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import coherencemodel
pyLDAvis.enable_notebook()

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt

## Using Topic Modeling using Gensim
Objective: Setup for topic modeling and use LDA to determine feature importance

In [26]:
job_list = pd.read_csv('./data/2018-06-25_Data Scientist_Austin, TX job_list.csv')

In [39]:
nlp = spacy.load('en', disable=['parser', 'ner'])

In [40]:
pos_description = job_list['position_description'].values

In [41]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(pos_description),deacc=True))

In [42]:
data_words = list(sent_to_words(pos_description))

In [43]:
#Building bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=4, threshold=100)

trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

In [44]:
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [45]:
def remove_stopwords(text):
    return [[word for word in simple_preprocess(str(doc)) if word not in STOP_WORDS] for doc in text]

def make_bigrams(text):
    return [bigram_mod[doc] for doc in text]

def make_trigrams(text):
    return [trigram_mod[bigram_mod[doc]] for doc in text]

def lemmatization(text, allowed_postags = ['NOUN','ADJ','VERB','ADV']):
    text_rem = []
    for sent in text:
        doc = nlp(" ".join(sent))
        text_rem.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return text_rem 

In [46]:
remove_more_words = ['skill','datum','day','week','year','ability','analyst','recovery','austin','document','ago']

In [47]:
words_no_stops = remove_stopwords(data_words)

word_bigrams = make_bigrams(words_no_stops)

data_lemma =  lemmatization(word_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [48]:
data_lemma[0] = [word for word in data_lemma[0] if word not in remove_more_words]

In [49]:
id2word = corpora.Dictionary(data_lemma)

texts = data_lemma

corpus = [id2word.doc2bow(text) for text in texts]

In [50]:
data_lemma

[['client',
  'company',
  'use',
  'artificial',
  'intelligence',
  'machine',
  'learning',
  'technique',
  'build',
  'software',
  'system',
  'think',
  'adapt',
  'interact',
  'people',
  'quality',
  'assurancelead',
  'responsible',
  'define',
  'shaping',
  'testing',
  'methofology_alongside',
  'engineering',
  'leadership',
  'work',
  'engineering',
  'leadership',
  'shape',
  'define',
  'test',
  'methodology',
  'work',
  'closely',
  'solution',
  'architect',
  'project',
  'manager',
  'customer',
  'define',
  'work',
  'item',
  'acceptance',
  'criterion',
  'write',
  'execute',
  'test',
  'plan',
  'integrate',
  'test',
  'suite',
  'pipeline',
  'work',
  'closely',
  'machine',
  'learn',
  'engineer',
  'scientist',
  'craft',
  'testing',
  'methodology',
  'ai',
  'produce',
  'thoroughly',
  'detailed',
  'defect',
  'report',
  'participate',
  'code',
  'review',
  'necessary',
  'work',
  'understand',
  'code',
  'testing',
  'prepare',
  'sprin

In [51]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word,num_topics=10,
                               random_state=42,
                               update_every=1,
                               chunksize=100,
                               passes=10,
                               alpha='auto',
                               per_word_topics=False)

In [52]:
doc_lda = lda_model[corpus]

In [55]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = coherencemodel.CoherenceModel(model=lda_model, texts=data_lemma , dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.341735867717185

Coherence Score:  0.3090886691580874


In [56]:
pyLDAvis.gensim.prepare(lda_model,corpus, id2word)