In [None]:
import os
import re
from collections import Counter
from time import time

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from pymongo import MongoClient
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Notebook-wide plot styling: white background, poster-sized text, Set2 palette.
import seaborn as sns

sns.set_style(style="white")
sns.set_context(
    context="poster",
    font_scale=1.25,
    rc={"lines.linewidth": 2.5},
)
sns.set_palette(palette="Set2")

# Twelve-colour Set2 palette; the plotting cells further down rebind `colors`.
colors = sns.color_palette(palette='Set2', n_colors=12)

# Credentials

In [None]:
# Load credentials from a local text file.
# Expected layout (one ', '-separated record per line):
#   line 1: email, indeed_pw
#   line 2: username, pia_pw
#   line 3: pub_ip, mongo_usr, mongo_usr_pw
pw_file = 'pw.txt'

# Fail here with a clear message instead of skipping the read silently and
# letting the connection cell die later with a NameError on `pub_ip`.
if not os.path.exists(pw_file):
    raise FileNotFoundError(
        "Credentials file '{0}' not found; it must provide the MongoDB host, "
        "user and password used by the cells below.".format(pw_file))

with open(pw_file, 'r') as f:
    email, indeed_pw = f.readline().strip().split(', ')
    username, pia_pw = f.readline().strip().split(', ')
    pub_ip, mongo_usr, mongo_usr_pw = f.readline().strip().split(', ')

# Connect to DB

In [None]:
# Open a client connection to the MongoDB server on the EC2 host (port 27017).
mongo_address = '{0}:27017'.format(pub_ip)
client = MongoClient(mongo_address)

In [None]:
# Handle to the `resume_db` database on the connected client.
db = client['resume_db']

In [None]:
# Authenticate this database handle with the MongoDB user read from the credentials file.
# NOTE(review): `Database.authenticate` was removed in PyMongo 4.0 — confirm the
# installed PyMongo version, or pass the credentials to `MongoClient` instead.
db.authenticate(mongo_usr, mongo_usr_pw)

# Pull MongoDB into Dataframe

In [None]:
def read_mongo(db, collection, query=None, no_id=True):
    '''
    Load the documents of a MongoDB collection into a pandas DataFrame.

    db: mongodb already connected and authenticated
    collection: name of the desired collection in db
    query: query filter passed to find(); None (default) matches every document
    no_id: drop mongo's _id column (True) or keep it (False)
    return => pandas dataframe (empty when no document matches)
    '''
    # Build the filter per call rather than sharing one mutable default dict.
    if query is None:
        query = {}

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df = pd.DataFrame(list(cursor))

    # Drop the _id column. An empty result has no columns at all, so only
    # drop it when present instead of raising KeyError.
    if no_id and '_id' in df.columns:
        del df['_id']

    return df

In [None]:
# Time how long it takes to pull the `originals` collection into a dataframe.
t_start = time()
df = read_mongo(db, 'originals')
load_seconds = time() - t_start

print('Time to load data: {0}s'.format(load_seconds))

In [None]:
df.head(3)

In [None]:
print(list(df['search_term'].unique()))

# Second Pass - Clean Text

In [None]:
df['resume_clean'] = df['resume_text'].str.replace(':|;', '')
df['resume_clean'] = df['resume_clean'].str.replace('.', '')
df['resume_clean'] = df['resume_clean'].str.replace(',', '')

In [None]:
df.head()

# Remove StopWords

In [None]:
# cache stopwords first to reduce compute time
cachedStopWords = stopwords.words("english")
cachedStopWords += ['tot']

# convert all text to lower case and separate into list
df['resume_stopped'] = df['resume_clean'].str.lower().str.split()

# remove stopwords
df['resume_stopped'] = df['resume_stopped'].apply(lambda x: ' '.join([item for item in x if item not in cachedStopWords]))

In [None]:
df = df[['resume_text', 'resume_clean', 'resume_stopped']]
df.head()

# Word Frequency

In [None]:
raw = ' '.join(df['resume_text'].tolist()).split()
len(raw)

In [None]:
clean = ' '.join(df['resume_clean'].tolist()).split()
len(clean)

In [None]:
stp = ' '.join(df['resume_stopped'].tolist()).split()
len(stp)

# Stem With Text Blob

In [None]:
text = ' '.join(df['resume_stopped'].tolist())

# porter stemmer
port_stem = []
stemmer = PorterStemmer()

for word in TextBlob(text).words:
    port_stem.append(stemmer.stem(word))

In [None]:
# lancaster stemmer
lanc_stem = []
stemmer = LancasterStemmer()

for word in TextBlob(text).words:
    lanc_stem.append(stemmer.stem(word))

In [None]:
np = TextBlob(text).noun_phrases

In [None]:
# nouns only

port_noun = []
stemmer = PorterStemmer()


for word in TextBlob(text).noun_phrases:
    port_noun.append(stemmer.stem(word))

In [None]:
# get wordcount for stemmed corpus
# The name `stem` is never defined in this notebook (NameError on a fresh
# kernel); count the Porter-stemmed tokens built earlier. Swap in `lanc_stem`
# to chart the Lancaster stems instead.
wordct_stem = Counter(port_stem)

# limit wordcounts for visualization
wordct_stem = wordct_stem.most_common(60)
print(len(wordct_stem))

In [None]:
# Token frequencies for the un-stemmed (stopword-filtered) corpus,
# truncated to the 60 most frequent entries for plotting.
unstemmed_tokens = ' '.join(df['resume_stopped']).split(' ')
wordct = Counter(unstemmed_tokens).most_common(60)
print(len(wordct))

In [None]:
# Split the (term, count) pairs into parallel lists for the bar charts.

# un-stemmed terms and their counts
labels = [pair[0] for pair in wordct]
count = [pair[1] for pair in wordct]

# stemmed terms and their counts
labels_stem = [pair[0] for pair in wordct_stem]
count_stem = [pair[1] for pair in wordct_stem]

In [None]:
# Horizontal bar chart of the most common un-stemmed terms.
fig, ax = plt.subplots(figsize=(20, 20))

# one colour per bar
colors = sns.color_palette("BrBG", len(labels))

# bars
y_pos = np.arange(len(labels))
ax.barh(y_pos, count, align='center', color=colors, edgecolor=colors)
ax.set_ylim(-1, len(labels))

# labels/titles
# (no legend: the bars carry no labelled artists, so plt.legend() drew
# nothing and only emitted a warning)
ax.set_title('Un-Stemmed Word/Term Frequency')
ax.set_xlabel('Word/Term Count')
ax.set_ylabel('Word/Term')
ax.set_yticks(y_pos)
ax.set_yticklabels(labels)
ax.set_xticks(np.linspace(0, 180000, 13))  # fixed ticks, one every 15,000 counts

# remove border
for side in ("top", "right", "bottom", "left"):
    ax.spines[side].set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of the most common stemmed terms.
fig, ax = plt.subplots(figsize=(20, 20))

# one colour per bar
colors = sns.color_palette("BrBG", len(labels_stem))

# bars
y_pos = np.arange(len(labels_stem))
ax.barh(y_pos, count_stem, align='center', color=colors, edgecolor=colors)

# The axis limit and tick labels must come from the stemmed lists; using the
# un-stemmed `labels` here attached the wrong term to every bar.
ax.set_ylim(-1, len(labels_stem))

# labels/titles
# (no legend: the bars carry no labelled artists, so plt.legend() drew
# nothing and only emitted a warning)
ax.set_title('Stemmed Word/Term Frequency')
ax.set_xlabel('Word/Term Count')
ax.set_ylabel('Word/Term')
ax.set_yticks(y_pos)
ax.set_yticklabels(labels_stem)
ax.set_xticks(np.linspace(0, 180000, 13))  # fixed ticks, one every 15,000 counts

# remove border
for side in ("top", "right", "bottom", "left"):
    ax.spines[side].set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Per-resume document lists (one string per resume) for each processing stage.
text, clean, stopped = (
    df[stage].tolist()
    for stage in ('resume_text', 'resume_clean', 'resume_stopped')
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [None]:
x_text = vectorizer.fit_transform(text)
print('Raw resume text number of features: {0}'.format(len(vectorizer.get_feature_names())))

In [None]:
x_clean = vectorizer.fit_transform(clean)
print('Cleaned resume text number of features: {0}'.format(len(vectorizer.get_feature_names())))

In [None]:
x_stop = vectorizer.fit_transform(stopped)
print('Stopped resume text number of features: {0}'.format(len(vectorizer.get_feature_names())))

In [None]:
# Re-vectorize with scikit-learn's built-in English stop-word list.
# The documents go to fit_transform only: the constructor's first parameter
# is `input`, not the corpus, so passing `stopped` there was a mistake.
vectorizer = CountVectorizer(stop_words='english')
x_stop = vectorizer.fit_transform(stopped)

# feature count read from `vocabulary_` (`get_feature_names()` was removed in scikit-learn 1.2)
print('Stopped resume text number of features: {0}'.format(len(vectorizer.vocabulary_)))

In [None]:
vect_stopped = vectorizer.get_feature_names()

stemmer = nltk.stem.porter.PorterStemmer()
stem_vect_stop = []
for i in vect_stopped:
    stem_vect_stop.append(stemmer.stem(i))

In [None]:
len(set(stem_vect_stop))

# Stemmed

In [None]:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

In [None]:
df['stem'] = df['resume_stopped'].apply(lambda x: stemmer.stem(x))
#df['stem'] = df['resume_stopped'].str.split(' ')

In [None]:
df.head()

In [None]:
# get wordcounts
wordcount = Counter(' '.join(df['resume_stopped']).split(' '))

# limit wordcounts for visualization
wordcount = wordcount.most_common(30)

In [None]:
labels = [lbl for lbl, ct in wordcount]
count = [ct for lbl, ct in wordcount]

In [None]:
# Horizontal bar chart of the most common terms.
fig, ax = plt.subplots(figsize=(20, 20))

# one colour per bar
colors = sns.color_palette("BrBG", len(labels))

# bars
y_pos = np.arange(len(labels))
ax.barh(y_pos, count, align='center', color=colors, edgecolor=colors)

# Upper limit follows the number of bars rather than a hard-coded 30, so the
# axis stays right if the most_common() cut-off changes.
ax.set_ylim(-1, len(labels))

# labels/titles
# (no legend: the bars carry no labelled artists, so plt.legend() drew
# nothing and only emitted a warning)
ax.set_title('Word/Term Frequency')
ax.set_xlabel('Word/Term Count')
ax.set_ylabel('Word/Term')
ax.set_yticks(y_pos)
ax.set_yticklabels(labels)
ax.set_xticks(np.linspace(0, 180000, 13))  # fixed ticks, one every 15,000 counts

# remove border
for side in ("top", "right", "bottom", "left"):
    ax.spines[side].set_visible(False)

fig.tight_layout()
plt.show()

# Word Frequency Distribution

In [None]:
# Frequency distribution over individual words. Iterating the column
# directly yields whole resumes, which made every document a single "word";
# each resume is split into tokens first.
fd = nltk.FreqDist(
    word.lower()
    for doc in df['resume_stopped']
    for word in doc.split()
)
fd

# N-Grams Count Vectorizer

In [None]:
# Settings for the n-gram count vectorizer and the LDA topic model below.
n_samples = 2000  # not referenced by any other cell visible in this notebook
n_features = 1000  # passed as max_features to the n-gram CountVectorizer
n_topics = 10  # number of topics requested from LatentDirichletAllocation
n_top_words = 20  # not referenced by the visible cells (print_top_words is called with 12)

In [None]:
# Build the n-gram (unigram to trigram) count matrix from the raw resume text
# and report how long it takes.
t_start = time()

# convert resume texts to a sparse matrix of token counts
ct_vect = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.90,
    max_features=n_features,
)
ct_vect_prep = ct_vect.fit_transform(df['resume_text'])

vectorize_seconds = time() - t_start
print('Time to count vectorize data: {0:.4}s'.format(vectorize_seconds))

# Latent Dirichlet Allocation

In [None]:
# LDA topic model fitted with the online learning method on the n-gram count matrix.
# `n_components` replaces the `n_topics` keyword, which scikit-learn
# deprecated in 0.19 and removed in 0.21.
lda_mdl = LatentDirichletAllocation(n_components=n_topics, max_iter=5, learning_method='online',
                                    learning_offset=50., random_state=0)

t_start = time()

lda_mdl.fit(ct_vect_prep)

# report the model-fitting time (the message used to say "count vectorize")
print('Time to fit LDA model: {0:.4}s'.format(time() - t_start))

In [None]:
print("Topics in LDA model:")

# Feature names are the n-gram vocabulary of the count vectorizer (the terms
# the topics are built from, not the topics themselves), ordered by column
# index. Read from `vocabulary_` because `get_feature_names()` was removed in
# scikit-learn 1.2.
feat_names = sorted(ct_vect.vocabulary_, key=ct_vect.vocabulary_.get)

print('Start of list: ' + ', '.join(feat_names[:20]))
print('End of list: ' + ', '.join(feat_names[-10:]))

# Get Top Words in Topics

In [None]:
def print_top_words(model, feature_names, top_words):
    """Print the highest-weighted features of every topic in a fitted model.

    model: object exposing `components_`, iterated as one weight vector per topic
    feature_names: sequence mapping a feature index to its name
    top_words: how many features to list for each topic
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it to put the largest weights first
        ranked = weights.argsort()[::-1][:top_words]
        header = "Topic {0}:".format(topic_idx)
        names = (feature_names[idx] for idx in ranked)
        print(header)
        print(", ".join(names))
    # single blank line after the last topic
    print()

In [None]:
# Show the 12 highest-weighted vocabulary terms for each LDA topic.
print_top_words(lda_mdl, feat_names, 12)

# TF-IDF

In [None]:
# TF-IDF vectorizer built with the settings listed in the scikit-learn
# signature that was pasted here verbatim. The pasted text was not valid
# Python (`dtype=<class ...>`, trailing `[source]¶`), so it is rewritten as a
# real constructor call.
# - token_pattern is a raw string so `\b` stays a regex word boundary rather
#   than becoming a backspace character.
# - dtype is left at the library default.
tfidf_vect = TfidfVectorizer(
    input='content',
    encoding='utf-8',
    decode_error='strict',
    strip_accents=None,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    analyzer='word',
    stop_words=None,
    token_pattern=r'(?u)\b\w\w+\b',
    ngram_range=(1, 1),
    max_df=1.0,
    min_df=1,
    max_features=None,
    vocabulary=None,
    binary=False,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)
tfidf_vect