In [None]:
import os
import pickle
import re
from collections import Counter
from time import time

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from pymongo import MongoClient
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Default plot styling changes: white background, poster-sized context, Set2 palette.
import seaborn as sns
sns.set_style("white")
sns.set_context("poster", font_scale=1.25, rc={"lines.linewidth": 2.5})
sns.set_palette("Set2")
# 12-colour Set2 palette kept in a notebook-level name
colors = sns.color_palette('Set2',12)

# Credentials

In [None]:
# Read service credentials from a local file; each of its first three lines is
# split on ', ' into the names assigned below.
pw_file = 'credentials/pw.txt'
if not os.path.exists(pw_file):
    # Fail here with a clear message instead of a NameError on `pub_ip` in a later cell.
    raise FileNotFoundError(
        "Credentials file '{0}' not found; it must provide the MongoDB host, "
        "user and password.".format(pw_file))

with open(pw_file, 'r') as f:
    email, indeed_pw = f.readline().strip().split(', ')
    username, pia_pw = f.readline().strip().split(', ')
    pub_ip, mongo_usr, mongo_usr_pw = f.readline().strip().split(', ')

# Connect to DB

In [None]:
# connect to ec2 mongo client (host taken from the credentials file, port 27017)
client = MongoClient('{0}:27017'.format(pub_ip))

In [None]:
# get reference to resume_db (attribute access selects the database by name)
db = client.resume_db

In [None]:
# authenticate user for database
# Database.authenticate() was removed in PyMongo 4.0; credentials are passed to
# MongoClient instead. authSource names the database the user is defined on,
# which is the database authenticate() was previously called on.
client = MongoClient('{0}:27017'.format(pub_ip),
                     username=mongo_usr,
                     password=mongo_usr_pw,
                     authSource='resume_db')
db = client.resume_db

# Pull MongoDB into Dataframe

In [None]:
def read_mongo(db, collection, query=None, no_id=True):
    '''
    Load the documents of one MongoDB collection into a pandas DataFrame.

    db: mongodb already connected and authenticated
    collection: desired collection in db
    query: query filter (None or {} applies no filter)
    no_id: include mongos _id (False) or not (True)
    return => pandas dataframe
    '''
    # None instead of a shared mutable {} default
    if query is None:
        query = {}

    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df = pd.DataFrame(list(cursor))

    # An empty result has no '_id' column, so only delete it when present
    if no_id and '_id' in df.columns:
        del df['_id']

    return df

In [None]:
# time the pull of the 'originals' collection into a dataframe
t_start = time()
df = read_mongo(db, 'originals')
load_secs = time() - t_start

print('Time to load data: {0}s'.format(load_secs))

In [None]:
# Keep the resume text together with 'search_term', which the following cell reads.
# .copy() detaches the subset so that later column assignments act on an
# independent frame rather than on a slice of the loaded data.
df = df[['resume_text', 'search_term']].copy()
df.head(3)

In [None]:
# list the distinct search terms; requires the 'search_term' column to be present in df
print(list(df['search_term'].unique()))

# Second Pass - Clean Text

In [None]:
# Strip every run of characters that is not an ASCII letter, a digit or a space.
# This single pattern covers the ':', ';', '.' and ',' cases that were once handled one by one.
# regex=True is explicit because pandas 2.x treats the pattern as a literal string by default.
df['resume_stopped'] = df['resume_text'].str.replace(r'[^0-9a-zA-Z ]+', '', regex=True)

In [None]:
# preview the dataframe after the cleaning step
df.head()

# Remove StopWords

In [None]:
# cache stopwords first to reduce compute time
cachedStopWords = stopwords.words("english")
cachedStopWords += ['tot']
# a set gives constant-time membership tests in the per-word filter below
stop_set = set(cachedStopWords)

# convert all text to lower case and separate into list
df['resume_stopped'] = df['resume_stopped'].str.lower().str.split()

# remove stopwords and join the remaining words back into one string per resume
df['resume_stopped'] = df['resume_stopped'].apply(
    lambda words: ' '.join([w for w in words if w not in stop_set]))

# Total Word Count

In [None]:
# every whitespace-separated token across all raw resume texts
text_ct = ' '.join(df['resume_text']).split()
len(text_ct)

In [None]:
# every whitespace-separated token left after cleaning and stop-word removal
stop_ct = ' '.join(df['resume_stopped']).split()
len(stop_ct)

# Stems (RESTART)

In [None]:
# Porter stems: build them from the stop-word-free text unless a pickled list already exists
if not os.path.isfile('pkl/port_stem.pkl'):
    text = ' '.join(df['resume_stopped'].tolist())
    stemmer = PorterStemmer()
    port_stem = [stemmer.stem(word) for word in TextBlob(text).words]
else:
    # NOTE: pickle.load can execute code stored in the file; only load caches written by this notebook
    with open(r'pkl/port_stem.pkl', 'rb') as infile:
        port_stem = pickle.load(infile)

In [None]:
# Lancaster stems: build them from the stop-word-free text unless a pickled list already exists
if not os.path.isfile('pkl/lanc_stem.pkl'):
    text = ' '.join(df['resume_stopped'].tolist())
    stemmer = LancasterStemmer()
    lanc_stem = [stemmer.stem(word) for word in TextBlob(text).words]
else:
    # NOTE: pickle.load can execute code stored in the file; only load caches written by this notebook
    with open(r'pkl/lanc_stem.pkl', 'rb') as infile:
        lanc_stem = pickle.load(infile)

In [None]:
# number of distinct stems produced by each stemmer (Porter first, then Lancaster)
for stem_list in (port_stem, lanc_stem):
    print(len(set(stem_list)))

### Pickle Stemmed Words

In [None]:
def save_pkl(data, filename):
    '''
    Pickle `data` to '<filename>.pkl', creating the parent directory if needed.

    data: any picklable object
    filename: output path without the '.pkl' extension
    return => None
    '''
    path = '{0}.pkl'.format(filename)
    parent = os.path.dirname(path)
    # a bare file name has no parent directory to create
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(data, f)

In [None]:
# persist both stem lists under pkl/ so the stemming cells can load them instead of re-stemming
save_pkl(port_stem, 'pkl/port_stem')
save_pkl(lanc_stem, 'pkl/lanc_stem')

# Word Counts

In [None]:
def get_wordcount(text_list, min_ct=3, most_common=30, get_all=False):
    '''
    Count the items of text_list and keep those seen at least min_ct times.

    text_list: iterable of hashable items (words/terms)
    min_ct: minimum number of occurrences for an item to be kept
    most_common: how many of the most frequent items to return (ignored when get_all)
    get_all: return every kept (item, count) pair instead of only the top ones
    returns => list of (item, count) from Counter.most_common, or the
               Counter's items() view when get_all is True
    '''
    tallies = Counter(text_list)

    # carry over only the entries that reach the threshold
    kept = Counter()
    for term, freq in tallies.items():
        if freq >= min_ct:
            kept[term] = freq

    if not get_all:
        # limit wordcounts for visualization
        return kept.most_common(most_common)
    return kept.items()

### Porter Stemmed Word Count

In [None]:
# 30 most frequent Porter stems among those seen at least 3 times
wordct_port_stem = get_wordcount(port_stem, min_ct=3, most_common=30)

### Lancaster Stemmed Word Count

In [None]:
# 30 most frequent Lancaster stems among those seen at least 3 times
wordct_lanc_stem = get_wordcount(lanc_stem, min_ct=3, most_common=30)

### Un-Stemmed Word Count

In [None]:
# split() without an argument drops the empty strings that split(' ') produces
# for empty rows or repeated spaces, so '' is never counted as a word
txt = ' '.join(df['resume_stopped']).split()
# 30 most frequent un-stemmed words among those seen at least 3 times
wordct = get_wordcount(txt, 3, 30)

### Get Labels, Counts of Word Counts

In [None]:
def label_count(word_count):
    '''
    Split an iterable of (label, count) pairs into two parallel lists.

    word_count: iterable of 2-item pairs, e.g. the output of get_wordcount
    return => (labels, counts) tuple of lists in input order
    '''
    # single pass, so one-shot iterables (generators, iterators) are handled too
    labels, counts = [], []
    for lbl, ct in word_count:
        labels.append(lbl)
        counts.append(ct)
    return (labels, counts)

# Word Count Plots

In [None]:
def plot_bar(data_tup, title, file_name, x_max=180000, n_ticks=13):
    '''
    Save a horizontal bar chart of word/term frequencies as a PNG.

    data_tup: iterable of (label, count) pairs, e.g. the output of get_wordcount
    title: prefix for the chart title
    file_name: image name without extension; written to data/pics/<file_name>.png
    x_max: upper end of the x-axis tick range (default keeps the previous fixed scale)
    n_ticks: number of evenly spaced x ticks between 0 and x_max
    return => None
    '''
    ########## DATA ##############
    lbl, ct = label_count(data_tup)
    ##############################

    # make figure
    fig, ax = plt.subplots(figsize=(20, 12))

    # one colour per bar; a local name so the notebook-level `colors` is not shadowed
    bar_colors = sns.color_palette("BrBG", len(lbl))

    # plots
    y_pos = np.arange(len(lbl))
    ax.barh(y_pos, ct, align='center', color=bar_colors, edgecolor=bar_colors)

    ax.set_ylim(-0.5, len(lbl))

    # labels/titles (no legend: the bars carry no labels, so there is nothing to list)
    ax.set_title('{0} Word/Term Frequency'.format(title))
    ax.set_xlabel('Word/Term Count')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(lbl)
    ax.set_ylabel('Word/Term')
    ax.set_xticks(np.linspace(0, x_max, n_ticks))

    # remove border
    ax.spines["top"].set_visible(False)
    ax.spines["bottom"].set_alpha(0.2)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_alpha(0.2)

    # make sure the output directory exists, then write the image and free the figure
    os.makedirs('data/pics', exist_ok=True)
    fig.savefig('data/pics/{0}.png'.format(file_name), bbox_inches='tight')
    plt.close(fig)

    return None

### Save as Images

In [None]:
# one chart per word-count variant: (counts, chart title prefix, output file name)
for counts, chart_title, out_name in (
        (wordct_port_stem, 'Porter Stem', 'porter_bar'),
        (wordct_lanc_stem, 'Lancaster Stem', 'lancaster_bar'),
        (wordct, 'Non-Stemmed', 'non-stem_bar')):
    plot_bar(counts, chart_title, out_name)

# Nouns

### Extract Noun Phrases

In [None]:
def nouns(x):
    '''Return the noun phrases TextBlob extracts from the string x.'''
    return TextBlob(x).noun_phrases

# noun phrases of every stop-word-free resume, stored in a new column
df['resume_nouns'] = df['resume_stopped'].apply(nouns)

### Merge Noun Phrases Back to Text String

In [None]:
def lst_to_str(x):
    '''Join the strings in x into one space-separated string.'''
    return ' '.join(x)

# collapse each resume's noun phrases back into a single string
df['resume_nouns'] = df['resume_nouns'].apply(lst_to_str)

In [None]:
# preview the dataframe with the noun-phrase column
df.head()

# Save Dataframe to Pickle

In [None]:
# write the dataframe to disk, then drop the in-memory reference
df.to_pickle('pkl/df_stop_noun.pkl')
df = None

# Read Dataframe from Pickle (RESTART)

In [None]:
# reload the pickled dataframe so the following cells can run without the earlier ones
df = pd.read_pickle('pkl/df_stop_noun.pkl')
df.head()

### Noun Phrased Word Count

In [None]:
# split() without an argument drops the empty strings that split(' ') produces
# for rows with no noun phrases, so '' is never counted as a word
noun_txt = ' '.join(df['resume_nouns']).split()
# 30 most frequent noun-phrase words among those seen at least 3 times
wordct_noun = get_wordcount(noun_txt, 3, 30)

In [None]:
# every noun-phrase word seen at least twice, with its count; then just the words
wordct_noun = get_wordcount(noun_txt, min_ct=2, get_all=True)
wordct_noun_lst = [word for word, _ in wordct_noun]

def getKey(item):
    '''Return the second element of item (the count in a (word, count) pair).'''
    return item[1]

#list(sorted(wordct_noun, key=getKey, reverse=True))

# NOT WORKING 

### Output Frequency Chart of Noun Phrases

In [None]:
#plot_bar(list(wordct_noun), 'Noun Phrases', 'noun_bar')

# Counter (Tuple) to Label List

In [None]:
def count_lbl_lst(cntr):
    '''Return the first element (the label) of every 2-item pair in cntr, in order.'''
    labels = []
    for label, _count in cntr:
        labels.append(label)
    return labels

# Strip (Select) Words from Text

In [None]:
# re-grab references to the stemmer objects (used by the Porter and Lancaster filter cells)
port_stemmer = PorterStemmer()
lanc_stemmer = LancasterStemmer()

### Nouns Only Series

In [None]:
def noun_filter(cell, vocab=None):
    '''
    noun filter function to apply to dataframe

    cell: text of one resume
    vocab: container of allowed words; defaults to wordct_noun_lst
    return => the words of cell that are in vocab, joined by single spaces
    '''
    if vocab is None:
        vocab = wordct_noun_lst
    return ' '.join([x for x in cell.split() if x in vocab])

# file name to save/load 
fname = 'df_noun_only'

# if filtered dataframe already exists, load it
if os.path.isfile('pkl/{0}.pkl'.format(fname)):
    # read_pickle is a pandas module-level function, and its result has to be bound to df
    df = pd.read_pickle('pkl/{0}.pkl'.format(fname))
else:
    # build the lookup set once; testing against the list rescans it for every word
    noun_vocab = set(wordct_noun_lst)

    # otherwise strip all words not in noun words list
    df['resume_nouns'] = df['resume_nouns'].apply(lambda cell: noun_filter(cell, noun_vocab))
    
    # save dataframe to pickle
    df.to_pickle('pkl/{0}.pkl'.format(fname))

### Porter Stemmed Only Series

In [None]:
def porter_filter(cell, vocab=None):
    '''
    porter stemmer filter function to apply to dataframe

    cell: text of one resume
    vocab: container of allowed stems; defaults to the labels of wordct_port_stem
    return => the Porter stems of cell's words that are in vocab, joined by single spaces
    '''
    if vocab is None:
        vocab = set(count_lbl_lst(wordct_port_stem))
    # stem each word once, then filter on the stem
    stems = [port_stemmer.stem(x) for x in cell.split()]
    return ' '.join([s for s in stems if s in vocab])

# file name to save/load 
fname = 'df_noun_port'

# if filtered dataframe already exists, load it
if os.path.isfile('pkl/{0}.pkl'.format(fname)):
    # read_pickle is a pandas module-level function, and its result has to be bound to df
    df = pd.read_pickle('pkl/{0}.pkl'.format(fname))
else:
    # build the stem vocabulary once instead of once per word
    port_vocab = set(count_lbl_lst(wordct_port_stem))

    # otherwise strip all words not in porter stemmed words list
    df['resume_porter'] = df['resume_stopped'].apply(lambda cell: porter_filter(cell, port_vocab))
    
    # save dataframe to pickle
    df.to_pickle('pkl/{0}.pkl'.format(fname))

### Lancaster Stemmed Only Series

In [None]:
def lancaster_filter(cell, vocab=None):
    '''
    lancaster stemmer filter function to apply to dataframe

    cell: text of one resume
    vocab: container of allowed stems; defaults to the labels of wordct_lanc_stem
    return => the Lancaster stems of cell's words that are in vocab, joined by single spaces
    '''
    if vocab is None:
        vocab = set(count_lbl_lst(wordct_lanc_stem))
    # stem each word once, then filter on the stem
    stems = [lanc_stemmer.stem(x) for x in cell.split()]
    return ' '.join([s for s in stems if s in vocab])

# file name to save/load 
fname = 'df_noun_port_lanc'

# if filtered dataframe already exists, load it
if os.path.isfile('pkl/{0}.pkl'.format(fname)):
    # read_pickle is a pandas module-level function, and its result has to be bound to df
    df = pd.read_pickle('pkl/{0}.pkl'.format(fname))
else:
    # build the stem vocabulary once instead of once per word
    lanc_vocab = set(count_lbl_lst(wordct_lanc_stem))

    # otherwise strip all words not in lancaster stemmed words list
    df['resume_lancaster'] = df['resume_stopped'].apply(lambda cell: lancaster_filter(cell, lanc_vocab))
    
    # save dataframe to pickle
    df.to_pickle('pkl/{0}.pkl'.format(fname))

# N-Grams Count Vectorizer

In [None]:
# modelling parameters
# n_samples and n_top_words are not referenced by the cells visible here
n_samples = 2000
# passed as max_features to the count vectorizer
n_features = 1000
# number of topics requested from the LDA model
n_topics = 10
n_top_words = 20

In [None]:
# convert resume texts to a sparse matrix of token counts (1- to 3-grams), timing the step
t_start = time()

ct_vect = CountVectorizer(
    ngram_range=(1, 3),
    max_df=0.90,
    min_df=2,
    max_features=n_features,
    stop_words='english',
)
ct_vect_prep = ct_vect.fit_transform(df['resume_text'])

vectorize_secs = time() - t_start
print('Time to count vectorize data: {0:.4}s'.format(vectorize_secs))

# Latent Dirichlet Allocation

In [None]:
# `n_components` is the current name of the topic-count argument; the old
# `n_topics` keyword is no longer accepted by scikit-learn
lda_mdl = LatentDirichletAllocation(n_components=n_topics, max_iter=5, learning_method='online',
                                    learning_offset=50., random_state=0)

t_start = time()

lda_mdl.fit(ct_vect_prep)

# this cell times the LDA fit, not the count vectorizer
print('Time to fit LDA model: {0:.4}s'.format(time() - t_start))

In [None]:
print("Topics in LDA model:")

# vocabulary terms (n-grams) learned by the count vectorizer -- these are the
# model's input features, not the topics themselves.
# get_feature_names() was removed in scikit-learn 1.2; use it only where the
# replacement get_feature_names_out() is not available.
if hasattr(ct_vect, 'get_feature_names_out'):
    feat_names = list(ct_vect.get_feature_names_out())
else:
    feat_names = ct_vect.get_feature_names()

print('Start of list: ' + ', '.join(feat_names[:20]))
print('End of list: ' + ', '.join(feat_names[-10:]))

# Get Top Words in Topics

In [None]:
def print_top_words(model, feature_names, top_words):
    '''
    Print, for every row of model.components_, the top_words feature names
    with the largest weights (largest first), followed by one blank line
    after the last topic.

    model: object exposing components_ (one row of weights per topic)
    feature_names: sequence mapping a column index to its term
    top_words: number of terms to print per topic
    '''
    for topic_no, weights in enumerate(model.components_):
        # indices sorted by descending weight, truncated to the requested count
        ranked = weights.argsort()[::-1][:top_words]
        terms = [feature_names[idx] for idx in ranked]
        print("Topic {0}:".format(topic_no))
        print(", ".join(terms))
    print()

In [None]:
# show the 12 highest-weighted vocabulary terms of every LDA topic
print_top_words(lda_mdl, feat_names, 12)

# TF-IDF

In [None]:
# TF-IDF vectorizer written out with the argument values from the pasted library signature.
# token_pattern is a raw string so that \b stays a regex word boundary rather than a
# backspace character; dtype is left at the library default because the pasted
# "<class 'numpy.int64'>" is a printed repr, not valid Python.
tfidf_vect = TfidfVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None,
                             lowercase=True, preprocessor=None, tokenizer=None, analyzer='word',
                             stop_words=None, token_pattern=r'(?u)\b\w\w+\b', ngram_range=(1, 1),
                             max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False,
                             norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
tfidf_vect