In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import nltk
from nltk.corpus import stopwords
from pymongo import MongoClient
from time import time
from collections import Counter
import pandas as pd
import numpy as np
import re
import os

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# default plot stying changes
import seaborn as sns
sns.set_style("white")
sns.set_context("poster", font_scale=1.25, rc={"lines.linewidth": 2.5})
sns.set_palette("Set2")
colors = sns.color_palette('Set2',12)

# Credentials

In [2]:
pw_file = 'pw.txt'
if os.path.exists(pw_file): 
    with open(pw_file, 'r') as f:
        email, indeed_pw = f.readline().strip().split(', ')
        username, pia_pw = f.readline().strip().split(', ')
        pub_ip, mongo_usr, mongo_usr_pw = f.readline().strip().split(', ')

# Connect to DB

In [3]:
# connect to ec2 mongo client
client = MongoClient('{0}:27017'.format(pub_ip))

In [4]:
# get reference to  resume_db
db = client.resume_db

In [5]:
# authenticate user for database
db.authenticate(mongo_usr, mongo_usr_pw)

True

# Pull MongoDB into Dataframe

In [6]:
def read_mongo(db, collection, query={}, no_id=True):
    '''
    db: mongodb already connected and authenticated
    collection: desired collection in db
    query: query filter
    no_id: include mongos _id (False) or not (True)
    return => pandas dataframe
    '''
    # Make a query to the specific DB and Collection
    cursor = db[collection].find(query)

    # Expand the cursor and construct the DataFrame
    df =  pd.DataFrame(list(cursor))

    # Delete the _id
    if no_id:
        del df['_id']

    return df

In [7]:
t_start = time()

# load database data into dataframe
df = read_mongo(db, 'originals')

print('Time to load data: {0}s'.format(time() - t_start))

Time to load data: 16.47765612602234s


In [8]:
df.head(3)

Unnamed: 0,link_id,resume_text,search_term
0,/r/2b5b06cff39ce808?,"Petros Gazazyan North Hollywood, CA Werkervari...",engineer
1,/r/8fe4de80947b60f2?,"Travis London Java Software Engineer Tucson, A...",engineer
2,/r/1c009e8f7e2f5309?,"Stephen A. Kraft Mechanical Engineer Seattle, ...",engineer


In [9]:
print(list(df['search_term'].unique()))

['engineer', 'analytics', 'data_analysis', 'data', 'big_data', 'scientist']


# Get Cities, States in resumes

In [11]:
# remove output file if it exists (due to append)
if os.path.isfile('city/city_state.csv'):
    os.remove('city/city_state.csv')
    
# city_state.txt generated from google geographical encoding
with open('city/city_state.txt', 'r', errors='ignore') as infile:
    for line in infile:
        line = line.replace('"', '')
        line = line.split(',')
        
        # remove the duplicated city name
        if line[2] == line[3]:
            line.pop(2)
        
        with open('city/city_state.csv', 'a') as outfile:
            outfile.write(', '.join(line[2:5]) + '\n')

In [12]:
# load city, state, country from output file (above)
df_cityState = pd.read_csv('city/city_state.csv')
df_cityState.columns = ['City', 'State', 'Country'] 

In [13]:
# load state abbreviations list
df_stateAbbr = pd.read_csv('city/state_abbr.csv')
df_stateAbbr.columns = ['State', 'Abbr']

# state, abbreviation zipped list
state = df_stateAbbr['State'].tolist()
abbr = df_stateAbbr['Abbr'].tolist()
st_ab = list(zip(state, abbr))

# state: abbreviation dictionary
abbr_dict = {s.strip():a for s,a in st_ab}

In [14]:
# match abbreviation to state
df_cityState['State'] = df_cityState['State'].str.strip()
df_cityState['Abbr'] = df_cityState['State'].map(abbr_dict)

In [15]:
bf = len(df_cityState)
print('Length before drops: {0}'.format(bf))
df_cityState = df_cityState.dropna()
af = len(df_cityState)
print('Length after drops: {0}'.format(af))

Length before drops: 55131
Length after drops: 54629


# Extract Locations

In [70]:
# convert dataframe to city, abbr list
city_states = df_cityState[['City', 'Abbr']].values.tolist()
city_states = [', '.join(x) for x in city_states]

In [71]:
def current_location(res_text, front_len):
    loc_match = []
    # look through city, abbr to find match in first (front_len) characters of resume text
    for cs in city_states:
        if re.search(cs, text[:front_len]) != None:
            loc_match.append(re.search(cs, text).group())
    
    # if no match, return something
    if loc_match != []:
        return(loc_match[0])
    else:
        return '?'

In [74]:
text = df.iloc[0]['resume_text']

In [75]:
current_location(text, 100)

'North Hollywood, CA'

# Find Current Location

In [42]:
df.columns

Index(['link_id', 'resume_text', 'search_term', 'location'], dtype='object')

In [76]:
df['location'] = df['resume_text']

res = lambda x: current_location(x, 100)

df['location'].apply(res)

KeyboardInterrupt: 

In [77]:
#df['location'] = df['resume_text']

#for i in range(len(df)):
#    df.iloc[i]['location'] = current_location(df.iloc[i]['resume_text'], 100)

In [55]:
df.head()

Unnamed: 0,link_id,resume_text,search_term,location
0,/r/2b5b06cff39ce808?,"Petros Gazazyan North Hollywood, CA Werkervari...",engineer,"Petros Gazazyan North Hollywood, CA Werkervari..."
1,/r/8fe4de80947b60f2?,"Travis London Java Software Engineer Tucson, A...",engineer,1
2,/r/1c009e8f7e2f5309?,"Stephen A. Kraft Mechanical Engineer Seattle, ...",engineer,"Stephen A. Kraft Mechanical Engineer Seattle, ..."
3,/r/f18875e484d5b766?,"Abdy Galeano Duarte, CA Utilize my skills and ...",engineer,"Abdy Galeano Duarte, CA Utilize my skills and ..."
4,/r/818d4e5de7455e18?,Thithi McWilliams New Product Development Engi...,engineer,Thithi McWilliams New Product Development Engi...


In [37]:
df.head()

Unnamed: 0,link_id,resume_text,search_term,location
0,/r/2b5b06cff39ce808?,"Petros Gazazyan North Hollywood, CA Werkervari...",engineer,<generator object current_location at 0x119d63...
1,/r/8fe4de80947b60f2?,"Travis London Java Software Engineer Tucson, A...",engineer,<generator object current_location at 0x119d63...
2,/r/1c009e8f7e2f5309?,"Stephen A. Kraft Mechanical Engineer Seattle, ...",engineer,<generator object current_location at 0x119d63...
3,/r/f18875e484d5b766?,"Abdy Galeano Duarte, CA Utilize my skills and ...",engineer,<generator object current_location at 0x119d63...
4,/r/818d4e5de7455e18?,Thithi McWilliams New Product Development Engi...,engineer,<generator object current_location at 0x119d63...


In [78]:
df.iloc[0]['resume_text']

"Petros Gazazyan North Hollywood, CA Werkervaring DESIGN ENGINEER, STRUCTURAL TTG Engineer Pasadena, CA december 2015 tot heden Designed nonstructural equipment anchorage for major southern California hospitals in accordance with ASCE, CBC and other local codes set forth by the Office of Statewide Planning and Development Gained extensive knowledge and experience in engineering programs for design including Enercalc, ETABS, and Hilti Profis for the design of remodel of buildings beams, columns, and foundations Surveyed area of work to be remodeled and inspected physical work after remodel to ensure work is done according to design CIVIL ENGINEERING STUDENT WORKER Los Angeles County Department of Public Works Alhambra, CA september 2014 tot december 2015 Worked alongside engineers to meet the publics needs in the transportation infrastructure project development division with the highest standards throughout all of Los Angeles county Assisted engineers and project managers with geograph

# Second Pass - Clean Text

In [None]:
df['resume_clean'] = df['resume_text'].str.replace(':|;', '')
df['resume_clean'] = df['resume_clean'].str.replace('.', '')
df['resume_clean'] = df['resume_clean'].str.replace(',', '')

In [None]:
df.head()

# Remove StopWords

In [None]:
# cache stopwords first to reduce compute time
cachedStopWords = stopwords.words("english")

# convert all text to lower case and separate into list
df['resume_stopped'] = df['resume_text'].str.lower().str.split()

# remove stopwords
df['resume_stopped'] = df['resume_stopped'].apply(lambda x: ' '.join([item for item in x if item not in cachedStopWords]))

In [None]:
df.head()

# Word Frequency

In [None]:
# get wordcounts
wordcount = Counter(' '.join(df['resume_stopped']).split(' '))

# limit wordcounts for visualization
wordcount = wordcount.most_common(30)

In [None]:
labels = [lbl for lbl, ct in wordcount]
count = [ct for lbl, ct in wordcount]

In [None]:
# make figure
fig = plt.figure(figsize=(20,12))
ax = fig.add_subplot(111)

# color
colors = sns.color_palette("BrBG", len(labels))

# plots
y_pos = np.arange(len(labels))
ax.barh(y_pos, count, align='center', color=colors, edgecolor=colors)

#plt.xlim(0,170000)
plt.ylim(-1,30)

# labels/titles
plt.legend(loc="best")
plt.title('Word/Term Frequency')
plt.xlabel('Word/Term Count')
plt.yticks(y_pos, labels)
plt.ylabel('Word/Term')
plt.xticks(np.linspace(0,180000, 13))

# remove border
ax.spines["top"].set_visible(False)  
ax.spines["right"].set_visible(False) 
ax.spines["bottom"].set_visible(False) 
ax.spines["left"].set_visible(False)

# show grid
#ax.xaxis.grid(True, alpha=0.2, linestyle='--') 
#ax.yaxis.grid(True, alpha=0.2, linestyle='--') 

# plot that biddy
plt.tight_layout()
_ = plt.show()

# Stemmed

In [None]:
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

In [None]:
df['stem'] = df['resume_stopped'].apply(lambda x: stemmer.stem(x))
#df['stem'] = df['resume_stopped'].str.split(' ')

In [None]:
df.head()

In [None]:
# get wordcounts
wordcount = Counter(' '.join(df['resume_stopped']).split(' '))

# limit wordcounts for visualization
wordcount = wordcount.most_common(30)

In [None]:
labels = [lbl for lbl, ct in wordcount]
count = [ct for lbl, ct in wordcount]

In [None]:
# make figure
fig = plt.figure(figsize=(20,20))
ax = fig.add_subplot(111)

# color
colors = sns.color_palette("BrBG", len(labels))

# plots
y_pos = np.arange(len(labels))
ax.barh(y_pos, count, align='center', color=colors, edgecolor=colors)

#plt.xlim(0,170000)
plt.ylim(-1,30)

# labels/titles
plt.legend(loc="best")
plt.title('Word/Term Frequency')
plt.xlabel('Word/Term Count')
plt.yticks(y_pos, labels)
plt.ylabel('Word/Term')
plt.xticks(np.linspace(0,180000, 13))

# remove border
ax.spines["top"].set_visible(False)  
ax.spines["right"].set_visible(False) 
ax.spines["bottom"].set_visible(False) 
ax.spines["left"].set_visible(False)

# show grid
#ax.xaxis.grid(True, alpha=0.2, linestyle='--') 
#ax.yaxis.grid(True, alpha=0.2, linestyle='--') 

# plot that biddy
plt.tight_layout()
_ = plt.show()

# Stemming

In [None]:
fd = nltk.FreqDist(w.lower() for w in df['resume_stopped'])
#fd.plot(10)
fd

# N-Grams Count Vectorizer

In [None]:
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

In [None]:
t_start = time()

# convert resume texts to a sparse matrix of token counts
ct_vect = CountVectorizer(ngram_range=(1, 3), max_df=0.90, min_df=2, max_features=n_features, stop_words='english')
ct_vect_prep = ct_vect.fit_transform(df['resume_text'])

print('Time to count vectorize data: {0:.4}s'.format(time() - t_start))

# Latent Dirichlet Allocation

In [None]:
lda_mdl = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', 
                                learning_offset=50., random_state=0)

t_start = time()

lda_mdl.fit(ct_vect_prep)

print('Time to count vectorize data: {0:.4}s'.format(time() - t_start))

In [None]:
print("Topics in LDA model:")

# get feature names (topics) from model
feat_names = ct_vect.get_feature_names()

print('Start of list: ' + ', '.join(feat_names[:20]))
print('End of list: ' + ', '.join(feat_names[-10:]))

# Get Top Words in Topics

In [None]:
def print_top_words(model, feature_names, top_words):
    for i, topic in enumerate(model.components_):
        print("Topic {0}:".format(i))
        print(", ".join([feature_names[i] for i in topic.argsort()[:-top_words - 1:-1]]))
    print()

In [None]:
print_top_words(lda_mdl, feat_names, 12)

# TF-IDF

In [None]:
TfidfVectorizer(input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.int64'>, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)[source]¶