In [149]:

"""
Import Statements
"""

# Classics
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Base
import re
import os
import gensim
from collections import Counter

# CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# NLP Libraries
import spacy
from nltk.stem import PorterStemmer
from spacy.tokenizer import Tokenizer

# Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline


# Topic Modeling
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.ldamulticore import LdaMulticore

# Topic Distance Visualiztion
import pyLDAvis.gensim

nlp = spacy.load("en_core_web_lg")

def tokenize(text):
    """Parses a string into a list of semantic units (words)

    Args:
        text (str): The string that the function will tokenize.

    Returns:
        list: tokens parsed out by the mechanics of your choice
    """
    
    tokens = re.sub(r'[^a-zA-Z ^0-9]', '', text)
    tokens = tokens.lower().split()

In [107]:
%pwd

'/Users/jorge/Job-Funnel-ds-Data-dev/notebooks'

In [108]:
ls

Basilica_quickhire_test.ipynb
EDA-1.2.ipynb
EDA.ipynb
Indeed_Job_Scraper_Analysis_and_Salary_Predictions.txt
Quickhire_LDA_1.1.ipynb
Spacy_quickhire_test.ipynb
delete_nurse_rows-Copy1.ipynb
delete_nurse_rows.ipynb
fastapi_hello_world.py
job_listings.7z
job_listings_1.2.csv
job_listings_1.3.csv
job_listings_10000.csv
quickhire-1.2.csv
quickhire-TPlus-1.1.csv
quickhire_10000_jobs-1.1.ipynb
quickhire_dev_1.1.ipynb
selenium_example.py
tech_job_listings.csv
training_data_profile_report.html
training_data_profile_report_1.2.html


In [109]:
df = pd.read_csv ('/Users/jorge/Job-Funnel-ds-Data-dev/notebooks/job_listings_10000.csv')
df = df.fillna('none')
print(df.shape)
df.head(3)

(9999, 20)


Unnamed: 0.1,Unnamed: 0,post_date_utc,pay_min,pay_max,pay_exact,title,seniority,job_id,job_description,company_id,keyphrase,location_id,name,company_description,size,revenue,logo_url,city,state_province,country
0,0,2020-03-13 08:16:37+00:00,none,none,none,Site Reliability Software Engineer,none,124991.0,Cloud Data center performance and growth deman...,20931.0,none,705.0,ITRenew,none,none,none,none,none,none,none
1,1,2020-03-13 09:10:06+00:00,none,none,none,Sr. Cloud Solutions Architect,none,125082.0,"**About Sesame, ITRenew** Sesame is the new ...",20931.0,none,705.0,ITRenew,none,none,none,none,none,none,none
2,2,2020-03-13 08:57:40+00:00,none,none,none,Senior Software Engineer (PHP),none,125054.0,"SchoolMint is looking for a self-driven, eager...",20932.0,none,880.0,SchoolMint,none,none,none,none,none,none,none


In [110]:
df.describe(exclude="number")

Unnamed: 0,post_date_utc,pay_min,pay_max,pay_exact,title,seniority,job_description,keyphrase,name,company_description,size,revenue,logo_url,city,state_province,country
count,9999,9999,9999,9999,9999,9999,9999,9999,9999,9999,9999,9999,9999,9999,9999,9999
unique,9998,1,1,1,8265,1,9304,1,4557,257,1,1,661,25,13,2
top,none,none,none,none,Software Engineer,none,## DescriptionPerforms general store operation...,none,Jpm Chase,none,none,none,none,none,none,none
freq,2,9999,9999,9999,100,9999,506,9999,520,8961,9999,9999,8379,9955,9967,9955


In [111]:
df['job_description'] = df['job_description'].str.replace("*"," ")

In [112]:
df['job_description'][:10]

0    Cloud Data center performance and growth deman...
1      About Sesame, ITRenew     Sesame is the new ...
2    SchoolMint is looking for a self-driven, eager...
3    At Built Robotics, we develop AI guidance syst...
4    At Built Robotics, we develop AI guidance syst...
5      Intelligent Retail Lab   is part of Walmart'...
6      ABOUT THE JOB:  As an Android Developer, you...
7    As an iOS Developer, you will join a team of s...
8    The primary duties and job focus of this posit...
9    _   Job Title  _NetSuite Administrator   Devel...
Name: job_description, dtype: object

## Part 1: Tokenize Function

In [113]:
#Tokenizer Pipe

tokens = []

# Make the tokens
for doc in nlp.pipe(df['job_description'], disable=['tagger', 'parser', 'ner']):
    
    doc_tokens = []
    
    for token in doc:
        if (token.is_stop==False) & (token.is_punct==False):
           doc_tokens.append(token.text)
   
    tokens.append(doc_tokens)
    
df['tokens'] = tokens

## Text Preprocessing

In [114]:
# 1) Plain Python - ''.split command
# 2) Spacy - just the lemmas from the document
# 3) Gensim - simple_preprocess

# def tokenize(text):
#    "Complete this function"
    
#    return [token for token in simple_preprocess(text) if token in STOPWORDS]

In [115]:
#def gather_data(path_to_data): 
#    data = []
#    for f in os.listdir(path):
#        if os.path.isdir(f) == False:
#            if f[-3:] == 'txt':
#                with open(os.path.join(path,f)) as t:
#                    text = t.read().strip('\n')
#                    data.append(tokenize(str(text)))       
#    return data

In [116]:
 def tokenize(doc):
    nlp_doc = nlp(doc.lower())
    doc_tokens = [token.text for token in nlp_doc
    if (token.is_stop==False) and (token.is_punct==False)
    and token.pos_!="-PRON-" and token.text != " "]
    
    return doc_tokens

In [117]:
tokenize(df['job_description'][0])

['cloud',
 'data',
 'center',
 'performance',
 'growth',
 'demands',
 'higher',
 'constraints',
 'tighter',
 'sesame',
 'meet',
 'range',
 'performance',
 'demands',
 'built',
 'powerful',
 'hyperscale',
 'hardware',
 'world',
 'fully',
 'certified',
 'high',
 'performance',
 'line',
 'compute',
 'storage',
 'solutions',
 'sustain',
 'engineering',
 'work',
 'hiring',
 'site',
 'reliability',
 'engineer',
 'multidisciplinary',
 'engineering',
 'team',
 'work',
 'infrastructure',
 'management',
 'development',
 'platform',
 'including',
 'data',
 'center',
 'hardware',
 'testing',
 'platform',
 'power',
 'open',
 'compute',
 'storage',
 'hardware',
 'certification',
 'program',
 'internal',
 'platforms',
 'testing',
 'benchmarking',
 'production',
 'platforms',
 'host',
 'customer',
 'benchmark',
 'environments',
 'support',
 'rack',
 'level',
 'hardware',
 'solutions',
 'services',
 'created',
 'sesame',
 'division',
 'site',
 'reliability',
 'engineer',
 'spend',
 '50',
 'time',
 'wor

In [118]:
"this is a sample string with a \n newline character".replace('\n', '')

'this is a sample string with a  newline character'

In [119]:
# Lets apply the above and creat our series of tokenized texts:
df['tokens'] = df['job_description'].apply(lambda x: tokenize(x))
df['tokens'].head()

0    [cloud, data, center, performance, growth, dem...
1    [  , sesame, itrenew,     , sesame, new, hyper...
2    [schoolmint, looking, self, driven, eager, tal...
3    [built, robotics, develop, ai, guidance, syste...
4    [built, robotics, develop, ai, guidance, syste...
Name: tokens, dtype: object

In [144]:
%%time

STOPWORDS = set(STOPWORDS).union(set(['from', 'subject','\n\n','\xa0','\n\n\n','this…your','-', '|', '\n','&','ul','li','position','jobs','job','etc','work','business','employment',
                   'employer','employee','applicant','good','use','must','work']))

def tokenize(text):
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]

CPU times: user 34 µs, sys: 0 ns, total: 34 µs
Wall time: 39.1 µs


In [145]:
tokenize("Hello World! This a test of the tokenization method")

['hello', 'world', 'test', 'tokenization', 'method']

In [146]:
len(tokens)

9999

In [147]:
# Object from Base Python
from collections import Counter

# The object `Counter` takes an iterable, but you can instaniate an empty one and update it. 
word_counts = Counter()

# Update it based on a split of each of our documents
df['tokens'].apply(lambda x: word_counts.update(x))

# Print out the 10 most common words
word_counts.most_common(10)

[('   ', 110824),
 ('  ', 43888),
 ('experience', 41713),
 ('work', 27026),
 ('    ', 22627),
 ('data', 20729),
 ('team', 19612),
 ('business', 17417),
 ('development', 17383),
 ('skills', 17360)]

## Part 2: Vector Representation

In [151]:
# instantiate vectorizer object:
tfidf = TfidfVectorizer(max_df=.97,
                        min_df=3,
                        stop_words='english')

# create a vocabulary and get word counts per document:
dtm_tfidf = tfidf.fit_transform(df['job_description'])

# print word counts

# get feature names to use as dataframe column headers:
tf_jobs = pd.DataFrame(dtm_tfidf.todense(), columns=tfidf.get_feature_names())

# view Feature Matrix as DataFrame:
tf_jobs.head()

Unnamed: 0,00,000,0000,000000,0001pt,000technologists,001,002,003,0030,...,zweig,для,ищем,команду,на,не,но,новые,сервисы,создавать
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.055303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.055629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [152]:
# fit on dtm_tfidf:
nn = NearestNeighbors(n_neighbors=5, algorithm='kd_tree')
nn.fit(tf_jobs)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [153]:
nn.kneighbors([tf_jobs.iloc[0]])

(array([[0.        , 1.        , 1.        , 1.21301601, 1.2336151 ]]),
 array([[   0, 4224, 6335, 9140, 8009]]))

In [157]:
df['job_description'][0]

"Cloud Data center performance and growth demands have never been higher, and constraints have never been tighter. With Sesame, meet the full range of performance demands. Built using the most powerful hyperscale hardware, this is the world's first fully re-certified , high-performance line of compute and storage solutions. To sustain our engineering work, we are hiring a Site Reliability Engineer. Within a multidisciplinary engineering team, you'll work on infrastructure management and development for various platform, including:  - Data center hardware testing platform to power our open compute and storage hardware re-certification program  - Internal platforms for testing and benchmarking  - Production platforms to host customer benchmark environments in support of the rack-level hardware solutions and services being created by the Sesame division.  As a Site Reliability Engineer, you'll spend 50% of your time working on platform administration and maintenance and the other 50% deve

In [158]:
job_description = ['Cloud Data center performance and growth demands have never been higher, and constraints have never been tighter.']

In [159]:
jobs_text = tfidf.transform(job_description)

In [160]:
nn.kneighbors(jobs_text.todense())

(array([[1.        , 1.        , 1.16721402, 1.20839893, 1.24623425]]),
 array([[6335, 4224,    0, 5936, 4492]]))

In [161]:
for num in [6335, 4224,    0, 5936, 4492]:
    print(df['job_description'][num])
    print('\n')

none


none


Cloud Data center performance and growth demands have never been higher, and constraints have never been tighter. With Sesame, meet the full range of performance demands. Built using the most powerful hyperscale hardware, this is the world's first fully re-certified , high-performance line of compute and storage solutions. To sustain our engineering work, we are hiring a Site Reliability Engineer. Within a multidisciplinary engineering team, you'll work on infrastructure management and development for various platform, including:  - Data center hardware testing platform to power our open compute and storage hardware re-certification program  - Internal platforms for testing and benchmarking  - Production platforms to host customer benchmark environments in support of the rack-level hardware solutions and services being created by the Sesame division.  As a Site Reliability Engineer, you'll spend 50% of your time working on platform administration and maintenance and the o

## Part 3: Topic Modeling


In [175]:
from gensim.models import LdaMulticore
from gensim.corpora import Dictionary

In [176]:
df['tokens'].head()

0    [cloud, data, center, performance, growth, dem...
1    [  , sesame, itrenew,     , sesame, new, hyper...
2    [schoolmint, looking, self, driven, eager, tal...
3    [built, robotics, develop, ai, guidance, syste...
4    [built, robotics, develop, ai, guidance, syste...
Name: tokens, dtype: object

In [178]:
id2word = corpora.Dictionary(df['tokens'])

In [179]:
corpus = [id2word.doc2bow(text)for text in df['tokens']]

In [180]:
lda = LdaMulticore(corpus=corpus,
                   id2word=id2word,
                   iterations=5,
                   workers=4,
                   num_topics = 10 # You can change this parameter
                  )

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [181]:
lda.print_topics()

[(0,
  '0.033*"   " + 0.016*"experience" + 0.013*"  " + 0.007*"work" + 0.006*"    " + 0.005*"data" + 0.005*"team" + 0.005*"including" + 0.005*"skills" + 0.005*"ability"'),
 (1,
  '0.024*"   " + 0.013*"  " + 0.010*"experience" + 0.008*"work" + 0.005*"    " + 0.005*"development" + 0.005*"business" + 0.005*"ability" + 0.004*"skills" + 0.004*"team"'),
 (2,
  '0.027*"   " + 0.015*"experience" + 0.012*"  " + 0.009*"work" + 0.007*"    " + 0.006*"data" + 0.006*"skills" + 0.005*"ability" + 0.005*"team" + 0.005*"     "'),
 (3,
  '0.040*"   " + 0.013*"experience" + 0.012*"  " + 0.007*"including" + 0.006*"team" + 0.005*"work" + 0.005*"    " + 0.004*"years" + 0.004*"development" + 0.004*"skills"'),
 (4,
  '0.025*"   " + 0.012*"experience" + 0.010*"  " + 0.008*"work" + 0.007*"data" + 0.006*"business" + 0.006*"    " + 0.006*"team" + 0.006*"     " + 0.006*"skills"'),
 (5,
  '0.028*"   " + 0.010*"  " + 0.009*"experience" + 0.007*"work" + 0.007*"including" + 0.006*"     " + 0.006*"data" + 0.006*"    " +

In [182]:
words = [re.findall(r'"([^"]*)"',t[1]) for t in lda.print_topics()]

In [183]:
topics = [' '.join(t[0:5]) for t in words]

In [184]:
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
    experience    work     

------ Topic 1 ------
       experience work     

------ Topic 2 ------
    experience    work     

------ Topic 3 ------
    experience    including team

------ Topic 4 ------
    experience    work data

------ Topic 5 ------
       experience work including

------ Topic 6 ------
       experience work including

------ Topic 7 ------
       experience work including

------ Topic 8 ------
       work experience including

------ Topic 9 ------
       experience work     



## Topic Distance Visualization

In [185]:
pyLDAvis.enable_notebook()

In [186]:
pyLDAvis.gensim.prepare(lda, corpus, id2word)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
