In [1]:
import csv
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import re

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

from IPython.display import Image
import seaborn as sns

import time

  from pandas.core import datetools


In [2]:
# pandas display settings

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)
pd.set_option('display.precision', 3)

In [14]:
# Load the pickled parsed job posts data
df1 = pd.read_pickle('../data_local/job_posts_clean_022618.pkl')
df2 = pd.read_pickle('../data_local/job_posts_clean_022818.pkl')

In [15]:
print(len(df1))
print(len(df2))

496
128


In [187]:
# df_all = pd.concat([df1, df2])
df_all = df1.append(df2, ignore_index=True)
print(len(df_all))

624


In [188]:
df_all = df_all.drop_duplicates(subset=['company_name', 'job_title'])
df_all = df_all.reset_index(drop=True)
len(df_all)

428

In [189]:
df_all.loc[33]

company_descr    Trust is rare in today’s digital advertising l...
company_name                                         Goodway Group
date_posted                                             9 days ago
email_content    Indeed Job Alert data science jobs  Jobs 1-30 ...
email_date                   Mon, 19 Feb 2018 17:27:47 -0600 (CST)
email_from                                                  Indeed
email_subject                            30+ new data science jobs
job_post_link    https://www.indeed.com/viewjob?jk=d07982277170...
job_posting      The Director of Data Science & Engineering is ...
job_title                       Director of Data Science - Virtual
location                                              New York, NY
job_role                                     data science director
job_level                                                   senior
country                                                        USA
state                                                         

## Address jargon and specialized language
(based on insights from TF-IDF and NMF topic modeling during data cleaning)

### The equal opportunity clause

In [18]:
equal_opp_clause_words = "status employment protected gender applicants race color disability religion opportunity sexual age equal orientation qualified veteran sex employer regard identity demonstrated law marital".split(' ')
len(equal_opp_clause_words)

23

In [19]:
hr_words = 'job required education master type years salary year location desired minimum include tasks level range highly expert degree bachelor position resume knowledge related duties summary requirements benefits preferred applicants phd present needed need employee employer effectively effective employees'.split()
hr_words

['job',
 'required',
 'education',
 'master',
 'type',
 'years',
 'salary',
 'year',
 'location',
 'desired',
 'minimum',
 'include',
 'tasks',
 'level',
 'range',
 'highly',
 'expert',
 'degree',
 'bachelor',
 'position',
 'resume',
 'knowledge',
 'related',
 'duties',
 'summary',
 'requirements',
 'benefits',
 'preferred',
 'applicants',
 'phd',
 'present',
 'needed',
 'need',
 'employee',
 'employer',
 'effectively',
 'effective',
 'employees']

In [190]:
df_all['equal_opportunity_clause'] = 0
#df_all['hr_words_count'] = 0

for i in range(len(df_all)):
    job_posting = df_all.job_posting[i].lower()
    
    # Equal opp clause
    eqw_count = 0
    for eqw in equal_opp_clause_words:
        if eqw in job_posting:
            eqw_count += 1
            eqw_regex = r"\b" + re.escape(eqw) + r"\b"
            job_posting = re.sub(eqw_regex, r'', job_posting, flags=re.IGNORECASE)
    
    if eqw_count >= 5:
        df_all['equal_opportunity_clause'][i] = 1
        
        
    # Remove generic HR words
    #hrw_count = 0
    for hrw in hr_words:
        if hrw in job_posting:
            hrw_count += 1
            hrw_regex = r"\b" + re.escape(hrw) + r"\b"
            job_posting = re.sub(hrw_regex, r'', job_posting, flags=re.IGNORECASE)
    #df_all['hr_words_count'] = hrw_count
    
    df_all.job_posting[i] = job_posting
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [191]:
df_all.head()

Unnamed: 0,company_descr,company_name,date_posted,email_content,email_date,email_from,email_subject,job_post_link,job_posting,job_title,location,job_role,job_level,country,state,city,job_post_date,equal_opportunity_clause
0,At Bluecore we are transforming the way eComme...,Bluecore,7 days ago,Recommended Jobs for You Jobs 1 to 23 of 23 r...,"Mon, 19 Feb 2018 13:12:58 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Senior Data Sci...,https://www.indeed.com/viewjob?jk=6c7ebde0dbd8...,we are looking for senior data scientists with...,Senior Data Scientist,"New York, NY 10002",data scientist,senior,USA,NY,New York,2018-02-19,1
1,,Loadsmart,7 days ago,Recommended Jobs for You Jobs 1 to 23 of 23 r...,"Mon, 19 Feb 2018 13:12:58 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Senior Data Sci...,https://www.indeed.com/viewjob?jk=59a797491eb2...,"who we are:\nat loadsmart, we won't settle for...",Data Scientist,"New York, NY",data scientist,,USA,NY,New York,2018-02-19,0
2,,Foursquare,8 days ago,Recommended Jobs for You Jobs 1 to 23 of 23 r...,"Mon, 19 Feb 2018 13:12:58 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Senior Data Sci...,https://www.indeed.com/viewjob?jk=7490c513ac10...,about foursquare:\nsince our inception in 2009...,Data Scientist,"New York, NY",data scientist,,USA,NY,New York,2018-02-18,1
3,About HRG\n\nHRG's TRADING TEAM recruits the t...,"The Hagan-Ricci Group, Inc.",9 days ago,Recommended Jobs for You Jobs 1 to 23 of 23 r...,"Mon, 19 Feb 2018 13:12:58 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Senior Data Sci...,https://www.indeed.com/viewjob?jk=4eceac18fce0...,a small and very successful quantitative fund ...,Data Scientist / Signal Researcher,"New York, NY",data scientist,,USA,NY,New York,2018-02-17,0
4,,Oath Inc,8 days ago,Recommended Jobs for You Jobs 1 to 23 of 23 r...,"Mon, 19 Feb 2018 13:12:58 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Senior Data Sci...,https://www.indeed.com/viewjob?jk=6b82f29c6741...,"oath, a subsidiary of verizon, is a values-led...",Data Scientist Intern,"New York, NY",data scientist,intern,USA,NY,New York,2018-02-18,1


In [192]:
len(df_all[df_all['equal_opportunity_clause'] == 1])

147

In [57]:
#len(df_all[df_all['hr_words_count'] >= 12])

In [58]:
print(df_all.job_posting[0])

we are looking for senior data scientists with strong mathematical backgrounds to work alongside our engineering teams to build the next generation of retail and commerce models that delight and empower marketers. the ideal candidate is one that has several  of experience researching, building, serving, and maintaining data science models at scale. they have first-hand experience with what works and what doesn’t, and are eager to share this experience with more junior members and guide them through that process. they are also able to and excited to help architect and build out the data science architecture  to accelerate innovation on models and facilitate serving and maintaining them. finally, they should be curious and eager to identify and explore the myriad of other products that can be built on our data asset. our culture emphasizes making good tradeoffs, working as a team, and leaving your ego at the door.

first-party data is at the core of everything we build and the data scien

In [59]:

from textblob import TextBlob

In [62]:

jobpost_blob = TextBlob(df_all.job_posting[0])

In [63]:
jobpost_blob.sentiment

Sentiment(polarity=0.22148574695744508, subjectivity=0.5689761496365269)

In [64]:
for i in range(len(df_all)):
    print(i)
    print(TextBlob(df_all.job_posting[i]).sentiment)

0
Sentiment(polarity=0.22148574695744508, subjectivity=0.5689761496365269)
1
Sentiment(polarity=0.2888095238095238, subjectivity=0.5622619047619047)
2
Sentiment(polarity=0.2820632515632516, subjectivity=0.528485088985089)
3
Sentiment(polarity=0.1291156462585034, subjectivity=0.46977891156462587)
4
Sentiment(polarity=0.1254573799216656, subjectivity=0.4252396413110698)
5
Sentiment(polarity=0.10781750906750909, subjectivity=0.5378483678483678)
6
Sentiment(polarity=0.20629657228017886, subjectivity=0.4956603505783833)
7
Sentiment(polarity=0.01428571428571429, subjectivity=0.3)
8
Sentiment(polarity=0.2072420634920635, subjectivity=0.4114417989417989)
9
Sentiment(polarity=0.2764136904761904, subjectivity=0.4731026785714285)
10
Sentiment(polarity=0.2476590909090909, subjectivity=0.6360025252525252)
11
Sentiment(polarity=0.07974386724386726, subjectivity=0.3918975468975469)
12
Sentiment(polarity=0.11218181818181816, subjectivity=0.5066060606060606)
13
Sentiment(polarity=0.19886243386243388, s

Sentiment(polarity=0.18425925925925923, subjectivity=0.41574074074074074)
148
Sentiment(polarity=0.03958333333333334, subjectivity=0.5177083333333333)
149
Sentiment(polarity=0.2263544536271809, subjectivity=0.45417814508723603)
150
Sentiment(polarity=0.24298245614035086, subjectivity=0.5460526315789473)
151
Sentiment(polarity=0.06452978056426333, subjectivity=0.3527517884414435)
152
Sentiment(polarity=0.15123376623376625, subjectivity=0.4688195138195138)
153
Sentiment(polarity=0.13403679653679654, subjectivity=0.4896841720371132)
154
Sentiment(polarity=0.0773452380952381, subjectivity=0.4111071428571428)
155
Sentiment(polarity=0.035400299289188183, subjectivity=0.4692012719790498)
156
Sentiment(polarity=0.17977736549165124, subjectivity=0.4701762523191093)
157
Sentiment(polarity=0.23126641118444394, subjectivity=0.47653821588247824)
158
Sentiment(polarity=0.1430747322852586, subjectivity=0.33268853953064476)
159
Sentiment(polarity=0.13852080123266564, subjectivity=0.4117360041088855)
1

In [65]:


import nltk
from nltk.util import ngrams

from collections import Counter
from operator import itemgetter

from nltk.corpus import stopwords

In [67]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import NMF

In [195]:

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [196]:

# testing with job posts as docs
documents = df_all.job_posting

no_features = 500

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.6, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [197]:
no_topics = 10

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)


no_top_words = 12
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
machine models techniques statistical algorithms scientist predictive modeling statistics mining using problems
Topic 1:
management project support information development technical ability design solutions systems provide including
Topic 2:
company world people help digital platform build marketing technology working building engineering
Topic 3:
analytics advanced clients client predictive modeling analytical value leadership marketing strategic global
Topic 4:
research computational center scientific community software intelligence college scientists ph applications field
Topic 5:
risk financial investment firm management quantitative finance state companies models including modeling
Topic 6:
health clinical care medical healthcare scientific plan quality appropriate products outcomes support
Topic 7:
product products development user metrics quantitative drive digital define manager teams customer
Topic 8:
customer sets large marketing like functions analytical analyze dat

In [198]:

def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(doc_index)

documents = df_all.job_posting

no_features = 500

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.6, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

no_topics = 10

# Run NMF
nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
nmf_W = nmf_model.transform(tfidf)
nmf_H = nmf_model.components_


no_top_words = 5
no_top_documents = 5
display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)


Topic 0:
machine models techniques statistical algorithms
294
298
182
251
49
Topic 1:
management project support information development
248
348
414
233
118
Topic 2:
company world people help digital
339
198
13
165
365
Topic 3:
analytics advanced clients client predictive
42
32
156
39
387
Topic 4:
research computational center scientific community
297
302
46
340
29
Topic 5:
risk financial investment firm management
197
243
237
304
256
Topic 6:
health clinical care medical healthcare
215
28
371
102
221
Topic 7:
product products development user metrics
146
81
150
151
148
Topic 8:
customer sets large marketing like
98
180
137
423
95
Topic 9:
capgemini client group practice expression
31
383
8
119
396


In [199]:
df_all.loc[209:242]

Unnamed: 0,company_descr,company_name,date_posted,email_content,email_date,email_from,email_subject,job_post_link,job_posting,job_title,location,job_role,job_level,country,state,city,job_post_date,equal_opportunity_clause
209,Crowe Horwath LLP (www.crowehorwath.com) is on...,Crowe Horwath,3 days ago,Indeed Job Alert data science jobs Jobs 1-30 ...,"Fri, 23 Feb 2018 17:42:17 -0600 (CST)",Indeed,30+ new data science jobs,https://www.indeed.com/viewjob?jk=c8b74be9782f...,\nleverage large sets of structured and unstr...,Data Scientist/Statistics - Applied Technology...,"Indianapolis, IN 46240",data scientist,,USA,IN,Indianapolis,2018-02-23,0
210,,Comcentric,3 days ago,Indeed Job Alert data science jobs Jobs 1-30 ...,"Fri, 23 Feb 2018 17:42:17 -0600 (CST)",Indeed,30+ new data science jobs,https://www.indeed.com/viewjob?jk=f07408e46d83...,we are seeking a big data engineer for a 12 mo...,Remote Big Data opportunity,United States,data science other,,USA,,,2018-02-23,0
211,PARA HealthCare Financial Services was founded...,"PARA HealthCare Analytics, Inc",3 days ago,Indeed Job Alert data science jobs Jobs 1-30 ...,"Fri, 23 Feb 2018 17:42:17 -0600 (CST)",Indeed,30+ new data science jobs,https://www.indeed.com/viewjob?jk=b942842395c8...,"para healthcare analytics, inc. was founded in...",Healthcare Data Analyst (US-Remote),Remote,data analyst,,,,,2018-02-23,0
212,,Arch,2 days ago,Indeed Job Alert data science jobs Jobs 1-30 ...,"Fri, 23 Feb 2018 17:42:17 -0600 (CST)",Indeed,30+ new data science jobs,https://www.indeed.com/viewjob?jk=14d660a43a6f...,overview\nthe company\narch capital group ltd....,"Data Scientist, Strategic Analytics Services","Jersey City, NJ",data scientist,,USA,NJ,Jersey City,2018-02-24,0
213,,EZEN Computer Services Inc.,3 days ago,Indeed Job Alert data science jobs Jobs 1-30 ...,"Fri, 23 Feb 2018 17:42:17 -0600 (CST)",Indeed,30+ new data science jobs,https://www.indeed.com/viewjob?jk=7c739cc0ff86...,"title: data scientist\n: richardson, tx\ndurat...",Data Scientist,"Richardson, TX",data scientist,,USA,TX,Richardson,2018-02-23,0
214,,"Pleio, Inc",2 days ago,Indeed Job Alert data science jobs Jobs 1-30 ...,"Fri, 23 Feb 2018 17:42:17 -0600 (CST)",Indeed,30+ new data science jobs,https://www.indeed.com/viewjob?jk=dce6fe9d9262...,data analyst (remote)\n description\nabout ple...,Data Analyst (Remote),"Newark, NJ",data analyst,,USA,NJ,Newark,2018-02-24,0
215,Sanofi is dedicated to supporting people throu...,Sanofi US,3 days ago,Indeed Job Alert data science jobs Jobs 1-30 ...,"Fri, 23 Feb 2018 17:42:17 -0600 (CST)",Indeed,30+ new data science jobs,https://www.indeed.com/viewjob?jk=b3622e5372e1...,the medical science liaison (msl) will be resp...,Medical Science Liaison,United States,,,USA,,,2018-02-23,0
216,,Omnicom Health Group,4 days ago,Indeed Job Alert data science jobs Jobs 1-30 ...,"Fri, 23 Feb 2018 17:42:17 -0600 (CST)",Indeed,30+ new data science jobs,https://www.indeed.com/viewjob?jk=85a979680b1c...,title: director data and analytics\ncompany / ...,"Director, Data Analytics","New York, NY",data science director,senior,USA,NY,New York,2018-02-22,0
217,,Turner,4 days ago,Indeed Job Alert data science jobs Jobs 1-30 ...,"Fri, 23 Feb 2018 17:42:17 -0600 (CST)",Indeed,30+ new data science jobs,https://www.indeed.com/viewjob?jk=7c5e15716b7b...,": the senior vice president, data, analytics, ...","SVP, Data Analytics/Decision Sc","Atlanta, GA 30319",data science director,senior,USA,GA,Atlanta,2018-02-22,1
218,,Altamira Technologies Corporation,3 days ago,Indeed Job Alert data science jobs Jobs 1-30 ...,"Fri, 23 Feb 2018 17:42:17 -0600 (CST)",Indeed,30+ new data science jobs,https://www.indeed.com/viewjob?jk=7d5a580c49d6...,altamira is a top open source technology compa...,Data Scientist,"Tampa, FL",data scientist,,USA,FL,Tampa,2018-02-23,1


In [174]:
df_all.loc[428].job_post_link

'https://www.indeed.com/viewjob?jk=c54019e3d2f3060e&from=rje&rgtk=1c773s9vb05l66lg'

In [200]:
# calculating cosine similarities
from sklearn.preprocessing import normalize

In [208]:
norm_features = normalize(nmf_W)
norm_features

array([[ 0.7717836 ,  0.        ,  0.61287696, ...,  0.1695049 ,
         0.        ,  0.        ],
       [ 0.72612043,  0.        ,  0.65179208, ...,  0.21889768,
         0.        ,  0.        ],
       [ 0.07541182,  0.        ,  0.66253316, ...,  0.23151346,
         0.        ,  0.        ],
       ..., 
       [ 0.56148583,  0.        ,  0.82475522, ...,  0.04857007,
         0.        ,  0.        ],
       [ 0.        ,  0.21675788,  0.56410612, ...,  0.        ,
         0.3110409 ,  0.        ],
       [ 0.35028428,  0.92508306,  0.        , ...,  0.        ,
         0.14613429,  0.        ]])

In [204]:
target_doc1 = norm_features[0,:]
target_doc1

array([ 0.7717836 ,  0.        ,  0.61287696,  0.        ,  0.        ,
        0.        ,  0.        ,  0.1695049 ,  0.        ,  0.        ])

In [206]:
similarities = norm_features.dot(target_doc1)
similarities

array([  1.00000000e+00,   9.96980421e-01,   5.03495577e-01,
         7.57780400e-01,   7.40475237e-01,   5.16117083e-01,
         8.44297084e-01,   7.30987933e-01,   5.98134684e-02,
         8.95349112e-01,   9.56641731e-01,   6.87744485e-01,
         1.97295505e-01,   5.92948108e-01,   3.38271665e-01,
         7.28506626e-02,   5.78675675e-01,   1.47748101e-01,
         3.75548830e-01,   8.07581860e-01,   0.00000000e+00,
         4.52136498e-01,   4.42090660e-01,   5.90584170e-01,
         2.64192133e-01,   5.42752189e-01,   4.31632748e-01,
         7.50515989e-01,   6.22231402e-02,   1.61772443e-01,
         7.90978713e-01,   1.04684035e-01,   1.07118944e-01,
         7.89423413e-01,   6.58304329e-01,   7.81448168e-02,
         9.29794671e-01,   4.80271179e-01,   6.61274601e-01,
         2.83938321e-01,   5.56300289e-01,   3.61309220e-01,
         9.71077779e-02,   7.14344429e-01,   8.75414528e-01,
         0.00000000e+00,   9.73279173e-02,   7.95990012e-01,
         7.09333178e-01,

In [209]:
job_titles = df_all.job_title

In [210]:
norm_features_df = pd.DataFrame(norm_features, index=job_titles)
norm_features_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Senior Data Scientist,0.772,0.0,0.613,0.0,0.0,0.0,0.0,0.17,0.0,0.0
Data Scientist,0.726,0.0,0.652,0.0,0.0,0.0,0.0,0.219,0.0,0.0
Data Scientist,0.075,0.0,0.663,0.708,0.02,0.0,0.0,0.232,0.0,0.0
Data Scientist / Signal Researcher,0.679,0.0,0.381,0.0,0.34,0.527,0.0,0.0,0.0,0.0
Data Scientist Intern,0.211,0.062,0.942,0.0,0.252,0.0,0.0,0.0,0.0,0.0


In [218]:
target_doc1b = norm_features_df.iloc[0]

In [216]:
similarities = norm_features_df.dot(target_doc1b)
print(similarities.nlargest(10))

job_title
Senior Data Scientist                           1.000
Data Scientist                                  0.997
Data Scientist                                  0.993
Data Science Intern                             0.985
Data Scientist - Sr Level - 120k-140k           0.972
Data Science Intern                             0.969
Data Scientist                                  0.966
Data Scientist (PYTHON, HADOOP)                 0.960
Data Scientist                                  0.957
Data Scientist - Alexa Language Technologies    0.957
dtype: float64


In [219]:
target_doc2b = norm_features_df.iloc[4]

In [220]:
similarities2 = norm_features_df.dot(target_doc2b)
print(similarities2.nlargest(10))

job_title
Data Scientist Intern                            1.000
Data Scientist                                   0.979
Data Scientist                                   0.979
Machine Learning Educator / Research Engineer    0.969
Data Scientist                                   0.965
Applied Data Scientist                           0.965
Principal Data Scientist                         0.962
Data Scientist                                   0.962
Senior Data Scientist (Life Science Startup)     0.962
Software Engineer, Machine Learning              0.961
dtype: float64


In [231]:
print((similarities - similarities2).nlargest(10))

job_title
Data Scientist                                                        0.561
Data Scientist                                                        0.561
Data Scientist                                                        0.557
Data Scientist                                                        0.552
Data Scientist @ Apple                                                0.536
Data Scientist - Data Modelling (or) Machine learning (US Citizen)    0.535
Data Scientist                                                        0.534
Data Scientist - Top Secret (TS) Clearance Required                   0.532
Data Scientist Intern                                                 0.530
Data Scientist (ASD)                                                  0.529
dtype: float64


## Topic modeling on subsets

In [78]:
df_all.job_role.value_counts()

data scientist             326
data analyst                70
data science director       54
data science other          42
data science manager        41
data engineer               32
data science developer      20
                            18
data science consultant     14
data science researcher      4
data science academia        3
Name: job_role, dtype: int64

In [83]:
df_all.job_level.value_counts()

          418
senior    145
intern     41
junior     20
Name: job_level, dtype: int64

In [154]:

# testing with job posts as docs
#sliced = df_all[df_all.job_role == 'data analyst'].copy()
sliced = df_all[(df_all.job_level == '') | (df_all.job_level == 'junior')].copy()
documents = sliced.job_posting

no_features = 500

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.6, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()


no_topics = 10

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)


no_top_words = 12
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
analytics models new techniques problems ability scientist statistical using develop strong statistics
Topic 1:
google hadoop big growing content production build systems scale web plus real
Topic 2:
firm advisory group investment private partners candidates services information clients results technology
Topic 3:
research software computational clearance center security scientific intelligence engineering federal technology development
Topic 4:
ll requires analytics engineering make decision help bring professionals produce right experienced
Topic 5:
customer marketing sets functions large like database analytical managing analyze oriented perform
Topic 6:
capgemini client group practice management global digital collaborative needs services media solutions
Topic 7:
reliability engineering quality automation resources discipline physics providing cross functional focused database
Topic 8:
000 support scientific 10 requires broad language management program presentations subje