In [184]:
import csv
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import re

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy

from IPython.display import Image
import seaborn as sns

import time

In [185]:
# Notebook-wide pandas display options (column/row caps and display precision).
for option_name, option_value in (
    ('display.max_columns', 50),
    ('display.max_rows', 50),
    ('display.precision', 3),
):
    pd.set_option(option_name, option_value)

In [186]:
# Load the pickled, parsed job-posts data (one row per job post).
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('../data_local/job_posts_parsed_022818.pkl')

In [187]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,company_descr,company_name,date_posted,email_content,email_date,email_from,email_subject,job_post_link,job_posting,job_title,location
0,,Geophy,3 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=1fbba60eb2c9...,GeoPhy is a company that offers independent pr...,Data Scientist,"New York, NY"
1,Bloomberg is a company dedicated to helping so...,Bloomberg,4 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=a1e3470c4783...,Job Requisition Number:65487\n\nAt Bloomberg's...,Machine Learning Educator / Research Engineer,"New York, NY"
2,,Simons Foundation,5 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=d4a1c8ed325a...,The Simons Foundation is beginning a new compu...,Systems Software Engineer - Neuroscience,"New York, NY 10010"
3,"Just because you match a job on paper, doesn't...",KellyMitchell,4 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=4dbf56a8c979...,KellyMitchell matches the best IT and business...,Data Scientist - Intermediate,"Chesterfield, MO"
4,"Headquartered in Woonsocket, Rhode Island, CVS...",CVS Health,5 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=83793cb590f2...,The Systems Data Analyst is responsible for pe...,Data Analyst,"New York, NY"


In [188]:
# Summary statistics (count / unique / top / freq) for every column.
df.describe()

Unnamed: 0,company_descr,company_name,date_posted,email_content,email_date,email_from,email_subject,job_post_link,job_posting,job_title,location
count,128.0,128.0,128,128,128,128,128,128,128.0,128,128
unique,63.0,108.0,24,6,6,3,4,127,114.0,85,82
top,,,1 day ago,Indeed Job Alert data science jobs Jobs 1-30 ...,"Tue, 27 Feb 2018 18:01:23 -0600 (CST)",Indeed,30+ new data science jobs,http://talentsolvers.com/career/?cjobid=KM3317...,,Data Scientist,"New York, NY"
freq,40.0,3.0,43,33,33,66,66,2,3.0,31,20


In [189]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 11 columns):
company_descr    128 non-null object
company_name     128 non-null object
date_posted      128 non-null object
email_content    128 non-null object
email_date       128 non-null object
email_from       128 non-null object
email_subject    128 non-null object
job_post_link    128 non-null object
job_posting      128 non-null object
job_title        128 non-null object
location         128 non-null object
dtypes: object(11)
memory usage: 11.1+ KB


In [190]:
# Number of posts per company name.
df.company_name.value_counts()

                                        3
McKinsey & Company                      3
Taboola                                 3
IBM                                     3
Rockstar New York                       2
Glocomms                                2
BDIPlus                                 2
PerkinElmer                             2
Geophy                                  2
Western & Southern Financial Group      2
GSSA, LLC                               2
Verisk Maplecroft                       2
Simons Foundation                       2
Arthur Grand Technologies Inc           2
Bristol-Myers Squibb                    2
OppenheimerFunds                        2
Comcast                                 1
Redolent, Inc.                          1
Tax                                     1
Clearlink                               1
Integral Ad Science                     1
FIR.ai                                  1
ZyLAB                                   1
Nite Services Recruitment         

In [8]:
# Job-title frequencies restricted to posts whose company_name is exactly 'Facebook'.
df.loc[df.company_name == 'Facebook', 'job_title'].value_counts()

Director, Data Science & Platforms - SMB           5
Data Scientist, Analytics (Instagram)              3
Software Engineer, Machine Learning                2
Data Scientist, SMB                                2
Data Science Manager, Analytics (Instagram NYC)    1
Name: job_title, dtype: int64

In [191]:
# Frequency of each raw job title.
df.job_title.value_counts()
# TODO: standardize the titles by deriving two fields from them:
#   levels = [intern, junior, senior]
#   roles  = [scientist, engineer, analyst, manager, director, consultant, other]


Data Scientist                                                          31
                                                                         3
Data Scientist - Healthcare Analytics & Delivery, Advanced Analytics     3
Data Science Intern                                                      3
Data Science Associate                                                   2
Data Science Analytics Leader                                            2
Data Science Statistician                                                2
Data Science Analyst                                                     2
Data Analyst                                                             2
Systems Software Engineer - Neuroscience                                 2
Senior Data Engineer (Data Science Team)                                 2
Senior Associate, Data Science                                           1
Manager, GPS Analytics and Data Science                                  1
Data Science Manager     

In [192]:
# Work on a copy so the raw frame `df` stays untouched by the cleaning steps below.
df2 = df.copy()

In [193]:
# Create the derived columns, initialised to the empty string ("not classified yet").
for derived_column in ('job_role', 'job_level'):
    df2[derived_column] = ''

In [194]:
# Keyword groups used to derive `job_role` and `job_level` from the free-text job title.
# roles
scientist_roles = ['scientist', 'statistician', 'data science specialist']
engineer_roles = ['engineer']
analyst_roles = ['analyst']
director_roles = ['director', 'vice president', 'vp', 'svp', 'cto', 'cte']
manager_roles = ['manager', 'leader', 'lead']
developer_roles = ['developer', 'programmer', 'architect']
consultant_roles = ['consultant']
researcher_roles = ['researcher']
academic_roles = ['tenure', 'post doc']
other_roles = ['data science', 'big data', 'machine learning']

# levels = [intern, junior, senior]
intern_levels = ['intern']
junior_levels = ['junior', 'jr', 'associate']
senior_levels = ['senior', 'sr', 'principal'] + director_roles + manager_roles

# (label, keywords) pairs in priority order: the first group with a hit wins.
role_rules = [
    ('data science academia', academic_roles),
    ('data scientist', scientist_roles),
    ('data engineer', engineer_roles),
    ('data analyst', analyst_roles),
    ('data science director', director_roles),
    ('data science manager', manager_roles),
    ('data science developer', developer_roles),
    ('data science consultant', consultant_roles),
    ('data science researcher', researcher_roles),
    ('data science other', other_roles),
]
level_rules = [
    ('intern', intern_levels),
    ('junior', junior_levels),
    ('senior', senior_levels),
]


def _keyword_in_title(keyword, title):
    """Return True if `keyword` occurs in the (already lower-cased) `title`.

    Abbreviations of three characters or fewer ('vp', 'cto', 'sr', 'jr', ...)
    must match as whole words; as raw substrings they also hit unrelated words
    such as 'director' / 'doctoral' ('cto') or 'israel' ('sr').
    Longer keywords keep plain substring matching, so 'engineer' still matches
    'engineering' (and 'intern' still matches 'internship' / 'internal').
    """
    if len(keyword) <= 3:
        return re.search(r'\b' + re.escape(keyword) + r'\b', title) is not None
    return keyword in title


def _first_matching_label(title, rules):
    """Return the label of the first rule with a keyword found in `title`, else ''."""
    if not isinstance(title, str):
        return ''
    title = title.lower()
    for label, keywords in rules:
        if any(_keyword_in_title(keyword, title) for keyword in keywords):
            return label
    return ''


# Assign whole columns at once. The previous `df2['job_role'][i] = ...` form is
# chained assignment: it triggers SettingWithCopyWarning and is a no-op under
# pandas Copy-on-Write. Building plain lists also avoids relying on the index
# being 0..n-1, and makes the cell idempotent on re-run.
df2['job_role'] = [_first_matching_label(title, role_rules) for title in df2['job_title']]
df2['job_level'] = [_first_matching_label(title, level_rules) for title in df2['job_title']]
        
                
    

In [195]:
# Level breakdown for the posts classified as 'data analyst'.
df2.loc[df2.job_role == 'data analyst', 'job_level'].value_counts()

          11
intern     2
Name: job_level, dtype: int64

In [196]:
# Distribution of derived roles; the empty-string bucket holds titles no rule matched.
df2.job_role.value_counts()

data scientist             65
data science other         17
data analyst               13
data science manager        9
data engineer               9
data science director       5
                            5
data science developer      2
data science consultant     2
data science researcher     1
Name: job_role, dtype: int64

In [197]:
# Distribution of derived levels; the empty-string bucket holds titles with no level keyword.
df2.job_level.value_counts()

          83
senior    25
intern    12
junior     8
Name: job_level, dtype: int64

In [198]:
# Spot-check the new job_role / job_level columns.
df2.head()

Unnamed: 0,company_descr,company_name,date_posted,email_content,email_date,email_from,email_subject,job_post_link,job_posting,job_title,location,job_role,job_level
0,,Geophy,3 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=1fbba60eb2c9...,GeoPhy is a company that offers independent pr...,Data Scientist,"New York, NY",data scientist,
1,Bloomberg is a company dedicated to helping so...,Bloomberg,4 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=a1e3470c4783...,Job Requisition Number:65487\n\nAt Bloomberg's...,Machine Learning Educator / Research Engineer,"New York, NY",data engineer,
2,,Simons Foundation,5 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=d4a1c8ed325a...,The Simons Foundation is beginning a new compu...,Systems Software Engineer - Neuroscience,"New York, NY 10010",data engineer,
3,"Just because you match a job on paper, doesn't...",KellyMitchell,4 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=4dbf56a8c979...,KellyMitchell matches the best IT and business...,Data Scientist - Intermediate,"Chesterfield, MO",data scientist,
4,"Headquartered in Woonsocket, Rhode Island, CVS...",CVS Health,5 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=83793cb590f2...,The Systems Data Analyst is responsible for pe...,Data Analyst,"New York, NY",data analyst,


In [200]:
# Frequency of each raw location string.
df2.location.value_counts()
# TODO: derive country, state and city into separate fields from this

New York, NY                  20
New York City, NY, US          5
New York, NY 10022             4
Los Angeles, CA                4
New York, New York             3
San Francisco, CA              3
Research Triangle Park, NC     3
                               3
Cincinnati, OH                 2
Herndon, VA 20170              2
Chicago, IL                    2
Denver, CO                     2
Waltham, MA                    2
Columbus, OH                   2
Sunnyvale, CA                  2
New York, NY 10010             2
Cary, NC                       2
Israel                         1
Oakbrook Terrace, IL, US       1
Hopewell, NJ, US               1
Dearborn, MI 48126             1
Salt Lake County, UT, US       1
Freeport, TX 77541             1
New York, NY 10003             1
Brooklyn, NY, US               1
                              ..
White Plains, NY 10601         1
Greater New York City Area     1
Mahwah, New Jersey             1
Bellevue, WA 98005             1
United Sta

In [201]:
# Derive country, state and city into separate fields from the raw `location` string.

# Substrings that mark a location as being in the United States.
murica_list = [', US', 'USA', 'United States', 'America']

# Exact (normalised, lower-cased) locations with hand-assigned (city, state, country).
_known_locations = {
    # International locations
    'delhi, in': ('Delhi', '', 'India'),
    'bangalore, in': ('Bangalore', '', 'India'),
    'san josé, cr': ('San José', '', 'Costa Rica'),
    'são paulo area, brazil': ('São Paulo', '', 'Brazil'),
    # US locations given without a "City, ST" shape
    'washington d.c.': ('Washington', 'DC', 'USA'),
    'new york': ('New York', 'NY', 'USA'),
    'michigan': ('', 'MI', 'USA'),
    'minnesota': ('', 'MN', 'USA'),
    'california': ('', 'CA', 'USA'),
    'maryland': ('', 'MD', 'USA'),
}


def _parse_location(raw_location):
    """Split a raw location string into a (city, state, country) tuple.

    Parts that cannot be determined are returned as ''. Any remaining
    "City, State" shaped string is assumed to be in the USA.
    """
    if not isinstance(raw_location, str):
        return '', '', ''

    # Normalise: drop digits (zip codes) and collapse metro-area spellings.
    location = re.sub(r'\d', r'', raw_location)
    location = re.sub(r'(Greater )?New York City( Area)?', r'New York', location, flags=re.IGNORECASE)
    location = re.sub(r'San Francisco Bay Area', r'San Francisco', location, flags=re.IGNORECASE)
    location = re.sub(r'Pennsylvania', r'PA', location, flags=re.IGNORECASE)
    location = re.sub(r'Metro Area', r'', location, flags=re.IGNORECASE)
    location = location.strip()

    key = location.lower()
    if key in _known_locations:
        return _known_locations[key]

    country = 'USA' if any(marker.lower() in key for marker in murica_list) else ''

    # Strip a trailing country suffix. Anchoring at the end keeps "Boston, MA, USA"
    # from becoming "Boston, MAA", which removing a bare ', US' did.
    location = re.sub(r',\s*(?:USA|US|United States(?: of America)?)\s*$', r'',
                      location, flags=re.IGNORECASE)

    # Trim each part so the state is 'NY' rather than ' NY'. Use only the first
    # two parts so a location with extra commas no longer raises on unpacking.
    parts = [part.strip() for part in location.split(',')]
    if len(parts) >= 2:
        return parts[0], parts[1], 'USA'
    return '', '', country


# Assign whole columns at once: `df2['city'][i] = ...` is chained assignment
# (SettingWithCopyWarning; a no-op under pandas Copy-on-Write).
_parsed_locations = [_parse_location(raw_location) for raw_location in df2['location']]
df2['country'] = [country for _city, _state, country in _parsed_locations]
df2['state'] = [state for _city, state, _country in _parsed_locations]
df2['city'] = [city for city, _state, _country in _parsed_locations]
    
     

In [202]:
# Distribution of derived countries; '' means no country could be derived.
df2['country'].value_counts()

USA      122
           5
India      1
Name: country, dtype: int64

In [203]:
# Raw locations for which no city was derived.
df2.loc[df2.city == '', 'location'].value_counts()

                 3
Israel           1
United States    1
Virginia         1
Name: location, dtype: int64

In [204]:
from datetime import timedelta

In [55]:
# Scratch: a one-day offset for experimenting with date arithmetic.
tdelta = timedelta(days=1)

In [62]:
# Scratch: today's date minus `tdelta`.
(datetime.today() - tdelta).date()

datetime.date(2018, 2, 26)

In [205]:
# Frequency of each raw relative posting date ("N days ago", "Posted N hours ago", ...).
df2.date_posted.value_counts()
# Actual dates are calculated from this relative to a reference date, so the
# conversion should be run on the day of scraping (02/26/18 for this run).

1 day ago              43
2 days ago             14
4 days ago             11
Posted 1 day ago        9
5 days ago              8
Posted 2 days ago       7
3 days ago              6
22 hours ago            4
6 days ago              4
11 days ago             3
                        3
Posted 21 hours ago     2
7 days ago              2
18 days ago             2
30+ days ago            1
Posted 20 hours ago     1
9 days ago              1
15 days ago             1
Posted 18 hours ago     1
23 hours ago            1
Posted 9 hours ago      1
Posted 23 hours ago     1
21 hours ago            1
21 days ago             1
Name: date_posted, dtype: int64

In [70]:
# Scratch: take one raw value to prototype the parsing regex on.
# NOTE(review): the recorded output differs from row 0 shown by df2.head(),
# so this cell looks like it was last run against an earlier dataset.
test = df2.date_posted[0]
test

'7 days ago'

In [82]:
# Scratch prototype of the parsing later wrapped in clean_date_posted().
# The character class strips every '+' and every 's' in the string (not only a
# plural suffix), which turns "days" into "day".
date_posted_str = re.sub(r'[\+s]', r'', test)
#date_posted_str
# Extract "<number> <unit>", where unit is hour, day or week.
ndays_text = re.search(r'\d* ((hour)|(day)|(week))', date_posted_str, re.IGNORECASE).group().strip()
ndays_text

'7 day'

In [206]:
def clean_date_posted(date_posted_str, refdt=None):
    """Convert a relative posting age such as '3 days ago' into a calendar date.

    Parameters
    ----------
    date_posted_str : str
        Text containing "<N> hour(s)|day(s)|week(s)", optionally with a '+'
        after the number (e.g. '30+ days ago', 'Posted 21 hours ago').
    refdt : datetime.datetime, optional
        The moment the relative age refers to. Defaults to the time of the
        call. It is resolved inside the function because a
        `refdt=datetime.today()` default would be evaluated only once, when
        the function is defined.

    Returns
    -------
    datetime.date or None
        `refdt` minus the parsed number of days. Ages given in hours map to
        the date of `refdt` itself. None if the input is not a string or no
        "<N> <unit>" pattern is found.
    """
    if refdt is None:
        refdt = datetime.today()
    if not isinstance(date_posted_str, str):
        return None

    # Require at least one digit so text like "a day ago" yields None rather than
    # failing while unpacking; the unit is matched case-insensitively.
    match = re.search(r'(\d+)\s*\+?\s*(hour|day|week)s?\b', date_posted_str, re.IGNORECASE)
    if match is None:
        return None

    count = int(match.group(1))
    period = match.group(2).lower()
    if period == 'hour':
        ndays = 0          # posted within the reference day
    elif period == 'week':
        ndays = count * 7
    else:
        ndays = count

    return (refdt - timedelta(days=ndays)).date()

In [207]:
# Convert every relative posting age to a date; rows that cannot be parsed get None.
# NOTE(review): no refdt is passed, so dates are relative to when the notebook runs,
# not to when the posts were scraped. Consider passing the scrape date explicitly.
df2['job_post_date'] = df2['date_posted'].apply(clean_date_posted)

In [208]:
# Number of posts per derived posting date.
df2.job_post_date.value_counts()

2018-02-27    52
2018-02-26    21
2018-02-28    12
2018-02-24    11
2018-02-23     8
2018-02-25     6
2018-02-22     4
2018-02-17     3
2018-02-21     2
2018-02-10     2
2018-02-19     1
2018-02-07     1
2018-01-29     1
2018-02-13     1
Name: job_post_date, dtype: int64

In [209]:
# Spot-check all derived columns (role, level, country/state/city, post date).
df2.head()

Unnamed: 0,company_descr,company_name,date_posted,email_content,email_date,email_from,email_subject,job_post_link,job_posting,job_title,location,job_role,job_level,country,state,city,job_post_date
0,,Geophy,3 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=1fbba60eb2c9...,GeoPhy is a company that offers independent pr...,Data Scientist,"New York, NY",data scientist,,USA,NY,New York,2018-02-25
1,Bloomberg is a company dedicated to helping so...,Bloomberg,4 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=a1e3470c4783...,Job Requisition Number:65487\n\nAt Bloomberg's...,Machine Learning Educator / Research Engineer,"New York, NY",data engineer,,USA,NY,New York,2018-02-24
2,,Simons Foundation,5 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=d4a1c8ed325a...,The Simons Foundation is beginning a new compu...,Systems Software Engineer - Neuroscience,"New York, NY 10010",data engineer,,USA,NY,New York,2018-02-23
3,"Just because you match a job on paper, doesn't...",KellyMitchell,4 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=4dbf56a8c979...,KellyMitchell matches the best IT and business...,Data Scientist - Intermediate,"Chesterfield, MO",data scientist,,USA,MO,Chesterfield,2018-02-24
4,"Headquartered in Woonsocket, Rhode Island, CVS...",CVS Health,5 days ago,Recommended Jobs for You Jobs 1 to 20 of 20 r...,"Mon, 26 Feb 2018 13:47:08 -0600 (CST)",Indeed Job Alert,You have new recommended jobs: Data Scientist ...,https://www.indeed.com/viewjob?jk=83793cb590f2...,The Systems Data Analyst is responsible for pe...,Data Analyst,"New York, NY",data analyst,,USA,NY,New York,2018-02-23


In [210]:
# Persist the cleaned job-posts frame (original columns plus the derived ones) as a pickle.
df2.to_pickle('../data_local/job_posts_clean_022818.pkl')

In [211]:
# Number of posts per source e-mail timestamp.
df2.email_date.value_counts()
# This field is not used for now; it is kept for reference only.

Tue, 27 Feb 2018 18:01:23 -0600 (CST)    33
Mon, 26 Feb 2018 18:18:35 -0600 (CST)    33
Tue, 27 Feb 2018 13:27:48 -0600 (CST)    20
Mon, 26 Feb 2018 13:47:08 -0600 (CST)    20
Tue, 27 Feb 2018 01:00:46 +0000 (UTC)    11
Wed, 28 Feb 2018 01:24:21 +0000 (UTC)    11
Name: email_date, dtype: int64

## Job posts

In [212]:
# Raw repr of the first job posting (escape sequences such as \n shown literally).
df2.job_posting[0]

"GeoPhy is a company that offers independent property intelligence for a variety of businesses that invest or are otherwise involved in the real estate market and/or financial sector. GeoPhy provides objective data and analysis that is accessible for business through our online platform.\n\nYou will have the opportunity to accelerate our rapidly growing organisation. We're a lean team, so your impact will be felt immediately.\n\nWe're already working with some of the largest real estate lenders and investors across the globe, and we believe that our AVM will truly disrupt the commercial real estate industry. Using your machine learning and analytical skills, you will contribute to the development of GeoPhy's core information products. This includes working on the development of our flagship product, the Automated Valuation Model (AVM) that we've developed for the commercial real estate market.\n\nWhat you'll be responsible for\nDeveloping and maintaining predictive valuation algorithms

In [110]:
# Print the first posting so its line breaks render.
# NOTE(review): the recorded output is a different posting from the one the
# previous cell shows for index 0; this looks like stale output from an earlier run.
print(df2.job_posting[0])

We are looking for Senior Data Scientists with strong mathematical backgrounds to work alongside our engineering teams to build the next generation of retail and commerce models that delight and empower marketers. The ideal candidate is one that has several years of experience researching, building, serving, and maintaining data science models at scale. They have first-hand experience with what works and what doesn’t, and are eager to share this experience with more junior members and guide them through that process. They are also able to and excited to help architect and build out the data science architecture needed to accelerate innovation on models and facilitate serving and maintaining them. Finally, they should be curious and eager to identify and explore the myriad of other products that can be built on our data asset. Our culture emphasizes making good tradeoffs, working as a team, and leaving your ego at the door.

First-party data is at the core of everything we build and the

In [165]:
# Number of postings containing the exact phrase below (the match is case-sensitive).
count = sum(1 for posting in df2.job_posting if 'equal opportunity employer' in posting)
print(count)


# status employment protected gender applicants race color disability religion opportunity sexual age equal orientation qualified veteran sex employer regard identity demonstrated law marital


# job required education master type years salary year location desired minimum include tasks level range highly expert degree bachelor position resume knowledge related duties summary requirements benefits preferred applicants phd present needed need employee employer effectively effective employees

53


In [91]:
# testing

from textblob import TextBlob

In [111]:

# Wrap the first job posting in a TextBlob for the exploratory calls below.
jobpost_blob = TextBlob(df2.job_posting[0])

In [112]:
# Noun phrases TextBlob extracts from the posting.
jobpost_blob.noun_phrases

WordList(['senior data scientists', 'strong mathematical backgrounds', 'engineering teams', 'commerce models', 'empower marketers', 'ideal candidate', 'data science models', 'first-hand experience', 'doesn ’ t', 'junior members', 'data science architecture', 'accelerate innovation', 'data asset', 'first-party', 'data', 'science team', 'bluecore', 'exciting ways', 'data asset', '’ re', 'data scientists', 'powerful models', 'empower marketers', 'right decisions', 'building models', 'literature search', 'iterative process', 'suitable model', 'wide variety', 'bayesian', 'customer lifetime value', 'matrix factorization', 'customer ’ s product affinity', 'data points', 'double revenue', 'flexible manner', '300+ diverse customers', 'span industries', 'bigquery', 'spark', 'cloud sql', 'keras', 'tensorflow', 'airflow', 'google app engine', 'google compute engine', '’ re', '’ s', 'boltzmann', 'recurrent neural networks', 'convolutional neural networks', 'extensive dive', 'differential privacy', 

In [113]:
# TextBlob sentiment (polarity, subjectivity) for the posting.
jobpost_blob.sentiment

Sentiment(polarity=0.21752183489025595, subjectivity=0.5575275309485834)

In [115]:
# Print the row number followed by the TextBlob sentiment of every job posting.
for i, posting in enumerate(df2.job_posting):
    print(i)
    print(TextBlob(posting).sentiment)

0
Sentiment(polarity=0.21752183489025595, subjectivity=0.5575275309485834)
1
Sentiment(polarity=0.2888095238095238, subjectivity=0.5622619047619047)
2
Sentiment(polarity=0.27485096500721506, subjectivity=0.5180242153679654)
3
Sentiment(polarity=0.1291156462585034, subjectivity=0.46977891156462587)
4
Sentiment(polarity=0.12274914801510545, subjectivity=0.42007230358294184)
5
Sentiment(polarity=0.10781750906750909, subjectivity=0.5378483678483678)
6
Sentiment(polarity=0.20129370629370633, subjectivity=0.4879786879786879)
7
Sentiment(polarity=0.012500000000000004, subjectivity=0.3125)
8
Sentiment(polarity=0.20635888501742164, subjectivity=0.40801393728222995)
9
Sentiment(polarity=0.2459273182957393, subjectivity=0.43743734335839596)
10
Sentiment(polarity=0.2243340380549683, subjectivity=0.6019403335682404)
11
Sentiment(polarity=0.07249442476715205, subjectivity=0.3805129214220124)
12
Sentiment(polarity=0.1015151515151515, subjectivity=0.49727272727272726)
13
Sentiment(polarity=0.195376190

In [116]:
# Inspect a single posting by index label.
# NOTE(review): df.info() above reports 128 rows (labels 0-127), so label 194 would
# raise KeyError on a fresh run; the recorded output appears to come from an earlier,
# larger dataset.
df2.job_posting[194]

"Greetings,\nHope you doing good.\nThis is Karan Senior Technical Recruiter form STCG Inc.. We have come across your profile on LinkedIn/Job Boards and we found your profile matching to the below job description.\nJob Title: Data Scientist\nLocation : Redmond, WA & Palo Alto, CA\nDuration : Long term Contract\nThis is junior to mid level position ( look for around 3 years of PHD guys)\nKey skills needed:\nData Science and Machine learning techniques.\nKnowledge of a variety of machine learning techniques (clustering, decision tree learning, artificial neural networks, etc.) and their real-world advantages/drawbacks.\nHands on experience with ML techniques for text processing\nHand on experience with Deep learning techniques (Not mandatory)\nProgramming with R or Python, SQL\nJob Type: Contract\nRequired experience:\nTotal IT: 10 years\nMachine Learing: 6 years\nScripting: 7 years\nData Scientist: 8 years\nRequired education:\nBachelor's\nJob Location:\nRedmond, WA"

In [117]:
def get_count(item):
    """Sort key: return the second element (the count) of a (word, count) pair."""
    return item[1]

# Word frequencies of the posting held in `jobpost_blob`, most frequent first.
ranked_word_counts = sorted(jobpost_blob.word_counts.items(), key=get_count, reverse=True)
for word, count in ranked_word_counts:
    print("%15s %i" % (word, count))

            and 44
             to 30
             of 16
            the 15
         models 13
              a 11
           data 10
           with 10
           that 10
     experience 10
             as 10
             we 9
            our 9
             at 8
            for 7
            are 6
             is 6
             or 6
          build 5
              ’ 5
           team 5
             in 5
        science 4
          scale 4
       identify 4
             an 4
          teams 3
            one 3
        serving 3
    maintaining 3
           they 3
           what 3
           them 3
          asset 3
       bluecore 3
          their 3
      customers 3
          model 3
           such 3
       leverage 3
       relevant 3
       learning 3
            all 3
     scientists 2
         strong 2
    backgrounds 2
           work 2
    engineering 2
        empower 2
      marketers 2
          years 2
       building 2
           have 2
          eager 2
          share 2

In [98]:


import nltk

In [99]:
from nltk.util import ngrams

from collections import Counter
from operator import itemgetter

from nltk.corpus import stopwords

In [118]:

# Tokens to ignore: NLTK English stopwords, punctuation, and words that are
# generic across these job postings.
stop = set(stopwords.words('english'))
stop.update(['.', ',', '(', ')', "'", '"'])
stop.update(['degree', 'phd', 'employer', 'data', "analytics", 'analysis', 'machine', 'learning', 'experience', 'years'])

# Accumulate n-gram counts (n = 2, i.e. bigrams) over all postings.
n = 2
counter = Counter()
for doc in df2.job_posting:
    tokens = [w for w in TextBlob(doc.lower()).words if w not in stop]
    counter.update(ngrams(tokens, n))

# The 30 most frequent bigrams with their counts.
for phrase, count in counter.most_common(30):
    print('%20s %i' % (" ".join(phrase), count))

    computer science 296
communication skills 149
  sexual orientation 140
     national origin 137
   equal opportunity 124
      without regard 117
      veteran status 117
       related field 114
          race color 109
         regard race 108
            new york 108
qualified applicants 98
     gender identity 97
        ability work 93
        science team 92
   business problems 89
            r python 88
      skills ability 86
      color religion 84
 predictive modeling 82
            job type 82
        team members 81
         bachelor 's 81
software development 78
  orientation gender 77
            master ’ 75
 operations research 73
          large sets 72
            python r 71
          bachelor ’ 69


In [120]:
#### TF: frequency in this document
#### IDF: inverse frequency in the corpus

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

#vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2))
#doc_vectors = vectorizer.fit_transform(df.job_posting)
#classes = np.array(['pos']*int(len(df)/2) + ['neg']*int(len(df)/2))
#model = MultinomialNB().fit(doc_vectors, classes)

#job_post_vector = vectorizer.transform(df.job_posting)
#model.predict(job_post_vector)

In [137]:

# testing with job posts as docs
documents = df2.job_posting

no_features = 500

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.7, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [122]:
from sklearn.decomposition import NMF

In [178]:
# Factorise the tf-idf matrix into `no_topics` NMF topics.
# NOTE(review): newer scikit-learn releases replaced `alpha` with
# `alpha_W` / `alpha_H`; confirm against the installed version.
no_topics = 16
nmf = NMF(n_components=no_topics, init='nndsvd', alpha=.1, l1_ratio=.5, random_state=1)
nmf.fit(tfidf)


In [180]:
def display_topics(model, feature_names, no_top_words):
    """Print every topic of a fitted model with its highest-weighted terms.

    For each row of `model.components_`, prints "Topic <index>:" followed by a
    line with the `no_top_words` entries of `feature_names` whose weights in
    that row are largest, in descending order of weight, separated by spaces.
    """
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        # argsort is ascending: reverse it, then keep the leading entries.
        strongest_first = topic.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[i] for i in strongest_first]
        print (" ".join(top_terms))

# Show the 100 highest-weighted terms of each NMF topic.
no_top_words = 100
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
management support project development research systems software information solutions technical quality provide design related including internal analysis ability requirements projects clinical ensure knowledge develop intelligence integration processes process environment tools demonstrated working implementation needs teams security stakeholders lead performance technology standards years database minimum relevant required technologies meet practices application functional federal scientific level manage reporting based expertise provides external effectively open communication activities visualization strategies applications source problem leadership maintain qualifications specific review discipline new concepts programs end opportunities operational plans training critical position responsibilities candidates analytic organization change department delivery computer high preferred innovation excellent written best manager
Topic 1:
learning machine techniques models algor

In [None]:
# remove equal opportunity clause and topic 10 stuff. Try adding more stopwords. Rerun the above.
# try subsetting by job roles, job levels, region (?)

In [144]:
# try LDA -- looks pretty useless here
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation


In [142]:

# LDA is a probabilistic graphical model, so it can only use raw term counts
# (CountVectorizer) rather than tf-idf weights.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back to the old accessor on older installs.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

In [150]:
# Fit LDA on the raw term-count matrix.
# Note: this rebinds `no_topics`, which the NMF cell set to 16.
no_topics = 15
lda = LatentDirichletAllocation(
    n_components=no_topics,
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
)
lda = lda.fit(tf)

In [152]:

# Show the 20 strongest terms of every LDA topic.
no_top_words = 20
display_topics(
    model=lda,
    feature_names=tf_feature_names,
    no_top_words=no_top_words,
)

Topic 0:
project management clinical provide required support change job experience team development education activities standards plans plan design minimum manager review
Topic 1:
team business including benefits management techniques company employees decision mining modeling present support quality value plan model algorithms office variety
Topic 2:
experience machine learning management change ability science team spark products years technical build skills services research algorithms business support work
Topic 3:
experience team solutions analysis work python business knowledge job engineering skills status employment years design hands related technology using position
Topic 4:
business analytics team science work experience clients client management services skills new advanced solutions ability analytical firm strategic technology learning
Topic 5:
experience science modeling world development machine skills learning building analysis python techniques algorithms years time 

## Company description

In [15]:
# Raw string (repr) of the company description stored under index label 0.
# NOTE(review): the saved output below shows a Bluecore description, while
# row 0 of the frame displayed at the top of the notebook has no description --
# presumably the output is stale; re-run to confirm.
df["company_descr"][0]

"At Bluecore we are transforming the way eCommerce marketers use data and automation to communicate with customers. Bluecore’s customer experience platform is designed to simplify the process of ingesting terabytes of behavioral data and automatically taking action on precise insights, driving engagement and conversion rates that defy industry standards. We're one of New York City’s fastest growing SaaS start-ups, working with more than 190 customers representing more than 250 high-end apparel, electronics, automotive and other consumer brands.\r\n\r\nWe have only one rule: use good judgment. We hire incredible people so we don’t have to institute cumbersome rules or processes. We want you to be free to think, create and get things done on your own terms. We love positive energy in the office. In fact, we need it—happy people are more efficient and creative. So if 6 PM improv classes or midday gym sessions are what makes you happy, we will support you. When you join us, you’re stuck wi

In [16]:
# Print the same description so its embedded line breaks render as text.
print(df["company_descr"][0])

At Bluecore we are transforming the way eCommerce marketers use data and automation to communicate with customers. Bluecore’s customer experience platform is designed to simplify the process of ingesting terabytes of behavioral data and automatically taking action on precise insights, driving engagement and conversion rates that defy industry standards. We're one of New York City’s fastest growing SaaS start-ups, working with more than 190 customers representing more than 250 high-end apparel, electronics, automotive and other consumer brands.

We have only one rule: use good judgment. We hire incredible people so we don’t have to institute cumbersome rules or processes. We want you to be free to think, create and get things done on your own terms. We love positive energy in the office. In fact, we need it—happy people are more efficient and creative. So if 6 PM improv classes or midday gym sessions are what makes you happy, we will support you. When you join us, you’re stuck with us