In [1]:
# General Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Web Scraping Libraries
import urllib
import requests
from bs4 import BeautifulSoup

# Regex Library
import re

# Time-related Libraries
import time

# NLP Libraries
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud

import MVP_Bojado

import warnings
warnings.filterwarnings("ignore")

### URL Format of Indeed.com
1. Search chemist in TX<br>
https://www.indeed.com/jobs?q=chemist&l=TX
2. Search chemist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=chemist&l=San+Antonio%2C+TX
3. Search data scientist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX
4. Search data scientist intern in San Anotnio, TX<br>
https://www.indeed.com/jobs?q=data+scientist+intern&l=San+Antonio%2C+TX
5. Sort the data scientist jobs posting by date<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX&sort=date

**Takeaways**
1. q = job title
2. l = location

### URL Format of Monster.com
https://www.monster.com/jobs/search/?q=data-scientist&where=San-Antonio__2C-TX

### Generate the URL of a Job Search at Indeed.com

In [2]:
def first_page_url_indeed(job_title, location):
    '''
    This function returns a URL of the 1st page of a job search at Indeed.com 
    based on the job title and the location.
    '''
    # Create the base URL for a job serch at Indeed.com
    base_url = 'https://www.indeed.com/jobs?'
    # Create a dictionary to map the keys to the input parameters
    dic = {'q': job_title, 'l': location, 'sort': 'date'}
    # Convert the dictionary to a query string
    relative_url = urllib.parse.urlencode(dic)
    # Generate the full URL of the first page
    url = base_url + relative_url
    return url

In [42]:
# Test the function
url = first_page_url_indeed('data scientist', 'al')
url

'https://www.indeed.com/jobs?q=data+scientist&l=al&sort=date'

### Make the HTTP Request

In [49]:
def first_page_soup_indeed(job_title, location):
    '''
    This function returns a BeautifulSoup object to hold the content 
    of the first page of a request for job searching at Indeed.com
    '''
    # Generate the URL of the job search based on title and location
    url = first_page_url_indeed(job_title, location)
    # Make the HTTP request
    response = requests.get(url)
    # Print the status code of the request
    print("Status code of the request: ", response.status_code)
    # Sanity check to make sure the document type is HTML
    print("Document type: ", response.text[:15])
    # Take a break
    time.sleep(5)
    # Make a soup to hold the response content
    soup = BeautifulSoup(response.content, "html.parser")
    # Print out the title of the content
    print("Title of the response: ", soup.title.string)
    return soup

In [52]:
first_page_soup = first_page_soup_indeed("data scientist", 'al')
type(first_page_soup)

Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Alabama | Indeed.com


bs4.BeautifulSoup

In [12]:
# Find out the tag that contains the number of the jobs by seaching

num_jobs = first_page_soup.find('div', id='searchCountPages')
print("Data Type: ", type(num_jobs))
print("Name of the Tag: ", num_jobs.name)
print("Attributes of the Tag: ", num_jobs.attrs)
print("Text within the Tag: ")
num_jobs.text

Data Type:  <class 'bs4.element.Tag'>
Name of the Tag:  div
Attributes of the Tag:  {'id': 'searchCountPages'}
Text within the Tag: 


'\n                    Page 1 of 560 jobs'

In [13]:
# Find the number of the jobs in the text
match = re.findall(r'(\d+)', num_jobs.text)
match[1]

'560'

In [9]:
def num_jobs_indeed(first_page_soup):
    '''
    This function returns the total number of the jobs in the searching result.
    '''
    # Find out the section contains total number of jobs  
    div = first_page_soup.find('div', id='searchCountPages')
    # Extract the number
    num_jobs = re.findall(r'(\d+)', div.text)[1]
    return num_jobs

In [53]:
# Test the function num_jobs_indeed
num_jobs_indeed(first_page_soup)

'40'

In [12]:
def page_num_indeed(url):
    '''
    This function returns the page number of job searching results. 
    '''
    # Create a Soup object based on the url
    soup = page_soup_indeed(url)
    # Find out the section contains total number of jobs  
    div = soup.find('div', id='searchCountPages')
    # Extract the number
    page_num = re.findall(r'(\d+)', div.text)[0]
    return page_num

In [54]:
# Test the function num_jobs_indeed
page_num_indeed(first_page_soup)

'1'

In [14]:
# Define a function to extract all job cards in a Indeed page

def job_cards_indeed(soup):
    '''
    This function accepts the Soup object of a Indeed page 
    return an iterator containing the all the job cards in this page.
    '''
    # Find the appropriate tag that contains all of the job listings in this page
    tag = soup.find('td', id="resultsCol")
    # Extract all job cards
    job_cards = tag.find_all('div', class_='jobsearch-SerpJobCard')
    return job_cards

In [55]:
# Test the function job_cards_indeed
job_cards = job_cards_indeed(first_page_soup)

# Print the data type of job_cards
type(job_cards)

bs4.element.ResultSet

**Quick Note**: job_cards is an iterator

In [56]:
# How many jobs listed in the 1st page? 
len(job_cards)

15

In [17]:
def job_titles_indeed(job_cards):
    '''
    This function extract the job titles from a job_cards set. 
    '''
    # Create a list to hold the job titles
    titles = []
    # For Loop throught the job cards to extract the titles
    for job in job_cards:
        title = job.find('h2', class_='title')
        title = title.text.strip()
        titles.append(title)
    return titles

In [57]:
titles = job_titles_indeed(job_cards)
titles

['Data Scientist\nnew',
 'MANAGER, ACCOUNT DEVELOPMENT/DATA SCIENCE\nnew',
 'Data Scientist\nnew',
 'Data Analyst - Microsoft Stack (mid-senior)\nnew',
 'Statistical Analyst\nnew',
 'Data Scientist Intern\nnew',
 'Machine Learning/Artificial Intelligence Software Developer\nnew',
 'Software Engineer/Data Scientist\nnew',
 'Lead Financial Analyst - Artificial Intelligence Strategic G...\nnew',
 'BI Architect/Data Scientist',
 'Asst Research Professional - Research Data Scientist',
 '2021-18 Software Engineers for BMDS Data Analysis Suite',
 'Cyber Artificial Intelligence (AI) SME',
 '2021-02 Artificial Intelligence Designer',
 'Deep Learning Engineer']

In [19]:
# Define a function to pull the company names from a set of job cards

def company_names_indeed(job_cards):
    '''
    This function extracts the company names from a set of job cards.
    '''
    # Create a list to hold the company names
    names = []
    # For loop through the job cards to pull the company names
    for job in job_cards:
        name = job.find('span', class_='company')
        name = name.text.strip()
        names.append(name)
    return names

In [58]:
# Test the function: comany_names_indeed

company_names = company_names_indeed(job_cards)
company_names

['Quiq Inc',
 'B.A.S.S., LLC',
 'Vision',
 'Vaco',
 'The Personnel Board of Jefferson County',
 'LOCKHEED MARTIN CORPORATION',
 'IERUS Technologies, Inc.',
 'Torch Technologies, Inc.',
 'Deloitte',
 'Doozer Software',
 'The University of Alabama',
 '1st Edge',
 'Quantum Research International, Inc.',
 '1st Edge',
 'Numerator']

In [21]:
# Define a function to pull the post ages from a set of job cards

def post_ages_indeed(job_cards):
    '''
    This function pulls the post ages from a set of job cards.
    '''
    # Create a list to hold the post ages
    ages = []
    # For loop through the job cards to pull the post ages
    for job in job_cards:
        age = job.find('span', class_='date')
        age = age.text.strip()
        ages.append(age)
    return ages

In [59]:
# Test the function: post_ages_indeed
ages = post_ages_indeed(job_cards)
ages

['Today',
 'Today',
 'Today',
 'Today',
 '2 days ago',
 '4 days ago',
 '5 days ago',
 '6 days ago',
 '6 days ago',
 '11 days ago',
 '11 days ago',
 '12 days ago',
 '22 days ago',
 '21 days ago',
 '28 days ago']

In [23]:
# Define a function to pull the location from a set of job cards

def job_locations_indeed(job_cards):
    '''
    This function pulls the job locations from a set of job cards.
    '''
    # Create a list to hold the locations
    locations = []
    # For loop through the job cards to pull the locations
    for job in job_cards:
        location = job.find('div', class_='location accessible-contrast-color-location')
        if location == None:
            location = job.find('span', class_='location accessible-contrast-color-location')
        location = location.text.strip()
        locations.append(location)
    return locations

In [60]:
locations = job_locations_indeed(job_cards)
locations

['United States',
 'Birmingham, AL 35243',
 'Huntsville, AL',
 'Hartselle, AL',
 'Jefferson County, AL',
 'Huntsville, AL 35806',
 'Huntsville, AL 35805',
 'Huntsville, AL 35802',
 'Birmingham, AL 35203 (Central City area)',
 'Birmingham, AL 35216',
 'Tuscaloosa, AL',
 'Huntsville, AL',
 'Huntsville, AL 35806',
 'Huntsville, AL',
 'Alabama']

In [25]:
# Define a function to pull the company ratings from a set of job cards

def company_rating_indeed(job_cards):
    '''
    This function pulls the company rating from a set of job cards.
    If the rating is unavailable, it will be marked as 'missing'.
    '''
    # Create a list to hold the locations
    ratings = []
    # For loop through the job cards to pull the locations
    for job in job_cards:
        rating = job.find('span', class_='ratingsContent')
        if rating == None:
            ratings.append('missing')
            continue
        rating = rating.text.strip()
        ratings.append(rating)
    return ratings

In [61]:
ratings = company_rating_indeed(job_cards)
ratings

['missing',
 'missing',
 'missing',
 '3.7',
 'missing',
 '4.0',
 '4.7',
 'missing',
 '4.0',
 '4.8',
 '4.4',
 'missing',
 '4.0',
 'missing',
 '3.6']

In [27]:
def acuqire_indeed_job_description(url):
    '''
    This function accepts the URL of a job posting and pull its description.
    '''
    # Make the HTTP request
    request = requests.get(url)
    print("Status Code: ", request.status_code)
    # Take a break
    time.sleep(5)
    # Make a soup variable holding the response content
    soup = BeautifulSoup(request.content, "html.parser")
    if soup == None:
        description = 'error'
    else:
        # Print the page's title
        print(soup.title.string)
        # Find the section that contains job description
        description = soup.find('div', id="jobDescriptionText")
        if description == None:
            description = 'error'
        else:
            description = description.text
    return description

def job_links_and_contents_indeed(job_cards):
    '''
    This function pulls the job links and descriptions from a set of job cards.
    '''
    # Create a list to hold the links and descriptions
    links = []
    descriptions = []
    # For loop through the job cards to pull the links and descriptions
    for job in job_cards:
        link = job.find('a')['href']
        link = 'https://www.indeed.com' + link
        link = link.replace(';', '&')
        description = acuqire_indeed_job_description(link)
        links.append(link)
        descriptions.append(description)
    return links, descriptions

In [62]:
# Test the function: job_links_and_contents_indeed
links, descriptions = job_links_and_contents_indeed(job_cards)

Status Code:  200
Data Scientist - United States - Indeed.com
Status Code:  200
MANAGER, ACCOUNT DEVELOPMENT/DATA SCIENCE - Birmingham, AL 35243 - Indeed.com
Status Code:  200
Data Scientist - Huntsville, AL - Indeed.com
Status Code:  200
Vaco Careers and Employment | Indeed.com
Status Code:  200
Statistical Analyst - Jefferson County, AL - Indeed.com
Status Code:  200
Data Scientist Intern - Huntsville, AL 35806 - Indeed.com
Status Code:  200
Machine Learning/Artificial Intelligence Software Developer - Huntsville, AL 35805 - Indeed.com
Status Code:  200
Software Engineer/Data Scientist - Huntsville, AL 35802 - Indeed.com
Status Code:  200
Lead Financial Analyst - Artificial Intelligence Strategic Growth Offering (AI SGO) Finance & Investment - Birmingham, AL 35203 - Indeed.com
Status Code:  200
BI Architect/Data Scientist - Birmingham, AL 35216 - Indeed.com
Status Code:  200
Asst Research Professional - Research Data Scientist - Tuscaloosa, AL - Indeed.com
Status Code:  200
2021-18 S

In [63]:
links

['https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0B9JzWmK0GDxRzYzuZf9xSyXN8pQP8ihv6GH-rkAji3LyaR-hLXYB_NfHRnutLWTuqKBdImdeUix5pYB3Uv4mWnHLZlzd5py00o1lMFznkqSTkX2n8aY8mwnVhJALIJXce_3bj6Yi5-UDH-VhWVn3o6zeb2Pm8VmBf7XYjX1VtegcSkVtTpQAVHk_cSXYlbbb9ppduWw_Qf5XW46x30WeNoo3_22bLgKZLzhJEV_E3Ixv1DiqZPYzqp0IMYZ0Djwgv954h1Bqmz53jK2UmTAk6lmmvbIoTkfhAW5z9S56qUXHWlwvic9MvReVXyrEpYz_zaY4XbmDDXr3uIHFCGuQKnKzNsLz5vp9EkzV1o5kbpCj6qti0RZ7NseG1iXFMZ2GODxurbs811H733Ly8Iz-Y8CM0wqrwTs_8vPgE0g-b_Tx-QYca5xn1D&p=0&fvj=1&vjs=3',
 'https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0DVr7NBsl4cHSE4aBurKzaRDI_6xQvAm7MDK25NP7GKwrPFBHjnIhnnMkAKtOTcAVcwvdJYQ1iX_OUlRkCcJ9lYHn6yw2Xoq1V-R4Wdf5ZtG2pRL3zKYjPaUIJ7FfKiCmO7nZu9zD3akcI0SzfFz7oRw23qh6o47s6hI0zsqL0wfiKJ4EFcwcqvgr2zn3ygOEqvyjn_BqruYjouMp_OPBKXN1sJf_xD6jAidmRKsWVQCbimyT8pNek2EIcd1SHGimJOFbqnE5QnWqRtOwd1Em2mVe_WF7eRHtG2Ad9d5Btcaz-YQLMoJ5pB8XdgfSqvtHMJyTpI5ilFsZgBFjZL1WHSJK1XUvqlXQd2hYnYwqyUbescsJP9n5A1Qu-e6G6MmRKAdUa65Lq5aeDjyXXxYGCtPmDVqGApWN3X_1kyn3niS0FsFkisS

In [64]:
descriptions[1]

"MANAGER, ACCOUNT DEVELOPMENT/DATA SCIENCEBasic Duties & Responsibilities: Query and report on dataBuild and maintain analytical tools, metrics & models across the following channels: membership, sales, marketing and tournaments.Create compelling data visualizations and stories across internal LOB’sResearch internal data in order to build out analytics for both sales and marketing benefitsOwnership of data flow management between our internal systems and 3rd party systems such as PCD, Dovetail, Salesforce, US Sweeps and morePresent information to organizational stakeholders and executivesQualifications: Bachelor’s Degree in mathematics, engineering, computer science, data science, or other relevant fieldStrong written & verbal communication as well as visualization skillsProficiency in analytical/statistical methods and technologies (e.g. SQL, R, Python, Tableau, Microsoft BI, etc)Familiarity with concepts such as predictive modelling, cloud computing, machine learning, artificial inte

In [35]:
# Define a function to create a Soup object based on a job search url

def page_soup_indeed(url):
    '''
    This function returns a BeautifulSoup object to hold the content 
    of a page for a job searching results at Indeed.com
    '''
    # Make the HTTP request
    response = requests.get(url)
    # Print the status code of the request
    print("Status code of the request: ", response.status_code)
    # Sanity check to make sure the document type is HTML
    print("Document type: ", response.text[:15])
    # Take a break
    time.sleep(5)
    # Make a soup to hold the response content
    soup = BeautifulSoup(response.content, "html.parser")
    # Print out the title of the content
    print("Title of the response: ", soup.title.string)
    return soup

In [84]:
# Test the function: page_soup_indeed

url = 'https://www.indeed.com/jobs?q=data+scientist&l=al&sort=date'
soup = page_soup_indeed(url)
type(soup)

Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Alabama | Indeed.com


bs4.BeautifulSoup

In [86]:
# Find out the page number
int(page_num_indeed(soup))

1

In [38]:
# Pull the job cards from the soup
type(job_cards_indeed(soup))

bs4.element.ResultSet

In [99]:
# Define a function to pull job information from a job search URL

def acquire_page_indeed(url):
    '''
    This function accepts a job search URL and returns a pandas dataframe 
    containing job title, location, company, company rating, post age and description. 
    '''
    # Create a Soup object based on the url
    soup = page_soup_indeed(url)
    # Pull the job cards
    job_cards = job_cards_indeed(soup)
    # Pull the job titles
    titles = job_titles_indeed(job_cards)   
    # Pull the names of the companies
    companies = company_names_indeed(job_cards)
    # Pull the post ages
    ages = post_ages_indeed(job_cards)
    # Pull the job locations
    locations = job_locations_indeed(job_cards)
    # Pull the company ratings
    ratings = company_rating_indeed(job_cards)
    # Pull the hyperlinks and job description
    links, descriptions = job_links_and_contents_indeed(job_cards)    
    # Create a dataframe
    d = {'title': titles,
         'locations': locations,
         'company': companies, 
         'company_rating': ratings,
         'post_age': ages, 
         'job_link': links, 
         'job_description': descriptions}
    df = pd.DataFrame(d)
    return df

In [72]:
# Test function acquire_page_indeed
page_num, df = acquire_page_indeed(url)

Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Alabama | Indeed.com
Status Code:  200
Data Scientist - Huntsville, AL 35806 - Indeed.com
Status Code:  200
Data Scientist - United States - Indeed.com
Status Code:  200
Vaco Careers and Employment | Indeed.com
Status Code:  200
Data Scientist - Huntsville, AL - Indeed.com
Status Code:  200
MANAGER, ACCOUNT DEVELOPMENT/DATA SCIENCE - Birmingham, AL 35243 - Indeed.com
Status Code:  200
Statistical Analyst - Jefferson County, AL - Indeed.com
Status Code:  200
Data Scientist Intern - Huntsville, AL 35806 - Indeed.com
Status Code:  200
Machine Learning/Artificial Intelligence Software Developer - Huntsville, AL 35805 - Indeed.com
Status Code:  200
Software Engineer/Data Scientist - Huntsville, AL 35802 - Indeed.com
Status Code:  200
Lead Financial Analyst - Artificial Intelligence Strategic Growth Offering (AI SGO) Finance & Investment - Birmingham, AL 35203 - Indeed.

In [73]:
# Print the page number
page_num

'1'

In [79]:
df.columns

Index(['title', 'locations', 'company', 'company_rating', 'post_age',
       'job_link', 'job_description'],
      dtype='object')

In [76]:
df.job_description[0]

"MTA, Inc. is a Woman Owned Small Business with headquarters in Huntsville, AL. We are a diversified company recognized for excellence in Engineering, Integrated Logistics, and Quality Assurance. MTA provides services to U.S. defense agencies, NASA, and the U.S. Corp of Engineers.\nMTA, Inc. has an immediate opening for the position of Data Scientist.\nJob Description:\nTo provide support to the System Readiness Directorate's Reliability, Availability, Maintainability and System Assessment (RAM-SA) Division in support of developing and implementing machine learning algorithms and tools in SQL and Python. The objective of this project is to develop, implement and support data science work in the RAM division in support of tool development research and the Predictive Maintenance National Mission Initiative (PMx NMI).\nDuties and responsibilities:\nPerforming detailed and complex calculations necessary to assess advanced systems concepts\nTransferring data into a new format to make it mor

In [100]:
def jobs_indeed(job_title, location):
    '''
    This function accepts the job title and location and return 
    the job information pull from Indeed.com.
    '''
    # Generate the urls based on job title and location (state)
    url = first_page_url = first_page_url_indeed(job_title, location)
    # Set up an counter
    counter = 1
    # Create an empty dataframe to hold the job information
    df_jobs = pd.DataFrame(columns = ['title', 'locations', 'company', 'company_rating', 
                                      'post_age','job_link', 'job_description'])
    # Pull the page number
    page_num = int(page_num_indeed(url))
    # Set up an checker
    keep_going = (counter == page_num)   
    # For loop through the urls to pull job information
    while keep_going and page_num <=40:
        df = acquire_page_indeed(url)
        print("--------------------------------")
        print("Page: ", page_num)
        print("--------------------------------")
        df_jobs = df_jobs.append(df, ignore_index=True)
        time.sleep(180)
        dic = {'start': page_num*10}
        relative_url = urllib.parse.urlencode(dic)
        url = first_page_url + '&' + relative_url
        counter = counter + 1
        page_num = int(page_num_indeed(url))
        keep_going = (counter == page_num)
    # Print the total number of jobs
    print(f"Total number of {job_title} positions in {location}: ", df_jobs.shape[0])
    return df_jobs

In [101]:
# Test function jobs_in_state_indeed
df = jobs_indeed('data scientist', 'al')

Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Alabama | Indeed.com
Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Alabama | Indeed.com
Status Code:  200
Data Scientist - Huntsville, AL 35806 - Indeed.com
Status Code:  200
Data Scientist - United States - Indeed.com
Status Code:  200
Vaco Careers and Employment | Indeed.com
Status Code:  200
MANAGER, ACCOUNT DEVELOPMENT/DATA SCIENCE - Birmingham, AL 35243 - Indeed.com
Status Code:  200
Data Scientist - Huntsville, AL - Indeed.com
Status Code:  200
Statistical Analyst - Jefferson County, AL - Indeed.com
Status Code:  200
Data Scientist Intern - Huntsville, AL 35806 - Indeed.com
Status Code:  200
Machine Learning/Artificial Intelligence Software Developer - Huntsville, AL 35805 - Indeed.com
Status Code:  200
Software Engineer/Data Scientist - Huntsville, AL 35802 - Indeed.com
Status Cod

In [114]:
# Print a concise summary of the df
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            42 non-null     object
 1   locations        42 non-null     object
 2   company          42 non-null     object
 3   company_rating   42 non-null     object
 4   post_age         42 non-null     object
 5   job_link         42 non-null     object
 6   job_description  42 non-null     object
dtypes: object(7)
memory usage: 2.4+ KB


In [115]:
df.sample(5)

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description
17,Member of Technical Staff- Technical Support -...,"Huntsville, AL",Wind River,3.7,30+ days ago,https://www.indeed.com/rc/clk?jk=c768446d7ff0b...,Member of Technical Staff - Technical Support ...
16,Machine Learning Engineer,"Huntsville, AL 35806",CFD Research Corporation,4.2,28 days ago,https://www.indeed.com/rc/clk?jk=189d948471f4a...,This position will join an R&D team focused on...
3,"MANAGER, ACCOUNT DEVELOPMENT/DATA SCIENCE\nnew","Birmingham, AL 35243","B.A.S.S., LLC",missing,Today,"https://www.indeed.com/company/B.A.S.S.,-LLC/j...","MANAGER, ACCOUNT DEVELOPMENT/DATA SCIENCEBasic..."
30,AI/ML Software Engineer,"Huntsville, AL",COLSA,3.9,30+ days ago,https://www.indeed.com/rc/clk?jk=fad792fe1641a...,"General Summary\n\n\nDesigns, develops, troubl..."
41,DATA SCIENCE CONSULTANT BIRMINGHAM,"Birmingham, AL 35203 (Central City area)",managementsolutions,3.8,30+ days ago,https://www.indeed.com/rc/clk?jk=aa00dcc168e47...,United States\n\nDATA SCIENCE CONSULTANT BIRMI...


In [112]:
df.job_link[41]

'https://www.indeed.com/rc/clk?jk=aa00dcc168e475b5&fccid=f3eca80b6759548b&vjs=3'

In [113]:
df.job_description[41]

'United States\n\nDATA SCIENCE CONSULTANT BIRMINGHAM\n\nBirmingham / Internship / Number of vacancies: 2\n\n\n\n\nYou will be working in key projects for leading organizations in data mining & knowledge Discovery, predictive modeling, trend modeling, Simulation models (Monte Carlo), Review of credit rating and scoring models and quant support to the business and R&D projects.\n\nRequirements\n\nRecent graduates or final year students from disciplines relating to Mathematics, Physics, Statistics, Econometrics or other Quantitative fields.\nPostgraduate studies and/or specialised courses are an asset, especially in Data Science, Quantitative Finance or similar.\nShould desirably have knowledge of modeling techniques (logit, GLM, time series, decision trees, random forests, clustering), statistical programming languages (SAS, R, Python, Matlab) and big data tools and platforms (Hadoop, Hive, etc.).\nSolid academic record.\nStrong computer skills.\nKnowledge of other languages is desirable

In [11]:
# Load data scientist job posts in WA
df_ds_wa = pd.read_csv("data_scientist_wa_indeed_012021.csv", index_col=0)
df_ds_wa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 435 entries, 0 to 434
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            435 non-null    object
 1   locations        435 non-null    object
 2   company          435 non-null    object
 3   company_rating   435 non-null    object
 4   post_age         435 non-null    object
 5   job_link         435 non-null    object
 6   job_description  435 non-null    object
dtypes: object(7)
memory usage: 27.2+ KB


In [17]:
df_ds_wa.head(2)

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description
0,Research Data Scientist\nnew,"Seattle, WA",Facebook,4.2,Just posted,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,The Infrastructure Quantitative Engineering gr...
1,"Data Scientist I, Digital and Innovation\nnew","Opportunity, WA",INFORMATION TECHNOLOGY SERVICE,4.3,Just posted,https://www.indeed.com/rc/clk?jk=dc3e7796faa74...,Overview:\n\nCommonSpirit Health was formed by...


In [13]:
df_ds_wa.job_description[0]

"The Infrastructure Quantitative Engineering group is responsible for the strategic analysis to support and enable the continued growth critical to Facebook’s infrastructure organization. We are applied quantitative and computational experts using math, statistics and machine learning to measure & optimize cost, performance, reliability and efficiency of Facebook’s infrastructure & global telecom systems to deliver the best experience to our global audience. The ideal candidate will be passionate about Facebook, have strong analytical and modeling aptitude and has experience using data to drive cost effective decision making.\nBuild pragmatic, scalable, and statistically rigorous solutions to large-scale web, mobile and data infrastructure problems by leveraging or developing state-of-the-art statistical and machine learning methodologies on top of Facebook's unparalleled data infrastructure\nWork cross-functionally to define problem statements, collect data, build analytical models an

In [18]:
df_ds_wa_clean = MVP_Bojado.prep_job_description_data(df_ds_wa, 'job_description')
df_ds_wa_clean.head(2)

Unnamed: 0,title,job_description,tokenized,lemmatized,clean
0,Research Data Scientist\nnew,The Infrastructure Quantitative Engineering gr...,the infrastructure quantitative engineering gr...,the infrastructure quantitative engineering gr...,infrastructure quantitative engineering group ...
1,"Data Scientist I, Digital and Innovation\nnew",Overview:\n\nCommonSpirit Health was formed by...,overview\n\ncommonspirit health was formed by ...,overview commonspirit health wa formed by the ...,overview commonspirit health formed alignment ...


In [19]:
df_ds_wa.head(2)

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description,clean,tokenized,stemmed,lemmatized
0,Research Data Scientist\nnew,"Seattle, WA",Facebook,4.2,Just posted,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,The Infrastructure Quantitative Engineering gr...,infrastructure quantitative engineering group ...,the infrastructure quantitative engineering gr...,the infrastructur quantit engin group is respo...,the infrastructure quantitative engineering gr...
1,"Data Scientist I, Digital and Innovation\nnew","Opportunity, WA",INFORMATION TECHNOLOGY SERVICE,4.3,Just posted,https://www.indeed.com/rc/clk?jk=dc3e7796faa74...,Overview:\n\nCommonSpirit Health was formed by...,overview commonspirit health formed alignment ...,overview\n\ncommonspirit health was formed by ...,overview commonspirit health wa form by the al...,overview commonspirit health wa formed by the ...


**Takeaways**:
1. 

In [20]:
# Create 'words' variable
words = [re.sub(r'([^a-z0-9\s]|\s.\s)', '', doc).split() for doc in df_ds_wa.clean]

# Add 'words' column to dataframe
# Column will contain lists of separated words in each repo
df_ds_wa = pd.concat([df_ds_wa, pd.DataFrame({'words': words})], axis=1)

df_ds_wa.head(2)

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description,clean,tokenized,stemmed,lemmatized,words
0,Research Data Scientist\nnew,"Seattle, WA",Facebook,4.2,Just posted,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,The Infrastructure Quantitative Engineering gr...,infrastructure quantitative engineering group ...,the infrastructure quantitative engineering gr...,the infrastructur quantit engin group is respo...,the infrastructure quantitative engineering gr...,"[infrastructure, quantitative, engineering, gr..."
1,"Data Scientist I, Digital and Innovation\nnew","Opportunity, WA",INFORMATION TECHNOLOGY SERVICE,4.3,Just posted,https://www.indeed.com/rc/clk?jk=dc3e7796faa74...,Overview:\n\nCommonSpirit Health was formed by...,overview commonspirit health formed alignment ...,overview\n\ncommonspirit health was formed by ...,overview commonspirit health wa form by the al...,overview commonspirit health wa formed by the ...,"[overview, commonspirit, health, formed, align..."


In [21]:
df_ds_wa.words[0][:5]

['infrastructure', 'quantitative', 'engineering', 'group', 'responsible']

In [22]:
df_ds_wa.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 435 entries, 0 to 434
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            435 non-null    object
 1   locations        435 non-null    object
 2   company          435 non-null    object
 3   company_rating   435 non-null    object
 4   post_age         435 non-null    object
 5   job_link         435 non-null    object
 6   job_description  435 non-null    object
 7   clean            435 non-null    object
 8   tokenized        435 non-null    object
 9   stemmed          435 non-null    object
 10  lemmatized       435 non-null    object
 11  words            435 non-null    object
dtypes: object(12)
memory usage: 60.3+ KB


In [24]:
df_ds_wa.company.value_counts()

Microsoft                                           55
Apple                                               41
Amazon.com Services LLC                             28
Facebook                                            18
Deloitte                                            15
                                                    ..
Neal Analytics                                       1
Kelly                                                1
Walt Disney Direct-to-Consumer and International     1
Shelf Engine                                         1
Blue Nile                                            1
Name: company, Length: 155, dtype: int64

In [25]:
# Create words variables for All, Microsoft, Apple, Amazon, and Facebook

all_words = ' '.join(df_ds_wa.clean)
microsoft_words = ' '.join(df_ds_wa[df_ds_wa.company == "Microsoft"].clean)
apple_words = ' '.join(df_ds_wa[df_ds_wa.company == "Apple"].clean)
amazon_words = ' '.join(df_ds_wa[df_ds_wa.company == "Amazon.com Services LLC"].clean)
facebook_words = ' '.join(df_ds_wa[df_ds_wa.company == "Facebook"].clean)

In [26]:
# Word Frequency

all_freq = pd.Series(all_words.split()).value_counts()
microsoft_freq = pd.Series(microsoft_words.split()).value_counts()
apple_freq = pd.Series(apple_words.split()).value_counts()
amazon_freq = pd.Series(amazon_words.split()).value_counts()
facebook_freq = pd.Series(facebook_words.split()).value_counts()

# Word Count for All, Microsoft, Apple, Amazon, and Facebook

word_counts = (pd.concat([all_freq, microsoft_freq, apple_freq, 
                          amazon_freq, facebook_freq], axis=1, sort=True)
               .set_axis(['All', 'Microsoft', 'Apple', 
                          'Amazon', 'Facebook'], axis=1, inplace=False)
               .fillna(0)
               .apply(lambda s: s.astype(int))
              )

word_counts.sort_values(by='All', ascending=False).head(10)

Unnamed: 0,All,Microsoft,Apple,Amazon,Facebook
data,3543,447,304,155,46
experience,2586,296,257,145,103
team,1939,276,201,134,87
learning,1447,188,99,72,53
work,1389,212,104,60,37
product,1212,117,132,68,90
business,1177,204,7,128,29
machine,1066,134,79,60,47
science,1026,124,50,89,20
customer,815,136,34,74,1


In [27]:
mask = (word_counts.index == 'python')
word_counts[mask].sort_values(by='All', ascending=False).head(10)

Unnamed: 0,All,Microsoft,Apple,Amazon,Facebook
python,310,34,24,21,13


In [28]:
mask = (word_counts.index == 'aws')
word_counts[mask].sort_values(by='All', ascending=False).head(10)

Unnamed: 0,All,Microsoft,Apple,Amazon,Facebook
aws,151,4,6,17,0


In [29]:
mask = (word_counts.index == 'sql')
word_counts[mask].sort_values(by='All', ascending=False).head(10)

Unnamed: 0,All,Microsoft,Apple,Amazon,Facebook
sql,196,27,4,11,7


In [30]:
# Added 'Bigram' column to dataframe
df_ds_wa['bigrams'] = [list(nltk.ngrams(wordlist, 2)) for wordlist in df_ds_wa.words]
df_ds_wa.head()

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description,clean,tokenized,stemmed,lemmatized,words,bigrams
0,Research Data Scientist\nnew,"Seattle, WA",Facebook,4.2,Just posted,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,The Infrastructure Quantitative Engineering gr...,infrastructure quantitative engineering group ...,the infrastructure quantitative engineering gr...,the infrastructur quantit engin group is respo...,the infrastructure quantitative engineering gr...,"[infrastructure, quantitative, engineering, gr...","[(infrastructure, quantitative), (quantitative..."
1,"Data Scientist I, Digital and Innovation\nnew","Opportunity, WA",INFORMATION TECHNOLOGY SERVICE,4.3,Just posted,https://www.indeed.com/rc/clk?jk=dc3e7796faa74...,Overview:\n\nCommonSpirit Health was formed by...,overview commonspirit health formed alignment ...,overview\n\ncommonspirit health was formed by ...,overview commonspirit health wa form by the al...,overview commonspirit health wa formed by the ...,"[overview, commonspirit, health, formed, align...","[(overview, commonspirit), (commonspirit, heal..."
2,Principal Data Scientist - CTJ\nnew,"Redmond, WA 98052 (Overlake area)",Microsoft,4.2,Just posted,https://www.indeed.com/rc/clk?jk=f5a04b6142156...,Do you want to contribute to Azure Growth and ...,want contribute azure growth adoption market w...,do you want to contribute to azure growth and ...,do you want to contribut to azur growth and ad...,do you want to contribute to azure growth and ...,"[want, contribute, azure, growth, adoption, ma...","[(want, contribute), (contribute, azure), (azu..."
3,Data & Applied Scientist II\nnew,"Bellevue, WA",Microsoft,4.2,Just posted,https://www.indeed.com/rc/clk?jk=7b1ffafe00111...,What if your job description were: make tomorr...,job description make tomorrow better essence r...,what if your job description were make tomorro...,what if your job descript were make tomorrow b...,what if your job description were make tomorro...,"[job, description, make, tomorrow, better, ess...","[(job, description), (description, make), (mak..."
4,Data Science Intern\nnew,"Seattle, WA",REI,4.1,Just posted,https://www.indeed.com/rc/clk?jk=10dd3089b9957...,REI is looking for a Data Science Intern to co...,rei looking data science intern contribute rea...,rei is looking for a data science intern to co...,rei is look for a data scienc intern to contri...,rei is looking for a data science intern to co...,"[rei, looking, data, science, intern, contribu...","[(rei, looking), (looking, data), (data, scien..."


In [31]:
# Bigrams Frequency

all_bigrams_freq = pd.Series(list(nltk.ngrams(all_words.split(), 2))).value_counts()
microsoft_bigrams_freq = pd.Series(list(nltk.ngrams(microsoft_words.split(), 2))).value_counts()
apple_bigrams_freq = pd.Series(list(nltk.ngrams(apple_words.split(), 2))).value_counts()
amazon_bigrams_freq = pd.Series(list(nltk.ngrams(amazon_words.split(), 2))).value_counts()
facebook_bigrams_freq = pd.Series(list(nltk.ngrams(facebook_words.split(), 2))).value_counts()

# Word Count for All, Microsoft, Apple, Amazon, and Facebook

bigrams_counts = (pd.concat([all_bigrams_freq, microsoft_bigrams_freq, apple_bigrams_freq, 
                          amazon_bigrams_freq, facebook_bigrams_freq], axis=1, sort=True)
               .set_axis(['All', 'Microsoft', 'Apple', 
                          'Amazon', 'Facebook'], axis=1, inplace=False)
               .fillna(0)
               .apply(lambda s: s.astype(int))
              )

bigrams_counts.sort_values(by='All', ascending=False).head(10)

Unnamed: 0,Unnamed: 1,All,Microsoft,Apple,Amazon,Facebook
machine,learning,1011,125,74,56,40
data,science,494,63,11,7,2
data,scientist,357,68,20,19,6
computer,science,341,44,36,29,16
year,experience,298,42,23,23,20
sexual,orientation,266,55,8,20,18
equal,opportunity,262,55,20,15,18
national,origin,251,55,8,11,18
veteran,status,227,55,8,13,18
gender,identity,226,55,7,20,18


In [32]:
bigrams_counts.sort_values(by='All', ascending=False).head(20)

Unnamed: 0,Unnamed: 1,All,Microsoft,Apple,Amazon,Facebook
machine,learning,1011,125,74,56,40
data,science,494,63,11,7,2
data,scientist,357,68,20,19,6
computer,science,341,44,36,29,16
year,experience,298,42,23,23,20
sexual,orientation,266,55,8,20,18
equal,opportunity,262,55,20,15,18
national,origin,251,55,8,11,18
veteran,status,227,55,8,13,18
gender,identity,226,55,7,20,18


In [33]:
# Added 'Bigram' column to dataframe
df_ds_wa['trigrams'] = [list(nltk.ngrams(wordlist, 3)) for wordlist in df_ds_wa.words]
df_ds_wa.head()

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description,clean,tokenized,stemmed,lemmatized,words,bigrams,trigrams
0,Research Data Scientist\nnew,"Seattle, WA",Facebook,4.2,Just posted,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,The Infrastructure Quantitative Engineering gr...,infrastructure quantitative engineering group ...,the infrastructure quantitative engineering gr...,the infrastructur quantit engin group is respo...,the infrastructure quantitative engineering gr...,"[infrastructure, quantitative, engineering, gr...","[(infrastructure, quantitative), (quantitative...","[(infrastructure, quantitative, engineering), ..."
1,"Data Scientist I, Digital and Innovation\nnew","Opportunity, WA",INFORMATION TECHNOLOGY SERVICE,4.3,Just posted,https://www.indeed.com/rc/clk?jk=dc3e7796faa74...,Overview:\n\nCommonSpirit Health was formed by...,overview commonspirit health formed alignment ...,overview\n\ncommonspirit health was formed by ...,overview commonspirit health wa form by the al...,overview commonspirit health wa formed by the ...,"[overview, commonspirit, health, formed, align...","[(overview, commonspirit), (commonspirit, heal...","[(overview, commonspirit, health), (commonspir..."
2,Principal Data Scientist - CTJ\nnew,"Redmond, WA 98052 (Overlake area)",Microsoft,4.2,Just posted,https://www.indeed.com/rc/clk?jk=f5a04b6142156...,Do you want to contribute to Azure Growth and ...,want contribute azure growth adoption market w...,do you want to contribute to azure growth and ...,do you want to contribut to azur growth and ad...,do you want to contribute to azure growth and ...,"[want, contribute, azure, growth, adoption, ma...","[(want, contribute), (contribute, azure), (azu...","[(want, contribute, azure), (contribute, azure..."
3,Data & Applied Scientist II\nnew,"Bellevue, WA",Microsoft,4.2,Just posted,https://www.indeed.com/rc/clk?jk=7b1ffafe00111...,What if your job description were: make tomorr...,job description make tomorrow better essence r...,what if your job description were make tomorro...,what if your job descript were make tomorrow b...,what if your job description were make tomorro...,"[job, description, make, tomorrow, better, ess...","[(job, description), (description, make), (mak...","[(job, description, make), (description, make,..."
4,Data Science Intern\nnew,"Seattle, WA",REI,4.1,Just posted,https://www.indeed.com/rc/clk?jk=10dd3089b9957...,REI is looking for a Data Science Intern to co...,rei looking data science intern contribute rea...,rei is looking for a data science intern to co...,rei is look for a data scienc intern to contri...,rei is looking for a data science intern to co...,"[rei, looking, data, science, intern, contribu...","[(rei, looking), (looking, data), (data, scien...","[(rei, looking, data), (looking, data, science..."


In [34]:
# Trigrams Frequency

all_trigrams_freq = pd.Series(list(nltk.ngrams(all_words.split(), 3))).value_counts()
microsoft_trigrams_freq = pd.Series(list(nltk.ngrams(microsoft_words.split(), 3))).value_counts()
apple_trigrams_freq = pd.Series(list(nltk.ngrams(apple_words.split(), 3))).value_counts()
amazon_trigrams_freq = pd.Series(list(nltk.ngrams(amazon_words.split(), 3))).value_counts()
facebook_trigrams_freq = pd.Series(list(nltk.ngrams(facebook_words.split(), 3))).value_counts()

# Word Count for All, Microsoft, Apple, Amazon, and Facebook

trigrams_counts = (pd.concat([all_trigrams_freq, microsoft_trigrams_freq, apple_trigrams_freq, 
                          amazon_trigrams_freq, facebook_trigrams_freq], axis=1, sort=True)
               .set_axis(['All', 'Microsoft', 'Apple', 
                          'Amazon', 'Facebook'], axis=1, inplace=False)
               .fillna(0)
               .apply(lambda s: s.astype(int))
              )

trigrams_counts.sort_values(by='All', ascending=False).head(10)

Unnamed: 0,Unnamed: 1,Unnamed: 2,All,Microsoft,Apple,Amazon,Facebook
equal,opportunity,employer,206,55,13,15,0
protected,veteran,status,129,55,0,13,18
employment,without,regard,124,55,0,0,0
receive,consideration,employment,121,55,0,0,0
consideration,employment,without,118,55,0,0,0
qualified,applicant,receive,115,55,0,0,0
applicant,receive,consideration,115,55,0,0,0
sexual,orientation,gender,108,0,7,0,18
machine,learning,model,99,13,2,4,2
race,color,religion,98,0,7,0,0


In [35]:
# Load data scientist job posts in TX
df_ds_tx = pd.read_csv("data_scientist_tx_indeed_012121.csv", index_col=0)
df_ds_tx.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 482 entries, 0 to 481
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            482 non-null    object
 1   locations        482 non-null    object
 2   company          482 non-null    object
 3   company_rating   482 non-null    object
 4   post_age         482 non-null    object
 5   job_link         482 non-null    object
 6   job_description  482 non-null    object
dtypes: object(7)
memory usage: 30.1+ KB


In [36]:
df_ds_tx.head(2)

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description
0,Data Scientist Associate Sr\nnew,"Plano, TX","JPMorgan Chase Bank, N.A.",3.9,Just posted,https://www.indeed.com/rc/clk?jk=fdb25f52f6062...,J.P. Morgan's Corporate & Investment Bank (CIB...
1,Senior Data Scientist\nnew,"Austin, TX 78701 (Downtown area)",Sam's Club,3.5,Just posted,https://www.indeed.com/rc/clk?jk=3d235cc44e5cc...,What you'll do...\nPosition: Senior Data Scien...


In [37]:
df_ds_tx.job_description[0]

'J.P. Morgan\'s Corporate & Investment Bank (CIB) is a global leader in Banking. The world\'s corporations, governments and institutions entrust us with their business in more than 100 countries. The Corporate & Investment Bank supports our clients around the world providing strategic advice, raising capital and managing risk.\nJ.P. Morgan Wholesale Payments is one of the world\'s largest providers of treasury management and merchant services. Wholesale Payments sits at the intersection of finance and technology and is one of the largest players (processing $6 trillion of payments a day) in an industry undergoing a major transformation. Merchant Services is the global payment acceptance and merchant acquiring business of JPMorgan Chase & Co. Merchant Services is a leading provider of payment, fraud management and data security solutions, capable of authorizing transactions in more than 130 currencies. The company\'s proprietary platforms provide integrated commerce solutions for all ma

In [39]:
# Rank the companies based on the number of posts
df_ds_tx.company.value_counts()

Apple                            19
Deloitte                         17
USAA                              8
University of Texas at Austin     7
Advanced Micro Devices, Inc.      6
                                 ..
Baylor College of Medicine        1
Michaels                          1
Lennox International              1
InVisionApp                       1
Omnitracs LLC                     1
Name: company, Length: 280, dtype: int64

In [40]:
# View the job posts by Apple

mask = df_ds_tx.company == 'Apple'
df_ds_tx[mask].head(19)

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description
65,Data Scientist,"Austin, TX",Apple,4.2,12 days ago,https://www.indeed.com/rc/clk?jk=d031f5622fdc5...,"Summary\nPosted: Jan 8, 2021\nWeekly Hours: 40..."
98,Data Science Engineer - Strategic Data Solutions,"Austin, TX",Apple,4.2,13 days ago,https://www.indeed.com/rc/clk?jk=01ce7718bc2e9...,"Summary\nPosted: Jan 7, 2021\nWeekly Hours: 40..."
264,Data Scientist Intern - Strategic Data Solutions,"Austin, TX",Apple,4.2,30+ days ago,https://www.indeed.com/rc/clk?jk=16aa9cc6841e0...,"Summary\nPosted: Nov 6, 2020\nWeekly Hours: 40..."
295,Strategic Data Solutions - Data Scientist Inte...,"Austin, TX",Apple,4.2,30+ days ago,https://www.indeed.com/rc/clk?jk=4c6902d7462a8...,"Summary\nPosted: Dec 22, 2020\nWeekly Hours: 4..."
300,Machine Learning Program Manager,"Austin, TX",Apple,4.2,30+ days ago,https://www.indeed.com/rc/clk?jk=1c625476fb0e9...,"Summary\nPosted: Nov 9, 2020\nWeekly Hours: 40..."
311,"Data Scientist, Apple Pay","Austin, TX",Apple,4.2,30+ days ago,https://www.indeed.com/rc/clk?jk=cba535b6c0165...,"Summary\nPosted: Oct 30, 2020\nRole Number:200..."
313,"Senior Data Scientist, Ad Platforms","Austin, TX",Apple,4.2,30+ days ago,https://www.indeed.com/rc/clk?jk=b3449f0adfdba...,"Summary\nPosted: Oct 31, 2020\nWeekly Hours: 4..."
314,"AI/ML - Annotation Analyst, Spanish Language","Austin, TX",Apple,4.2,30+ days ago,https://www.indeed.com/rc/clk?jk=2d3edeb413750...,"Summary\nPosted: Nov 11, 2020\nWeekly Hours: 4..."
320,Strategic Data Solutions - Data Scientist Inte...,"Austin, TX",Apple,4.2,30+ days ago,https://www.indeed.com/rc/clk?jk=4c6902d7462a8...,"Summary\nPosted: Dec 22, 2020\nWeekly Hours: 4..."
328,Machine Learning Program Manager,"Austin, TX",Apple,4.2,30+ days ago,https://www.indeed.com/rc/clk?jk=1c625476fb0e9...,"Summary\nPosted: Nov 9, 2020\nWeekly Hours: 40..."


In [41]:
# Clean the text in the `job_description` column
df_ds_tx_clean = MVP_Bojado.prep_job_description_data(df_ds_tx, 'job_description')
df_ds_tx_clean.head(2)

Unnamed: 0,title,job_description,tokenized,lemmatized,clean
0,Data Scientist Associate Sr\nnew,J.P. Morgan's Corporate & Investment Bank (CIB...,jp morgans corporate investment bank cib is a ...,jp morgan corporate investment bank cib is a g...,jp morgan corporate investment bank cib global...
1,Senior Data Scientist\nnew,What you'll do...\nPosition: Senior Data Scien...,what youll do\nposition senior data scientist\...,what youll do position senior data scientist j...,youll position senior data scientist job locat...


In [42]:
# Create 'words' variable
words = [re.sub(r'([^a-z0-9\s]|\s.\s)', '', doc).split() for doc in df_ds_tx.clean]

# Add 'words' column to dataframe
# Column will contain lists of separated words in each repo
df_ds_tx = pd.concat([df_ds_tx, pd.DataFrame({'words': words})], axis=1)

df_ds_tx.head(2)

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description,clean,tokenized,stemmed,lemmatized,words
0,Data Scientist Associate Sr\nnew,"Plano, TX","JPMorgan Chase Bank, N.A.",3.9,Just posted,https://www.indeed.com/rc/clk?jk=fdb25f52f6062...,J.P. Morgan's Corporate & Investment Bank (CIB...,jp morgan corporate investment bank cib global...,jp morgans corporate investment bank cib is a ...,jp morgan corpor invest bank cib is a global l...,jp morgan corporate investment bank cib is a g...,"[jp, morgan, corporate, investment, bank, cib,..."
1,Senior Data Scientist\nnew,"Austin, TX 78701 (Downtown area)",Sam's Club,3.5,Just posted,https://www.indeed.com/rc/clk?jk=3d235cc44e5cc...,What you'll do...\nPosition: Senior Data Scien...,youll position senior data scientist job locat...,what youll do\nposition senior data scientist\...,what youll do posit senior data scientist job ...,what youll do position senior data scientist j...,"[youll, position, senior, data, scientist, job..."


In [46]:
# Create words variables for All, Microsoft, Apple, Amazon, and Facebook

all_words = ' '.join(df_ds_tx.clean)
apple_words = ' '.join(df_ds_tx[df_ds_tx.company == "Apple"].clean)
deloitte_words = ' '.join(df_ds_tx[df_ds_tx.company == "Deloitte"].clean)
usaa_words = ' '.join(df_ds_tx[df_ds_tx.company == "USAA"].clean)

In [47]:
# Word Frequency

all_freq = pd.Series(all_words.split()).value_counts()
apple_freq = pd.Series(apple_words.split()).value_counts()
deloitte_freq = pd.Series(deloitte_words.split()).value_counts()
usaa_freq = pd.Series(usaa_words.split()).value_counts()

# Word Count for All, Microsoft, Apple, Amazon, and Facebook

word_counts = (pd.concat([all_freq, apple_freq, 
                          deloitte_freq, usaa_freq], axis=1, sort=True)
               .set_axis(['All', 'Apple', 
                          'Deloitte', 'USAA'], axis=1, inplace=False)
               .fillna(0)
               .apply(lambda s: s.astype(int))
              )

word_counts.sort_values(by='All', ascending=False).head(10)

Unnamed: 0,All,Apple,Deloitte,USAA
data,4351,158,143,69
experience,2798,115,204,82
business,1697,84,107,96
team,1627,99,81,17
work,1344,41,83,60
learning,1148,39,34,21
science,1082,54,31,28
skill,984,45,59,10
analytics,933,21,106,23
year,909,11,47,50


In [49]:
mask = (word_counts.index == 'python')
word_counts[mask].sort_values(by='All', ascending=False).head(10)

Unnamed: 0,All,Apple,Deloitte,USAA
python,394,15,15,10


In [50]:
mask = (word_counts.index == 'aws')
word_counts[mask].sort_values(by='All', ascending=False).head(10)

Unnamed: 0,All,Apple,Deloitte,USAA
aws,196,0,17,2


In [51]:
mask = (word_counts.index == 'sql')
word_counts[mask].sort_values(by='All', ascending=False).head(10)

Unnamed: 0,All,Apple,Deloitte,USAA
sql,292,13,18,7


In [52]:
# Added 'Bigram' column to dataframe
df_ds_tx['bigrams'] = [list(nltk.ngrams(wordlist, 2)) for wordlist in df_ds_tx.words]
df_ds_tx.head()

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description,clean,tokenized,stemmed,lemmatized,words,bigrams
0,Data Scientist Associate Sr\nnew,"Plano, TX","JPMorgan Chase Bank, N.A.",3.9,Just posted,https://www.indeed.com/rc/clk?jk=fdb25f52f6062...,J.P. Morgan's Corporate & Investment Bank (CIB...,jp morgan corporate investment bank cib global...,jp morgans corporate investment bank cib is a ...,jp morgan corpor invest bank cib is a global l...,jp morgan corporate investment bank cib is a g...,"[jp, morgan, corporate, investment, bank, cib,...","[(jp, morgan), (morgan, corporate), (corporate..."
1,Senior Data Scientist\nnew,"Austin, TX 78701 (Downtown area)",Sam's Club,3.5,Just posted,https://www.indeed.com/rc/clk?jk=3d235cc44e5cc...,What you'll do...\nPosition: Senior Data Scien...,youll position senior data scientist job locat...,what youll do\nposition senior data scientist\...,what youll do posit senior data scientist job ...,what youll do position senior data scientist j...,"[youll, position, senior, data, scientist, job...","[(youll, position), (position, senior), (senio..."
2,Director of Data Science\nnew,"Austin, TX 78701 (Downtown area)",CyberCoders,3.3,Just posted,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Director of Data Science\nHumanity is sufferin...,director data science humanity suffering healt...,director of data science\nhumanity is sufferin...,director of data scienc human is suffer from a...,director of data science humanity is suffering...,"[director, data, science, humanity, suffering,...","[(director, data), (data, science), (science, ..."
3,"AI/Data Scientist Senior Consultant - Austin, ...","Austin, TX",Deloitte,4.0,Today,https://www.indeed.com/rc/clk?jk=ff1dd9fe875d7...,"AI/Data Scientist Senior Consultant – Austin, ...",aidata scientist senior consultant austin tx w...,aidata scientist senior consultant austin tx\n...,aidata scientist senior consult austin tx do y...,aidata scientist senior consultant austin tx d...,"[aidata, scientist, senior, consultant, austin...","[(aidata, scientist), (scientist, senior), (se..."
4,Director of Data Science & Analytics\nnew,"The Woodlands, TX",Apergy,3.0,Today,https://www.indeed.com/rc/clk?jk=0485ec81823db...,ChampionX aspires to be the trusted advisor to...,championx aspires trusted advisor customer pro...,championx aspires to be the trusted advisor to...,championx aspir to be the trust advisor to our...,championx aspires to be the trusted advisor to...,"[championx, aspires, trusted, advisor, custome...","[(championx, aspires), (aspires, trusted), (tr..."


In [53]:
# Bigrams Frequency

all_bigrams_freq = pd.Series(list(nltk.ngrams(all_words.split(), 2))).value_counts()
apple_bigrams_freq = pd.Series(list(nltk.ngrams(apple_words.split(), 2))).value_counts()
deloitte_bigrams_freq = pd.Series(list(nltk.ngrams(deloitte_words.split(), 2))).value_counts()
usaa_bigrams_freq = pd.Series(list(nltk.ngrams(usaa_words.split(), 2))).value_counts()

# Word Count for All, Apple, Deloitte, and USAA

bigrams_counts = (pd.concat([all_bigrams_freq, apple_bigrams_freq, 
                             deloitte_bigrams_freq, usaa_bigrams_freq], axis=1, sort=True)
               .set_axis(['All', 'Apple', 
                          'Deloitte', 'USAA'], axis=1, inplace=False)
               .fillna(0)
               .apply(lambda s: s.astype(int))
              )

bigrams_counts.sort_values(by='All', ascending=False).head(10)

Unnamed: 0,Unnamed: 1,All,Apple,Deloitte,USAA
machine,learning,806,30,11,14
data,science,602,43,11,6
year,experience,392,6,25,18
data,scientist,362,25,5,2
computer,science,337,9,8,4
big,data,204,6,0,5
sexual,orientation,200,0,0,0
equal,opportunity,198,1,0,0
national,origin,194,0,0,0
communication,skill,190,7,10,0
