In [1]:
# General Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Web Scraping Libraries
import urllib
import requests
from bs4 import BeautifulSoup

# Regex Library
import re

# Time-related Libraries
import time
import datetime

# NLP Libraries
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Helper functions
import MVP_Bojado, MVP_Shi

# Environment file
import env, env_Shi

import warnings
warnings.filterwarnings("ignore")

## Data Acquisition

### URL Format of Indeed.com
1. Search chemist in TX<br>
https://www.indeed.com/jobs?q=chemist&l=TX
2. Search chemist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=chemist&l=San+Antonio%2C+TX
3. Search data scientist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX
4. Search data scientist intern in San Anotnio, TX<br>
https://www.indeed.com/jobs?q=data+scientist+intern&l=San+Antonio%2C+TX
5. Sort the data scientist jobs posting by date<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX&sort=date

**Takeaways**
1. q = job title
2. l = location

### URL Format of Monster.com
https://www.monster.com/jobs/search/?q=data-scientist&where=San-Antonio__2C-TX

### Generate the URL of a Job Search at Indeed.com

In [2]:
def first_page_url_indeed(job_title, location):
    '''
    This function returns a URL of the 1st page of a job search at Indeed.com 
    based on the job title and the location.
    '''
    # Create the base URL for a job serch at Indeed.com
    base_url = 'https://www.indeed.com/jobs?'
    # Create a dictionary to map the keys to the input parameters
    dic = {'q': job_title, 'l': location, 'sort': 'date'}
    # Convert the dictionary to a query string
    relative_url = urllib.parse.urlencode(dic)
    # Generate the full URL of the first page
    url = base_url + relative_url
    return url

In [42]:
# Test the function
url = first_page_url_indeed('data scientist', 'al')
url

'https://www.indeed.com/jobs?q=data+scientist&l=al&sort=date'

### Make the HTTP Request

In [49]:
def first_page_soup_indeed(job_title, location):
    '''
    This function returns a BeautifulSoup object to hold the content 
    of the first page of a request for job searching at Indeed.com
    '''
    # Generate the URL of the job search based on title and location
    url = first_page_url_indeed(job_title, location)
    # Make the HTTP request
    response = requests.get(url)
    # Print the status code of the request
    print("Status code of the request: ", response.status_code)
    # Sanity check to make sure the document type is HTML
    print("Document type: ", response.text[:15])
    # Take a break
    time.sleep(5)
    # Make a soup to hold the response content
    soup = BeautifulSoup(response.content, "html.parser")
    # Print out the title of the content
    print("Title of the response: ", soup.title.string)
    return soup

In [52]:
first_page_soup = first_page_soup_indeed("data scientist", 'al')
type(first_page_soup)

Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Alabama | Indeed.com


bs4.BeautifulSoup

In [12]:
# Find out the tag that contains the number of the jobs by seaching

num_jobs = first_page_soup.find('div', id='searchCountPages')
print("Data Type: ", type(num_jobs))
print("Name of the Tag: ", num_jobs.name)
print("Attributes of the Tag: ", num_jobs.attrs)
print("Text within the Tag: ")
num_jobs.text

Data Type:  <class 'bs4.element.Tag'>
Name of the Tag:  div
Attributes of the Tag:  {'id': 'searchCountPages'}
Text within the Tag: 


'\n                    Page 1 of 560 jobs'

In [13]:
# Find the number of the jobs in the text
match = re.findall(r'(\d+)', num_jobs.text)
match[1]

'560'

In [9]:
def num_jobs_indeed(first_page_soup):
    '''
    This function returns the total number of the jobs in the searching result.
    '''
    # Find out the section contains total number of jobs  
    div = first_page_soup.find('div', id='searchCountPages')
    # Extract the number
    num_jobs = re.findall(r'(\d+)', div.text)[1]
    return num_jobs

In [53]:
# Test the function num_jobs_indeed
num_jobs_indeed(first_page_soup)

'40'

In [12]:
def page_num_indeed(url):
    '''
    This function returns the page number of job searching results. 
    '''
    # Create a Soup object based on the url
    soup = page_soup_indeed(url)
    # Find out the section contains total number of jobs  
    div = soup.find('div', id='searchCountPages')
    # Extract the number
    page_num = re.findall(r'(\d+)', div.text)[0]
    return page_num

In [54]:
# Test the function num_jobs_indeed
page_num_indeed(first_page_soup)

'1'

In [14]:
# Define a function to extract all job cards in a Indeed page

def job_cards_indeed(soup):
    '''
    This function accepts the Soup object of a Indeed page 
    return an iterator containing the all the job cards in this page.
    '''
    # Find the appropriate tag that contains all of the job listings in this page
    tag = soup.find('td', id="resultsCol")
    # Extract all job cards
    job_cards = tag.find_all('div', class_='jobsearch-SerpJobCard')
    return job_cards

In [55]:
# Test the function job_cards_indeed
job_cards = job_cards_indeed(first_page_soup)

# Print the data type of job_cards
type(job_cards)

bs4.element.ResultSet

**Quick Note**: job_cards is an iterator

In [56]:
# How many jobs listed in the 1st page? 
len(job_cards)

15

In [17]:
def job_titles_indeed(job_cards):
    '''
    This function extract the job titles from a job_cards set. 
    '''
    # Create a list to hold the job titles
    titles = []
    # For Loop throught the job cards to extract the titles
    for job in job_cards:
        title = job.find('h2', class_='title')
        title = title.text.strip()
        titles.append(title)
    return titles

In [57]:
titles = job_titles_indeed(job_cards)
titles

['Data Scientist\nnew',
 'MANAGER, ACCOUNT DEVELOPMENT/DATA SCIENCE\nnew',
 'Data Scientist\nnew',
 'Data Analyst - Microsoft Stack (mid-senior)\nnew',
 'Statistical Analyst\nnew',
 'Data Scientist Intern\nnew',
 'Machine Learning/Artificial Intelligence Software Developer\nnew',
 'Software Engineer/Data Scientist\nnew',
 'Lead Financial Analyst - Artificial Intelligence Strategic G...\nnew',
 'BI Architect/Data Scientist',
 'Asst Research Professional - Research Data Scientist',
 '2021-18 Software Engineers for BMDS Data Analysis Suite',
 'Cyber Artificial Intelligence (AI) SME',
 '2021-02 Artificial Intelligence Designer',
 'Deep Learning Engineer']

In [19]:
# Define a function to pull the company names from a set of job cards

def company_names_indeed(job_cards):
    '''
    This function extracts the company names from a set of job cards.
    '''
    # Create a list to hold the company names
    names = []
    # For loop through the job cards to pull the company names
    for job in job_cards:
        name = job.find('span', class_='company')
        name = name.text.strip()
        names.append(name)
    return names

In [58]:
# Test the function: comany_names_indeed
company_names = company_names_indeed(job_cards)
company_names

['Quiq Inc',
 'B.A.S.S., LLC',
 'Vision',
 'Vaco',
 'The Personnel Board of Jefferson County',
 'LOCKHEED MARTIN CORPORATION',
 'IERUS Technologies, Inc.',
 'Torch Technologies, Inc.',
 'Deloitte',
 'Doozer Software',
 'The University of Alabama',
 '1st Edge',
 'Quantum Research International, Inc.',
 '1st Edge',
 'Numerator']

In [21]:
# Define a function to pull the post ages from a set of job cards

def post_ages_indeed(job_cards):
    '''
    This function pulls the post ages from a set of job cards.
    '''
    # Create a list to hold the post ages
    ages = []
    # For loop through the job cards to pull the post ages
    for job in job_cards:
        age = job.find('span', class_='date')
        age = age.text.strip()
        ages.append(age)
    return ages

In [59]:
# Test the function: post_ages_indeed
ages = post_ages_indeed(job_cards)
ages

['Today',
 'Today',
 'Today',
 'Today',
 '2 days ago',
 '4 days ago',
 '5 days ago',
 '6 days ago',
 '6 days ago',
 '11 days ago',
 '11 days ago',
 '12 days ago',
 '22 days ago',
 '21 days ago',
 '28 days ago']

In [23]:
# Define a function to pull the location from a set of job cards

def job_locations_indeed(job_cards):
    '''
    This function pulls the job locations from a set of job cards.
    '''
    # Create a list to hold the locations
    locations = []
    # For loop through the job cards to pull the locations
    for job in job_cards:
        location = job.find('div', class_='location accessible-contrast-color-location')
        if location == None:
            location = job.find('span', class_='location accessible-contrast-color-location')
        location = location.text.strip()
        locations.append(location)
    return locations

In [60]:
# Test function: job_locations_indeed
locations = job_locations_indeed(job_cards)
locations

['United States',
 'Birmingham, AL 35243',
 'Huntsville, AL',
 'Hartselle, AL',
 'Jefferson County, AL',
 'Huntsville, AL 35806',
 'Huntsville, AL 35805',
 'Huntsville, AL 35802',
 'Birmingham, AL 35203 (Central City area)',
 'Birmingham, AL 35216',
 'Tuscaloosa, AL',
 'Huntsville, AL',
 'Huntsville, AL 35806',
 'Huntsville, AL',
 'Alabama']

In [25]:
# Define a function to pull the company ratings from a set of job cards

def company_rating_indeed(job_cards):
    '''
    This function pulls the company rating from a set of job cards.
    If the rating is unavailable, it will be marked as 'missing'.
    '''
    # Create a list to hold the locations
    ratings = []
    # For loop through the job cards to pull the locations
    for job in job_cards:
        rating = job.find('span', class_='ratingsContent')
        if rating == None:
            ratings.append('missing')
            continue
        rating = rating.text.strip()
        ratings.append(rating)
    return ratings

In [61]:
ratings = company_rating_indeed(job_cards)
ratings

['missing',
 'missing',
 'missing',
 '3.7',
 'missing',
 '4.0',
 '4.7',
 'missing',
 '4.0',
 '4.8',
 '4.4',
 'missing',
 '4.0',
 'missing',
 '3.6']

In [27]:
def acuqire_indeed_job_description(url):
    '''
    This function accepts the URL of a job posting and pull its description.
    '''
    # Make the HTTP request
    request = requests.get(url)
    print("Status Code: ", request.status_code)
    # Take a break
    time.sleep(5)
    # Make a soup variable holding the response content
    soup = BeautifulSoup(request.content, "html.parser")
    if soup == None:
        description = 'error'
    else:
        # Print the page's title
        print(soup.title.string)
        # Find the section that contains job description
        description = soup.find('div', id="jobDescriptionText")
        if description == None:
            description = 'error'
        else:
            description = description.text
    return description

def job_links_and_contents_indeed(job_cards):
    '''
    This function pulls the job links and descriptions from a set of job cards.
    '''
    # Create a list to hold the links and descriptions
    links = []
    descriptions = []
    # For loop through the job cards to pull the links and descriptions
    for job in job_cards:
        link = job.find('a')['href']
        link = 'https://www.indeed.com' + link
        link = link.replace(';', '&')
        description = acuqire_indeed_job_description(link)
        links.append(link)
        descriptions.append(description)
    return links, descriptions

In [62]:
# Test the function: job_links_and_contents_indeed
links, descriptions = job_links_and_contents_indeed(job_cards)

Status Code:  200
Data Scientist - United States - Indeed.com
Status Code:  200
MANAGER, ACCOUNT DEVELOPMENT/DATA SCIENCE - Birmingham, AL 35243 - Indeed.com
Status Code:  200
Data Scientist - Huntsville, AL - Indeed.com
Status Code:  200
Vaco Careers and Employment | Indeed.com
Status Code:  200
Statistical Analyst - Jefferson County, AL - Indeed.com
Status Code:  200
Data Scientist Intern - Huntsville, AL 35806 - Indeed.com
Status Code:  200
Machine Learning/Artificial Intelligence Software Developer - Huntsville, AL 35805 - Indeed.com
Status Code:  200
Software Engineer/Data Scientist - Huntsville, AL 35802 - Indeed.com
Status Code:  200
Lead Financial Analyst - Artificial Intelligence Strategic Growth Offering (AI SGO) Finance & Investment - Birmingham, AL 35203 - Indeed.com
Status Code:  200
BI Architect/Data Scientist - Birmingham, AL 35216 - Indeed.com
Status Code:  200
Asst Research Professional - Research Data Scientist - Tuscaloosa, AL - Indeed.com
Status Code:  200
2021-18 S

In [35]:
# Define a function to create a Soup object based on a job search url

def page_soup_indeed(url):
    '''
    This function returns a BeautifulSoup object to hold the content 
    of a page for a job searching results at Indeed.com
    '''
    # Make the HTTP request
    response = requests.get(url)
    # Print the status code of the request
    print("Status code of the request: ", response.status_code)
    # Sanity check to make sure the document type is HTML
    print("Document type: ", response.text[:15])
    # Take a break
    time.sleep(5)
    # Make a soup to hold the response content
    soup = BeautifulSoup(response.content, "html.parser")
    # Print out the title of the content
    print("Title of the response: ", soup.title.string)
    return soup

In [84]:
# Test the function: page_soup_indeed

url = 'https://www.indeed.com/jobs?q=data+scientist&l=al&sort=date'
soup = page_soup_indeed(url)
type(soup)

Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Alabama | Indeed.com


bs4.BeautifulSoup

In [86]:
# Find out the page number
int(page_num_indeed(soup))

1

In [38]:
# Pull the job cards from the soup
type(job_cards_indeed(soup))

bs4.element.ResultSet

In [99]:
# Define a function to pull job information from a job search URL

def acquire_page_indeed(url):
    '''
    This function accepts a job search URL and returns a pandas dataframe 
    containing job title, location, company, company rating, post age and description. 
    '''
    # Create a Soup object based on the url
    soup = page_soup_indeed(url)
    # Pull the job cards
    job_cards = job_cards_indeed(soup)
    # Pull the job titles
    titles = job_titles_indeed(job_cards)   
    # Pull the names of the companies
    companies = company_names_indeed(job_cards)
    # Pull the post ages
    ages = post_ages_indeed(job_cards)
    # Pull the job locations
    locations = job_locations_indeed(job_cards)
    # Pull the company ratings
    ratings = company_rating_indeed(job_cards)
    # Pull the hyperlinks and job description
    links, descriptions = job_links_and_contents_indeed(job_cards)    
    # Create a dataframe
    d = {'title': titles,
         'location': locations,
         'company': companies, 
         'company_rating': ratings,
         'post_age': ages, 
         'job_link': links, 
         'job_description': descriptions}
    df = pd.DataFrame(d)
    return df

In [72]:
# Test function acquire_page_indeed
page_num, df = acquire_page_indeed(url)

Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Alabama | Indeed.com
Status Code:  200
Data Scientist - Huntsville, AL 35806 - Indeed.com
Status Code:  200
Data Scientist - United States - Indeed.com
Status Code:  200
Vaco Careers and Employment | Indeed.com
Status Code:  200
Data Scientist - Huntsville, AL - Indeed.com
Status Code:  200
MANAGER, ACCOUNT DEVELOPMENT/DATA SCIENCE - Birmingham, AL 35243 - Indeed.com
Status Code:  200
Statistical Analyst - Jefferson County, AL - Indeed.com
Status Code:  200
Data Scientist Intern - Huntsville, AL 35806 - Indeed.com
Status Code:  200
Machine Learning/Artificial Intelligence Software Developer - Huntsville, AL 35805 - Indeed.com
Status Code:  200
Software Engineer/Data Scientist - Huntsville, AL 35802 - Indeed.com
Status Code:  200
Lead Financial Analyst - Artificial Intelligence Strategic Growth Offering (AI SGO) Finance & Investment - Birmingham, AL 35203 - Indeed.

In [73]:
# Print the page number
page_num

'1'

In [79]:
df.columns

Index(['title', 'locations', 'company', 'company_rating', 'post_age',
       'job_link', 'job_description'],
      dtype='object')

In [76]:
df.job_description[0]

"MTA, Inc. is a Woman Owned Small Business with headquarters in Huntsville, AL. We are a diversified company recognized for excellence in Engineering, Integrated Logistics, and Quality Assurance. MTA provides services to U.S. defense agencies, NASA, and the U.S. Corp of Engineers.\nMTA, Inc. has an immediate opening for the position of Data Scientist.\nJob Description:\nTo provide support to the System Readiness Directorate's Reliability, Availability, Maintainability and System Assessment (RAM-SA) Division in support of developing and implementing machine learning algorithms and tools in SQL and Python. The objective of this project is to develop, implement and support data science work in the RAM division in support of tool development research and the Predictive Maintenance National Mission Initiative (PMx NMI).\nDuties and responsibilities:\nPerforming detailed and complex calculations necessary to assess advanced systems concepts\nTransferring data into a new format to make it mor

In [100]:
def jobs_indeed(job_title, location):
    '''
    This function accepts the job title and location and return 
    the job information pull from Indeed.com.
    '''
    # Generate the urls based on job title and location (state)
    url = first_page_url = first_page_url_indeed(job_title, location)
    # Set up an counter
    counter = 1
    # Create an empty dataframe to hold the job information
    df_jobs = pd.DataFrame(columns = ['title', 'location', 'company', 'company_rating', 
                                      'post_age','job_link', 'job_description'])
    # Pull the page number
    page_num = int(page_num_indeed(url))
    # Set up an checker
    keep_going = (counter == page_num)   
    # For loop through the urls to pull job information
    while keep_going and page_num <=35:
        df = acquire_page_indeed(url)
        print("--------------------------------")
        print("Page: ", page_num)
        print("--------------------------------")
        df_jobs = df_jobs.append(df, ignore_index=True)
        time.sleep(180)
        dic = {'start': page_num*10}
        relative_url = urllib.parse.urlencode(dic)
        url = first_page_url + '&' + relative_url
        counter = counter + 1
        page_num = int(page_num_indeed(url))
        keep_going = (counter == page_num)
    # Print the total number of jobs
    print(f"Total number of {job_title} positions in {location}: ", df_jobs.shape[0])
    return df_jobs

### Data Preparation

In [2]:
# Load data scientist job posts in TX

# Import the file path
database = env_Shi.database

# Read the daily data scientist jobs in TX
df_ds_new = pd.read_csv(f"{database}data_scientist_tx_indeed_013121.csv", index_col=0)

In [3]:
# Inspect the first 2 rows of the new posts
df_ds_new.head(2)

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description
0,Jr. Data Scientist\nnew,"Allen, TX",Refinitiv,3.6,1 day ago,https://www.indeed.com/rc/clk?jk=81a88a5468d89...,Better data. Better solutions. Better outcomes...
1,Data Scientist Sr. Associate\nnew,"Lewisville, TX","JPMorgan Chase Bank, N.A.",3.9,1 day ago,https://www.indeed.com/rc/clk?jk=2023754d9a344...,The Data Scientist is an individual contributo...


In [2]:
# Define a function to compute the date of the job posts

def compute_post_date(df):
    '''
    This function computes the date of the job post based on post age
    and set the date as the index of the dataframe.
    '''
    # Create an empty list to hold the post date
    post_date = []
    # For loop the column post_age and convert the values to date
    for age in df.post_age:
        if age == 'Just posted':
            date = datetime.date.today()
            post_date.append(date)
        elif age == 'Today':
            date = datetime.date.today()
            post_date.append(date)
        else:
            # Extract the number
            num = re.findall(r'(\d+)', age)[0]
            # Cast the string number to integer
            num = int(num)
            # Convert the integer to timedelta object
            num = datetime.timedelta(days=num)
            # Compute post date        
            date = datetime.date.today()
            date = date - num
            post_date.append(date)
    # Add post date as new column
    df['date'] = post_date
    # Set the column post_date as the index and sort the values
    df = df.set_index('date').sort_index(ascending=False)
    return df

In [5]:
# Test the function: compute_post_date

df_test = compute_post_date(df_ds_new)
df_test.head(2) # Works

Unnamed: 0_level_0,title,locations,company,company_rating,post_age,job_link,job_description
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-30,Jr. Data Scientist\nnew,"Allen, TX",Refinitiv,3.6,1 day ago,https://www.indeed.com/rc/clk?jk=81a88a5468d89...,Better data. Better solutions. Better outcomes...
2021-01-30,Senior Business Intelligence Data Analyst\nnew,"Austin, TX",Tango Card,4.0,1 day ago,https://www.indeed.com/rc/clk?jk=d7fd13fae54c7...,What we are up to at Tango Card:\n\nTango Card...


In [3]:
# Define a function to remove the duplicates

def remove_duplicates(df):
    '''
    This function removes the duplicates in the dataframe
    '''
    # Define the columns for identifying duplicates
    columns = ['title', 'location', 'company', 'job_link', 'job_description']
    # Drop the duplicates except for the last occurrence
    df.drop_duplicates(subset=columns, inplace=True, keep='last')
    return df

In [4]:
def daily_update_ds(df):
    '''
    This function updates job posts of data scientist in TX by adding the daily acquring
    of data scientist job posts in TX. 
    '''
    # Read the job posts of data scientist in TX
    database = env_Shi.database
    df_ds_tx = pd.read_csv(f"{database}df_ds_tx.csv")
    num_jobs = df_ds_tx.shape[0]
    # Convert the date column to datetime type
    df_ds_tx.date = pd.to_datetime(df_ds_tx.date)
    # Set the date column as the index and sort the index
    df_ds_tx = df_ds_tx.set_index('date').sort_index(ascending=False)
    # Add the daily update
    df = compute_post_date(df)
    df_ds_tx = pd.concat([df_ds_tx, df]).sort_index(ascending=False)
    # Remove the duplicates
    df_ds_tx = remove_duplicates(df_ds_tx)
    # Save as csv file
    df_ds_tx.to_csv(f"{database}df_ds_tx.csv")
    # Print the new jobs posted today
    num_new_jobs = df_ds_tx.shape[0] - num_jobs
    print("New Jobs Posted Today: ", num_new_jobs)
    return df_ds_tx

In [8]:
# Test the function: daily_update_ds

df_test = daily_update_ds(df_ds_new)
df_test.head(2) # Works

New Jobs Posted Today:  39


Unnamed: 0_level_0,title,locations,company,company_rating,post_age,job_link,job_description
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-30 00:00:00,Risk - Corporate Risk - Wholesale Credit Solut...,"Plano, TX","JPMorgan Chase Bank, N.A.",3.9,Just posted,https://www.indeed.com/rc/clk?jk=3571b89812e39...,Organization\nJPMorgan Chase & Co. (NYSE: JPM)...
2021-01-30,Data Scientist Sr. Associate\nnew,"Lewisville, TX","JPMorgan Chase Bank, N.A.",3.9,1 day ago,https://www.indeed.com/rc/clk?jk=2023754d9a344...,The Data Scientist is an individual contributo...


In [9]:
# Print the information of the dateframe
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1070 entries, 2021-01-30 00:00:00 to 2020-12-22 00:00:00
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            1070 non-null   object
 1   locations        1070 non-null   object
 2   company          1070 non-null   object
 3   company_rating   1070 non-null   object
 4   post_age         1070 non-null   object
 5   job_link         1070 non-null   object
 6   job_description  1070 non-null   object
dtypes: object(7)
memory usage: 66.9+ KB


In [7]:
# Define a function to prepare the job post for exploration

def prepare_job_posts_indeed():
    '''
    The function reads the csv file of job posts and returns a cleaned dataframe
    ready for exploration.
    '''
    # Read the job posts of data scientist in TX
    database = env_Shi.database
    df = pd.read_csv(f"{database}df_ds_tx.csv")
    # Conver the string date to datetime object
    df.date = pd.to_datetime(df.date)
    # Set the date as the index and sort the dataframe in descending order
    df = df.set_index('date').sort_index(ascending=False)
    # Create columns of city, state, and zipcode
    location = df.location.str.split(', ', expand=True)
    location.columns = ['city', 'zipcode']
    location.city = location.city.apply(lambda i: 0 if i == 'United States' else i)
    location.city = location.city.apply(lambda i: 0 if i == 'Texas' else i)
    location.zipcode = location.zipcode.apply(lambda i: 0 if re.findall(r"(\d+)", str(i)) == [] 
                                          else re.findall(r"(\d+)", str(i))[0])
    df['city'] = location.city
    df['state'] = 'TX'
    df['zipcode'] = location.zipcode
    # Replace the missing values in the company rating with 0
    df.company_rating = df.company_rating.apply(lambda i: 0 if i == 'missing' else i)
    # Drop the column post_age
    df = df.drop(columns=['post_age', 'locations'])
    # Clean the text in the job description
    df = MVP_Bojado.prep_job_description_data(df, 'job_description')
    return df

In [10]:
%%time
# Test the function: prepare_job_posts_indeed

df_test = prepare_job_posts_indeed()
df_test.head(2)

CPU times: user 18.3 s, sys: 229 ms, total: 18.5 s
Wall time: 18.7 s


Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean,tokenized,stemmed,lemmatized
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-01-30,Risk - Corporate Risk - Wholesale Credit Solut...,"JPMorgan Chase Bank, N.A.",3.9,https://www.indeed.com/rc/clk?jk=3571b89812e39...,Organization\nJPMorgan Chase & Co. (NYSE: JPM)...,Plano,TX,0,organization jpmorgan chase co nyse jpm leadin...,organization\njpmorgan chase co nyse jpm is a ...,organ jpmorgan chase co nyse jpm is a lead glo...,organization jpmorgan chase co nyse jpm is a l...
2021-01-30,Data Scientist Sr. Associate\nnew,"JPMorgan Chase Bank, N.A.",3.9,https://www.indeed.com/rc/clk?jk=2023754d9a344...,The Data Scientist is an individual contributo...,Lewisville,TX,0,data scientist individual contributor able app...,the data scientist is an individual contributo...,the data scientist is an individu contributor ...,the data scientist is an individual contributo...


In [11]:
# Print the top 5 companies by the number of posts
df_test.company.value_counts().head(5)

Cognizant Technology Solutions    42
Dell Technologies                 31
Deloitte                          30
USAA                              20
Facebook                          18
Name: company, dtype: int64

In [12]:
# Print the top 5 cities by the number of posts
df_test.city.value_counts().head(5)

Austin         322
Dallas         173
Houston        126
San Antonio     86
Plano           83
Name: city, dtype: int64

In [13]:
# Sanity check: the dataframe has datetime index
df_test.resample("W").title.count()

date
2020-12-27    392
2021-01-03     95
2021-01-10     89
2021-01-17    106
2021-01-24    258
2021-01-31    130
Freq: W-SUN, Name: title, dtype: int64

In [14]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1070 entries, 2021-01-30 to 2020-12-22
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            1070 non-null   object
 1   company          1070 non-null   object
 2   company_rating   1070 non-null   object
 3   job_link         1070 non-null   object
 4   job_description  1070 non-null   object
 5   city             1070 non-null   object
 6   state            1070 non-null   object
 7   zipcode          1070 non-null   object
 8   clean            1070 non-null   object
 9   tokenized        1070 non-null   object
 10  stemmed          1070 non-null   object
 11  lemmatized       1070 non-null   object
dtypes: object(12)
memory usage: 108.7+ KB


In [21]:
# Create 'words' variable
words = [re.sub(r'([^a-z0-9\s]|\s.\s)', '', doc).split() for doc in df_ds_tx.clean]

# Add 'words' column to dataframe
# Column will contain lists of separated words in each repo
df_ds_tx = pd.concat([df_ds_tx, pd.DataFrame({'words': words})], axis=1)

df_ds_tx.head(2)

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description,clean,tokenized,stemmed,lemmatized,words
0,Data Scientist Associate Sr\nnew,"Plano, TX","JPMorgan Chase Bank, N.A.",3.9,Just posted,https://www.indeed.com/rc/clk?jk=fdb25f52f6062...,J.P. Morgan's Corporate & Investment Bank (CIB...,jp morgan corporate investment bank cib global...,jp morgans corporate investment bank cib is a ...,jp morgan corpor invest bank cib is a global l...,jp morgan corporate investment bank cib is a g...,"[jp, morgan, corporate, investment, bank, cib,..."
1,Director of Data Science\nnew,"Austin, TX 78701 (Downtown area)",CyberCoders,3.3,Just posted,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Director of Data Science\nHumanity is sufferin...,director data science humanity suffering healt...,director of data science\nhumanity is sufferin...,director of data scienc human is suffer from a...,director of data science humanity is suffering...,"[director, data, science, humanity, suffering,..."


## Data Exploration

### Frequency Analysis of Mono-, Bi-, and Tri-grams

In [67]:
# Define the function to create the words that appear in the job descriptions

def words_variables_v1(df):
    '''
    This function accepts the dataframe with cleaned job description 
    and return a dictionary in which the values are the words that 
    appear in the job description. 
    '''
    # Create the words that appear all the job descritipons
    all_words = ' '.join(df.clean)
    # Create a dictionary to hold the variable all_words
    d_words = {'frequency': all_words}
    return d_words

In [98]:
# Upgrade the function `words_variables_v1`

def words_variables_v2(df, companies):
    '''
    This function accepts the dataframe containing cleaned job description and 
    a list of company names and return a dictionary in which the values are the words 
    that appear in the job description. 
    '''
    # Create the words that appear all the job descritipons
    all_words = ' '.join(df.clean)
    # Create a dictionary to hold the variable all_words
    d_words = {'all': all_words}
    # For loop the companies and create the words that appear in their job descriptions
    for company in companies:
        mask = (df.company == company)
        s_company = df[mask].clean
        words = ' '.join(s_company)
        d_words[company] = words
    return d_words

In [69]:
# Test the helper function: words_variables_v1
dic = words_variables_v1(df_ds_tx)

# Print out the keys
print(dic.keys())

# Print the first 100 characters of the value
dic['frequency'][:100]

dict_keys(['frequency'])


'jp morgan corporate investment bank cib global leader banking world corporation government instituti'

In [99]:
# Test the helper function: words_variables_v2

companies = ['Apple']
dic_v2 = words_variables_v2(df_ds_tx, companies)

# Print out the keys
print(dic_v2.keys())

# Print the first 100 characters of the value of `Apple`
dic_v2['Apple'][:400]

dict_keys(['all', 'Apple'])


'summary posted oct 29 2020 role number200189417 looking talented passionate resultsoriented individual join team craft future apple pay analytically skilled strong business acumen thought partner product business team understand goal use analytical power surface actionable insight support goal culture getting thing done iteratively rapidly open feedback debate along way believe analytics team spor'

In [95]:
# Define a function to compute the word frequency in the job description

def word_frequency_v1(d_words):
    '''
    This function accept the dictionary created by function words_variables_v1
    and return the word frequency in the job description. 
    '''
    # Create a dataframe to hold the word frequency
    word_counts = pd.DataFrame()
    # Compute the words frequency
    freq = pd.Series(d_words['frequency'].split()).value_counts()
    word_counts = pd.concat([word_counts, freq], axis=1, sort=True)
    word_counts.columns = d_words.keys()
    word_counts.sort_values(by='frequency', ascending=False, inplace=True)
    return word_counts

In [75]:
# Upgrade `word_frequency_v1`

def word_frequency_v2(d_words):
    '''
    This function accept the dictionary created by function words_variables_v2
    and return the word frequency in the job description. 
    '''
    # Read the company names from the dictionary
    companies = d_words.keys()
    # Create a dataframe to hold the word frequency
    word_counts = pd.DataFrame()
    # For loop through the companies and generate the word frequency in their job descriptions
    for company in companies:
        freq = pd.Series(d_words[company].split()).value_counts()
        word_counts = pd.concat([word_counts, freq], axis=1, sort=True)
    word_counts.columns = companies
    word_counts = word_counts.fillna(0).apply(lambda s: s.astype(int))
    word_counts.sort_values(by='all', ascending=False, inplace=True)
    return word_counts

In [96]:
# Test the function word_frequency_v1

df_word_frequency = word_frequency_v1(dic)
df_word_frequency.head(5)

Unnamed: 0,frequency
data,6469
experience,3966
business,2514
team,2355
work,1995


In [97]:
df_word_frequency.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12386 entries, data to ottawa
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   frequency  12386 non-null  int64
dtypes: int64(1)
memory usage: 193.5+ KB


In [100]:
# Test the function word_frequency_v2

df_word_frequency_v2 = word_frequency_v2(dic_v2)
df_word_frequency_v2.head(5)

Unnamed: 0,all,Apple
data,6469,133
experience,3966,91
business,2514,66
team,2355,79
work,1995,30


In [29]:
# Added 'Bigram' column to dataframe
df_ds_tx['bigrams'] = [list(nltk.ngrams(wordlist, 2)) for wordlist in df_ds_tx.words]
df_ds_tx.head(2)

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description,clean,tokenized,stemmed,lemmatized,words,bigrams
0,Data Scientist Associate Sr\nnew,"Plano, TX","JPMorgan Chase Bank, N.A.",3.9,Just posted,https://www.indeed.com/rc/clk?jk=fdb25f52f6062...,J.P. Morgan's Corporate & Investment Bank (CIB...,jp morgan corporate investment bank cib global...,jp morgans corporate investment bank cib is a ...,jp morgan corpor invest bank cib is a global l...,jp morgan corporate investment bank cib is a g...,"[jp, morgan, corporate, investment, bank, cib,...","[(jp, morgan), (morgan, corporate), (corporate..."
1,Director of Data Science\nnew,"Austin, TX 78701 (Downtown area)",CyberCoders,3.3,Just posted,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Director of Data Science\nHumanity is sufferin...,director data science humanity suffering healt...,director of data science\nhumanity is sufferin...,director of data scienc human is suffer from a...,director of data science humanity is suffering...,"[director, data, science, humanity, suffering,...","[(director, data), (data, science), (science, ..."
2,Sr Big Data/Data Engineer\nnew,"Houston, TX 77002 (Downtown area)",CFoundations,missing,Today,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,BIG DATA ENGINEERPay Rate - $80-105 per hour C...,big data engineerpay rate 80105 per hour c2c 1...,big data engineerpay rate 80105 per hour c2c o...,big data engineerpay rate 80105 per hour c2c o...,big data engineerpay rate 80105 per hour c2c o...,"[big, data, engineerpay, rate, 80105, per, hou...","[(big, data), (data, engineerpay), (engineerpa..."
3,"Analyst I, Statistical (226 Days)\nnew","Dallas, TX",Dallas Independent School District,3.7,Today,https://www.indeed.com/rc/clk?jk=9ec38e7c6c285...,"Analyst I, Statistical (226 Days) -(RTP2020121...",analyst statistical 226 day rtp20201216030 des...,analyst i statistical 226 days rtp20201216030\...,analyst i statist 226 day rtp20201216030 descr...,analyst i statistical 226 day rtp20201216030 d...,"[analyst, statistical, 226, day, rtp2020121603...","[(analyst, statistical), (statistical, 226), (..."
4,AI Engineer: UI & Release Management\nnew,"San Antonio, TX",Pearson,3.7,Today,https://www.indeed.com/rc/clk?jk=106922ddebbba...,Description\nWe are looking for a passionate f...,description looking passionate frontend engine...,description\nwe are looking for a passionate f...,descript we are look for a passion frontend en...,description we are looking for a passionate fr...,"[description, looking, passionate, frontend, e...","[(description, looking), (looking, passionate)..."


In [115]:
def bigrams_frequency_v1(d_words):
    '''
    This function accept the dictionary created by function words_variables_v1
    and return the word frequency in the job description. 
    '''
    # Create a dataframe to hold the word frequency
    word_counts = pd.DataFrame()
    # Compute the words frequency
    freq = pd.Series(list(nltk.ngrams(d_words['frequency'].split(), 2))).value_counts()
    # Add the `freq` seires to `word_counts` dataframe
    word_counts = pd.concat([word_counts, freq], axis=1, sort=True)
    # Rename the coumns
    word_counts.columns = d_words.keys()
    # Sort the dataframe by the values in column `frequency`
    word_counts.sort_values(by='frequency', ascending=False, inplace=True)
    return word_counts

In [117]:
bigrams = bigrams_frequency_v1(dic)
bigrams.head()

Unnamed: 0,Unnamed: 1,frequency
machine,learning,1100
data,science,802
year,experience,557
data,scientist,486
computer,science,464


In [119]:
# Define a function to compute the bigrams frequency in the job description

def bigrams_frequency_v2(d_words):
    '''
    This function accept the dictionary created by function words_variables_v2
    and return the bigrams frequency in the job description. 
    '''
    # Read the company names from the dictionary
    companies = d_words.keys()
    # Create a dataframe to hold the word frequency
    bigrams_counts = pd.DataFrame()
    # For loop through the companies and generate the word frequency in their job descriptions
    for company in companies:
        freq = pd.Series(list(nltk.ngrams(d_words[company].split(), 2))).value_counts()
        bigrams_counts = pd.concat([bigrams_counts, freq], axis=1, sort=True)
    bigrams_counts.columns = companies
    bigrams_counts = bigrams_counts.fillna(0).apply(lambda s: s.astype(int))
    bigrams_counts.sort_values(by='all', ascending=False, inplace=True)
    return bigrams_counts

In [120]:
# Compute bigrams_frequency

bigrams_v2 = bigrams_frequency_v2(dic_v2)
bigrams_v2.head()

Unnamed: 0,Unnamed: 1,all,Apple
machine,learning,1100,23
data,science,802,33
year,experience,557,5
data,scientist,486,21
computer,science,464,9


In [121]:
def trigrams_frequency_v1(d_words):
    '''
    This function accept the dictionary created by function words_variables_v1
    and return the word frequency in the job description. 
    '''
    # Create a dataframe to hold the word frequency
    word_counts = pd.DataFrame()
    # Compute the words frequency
    freq = pd.Series(list(nltk.ngrams(d_words['frequency'].split(), 3))).value_counts()
    # Add the `freq` seires to `word_counts` dataframe
    word_counts = pd.concat([word_counts, freq], axis=1, sort=True)
    # Rename the coumns
    word_counts.columns = d_words.keys()
    # Sort the dataframe by the values in column `frequency`
    word_counts.sort_values(by='frequency', ascending=False, inplace=True)
    return word_counts

In [122]:
# Test function: trigrams_frequency_v1

trigrams = trigrams_frequency_v1(dic)
trigrams.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,frequency
sexual,orientation,gender,196
race,color,religion,194
equal,opportunity,employer,187
orientation,gender,identity,176
without,regard,race,147


In [124]:
# Define a function to compute the trigrams frequency in the job description

def trigrams_frequency_v2(d_words):
    '''
    This function accept the dictionary created by function words_variables_v2
    and return the trigrams frequency in the job description. 
    '''
    # Read the company names from the dictionary
    companies = d_words.keys()
    # Create a dataframe to hold the word frequency
    trigrams_counts = pd.DataFrame()
    # For loop through the companies and generate the word frequency in their job descriptions
    for company in companies:
        freq = pd.Series(list(nltk.ngrams(d_words[company].split(), 3))).value_counts()
        trigrams_counts = pd.concat([trigrams_counts, freq], axis=1, sort=True)
    trigrams_counts.columns = companies
    trigrams_counts = trigrams_counts.fillna(0).apply(lambda s: s.astype(int))
    trigrams_counts.sort_values(by='all', ascending=False, inplace=True)
    return trigrams_counts

In [125]:
# Test function: trigrams_frequency_v2

trigrams_v2 = trigrams_frequency_v2(dic_v2)
trigrams_v2.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,all,Apple
sexual,orientation,gender,196,0
race,color,religion,194,0
equal,opportunity,employer,187,1
orientation,gender,identity,176,0
without,regard,race,147,0


### Skills Match Job Search

In [34]:
# Create the masks for different skills

mask_python = df_ds_tx.clean.str.contains('python')
mask_sql = df_ds_tx.clean.str.contains('sql')
mask_ml = df_ds_tx.clean.str.contains('machine learning')
mask_tableau = df_ds_tx.clean.str.contains('tableau')
mask_aws = df_ds_tx.clean.str.contains('aws')

mask = mask_python & mask_sql & mask_tableau

In [35]:
# How many companies need all three skills: python, sql and tableau
mask.sum()

76

In [102]:
df_ds_tx[mask].head(1)

Unnamed: 0,title,locations,company,company_rating,post_age,job_link,job_description,clean,tokenized,stemmed,lemmatized,words,bigrams
0,Data Scientist Associate Sr\nnew,"Plano, TX","JPMorgan Chase Bank, N.A.",3.9,Just posted,https://www.indeed.com/rc/clk?jk=fdb25f52f6062...,J.P. Morgan's Corporate & Investment Bank (CIB...,jp morgan corporate investment bank cib global...,jp morgans corporate investment bank cib is a ...,jp morgan corpor invest bank cib is a global l...,jp morgan corporate investment bank cib is a g...,"[jp, morgan, corporate, investment, bank, cib,...","[(jp, morgan), (morgan, corporate), (corporate..."


In [101]:
df_ds_tx.clean[0][:100]

'jp morgan corporate investment bank cib global leader banking world corporation government instituti'

### Compute Top 5 Skills in a Predifined Library

In [37]:
# Create a library for all skills

library = ['python', 'r', 'sql', 'tableau', 'scikitlearn', 'tensorflow', 'pytorch', 
           'aws', 'hadoop', 'hive', 'impala', 'matlab', 'model', 'algorithm', 
           'storytelling', 'statistic', 'etl', 'exploration', 'extraction', 
           'sharepoint', 'dashboard']

library_tech = ['programming', 'big data', 'wrangling', 'version control', 'visualiztion', ]
library_soft = ['communication', 'business acumen', 'storytelling']
library_tools = ['python', 'git', 'sql', 'pandas']

In [38]:
# data visualization
# big data
# software engineering
# model
# models
# algorithms
# storytelling
# statistic
# statistical
# machine learning
# deep learning
# etl
# extraction
# crud
# exploration

In [113]:
def top_skills_ds_v1(k):
    '''
    This function accepts a positive integer k and 
    returns a dataframe containing the top k skills needed
    for data scientist positions.
    '''
    # Import the file path
    database = env_Shi.database
    # Load the prepared dataframe with job search results
    df = pd.read_csv(f"{database}df_tx_ds.csv", index_col=0)
    # Create a string of all words that appear in the job description
    dic = words_variables_v1(df)
    # Compute the words frequency
    df_word_frequency = word_frequency_v1(dic)
    # Define a library that has a complete sillset for data scientist
    library = ['python', 'r', 'sql', 'tableau', 'scikitlearn', 'tensorflow', 'pytorch', 'aws', 'hadoop', 'hive', 
        'impala', 'matlab', 'model', 'algorithm', 'storytelling', 'statistic', 'etl', 'exploration', 'extraction', 
        'sharepoint', 'dashboard']
    # Create a empty dataframe to hold the rank of the skills
    df_skills = pd.DataFrame()
    # For loop through the library to find out the frequency of the skills mentioned in the job description
    for skill in library:
        mask = (df_word_frequency.index == skill)
        df = df_word_frequency[mask]
        df_skills = pd.concat([df_skills, df])
    df_skills.sort_values(by='frequency', ascending=False, inplace=True)
    return df_skills.head(k)

In [114]:
# Test function top_skills_ds

top_skills = top_skills_ds_v1(7)
top_skills

(696, 13)
dict_keys(['frequency'])
Index(['frequency'], dtype='object')


Unnamed: 0,frequency
model,1283
python,595
statistic,482
algorithm,446
sql,436
r,300
aws,299


In [42]:
mask = (df_word_frequency.index == 'python')
df_word_frequency[mask]

Unnamed: 0,all,Apple,Deloitte,USAA
python,595,13,20,13


In [40]:
mask = (df_word_frequency.index == 'r')
df_word_frequency[mask].sort_values(by='all', ascending=False).head(10)

Unnamed: 0,all,Apple,Deloitte,USAA
r,300,1,8,3


In [38]:
mask = (df_word_frequency.index == 'aws')
df_word_frequency[mask].sort_values(by='all', ascending=False).head(10)

Unnamed: 0,all,Apple,Deloitte,USAA
aws,299,0,20,4


In [39]:
mask = (df_word_frequency.index == 'sql')
df_word_frequency[mask].sort_values(by='all', ascending=False).head(10)

Unnamed: 0,all,Apple,Deloitte,USAA
sql,436,11,23,11


In [1]:
### Test git push