In [None]:
# General Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Web Scraping Libraries
import urllib
import requests
from bs4 import BeautifulSoup

# Regex Library
import re

# Time-related Libraries
import time
import datetime

# NLP Libraries
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Helper functions
import MVP_Bojado, MVP_Shi

# Environment file
import env, env_Shi

# AWS
import logging
import boto3
from botocore.exceptions import ClientError

import json

import warnings
warnings.filterwarnings("ignore")

## Data Acquisition

### URL Format of Indeed.com
1. Search chemist in TX<br>
https://www.indeed.com/jobs?q=chemist&l=TX
2. Search chemist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=chemist&l=San+Antonio%2C+TX
3. Search data scientist in San Antonio, TX<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX
4. Search data scientist intern in San Antonio, TX<br>
https://www.indeed.com/jobs?q=data+scientist+intern&l=San+Antonio%2C+TX
5. Sort the data scientist jobs posting by date<br>
https://www.indeed.com/jobs?q=data+scientist&l=San+Antonio%2C+TX&sort=date

**Takeaways**
1. q = job title
2. l = location

### URL Format of Monster.com
https://www.monster.com/jobs/search/?q=data-scientist&where=San-Antonio__2C-TX

### Generate the URL of a Job Search at Indeed.com

In [None]:
def first_page_url_indeed(job_title, location):
    '''
    Build the URL of the first results page of a job search at Indeed.com.

    The job title is sent as the `q` query parameter and the location as the
    `l` parameter; the results are always requested sorted by date.
    '''
    # URL-encode the search parameters into a query string
    query = urllib.parse.urlencode({'q': job_title, 'l': location, 'sort': 'date'})
    # Append the query string to the base URL of an Indeed job search
    return 'https://www.indeed.com/jobs?' + query

In [None]:
# Test the function
url = first_page_url_indeed('data scientist', 'al')
url

### Make the HTTP Request

In [None]:
def first_page_soup_indeed(job_title, location):
    '''
    Fetch the first results page of a job search at Indeed.com and parse it.

    Parameters
    ----------
    job_title : str
        Job title to search for.
    location : str
        Location to search in.

    Returns
    -------
    BeautifulSoup
        The parsed content of the response.

    Side effects: prints the HTTP status code, the first 15 characters of the
    response text and the page title, and sleeps 5 seconds after the request.
    '''
    # Generate the URL of the job search based on title and location
    url = first_page_url_indeed(job_title, location)
    # Make the HTTP request; the timeout keeps a stalled connection from
    # blocking the notebook indefinitely
    response = requests.get(url, timeout=30)
    # Print the status code of the request
    print("Status code of the request: ", response.status_code)
    # Sanity check to make sure the document type is HTML
    print("Document type: ", response.text[:15])
    # Take a break between requests
    time.sleep(5)
    # Make a soup to hold the response content
    soup = BeautifulSoup(response.content, "html.parser")
    # A response without a <title> tag leaves soup.title as None; print None
    # rather than failing with AttributeError
    title = soup.title.string if soup.title is not None else None
    print("Title of the response: ", title)
    return soup

In [None]:
first_page_soup = first_page_soup_indeed("data scientist", 'al')
type(first_page_soup)

In [None]:
# Find the tag that contains the number of jobs by searching for its id

num_jobs = first_page_soup.find('div', id='searchCountPages')
print("Data Type: ", type(num_jobs))
print("Name of the Tag: ", num_jobs.name)
print("Attributes of the Tag: ", num_jobs.attrs)
print("Text within the Tag: ")
num_jobs.text

In [None]:
# Find the number of the jobs in the text
match = re.findall(r'(\d+)', num_jobs.text)
match[1]

In [None]:
def num_jobs_indeed(first_page_soup):
    '''
    Return the total number of jobs reported on a search results page.

    Parameters
    ----------
    first_page_soup : BeautifulSoup
        Parsed results page containing a <div id="searchCountPages">.

    Returns
    -------
    str
        The second number found in the counter text, with any thousands
        separators removed (e.g. '1234').

    Raises
    ------
    ValueError
        If the counter section is missing or holds fewer than two numbers.
    '''
    # Find the section that contains the total number of jobs
    div = first_page_soup.find('div', id='searchCountPages')
    if div is None:
        raise ValueError("No 'searchCountPages' section found in the page")
    # Allow commas inside a number so "1,234" is read as one value instead of
    # being split into "1" and "234"
    numbers = re.findall(r'\d[\d,]*', div.text)
    if len(numbers) < 2:
        raise ValueError(f"Cannot find the number of jobs in: {div.text!r}")
    # The first number is the page number; the second is the job total
    return numbers[1].replace(',', '')

In [None]:
# Test the function num_jobs_indeed
num_jobs_indeed(first_page_soup)

In [None]:
def page_num_indeed(url):
    '''
    Return the page number shown on a page of job search results.

    Parameters
    ----------
    url : str or BeautifulSoup
        Either the URL of a results page, which is fetched with
        `page_soup_indeed`, or an already parsed page, which is used as is.

    Returns
    -------
    str
        The first number found in the <div id="searchCountPages"> text.

    Raises
    ------
    ValueError
        If the counter section is missing or contains no number.
    '''
    # Accept a parsed page as well as a URL: the test cells pass a soup
    # object, while jobs_indeed passes a URL string
    soup = page_soup_indeed(url) if isinstance(url, str) else url
    # Find the section that contains the page number and total number of jobs
    div = soup.find('div', id='searchCountPages')
    if div is None:
        raise ValueError("No 'searchCountPages' section found in the page")
    # Extract the numbers; the first one is the page number
    numbers = re.findall(r'(\d+)', div.text)
    if not numbers:
        raise ValueError(f"Cannot find the page number in: {div.text!r}")
    return numbers[0]

In [None]:
# Test the function page_num_indeed
page_num_indeed(first_page_soup)

In [None]:
# Define a function to extract all job cards in a Indeed page

def job_cards_indeed(soup):
    '''
    Collect every job card found in the results column of an Indeed page.

    Accepts the Soup object of a results page and returns whatever
    `find_all` yields for the <div class="jobsearch-SerpJobCard"> elements
    located inside the <td id="resultsCol"> element.
    '''
    # Locate the results column, then pull all job cards out of it
    results_column = soup.find('td', id="resultsCol")
    return results_column.find_all('div', class_='jobsearch-SerpJobCard')

In [None]:
# Test the function job_cards_indeed
job_cards = job_cards_indeed(first_page_soup)

# Print the data type of job_cards
type(job_cards)

**Quick Note**: job_cards is a `ResultSet` (a list-like collection), so it supports `len()` and can be looped over more than once

In [None]:
# How many jobs listed in the 1st page? 
len(job_cards)

In [None]:
def job_titles_indeed(job_cards):
    '''
    Extract the job titles from a set of job cards.

    For each card, the text of its <h2 class="title"> element is taken and
    stripped of leading/trailing whitespace. Returns a list of strings in
    the same order as the cards.
    '''
    return [card.find('h2', class_='title').text.strip() for card in job_cards]

In [None]:
titles = job_titles_indeed(job_cards)
titles

In [None]:
# Define a function to pull the company names from a set of job cards

def company_names_indeed(job_cards):
    '''
    Extract the company names from a set of job cards.

    For each card, the text of its <span class="company"> element is taken
    and stripped of leading/trailing whitespace. Returns a list of strings
    in the same order as the cards.
    '''
    return [card.find('span', class_='company').text.strip() for card in job_cards]

In [None]:
# Test the function: company_names_indeed
company_names = company_names_indeed(job_cards)
company_names

In [None]:
# Define a function to pull the post ages from a set of job cards

def post_ages_indeed(job_cards):
    '''
    Extract the post ages from a set of job cards.

    For each card, the text of its <span class="date"> element is taken and
    stripped of leading/trailing whitespace. Returns a list of strings in
    the same order as the cards.
    '''
    return [card.find('span', class_='date').text.strip() for card in job_cards]

In [None]:
# Test the function: post_ages_indeed
ages = post_ages_indeed(job_cards)
ages

In [None]:
# Define a function to pull the location from a set of job cards

def job_locations_indeed(job_cards):
    '''
    Extract the job locations from a set of job cards.

    The location is looked up first in a <div> and then in a <span> carrying
    the classes 'location accessible-contrast-color-location'. When neither
    element is present the location is recorded as 'missing', the same
    placeholder `company_rating_indeed` uses, so one incomplete card does not
    abort the whole page.

    Returns a list of strings in the same order as the cards.
    '''
    location_class = 'location accessible-contrast-color-location'
    locations = []
    for job in job_cards:
        # The location is held in either a <div> or a <span>
        location = job.find('div', class_=location_class)
        if location is None:
            location = job.find('span', class_=location_class)
        if location is None:
            # Neither tag exists on this card
            locations.append('missing')
            continue
        locations.append(location.text.strip())
    return locations

In [None]:
# Test function: job_locations_indeed
locations = job_locations_indeed(job_cards)
locations

In [None]:
# Define a function to pull the company ratings from a set of job cards

def company_rating_indeed(job_cards):
    '''
    Extract the company ratings from a set of job cards.

    For each card, the stripped text of its <span class="ratingsContent">
    element is collected; cards without that element get the placeholder
    'missing'. Returns a list of strings in the same order as the cards.
    '''
    ratings = []
    for card in job_cards:
        rating_tag = card.find('span', class_='ratingsContent')
        # Fall back to the placeholder when the card shows no rating
        ratings.append('missing' if rating_tag is None else rating_tag.text.strip())
    return ratings

In [None]:
ratings = company_rating_indeed(job_cards)
ratings

In [None]:
def acuqire_indeed_job_description(url):
    '''
    Fetch a job posting page and return the text of its description.

    Parameters
    ----------
    url : str
        URL of the job posting.

    Returns
    -------
    str
        Text of the <div id="jobDescriptionText"> element, or the string
        'error' when the page has no such element.

    Side effects: prints the HTTP status code and the page title, and sleeps
    5 seconds after the request.
    '''
    # Make the HTTP request; the timeout keeps a stalled connection from
    # blocking the whole scrape
    response = requests.get(url, timeout=30)
    print("Status Code: ", response.status_code)
    # Take a break between requests
    time.sleep(5)
    # Make a soup variable holding the response content. The constructor
    # always returns an object, so only the lookups below can come back empty.
    soup = BeautifulSoup(response.content, "html.parser")
    # Print the page's title (None when the page has no <title> tag)
    print(soup.title.string if soup.title is not None else None)
    # Find the section that contains the job description
    section = soup.find('div', id="jobDescriptionText")
    if section is None:
        return 'error'
    return section.text

def job_links_and_contents_indeed(job_cards):
    '''
    Collect the link and the description of every job in a set of job cards.

    The link is built from the `href` of the first <a> element of each card,
    prefixed with the Indeed domain and with every ';' replaced by '&'. The
    description is downloaded from that link.

    Returns a tuple (links, descriptions) of two lists in card order.
    '''
    links, descriptions = [], []
    for card in job_cards:
        # Turn the relative href into an absolute URL
        href = card.find('a')['href']
        full_link = ('https://www.indeed.com' + href).replace(';', '&')
        # Download the description before recording the pair
        text = acuqire_indeed_job_description(full_link)
        links.append(full_link)
        descriptions.append(text)
    return links, descriptions

In [None]:
# Test the function: job_links_and_contents_indeed
links, descriptions = job_links_and_contents_indeed(job_cards)

In [None]:
# Define a function to create a Soup object based on a job search url

def page_soup_indeed(url):
    '''
    Fetch a page of job search results at Indeed.com and parse it.

    Parameters
    ----------
    url : str
        URL of the results page.

    Returns
    -------
    BeautifulSoup
        The parsed content of the response.

    Side effects: prints the HTTP status code, the first 15 characters of the
    response text and the page title, and sleeps 5 seconds after the request.
    '''
    # Make the HTTP request; the timeout keeps a stalled connection from
    # blocking the notebook indefinitely
    response = requests.get(url, timeout=30)
    # Print the status code of the request
    print("Status code of the request: ", response.status_code)
    # Sanity check to make sure the document type is HTML
    print("Document type: ", response.text[:15])
    # Take a break between requests
    time.sleep(5)
    # Make a soup to hold the response content
    soup = BeautifulSoup(response.content, "html.parser")
    # A response without a <title> tag leaves soup.title as None; print None
    # rather than failing with AttributeError
    title = soup.title.string if soup.title is not None else None
    print("Title of the response: ", title)
    return soup

In [None]:
# Test the function: page_soup_indeed

url = 'https://www.indeed.com/jobs?q=data+scientist&l=al&sort=date'
soup = page_soup_indeed(url)
type(soup)

In [None]:
# Find out the page number
int(page_num_indeed(soup))

In [None]:
# Pull the job cards from the soup
type(job_cards_indeed(soup))

In [None]:
# Define a function to pull job information from a job search URL

def acquire_page_indeed(url):
    '''
    Scrape one page of Indeed search results into a pandas dataframe.

    Accepts the URL of a results page and returns a dataframe with one row
    per job card and the columns title, location, company, company_rating,
    post_age, job_link and job_description.
    '''
    # Download the page and split it into job cards
    cards = job_cards_indeed(page_soup_indeed(url))
    # Pull each field out of the cards
    titles = job_titles_indeed(cards)
    companies = company_names_indeed(cards)
    ages = post_ages_indeed(cards)
    locations = job_locations_indeed(cards)
    ratings = company_rating_indeed(cards)
    # This step issues one additional request per job card
    links, descriptions = job_links_and_contents_indeed(cards)
    # Assemble the columns into a dataframe
    return pd.DataFrame({
        'title': titles,
        'location': locations,
        'company': companies,
        'company_rating': ratings,
        'post_age': ages,
        'job_link': links,
        'job_description': descriptions,
    })

In [None]:
# Test function acquire_page_indeed (it returns a single dataframe)
df = acquire_page_indeed(url)
df.head(2)

In [None]:
def jobs_indeed(job_title, location):
    '''
    Scrape the job posts returned by an Indeed.com search, page by page.

    Parameters
    ----------
    job_title : str
        Job title to search for.
    location : str
        Location (state) to search in.

    Returns
    -------
    pandas.DataFrame
        One row per job with the columns title, location, company,
        company_rating, post_age, job_link and job_description. The frame is
        empty (same columns) when no page was scraped.

    Side effects: prints progress for each page and the final job count, and
    sleeps 180 seconds after each scraped page.
    '''
    columns = ['title', 'location', 'company', 'company_rating',
               'post_age', 'job_link', 'job_description']
    # Generate the urls based on job title and location (state)
    url = first_page_url = first_page_url_indeed(job_title, location)
    # Counter of the page we expect to be looking at
    counter = 1
    # Collect one dataframe per page and concatenate once at the end;
    # DataFrame.append was removed in pandas 2.0 and copied the frame each time
    page_frames = []
    # Pull the page number actually displayed
    page_num = int(page_num_indeed(url))
    # Stop as soon as the displayed page number differs from the expected one
    # (presumably the site serves an earlier page once `start` runs past the
    # last result -- TODO confirm)
    keep_going = (counter == page_num)
    # Loop through the pages to pull job information (35 pages at most)
    while keep_going and page_num <= 35:
        page_frames.append(acquire_page_indeed(url))
        print("--------------------------------")
        print("Page: ", page_num)
        print("--------------------------------")
        # Long pause between pages
        time.sleep(180)
        # The next page is requested through the `start` query parameter
        relative_url = urllib.parse.urlencode({'start': page_num*10})
        url = first_page_url + '&' + relative_url
        counter = counter + 1
        page_num = int(page_num_indeed(url))
        keep_going = (counter == page_num)
    if page_frames:
        df_jobs = pd.concat(page_frames, ignore_index=True)
    else:
        df_jobs = pd.DataFrame(columns=columns)
    # Print the total number of jobs
    print(f"Total number of {job_title} positions in {location}: ", df_jobs.shape[0])
    return df_jobs

### Data Preparation

In [None]:
# Define a function to remove the duplicates

def remove_duplicates(df):
    '''
    Drop duplicated job posts from the dataframe and return it.

    Rows are duplicates when title, location, company, job_link and
    job_description all match; only the last occurrence is kept. The
    dataframe is modified in place and the same object is returned.
    '''
    # Columns that together identify a job post
    key_columns = ['title', 'location', 'company', 'job_link', 'job_description']
    df.drop_duplicates(subset=key_columns, keep='last', inplace=True)
    return df

In [None]:
# Define a function to compute the date of the job posts

def compute_post_date(df, today=None):
    '''
    Compute the date of each job post from its post age and use it as index.

    Parameters
    ----------
    df : pandas.DataFrame
        Job posts with a `post_age` column holding 'Just posted', 'Today' or
        a text containing a number of days (the first integer is used).
    today : datetime.date, optional
        Reference date the ages are counted back from. Defaults to
        `datetime.date.today()`.

    Returns
    -------
    pandas.DataFrame
        A new dataframe indexed by `date` (datetime.date objects) and sorted
        in descending order. The input dataframe is left untouched.

    Raises
    ------
    ValueError
        If a post age is neither 'Just posted'/'Today' nor contains a number.
    '''
    if today is None:
        today = datetime.date.today()
    # Convert every post age to a post date
    post_date = []
    for age in df.post_age:
        if age in ('Just posted', 'Today'):
            days = 0
        else:
            # Extract the number of days from the text
            numbers = re.findall(r'(\d+)', str(age))
            if not numbers:
                raise ValueError(f"Cannot parse post age: {age!r}")
            days = int(numbers[0])
        post_date.append(today - datetime.timedelta(days=days))
    # assign() works on a copy, so the caller's dataframe does not silently
    # gain a `date` column
    return df.assign(date=post_date).set_index('date').sort_index(ascending=False)

In [None]:
# Define a function to transform old job posts files

def transform_old_file(df, date_string):
    '''
    Convert the post ages of an old daily scrape to post dates.

    Parameters
    ----------
    df : pandas.DataFrame
        Job posts with a `post_age` column holding 'Just posted', 'Today' or
        a text containing a number of days (the first integer is used).
    date_string : str
        ISO formatted date (YYYY-MM-DD) the ages are counted back from.

    Returns
    -------
    pandas.DataFrame
        A new dataframe indexed by `date` (datetime.date objects) and sorted
        in descending order. The input dataframe is left untouched.

    Raises
    ------
    ValueError
        If `date_string` is not a valid ISO date, or a post age is neither
        'Just posted'/'Today' nor contains a number.
    '''
    # Parse the reference date once instead of once per row
    reference_date = datetime.date.fromisoformat(date_string)
    # Convert every post age to a post date
    post_date = []
    for age in df.post_age:
        if age in ('Just posted', 'Today'):
            days = 0
        else:
            # Extract the number of days from the text
            numbers = re.findall(r'(\d+)', str(age))
            if not numbers:
                raise ValueError(f"Cannot parse post age: {age!r}")
            days = int(numbers[0])
        post_date.append(reference_date - datetime.timedelta(days=days))
    # assign() works on a copy, so the caller's dataframe does not silently
    # gain a `date` column
    return df.assign(date=post_date).set_index('date').sort_index(ascending=False)

#### Web Development

In [None]:
# Load web developer job posts in TX today

# Import the file path
database = env_Shi.database

# Read the daily web developer jobs in TX
df_wd_new = pd.read_csv(f"{database}web_developer_tx_indeed_020721.csv", index_col=0)

# Print the dimensionality
print(df_wd_new.shape)

# Print the first two rows
df_wd_new.head(2)

In [None]:
def daily_update_wd(df):
    '''
    Merge one day of scraped web developer job posts into the TX backup file.

    Parameters
    ----------
    df : pandas.DataFrame
        Newly acquired job posts; must contain a `post_age` column, which is
        converted to a post date with `compute_post_date`.

    Returns
    -------
    pandas.DataFrame
        All web developer job posts (old and new) without duplicates, indexed
        by date in descending order.

    Side effects: reads and overwrites `df_wd_tx_backup.csv` in the folder
    given by `env_Shi.database`, and prints the number of rows added.
    '''
    # Read the job posts of web developer in TX
    database = env_Shi.database
    backup_path = f"{database}df_wd_tx_backup.csv"
    df_wd_tx = pd.read_csv(backup_path)
    num_jobs = df_wd_tx.shape[0]
    # Convert the date column to datetime type
    df_wd_tx.date = pd.to_datetime(df_wd_tx.date)
    # Set the date column as the index and sort the index
    df_wd_tx = df_wd_tx.set_index('date').sort_index(ascending=False)
    # Add the daily update
    df = compute_post_date(df)
    # compute_post_date indexes by datetime.date objects; convert them so both
    # frames share a datetime index and the combined index can be sorted
    df.index = pd.to_datetime(df.index)
    df_wd_tx = pd.concat([df_wd_tx, df]).sort_index(ascending=False)
    # Remove the duplicates
    df_wd_tx = remove_duplicates(df_wd_tx)
    # Save as csv file
    df_wd_tx.to_csv(backup_path)
    # Print the number of rows added by this update
    num_new_jobs = df_wd_tx.shape[0] - num_jobs
    print("New Jobs Posted Today: ", num_new_jobs)
    return df_wd_tx

In [None]:
# Test function: daily_update_wd

df_test = daily_update_wd(df_wd_new)
df_test.head(2)

In [None]:
df_test.info()

In [None]:
# Define a function to prepare the job posts of web developer

def prepare_job_posts_indeed_wd():
    '''
    Clean the csv file of web developer job posts and save the result as json.

    Reads `df_wd_tx_backup.csv` from the folder given by `env_Shi.database`,
    derives the columns city, state and zipcode from `location`, replaces the
    rating placeholder 'missing' with 0, drops `post_age` and `location`,
    runs `MVP_Bojado.prep_job_description_data` on `job_description` and
    writes the result to `df_wd_tx_prepared_backup.json` (orient='records').

    Returns
    -------
    pandas.DataFrame
        The prepared job posts.
    '''
    # Read the job posts of web developer in TX
    database = env_Shi.database
    df = pd.read_csv(f"{database}df_wd_tx_backup.csv")
    # Split the location at the first ', ' only, and force exactly two
    # columns, so locations with no comma or with several commas cannot
    # break the column assignment below
    location = df.location.str.split(', ', n=1, expand=True).reindex(columns=[0, 1])
    location.columns = ['city', 'zipcode']
    # Country- or state-wide posts have no city: mark them with 0
    location.city = location.city.apply(lambda i: 0 if i in ('United States', 'Texas') else i)

    def first_number(value):
        # First run of digits in the text, or 0 when there is none
        numbers = re.findall(r"(\d+)", str(value))
        return numbers[0] if numbers else 0

    location.zipcode = location.zipcode.apply(first_number)
    df['city'] = location.city
    df['state'] = 'TX'
    df['zipcode'] = location.zipcode
    # Replace the missing values in the company rating with 0
    df.company_rating = df.company_rating.apply(lambda i: 0 if i == 'missing' else i)
    # Drop the column post_age and location
    df = df.drop(columns=['post_age', 'location'])
    # Clean the text in the job description (helper defined in MVP_Bojado)
    df = MVP_Bojado.prep_job_description_data(df, 'job_description')
    # Save a JSON version of the prepared data
    df.to_json(f"{database}df_wd_tx_prepared_backup.json", orient='records')
    return df

In [None]:
%%time

# Test the function: prepare_job_posts_indeed_wd
df_test = prepare_job_posts_indeed_wd()
df_test.head(2)

In [None]:
df_test.info()

In [None]:
# Define the columns for identifying duplicates
columns = ['date', 'title', 'company', 'job_link', 'job_description', 'city', 'state', 'zipcode']
   
# Check for duplicates
duplicates = df_test.duplicated(subset=columns,keep='last')
duplicates.sum()

In [None]:
# Read the json file; the `with` block closes the file once it is parsed

with open(f"{database}df_wd_tx_prepared_backup.json") as result:
    parsed = json.load(result)
parsed[0]

#### Data Scientist

In [None]:
# # Load old data scientist job posts in TX

# # Import the file path
# database = env_Shi.database

# # Read the daily data scientist jobs in TX
# df_ds_old = pd.read_csv(f"{database}data_scientist_tx_indeed_020221.csv", index_col=0)

# # Print the first 2 rows
# df_ds_old.head(2)

# # Transform old file

# df_test = transform_old_file(df_ds_old, '2021-02-02')
# df_test.head(2)

In [None]:
# Load data scientist job posts in TX (the file name suggests 2021-02-06)

# Import the file path
database = env_Shi.database

# Read the daily data scientist jobs in TX
df_ds_new = pd.read_csv(f"{database}data_scientist_tx_indeed_020621.csv", index_col=0)

# Inspect the first 2 rows of the new posts
df_ds_new.head(2)

In [None]:
# Test the function: compute_post_date

df_test = compute_post_date(df_ds_new)
df_test.head(2) # Works

In [None]:
def daily_update_ds(df):
    '''
    Merge one day of scraped data scientist job posts into the TX backup file.

    Parameters
    ----------
    df : pandas.DataFrame
        Newly acquired job posts; must contain a `post_age` column, which is
        converted to a post date with `compute_post_date`.

    Returns
    -------
    pandas.DataFrame
        All data scientist job posts (old and new) without duplicates,
        indexed by date in descending order.

    Side effects: reads and overwrites `df_ds_tx_backup.csv` in the folder
    given by `env_Shi.database`, and prints the number of rows added.
    '''
    # Read the job posts of data scientist in TX
    database = env_Shi.database
    backup_path = f"{database}df_ds_tx_backup.csv"
    df_ds_tx = pd.read_csv(backup_path)
    num_jobs = df_ds_tx.shape[0]
    # Convert the date column to datetime type
    df_ds_tx.date = pd.to_datetime(df_ds_tx.date)
    # Set the date column as the index and sort the index
    df_ds_tx = df_ds_tx.set_index('date').sort_index(ascending=False)
    # Add the daily update
    df = compute_post_date(df)
    # compute_post_date indexes by datetime.date objects; convert them so both
    # frames share a datetime index and the combined index can be sorted
    df.index = pd.to_datetime(df.index)
    df_ds_tx = pd.concat([df_ds_tx, df]).sort_index(ascending=False)
    # Remove the duplicates
    df_ds_tx = remove_duplicates(df_ds_tx)
    # Save as csv file
    df_ds_tx.to_csv(backup_path)
    # Print the number of rows added by this update
    num_new_jobs = df_ds_tx.shape[0] - num_jobs
    print("New Jobs Posted Today: ", num_new_jobs)
    return df_ds_tx

In [None]:
# Test the function: daily_update_ds

df_test = daily_update_ds(df_ds_new)
df_test.head() # Works

In [None]:
# Print the information of the dateframe
df_test.info()

In [None]:
# Define a function to prepare the job post for exploration

def prepare_job_posts_indeed_ds():
    '''
    Clean the csv file of data scientist job posts and save the result as json.

    Reads `df_ds_tx_backup.csv` from the folder given by `env_Shi.database`,
    derives the columns city, state and zipcode from `location`, replaces the
    rating placeholder 'missing' with 0, drops `post_age` and `location`,
    runs `MVP_Bojado.prep_job_description_data` on `job_description` and
    writes the result to `df_ds_tx_prepared_backup.json` (orient='records').

    Returns
    -------
    pandas.DataFrame
        The prepared job posts.
    '''
    # Read the job posts of data scientist in TX
    database = env_Shi.database
    df = pd.read_csv(f"{database}df_ds_tx_backup.csv")
    # Split the location at the first ', ' only, and force exactly two
    # columns, so locations with no comma or with several commas cannot
    # break the column assignment below
    location = df.location.str.split(', ', n=1, expand=True).reindex(columns=[0, 1])
    location.columns = ['city', 'zipcode']
    # Country- or state-wide posts have no city: mark them with 0
    location.city = location.city.apply(lambda i: 0 if i in ('United States', 'Texas') else i)

    def first_number(value):
        # First run of digits in the text, or 0 when there is none
        numbers = re.findall(r"(\d+)", str(value))
        return numbers[0] if numbers else 0

    location.zipcode = location.zipcode.apply(first_number)
    df['city'] = location.city
    df['state'] = 'TX'
    df['zipcode'] = location.zipcode
    # Replace the missing values in the company rating with 0
    df.company_rating = df.company_rating.apply(lambda i: 0 if i == 'missing' else i)
    # Drop the column post_age and location
    df = df.drop(columns=['post_age', 'location'])
    # Clean the text in the job description (helper defined in MVP_Bojado)
    df = MVP_Bojado.prep_job_description_data(df, 'job_description')
    # Save a JSON version of the prepared data
    df.to_json(f"{database}df_ds_tx_prepared_backup.json", orient='records')
    return df

In [None]:
%%time
# Test the function: prepare_job_posts_indeed

df_test = prepare_job_posts_indeed_ds()
df_test.head(2)

In [None]:
df_test.info()

In [None]:
# Read the json file; the `with` block closes the file once it is parsed
database = env_Shi.database
with open(f"{database}df_ds_tx_prepared_backup.json") as result:
    # Print the type of the file
    print(type(result))

    # Parse the JSON content into Python objects
    parsed = json.load(result)
parsed[0]

## Data Exploration

In [None]:
# Create an S3 resource object
s3 = boto3.resource('s3')

# Print the data type of s3
print(type(s3))

# Print the bucket names
for bucket in s3.buckets.all():
    print(bucket.name)

In [None]:
# Create the bucket object

In [10]:
# Read the json file
database = env_Shi.database
df_ds_tx = pd.read_json(f"{database}df_ds_tx_prepared_backup.json")

# Print the number of job posts
print("Number of Job Post: ", df_ds_tx.shape[0])

# Convert the string date to datetime object
df_ds_tx.date = pd.to_datetime(df_ds_tx.date)

# Set the date as the index and sort the dataframe in descending order
df_ds_tx = df_ds_tx.set_index('date').sort_index(ascending=False)
df_ds_tx.head()

Number of Job Post:  1430


Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean,tokenized,stemmed,lemmatized
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-02-06,Wholesale Payments - Small Business Data Scien...,"JPMorgan Chase Bank, N.A.",3.9,https://www.indeed.com/rc/clk?jk=fbd85e1549e4f...,CIB Wholesale Payments Data and Analytics - Da...,Plano,TX,0,cib wholesale payment data analytics data scie...,cib wholesale payments data and analytics data...,cib wholesal payment data and analyt data scie...,cib wholesale payment data and analytics data ...
2021-02-06,Senior Data Scientist\nnew,Vistra Corporate Services Company,3.8,https://www.indeed.com/rc/clk?jk=d96310969eba0...,Responsibilities\n• A person in this role is e...,Irving,TX,0,responsibility person role expected execute en...,responsibilities\n a person in this role is ex...,respons a person in thi role is expect to exec...,responsibility a person in this role is expect...
2021-02-06,"Senior Manager, Data Science\nnew",Dell Technologies,4.0,https://www.indeed.com/rc/clk?jk=13daea469fd59...,"Senior Manager, Data Science (Round Rock TX or...",Round Rock,TX,0,senior manager data science round rock tx remo...,senior manager data science round rock tx or r...,senior manag data scienc round rock tx or remo...,senior manager data science round rock tx or r...
2021-02-06,Senior Data Analyst - US Remote\nnew,UnitedHealth Group,3.7,https://www.indeed.com/rc/clk?jk=81c33357c602f...,Welcome to one of the toughest and most fulfil...,Dallas,TX,75202,welcome one toughest fulfilling way help peopl...,welcome to one of the toughest and most fulfil...,welcom to one of the toughest and most fulfil ...,welcome to one of the toughest and most fulfil...
2021-02-06,Sr. Machine Learning Scientist\nnew,Amazon.com Services LLC,3.6,https://www.indeed.com/rc/clk?jk=2541f249fc43d...,"\nMS or PhD in Artificial Intelligence, Comput...",Austin,TX,0,m phd artificial intelligence computer science...,ms or phd in artificial intelligence computer ...,ms or phd in artifici intellig comput scienc m...,m or phd in artificial intelligence computer s...


In [11]:
# Print the information of the df_ds_tx
df_ds_tx.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1430 entries, 2021-02-06 to 2020-12-22
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            1430 non-null   object 
 1   company          1430 non-null   object 
 2   company_rating   1430 non-null   float64
 3   job_link         1430 non-null   object 
 4   job_description  1430 non-null   object 
 5   city             1430 non-null   object 
 6   state            1430 non-null   object 
 7   zipcode          1430 non-null   int64  
 8   clean            1430 non-null   object 
 9   tokenized        1430 non-null   object 
 10  stemmed          1430 non-null   object 
 11  lemmatized       1430 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 145.2+ KB


In [12]:
# Print the top 5 companies by the number of posts
df_ds_tx.company.value_counts().head()

Cognizant Technology Solutions    53
Dell Technologies                 39
Deloitte                          31
Facebook                          28
USAA                              25
Name: company, dtype: int64

In [13]:
# Print the top 5 cities by the number of posts
df_ds_tx.city.value_counts().head()

Austin         408
Dallas         223
Houston        181
San Antonio    112
Plano          108
Name: city, dtype: int64

In [14]:
# Sanity check: the dataframe has datetime index
df_ds_tx.resample("W").title.count()

date
2020-12-27    392
2021-01-03    136
2021-01-10    152
2021-01-17    123
2021-01-24    288
2021-01-31    231
2021-02-07    108
Freq: W-SUN, Name: title, dtype: int64

### Extract Job Requirements by Regular Expression

In [None]:
# Take a random job link
# (df_ds_tx is indexed by date, so the sampled value is read by position)
job_url = df_ds_tx.job_link.sample(1, random_state=1).iloc[0]
job_url

In [None]:
# Make the request

response = requests.get(job_url)
response.status_code

In [None]:
# Make a soup to hold the response content
soup = BeautifulSoup(response.content, 'html.parser')
soup.title.string

In [None]:
soup.style

In [None]:
print(soup.prettify())

In [None]:
# Create 'words' variable: keep only lowercase letters, digits and whitespace,
# then drop single-character tokens. Filtering the tokens after the split
# avoids gluing the neighbours of a removed character into one word.
words = [[token for token in re.sub(r'[^a-z0-9\s]', '', doc).split() if len(token) > 1]
         for doc in df_ds_tx.clean]

# Add 'words' column to dataframe
# Column will contain lists of separated words in each job description.
# assign() matches the lists to the rows by position; df_ds_tx has a
# date index with repeated values, which index alignment cannot handle
df_ds_tx = df_ds_tx.assign(words=words)

df_ds_tx.head(2)

## Frequency Analysis of Mono-, Bi-, and Tri-grams

### Create a list of all the words that appear in the job descriptions

In [9]:
# Define the function to create the words that appear in the job descriptions

def words_variables_v1(df):
    '''
    Gather the words of every cleaned job description into one string.

    Accepts a dataframe with a `clean` column and returns a dictionary with
    the single key 'frequency', whose value is all cleaned descriptions
    joined by a space.
    '''
    return {'frequency': ' '.join(df.clean)}

In [32]:
# Upgrade the function `words_variables_v1`

def words_variables_v2(df, companies):
    '''
    Gather the words of the cleaned job descriptions, overall and per company.

    Accepts a dataframe with the columns `clean` and `company` plus a list of
    company names. Returns a dictionary whose key 'all' holds every cleaned
    description joined by a space, followed by one key per company holding
    the joined descriptions of that company only.
    '''
    # Start with the words that appear in all the job descriptions
    d_words = {'all': ' '.join(df.clean)}
    # Add the words of each requested company, in the order given
    d_words.update({
        company: ' '.join(df[df.company == company].clean)
        for company in companies
    })
    return d_words

In [15]:
# Test the helper function: words_variables_v1
dic = words_variables_v1(df_ds_tx)

# Print out the keys
print(dic.keys())

# Print the first 100 characters of the value
dic['frequency'][:100]

dict_keys(['frequency'])


'cib wholesale payment data analytics data scientist new rapidly growing team change agent inside jp '

In [33]:
# Test the helper function: words_variables_v2

companies = ['Apple']
dic_v2 = words_variables_v2(df_ds_tx, companies)

# Print out the keys
print(dic_v2.keys())

# Print the first 100 characters of the value of `Apple`
dic_v2['Apple'][:400]

dict_keys(['all', 'Apple'])


'summary posted jan 25 2021 role number200218691 apple great idea way becoming great product service selfmotivated highenergy person afraid challenge looking apple seeking ml advanced research application manager join amp data science analytics covering app store apple music apple tv apple podcasts apple fitness etc amp data science analytics collaborates executive various partner across business p'

### Monogram Analysis

In [16]:
# Define a function to compute the word frequency in the job description

def word_frequency_v1(d_words):
    '''
    Count how often each word occurs in the combined job-description text.

    Parameters
    ----------
    d_words : dict
        Dictionary created by `words_variables_v1`. The text is read from its
        'frequency' entry and its keys are used as the column labels.

    Returns
    -------
    pandas.DataFrame
        One row per distinct word with a single `frequency` column, ordered
        from the most frequent word to the least frequent one.
    '''
    # Split the text on whitespace and tally every token
    tokens = d_words['frequency'].split()
    tally = pd.Series(tokens).value_counts()
    # Attach the tally column-wise to an empty frame, requesting a sorted index
    table = pd.concat([pd.DataFrame(), tally], axis=1, sort=True)
    # Label the columns with the dictionary keys
    table.columns = list(d_words)
    # Most frequent words first
    return table.sort_values(by='frequency', ascending=False)

In [34]:
# Upgrade `word_frequency_v1`

def word_frequency_v2(d_words):
    '''
    Count word occurrences overall and per company.

    Parameters
    ----------
    d_words : dict
        Dictionary created by `words_variables_v2`: maps 'all' and each
        company name to a whitespace-separated string of words.

    Returns
    -------
    pandas.DataFrame
        Indexed by word, with one integer column per key of `d_words`
        (0 where a word never occurs for that key), sorted by the 'all'
        column in descending order.

    Raises
    ------
    KeyError
        If `d_words` has no 'all' entry, because the result is ranked by it.
    '''
    # Fail early with the same error type the final sort would raise
    if 'all' not in d_words:
        raise KeyError('all')
    labels = list(d_words)
    # Tally the words of every text separately
    tallies = [pd.Series(d_words[label].split()).value_counts() for label in labels]
    # Align all tallies in ONE concat; re-concatenating a growing frame inside
    # a loop copies the accumulated data on every iteration
    word_counts = pd.concat(tallies, axis=1, sort=True, keys=labels)
    # Words absent for a key are NaN after alignment: turn them into 0 counts
    word_counts = word_counts.fillna(0).astype(int)
    return word_counts.sort_values(by='all', ascending=False)

In [23]:
# Test the function word_frequency_v1: build the single-word frequency table
# from the dictionary returned by words_variables_v1 and show its top 5 rows

monogram = word_frequency_v1(dic)
monogram.head(5)

Unnamed: 0,frequency
data,14575
experience,8188
business,5210
team,4605
work,4050


In [24]:
# Inspect the number of rows and the column dtype of the single-word table
monogram.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15001 entries, data to oneknowledge
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   frequency  15001 non-null  int64
dtypes: int64(1)
memory usage: 234.4+ KB


### Bigram Analysis

In [None]:
# Test the function word_frequency_v2 on the dictionary holding 'all'
# plus the per-company texts, and show the top 5 rows

df_word_frequency_v2 = word_frequency_v2(dic_v2)
df_word_frequency_v2.head(5)

In [None]:
# Add a 'bigrams' column to df_ds_tx (modifies the dataframe in place):
# for every row, the list of adjacent pairs taken from its `words` entry
# NOTE(review): assumes each `words` entry is already a list of tokens; on a
# plain string nltk.ngrams would pair up characters instead - confirm
df_ds_tx['bigrams'] = [list(nltk.ngrams(wordlist, 2)) for wordlist in df_ds_tx.words]
df_ds_tx.head(2)

In [19]:
def bigrams_frequency_v1(d_words):
    '''
    Count how often each bigram (pair of adjacent words) occurs in the
    combined job-description text.

    Parameters
    ----------
    d_words : dict
        Dictionary created by `words_variables_v1`. The text is read from its
        'frequency' entry and its keys are used as the column labels.

    Returns
    -------
    pandas.DataFrame
        One row per distinct bigram with a single `frequency` column, ordered
        from the most frequent bigram to the least frequent one.
    '''
    # Split the text on whitespace and pair up neighbouring tokens
    tokens = d_words['frequency'].split()
    pairs = list(nltk.ngrams(tokens, 2))
    # Tally every pair
    tally = pd.Series(pairs).value_counts()
    # Attach the tally column-wise to an empty frame, requesting a sorted index
    table = pd.concat([pd.DataFrame(), tally], axis=1, sort=True)
    # Label the columns with the dictionary keys
    table.columns = list(d_words)
    # Most frequent bigrams first
    return table.sort_values(by='frequency', ascending=False)

In [20]:
# Test function bigrams_frequency_v1 and show the most frequent bigrams
bigrams = bigrams_frequency_v1(dic)
bigrams.head()

Unnamed: 0,Unnamed: 1,frequency
machine,learning,2242
data,science,1757
year,experience,1222
data,scientist,1000
computer,science,926


In [None]:
# Define a function to compute the bigrams frequency in the job description

def bigrams_frequency_v2(d_words):
    '''
    Count bigram (adjacent word pair) occurrences overall and per company.

    Parameters
    ----------
    d_words : dict
        Dictionary created by `words_variables_v2`: maps 'all' and each
        company name to a whitespace-separated string of words.

    Returns
    -------
    pandas.DataFrame
        Indexed by bigram, with one integer column per key of `d_words`
        (0 where a bigram never occurs for that key), sorted by the 'all'
        column in descending order.

    Raises
    ------
    KeyError
        If `d_words` has no 'all' entry, because the result is ranked by it.
    '''
    # Fail early with the same error type the final sort would raise
    if 'all' not in d_words:
        raise KeyError('all')
    labels = list(d_words)
    # Tally the bigrams of every text separately
    tallies = [
        pd.Series(list(nltk.ngrams(d_words[label].split(), 2))).value_counts()
        for label in labels
    ]
    # Align all tallies in ONE concat; re-concatenating a growing frame inside
    # a loop copies the accumulated data on every iteration
    bigrams_counts = pd.concat(tallies, axis=1, sort=True, keys=labels)
    # Bigrams absent for a key are NaN after alignment: turn them into 0 counts
    bigrams_counts = bigrams_counts.fillna(0).astype(int)
    return bigrams_counts.sort_values(by='all', ascending=False)

In [None]:
# Compute the overall and per-company bigram frequencies and preview the first rows

bigrams_v2 = bigrams_frequency_v2(dic_v2)
bigrams_v2.head()

### Trigram Analysis

In [21]:
def trigrams_frequency_v1(d_words):
    '''
    Count how often each trigram (three adjacent words) occurs in the
    combined job-description text.

    Parameters
    ----------
    d_words : dict
        Dictionary created by `words_variables_v1`. The text is read from its
        'frequency' entry and its keys are used as the column labels.

    Returns
    -------
    pandas.DataFrame
        One row per distinct trigram with a single `frequency` column, ordered
        from the most frequent trigram to the least frequent one.
    '''
    # Split the text on whitespace and group neighbouring tokens in threes
    tokens = d_words['frequency'].split()
    triples = list(nltk.ngrams(tokens, 3))
    # Tally every triple
    tally = pd.Series(triples).value_counts()
    # Attach the tally column-wise to an empty frame, requesting a sorted index
    table = pd.concat([pd.DataFrame(), tally], axis=1, sort=True)
    # Label the columns with the dictionary keys
    table.columns = list(d_words)
    # Most frequent trigrams first
    return table.sort_values(by='frequency', ascending=False)

In [22]:
# Test function trigrams_frequency_v1 and show the most frequent trigrams

trigrams = trigrams_frequency_v1(dic)
trigrams.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,frequency
sexual,orientation,gender,419
race,color,religion,413
equal,opportunity,employer,361
orientation,gender,identity,345
without,regard,race,280


In [None]:
# Define a function to compute the trigrams frequency in the job description

def trigrams_frequency_v2(d_words):
    '''
    Count trigram (three adjacent words) occurrences overall and per company.

    Parameters
    ----------
    d_words : dict
        Dictionary created by `words_variables_v2`: maps 'all' and each
        company name to a whitespace-separated string of words.

    Returns
    -------
    pandas.DataFrame
        Indexed by trigram, with one integer column per key of `d_words`
        (0 where a trigram never occurs for that key), sorted by the 'all'
        column in descending order.

    Raises
    ------
    KeyError
        If `d_words` has no 'all' entry, because the result is ranked by it.
    '''
    # Fail early with the same error type the final sort would raise
    if 'all' not in d_words:
        raise KeyError('all')
    labels = list(d_words)
    # Tally the trigrams of every text separately
    tallies = [
        pd.Series(list(nltk.ngrams(d_words[label].split(), 3))).value_counts()
        for label in labels
    ]
    # Align all tallies in ONE concat; re-concatenating a growing frame inside
    # a loop copies the accumulated data on every iteration
    trigrams_counts = pd.concat(tallies, axis=1, sort=True, keys=labels)
    # Trigrams absent for a key are NaN after alignment: turn them into 0 counts
    trigrams_counts = trigrams_counts.fillna(0).astype(int)
    return trigrams_counts.sort_values(by='all', ascending=False)

In [None]:
# Test function trigrams_frequency_v2: overall and per-company trigram
# frequencies, previewing the first rows

trigrams_v2 = trigrams_frequency_v2(dic_v2)
trigrams_v2.head()

### Combine Mono-, Bi- and Trigrams

#### Method 1: Simple concatenation

In [29]:
# Total number of rows across the 1-, 2- and 3-gram frequency tables

monogram.shape[0]+bigrams.shape[0]+trigrams.shape[0]

338056

In [31]:
# Stack the three frequency tables row-wise into a single frame
# NOTE(review): rows keep the order monogram -> bigrams -> trigrams; the
# combined frame is not re-sorted by frequency - confirm that is intended

everygram = pd.concat([monogram, bigrams, trigrams])
print(everygram.shape)
everygram.head()

(338056, 1)


Unnamed: 0,frequency
data,14575
experience,8188
business,5210
team,4605
work,4050


### Skills Match Job Search

In [None]:
# Create one boolean mask per skill: True where the cleaned text contains the term
# NOTE(review): str.contains matches substrings, so 'aws' also matches words
# such as 'laws' and 'sql' also matches 'mysql' - confirm this is intended

mask_python = df_ds_tx.clean.str.contains('python')
mask_sql = df_ds_tx.clean.str.contains('sql')
mask_ml = df_ds_tx.clean.str.contains('machine learning')
mask_tableau = df_ds_tx.clean.str.contains('tableau')
mask_aws = df_ds_tx.clean.str.contains('aws')

# Rows mentioning python, sql and tableau together
# (mask_ml and mask_aws are built above but not used in this combination)
mask = mask_python & mask_sql & mask_tableau

In [None]:
# Count the rows of df_ds_tx whose cleaned text mentions python, sql and tableau
# NOTE(review): this counts rows (presumably job postings), not distinct companies - confirm
mask.sum()

In [None]:
# Show the first row that satisfies the combined skills mask
df_ds_tx[mask].head(1)

In [None]:
# Preview the first 100 characters of one cleaned job description
df_ds_tx.clean[0][:100]

### Compute Top 5 Skills in a Predefined Library

In [None]:
# Create a library for all skills

# Single-word skills looked up in the word-frequency table
library = ['python', 'r', 'sql', 'tableau', 'scikitlearn', 'tensorflow', 'pytorch',
           'aws', 'hadoop', 'hive', 'impala', 'matlab', 'model', 'algorithm',
           'storytelling', 'statistic', 'etl', 'exploration', 'extraction',
           'sharepoint', 'dashboard']

# Skills grouped by kind: technical concepts, soft skills and tools.
# 'visualization' was previously misspelled, so it could never match real text.
library_tech = ['programming', 'big data', 'wrangling', 'version control', 'visualization']
library_soft = ['communication', 'business acumen', 'storytelling']
library_tools = ['python', 'git', 'sql', 'pandas']

In [None]:
# data visualization
# big data
# software engineering
# model
# models
# algorithms
# storytelling
# statistic
# statistical
# machine learning
# deep learning
# etl
# extraction
# crud
# exploration

In [None]:
def top_skills_ds_v1(k):
    '''
    Return the k most frequently mentioned skills for data scientist positions.

    The job postings are loaded from `df_tx_ds.csv` under the path held in
    `env_Shi.database`, the word frequencies are computed with
    `words_variables_v1` / `word_frequency_v1`, and the result is restricted
    to a fixed library of skill words.

    Parameters
    ----------
    k : int
        Positive number of skills to return.

    Returns
    -------
    pandas.DataFrame
        Indexed by skill with a single `frequency` column, sorted in
        descending order and truncated to at most k rows. The frame is empty
        when none of the library skills occurs in the job descriptions.
    '''
    # Import the file path
    database = env_Shi.database
    # Load the prepared dataframe with job search results
    postings = pd.read_csv(f"{database}df_tx_ds.csv", index_col=0)
    # Create a string of all words that appear in the job description
    d_words = words_variables_v1(postings)
    # Compute the words frequency
    df_word_frequency = word_frequency_v1(d_words)
    # Define a library that has a complete skillset for data scientist
    library = ['python', 'r', 'sql', 'tableau', 'scikitlearn', 'tensorflow', 'pytorch', 'aws', 'hadoop', 'hive',
        'impala', 'matlab', 'model', 'algorithm', 'storytelling', 'statistic', 'etl', 'exploration', 'extraction',
        'sharepoint', 'dashboard']
    # Keep the library order and skip skills that never appear, then pull all
    # matching rows in one selection instead of concatenating row by row
    present = [skill for skill in library if skill in df_word_frequency.index]
    df_skills = df_word_frequency.loc[present]
    # Rank the skills by how often they are mentioned
    return df_skills.sort_values(by='frequency', ascending=False).head(k)

In [None]:
# Test function top_skills_ds_v1: request the 7 most frequent library skills

top_skills = top_skills_ds_v1(7)
top_skills

In [None]:
# Look up the frequency row of the word 'python'
# NOTE(review): in the cells visible here `df_word_frequency` is only assigned
# inside top_skills_ds_v1, not at notebook level - confirm it is defined earlier
# (or whether `df_word_frequency_v2` was meant). This also rebinds `mask`,
# replacing the combined skills mask created earlier.
mask = (df_word_frequency.index == 'python')
df_word_frequency[mask]

In [None]:
# Look up the frequency row of the word 'r', ranked by the 'all' column
# NOTE(review): sorting by 'all' suggests the per-company table from
# word_frequency_v2 is expected in `df_word_frequency` - confirm where it is defined
mask = (df_word_frequency.index == 'r')
df_word_frequency[mask].sort_values(by='all', ascending=False).head(10)

In [None]:
# Look up the frequency row of the word 'aws', ranked by the 'all' column
# NOTE(review): sorting by 'all' suggests the per-company table from
# word_frequency_v2 is expected in `df_word_frequency` - confirm where it is defined
mask = (df_word_frequency.index == 'aws')
df_word_frequency[mask].sort_values(by='all', ascending=False).head(10)

In [None]:
# Look up the frequency row of the word 'sql', ranked by the 'all' column
# NOTE(review): sorting by 'all' suggests the per-company table from
# word_frequency_v2 is expected in `df_word_frequency` - confirm where it is defined
mask = (df_word_frequency.index == 'sql')
df_word_frequency[mask].sort_values(by='all', ascending=False).head(10)

In [None]:
### Test git push