# MVP

## Imports

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Web Scraping Libraries
import urllib
import requests
from bs4 import BeautifulSoup

# Regex Library
import re

# Time-related Libraries
import time

# NLP Libraries
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Plotly/Dash
import plotly.express as px  # (version 4.7.0)
import plotly.graph_objects as go
import dash  # (version 1.12.0) pip install dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

# Warnings
import warnings
warnings.filterwarnings("ignore")

# 1. Acquire

### Generate the URL of a Job Search at Indeed.com

In [2]:
def first_page_url_indeed(job_title, location):
    '''
    This function returns a URL of the 1st page of a job search at Indeed.com 
    based on the job title and the location.
    '''
    # Create the base URL for a job serch at Indeed.com
    base_url = 'https://www.indeed.com/jobs?'
    # Create a dictionary to map the keys to the input parameters
    dic = {'q': job_title, 'l': location, 'sort': 'date'}
    # Convert the dictionary to a query string
    relative_url = urllib.parse.urlencode(dic)
    # Generate the full URL of the first page
    url = base_url + relative_url
    return url

In [3]:
# Test the function
url = first_page_url_indeed('data scientist', 'al')
url

'https://www.indeed.com/jobs?q=data+scientist&l=al&sort=date'

### Make the HTTP Request

In [4]:
def first_page_soup_indeed(job_title, location):
    '''
    This function returns a BeautifulSoup object to hold the content 
    of the first page of a request for job searching at Indeed.com
    '''
    # Generate the URL of the job search based on title and location
    url = first_page_url_indeed(job_title, location)
    # Make the HTTP request
    response = requests.get(url)
    # Print the status code of the request
    print("Status code of the request: ", response.status_code)
    # Sanity check to make sure the document type is HTML
    print("Document type: ", response.text[:15])
    # Take a break
    time.sleep(5)
    # Make a soup to hold the response content
    soup = BeautifulSoup(response.content, "html.parser")
    # Print out the title of the content
    print("Title of the response: ", soup.title.string)
    return soup

In [5]:
first_page_soup = first_page_soup_indeed("data scientist", 'al')
type(first_page_soup)

Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Alabama | Indeed.com


bs4.BeautifulSoup

In [6]:
# Find out the tag that contains the number of the jobs by seaching

num_jobs = first_page_soup.find('div', id='searchCountPages')
print("Data Type: ", type(num_jobs))
print("Name of the Tag: ", num_jobs.name)
print("Attributes of the Tag: ", num_jobs.attrs)
print("Text within the Tag: ")
num_jobs.text

Data Type:  <class 'bs4.element.Tag'>
Name of the Tag:  div
Attributes of the Tag:  {'id': 'searchCountPages'}
Text within the Tag: 


'\n                    Page 1 of 49 jobs'

In [7]:
# Find the number of the jobs in the text
match = re.findall(r'(\d+)', num_jobs.text)
match[1]

'49'

In [8]:
def num_jobs_indeed(first_page_soup):
    '''
    This function returns the total number of the jobs in the searching result.
    '''
    # Find out the section contains total number of jobs  
    div = first_page_soup.find('div', id='searchCountPages')
    # Extract the number
    num_jobs = re.findall(r'(\d+)', div.text)[1]
    return num_jobs

In [9]:
# Test the function num_jobs_indeed
num_jobs_indeed(first_page_soup)

'49'

In [10]:
def page_num_indeed(url):
    '''
    This function returns the page number of job searching results. 
    '''
    # Create a Soup object based on the url
    soup = page_soup_indeed(url)
    # Find out the section contains total number of jobs  
    div = soup.find('div', id='searchCountPages')
    # Extract the number
    page_num = re.findall(r'(\d+)', div.text)[0]
    return page_num

In [12]:
# Define a function to extract all job cards in a Indeed page

def job_cards_indeed(soup):
    '''
    This function accepts the Soup object of a Indeed page 
    return an iterator containing the all the job cards in this page.
    '''
    # Find the appropriate tag that contains all of the job listings in this page
    tag = soup.find('td', id="resultsCol")
    # Extract all job cards
    job_cards = tag.find_all('div', class_='jobsearch-SerpJobCard')
    return job_cards

In [13]:
# Test the function job_cards_indeed
job_cards = job_cards_indeed(first_page_soup)

# Print the data type of job_cards
type(job_cards)

bs4.element.ResultSet

In [15]:
def job_titles_indeed(job_cards):
    '''
    This function extract the job titles from a job_cards set. 
    '''
    # Create a list to hold the job titles
    titles = []
    # For Loop throught the job cards to extract the titles
    for job in job_cards:
        title = job.find('h2', class_='title')
        title = title.text.strip()
        titles.append(title)
    return titles

In [16]:
titles = job_titles_indeed(job_cards)
titles

['Solution Architect - Data & Analytics\nnew',
 'Video Processing Engineer (Machine Learning)\nnew',
 'Data Management Solution Architect\nnew',
 'STATISTICIAN II\nnew',
 'MANAGER, ACCOUNT DEVELOPMENT/DATA SCIENCE',
 'Data Scientist',
 'Data Analyst - Microsoft Stack (mid-senior)',
 'Data Scientist',
 'Data Scientist Intern',
 'Data Scientist\nnew',
 'Software Engineer/Data Scientist',
 'Statistical Analyst',
 'Lead Financial Analyst - Artificial Intelligence Strategic G...',
 'Artificial Intelligence (AI) and Machine Learning (ML) Engin...\nnew',
 'Principal AI Engineer: DevSecOps\nnew']

In [17]:
# Define a function to pull the company names from a set of job cards

def company_names_indeed(job_cards):
    '''
    This function extracts the company names from a set of job cards.
    '''
    # Create a list to hold the company names
    names = []
    # For loop through the job cards to pull the company names
    for job in job_cards:
        name = job.find('span', class_='company')
        name = name.text.strip()
        names.append(name)
    return names

In [18]:
# Test the function: comany_names_indeed

company_names = company_names_indeed(job_cards)
company_names

['Deloitte',
 'Yardstick Assessment Strategies Inc.',
 'Deloitte',
 'University of Alabama at Birmingham',
 'B.A.S.S., LLC',
 'Quiq Inc',
 'Vaco',
 'Vision',
 'LOCKHEED MARTIN CORPORATION',
 'MTA Inc',
 'Torch Technologies, Inc.',
 'The Personnel Board of Jefferson County',
 'Deloitte',
 'COLSA',
 'Pearson']

In [19]:
# Define a function to pull the post ages from a set of job cards

def post_ages_indeed(job_cards):
    '''
    This function pulls the post ages from a set of job cards.
    '''
    # Create a list to hold the post ages
    ages = []
    # For loop through the job cards to pull the post ages
    for job in job_cards:
        age = job.find('span', class_='date')
        age = age.text.strip()
        ages.append(age)
    return ages

In [20]:
# Test the function: post_ages_indeed
ages = post_ages_indeed(job_cards)
ages

['Today',
 '1 day ago',
 '4 days ago',
 '5 days ago',
 '8 days ago',
 '8 days ago',
 '8 days ago',
 '8 days ago',
 '12 days ago',
 '7 days ago',
 '13 days ago',
 '10 days ago',
 '13 days ago',
 '7 days ago',
 '7 days ago']

In [21]:
# Define a function to pull the location from a set of job cards

def job_locations_indeed(job_cards):
    '''
    This function pulls the job locations from a set of job cards.
    '''
    # Create a list to hold the locations
    locations = []
    # For loop through the job cards to pull the locations
    for job in job_cards:
        location = job.find('div', class_='location accessible-contrast-color-location')
        if location == None:
            location = job.find('span', class_='location accessible-contrast-color-location')
        location = location.text.strip()
        locations.append(location)
    return locations

In [22]:
locations = job_locations_indeed(job_cards)
locations

['Huntsville, AL 35806',
 'Birmingham, AL 35216',
 'Birmingham, AL 35203 (Central City area)',
 'Birmingham, AL 35294',
 'Birmingham, AL 35243',
 'United States',
 'Hartselle, AL',
 'Huntsville, AL',
 'Huntsville, AL 35806',
 'Huntsville, AL 35806',
 'Huntsville, AL 35802',
 'Jefferson County, AL',
 'Birmingham, AL 35203 (Central City area)',
 'Huntsville, AL',
 'Montgomery, AL']

In [23]:
# Define a function to pull the company ratings from a set of job cards

def company_rating_indeed(job_cards):
    '''
    This function pulls the company rating from a set of job cards.
    If the rating is unavailable, it will be marked as 'missing'.
    '''
    # Create a list to hold the locations
    ratings = []
    # For loop through the job cards to pull the locations
    for job in job_cards:
        rating = job.find('span', class_='ratingsContent')
        if rating == None:
            ratings.append('missing')
            continue
        rating = rating.text.strip()
        ratings.append(rating)
    return ratings

In [24]:
ratings = company_rating_indeed(job_cards)
ratings

['4.0',
 'missing',
 '4.0',
 '4.1',
 'missing',
 'missing',
 '3.7',
 'missing',
 '4.0',
 '5.0',
 'missing',
 'missing',
 '4.0',
 '3.9',
 '3.8']

In [25]:
def acuqire_indeed_job_description(url):
    '''
    This function accepts the URL of a job posting and pull its description.
    '''
    # Make the HTTP request
    request = requests.get(url)
    print("Status Code: ", request.status_code)
    # Take a break
    time.sleep(5)
    # Make a soup variable holding the response content
    soup = BeautifulSoup(request.content, "html.parser")
    if soup == None:
        description = 'error'
    else:
        # Print the page's title
        print(soup.title.string)
        # Find the section that contains job description
        description = soup.find('div', id="jobDescriptionText")
        if description == None:
            description = 'error'
        else:
            description = description.text
    return description

def job_links_and_contents_indeed(job_cards):
    '''
    This function pulls the job links and descriptions from a set of job cards.
    '''
    # Create a list to hold the links and descriptions
    links = []
    descriptions = []
    # For loop through the job cards to pull the links and descriptions
    for job in job_cards:
        link = job.find('a')['href']
        link = 'https://www.indeed.com' + link
        link = link.replace(';', '&')
        description = acuqire_indeed_job_description(link)
        links.append(link)
        descriptions.append(description)
    return links, descriptions

In [26]:
# Test the function: job_links_and_contents_indeed
links, descriptions = job_links_and_contents_indeed(job_cards)

Status Code:  200
Solution Architect - Data & Analytics - Huntsville, AL 35806 - Indeed.com
Status Code:  200
Video Processing Engineer (Machine Learning) - Birmingham, AL 35216 - Indeed.com
Status Code:  200
Data Management Solution Architect - Birmingham, AL 35203 - Indeed.com
Status Code:  200
STATISTICIAN II - Birmingham, AL 35294 - Indeed.com
Status Code:  200
MANAGER, ACCOUNT DEVELOPMENT/DATA SCIENCE - Birmingham, AL 35243 - Indeed.com
Status Code:  200
Data Scientist - United States - Indeed.com
Status Code:  200
Vaco Careers and Employment | Indeed.com
Status Code:  200
Data Scientist - Huntsville, AL - Indeed.com
Status Code:  200
Data Scientist Intern - Huntsville, AL 35806 - Indeed.com
Status Code:  200
Data Scientist - Huntsville, AL 35806 - Indeed.com
Status Code:  200
Software Engineer/Data Scientist - Huntsville, AL 35802 - Indeed.com
Status Code:  200
Statistical Analyst - Jefferson County, AL - Indeed.com
Status Code:  200
Lead Financial Analyst - Artificial Intelligen

In [27]:
links

['https://www.indeed.com/rc/clk?jk=aa8db77103adac4c&fccid=9e215d88a6b33622&vjs=3',
 'https://www.indeed.com/rc/clk?jk=e62b97e0c7349e63&fccid=0099988d7a062fd2&vjs=3',
 'https://www.indeed.com/rc/clk?jk=711842cbd46ad126&fccid=9e215d88a6b33622&vjs=3',
 'https://www.indeed.com/rc/clk?jk=1016366ef3995d6f&fccid=61b32dc36af3abe6&vjs=3',
 'https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0DVr7NBsl4cHSE4aBurKzaRDI_6xQvAm7MDK25NP7GKwrPFBHjnIhnnMkAKtOTcAVcwvdJYQ1iX_OUlRkCcJ9lYHn6yw2Xoq1V-R4Wdf5ZtG2pRL3zKYjPaUIJ7FfKiCmO7nZu9zD3akcI0SzfFz7oRw23qh6o47s6hI0zsqL0wfonad9sPhl6t3UDFBwfjeQ8UsdkN1uSmy8FKX2FNZWe-fJW53S8ie_vstN1Ap00o4paMKCeXknkYgIrtJV8rHsi1LgbawmWtUu-OCJMr6-a3MVwpsIXK8Rhu4G2qk0wUYJgawwP0qOykN7COW640VsM-PysyrSSWhE7s4rN3nzmDI8Qk4964IteZ2AqIgcaSnqJxLL8X_iZ2EXQ-wU-94yMhtumDQHeVfZxE-uKpqUbF8uxg1hhVyfJb20-MJgI1qTUbjvT4xv1dynb_8B6oRRV77dRRIy2LzBnQCmzzTeYEY7Y3RpKmqfY=&p=4&fvj=1&vjs=3',
 'https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0B9JzWmK0GDxRzYzuZf9xSyXN8pQP8ihv6GH-rkAji3LyaR-hLXYB_NfHRnu

In [28]:
descriptions[1]

'The Company:\nHeadquartered in Birmingham, AL, ProctorU a growing software business on a mission to uphold the integrity of academic and professional testing. We use technology - not to "catch cheaters" - but to build trust in the expansion of access to education. Our team is responsible for building and supporting digital solutions trusted by over 1,000 institutions in 129 countries who administer over 2,000,000 exams per year and we plan to continue growing!\n\n\nThe Team:\n\nOur Engineering team gets to innovate and experiment daily with some of the latest technologies in our industry for a product that is paving the way in our space. We are always looking for opportunities to learn, grow, and have fun with each other.\n\nThe Product:\n\nLive+ uses a multi-layered approach to proctoring. Live proctors are augmented by AI that helps flag suspicious activity, and the whole process is overseen by interventionists who routinely audit live sessions and are specifically trained to deal w

In [29]:
# Define a function to create a Soup object based on a job search url

def page_soup_indeed(url):
    '''
    This function returns a BeautifulSoup object to hold the content 
    of a page for a job searching results at Indeed.com
    '''
    # Make the HTTP request
    response = requests.get(url)
    # Print the status code of the request
    print("Status code of the request: ", response.status_code)
    # Sanity check to make sure the document type is HTML
    print("Document type: ", response.text[:15])
    # Take a break
    time.sleep(5)
    # Make a soup to hold the response content
    soup = BeautifulSoup(response.content, "html.parser")
    # Print out the title of the content
    print("Title of the response: ", soup.title.string)
    return soup

In [30]:
# Test the function: page_soup_indeed

url = 'https://www.indeed.com/jobs?q=data+scientist&l=al&sort=date'
soup = page_soup_indeed(url)
type(soup)

Status code of the request:  200
Document type:  <!DOCTYPE html>
Title of the response:  Data Scientist Jobs, Employment in Alabama | Indeed.com


bs4.BeautifulSoup

In [37]:
# Pull the job cards from the soup
type(job_cards_indeed(soup))

bs4.element.ResultSet

In [38]:
# Define a function to pull job information from a job search URL

def acquire_page_indeed(url):
    '''
    This function accepts a job search URL and returns a pandas dataframe 
    containing job title, location, company, company rating, post age and description. 
    '''
    # Create a Soup object based on the url
    soup = page_soup_indeed(url)
    # Pull the job cards
    job_cards = job_cards_indeed(soup)
    # Pull the job titles
    titles = job_titles_indeed(job_cards)   
    # Pull the names of the companies
    companies = company_names_indeed(job_cards)
    # Pull the post ages
    ages = post_ages_indeed(job_cards)
    # Pull the job locations
    locations = job_locations_indeed(job_cards)
    # Pull the company ratings
    ratings = company_rating_indeed(job_cards)
    # Pull the hyperlinks and job description
    links, descriptions = job_links_and_contents_indeed(job_cards)    
    # Create a dataframe
    d = {'title': titles,
         'locations': locations,
         'company': companies, 
         'company_rating': ratings,
         'post_age': ages, 
         'job_link': links, 
         'job_description': descriptions}
    df = pd.DataFrame(d)
    return df

In [40]:
def jobs_indeed(job_title, location):
    '''
    This function accepts the job title and location and return 
    the job information pull from Indeed.com.
    '''
    # Generate the urls based on job title and location (state)
    url = first_page_url = first_page_url_indeed(job_title, location)
    # Set up an counter
    counter = 1
    # Create an empty dataframe to hold the job information
    df_jobs = pd.DataFrame(columns = ['title', 'locations', 'company', 'company_rating', 
                                      'post_age','job_link', 'job_description'])
    # Pull the page number
    page_num = int(page_num_indeed(url))
    # Set up an checker
    keep_going = (counter == page_num)   
    # For loop through the urls to pull job information
    while keep_going and page_num <=40:
        df = acquire_page_indeed(url)
        print("--------------------------------")
        print("Page: ", page_num)
        print("--------------------------------")
        df_jobs = df_jobs.append(df, ignore_index=True)
        time.sleep(180)
        dic = {'start': page_num*10}
        relative_url = urllib.parse.urlencode(dic)
        url = first_page_url + '&' + relative_url
        counter = counter + 1
        page_num = int(page_num_indeed(url))
        keep_going = (counter == page_num)
    # Print the total number of jobs
    print(f"Total number of {job_title} positions in {location}: ", df_jobs.shape[0])
    return df_jobs

### Combine Job Search Results

In [None]:
# Load data scientist job posts in TX

# Import the file path
database = env_Shi.database

# Read the data scientist jobs in TX
df_ds_tx1 = pd.read_csv(f"{database}data_scientist_tx_indeed_012121.csv", index_col=0)
df_ds_tx2 = pd.read_csv(f"{database}data_scientist_tx_indeed_012221.csv", index_col=0)
df_ds_tx3 = pd.read_csv(f"{database}data_scientist_tx_indeed_012421.csv", index_col=0)
df_ds_tx4 = pd.read_csv(f"{database}data_scientist_tx_indeed_012521.csv", index_col=0)

In [None]:
# Print out the information
df_ds_tx1.info()

In [None]:
# Print out the information
df_ds_tx2.info()

In [None]:
# Print out the information
df_ds_tx3.info()

In [None]:
# Print out the information
df_ds_tx4.info()

In [None]:
# Combine the results

df_ds_tx = pd.concat([df_ds_tx1, df_ds_tx2, df_ds_tx3, df_ds_tx4], ignore_index=True)
df_ds_tx.info()

In [None]:
df_ds_tx.head(2)

In [None]:
# Compute the number of duplicate rows 
# using columns: title, locations, company, job links and job description

df_ds_tx.duplicated(subset=['title', 'locations', 'company', 'job_link', 'job_description']).sum()

In [None]:
# Compute the number of new jobs posted

num_duplicates = df_ds_tx.duplicated(subset=['title', 'locations', 'company', 
                                              'job_link', 'job_description'], keep=False).sum()
num_new_jobs = df_ds_tx.shape[0] - num_duplicates
num_new_jobs

In [None]:
# Compute the number of unique jobs

num_jobs =  df_ds_tx.shape[0] - 855
num_jobs

In [None]:
df_ds_tx.job_link[20]

In [None]:
df_ds_tx.job_description[0]

# 2. Prepare

In [32]:
# Define a function to remove the duplicates

def remove_duplicates(df):
    '''
    This function removes the duplicates in the dataframe
    '''
    # Define the columns for identifying duplicates
    columns = ['title', 'locations', 'company', 'job_link', 'job_description']
    # Drop the duplicates except for the last occurrence
    df.drop_duplicates(subset=columns, keep='last', inplace=True, ignore_index=True)
    return df

In [33]:
# Test the function

df_ds_tx = remove_duplicates(df_ds_tx)
df_ds_tx.info()

NameError: name 'df_ds_tx' is not defined

In [34]:
# Rank the companies based on the number of posts
df_ds_tx.company.value_counts()

NameError: name 'df_ds_tx' is not defined

In [35]:
# Clean the text in the `job_description` column

df_ds_tx_clean = MVP_Bojado.prep_job_description_data(df_ds_tx, 'job_description')
df_ds_tx.head(2)

NameError: name 'MVP_Bojado' is not defined

In [36]:
df_ds_tx.clean[0]

NameError: name 'df_ds_tx' is not defined

In [None]:
# # Save the cleaned dataframe in the database
# df_ds_tx.to_csv(f"{database}df_tx_ds.csv")

In [None]:
# Create 'words' variable
words = [re.sub(r'([^a-z0-9\s]|\s.\s)', '', doc).split() for doc in df_ds_tx.clean]

# Add 'words' column to dataframe
# Column will contain lists of separated words in each repo
df_ds_tx = pd.concat([df_ds_tx, pd.DataFrame({'words': words})], axis=1)

df_ds_tx.head(2)

# 3. Explore

### Frequency Analysis of Mono-, Bi-, and Tri-grams

In [None]:
# Define the function to create the words that appear in the job descriptions

def words_variables(df, companies):
    '''
    This function accepts the dataframe containing cleaned job description and 
    a list of company names and return a dictionary in which the values are the words 
    that appear in the job description. 
    '''
    # Create the words that appear all the job descritipons
    all_words = ' '.join(df.clean)
    # Create a dictionary to hold the variable all_words
    d_words = {'all': all_words}
    # For loop the companies and create the words that appear in their job descriptions
    for company in companies:
        mask = (df.company == company)
        s_company = df[mask].clean
        words = ' '.join(s_company)
        d_words[company] = words
    return d_words

In [None]:
# Apply the helper function: words_variables

companies = ['Apple', 'Deloitte', 'USAA']
dic = words_variables(df_ds_tx, companies)

dic['all'][:400]

In [None]:
dic['Apple'][:400]

In [None]:
# Read the keys in the dictionary
dic.keys()

In [None]:
# Define a function to compute the word frequency in the job description

def word_frequency(d_words):
    '''
    This function accept the dictionary created by function words_variables
    and return the word frequency in the job description. 
    '''
    # Read the company names from the dictionary
    companies = d_words.keys()
    # Create a dataframe to hold the word frequency
    word_counts = pd.DataFrame()
    # For loop through the companies and generate the word frequency in their job descriptions
    for company in companies:
        freq = pd.Series(d_words[company].split()).value_counts()
        word_counts = pd.concat([word_counts, freq], axis=1, sort=True)
    word_counts.columns = companies
    word_counts = word_counts.fillna(0).apply(lambda s: s.astype(int))
    word_counts.sort_values(by='all', ascending=False, inplace=True)
    return word_counts

In [None]:
# Test the function word_frequency

df_word_frequency = word_frequency(dic)
df_word_frequency.head(5)

In [None]:
df_word_frequency.info()

In [None]:
# Added 'Bigram' column to dataframe
df_ds_tx['bigrams'] = [list(nltk.ngrams(wordlist, 2)) for wordlist in df_ds_tx.words]
df_ds_tx.head()

In [None]:
# Define a function to compute the bigrams frequency in the job description

def bigrams_frequency(d_words):
    '''
    This function accept the dictionary created by function words_variables
    and return the bigrams frequency in the job description. 
    '''
    # Read the company names from the dictionary
    companies = d_words.keys()
    # Create a dataframe to hold the word frequency
    bigrams_counts = pd.DataFrame()
    # For loop through the companies and generate the word frequency in their job descriptions
    for company in companies:
        freq = pd.Series(list(nltk.ngrams(d_words[company].split(), 2))).value_counts()
        bigrams_counts = pd.concat([bigrams_counts, freq], axis=1, sort=True)
    bigrams_counts.columns = companies
    bigrams_counts = bigrams_counts.fillna(0).apply(lambda s: s.astype(int))
    bigrams_counts.sort_values(by='all', ascending=False, inplace=True)
    return bigrams_counts

In [None]:
# Compute bigrams_frequency

bigrams = bigrams_frequency(dic)
bigrams.head()

In [None]:
# Define a function to compute the trigrams frequency in the job description

def trigrams_frequency(d_words):
    '''
    This function accept the dictionary created by function words_variables
    and return the trigrams frequency in the job description. 
    '''
    # Read the company names from the dictionary
    companies = d_words.keys()
    # Create a dataframe to hold the word frequency
    trigrams_counts = pd.DataFrame()
    # For loop through the companies and generate the word frequency in their job descriptions
    for company in companies:
        freq = pd.Series(list(nltk.ngrams(d_words[company].split(), 3))).value_counts()
        trigrams_counts = pd.concat([trigrams_counts, freq], axis=1, sort=True)
    trigrams_counts.columns = companies
    trigrams_counts = trigrams_counts.fillna(0).apply(lambda s: s.astype(int))
    trigrams_counts.sort_values(by='all', ascending=False, inplace=True)
    return trigrams_counts

In [None]:
# Compute trigrams_frequency

trigrams = trigrams_frequency(dic)
trigrams.head()

### Skills Match Job Search

In [None]:
# Create the masks for different skills

mask_python = df_ds_tx.clean.str.contains('python')
mask_sql = df_ds_tx.clean.str.contains('sql')
mask_ml = df_ds_tx.clean.str.contains('machine learning')
mask_tableau = df_ds_tx.clean.str.contains('tableau')
mask_aws = df_ds_tx.clean.str.contains('aws')

mask = mask_python & mask_sql & mask_tableau

In [None]:
# How many companies need all three skills: python, sql and tableau
mask.sum()

In [None]:
df_ds_tx[mask].head(2)

In [None]:
df_ds_tx.clean[0]

### Compute Top 5 Skills in a Predifined Library

In [None]:
# Create a library for all skills

library = ['python', 'r', 'sql', 'tableau', 'scikitlearn', 'tensorflow', 'pytorch', 
           'aws', 'hadoop', 'hive', 'impala', 'matlab', 'model', 'algorithm', 
           'storytelling', 'statistic', 'etl', 'exploration', 'extraction', 
           'sharepoint', 'dashboard']

In [None]:
# data visualization
# big data
# software engineering
# model
# models
# algorithms
# storytelling
# statistic
# statistical
# machine learning
# deep learning
# etl
# extraction
# crud
# exploration

In [None]:
def top_skills_ds(company, k):
    '''
    '''
    # Import the file path
    database = env_Shi.database
    # Load the prepared dataframe with job search results
    df = pd.read_csv(f"{database}df_tx_ds.csv", index_col=0)
    # Create the words that appear in the job description
    dic = words_variables(df, company)
    df_word_frequency = word_frequency(dic)
    # Define a library that has a complete sillset for data scientist
    library = ['python', 'r', 'sql', 'tableau', 'scikitlearn', 'tensorflow', 'pytorch', 'aws', 'hadoop', 'hive', 
        'impala', 'matlab', 'model', 'algorithm', 'storytelling', 'statistic', 'etl', 'exploration', 'extraction', 
        'sharepoint', 'dashboard']
    # Create a empty dataframe to hold the rank of the skills
    df_skills = pd.DataFrame()
    # For loop through the library to find out the frequency of the skills mentioned in the job description
    for skill in library:
        mask = (df_word_frequency.index == skill)
        df = df_word_frequency[mask]
        df_skills = pd.concat([df_skills, df])
    df_skills.sort_values(by=company, ascending=False, inplace=True)
    return df_skills.head(k)

In [None]:
# Test function top_skills_ds

top_skills = top_skills_ds(['Apple'],10)
top_skills

In [None]:
mask = (df_word_frequency.index == 'python')
df_word_frequency[mask]

In [None]:
mask = (df_word_frequency.index == 'r')
df_word_frequency[mask].sort_values(by='all', ascending=False).head(10)

In [None]:
mask = (df_word_frequency.index == 'aws')
df_word_frequency[mask].sort_values(by='all', ascending=False).head(10)

In [None]:
mask = (df_word_frequency.index == 'sql')
df_word_frequency[mask].sort_values(by='all', ascending=False).head(10)

In [None]:
# Word Clouds
python_cloud = WordCloud(background_color='white',
                      height=800, width=800).generate(python_words)

c_plus_plus_cloud = WordCloud(background_color='white', 
                      height=800, width=800).generate(c_plus_plus_words)

plt.figure(figsize=(15,15))
axs = [plt.axes([.25, 1, .5, .5]), plt.axes([.8, 1, .5, .5])]

# imshow => display data as an image
axs[0].imshow(python_cloud)
axs[1].imshow(c_plus_plus_cloud)

axs[0].set_title('Python')
axs[1].set_title('C++')

for ax in axs: ax.axis('off')

# 4. Model

In [None]:
import pandas as pd
import plotly.express as px  # (version 4.7.0)
import plotly.graph_objects as go

import dash  # (version 1.12.0) pip install dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

app = dash.Dash(__name__)

# ---------- Import and clean data (importing csv into pandas)
# df = pd.read_csv("intro_bees.csv")
df = pd.read_csv("https://raw.githubusercontent.com/Coding-with-Adam/Dash-by-Plotly/master/Other/Dash_Introduction/intro_bees.csv")

df = df.groupby(['State', 'ANSI', 'Affected by', 'Year', 'state_code'])[['Pct of Colonies Impacted']].mean()
df.reset_index(inplace=True)
print(df[:5])

# ------------------------------------------------------------------------------
# App layout
app.layout = html.Div([

    html.H1("Web Application Dashboards with Dash", style={'text-align': 'center'}),

    dcc.Dropdown(id="slct_year",
                 options=[
                     {"label": "2015", "value": 2015},
                     {"label": "2016", "value": 2016},
                     {"label": "2017", "value": 2017},
                     {"label": "2018", "value": 2018}],
                 multi=False,
                 value=2015,
                 style={'width': "40%"}
                 ),

    html.Div(id='output_container', children=[]),
    html.Br(),

    dcc.Graph(id='my_bee_map', figure={})

])


# ------------------------------------------------------------------------------
# Connect the Plotly graphs with Dash Components
@app.callback(
    [Output(component_id='output_container', component_property='children'),
     Output(component_id='my_bee_map', component_property='figure')],
    [Input(component_id='slct_year', component_property='value')]
)
def update_graph(option_slctd):
    print(option_slctd)
    print(type(option_slctd))

    container = "The year chosen by user was: {}".format(option_slctd)

    dff = df.copy()
    dff = dff[dff["Year"] == option_slctd]
    dff = dff[dff["Affected by"] == "Varroa_mites"]

    # Plotly Express
    fig = px.choropleth(
        data_frame=dff,
        locationmode='USA-states',
        locations='state_code',
        scope="usa",
        color='Pct of Colonies Impacted',
        hover_data=['State', 'Pct of Colonies Impacted'],
        color_continuous_scale=px.colors.sequential.YlOrRd,
        labels={'Pct of Colonies Impacted': '% of Bee Colonies'},
        template='plotly_dark'
    )

    # Plotly Graph Objects (GO)
    # fig = go.Figure(
    #     data=[go.Choropleth(
    #         locationmode='USA-states',
    #         locations=dff['state_code'],
    #         z=dff["Pct of Colonies Impacted"].astype(float),
    #         colorscale='Reds',
    #     )]
    # )
    #
    # fig.update_layout(
    #     title_text="Bees Affected by Mites in the USA",
    #     title_xanchor="center",
    #     title_font=dict(size=24),
    #     title_x=0.5,
    #     geo=dict(scope='usa'),
    # )

    return container, fig


# ------------------------------------------------------------------------------
if __name__ == '__main__':
    app.run_server(debug=True)

# 5. Conclusions