# DSI2 Project 4 - Job Category Web Scraping

The goal is to classify jobs into 3 categories (data analyst, data engineer, data scientist) based on the job description summary.

In [1]:
import pandas as pd
import numpy as np
import pickle

import requests
from bs4 import BeautifulSoup

import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: use seaborn's "whitegrid" theme for all figures.
sns.set_style("whitegrid")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

In [2]:
# Scrape indeed.com.sg search results for 3 specific jobs: 'data analyst', 'data engineer', 'data scientist'.
# Uses Indeed mobile version.
#
# The search is paged through the `start` query parameter (0, 10, ..., 540);
# every job-title heading on each page contributes one link to `links`.

SEARCH_URL = 'https://www.indeed.com.sg/m/jobs?q=%28analyst+or+engineer+or+scientist%29+title%3Adata&l=Singapore&start='
JOB_URL_PREFIX = 'https://www.indeed.com.sg/m/'
TOTAL_RESULTS = 550    # stop offset for paging (exclusive)
RESULTS_PER_PAGE = 10  # step between offsets; presumably the page size of the mobile site
REQUEST_TIMEOUT = 30   # seconds; without a timeout a stalled connection can hang the cell

links = []

for search in range(0, TOTAL_RESULTS, RESULTS_PER_PAGE):
    r = requests.get(SEARCH_URL + str(search), timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error: parsing an error page would just yield
    # zero headings and silently shrink the result set.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    for result in soup.find_all('h2', attrs={'class': 'jobTitle'}):
        anchor = result.find('a')
        # A heading without a link has no job page to visit; skip it
        # instead of failing on `None['href']`.
        if anchor is not None:
            links.append(JOB_URL_PREFIX + anchor['href'])

# Parenthesised form prints the same value on Python 2 and Python 3.
print(len(links))

550


In [14]:
# Go through each search result and extract the job description.
# The description is the stripped text of the page's <div id="desc"> element.
# NOTE(review): the job-title cell requests these same URLs a second time;
# consider extracting both fields from a single response.

desc_results = []

for url in links:
    # Timeout (seconds) so a stalled connection cannot hang the cell indefinitely.
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    desc_tag = soup.find('div', attrs={'id': 'desc'})
    if desc_tag is None:
        # Name the offending URL rather than failing with an opaque
        # AttributeError on `None.text`.
        raise ValueError('No job description (div#desc) found at ' + url)
    desc_results.append(desc_tag.text.strip())

In [5]:
# Go through each search result and extract the job title.
# The title is the text of the page's <font size="+1"> element.
# NOTE(review): the job-description cell requests these same URLs as well;
# consider extracting both fields from a single response.

job_results = []

for url in links:
    # Timeout (seconds) so a stalled connection cannot hang the cell indefinitely.
    r = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')
    title_tag = soup.find('font', attrs={'size': '+1'})
    if title_tag is None:
        # Name the offending URL rather than failing with an opaque
        # AttributeError on `None.text`.
        raise ValueError('No job title (font size=+1) found at ' + url)
    job_results.append(title_tag.text)

In [15]:
# Assemble the scraped job titles and descriptions into one dataframe,
# with the column order given by `columns`.

columns = ['job_title', 'description']
jobs2 = pd.DataFrame(
    {'job_title': job_results, 'description': desc_results},
    columns=columns,
)
jobs2.head()

Unnamed: 0,job_title,description
0,Data analyst,The Manager/Senior Manager uses data analytics...
1,Data Analyst,Job DescriptionsAnalyse customer and campaign ...
2,Data Analyst,Job SummaryWe are looking for Data Analyst who...
3,Regional Data Analyst (Reporting and Automation),Get to know the role:\nAs our reporting and au...
4,Junior Data Scientist,The Data Scientist will use information and mo...


In [7]:
# Normalise both text columns to lower case.

for text_column in ['job_title', 'description']:
    jobs2[text_column] = jobs2[text_column].str.lower()

In [8]:
# Categorize job titles into 3 groups and collect the labels in a list.
# Labels: 0 = 'analyst', 1 = 'engineer', 2 = 'scientist', 'NA' = none of these
# keywords appears in the title.
# Keywords are tried in the order listed, so a title containing several of them
# receives the label of the first one that matches.

keyword_labels = [('analyst', 0), ('engineer', 1), ('scientist', 2)]

category = [
    next((label for keyword, label in keyword_labels if keyword in title), 'NA')
    for title in jobs2['job_title'].values
]

In [9]:
# Sanity check: dataframe dimensions, followed by the full list of category labels.
# Parenthesised print gives the same output on Python 2 and Python 3.
print(jobs2.shape)

category

(550, 2)


[0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 2,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 0,
 0,
 'NA',
 1,
 'NA',
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 'NA',
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 'NA',
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 2,
 0,
 'NA',
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 2,
 1,
 1,
 0,
 0,
 'NA',
 0,
 0,
 2,
 1,
 'NA',
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 'NA',
 1,
 1,
 'NA',
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 'NA',
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 'NA',
 2,
 0,
 2,
 2,
 0,
 2,
 2,
 2,
 'NA',
 1,
 2,
 2,
 1,
 2,
 1,
 0,
 0,
 1,
 1,
 2,
 1,
 2,
 1,
 2,
 'NA',
 'NA',
 'NA',
 2,
 2,
 2,

In [12]:
# Attach the category labels to the dataframe as a new 'category' column.

jobs2 = jobs2.assign(category=category)

jobs2.head()

Unnamed: 0,job_title,description,category
0,data analyst,the manager/senior manager uses data analytics...,0
1,data analyst,job descriptionsanalyse customer and campaign ...,0
2,data analyst,job summarywe are looking for data analyst who...,0
3,regional data analyst (reporting and automation),get to know the role:\nas our reporting and au...,0
4,junior data scientist,the data scientist will use information and mo...,2


In [13]:
# Serialize the jobs2 dataframe to "jobs2.pickle" in the current working directory.
# The `with` block closes the file even if pickle.dump raises.
with open("jobs2.pickle", "wb") as pickle_out:
    pickle.dump(jobs2, pickle_out)