In [1]:
# Import the necessary libraries for webscraping

import requests     # Pull raw HTML from site
from bs4 import BeautifulSoup     # Parsing library that pulls data from HTML/XML code
from lxml import html     # High-speed parsing library used with BeautifulSoup


# Import libraries for numerical work and for building the DataFrame
import numpy as np     # Scientific computing
import pandas as pd     # Build out DataFrame
import scipy.stats as stats

# Import libraries for plotting and visualizations
import matplotlib.pyplot as plt
import seaborn as sns

import time
import regex as re
import pickle

# Apply seaborn's "whitegrid" style to the plots drawn in this notebook
sns.set_style("whitegrid")     # Control the appearances of the plots

# IPython magics (kept on their own lines, without trailing comments):
#   - render matplotlib figures inline in the cell output
#   - ask the inline backend for high-resolution ('retina') figures
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Search configuration read by the scraping cell.

# Country codes used to build the Indeed host name ('my' is currently disabled)
country_set = [
    'sg',
]

# Free-text queries; each one is searched in every country listed above
search_string = [
    'data analyst',
]

# Column names, in order, of the DataFrame built from the scraped records
columns = [
    "job_category",
    "job_title",
    "company_name",
    "location",
    "summary",
    "salary",
]

In [3]:
# Scrape Indeed search results for every (country, query) pair and collect one
# record per matching <div> in `jobs_list`. Each record follows the order of
# `columns`: job_category, job_title, company_name, location, summary, salary.

REQUEST_DELAY = 1       # Seconds to pause before every request
REQUEST_TIMEOUT = 30    # Seconds to wait for a response before giving up
RESULTS_PER_PAGE = 10   # Step of the `start` URL parameter between result pages

# Matches any <div> whose class contains "row"; compiled once instead of per page
JOB_CARD_CLASS = re.compile('.*row.*')


def _fetch_soup(url):
    """Download `url` and return the response body parsed with lxml.

    Sleeps for REQUEST_DELAY seconds first so consecutive requests are spaced
    out. Raises `requests.HTTPError` on a 4xx/5xx response instead of parsing
    an error page as if it were search results.
    """
    time.sleep(REQUEST_DELAY)
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
    return BeautifulSoup(page.text, 'lxml')


def _parse_job_count(soup):
    """Return the total number of jobs announced in the `searchCount` div.

    The text is expected to look like "Page 1 of 1,234 jobs"; the last number
    in it is taken as the total. Returns 0 when the div is missing or contains
    no number, so the caller can skip the query instead of crashing.
    """
    count_tag = soup.find('div', {'id': 'searchCount'})
    if count_tag is None:
        return 0
    numbers = re.findall(r'\d+', count_tag.get_text().replace(',', ''))
    return int(numbers[-1]) if numbers else 0


def _clean_text(tag):
    """Return the tag's text without newlines or surrounding whitespace.

    Returns `np.nan` when `tag` is None (the element was not found).
    """
    if tag is None:
        return np.nan
    return tag.get_text().replace('\n', '').strip()


# Initialize container to store all job postings
jobs_list = []

# Iterate through search parameters and store one record per job posting
for country in country_set:
    for query in search_string:
        query_words = query.split()
        job_category = '_'.join(query_words)

        # The host is built as www.indeed.com.<country>; the page offset is
        # appended after '&start=' for each results page
        url = 'https://www.indeed.com.' + country + '/jobs?q=' + '+'.join(query_words) + '&start='
        print(url)

        # The first page (empty offset) is only used to read the total job count
        max_jobs = _parse_job_count(_fetch_soup(url))
        if max_jobs == 0:
            print('No job count found at ' + url + ' - skipping this search')
            continue

        for start_number in range(0, max_jobs, RESULTS_PER_PAGE):
            soup = _fetch_soup(url + str(start_number))

            # Get all advertised job descriptions on this page
            jobs = soup.find_all(name='div', attrs={'class': JOB_CARD_CLASS})

            for job in jobs:
                # Missing elements become np.nan via _clean_text
                jobs_list.append([
                    job_category,
                    _clean_text(job.find(name='a', attrs={'data-tn-element': 'jobTitle'})),
                    _clean_text(job.find(name='span', attrs={'class': 'company'})),
                    _clean_text(job.find(name='span', attrs={'class': 'location'})),
                    _clean_text(job.find(name='span', attrs={'class': 'summary'})),
                    _clean_text(job.find(name='span', attrs={'class': 'no-wrap'})),
                ])

# Build the DataFrame from the scraped records, keep only the first posting
# for each distinct summary, and renumber the rows from 0
df = (
    pd.DataFrame(jobs_list, columns=columns)
    .drop_duplicates(subset=['summary'])
    .reset_index(drop=True)
)
df.info()

https://www.indeed.com.sg/jobs?q=data+analyst&start=
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 728 entries, 0 to 727
Data columns (total 6 columns):
job_category    728 non-null object
job_title       728 non-null object
company_name    712 non-null object
location        720 non-null object
summary         728 non-null object
salary          29 non-null object
dtypes: object(6)
memory usage: 34.2+ KB


In [4]:
# Write the de-duplicated postings to disk as CSV (the file name has no extension)
output_path = "DA_Search"
df.to_csv(output_path)