<h1 align="center"> Coursera Job Scraper </h1>

<h3> Dependencies </h3>

In [148]:
# Uncomment and run this cell to install the required libraries
# !pip install beautifulsoup4
# !pip install pandas
# !pip install requests
# !pip install regex
# !pip install urlparse4
# !pip install lxml

<h3> Start </h3>
For re-running the code, run from this cell

In [149]:
# Clear all user-defined names from the kernel namespace; -f skips the confirmation prompt
%reset -f

<h3> Importing the required libraries </h3>

In [150]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
from lxml import etree
from urllib import parse

<h3> Sending a get request to get all the jobs </h3>

In [151]:
# Greenhouse job-board embed page for Coursera
url = "https://boards.greenhouse.io/embed/job_board?for=coursera"
# Send the GET request; the timeout (in seconds) keeps the notebook from
# hanging indefinitely on a stalled connection
r = requests.get(url, timeout=30)
# Stop here with an HTTPError rather than parsing an error page as if it were the board
r.raise_for_status()

<h3> Creating a BeautifulSoup Object </h3>
<ul>
<li> Using the beautifulSoup library to parse the html document </li>
<li> Storing the beautifulSoup object in the soup variable for further use </li>
</ul>

In [152]:
# Parse the body of the board response with the 'html.parser' backend
soup = BeautifulSoup(markup=r.text, features='html.parser')

<h3> Extracting the url </h3>
<ul>
<li> Using the .find_all method to extract the url </li>
<li> The url was stored as a value of the attribute "href" </li>
</ul>

In [153]:
# Gather the href of every <a> tag whose link starts with "https://"
all_urls = [
    anchor.get('href')
    for anchor in soup.find_all('a', attrs={'href': re.compile("^https://")})
]

<h3> Storing all the tokens in a list </h3>
<ul>
<li> Using the urllib.parse module, all the urls are parsed to extract the job id (the gh_jid query parameter) </li>
<li> Then this job id is passed as a token to get the html document using requests</li>
</ul>

In [154]:
# Collect the gh_jid query parameter of every scraped url.
# Links that carry no gh_jid are skipped instead of aborting the
# whole cell with a KeyError.
tokens = []
for job_url in all_urls:
    job_ids = parse.parse_qs(parse.urlparse(job_url).query).get('gh_jid')
    if job_ids:
        tokens.append(job_ids[0])

In [155]:
def _collapse_whitespace(text):
    """Replace every run of whitespace in ``text`` with one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()


def _first_node(dom, xpath):
    """Return the first node matching ``xpath``.

    Raises IndexError, exactly as indexing an empty result with ``[0]`` would,
    but the message names the xpath that found nothing.
    """
    nodes = dom.xpath(xpath)
    if not nodes:
        raise IndexError(f'no element matches xpath {xpath}')
    return nodes[0]


def get_job_details(job_token, timeout=30):
    """Download one Coursera job posting from Greenhouse and extract its fields.

    Args:
        job_token: Value sent as the ``token`` query parameter of the
            ``job_app`` embed page.
        timeout: Seconds to wait for the server before giving up, so one
            stalled request cannot hang the whole scrape.

    Returns:
        dict with the keys ``designation``, ``company_name``, ``location``,
        ``job_overview`` (strings) and ``responsibilities``,
        ``basic_qualifications``, ``preferred_qualifications`` (lists of
        strings taken from the first, second and third ``<ul>`` of the page).

    Raises:
        requests.HTTPError: the server answered with an error status.
        IndexError: an expected element, or one of the three ``<ul>`` lists,
            is missing from the page.
    """
    r2 = requests.get(
        'https://boards.greenhouse.io/embed/job_app',
        params={'for': 'coursera', 'token': job_token},
        timeout=timeout,
    )
    # Fail loudly on an error response instead of scraping an error page
    r2.raise_for_status()
    soup2 = BeautifulSoup(r2.text, 'html.parser')
    # Using xpath to extract the required information
    dom = etree.HTML(str(soup2))

    designation = _first_node(dom, '//h1[@class="app-title"]').text

    # Whitespace runs are collapsed to a single space rather than deleted, so
    # multi-word values keep their word boundaries.
    company_label = _collapse_whitespace(
        _first_node(dom, '//span[@class="company-name"]').text)
    # Drop the two leading characters, as before (presumably the literal "at"
    # shown before the company name - confirm against the page), then the
    # space that followed them.
    company_name = company_label[2:].lstrip()
    location = _collapse_whitespace(
        _first_node(dom, '//div[@class="location"]').text)
    job_overview = _first_node(
        dom, '//*[@id="content"]/p[2]/span').text.replace('\xa0', '')

    list_items = soup2.find_all('ul')
    if len(list_items) < 3:
        raise IndexError(
            f'expected at least 3 <ul> lists for token {job_token}, '
            f'found {len(list_items)}')
    # The first three lists are read, in page order, as the three sections
    responsibilities, basic_qualifications, preferred_qualifications = (
        [li.text for li in ul.find_all('li')] for ul in list_items[:3]
    )

    return {
        "designation": designation,
        "company_name": company_name,
        "location": location,
        "job_overview": job_overview,
        "responsibilities": responsibilities,
        "basic_qualifications": basic_qualifications,
        "preferred_qualifications": preferred_qualifications,
    }

<h3> Creating a dataframe to store all the job details </h3>

In [156]:
# Columns of the results table, in output order
column_names = [
    'designation',
    'company_name',
    'location',
    'job_overview',
    'responsibilities',
    'basic_qualifications',
    'preferred_qualifications',
]
# Start from an empty frame that already carries those columns
df = pd.DataFrame(columns=column_names)

<h3> Running a for loop </h3>
<ul>
<li> The loop iterates through all the tokens </li>
<li> It extracts all the details of the job and prints a completed statement </li>
<li> It prints a *not processed* statement if it fails to extract the details </li>
</ul>

In [157]:
# Scrape every posting and collect the results as plain dicts first.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# re-copies it on every iteration, so the frame is built once at the end.
job_records = []
for token in tokens:
    try:
        job_records.append(get_job_details(token))
        print(f'Completed for token = {token}')
    except Exception as exc:
        # Keep going past a bad posting, but report why it was skipped.
        # Catching Exception (not a bare except) lets Ctrl-C / kernel
        # interrupt still stop the loop.
        print(f'The job for token = {token} could not be processed: {exc!r}')

# One row per successfully scraped job, in the declared column order
df = pd.DataFrame(job_records, columns=column_names)

Completed for token = 4645401004
Completed for token = 4614435004
Completed for token = 4710854004
Completed for token = 4606915004
Completed for token = 4703409004
Completed for token = 4660113004
Completed for token = 4659828004
Completed for token = 4674413004
Completed for token = 4668328004
Completed for token = 4612922004
The job for token = 4687783004 could not be processed


<h3> Saving the dataframe as a CSV file </h3>
<h4> Advantages of using CSV files </h4>
<ul> 
<li> CSV files are plain-text files, making them easier for the website developer to create</li>
<li> They're easier to import into a spreadsheet or another storage database, regardless of the specific software you're using </li>
<li> They help to better organize large amounts of data </li>
</ul>

In [158]:
# Write the scraped table to job_details.csv (path is relative to the working directory)
df.to_csv(path_or_buf='job_details.csv')