In [16]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import csv

##### Extract, Transform, and Load

In [17]:
def extract(page):
    '''Call this function first'''
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}
    if page == 0:
        url = 'https://www.indeed.com/jobs?q=data+analyst&jt=fulltime&explvl=entry_level&fromage=7'
    else:
        url = f'https://www.indeed.com/jobs?q=data+analyst&jt=fulltime&explvl=entry_level&fromage=7&start={page}'
    
    response = requests.get(url,headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    return soup

def extract_expand(href_list):
    '''This function third'''
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}
    
    for href_item in href_list:
        url = f'https://www.indeed.com{href_item}'
        response = requests.get(url,headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        soup_list.append(soup)
    return

In [18]:
def transform(soup):
    '''This function second'''
    divs = soup.find_all('div', class_='mosaic-zone', id="mosaic-zone-jobcards")
    for item in divs:
        urls = item.find_all('a', id=True, class_=True, href=True)
        for href in urls:
            href_list.append(href['href'])
    return

def transform_expand(soup, href):
    '''This one should be called in the extract_expand func'''
    divs = soup.find_all('div', class_='icl-Container--fluid fs-unmask jobsearch-ViewJobLayout-fluidContainer is-US icl-u-xs-p--sm')
    date = datetime.datetime.now()
    date = int(date.strftime('%Y%m%d'))
    
    for item in divs:
        title = item.find('h1').text
        company = item.find('div', class_='icl-u-lg-mr--sm icl-u-xs-mr--xs').text
        try:
            qualifications = item.find('div', id='qualificationsSection').text.split('Qualifications')[1]
        except:
            qualifications = ''
        job_description = item.find('div', id='jobDescriptionText', class_='jobsearch-jobDescriptionText').text.replace('\n', ' ')
        website = 'www.indeed.com' + href

        job = {
            'title' : title,
            'company' : company,
            'qualifications' : qualifications,
            'job_description' : job_description,
            'website' : website,
            'date' : date

        }
        job_list.append(job)
    return


##### Cleaning DataFrame

In [19]:
try:
    filepath = 'indeed_jobs.csv'
    df = pd.read_csv(filepath).drop('Unnamed: 0', axis=1)
except:
    df = pd.DataFrame()

In [20]:
for i in range (0,40,10):
    href_list = []
    soup_list = []
    job_list = []
    a = 0

    info_extracted = extract(i)
    transform(info_extracted)
    extract_expand(href_list)

    for soup in soup_list:
        transform_expand(soup, href_list[a])
        df = df.append(job_list, ignore_index=True)
        a += 1
        df

In [21]:
df

Unnamed: 0,title,company,qualifications,job_description,website,date
0,Data Analyst,Zeuner S.p.A,,Attività da svolgere: Creazione report clienti...,www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Ay...,20220117
1,Data Analyst,Zeuner S.p.A,,Attività da svolgere: Creazione report clienti...,www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Ay...,20220117
2,Jr. Data Analyst,Ignite Tek,Bachelor's (Preferred)Business Analysis: 1 yea...,The Researcher / Junior Strategist (RJS) perfo...,www.indeed.com/company/Ignite-Tek/jobs/Junior-...,20220117
3,Data Analyst,Zeuner S.p.A,,Attività da svolgere: Creazione report clienti...,www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Ay...,20220117
4,Jr. Data Analyst,Ignite Tek,Bachelor's (Preferred)Business Analysis: 1 yea...,The Researcher / Junior Strategist (RJS) perfo...,www.indeed.com/company/Ignite-Tek/jobs/Junior-...,20220117
...,...,...,...,...,...,...
475,Data Analyst,Intellipro Group,SQL: 1 year (Preferred)Mandarin (Preferred)US ...,【Job Description】Create table or data ETL usin...,www.indeed.com/company/Intellipro-Group-Inc/jo...,20220117
476,Data Analyst (Open to Remote),Experian,,Company Description Experian is the world’s l...,www.indeed.com/rc/clk?jk=c0b154e955e4ad3d&fcci...,20220117
477,Data Analyst - Procurement,The Coca-Cola Company,,A bit about the role... This role will provide...,www.indeed.com/rc/clk?jk=9e4f7061c4ab1039&fcci...,20220117
478,Health Data Analyst,NextGen information services,,Essential Responsibilities:· Independently det...,www.indeed.com/company/NextGen-Information-Ser...,20220117


In [22]:
df_nodups = df.drop_duplicates(subset=['title', 'company', 'job_description'], keep='last')

In [23]:
df_indexfixed = df_nodups.reset_index(drop=True)

In [24]:
df_indexfixed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57 entries, 0 to 56
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            57 non-null     object
 1   company          57 non-null     object
 2   qualifications   57 non-null     object
 3   job_description  57 non-null     object
 4   website          57 non-null     object
 5   date             57 non-null     int64 
dtypes: int64(1), object(5)
memory usage: 2.8+ KB


In [25]:
df_indexfixed.to_csv('indeed_jobs.csv', mode='a', header=False)