In [210]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import csv

##### Extract, Transform, and Load

In [211]:
def extract(page):
    '''Call this function first'''
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}
    if page == 0:
        url = 'https://www.indeed.com/jobs?q=data+analyst&jt=fulltime&explvl=entry_level&fromage=7'
    else:
        url = f'https://www.indeed.com/jobs?q=data+analyst&jt=fulltime&explvl=entry_level&fromage=7&start={page}'
    
    response = requests.get(url,headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    return soup

def extract_expand(href_list):
    '''This function third'''
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}
    
    for href_item in href_list:
        url = f'https://www.indeed.com{href_item}'
        response = requests.get(url,headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        soup_list.append(soup)
    return

In [212]:
def transform(soup):
    '''This function second'''
    divs = soup.find_all('div', class_='mosaic-zone', id="mosaic-zone-jobcards")
    for item in divs:
        urls = item.find_all('a', id=True, class_=True, href=True)
        for href in urls:
            href_list.append(href['href'])
    return

def transform_expand(soup, href):
    '''This one should be called in the extract_expand func'''
    divs = soup.find_all('div', class_='icl-Container--fluid fs-unmask jobsearch-ViewJobLayout-fluidContainer is-US icl-u-xs-p--sm')
    date = datetime.datetime.now()
    date = int(date.strftime('%Y%m%d'))
    
    for item in divs:
        title = item.find('h1').text
        company = item.find('div', class_='icl-u-lg-mr--sm icl-u-xs-mr--xs').text
        try:
            qualifications = item.find('div', id='qualificationsSection').text.split('Qualifications')[1]
        except:
            qualifications = ''
        job_description = item.find('div', id='jobDescriptionText', class_='jobsearch-jobDescriptionText').text.strip().replace('\n', '')
        website = 'www.indeed.com' + href

        job = {
            'title' : title,
            'company' : company,
            'qualifications' : qualifications,
            'job_description' : job_description,
            'website' : website,
            'date' : date

        }
        job_list.append(job)
    return


##### Cleaning DataFrame

In [213]:
try:
    filepath = 'indeed_jobs.csv'
    df = pd.read_csv(filepath).drop('Unnamed: 0', axis=1)
except:
    df = pd.DataFrame()

In [214]:
for i in range (0,40,10):
    href_list = []
    soup_list = []
    job_list = []
    a = 0

    info_extracted = extract(i)
    transform(info_extracted)
    extract_expand(href_list)

    for soup in soup_list:
        transform_expand(soup, href_list[a])
        df = df.append(job_list, ignore_index=True)
        a += 1
        df

In [215]:
df

Unnamed: 0,title,company,qualifications,job_description,website,date
0,HR Data Analyst – Strategy,"Mavensoft Technologies, LLC.",,Job Title: HR Data Analyst – StrategyJob Code...,www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Au...,20220114
1,HR Data Analyst – Strategy,"Mavensoft Technologies, LLC.",,Job Title: HR Data Analyst – StrategyJob Code...,www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Au...,20220114
2,Data Analyst,System Innovators,,The essential responsibilities of the Data Ana...,www.indeed.com/rc/clk?jk=8d75008c997de250&fcci...,20220114
3,HR Data Analyst – Strategy,"Mavensoft Technologies, LLC.",,Job Title: HR Data Analyst – StrategyJob Code...,www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Au...,20220114
4,Data Analyst,System Innovators,,The essential responsibilities of the Data Ana...,www.indeed.com/rc/clk?jk=8d75008c997de250&fcci...,20220114
...,...,...,...,...,...,...
475,Marketing Data Analyst,ROI ADS LLC,Marketing: 1 year (Required)SQL: 1 year (Prefe...,We are a digital marketing company that works ...,www.indeed.com/company/ROI-ADS-LLC/jobs/Market...,20220114
476,Data Analyst,"Social Market Analytics, Inc.",SQL: 1 year (Preferred),"Company Description: Social Market Analytics, ...",www.indeed.com/company/Social-Market-Analytics...,20220114
477,Junior Data Analyst,TRESUME,US work authorization (Preferred),We are seeking a Data Analyst for an entry-lev...,www.indeed.com/company/TRESUME/jobs/Junior-Dat...,20220114
478,Data Analyst,Intellipro Group,SQL: 1 year (Preferred)Mandarin (Preferred)US ...,【Job Description】Create table or data ETL usin...,www.indeed.com/company/Intellipro-Group-Inc/jo...,20220114


In [216]:
df_nodups = df.drop_duplicates(subset=['title', 'company', 'job_description'], keep='last')

In [217]:
df_indexfixed = df_nodups.reset_index(drop=True)

In [218]:
df_indexfixed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            56 non-null     object
 1   company          56 non-null     object
 2   qualifications   56 non-null     object
 3   job_description  56 non-null     object
 4   website          56 non-null     object
 5   date             56 non-null     int64 
dtypes: int64(1), object(5)
memory usage: 2.8+ KB


In [219]:
df_indexfixed.to_csv('indeed_jobs.csv', mode='a', header=False)