In [49]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import datetime
import csv

##### Extract, Transform, and Load

In [50]:
def extract(page):
    '''Call this function first'''
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}
    if page == 0:
        url = 'https://www.indeed.com/jobs?q=data+analyst&jt=fulltime&explvl=entry_level&fromage=7'
    else:
        url = f'https://www.indeed.com/jobs?q=data+analyst&jt=fulltime&explvl=entry_level&fromage=7&start={page}'
    
    response = requests.get(url,headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    return soup

def extract_expand(href_list):
    '''This function third'''
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}
    
    for href_item in href_list:
        url = f'https://www.indeed.com{href_item}'
        response = requests.get(url,headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        soup_list.append(soup)
    return
def transform(soup):
    '''This function second'''
    divs = soup.find_all('div', class_='mosaic-zone', id="mosaic-zone-jobcards")
    for item in divs:
        urls = item.find_all('a', id=True, class_=True, href=True)
        for href in urls:
            href_list.append(href['href'])
    return

def transform_expand(soup, href):
    '''This one should be called in the extract_expand func'''
    divs = soup.find_all('div', class_='icl-Container--fluid fs-unmask jobsearch-ViewJobLayout-fluidContainer is-US icl-u-xs-p--sm')
    date = datetime.datetime.now()
    date = int(date.strftime('%Y%m%d'))
    
    for item in divs:
        title = item.find('h1').text
        company = item.find('div', class_='icl-u-lg-mr--sm icl-u-xs-mr--xs').text
        job_description = item.find('div', id='jobDescriptionText', class_='jobsearch-jobDescriptionText')

        p_list = []
        li_list = []

        for p_tags in job_description.find_all('p'):
            p_list.append(p_tags.text.replace('\n', ' '))
        for li_tags in job_description.find_all('li'):
            li_list.append(li_tags.text.replace('\n', ' '))

        p_li_list = p_list + li_list
        description = ' '.join(map(str, p_li_list))
        website = 'www.indeed.com' + href

        job = {
            'job_title' : title,
            'company' : company,
            'job_description' : description,
            'website' : website,
            'date_scraped' : date

        }
        job_list.append(job)
    return

##### Cleaning DataFrame

In [51]:
try:
    filepath = 'indeed_jobs.csv'
    df = pd.read_csv(filepath).drop('Unnamed: 0', axis=1)
except:
    df = pd.DataFrame()

In [52]:
for i in range (0,40,10):
    href_list = []
    soup_list = []
    job_list = []
    a = 0

    info_extracted = extract(i)
    transform(info_extracted)
    extract_expand(href_list)

    for soup in soup_list:
        transform_expand(soup, href_list[a])
        df = df.append(job_list, ignore_index=True)
        a += 1
        df

In [53]:
df

Unnamed: 0,job_title,company,job_description,website,date_scraped
0,Junior Investment Data Analyst,ACR Alpine Capital Research,JOB TITLE: Junior Investment Data Analyst Loca...,www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AS...,20220121
1,Junior Investment Data Analyst,ACR Alpine Capital Research,JOB TITLE: Junior Investment Data Analyst Loca...,www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AS...,20220121
2,Data Analyst *ENTRY LEVEL*,CCS Global Tech,"Responsibilities Skills Job Types: Full-time, ...",www.indeed.com/company/CCS-Global-Tech/jobs/Da...,20220121
3,Junior Investment Data Analyst,ACR Alpine Capital Research,JOB TITLE: Junior Investment Data Analyst Loca...,www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AS...,20220121
4,Data Analyst *ENTRY LEVEL*,CCS Global Tech,"Responsibilities Skills Job Types: Full-time, ...",www.indeed.com/company/CCS-Global-Tech/jobs/Da...,20220121
...,...,...,...,...,...
475,Public Health Data Analyst,Lantana Consulting Group,Company: Lantana Consulting Group provides ser...,www.indeed.com/company/Lantana-Consulting-Grou...,20220121
476,Business Analyst,Photon,,www.indeed.com/rc/clk?jk=3a318d52ed46bd08&fcci...,20220121
477,Data Analyst,University of Arkansas at Little Rock,Bachelor’s degree in management information s...,www.indeed.com/rc/clk?jk=6084324059483b93&fcci...,20220121
478,Data Analytics Analyst,American Transmission Co.,"Essential Responsibilities: In this role, ...",www.indeed.com/rc/clk?jk=ba39cb26474e1571&fcci...,20220121


In [54]:
df_nodups = df.drop_duplicates(subset=['job_title', 'company', 'job_description'], keep='last')

In [55]:
df_indexfixed = df_nodups.reset_index(drop=True)

In [56]:
df_indexfixed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   job_title        50 non-null     object
 1   company          50 non-null     object
 2   job_description  50 non-null     object
 3   website          50 non-null     object
 4   date_scraped     50 non-null     int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ KB


In [57]:
df_indexfixed

Unnamed: 0,job_title,company,job_description,website,date_scraped
0,Junior Investment Data Analyst,ACR Alpine Capital Research,JOB TITLE: Junior Investment Data Analyst Loca...,www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0AS...,20220121
1,Sports Gaming Data Entry Analyst,Sports Gaming Technology Firm,"Sports Gaming Data Entry Analyst in Arlington,...",www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0Cn...,20220121
2,Workforce Data Analyst,California Public Employees' Retirement System...,The California Public Employees’ Retirement Sy...,www.indeed.com/company/California-Public-Emplo...,20220121
3,Data Analyst,PC Matic,"PC Matic, a leading cybersecurity company that...",www.indeed.com/company/PC-Matic/jobs/Data-Anal...,20220121
4,Data Analyst 2,PayPal,"Who we are: At PayPal (NASDAQ: PYPL), we belie...",www.indeed.com/rc/clk?jk=76ad377b677f0b5c&fcci...,20220121
5,Jr. Data Analyst,Ignite Tek,The Researcher / Junior Strategist (RJS) perfo...,www.indeed.com/company/Ignite-Tek/jobs/Junior-...,20220121
6,Data Analyst,"City Ranked Media, Inc.",Do you love digging into data to draw conclusi...,"www.indeed.com/company/City-Ranked-Media,-Inc....",20220121
7,Entry Level Business Analyst,Cloudinfraspecs,We are seeking a Talented and Driven Business ...,www.indeed.com/company/Cloudinfraspecs/jobs/En...,20220121
8,Junior Data Analyst (Banking),Invexer Technology,"Contract duration: long-term visa: H1b, US cit...",www.indeed.com/company/Invexer-Technology/jobs...,20220121
9,Data Specialist,Thompson School District,JobID: 9803,www.indeed.com/rc/clk?jk=7ad9e941e8fc14f4&fcci...,20220121


In [58]:
df_indexfixed.to_csv('indeed_jobs.csv', mode='a', header=False)