In [49]:
import csv
import datetime
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Extract, Transform, and Load

In [50]:
def extract(page):
    """Download one Indeed search-results page and return it parsed.

    The query string is fixed to ``q=data+analyst``, ``jt=fulltime``,
    ``explvl=entry_level`` and ``fromage=7``.

    Args:
        page: Result offset placed in the ``start`` query parameter.
            ``0`` requests the first page, whose URL carries no ``start``.

    Returns:
        The response body parsed by BeautifulSoup with ``html.parser``.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}
    if page == 0:
        url = 'https://www.indeed.com/jobs?q=data+analyst&jt=fulltime&explvl=entry_level&fromage=7'
    else:
        url = f'https://www.indeed.com/jobs?q=data+analyst&jt=fulltime&explvl=entry_level&fromage=7&start={page}'

    # headers must be passed by keyword: the second positional parameter of
    # requests.get is `params`, which would send the User-Agent as query-string
    # data instead of as a request header. The timeout keeps a stalled
    # connection from hanging the notebook indefinitely.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    return soup

def transform(soup_extracted):
    """Collect job-posting links from one parsed search-results page.

    Every ``href`` found is appended to the module-level ``href_list``, which
    the caller must create before calling. Nothing is returned.

    Args:
        soup_extracted: Parsed search page, as returned by ``extract``.
    """
    # Search the page that was passed in rather than a global named `soup`,
    # which is undefined on a fresh kernel and stale otherwise.
    job_card_zones = soup_extracted.find_all('div', class_='mosaic-zone', id="mosaic-zone-jobcards")
    for zone in job_card_zones:
        # Only anchors that carry an id, a class and an href are collected.
        for anchor in zone.find_all('a', id=True, class_=True, href=True):
            href_list.append(anchor['href'])
    return

def extract_expand(href_list):
    """Download and parse the job-posting page behind each href.

    Each parsed page is appended, in the order of ``href_list``, to the
    module-level ``soup_list``, which the caller must create before calling.
    Nothing is returned.

    Args:
        href_list: Site-relative links; each is appended to
            ``https://www.indeed.com`` to form the request URL.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}

    for href_item in href_list:
        url = f'https://www.indeed.com{href_item}'
        # headers must be passed by keyword: the second positional parameter
        # of requests.get is `params`, not `headers`. The timeout keeps one
        # stalled posting from blocking the rest of the run.
        response = requests.get(url, headers=headers, timeout=30)
        soup_list.append(BeautifulSoup(response.content, 'html.parser'))
    return


def transform_expand(soup, href):
    """Extract the job fields from one parsed posting page into ``job_list``.

    One dict is appended to the module-level ``job_list`` (which the caller
    must create before calling) for every matching layout container on the
    page. Nothing is returned.

    Args:
        soup: Parsed job-posting page.
        href: Site-relative link the page was fetched from; stored in the
            record as ``'www.indeed.com' + href``.

    A missing title or company element is recorded as ``None`` and a missing
    description block as an empty string, so one incomplete posting does not
    abort the whole run with an AttributeError.
    """
    containers = soup.find_all('div', class_='icl-Container--fluid fs-unmask jobsearch-ViewJobLayout-fluidContainer is-US icl-u-xs-p--sm')
    # Scrape date as an integer of the form YYYYMMDD.
    date = int(datetime.datetime.now().strftime('%Y%m%d'))

    for item in containers:
        title_tag = item.find('h1')
        company_tag = item.find('div', class_='icl-u-lg-mr--sm icl-u-xs-mr--xs')
        job_description = item.find('div', id='jobDescriptionText', class_='jobsearch-jobDescriptionText')

        # The description is the text of all <p> tags followed by all <li>
        # tags, with embedded newlines flattened to spaces.
        fragments = []
        if job_description is not None:
            fragments += [tag.text.replace('\n', ' ') for tag in job_description.find_all('p')]
            fragments += [tag.text.replace('\n', ' ') for tag in job_description.find_all('li')]

        job_list.append({
            'job_title' : title_tag.text if title_tag is not None else None,
            'company' : company_tag.text if company_tag is not None else None,
            'job_description' : ' '.join(fragments),
            'website' : 'www.indeed.com' + href,
            'date_scraped' : date,
        })
    return

#### Reading or Creating a New DataFrame

In [51]:
# Load previously scraped jobs; start from an empty frame when there is no
# saved data yet.
filepath = 'Resources/indeed_jobs.csv'
try:
    # Drop the 'Unnamed: 0' column when present. errors='ignore' keeps a file
    # that lacks that column from being discarded through a KeyError.
    df = pd.read_csv(filepath).drop(columns='Unnamed: 0', errors='ignore')
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only "no file" and "file with nothing in it" mean "start fresh"; any
    # other failure should surface instead of silently dropping saved data.
    df = pd.DataFrame()

#### Web Scraping

In [52]:
# Scrape the search-result pages at offsets 0, 10, 20 and 30 and add every
# job found to `df`.
scraped_jobs = []
for page in range(0, 40, 10):
    # transform, extract_expand and transform_expand append to these
    # module-level lists, so they are reset for every search page.
    href_list = []
    soup_list = []
    job_list = []

    soup_extracted = extract(page)
    transform(soup_extracted)
    extract_expand(href_list)

    # soup_list is filled in href_list order, so the two pair up one-to-one.
    for soup, href in zip(soup_list, href_list):
        transform_expand(soup, href)

    # Take the page's jobs once, after all of its postings are parsed.
    # Adding job_list to the frame inside the loop above would re-add every
    # earlier job of the page on each iteration.
    scraped_jobs.extend(job_list)

# Build the frame in a single concat: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic.
if scraped_jobs:
    df = pd.concat([df, pd.DataFrame(scraped_jobs)], ignore_index=True)

In [54]:
# Two rows are the same listing when title, company and description all match.
# Keep the last occurrence of each listing, then renumber the rows from 0.
listing_key = ['job_title', 'company', 'job_description']
df_nodups = df.drop_duplicates(subset=listing_key, keep='last')
df_indexfixed = df_nodups.reset_index(drop=True)

In [58]:
# Save the combined, de-duplicated listings to the same file the loading cell
# reads ('Resources/indeed_jobs.csv').
# df_indexfixed is derived from df, which already holds the rows loaded from
# that file, so the file is overwritten rather than appended to. Appending
# would write the old rows again on every run. The header is written so the
# next read_csv does not consume the first data row as column names.
output_path = 'Resources/indeed_jobs.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_indexfixed.to_csv(output_path)