# Selenium
Uses Selenium to scrape the full details of every job listing on the US Seasonal Jobs web page (seasonaljobs.dol.gov).

In [93]:
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.touch_actions import TouchActions
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

In [94]:
# URL of the H-2B job search results on seasonaljobs.dol.gov, sorted by accepted date
url = 'https://seasonaljobs.dol.gov/jobs?search=&location=&start_date=&job_type=H-2B&sort=accepted_date&radius=100&wage=all&facets='

# location of the web driver for the Edge browser; a raw string keeps the
# backslashes of the Windows path from being read as escape sequences
driver = webdriver.Edge(executable_path=r'D:\Programas\WebDrivers\msedgedriver.exe')

# navigate to the url
driver.get(url)

# wait 3 seconds to allow the page to load
time.sleep(3)

In [95]:
def get_showing_results_numbers():
    """Return the shown and total result counts read from the results page.

    Reads the summary paragraph through the module-level ``driver`` and keeps
    the whitespace-separated tokens of its text that consist only of digits.

    Returns:
        A ``(shown, total)`` tuple of ints: the first and second numeric
        tokens of the summary text.

    Raises:
        IndexError: if the summary text has fewer than two numeric tokens.
    """
    # locate the summary paragraph -- "showing XX from XXXX" --
    # find_element(By.XPATH, ...) replaces the deprecated find_element_by_xpath
    totals_text = driver.find_element(By.XPATH, '//*[@id="main-content"]/div/div[1]/div[1]/p')
    # keep only the purely numeric tokens: [shown results, total results]
    results_total = [int(token) for token in totals_text.text.split() if token.isdigit()]
    return results_total[0], results_total[1]

In [96]:
def _values_by_label(dt_elements, rules, exact=()):
    """Map each field in *rules* to the text of the ``dd`` following its ``dt``.

    Every ``dt`` label is lower-cased and tested against the rules in order;
    the first rule that matches wins.  A rule ``(field, keyword)`` matches when
    the keyword is contained in the label, or equals the label for fields
    listed in *exact*.  Fields without a matching ``dt`` stay empty strings.
    """
    values = {field: '' for field, _ in rules}
    for dt in dt_elements:
        label = dt.text.lower()
        for field, keyword in rules:
            matched = label == keyword if field in exact else keyword in label
            if matched:
                values[field] = dt.find_next_sibling('dd').text
                break
    return values


def get_details(full_page):
    """Extract the fields of the job-detail panel from a page's HTML.

    Args:
        full_page: HTML source of the page containing an element with
            id ``job-detail``.

    Returns:
        A list of 19 strings, in this order: job name, company name, city
        name, payment, payment rate, begin date, end date, job order,
        telephone, email, web, full time, workers needed, job duties,
        experience required, experience months, special requirements,
        hours per week, schedule.  Contact and requirement fields whose
        label is not present in the panel are empty strings.

    Raises:
        ValueError: if the HTML has no element with id ``job-detail``.
        IndexError: if the panel has fewer paragraphs, dates, or description
            entries than are read by position below.
    """
    soup = BeautifulSoup(full_page, 'html.parser')

    # only the job detail part of the page is of interest
    job_details = soup.find(id='job-detail')
    if job_details is None:
        raise ValueError("No element with id 'job-detail' was found in the page")

    # the first section tag holds all the nested tags
    section_element = job_details.find('section')

    # the first 'a' tag inside the section carries the job name
    job_name_element = section_element.find('a')
    job_name = job_name_element.text

    # the div sibling of the job name holds company, city, payment and dates
    main_data_element = job_name_element.find_next_sibling('div')
    main_data = [p.text for p in main_data_element.find_all('p')]
    # the third whitespace-separated token of each time tag is kept as the date
    date_data = [t.text.split()[2] for t in main_data_element.find_all('time')]
    # job order link
    job_order = main_data_element.find('a').text

    company_name = main_data[0]
    city_name = main_data[1]
    # first token is stored as the payment, third token as the payment rate
    payment_parts = main_data[2].split()
    payment = payment_parts[0]
    payment_rate = payment_parts[2]

    begin_date = date_data[0]
    end_date = date_data[1]

    # the address sibling lists the recruitment contact information as dt/dd pairs
    address_data_element = job_name_element.find_next_sibling('address')
    contact = _values_by_label(
        address_data_element.find_all('dt'),
        (('telephone', 'telephone'), ('email', 'email'), ('web', 'web')),
    )

    # the section sibling (Job Description) is read by position
    job_description_title_element = job_name_element.find_next_sibling('section')
    description_data = [dd.text for dd in job_description_title_element.find_all('dd')]
    full_time = description_data[0]
    workers_needed = description_data[1]
    job_duties = description_data[2]

    # the next section sibling lists the job requirements as dt/dd pairs;
    # "experience required:" is compared exactly, the rest by substring
    job_requirement_title_element = job_description_title_element.find_next_sibling('section')
    requirements = _values_by_label(
        job_requirement_title_element.find_all('dt'),
        (
            ('experience_required', 'experience required:'),
            ('experience_months', 'months'),
            ('special_requirements', 'special'),
            ('hours_week', 'hours'),
            ('schedule', 'schedule'),
        ),
        exact=('experience_required',),
    )

    return [
        job_name,
        company_name,
        city_name,
        payment,
        payment_rate,
        begin_date,
        end_date,
        job_order,
        contact['telephone'],
        contact['email'],
        contact['web'],
        full_time,
        workers_needed,
        job_duties,
        requirements['experience_required'],
        requirements['experience_months'],
        requirements['special_requirements'],
        requirements['hours_week'],
        requirements['schedule'],
    ]

In [97]:
def create_dataframe():
    """Return an empty DataFrame with one column per job detail field."""
    return pd.DataFrame(
        columns=[
            'Job Name',
            'Company Name',
            'City Name',
            'Payment',
            'Payment Rate',
            'Begin Date',
            'End Date',
            'Job Order',
            'Telephone',
            'Email',
            'Web',
            'Full Time',
            'Workers Needed',
            'Job Duties',
            'Experience Required',
            'Experience Months',
            'Special Requirements',
            'Hours Per Week',
            'Schedule',
        ]
    )

In [None]:
# cell to load all results

# get the "Load More" button (None when the page has no such button)
element = driver.find_elements(By.TAG_NAME, 'button')
load_more = next((b for b in element if b.text == "Load More"), None)

# get current and total result numbers
actual_result_number, total_result_number = get_showing_results_numbers()

# click the "Load More" button until every result is shown; the comparison is
# strict so the loop stops once the shown count reaches the total instead of
# clicking again when there is nothing left to load
while load_more is not None and actual_result_number < total_result_number:
    print(actual_result_number)
    # load more results
    load_more.send_keys(Keys.ENTER)
    time.sleep(2)
    # update actual showing result number
    actual_result_number, total_result_number = get_showing_results_numbers()

In [None]:

from selenium.common.exceptions import NoSuchElementException

# getting all the results inside the article tag
results = driver.find_elements(By.TAG_NAME, 'article')

full_data = create_dataframe()

for result in results:
    # open the result so its details are shown
    result.send_keys(Keys.ENTER)
    # check up to 3 times that the job-detail element is present
    for count in range(1, 4):
        print("Finding Job-detail...")
        time.sleep(2)
        try:
            driver.find_element(By.ID, 'job-detail')
        except NoSuchElementException:
            # only "element not found" is retried; anything else propagates
            print("No 'job-detail' id was found, waiting...")
            time.sleep(5)
            print("Try No. " + str(count))
        else:
            print("Job detail found!")
            break
    else:
        # every try failed: there is nothing to parse, so skip this result
        # rather than passing a page without job details to get_details
        print("Job detail never appeared, skipping this result")
        continue

    # add new detail row to dataset
    print("Adding new file")
    full_data.loc[len(full_data.index)] = get_details(driver.page_source)

In [91]:
# export df to CSV; the path is a raw string, so each backslash is written once
full_data.to_csv(r'C:\Users\gefry\Downloads\Seasonal Jobs\full_data.csv')

## Beautiful Soup
In the following cells there are some examples of bs4 usage.

In [None]:
from bs4 import BeautifulSoup

In [None]:
# request the page with browser-like User-Agent and Referer headers
# (used when error 403 is present)

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63'
referer = 'https://seasonaljobs.dol.gov/'

header = {'User-Agent':user_agent,'Referer':referer}

try:
    response = requests.get(url, headers=header)
except requests.RequestException as e:
    # format the exception into the message; concatenating it to a str
    # directly would raise TypeError inside this handler
    print(f"Cant get info from the specified URL Error: {e}")

In [41]:
# keep the response body only when the request returned status 200;
# otherwise report the status code that came back
if response.status_code != 200:
    print("Status Code Not OK, Status Code: " + str(response.status_code))
else:
    data = response.text

In [None]:
# parse the downloaded HTML and collect every article tag
soup = BeautifulSoup(data, features='html.parser')
jobs = soup.find_all(name='article')

# show the raw tags first, then the text of each one
print(jobs)
for job in jobs:
    print(job.text)