# Selenium
Using Selenium to collect the full details of every job posting listed on the US Seasonal Jobs web page.

In [102]:
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

In [103]:
# define url (H-2B job search, sorted by accepted date)
url = 'https://seasonaljobs.dol.gov/jobs?search=&location=&start_date=&job_type=H-2B&sort=accepted_date&radius=100&wage=all&facets='

# set location of web driver for Edge Browser
# raw string: in a normal literal "\P", "\W" and "\m" are invalid escape
# sequences, which Python reports with a warning
driver = webdriver.Edge(executable_path=r'D:\Programs\Web Drivers\msedgedriver.exe')

# navigate to the url
driver.get(url)

# wait 3 seconds to allow page to load
time.sleep(3)

  driver = webdriver.Edge(executable_path='D:\Programs\Web Drivers\msedgedriver.exe')


In [104]:
# helper returning the number of results currently shown and the total number of results
def get_showing_results_numbers():
    """Read the results counter from the page and return it as two integers.

    The counter paragraph is located through the module-level ``driver``.
    Its text is split on whitespace and every token made only of digits is
    converted to ``int``; the collected numbers are printed, then the first
    two are returned as ``(shown, total)``.
    """
    counter_element = driver.find_element(By.XPATH, '//*[@id="main-content"]/div/div[1]/div[1]/p')

    # keep only the purely numeric tokens of the counter text
    numbers = []
    for token in counter_element.text.split():
        if token.isdigit():
            numbers.append(int(token))

    print(numbers)
    shown, total = numbers[0], numbers[1]
    return shown, total

In [109]:
def _text_at(items, index):
    """Return ``items[index]``, or an empty string when the list is too short."""
    return items[index] if index < len(items) else ''


def _dd_text(title_element):
    """Return the text of the ``dd`` sibling following a ``dt`` tag, or '' if there is none."""
    value_element = title_element.find_next_sibling('dd')
    return value_element.text if value_element is not None else ''


def get_details(full_page):
    """Extract the fields of the job posting shown in the ``job-detail`` panel.

    Args:
        full_page: HTML source of the page, as a string.

    Returns:
        A list of 19 strings, in this order: job name, company name, city
        name, payment, payment rate, begin date, end date, job order,
        telephone, email, web, full time, workers needed, job duties,
        experience required, experience months, special requirements,
        hours per week, schedule. A field whose tag is absent is returned
        as an empty string (the job order falls back to "N/A").

    Raises:
        AttributeError: if one of the container elements the lookups are
            chained on (the ``job-detail`` element, its first ``section``,
            the job-name link or its ``div``/``address``/``section``
            siblings) is not found.

    Side effects:
        Prints the raw payment text of the posting.
    """
    # parse the html passed in the full_page argument
    soup = BeautifulSoup(full_page, 'html.parser')

    # keep only the job detail part
    job_details = soup.find(id='job-detail')

    # the first section tag holds all the nested tags
    section_element = job_details.find('section')

    # the first 'a' tag found inside the section carries the job name
    job_name_element = section_element.findChild('a')
    job_name = job_name_element.text

    # the div sibling of the job name holds the main data
    main_data_element = job_name_element.find_next_sibling('div')

    # text of all p tags
    main_data = [p.text for p in main_data_element.find_all('p')]

    # third whitespace-separated token of every time tag ('' when there are fewer tokens)
    date_data = [_text_at(t.text.split(), 2) for t in main_data_element.find_all('time')]

    # job order link
    job_order_element = main_data_element.find('a')
    job_order = job_order_element.text if job_order_element is not None else "N/A"

    # using main_data to fill variables
    company_name = _text_at(main_data, 0)
    city_name = _text_at(main_data, 1)
    payment_data = _text_at(main_data, 2)

    # split amount and rate; the text is printed to follow the progress of the scrape
    print(payment_data)
    payment_parts = payment_data.split()
    payment = _text_at(payment_parts, 0)
    # with three or more tokens the rate is the third one, otherwise the second;
    # a lone amount with nothing after it yields an empty rate instead of an IndexError
    payment_rate = _text_at(payment_parts, 2 if len(payment_parts) >= 3 else 1)

    # using date_data to fill variables
    begin_date = _text_at(date_data, 0)
    end_date = _text_at(date_data, 1)

    # the address sibling of the job name holds the recruitment information
    address_data_element = job_name_element.find_next_sibling('address')

    # contact fields stay empty unless a matching 'dt' title is found
    telephone = ''
    email = ''
    web = ''

    for info in address_data_element.find_all('dt'):
        label = info.text.lower()
        if 'telephone' in label:
            telephone = _dd_text(info)
        elif 'email' in label:
            email = _dd_text(info)
        elif 'web' in label:
            web = _dd_text(info)

    # the first section sibling of the job name (Job Description)
    job_description_title_element = job_name_element.find_next_sibling('section')
    # text of all dd tags, read by position
    description_data = [dd.text for dd in job_description_title_element.find_all('dd')]
    full_time = _text_at(description_data, 0)
    workers_needed = _text_at(description_data, 1)
    job_duties = _text_at(description_data, 2)

    # the next section sibling after the job description holds the requirements
    job_requirement_title_element = job_description_title_element.find_next_sibling('section')

    experience_required = ''
    experience_months = ''
    special_requirements = ''
    hours_week = ''
    schedule = ''

    for title in job_requirement_title_element.find_all('dt'):
        label = title.text.lower()
        # exact match here, so other titles that also mention experience
        # fall through to the keyword checks below
        if label == "experience required:":
            experience_required = _dd_text(title)
        elif 'months' in label:
            experience_months = _dd_text(title)
        elif 'special' in label:
            special_requirements = _dd_text(title)
        elif 'hours' in label:
            hours_week = _dd_text(title)
        elif 'schedule' in label:
            schedule = _dd_text(title)

    return [
        job_name,
        company_name,
        city_name,
        payment,
        payment_rate,
        begin_date,
        end_date,
        job_order,
        telephone,
        email,
        web,
        full_time,
        workers_needed,
        job_duties,
        experience_required,
        experience_months,
        special_requirements,
        hours_week,
        schedule,
    ]

In [110]:
def create_dataframe():
    """Build the empty DataFrame that receives one row per job posting.

    Returns:
        A ``pandas.DataFrame`` with no rows and the 19 job-detail columns.
    """
    # columns grouped by the part of the posting they describe
    header_columns = ['Job Name', 'Company Name', 'City Name', 'Payment',
                      'Payment Rate', 'Begin Date', 'End Date', 'Job Order']
    contact_columns = ['Telephone', 'Email', 'Web']
    description_columns = ['Full Time', 'Workers Needed', 'Job Duties']
    requirement_columns = ['Experience Required', 'Experience Months',
                           'Special Requirements', 'Hours Per Week', 'Schedule']

    all_columns = header_columns + contact_columns + description_columns + requirement_columns
    return pd.DataFrame(columns=all_columns)

In [107]:
# cell to load all results

# get current and total result numbers
actual_result_number, total_result_number = get_showing_results_numbers()

# stop after this many consecutive clicks that do not raise the counter,
# so a stuck page cannot keep the loop running forever
MAX_STALLED_CLICKS = 3
stalled_clicks = 0

# click the "Load More" button until every result is showing
while actual_result_number < total_result_number and stalled_clicks < MAX_STALLED_CLICKS:
    # look the button up on every pass instead of indexing a list that may be empty
    load_more_buttons = [b for b in driver.find_elements(By.TAG_NAME, "button") if b.text == "Load More"]
    if not load_more_buttons:
        print("No 'Load More' button found, stopping.")
        break
    load_more = load_more_buttons[0]

    # load more results
    load_more.send_keys(Keys.ENTER)
    time.sleep(2)

    # update the showing result number and check that it actually grew
    previous_result_number = actual_result_number
    actual_result_number, total_result_number = get_showing_results_numbers()
    if actual_result_number > previous_result_number:
        stalled_clicks = 0
    else:
        stalled_clicks += 1

[10, 947]
[20, 947]
[30, 947]
[40, 947]
[50, 947]
[60, 947]
[70, 947]
[80, 947]
[90, 947]
[100, 947]
[110, 947]
[120, 947]
[130, 947]
[140, 947]
[150, 947]
[160, 947]
[170, 947]
[180, 947]
[190, 947]
[200, 947]
[210, 947]
[220, 947]
[230, 947]
[240, 947]
[250, 947]
[260, 947]
[270, 947]
[280, 947]
[290, 947]
[300, 947]
[310, 947]
[320, 947]
[330, 947]
[340, 947]
[350, 947]
[360, 947]
[370, 947]
[380, 947]
[390, 947]
[400, 947]
[410, 947]
[420, 947]
[430, 947]
[440, 947]
[450, 947]
[460, 947]
[470, 947]
[480, 947]
[490, 947]
[500, 947]
[510, 947]
[520, 947]
[530, 947]
[540, 947]
[550, 947]
[560, 947]
[570, 947]
[580, 947]
[590, 947]
[600, 947]
[610, 947]
[620, 947]
[630, 947]
[640, 947]
[650, 947]
[660, 947]
[670, 947]
[680, 947]
[690, 947]
[700, 947]
[710, 947]
[720, 947]
[730, 947]
[740, 947]
[750, 947]
[760, 947]
[770, 947]
[780, 947]
[790, 947]
[800, 947]
[810, 947]
[820, 947]
[830, 947]
[840, 947]
[850, 947]
[860, 947]
[870, 947]
[880, 947]
[890, 947]
[900, 947]
[910, 947]
[920, 94

In [111]:

# raised by find_element when the element is not on the page
from selenium.common.exceptions import NoSuchElementException

# getting all the results inside the article tag
results = driver.find_elements(By.TAG_NAME, 'article')

full_data = create_dataframe()

for result in results:
    # open the result so its detail panel is loaded
    result.send_keys(Keys.ENTER)

    # check (up to 3 times) that the job-detail element is present
    detail_loaded = False
    for attempt in range(1, 4):
        time.sleep(2)
        try:
            driver.find_element(By.ID, 'job-detail')
        except NoSuchElementException:
            print("No 'job-detail' id was found, waiting...")
            time.sleep(5)
            result.send_keys(Keys.ENTER)
            print("Try No. " + str(attempt))
        else:
            detail_loaded = True
            break

    # without a detail panel there is nothing to parse: skip this result
    # instead of calling get_details on a page that lacks the panel
    if not detail_loaded:
        print("Skipping result, 'job-detail' never appeared.")
        continue

    # add new detail row to dataset
    print(len(full_data.index))
    full_data.loc[len(full_data.index)] = get_details(driver.page_source)

0
$14.28 per hour
1
$19.84-$22.84 per hour
2
$15.14-$18.14 per hour
3
$19.22 per hour
4
$21.49 per hour
5
$27.52 per hour
6
$20.27-$20.30 per hour
7
$13.94-$19.00 per hour
8
$19.74-$19.75 per hour
9
$17.21 per hour
10
$14.38 per hour
11
$15.98 per hour
12
$8.80-$25.34 per hour
13
$11.00 per hour
14
$14.28 per hour
15
$16.03 per hour
16
$14.11 per hour
17
$12.81 per hour
18
$15.21 per hour
19
$14.00 per hour
20
$18.42 per hour
21
$18.00-$20.00 per hour
22
$16.49-$18.00 per hour
23
$19.16 per hour
24
$20.00 per hour
25
$15.57-$20.94 per hour
26
$14.21-$28.00 per hour
27
$17.00-$32.00 per hour
28
$11.20 per hour
29
$16.71 per hour
30
$18.42 per hour
31
$17.47-$18.25 per hour
32
$17.53 piece rate
33
$20.35 per hour
34
$14.75 per hour
35
$16.94 per hour
36
$16.31 per hour
37
$12.31 per hour
38
$15.49 per hour
39
$16.30-$22.00 per hour
40
$15.20 per hour
41
$16.90 per hour
42
$16.50-$19.79 per hour
43
$12.78-$22.75 per hour
44
$14.35 per hour
45
$15.67 piece rate
46
$13.26 per hour
47
$14.66

In [112]:
# export df to CSV
# single backslashes: inside a raw string a doubled backslash is kept as two
# characters, which put "\\" separators into the path
full_data.to_csv(r'D:\Learning\Python\Seasonal Jobs\scraping_seasonal_jobs\full_data.csv')

# Sending Mails

Script that sends an application e-mail for each job that does not require experience.

In [115]:
import smtplib, email, ssl
from email import encoders
from email.mime.base import MIMEBase
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart


In [116]:
def send_mail(_to, _subject):
  """Send the application e-mail, with the CV attached, through Outlook SMTP.

  Args:
    _to: recipient address.
    _subject: subject line of the message.

  Raises:
    RuntimeError: if the SMTP password is not set in the environment.
    OSError: if the CV file cannot be read or the server cannot be reached.
    smtplib.SMTPException: if the server rejects the login or the message.
  """
  # local import so the function does not depend on the notebook's import cell
  import os

  # credentials come from the environment so the password is not stored in the
  # source; the sender address may be overridden the same way
  sender = os.environ.get('SEASONAL_JOBS_SMTP_SENDER', 'gefry182@hotmail.com')
  password = os.environ.get('SEASONAL_JOBS_SMTP_PASSWORD')
  if not password:
    raise RuntimeError("Set the SEASONAL_JOBS_SMTP_PASSWORD environment variable before sending mail")

  to = _to
  subject = _subject
  file = "Gefry Castro CV.pdf"

  # "mixed" is the container for a body plus attachments; "alternative" declares
  # its parts to be interchangeable versions of the same content
  msg = MIMEMultipart("mixed")
  msg["Subject"] = subject
  msg["From"] = sender
  msg["To"] = to

  # HTML message
  html = """\
  <html>
    <body>
      <p>
          Greetings,<br>
          <br>
          I'm sending my CV as an attachment and my <a href='https://www.linkedin.com/in/castro-gefry'>LinkedIn profile</a> to apply for the job offer mentioned in the subject line. <br>
          Maybe my professional profile doesn't quite fit for the required job but I learn fast and I'm in good shape. Also, <strong>I'm willing to do any kind of work.</strong><br>
          <br>
          It is not only my desire to be useful in the achievement of your company goals but to have the opportunity to work in the United States.<br> 
          I'm single, I can afford myself travel expenses and a few months of accommodation.<br>
          <br>
          Hoping we can keep in touch.<br>
          <br>
          Best regards,<br>
          <br>
          Gefry A. Castro<br>
          Bogotá - Colombia<br>
      </p>
    </body>
  </html>
  """
  msg.attach(MIMEText(html, "html"))

  # add attachment: raw file bytes, base64-encoded for transport
  attachment_part = MIMEBase("application", "octet-stream")
  with open(file, "rb") as attachment:
    attachment_part.set_payload(attachment.read())
  encoders.encode_base64(attachment_part)
  attachment_part.add_header("Content-Disposition", "attachment", filename=file)
  msg.attach(attachment_part)

  # the context manager sends QUIT and closes the connection even when the
  # login or the send raises
  with smtplib.SMTP("smtp-mail.outlook.com", 587) as s:
    s.ehlo()
    # upgrade to TLS with a context that verifies the server certificate
    s.starttls(context=ssl.create_default_context())
    # identify again over the encrypted connection
    s.ehlo()
    s.login(sender, password)
    s.sendmail(sender, to, msg.as_string())

In [117]:
def write_transactions(email, job_name, success):
    """Append one "job_name,email" line to the success or failure log.

    Args:
        email: recipient address of the attempted mail.
        job_name: job the mail was sent for.
        success: truthy to log to ``success.txt``, falsy for ``failure.txt``.

    The flag, the success notice and the chosen file name are printed; the
    log file is opened in append mode relative to the working directory.
    """
    print(success)

    if not success:
        filename = "failure.txt"
    else:
        print("write to success")
        filename = "success.txt"

    print(filename)
    # the with-block closes the file on exit
    with open(filename, 'a') as log_file:
        log_file.write(''.join((job_name, ',', email, '\n')))

In [133]:
# run through each filtered result to send email

import pandas as pd

df = pd.read_csv(r'.//full_data.csv', index_col=False)
df_fields = df[['Job Name', 'Email', 'Experience Required']]
# reset_index() returns a new frame (the old row labels go to an 'index'
# column) instead of modifying a frame derived from df in place
df_filtered = df_fields.query("`Experience Required` == 'No'").dropna().reset_index()

# "job_name,email" lines already logged in success.txt: those mails must not be sent again
try:
    with open("success.txt") as sent_log:
        already_sent = {line.rstrip('\n') for line in sent_log}
except FileNotFoundError:
    already_sent = set()

# loop through each row
for index, row in df_filtered.iterrows():
    job_name = str(row['Job Name']).strip()
    email = str(row['Email']).strip()

    # same "job_name,email" format that write_transactions writes
    transaction = job_name + ',' + email
    if transaction in already_sent:
        print(index, email, job_name, "Already sent, skipping")
        continue

    try:
        send_mail(email, job_name)
    except Exception as error:
        # keep going with the next row, but show why this one failed
        print("Error Sending Mail", error)
        write_transactions(email, job_name, False)
        continue

    # only the send itself is guarded: a failure while logging is a different
    # problem and is not recorded as a failed mail
    write_transactions(email, job_name, True)
    already_sent.add(transaction)
    print(index, email, job_name, "Send!!")
df_filtered.head()

True
write to success
success.txt
0 lindsay@firesidejacksonhole.com Housekeepers Send!!
True
write to success
success.txt
1 bdc5057@gmail.com Construction Laborer Send!!
True
write to success
success.txt
2 elena@aeroscapeutah.com Snow Shoveler Send!!
True
write to success
success.txt
3 pitchfordb23@gmail.com Snow Removal Laborer Send!!
True
write to success
success.txt
4 jgibson@crc.global Material Handler Send!!
True
write to success
success.txt
5 Brittany@titanlandscape.net Shop Helper Send!!
True
write to success
success.txt
6 kay@dyna-mist.com Laborer Send!!
True
write to success
success.txt
7 recruiting@steinlodge.com House Attendant Send!!
True
write to success
success.txt
8 recruiting@steinlodge.com Service Attendant Send!!
True
write to success
success.txt
9 admin@winterservicesgroup.com Laborer-landscape Send!!
True
write to success
success.txt
10 eddy@simonspropertymaintenance.com Maintenance Helper Send!!
True
write to success
success.txt
11 bsimms@merionls.com Tree Trimmers

Unnamed: 0,index,Job Name,Email,Experience Required
0,0,Housekeepers,lindsay@firesidejacksonhole.com,No
1,3,Construction Laborer,bdc5057@gmail.com,No
2,7,Snow Shoveler,elena@aeroscapeutah.com,No
3,9,Snow Removal Laborer,pitchfordb23@gmail.com,No
4,15,Material Handler,jgibson@crc.global,No


## Beautiful Soup
The following cells contain some examples of bs4 usage.

In [None]:
from bs4 import BeautifulSoup

In [None]:
# using bs4 to navigate when error 403 is present

# browser-like headers sent with the request
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36 Edg/104.0.1293.63'
referer = 'https://seasonaljobs.dol.gov/'

header = {'User-Agent': user_agent, 'Referer': referer}

try:
    # timeout so an unresponsive server cannot block the cell indefinitely
    response = requests.get(url, headers=header, timeout=30)
except requests.exceptions.RequestException as e:
    # f-string: concatenating a str with the exception object raises TypeError
    print(f"Cant get info from the specified URL Error: {e}")

In [41]:
# check if status code is OK
if response.status_code == 200:
    data = response.text
else:
    print("Status Code Not OK, Status Code: " + str(response.status_code))
    # reset data so the parsing cell cannot fail on an undefined name or
    # silently reuse a page left over from an earlier run
    data = ''

In [None]:
# parse the downloaded page and collect every article tag
soup = BeautifulSoup(data, 'html.parser')
jobs = soup.find_all(name='article')

# show the raw tags first, then the plain text of each one
print(jobs)
for job in jobs:
    print(job.get_text())