In [51]:
import csv
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import random 
from collections import OrderedDict

In [54]:
# creating URL base
def get_url(location):
    """Build the Indeed job-search URL for a location.

    Parameters
    ----------
    location : str
        Location to search for, e.g. ``"catalina+island"`` or
        ``"catalina island"``. Spaces become ``+`` and characters that would
        break the query string (``&``, ``#``, ``,`` ...) are percent-encoded.
        ``+`` and ``%`` are left untouched so a location that is already
        URL-encoded passes through unchanged.

    Returns
    -------
    str
        Search URL with an empty ``q`` parameter and ``l`` set to the location.
    """
    # Imported locally so the function works even if the import cell was not re-run.
    from urllib.parse import quote_plus

    template = 'https://www.indeed.com/jobs?q&l={}'
    return template.format(quote_plus(str(location), safe='+%'))

In [55]:
# build the search URL for the "catalina+island" location and print it for a visual check
url = get_url("catalina+island")
print(url)

https://www.indeed.com/jobs?q&l=catalina+island


In [60]:
# connecting to website and checking for connection
# A timeout keeps the cell from hanging forever when the server never answers
# (requests waits indefinitely unless a timeout is given).
# Note: a 200 status only means a page came back; it can still be a CAPTCHA
# page rather than job results, so inspect the parsed HTML as well.
response = requests.get(url, timeout=30)
response

<Response [200]>

In [61]:
# parse the downloaded page text into a searchable BeautifulSoup tree
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [62]:
# preview the parsed page; capped so a full results page does not flood the notebook output
print(str(soup)[:2000])

<html>
<head>
<title>hCaptcha solve page</title>
<script async="" defer="" src="https://www.hcaptcha.com/1/api.js"></script>
</head>
<body>
<form action="/jobs?l=catalina+island" method="POST">
<div class="h-captcha" data-sitekey="eb27f525-f936-43b4-91e2-95a426d4a8bd"></div>
<br/>
<input type="submit" value="Submit"/>
</form>
</body>
</html>



In [41]:
# every job posting sits in a <div class="job_seen_beacon">; collect them all
cards = soup.find_all('div', class_='job_seen_beacon')

In [42]:
# number of job cards found on the page (0 means the selector matched nothing)
len(cards)

0

In [None]:
# pulling 1 card record to test results
# The search can come back with no cards at all (for example when the site
# serves a CAPTCHA page instead of results), so fail with a clear message
# rather than a bare IndexError.
if not cards:
    raise RuntimeError(
        'No job cards found - check that the downloaded page is a real '
        'results page and not a CAPTCHA challenge'
    )
# Prefer the 5th card, but fall back to the last one on short result pages.
card = cards[min(4, len(cards) - 1)]

In [None]:
# print the selected card so its markup can be inspected
print(card)

In [None]:
# job title: text of the card's <h2 class="jobTitle">, surrounding whitespace removed
job_title = card.find('h2', class_='jobTitle').get_text().strip()
print(job_title)

In [None]:
# company name: text of the card's <span class="companyName">, surrounding whitespace removed
company = card.find('span', class_='companyName').get_text().strip()
print(company)

In [None]:
# business location: text of the card's <div class="companyLocation">, surrounding whitespace removed
location = card.find('div', class_='companyLocation').get_text().strip()
print(location)

In [None]:
# job summary: text of the card's <div class="job-snippet">, surrounding whitespace removed
summary = card.find('div', class_='job-snippet').get_text().strip()
print(summary)

In [None]:
# getting date job was posted
# stripped like the other fields (and like get_record) so the value carries no stray whitespace
post_date = card.find('span', 'date').text.strip()
print(post_date)

In [None]:
# using today's date as a reference of when the data was pulled
# ('%y' is the two-digit year, so the value is formatted as yy-mm-dd)
datetime.today().strftime('%y-%m-%d')

In [None]:
# salary is optional on a posting: use the tag's text when it exists,
# otherwise fall back to the string 'None'
salary_tag = card.find('div', class_='metadata salary-snippet-container')
salary = salary_tag.get_text().strip() if salary_tag else 'None'

print(salary)

In [None]:
# attempted to get job type, unsuccessful
# job_type = soup.find('div', 'metadata')
# job_type = job_type.text.strip()
# print(job_type)

In [None]:
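# attempted to get job url, unsuccessful
# (likely cause: `tag('href')` calls the tag, which is shorthand for tag.find_all('href');
#  reading an attribute needs tag['href'] or tag.get('href'))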
# job_url = soup.find('a', 'vjs-highlight')('href')
# print(job_url)

In [None]:
# pulling all the data into 1 record
def _text_or_default(card, tag_name, css_class, default='None'):
    """Return the stripped text of the first matching tag in ``card``, or ``default`` if there is none."""
    tag = card.find(tag_name, css_class)
    if tag is None:
        return default
    return tag.text.strip()


def get_record(card):
    """Extract one job posting from a result card.

    Parameters
    ----------
    card : bs4.element.Tag
        Element wrapping a single job posting. Only its ``find`` method is used.

    Returns
    -------
    tuple
        ``(job_title, company, location, summary, post_date, today, salary)``,
        all strings. Any field whose tag is missing from the card is reported
        as the string ``'None'`` instead of raising ``AttributeError``, so one
        incomplete card does not abort the whole scrape.
    """
    job_title = _text_or_default(card, 'h2', 'jobTitle')
    company = _text_or_default(card, 'span', 'companyName')
    location = _text_or_default(card, 'div', 'companyLocation')
    summary = _text_or_default(card, 'div', 'job-snippet')
    post_date = _text_or_default(card, 'span', 'date')
    salary = _text_or_default(card, 'div', 'metadata salary-snippet-container')

    # Extraction date, two-digit year (yy-mm-dd).
    # NOTE(review): switch to '%Y' if the SQL side expects ISO dates - confirm
    # before changing the export format.
    today = datetime.today().strftime('%y-%m-%d')

    return (job_title, company, location, summary, post_date, today, salary)

    

In [None]:
# creating a list to hold all the records (one tuple per job card)
records = []

for card in cards:
    record = get_record(card)
    records.append(record)

In [None]:
# checking the records look correct
# Show the 15th record when there are that many, otherwise the last one;
# evaluates to None (no output) when nothing was scraped.
records[min(14, len(records) - 1)] if records else None

In [None]:
# going to the next page to get all job posting results
while True:
    # Look for the "Next" pagination link on the page that was just parsed.
    next_link = soup.find('a', {'aria-label': 'Next'})
    next_href = next_link.get('href') if next_link is not None else None
    if not next_href:
        break  # no further page (or the page has no pagination at all)

    next_page = 'https://www.indeed.com' + next_href

    # Fetch the *next* page: requesting `url` again would re-download the
    # first page every time, so the loop would never end and would keep
    # appending duplicate records.
    response = requests.get(next_page, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Same card selector as the first page, so every record is parsed from
    # the same kind of element.
    cards = soup.find_all('div', 'job_seen_beacon')

    for card in cards:
        record = get_record(card)
        records.append(record)

In [None]:
# write the header row followed by every scraped record to indeed_jobs.csv (for loading into SQL)
with open('indeed_jobs.csv', mode='w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerows([
        ['Title', 'Company', 'Location', 'Summary', 'Post_Date', 'Extracted_Date', 'Salary'],
        *records,
    ])