### Python Web Scraping
Programming assignment for INST447 <br>
Goal: Extract the first page of job postings for each given job title and location, then export the results to a CSV file<br>
Website: Indeed

In [2]:
# Importing libraries
import pandas as pd
from bs4 import BeautifulSoup
from requests import get
from time import time, sleep
from random import randint
from warnings import warn
from IPython.core.display import clear_output

#### Step 1: Modifying the URL

In [38]:
# Search terms: every job title below is paired with every state
jobs = ["Data Analyst", "Data Scientist", "Database Administrator", "Machine Learning Engineer", "Data Engineer"]
states = ["Virginia", "New York State", "California", "Texas", "Washington State"]

# Swap each space for a "+" so the terms can sit inside a query string
jobs = ["+".join(title.split(" ")) for title in jobs]
states = ["+".join(state.split(" ")) for state in states]

# One search url per (job, state) pair, newest postings first;
# the job title varies slowest, the state fastest
urls = [
    "https://www.indeed.com/jobs?q={}&l={}&sort=date".format(job_query, state_query)
    for job_query in jobs
    for state_query in states
]

# Seven independent, initially empty lists; the scraping loop appends to them
titles, companies, ratings, locations, wages, descriptions, post_times = (
    [] for _ in range(7)
)

#### Step 2: Extracting the Elements

In [39]:
# Scrape the results page behind every search url and append one entry per
# job card to each of the storage lists.

# Tunable settings for the request loop
MIN_PAUSE_SECONDS = 8    # shortest random pause after a request
MAX_PAUSE_SECONDS = 15   # longest random pause after a request
MAX_REQUESTS = 100       # safety cap on the number of requests sent
MISSING = 'Missing'      # placeholder stored when a job card lacks a field

# (destination list, tag name(s), exact class attribute) for every field
# extracted from a job card
FIELD_SELECTORS = [
    (titles, 'a', 'jobtitle turnstileLink'),                                     # Job Title
    (companies, 'span', 'company'),                                              # Company Name
    (locations, ['span', 'div'], 'location accessible-contrast-color-location'), # Job Location
    (wages, 'span', 'salaryText'),                                               # Job Wage/Salary
    (descriptions, 'div', 'summary'),                                            # Job Description
    (post_times, 'span', 'date'),                                                # Posting Date
    (ratings, 'span', 'ratingsContent'),                                         # Company rating
]


def _text_or_missing(container, tag, css_class):
    """Return the text of the first matching element inside `container`.

    Args:
        container: parsed element to search within (one job card).
        tag: tag name, or list of tag names, to look for.
        css_class: class attribute value the element must carry.

    Returns:
        The element's `.text`, or 'Missing' when no element matches.
    """
    # The class filter keeps the single-element-set form used throughout
    # this notebook so the selectors behave exactly as before.
    element = container.find(tag, {'class': {css_class}})
    # Test for "not found" explicitly instead of hiding every possible error
    # behind a bare except.
    return MISSING if element is None else element.text


# Loop for requests
requests = 0  # count of requests sent so far (a plain counter, not the requests library)
start_time = time()
for u in urls:
    # Stop before sending more requests than expected
    if requests >= MAX_REQUESTS:
        warn('Number of requests was greater than expected.')
        break

    # Make a get request
    response = get(u)

    # Pause the loop for a random interval between requests
    sleep(randint(MIN_PAUSE_SECONDS, MAX_PAUSE_SECONDS))

    # Monitor the requests; the previous progress line is cleared once the
    # next output arrives
    requests += 1
    elapsed_time = time() - start_time
    print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
    clear_output(wait = True)

    # Warn about non-200 status codes and skip the page: an error response
    # is not a results page, so there is nothing to extract from it
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        continue

    # Parse html and collect the job cards on the page
    soup = BeautifulSoup(response.text, 'html.parser')
    job_containers = soup.find_all('div', {'class':{'jobsearch-SerpJobCard unifiedRow row result'}})

    # Every job adds exactly one entry to every list, so the lists stay aligned
    for job in job_containers:
        for destination, tag, css_class in FIELD_SELECTORS:
            destination.append(_text_or_missing(job, tag, css_class))

Request:25; Frequency: 0.08549822300196386 requests/s


#### Step 3: Data Manipulation 

In [40]:
# Assemble the scraped lists into one table with a column per field, remove
# newline characters from the text cells, then preview the first five rows
job_data = pd.DataFrame(
    dict(
        zip(
            [
                "Job Title",
                "Company Name",
                "Job Location",
                "Salary",
                "Job Description",
                "Date of Posting",
                "Company Rating",
            ],
            [titles, companies, locations, wages, descriptions, post_times, ratings],
        )
    )
).replace(to_replace='\n', value='', regex=True)
job_data.head()

Unnamed: 0,Job Title,Company Name,Job Location,Salary,Job Description,Date of Posting,Company Rating
0,Publications Data Analyst,Piper Companies,"Falls Church, VA",Missing,"Keywords: Federal contacts, Procurement, Data ...",Just posted,4.5
1,EOC Business Analyst,CACI,"Arlington, VA 22201",Missing,Coordinate with cross-functional teams to asce...,Just posted,3.8
2,Office Administrator / Jr. Business Analyst,"Venesco, LLC","Falls Church, VA",Missing,"Conduct relevant research, data analysis, and ...",Just posted,3.5
3,"Lead Data and Analytic Modeler, Customer Analy...",KPMG,"McLean, VA 22102",Missing,"Work with clients to discover data sources, an...",Just posted,4.0
4,Business Analyst I,Fairfax County Government,"Fairfax, VA 22035","$54,219 - $90,366 a year",Integrates data from various sources into the ...,Today,4.0


In [41]:
# Split "Job Location" into a "City" column (text before the first comma) and
# a "State/Zip" column (the comma-separated part that follows it)
def _split_location(location):
    """Split a location string at its commas.

    Args:
        location: location text such as "Arlington, VA 22201".

    Returns:
        (city, state_zip): the text before the first comma, and the second
        comma-separated part with surrounding whitespace removed. state_zip
        is 'Missing' when the text contains no comma.
    """
    parts = location.split(",")
    if len(parts) < 2:
        return parts[0], 'Missing'
    # strip() drops the space after the comma without eating a real
    # character when that space is absent ("City,VA" -> "VA")
    return parts[0], parts[1].strip()


cities = []
states = []  # NOTE: rebinds `states` (previously the search-term list) to the per-row values
for location in job_data["Job Location"]:
    city, state_zip = _split_location(location)
    cities.append(city)
    states.append(state_zip)

job_data["City"] = cities
job_data["State/Zip"] = states

In [42]:
# Print the percentage of jobs in each city: count the rows per city, then
# divide by the total number of rows
city_column = job_data["City"]
rows_per_city = city_column.groupby(city_column).count()
city_share_percent = rows_per_city / len(job_data) * 100
print(city_share_percent)

City
Albany              0.266667
Alexandria          0.266667
Anacortes           0.266667
Arlington           3.200000
Armonk              0.533333
                      ...   
Vienna              0.533333
Virginia            0.266667
Washington State    0.533333
Westlake            0.800000
Willits             0.266667
Name: City, Length: 102, dtype: float64


In [43]:
# Write the finished table to a CSV file, leaving the row index out
output_csv = 'Mannix_Colin_INST447_PA2.csv'
job_data.to_csv(output_csv, index=False)