Trying to update the Webscraping Indeed Notebook to Python 3

In [66]:
# API Calls
import requests
# Parse HTML
import bs4
# Handle Dataframes (excel data)
import pandas as pd
# Better Math functions
import numpy as np
# Plotting library
# import matplotlib as plt
# Time library to create timestamps for filenames
import time

In [62]:
# Functions to extract specific pieces of data from the HTML of an Indeed search page.
def extract_location(posting, null_value=None):
    """Return the location text of one job posting.

    Args:
        posting: Parsed HTML element for a single posting; it must offer a
            BeautifulSoup-style ``find`` method.
        null_value: Value returned when no location can be read.

    Returns:
        The ``.text`` of the ``<div class="location">`` element found in
        ``posting``, or ``null_value`` when that element is absent.
    """
    try:
        return posting.find('div', {'class': 'location'}).text
    except AttributeError:
        # find() gave back None (no matching tag), or posting itself has no
        # find(). Only that case means "missing"; a bare except would also
        # swallow KeyboardInterrupt and hide unrelated bugs.
        return null_value

def extract_company(posting, null_value=None):
    """Return the company name text of one job posting.

    Args:
        posting: Parsed HTML element for a single posting; it must offer a
            BeautifulSoup-style ``find`` method.
        null_value: Value returned when no company can be read.

    Returns:
        The ``.text`` of the ``<span class="company">`` element found in
        ``posting`` (not stripped of surrounding whitespace), or
        ``null_value`` when that element is absent.
    """
    try:
        return posting.find('span', {'class':'company'}).text
    except AttributeError:
        # find() gave back None (no matching tag), or posting itself has no
        # find(). Anything else (including KeyboardInterrupt) is allowed to
        # propagate instead of being silently swallowed.
        return null_value

def extract_job_title(posting, null_value=None):
    """Return the job title text of one job posting.

    Args:
        posting: Parsed HTML element for a single posting; it must offer a
            BeautifulSoup-style ``find`` method.
        null_value: Value returned when no job title can be read.

    Returns:
        The ``.text`` of the ``<a data-tn-element="jobTitle">`` element found
        in ``posting``, or ``null_value`` when that element is absent.
    """
    try:
        return posting.find('a', attrs = {'data-tn-element':'jobTitle'}).text
    except AttributeError:
        # find() gave back None (no matching tag), or posting itself has no
        # find(). Anything else (including KeyboardInterrupt) is allowed to
        # propagate instead of being silently swallowed.
        return null_value
    
def extract_salary(posting, null_value=None):
    """Return the salary text of one job posting.

    Args:
        posting: Parsed HTML element for a single posting; it must offer a
            BeautifulSoup-style ``find`` method.
        null_value: Value returned when no salary can be read.

    Returns:
        The ``.text`` of the ``<span class="no-wrap">`` element found in
        ``posting``, or ``null_value`` when that element is absent.
    """
    try:
        return posting.find(name="span", attrs={"class":"no-wrap"}).text
    except AttributeError:
        # find() gave back None (no matching tag), or posting itself has no
        # find(). Anything else (including KeyboardInterrupt) is allowed to
        # propagate instead of being silently swallowed.
        return null_value

def extract_summary(posting, null_value=None):
    """Return the concatenated summary text of one job posting.

    Args:
        posting: Parsed HTML element for a single posting; it must offer a
            BeautifulSoup-style ``find_all`` method.
        null_value: Value returned when no summary can be read.

    Returns:
        The stripped ``.text`` of every ``<span class="summary">`` element in
        ``posting``, joined without a separator. ``null_value`` is returned
        when there are no such elements or they contain no text, so a missing
        summary is reported the same way as the other extract_* helpers
        report a missing field.
    """
    try:
        # find_all is the current name of the legacy findAll alias.
        spans = posting.find_all('span', attrs={'class': 'summary'})
        summary = "".join(span.text.strip() for span in spans)
    except AttributeError:
        # posting has no find_all() (e.g. it is None). Other exceptions
        # propagate instead of being silently swallowed by a bare except.
        return null_value
    # An empty result means the posting had no summary text at all.
    return summary if summary else null_value

def extract_url(posting, null_value=None):
    """Return the ``data-jk`` attribute of one job posting.

    Despite the function name, the value returned is the raw ``data-jk``
    attribute, not a complete URL.

    Args:
        posting: Parsed HTML element for a single posting; it must offer a
            mapping-style ``get(key, default)`` method for its attributes.
        null_value: Value returned when the attribute cannot be read.

    Returns:
        The value of the ``data-jk`` attribute, or ``null_value`` when the
        attribute is missing.
    """
    try:
        # Pass null_value as the default: get() does not raise for a missing
        # attribute, so without it a missing key would come back as None and
        # null_value would never be used.
        return posting.get('data-jk', null_value)
    except AttributeError:
        # posting has no get() (e.g. it is None). Other exceptions propagate
        # instead of being silently swallowed by a bare except.
        return null_value

In [63]:
# Indeed search URL pattern. The three placeholders are filled positionally
# with the query, the location and the result offset.
INDEED_URL_TEMPLATE = "http://www.indeed.com/jobs?q={}&l={}&start={}"

# Example: the first page of results for one query/location pair
example_search = ("Software+Engineer", "New+York", 0)
print(INDEED_URL_TEMPLATE.format(*example_search))

http://www.indeed.com/jobs?q=Software+Engineer&l=New+York&start=0


### Query Params in Indeed URL
- q= refers to the query, usually the job title and salary you want
- l= refers to the location, usually the city or state
- start= refers to the offset of the first result shown. For example, with start=10 you are viewing results 11-20.

In [64]:
# Fetch Page Information for Indeed Search
query = "Software+Engineer"
cities = ["California"]
max_results_per_city = 10
null_value = "NA"
# Seconds to wait for a response, so a stalled connection cannot hang the notebook.
request_timeout = 30

# Collect one dict per posting and build the DataFrame once after the loops.
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame
# on every call, which made the loop quadratic.
postings = []
for city in cities:
    # start is the result offset of each page, stepping 10 results at a time.
    for start in range(0, max_results_per_city, 10):
        url = INDEED_URL_TEMPLATE.format(query, city, start)
        response = requests.get(url, timeout=request_timeout)
        # Surface HTTP errors here instead of parsing an error page into zero rows.
        response.raise_for_status()
        html = response.text
        soups = bs4.BeautifulSoup(html, "html.parser")
        rows = soups.find_all('div', attrs = {'class':'row'})
        for posting in rows:
            postings.append({
                # A posting without its own location falls back to the searched city.
                "location": extract_location(posting, city),
                "company": extract_company(posting, null_value),
                "job_title": extract_job_title(posting, null_value),
                "salary": extract_salary(posting, null_value),
                "url": extract_url(posting, null_value),
                "summary": extract_summary(posting, null_value)
            })

# Naming the columns keeps their order fixed and makes sure they exist even
# when no posting was found, so later cells can still access them.
df = pd.DataFrame(
    postings,
    columns=["company", "job_title", "location", "salary", "summary", "url"],
)

In [65]:
# Clean fetched Data
df.drop_duplicates(inplace=True) # dropping duplicate postings
# Assign each cleaned column back to df. Calling replace(inplace=True) on
# `df.company` acts on a column object taken out of the frame; pandas warns
# about that chained in-place call and, with Copy-on-Write, it no longer
# changes df at all.
df['company'] = df['company'].replace(regex=True, to_replace=["\n", "\r"], value="") # getting rid of \n and \r in company
# r"\$" is a raw string so the backslash is not an invalid Python escape sequence.
df['salary'] = df['salary'].replace(regex=True, to_replace=["\n", "\r", r"\$"], value="") # getting rid of \n, \r and $ in salary
# Splitting up h and the rest of the url because url strings mess with jupyter notebook's formatting.
url_prefix = "h" + "ttps://www.indeed.com/viewjob?jk="
# Prefix only real job keys that are not prefixed yet: re-running this cell
# then does not stack the prefix, and missing keys are not turned into links.
job_keys = df['url']
needs_prefix = (
    job_keys.notna()
    & (job_keys != null_value)
    & ~job_keys.astype(str).str.startswith(url_prefix)
)
df.loc[needs_prefix, 'url'] = url_prefix + job_keys[needs_prefix]
df['summary'] = df['summary'].replace(regex=True, to_replace=[r'\.\.\.'], value="") # getting rid of ellipses in summary

Unnamed: 0,company,job_title,location,salary,summary,url
0,Marin Software,Software Engineer-Ruby on Rails,"San Francisco, CA 94105 (Financial District area)",,Marin Software's world headquarters is located...,https://www.indeed.com/viewjob?jk=19859f4e3e57...
1,Tynker,Software Engineer - Game Development,"Mountain View, CA",,Experience as a developer on a project that us...,https://www.indeed.com/viewjob?jk=a528ae91f7c9...
2,Laserfiche in partnership with Indeed ...,Software Engineer/Web Developer,"Long Beach, CA",,Extending client-side content management capab...,https://www.indeed.com/viewjob?jk=1e539366cc3e...
3,Grand Rounds,Software Engineer - Data Engineering,"San Francisco, CA 94107 (South Of Market area)",,"Python, Java, AWS, Kinesis/Kafka, MySQL/Postgr...",https://www.indeed.com/viewjob?jk=f2982cf91deb...
4,"Yardi Systems, Inc.",Software Development Engineer I,"Santa Barbara, CA",,Familiarity with one of the following computer...,https://www.indeed.com/viewjob?jk=ba33eb20d052...
5,Court Appearance Professionals,Software Engineer,"Santa Fe Springs, CA 90670","80,000 - 95,000 a year",Experience with issue tracking software (Open ...,https://www.indeed.com/viewjob?jk=0d71a3f968c9...
6,Google,"Software Test Engineer, Devices, Verily Life S...",California,,"South San Francisco, CA, USA. Apply to this jo...",https://www.indeed.com/viewjob?jk=4fdc702a46bb...
7,Snowflake Computing,Software Engineer - University Graduate (Entry...,California,,Software engineers at Snowflake are passionate...,https://www.indeed.com/viewjob?jk=f878758636a1...
8,Google,"Software Engineer, Verily Life Sciences - Sout...",California,,"South San Francisco, CA, USA. Apply to this jo...",https://www.indeed.com/viewjob?jk=533f85780bfb...
9,Google,"Software Test Engineer, Mobile and Web Applica...",California,,"Mountain View, CA, USA. Apply to this job on t...",https://www.indeed.com/viewjob?jk=1bf9aaf7196d...


Data is now cleaned and ready for processing

In [67]:
# Persist the cleaned postings as a CSV named after the query and the current local time
timestr = time.strftime("%Y_%m_%d-%H%M%S")
csv_path = f"{query}-{timestr}.csv"
df.to_csv(csv_path, sep=',', encoding='utf-8')

In [69]:
# Read back the file written by the previous cell. Its name is rebuilt from
# the same query and timestr, because a hard-coded timestamp only matches one
# past run and breaks a fresh Restart & Run All.
# index_col=0 restores the row index that to_csv stored as the first column.
pd.read_csv(f"{query}-{timestr}.csv", index_col=0)

Unnamed: 0,company,job_title,location,salary,summary,url
0,Marin Software,Software Engineer-Ruby on Rails,"San Francisco, CA 94105 (Financial District area)",,Marin Software's world headquarters is located...,https://www.indeed.com/viewjob?jk=19859f4e3e57...
1,Tynker,Software Engineer - Game Development,"Mountain View, CA",,Experience as a developer on a project that us...,https://www.indeed.com/viewjob?jk=a528ae91f7c9...
2,Laserfiche in partnership with Indeed ...,Software Engineer/Web Developer,"Long Beach, CA",,Extending client-side content management capab...,https://www.indeed.com/viewjob?jk=1e539366cc3e...
3,Grand Rounds,Software Engineer - Data Engineering,"San Francisco, CA 94107 (South Of Market area)",,"Python, Java, AWS, Kinesis/Kafka, MySQL/Postgr...",https://www.indeed.com/viewjob?jk=f2982cf91deb...
4,"Yardi Systems, Inc.",Software Development Engineer I,"Santa Barbara, CA",,Familiarity with one of the following computer...,https://www.indeed.com/viewjob?jk=ba33eb20d052...
5,Court Appearance Professionals,Software Engineer,"Santa Fe Springs, CA 90670","80,000 - 95,000 a year",Experience with issue tracking software (Open ...,https://www.indeed.com/viewjob?jk=0d71a3f968c9...
6,Google,"Software Test Engineer, Devices, Verily Life S...",California,,"South San Francisco, CA, USA. Apply to this jo...",https://www.indeed.com/viewjob?jk=4fdc702a46bb...
7,Snowflake Computing,Software Engineer - University Graduate (Entry...,California,,Software engineers at Snowflake are passionate...,https://www.indeed.com/viewjob?jk=f878758636a1...
8,Google,"Software Engineer, Verily Life Sciences - Sout...",California,,"South San Francisco, CA, USA. Apply to this jo...",https://www.indeed.com/viewjob?jk=533f85780bfb...
9,Google,"Software Test Engineer, Mobile and Web Applica...",California,,"Mountain View, CA, USA. Apply to this job on t...",https://www.indeed.com/viewjob?jk=1bf9aaf7196d...
