Work in progress: updating the Indeed web-scraping notebook to Python 3.

In [7]:
# API Calls
import requests
# Parse HTML
import bs4
# Handle Dataframes (excel data)
import pandas as pd
# Better Math functions
import numpy as np
# Plotting library
# import matplotlib as plt
# Time library to create timestamps for filenames
import time

In [62]:
# Functions to extract specific pieces of data from the html of a indeed search page.
def extract_location(posting, null_value=None):
    """Return the location text of one search-result posting.

    Looks up the first ``<div class="location">`` inside ``posting`` and
    returns its ``.text``.

    Args:
        posting: Parsed HTML element exposing ``find`` (a BeautifulSoup tag
            in this notebook).
        null_value: Value returned when the location cannot be extracted.

    Returns:
        The element's text, or ``null_value`` when the element is missing or
        the lookup fails.
    """
    try:
        return posting.find('div', {'class': 'location'}).text
    # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
    # SystemExit can still stop a long scraping run.
    except Exception:
        return null_value

def extract_company(posting, null_value=None):
    """Return the company name text of one search-result posting.

    Looks up the first ``<span class="company">`` inside ``posting`` and
    returns its ``.text`` unmodified (surrounding whitespace is not stripped
    here).

    Args:
        posting: Parsed HTML element exposing ``find`` (a BeautifulSoup tag
            in this notebook).
        null_value: Value returned when the company cannot be extracted.

    Returns:
        The element's text, or ``null_value`` when the element is missing or
        the lookup fails.
    """
    try:
        return posting.find('span', {'class':'company'}).text
    # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
    # SystemExit can still stop a long scraping run.
    except Exception:
        return null_value

def extract_job_title(posting, null_value=None):
    """Return the job title text of one search-result posting.

    Looks up the first ``<a data-tn-element="jobTitle">`` inside ``posting``
    and returns its ``.text``.

    Args:
        posting: Parsed HTML element exposing ``find`` (a BeautifulSoup tag
            in this notebook).
        null_value: Value returned when the title cannot be extracted.

    Returns:
        The anchor's text, or ``null_value`` when the anchor is missing or
        the lookup fails.
    """
    try:
        return posting.find('a', attrs={'data-tn-element': 'jobTitle'}).text
    # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
    # SystemExit can still stop a long scraping run.
    except Exception:
        return null_value
    
def extract_salary(posting, null_value=None):
    """Return the salary text of one search-result posting.

    Looks up the first ``<span class="no-wrap">`` inside ``posting`` and
    returns its ``.text`` unmodified.

    Args:
        posting: Parsed HTML element exposing ``find`` (a BeautifulSoup tag
            in this notebook).
        null_value: Value returned when no salary can be extracted.

    Returns:
        The span's text, or ``null_value`` when the span is missing or the
        lookup fails.
    """
    try:
        return posting.find(name="span", attrs={"class": "no-wrap"}).text
    # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
    # SystemExit can still stop a long scraping run.
    except Exception:
        return null_value

def extract_summary(posting, null_value=None):
    """Return the concatenated summary text of one search-result posting.

    Collects every ``<span class="summary">`` inside ``posting``, strips the
    surrounding whitespace of each span's text and joins the pieces with no
    separator.

    Args:
        posting: Parsed HTML element exposing ``find_all`` (a BeautifulSoup
            tag in this notebook).
        null_value: Value returned when the lookup fails.

    Returns:
        The joined summary text; an empty string when the posting has no
        summary spans; ``null_value`` when the lookup raises.
    """
    try:
        # find_all is the current Beautiful Soup 4 name; findAll is the
        # legacy alias.
        spans = posting.find_all('span', attrs={'class': 'summary'})
        return "".join(span.text.strip() for span in spans)
    # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
    # SystemExit can still stop a long scraping run.
    except Exception:
        return null_value

def extract_url(posting, null_value=None):
    """Return the ``data-jk`` attribute of one search-result posting.

    Despite the name, this returns the raw attribute value, not a full URL;
    the notebook's cleaning cell prepends the ``viewjob?jk=`` prefix later.

    Args:
        posting: Parsed HTML element exposing ``get`` (a BeautifulSoup tag in
            this notebook).
        null_value: Value returned when the attribute cannot be extracted.

    Returns:
        The attribute value, or ``null_value`` when the attribute is absent
        or the lookup fails.
    """
    try:
        job_key = posting.get('data-jk')
    # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
    # SystemExit can still stop a long scraping run.
    except Exception:
        return null_value
    # ``get`` yields None for a missing attribute; map that to null_value so
    # this extractor honours its fallback like the other extract_* helpers.
    return null_value if job_key is None else job_key

In [63]:
# Indeed search URL pattern; the three slots are, in order, the query (q),
# the location (l) and the result offset (start).
INDEED_URL_TEMPLATE = "http://www.indeed.com/jobs?q={}&l={}&start={}"

# Show one filled-in example of the pattern.
example_url = INDEED_URL_TEMPLATE.format("Software+Engineer", "New+York", 0)
print(example_url)

http://www.indeed.com/jobs?q=Software+Engineer&l=New+York&start=0


### Query Params in Indeed URL
- q= refers to the query, usually the job title and salary you want
- l= refers to the location, usually the city or state
- start= refers to the offset of the first result shown; e.g., with start=10 you are viewing results 11-20.

In [64]:
# Fetch Page Information for Indeed Search
query = "Software+Engineer"
cities = ["California"]
max_results_per_city = 10
null_value = "NA"
# Seconds to wait for a response before giving up, so an unresponsive server
# cannot hang the kernel indefinitely.
request_timeout = 30

# Collect one dict per posting and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
records = []
for city in cities:
    # The `start` query parameter is advanced in steps of 10 results.
    for start in range(0, max_results_per_city, 10):
        url = INDEED_URL_TEMPLATE.format(query, city, start)
        html = requests.get(url, timeout=request_timeout).text
        soups = bs4.BeautifulSoup(html, "html.parser")
        rows = soups.find_all('div', attrs={'class': 'row'})
        for posting in rows:
            records.append({
                # The searched city stands in when a posting has no location.
                "location": extract_location(posting, city),
                "company": extract_company(posting, null_value),
                "job_title": extract_job_title(posting, null_value),
                "salary": extract_salary(posting, null_value),
                "url": extract_url(posting, null_value),
                # "summary": extract_summary(posting, null_value)
            })

# Explicit columns keep the schema stable even when nothing was scraped, so
# the cleaning cell can still address `company`, `salary` and `url`.
df = pd.DataFrame(
    records,
    columns=["location", "company", "job_title", "salary", "url"],
)

In [70]:
# Clean fetched Data
df.drop_duplicates(inplace=True)  # dropping duplicates

# Assign the cleaned columns back instead of calling replace(inplace=True) on
# df.company / df.salary: an in-place call on a column pulled out of the frame
# is not guaranteed to write through to `df` on recent pandas versions.
df['company'] = df['company'].replace(regex=True, to_replace=["\n", "\r"], value="")  # getting rid of \n and \r in company
# Raw string for the regex so `\$` is not an invalid Python escape sequence.
df['salary'] = df['salary'].replace(regex=True, to_replace=["\n", "\r", r"\$"], value="")  # getting rid of $ in salary

# Splitting up h and the rest of the url because url strings mess with jupyter notebook's formatting.
url_prefix = "h" + "ttps://www.indeed.com/viewjob?jk="
# Only prefix values that are not full URLs yet, so re-running this cell does
# not stack the prefix a second time.
already_prefixed = df['url'].str.startswith(url_prefix, na=False)
df['url'] = df['url'].where(already_prefixed, url_prefix + df['url'])

# The summary column only exists when the fetch cell collects it; skip the
# cleanup instead of raising AttributeError when it is absent.
if 'summary' in df.columns:
    df['summary'] = df['summary'].replace(regex=True, to_replace=[r'\.\.\.'], value="")  # getting rid of ellipses in summary

Data is now cleaned and ready for processing

In [67]:
# Write the scraped postings to a CSV named after the query plus a local
# timestamp, so each run produces its own file.
timestr = time.strftime("%Y_%m_%d-%H%M%S", time.localtime())
df.to_csv(
    path_or_buf=f"{query}-{timestr}.csv",
    sep=',',
    encoding='utf-8',
)

In [9]:
# Reload a previously saved scrape; the first CSV column is the saved index.
data = pd.read_csv(
    filepath_or_buffer="Software+Engineer-2018_11_24-030232.csv",
    index_col=0,
)

## Getting Data Per Posting Page (WIP)

In [15]:
# Fetch and parse the page of the first posting in `data`.
url = data['url'].iloc[0]
# Bound the wait so an unresponsive server cannot hang the kernel.
html = requests.get(url, timeout=30).text
soups = bs4.BeautifulSoup(html, "html.parser")
print(url)

https://www.indeed.com/viewjob?jk=19859f4e3e579ebb


In [83]:
# Look up a <div> by its full class string; presumably the posting's outer
# container (TODO confirm). `main_content` is not read again in this cell.
main_content = soups.find('div', {'class': "jobsearch-JobComponent icl-u-xs-mt--sm jobsearch-JobComponent-bottomDivider"})
# Look up the <div> holding the job description, again by full class string.
# NOTE(review): find() returns None when nothing matches, in which case the
# next line raises AttributeError — confirm the class string still matches.
job_description = soups.find('div', {'class': "jobsearch-JobComponent-description icl-u-xs-mt--md"})
# Flatten the description to plain text: each text fragment is stripped and
# the fragments are joined with two spaces. As the cell's last expression,
# the resulting string is displayed as the cell output.
job_description.get_text("  ", strip=True).strip()
# --- WIP (disabled) ---------------------------------------------------------
# Sketch for splitting the description into sections keyed by its <b> headers:
# remove every <br>, then for each <b> collect the text of the siblings that
# follow it until the next <b> (or the end), and pretty-print the result.
# i=0
# for elem in job_description.find_all('br'):
#     elem.decompose()
# sections = {}
# for header in job_description.find_all('b'):
#     nextNode = header
#     header_section = nextNode.get_text(".  ", strip=True).strip()
#     sections[header_section] = ""
#     while True:
#         nextNode = nextNode.nextSibling
#         if nextNode is None:
#             break
#         if isinstance(nextNode, bs4.NavigableString):
#             sections[header_section] += nextNode.strip() + " "
#         elif isinstance(nextNode, bs4.Tag):
#             if nextNode.name == "b":
#                 break
#             sections[header_section] += nextNode.get_text(".  ", strip=True).strip() + " "
# from pprint import pprint
# pprint(sections)

"Desired:  Big Data  Ruby  MongoDB  Software Engineer-Ruby  We are a team of engineers focused on a building a performant, self-service ad platform that enables small and medium sized businesses to drive more sales and conversions through retargeting lost visitors on Facebook and across the web.  The perfect candidate to join our team would enjoy database design, application logic, and standards-based front ends. You will have a chance to explore cutting edge big data technologies in the context of a modern Rails web application.  This is a chance to work on every facet of a successful web app and grow your engineering skill set.  What we are looking for:  (2 Engineers)  3+ and 5+ years of engineering experience  Experience with: Rails, Ruby, MongoDB or other Database  Excellent at communicating and collaborating (ready to discuss features, bugs, architecture, etc.)  Knowledge and interest around database technologies and Big Data  Compensation :  This Is an on-site, full-time salary p

In [114]:
# Download every posting's description, write it to its own text file and
# record that file's name in the `desc` column of `data`.
x, y = data.shape  # row and column counts of the loaded frame
filename = "Software+Engineer-2018_11_24-030232"
# Seconds to wait for a response, so one unresponsive page cannot hang the loop.
request_timeout = 30
description_class = "jobsearch-JobComponent-description icl-u-xs-mt--md"

# Iterate over the frame's own index labels rather than range(x): the index
# comes from the CSV and need not be 0..n-1, so data['url'][i] could raise
# KeyError and data.loc[i, 'desc'] could silently add a new row.
for i, url in data['url'].items():
    if not isinstance(url, str):
        # Missing URL (e.g. NaN): nothing to download for this row.
        data.loc[i, 'desc'] = None
        continue
    html = requests.get(url, timeout=request_timeout).text
    soups = bs4.BeautifulSoup(html, "html.parser")
    job_description = soups.find('div', {'class': description_class})
    if job_description is None:
        # The page has no matching description block; leave this row's desc
        # empty rather than aborting the whole loop with AttributeError.
        data.loc[i, 'desc'] = None
        continue
    description = job_description.get_text("  ", strip=True).strip()
    desc_filename = f'{filename}_{i}.txt'
    with open(desc_filename, 'w', encoding='utf-8') as the_file:
        the_file.write(description)
    data.loc[i, 'desc'] = desc_filename

In [127]:
# Persist `data`, the frame that received the `desc` column in the loop over
# postings, back to the CSV it was loaded from. The original wrote `df`, which
# in the visible cells never gets a `desc` column.
data.to_csv(f'{filename}.csv', sep=',', encoding='utf-8')

In [128]:
# Read the saved CSV back (first column as the index) and display it to
# check what was written.
pd.read_csv(
    filepath_or_buffer=f'{filename}.csv',
    index_col=0,
)

Unnamed: 0,company,job_title,location,salary,url,desc
0,Marin Software,Software Engineer-Ruby on Rails,"San Francisco, CA 94105 (Financial District area)",,https://www.indeed.com/viewjob?jk=19859f4e3e57...,Software+Engineer-2018_11_24-030232_0.txt
1,Tynker,Software Engineer - Game Development,"Mountain View, CA",,https://www.indeed.com/viewjob?jk=a528ae91f7c9...,Software+Engineer-2018_11_24-030232_1.txt
2,Laserfiche in partnership with Indeed ...,Software Engineer/Web Developer,"Long Beach, CA",,https://www.indeed.com/viewjob?jk=1e539366cc3e...,Software+Engineer-2018_11_24-030232_2.txt
3,Grand Rounds,Software Engineer - Data Engineering,"San Francisco, CA 94107 (South Of Market area)",,https://www.indeed.com/viewjob?jk=f2982cf91deb...,Software+Engineer-2018_11_24-030232_3.txt
4,"Yardi Systems, Inc.",Software Development Engineer I,"Santa Barbara, CA",,https://www.indeed.com/viewjob?jk=ba33eb20d052...,Software+Engineer-2018_11_24-030232_4.txt
5,Court Appearance Professionals,Software Engineer,"Santa Fe Springs, CA 90670","80,000 - 95,000 a year",https://www.indeed.com/viewjob?jk=0d71a3f968c9...,Software+Engineer-2018_11_24-030232_5.txt
6,Google,"Software Test Engineer, Devices, Verily Life S...",California,,https://www.indeed.com/viewjob?jk=4fdc702a46bb...,Software+Engineer-2018_11_24-030232_6.txt
7,Snowflake Computing,Software Engineer - University Graduate (Entry...,California,,https://www.indeed.com/viewjob?jk=f878758636a1...,Software+Engineer-2018_11_24-030232_7.txt
8,Google,"Software Engineer, Verily Life Sciences - Sout...",California,,https://www.indeed.com/viewjob?jk=533f85780bfb...,Software+Engineer-2018_11_24-030232_8.txt
9,Google,"Software Test Engineer, Mobile and Web Applica...",California,,https://www.indeed.com/viewjob?jk=1bf9aaf7196d...,Software+Engineer-2018_11_24-030232_9.txt


In [126]:
# Remove every column whose name matches the regex 'summary', keeping the
# remaining columns in their existing order.
summary_columns = list(df.filter(regex='summary'))
kept_columns = df.columns.drop(summary_columns)
df = df[kept_columns]