In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import pathlib

In [2]:
# Cities to search. This is a set, so the order in which the cities are
# visited when iterating is not guaranteed.
# (For a quick trial run, temporarily shrink this to a couple of cities.)
DEFAULT_CITIES = {
    'Atlanta',
    'Austin',
    'Chicago',
    'Dallas',
    'Denver',
    'Houston',
    'Los Angeles',
    'Miami',
    'New York',
    'Philadelphia',
    'Phoenix',
    'Pittsburgh',
    'Portland',
    'San Francisco',
    'Seattle',
}

In [4]:
def extract_company(result):
    """Return the company name found in one search-result element.

    Args:
        result: An element exposing a BeautifulSoup-style ``find`` method
            (in this notebook, one ``div`` with class ``result``).

    Returns:
        The whitespace-stripped text of the first ``<span class="company">``
        inside ``result``, or ``None`` when no such span is present.
    """
    company_tag = result.find('span', class_='company')
    if company_tag is None:
        # A result without a company span is reported as missing data.
        # Checking explicitly (instead of a bare ``except``) lets real errors
        # and Ctrl-C / KeyboardInterrupt propagate.
        return None
    return company_tag.get_text().strip()

In [5]:
def extract_title(result):
    """Return the job title found in one search-result element.

    Args:
        result: An element exposing a BeautifulSoup-style ``find`` method
            (in this notebook, one ``div`` with class ``result``).

    Returns:
        The value of the ``title`` attribute of the first ``<a>`` tag whose
        ``data-tn-element`` attribute equals ``"jobTitle"``, or ``None`` when
        that link (or its ``title`` attribute) is absent.
    """
    title_link = result.find('a', attrs={'data-tn-element': "jobTitle"})
    if title_link is None:
        # No job-title link in this result: report missing data. An explicit
        # check (instead of a bare ``except``) lets real errors and
        # KeyboardInterrupt propagate.
        return None
    return title_link.get('title')

In [6]:
def extract_star(result):
    """Return the company's star rating, as text, from one search-result element.

    The rating is read from the text of the first
    ``<span class="ratingsContent">`` inside ``result``.

    Args:
        result: An element exposing a BeautifulSoup-style ``find`` method
            (in this notebook, one ``div`` with class ``result``).

    Returns:
        The cleaned rating text (e.g. ``'4.1'``) as a ``str``, or ``None``
        when the result has no rating span. Conversion to a number is left
        to the caller.
    """
    rating_tag = result.find('span', class_='ratingsContent')
    if rating_tag is None:
        # No rating span in this result: report missing data. An explicit
        # check (instead of a bare ``except``) lets real errors and
        # KeyboardInterrupt propagate.
        return None
    rating_text = rating_tag.get_text()
    # Remove literal pairs of double-quote characters and newlines (as the
    # original cleanup did), then trim any surrounding whitespace so only the
    # number remains.
    return rating_text.replace('""', '').replace('\n', '').strip()

In [8]:
# Basic parameters for the requests.get calls made by the scraping loop.
# The endpoint is addressed over HTTPS directly: the recorded run shows every
# request ending up on the https:// address anyway, so this avoids sending
# each query over plain HTTP first.
url = "https://www.indeed.com/jobs"
# Query parameters shared by every request: 'q' is the search phrase and
# 'radius' is passed through as-is (presumably a search radius around the
# city - confirm the unit against the site).
params = {'q': 'data scientist', 'radius': '100'}
# Exclusive upper bound for the 'start' paging offset used by the scraping loop
max_results = 100

In [11]:
# Scrape every (city, page) combination and collect the results.
#   company_info_df   - rows of the most recently scraped page (also appended
#                       to a CSV file for debugging)
#   company_result_df - all scraped rows, built once after the loop
company_info_df = pd.DataFrame()
page_frames = []

# The per-page debug CSV is written inside the loop, so make sure its folder
# exists before the first request is made.
pathlib.Path('../CSV_files').mkdir(parents=True, exist_ok=True)

# Loop through default cities list
for city in DEFAULT_CITIES:
    # Loop through the different pages ('start' advances by 10 per page)
    for start in range(0, max_results, 10):
        url_params = params.copy()
        # Update the city and page value for each request.get
        url_params.update({'l': city, 'start': start})
        scraped_data = {
                    'company': [],
                    'title': [],
                    'star': []}
        # A timeout keeps a single stalled connection from hanging the whole run
        response = requests.get(url, params=url_params, timeout=30)
        soup = BeautifulSoup(response.text, 'lxml')
        results = soup.find_all('div', class_='result')

        # Brief pause between requests, then print the final URL for debugging
        time.sleep(0.05)
        print(response.url)
        print('------------------------------------------------------------------------')

        # Loop through the scraped result elements and store their fields in the dict
        for result in results:
            scraped_data['company'].append(extract_company(result))
            scraped_data['title'].append(extract_title(result))
            scraped_data['star'].append(extract_star(result))

        # Build this page's frame once and reuse it for both the in-memory
        # collection and the debug CSV
        company_info_df = pd.DataFrame(scraped_data)
        page_frames.append(company_info_df)

        # Append this page's rows to a separate CSV file for debugging purposes
        # (append mode, no header: re-running the cell adds rows again)
        company_info_df.to_csv('../CSV_files/company_info.csv', mode='a', header=False, index=False)

# Combine all pages in one step. DataFrame.append was removed in pandas 2.0,
# and growing a frame inside the loop copies it on every iteration.
company_result_df = (
    pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()
)

https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Philadelphia&start=0
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Philadelphia&start=10
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Philadelphia&start=20
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Philadelphia&start=30
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Philadelphia&start=40
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Philadelphia&start=50
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Philadelphia&start=60
---

https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Houston&start=50
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Houston&start=60
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Houston&start=70
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Houston&start=80
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Houston&start=90
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Los+Angeles&start=0
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Los+Angeles&start=10
------------------------------

https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=10
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=20
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=30
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=40
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=50
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=60
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=70
----------------

In [12]:
# Preview the first five scraped rows
company_result_df.head(n=5)


Unnamed: 0,location,company,title,salary,star
0,,The American College of Radiology,Clinical Data Manager,,
1,,Noom Inc.,Senior Data Scientist (Remote-friendly),,4.1
2,,North America Service Center for Chinese Stude...,房产咨询顾问/Real Estate Consultant/ Real Estate Sal...,"$80,000 - $150,000 a year",
3,"Philadelphia, PA 19103",Boxplot,Data Scientist,"$30,000 a year",
4,"Philadelphia, PA 19146 (Graduate Hospital area)",Children's Hospital of Philadelphia,Data Scientist II - Cell & Gene Therapy,,4.0


In [13]:
# Count the scraped job postings for each company.
# The column names are set explicitly instead of renaming whatever
# reset_index() produces: pandas 1.x yields 'index'/'company' while
# pandas >= 2.0 yields 'company'/'count', so a fixed rename mapping only works
# on one of them (and creates two 'count' columns on the other).
company_count_df = (
    company_result_df['company']
    .value_counts()
    .rename_axis('company')
    .reset_index(name='count')
)
company_count_df

Unnamed: 0,company,count
0,"Perceptronics Solutions, Inc",112
1,i-Pharm Consulting,73
2,USAA,71
3,TuSimple,43
4,Amazon.com Services LLC,41
...,...,...
815,"KeHE Distributors, LLC",1
816,Smiths-Digital Forge,1
817,TopSpot Internet Marketing,1
818,Viber,1


In [14]:
# Average the rating for each company.
# Ratings are scraped as text (or None), so convert the column to float first;
# missing ratings become NaN.
company_result_df['star'] = company_result_df['star'].astype(float)
# Average only the 'star' column: calling mean() on the whole grouped frame
# raises a TypeError on text columns such as 'title' in pandas >= 2.0.
company_rating_df = (
    company_result_df
    .groupby('company')['star']
    .mean()
    .reset_index()
)
company_rating_df

Unnamed: 0,company,star
0,#TeamGoHealth,3.0
1,0x,
2,1-800-Flowers,3.7
3,132 ENG Inc.,
4,9Rooftops,
...,...,...
815,setld,
816,thredUP Inc,2.7
817,true[x],
818,viagogo,2.9


In [15]:
# Join the per-company average rating with the per-company posting count
# (inner join on the shared 'company' column)
company_final_df = pd.merge(company_rating_df, company_count_df, on='company')
company_final_df

Unnamed: 0,company,star,count
0,#TeamGoHealth,3.0,1
1,0x,,1
2,1-800-Flowers,3.7,7
3,132 ENG Inc.,,2
4,9Rooftops,,1
...,...,...,...
815,setld,,1
816,thredUP Inc,2.7,2
817,true[x],,1
818,viagogo,2.9,3


In [17]:
# Write the combined rating/count table to disk, replacing any earlier export;
# the header row is kept and the row index is left out
rating_csv_path = '../CSV_files/company_rating.csv'
company_final_df.to_csv(rating_csv_path, index=False, header=True, mode='w')