In [2]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time
import pathlib

In [3]:
# Cities iterated by the scraping loop. For a quick smoke test, temporarily
# shrink this to a couple of entries, e.g. {'New York', 'Chicago'}.
DEFAULT_CITIES = {
    'New York', 'Chicago', 'San Francisco', 'Austin', 'Seattle',
    'Los Angeles', 'Philadelphia', 'Atlanta', 'Dallas',
    'Pittsburgh', 'Portland', 'Phoenix', 'Denver', 'Houston',
    'Miami',
}

# Additional cities. Each name is listed once: this is a set, so a repeated
# literal would be silently collapsed anyway.
YOUR_CITIES = {
    'Boston', 'Washington DC', 'St Louis', 'San Diego',
    'San Antonio', 'Columbus', 'Sacramento', 'Charlotte', 'Memphis',
    'Detroit', 'Nashville', 'Jacksonville', 'Indianapolis',
    'Fort Worth', 'El Paso', 'Oklahoma City',
    'Las Vegas', 'Louisville', 'Milwaukee', 'Albuquerque', 'Tucson',
    'Kansas City', 'Mesa', 'Colorado Springs', 'Raleigh', 'Omaha',
    'Virginia Beach', 'Minneapolis', 'New Orleans', 'Tampa',
    'San Jose', 'Baltimore', 'Fresno', 'Oakland', 'Tulsa', 'Madison',
    'Arlington', 'Wichita', 'Cleveland', 'Aurora', 'Honolulu',
    'Orlando', 'Anchorage', 'Des Moines', 'Salt Lake City',
    'Lexington', 'Cincinnati', 'Newark', 'Durham', 'Buffalo',
    'Baton Rouge', 'Richmond', 'Boise', 'Birmingham', 'Little Rock',
    'Grand Rapids', 'Worcester', 'Providence', 'Sioux Falls',
    'Jackson', 'Hartford', 'Bridgeport', 'Jersey City', 'Charleston',
    'Billings', 'Fargo', 'Augusta',
}

In [4]:
def extract_location(result):
    """Extract the job location from one search-result element.

    Looks up the first ``<span class="location">`` under ``result`` and
    returns its text with surrounding whitespace stripped.

    Returns ``None`` when the span is absent (``find`` returned ``None``).
    """
    try:
        return result.find('span', class_='location').get_text().strip()
    except AttributeError:
        # Only the "element not found" case is treated as missing data;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

In [5]:
def extract_company(result):
    """Extract the company name from one search-result element.

    Looks up the first ``<span class="company">`` under ``result`` and
    returns its text with surrounding whitespace stripped.

    Returns ``None`` when the span is absent (``find`` returned ``None``).
    """
    try:
        return result.find('span', class_='company').get_text().strip()
    except AttributeError:
        # Only the "element not found" case is treated as missing data;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

In [6]:
def extract_title(result):
    """Extract the job title from one search-result element.

    Looks up the first ``<a>`` whose ``data-tn-element`` attribute is
    ``"jobTitle"`` and returns the value of its ``title`` attribute.

    Returns ``None`` when the anchor is absent (``find`` returned ``None``)
    or when the anchor has no ``title`` attribute.
    """
    try:
        return result.find('a', attrs={'data-tn-element': "jobTitle"}).get('title')
    except AttributeError:
        # Only the "element not found" case is treated as missing data;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

In [7]:
def extract_star(result):
    """Extract the company's star rating, as text, from one search result.

    Reads the text of the first ``<span class="ratingsContent">`` under
    ``result`` and removes newline characters (and any literal ``""`` pair)
    from it. The value is returned as a string; a later cell converts the
    column to float.

    Returns ``None`` when the span is absent (``find`` returned ``None``).
    """
    try:
        rating_text = result.find('span', class_='ratingsContent').get_text()
    except AttributeError:
        # Only the "element not found" case is treated as missing data;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None
    # The rating is wrapped in newlines in the markup; keep only the number.
    return rating_text.replace('""', '').replace('\n', '')

In [8]:
def extract_salary(result):
    """Extract the advertised salary text from one search-result element.

    Looks up the first ``<span class="salaryText">`` under ``result`` and
    returns its text with surrounding whitespace stripped.

    Returns ``None`` when the span is absent (``find`` returned ``None``).
    """
    try:
        return result.find('span', class_='salaryText').get_text().strip()
    except AttributeError:
        # Only the "element not found" case is treated as missing data;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

In [11]:
# Search endpoint. Uses https directly: the plain-http address is redirected
# to https on every request (see the logged response URLs).
url = "https://www.indeed.com/jobs"
# Query parameters shared by every request; the city ('l') and paging offset
# ('start') are added per request by the scraping loop.
params = {'q': 'data scientist', 'radius': '100'}
# Exclusive upper bound on the 'start' offset requested for each city.
max_results = 100

In [12]:

# Scrape every results page for every city.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0, and growing a frame page by
# page copies all previous rows on every iteration.

# Column order of each scraped row; also the column order of the CSV rows.
RESULT_COLUMNS = ['location', 'company', 'title', 'salary', 'star']

company_info_df = pd.DataFrame()  # holds the most recently scraped page
all_rows = []

for city in DEFAULT_CITIES:
    for start in range(0, max_results, 10):
        url_params = {**params, 'l': city, 'start': start}
        # A timeout keeps one stalled connection from hanging the whole run;
        # requests raises requests.exceptions.Timeout if it is exceeded.
        response = requests.get(url, params=url_params, timeout=30)
        soup = BeautifulSoup(response.text, 'lxml')
        results = soup.find_all('div', class_='result')

        # Brief pause between requests.
        time.sleep(0.05)
        print(response.url)
        print('------------------------------------------------------------------------')

        page_rows = [
            {
                'location': extract_location(result),
                'company': extract_company(result),
                'title': extract_title(result),
                'salary': extract_salary(result),
                'star': extract_star(result),
            }
            for result in results
        ]
        all_rows.extend(page_rows)

        # Append this page to the CSV immediately so partial progress survives
        # an interrupted run. NOTE: the file is opened in append mode without a
        # header, so re-running this cell adds the same rows again.
        company_info_df = pd.DataFrame(page_rows, columns=RESULT_COLUMNS)
        company_info_df.to_csv('../csv/company_info.csv', mode='a', header=False, index=False)

# One row per scraped posting, across all cities and pages.
company_result_df = pd.DataFrame(all_rows, columns=RESULT_COLUMNS)

https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=0
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=10
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=20
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=30
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=40
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=50
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Pittsburgh&start=60
-----------------

https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Houston&start=60
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Houston&start=70
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Houston&start=80
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Houston&start=90
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Denver&start=0
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Denver&start=10
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Denver&start=20
-----------------------------------------

https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Miami&start=10
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Miami&start=20
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Miami&start=30
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Miami&start=40
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Miami&start=50
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Miami&start=60
------------------------------------------------------------------------
https://www.indeed.com/jobs?q=data+scientist&radius=100&l=Miami&start=70
---------------------------------------------------

In [13]:
# Preview the first rows of the combined scrape results.
company_result_df.head()


Unnamed: 0,location,company,title,salary,star
0,,"YinzCam, Inc.",Senior Data Analyst,,4.0
1,,Software Engineering Institute,Senior Research Scientist-Machine Learning,,4.3
2,"Coraopolis, PA 15108",DICK'S Sporting Goods,Data Scientist,,3.5
3,"Pittsburgh, PA 15213 (Squirrel Hill North area)",Carnegie Mellon University,Machine Learning Fall Internships,,4.2
4,"Pittsburgh, PA 15207 (Hays area)","Dagostino Electronic Services, Inc.",Chief Data Scientist,,


In [24]:
# Number of scraped postings per company, most frequent first.
#
# The index and value names are set explicitly instead of renaming the columns
# produced by reset_index(): those default names differ between pandas
# versions (pandas >= 2.0 already yields 'company'/'count', so renaming
# 'company' -> 'count' there would create two 'count' columns).
company_count_df = (
    company_result_df['company']
    .value_counts()
    .rename_axis('company')
    .reset_index(name='count')
)
company_count_df

Unnamed: 0,company,count
0,"MORI Associates, Inc.",108
1,"Perceptronics Solutions, Inc",88
2,The National Board of Boiler & PV Inspectors,80
3,Amazon.com Services LLC,45
4,TuSimple,41
...,...,...
817,Danone,1
818,Vinli,1
819,Sift,1
820,Careers | West Virginia University,1


In [25]:
# Mean star rating per company.
#
# The scraped ratings are strings (or None), so convert them to float first;
# missing ratings become NaN.
company_result_df['star'] = company_result_df['star'].astype(float)
# Average only the 'star' column: the other columns hold text, and
# groupby(...).mean() over the whole frame raises TypeError on pandas >= 2.0
# instead of silently dropping them.
company_rating_df = (
    company_result_df
    .groupby('company')['star']
    .mean()
    .reset_index()
)
company_rating_df

Unnamed: 0,company,star
0,#TeamGoHealth,3.0
1,0x,
2,1-800-Flowers,3.7
3,132 ENG Inc.,
4,"7-Eleven, Inc.",3.6
...,...,...
817,setld,
818,thredUP Inc,2.7
819,true[x],
820,viagogo,2.9


In [27]:
# Combine the per-company mean rating with the per-company posting count,
# matching rows on the shared 'company' column (default inner join).
company_final_df = pd.merge(
    company_rating_df,
    company_count_df,
    on='company',
)
company_final_df

Unnamed: 0,company,star,count
0,#TeamGoHealth,3.0,2
1,0x,,1
2,1-800-Flowers,3.7,5
3,132 ENG Inc.,,2
4,"7-Eleven, Inc.",3.6,1
...,...,...,...
817,setld,,1
818,thredUP Inc,2.7,1
819,true[x],,1
820,viagogo,2.9,1


In [28]:
# Write the per-company ratings to disk, replacing any existing file and
# leaving out the positional index.
# NOTE(review): company_final_df (rating + count) is built earlier but is not
# saved here — confirm that exporting only the ratings is intended.
company_rating_df.to_csv(
    '../csv/company_rating.csv',
    index=False,
    header=True,
    mode='w',
)