# STEP 1 - WEB SCRAPING

### The results of Step 1.1 are three CSV files:

* 'job_ca_statistics.csv'

* 'job_ca_chemical engineering.csv'

* 'job_ca_materials science.csv'

### The results of Step 1.2 are three CSV files:

* 'summary_stat.csv'

* 'summary_che.csv'

* 'summary_mse.csv'

## Step 1.1 - Using API to get Job Posts

In [1]:
import requests
import requests_cache
requests_cache.install_cache("cache")
import pandas as pd
from bs4 import BeautifulSoup

from matplotlib import pyplot as plt
import string
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer   # sklearn --- primer machine learning package
from sklearn.neighbors import NearestNeighbors

In [2]:
# Indeed publisher ID; used as the default `publisher_id` argument of
# get_indeed and sent as the "publisher" request parameter.
# NOTE(review): this looks like an API credential committed in plain text —
# consider loading it from configuration or an environment variable.
publisher_id = "254802918413674"

http://api.indeed.com/ads/apisearch?publisher=254802918413674&q=java&l=austin%2C+tx&sort=&radius=&st=&jt=&start=&limit=&fromage=&filter=&latlong=1&co=us&chnl=&userip=1.2.3.4&useragent=Mozilla/%2F4.0%28Firefox%29&v=2

An example of a complete request URL: url = "http://api.indeed.com/ads/apisearch?publisher=254802918413674&format=json&q=data&l=sunnyvale%2C+ca&sort=&radius=0&st=&jt=internship&start=&limit=1000&fromage=30&filter=&latlong=1&co=us&chnl=&userip=1.2.3.4&useragent=Mozilla/%2F4.0%28Firefox%29&v=2"

In [3]:
def get_indeed(city, state, keyword, radius, jobtype="fulltime", start=0, publisher_id=publisher_id):
    """Query the Indeed job-search API and return one page of results.

    Input:  city --------- city name; pass "" (or None) to search by state only
            state -------- state, e.g. 'ca'
            keyword ------ search query, sent as the "q" parameter
            radius ------- sent as the "radius" parameter
            jobtype ------ "fulltime", "parttime", "contract", "internship"
                           or "temporary"
            start -------- offset of the first result, used for paging
            publisher_id - Indeed publisher ID sent with the request
    Output: the value stored under the 'results' key of the JSON response
    Raises: requests.HTTPError if the API answers with a 4xx/5xx status
    """
    url = "http://api.indeed.com/ads/apisearch?"

    # A missing city means the location is just the state.
    location = city + ', ' + state if city else state

    params = {
        "q": keyword,
        "v": 2,
        "format": "json",
        "publisher": publisher_id,
        "l": location,
        "jt": jobtype,
        "radius": radius,
        "start": start,
        # NOTE(review): create_job_list pages in steps of 25, which suggests
        # the API may return fewer than 100 rows per call — confirm.
        "limit": 100,
        "latlong": 1,
    }
    response = requests.get(url, params=params)

    # Fail loudly on an HTTP error instead of trying to decode an error page
    # and hitting an unrelated JSON/KeyError further down.
    response.raise_for_status()
    return response.json()['results']

In [4]:
def create_job_list(major):
    '''This function creates the job list of a major and saves the dataframe as a csv file
    ('job_ca_<major>.csv' in the current working directory).
    Input:  major ----- "statistics", "chemical engineering", "materials science"
    Output: shape ----- shape of the dataframe
            job_list -- a dataframe of the job list of the specified major
                        (empty if the API returned no posts at all)
    '''
    # Collect the pages first and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    pages = []
    for start in range(0, 5000, 25):
        page = pd.DataFrame(get_indeed('', 'ca', major, 0, start=start))
        if not page.empty:
            pages.append(page)

    if pages:
        job_list = pd.concat(pages, ignore_index=True)
        # Pages can repeat posts, so keep one row per job key.
        job_list = job_list.drop_duplicates(subset=['jobkey'])
    else:
        # No results: there is no 'jobkey' column to de-duplicate on.
        job_list = pd.DataFrame()

    filename = 'job_ca_' + major + '.csv'
    job_list.to_csv(filename)
    return job_list.shape, job_list

In [5]:
# Build the job list for each major; every call returns (shape, dataframe)
# and also writes 'job_ca_<major>.csv' as a side effect.
shape_stat, stat_joblist = create_job_list("statistics")
shape_che, che_joblist = create_job_list("chemical engineering")
# NOTE(review): `mst_joblist` does not follow the `mse` prefix used by
# `shape_mse` — possibly a typo; check later cells before renaming.
shape_mse, mst_joblist = create_job_list("materials science")

## Step 1.2 - Webscraping each post

In [8]:
def csv2job_url_list(major):
    '''Load the saved job list of a major and return the urls of its posts.
       These urls are what the detailed job descriptions are scraped from.
    Input:  major ----- "statistics", "chemical engineering", "materials science"
    Output: url_list -- list object, one url per row of 'job_ca_<major>.csv'
    '''
    csv_path = 'job_ca_{}.csv'.format(major)
    table = pd.read_csv(csv_path, index_col=False)
    url_column = table['url']
    return list(url_column)

In [9]:
def get_post_details(url):
    """
    Download one job post and pull out its key fields.
    Input is the url of the post.
    Output is a dictionary with keys 'title', 'company' and 'description';
    title and company are '' when the matching element is not on the page,
    description is whatever element has id 'job_summary' (None if absent).
    """
    def text_or_blank(node):
        # find() gives None when nothing matches; report that as ''.
        return '' if node is None else node.text

    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    title_node = soup.find('b', attrs={'class': 'jobtitle'})
    company_node = soup.find(attrs={'class': 'company'})
    summary_node = soup.find(attrs={'id': 'job_summary'})

    return {
        'title': text_or_blank(title_node),
        'company': text_or_blank(company_node),
        'description': summary_node,
    }

## The following 3 cells take a very long time to run (one HTTP request per job post). Be careful!

In [10]:
# Scrape every statistics post and store the combined table as a csv file.
stat_url_list = csv2job_url_list("statistics")
stat_summary = pd.DataFrame(list(map(get_post_details, stat_url_list)))
stat_summary.head()
stat_summary.to_csv('summary_stat.csv')

In [18]:
# Scrape every chemical engineering post and store the table as a csv file.
che_url_list = csv2job_url_list("chemical engineering")
che_summary = pd.DataFrame(list(map(get_post_details, che_url_list)))
che_summary.head()
che_summary.to_csv('summary_che.csv')

In [27]:
# Scrape every materials science post, store the table, then preview it.
mse_url_list = csv2job_url_list("materials science")
mse_summary = pd.DataFrame(list(map(get_post_details, mse_url_list)))
mse_summary.to_csv('summary_mse.csv')
mse_summary.head()

Unnamed: 0,company,description,title
0,Lawrence Livermore National Laboratory,"<span class=""summary"" id=""job_summary"">Materia...",Materials Scientist/Physicist
1,Intel,"<span class=""summary"" id=""job_summary"">Job Des...",R&D Process Engineer
2,Tesla Motors,"<span class=""summary"" id=""job_summary"">Failure...",Reliability Engineer - Failure Analysis
3,SpaceX,"<span class=""summary"" id=""job_summary"">SpaceX ...",Materials Engineer (Avionics)
4,Google,"<span class=""summary"" id=""job_summary"">Our com...","Reliability Engineer, Failure Analysis and Mat..."
