Trying to update the Webscraping Indeed Notebook to Python 3

In [195]:
%load_ext autoreload
%autoreload 2
# API Calls
import requests
# Parse HTML
import bs4
# Handle Dataframes (excel data)
import pandas as pd
# Better Math functions
import numpy as np
# Plotting library
# import matplotlib as plt
# Time library to create timestamps for filenames
import time

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
# Functions that extract specific pieces of data from the HTML of an Indeed search-results page.
def extract_location(posting, null_value=None):
    """Return the location text of one search-result posting.

    Args:
        posting: Parsed posting element exposing a BeautifulSoup-style
            ``find`` method.
        null_value: Value returned when the posting has no
            ``<div class="location">`` element.

    Returns:
        The ``.text`` of the location element, or ``null_value`` when the
        element is missing.
    """
    try:
        return posting.find('div', {'class': 'location'}).text
    except AttributeError:
        # ``find`` returned None (no matching element) or ``posting`` is None.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return null_value

def extract_company(posting, null_value=None):
    """Return the company text of one search-result posting.

    Args:
        posting: Parsed posting element exposing a BeautifulSoup-style
            ``find`` method.
        null_value: Value returned when the posting has no
            ``<span class="company">`` element.

    Returns:
        The ``.text`` of the company element, or ``null_value`` when the
        element is missing.
    """
    try:
        return posting.find('span', {'class': 'company'}).text
    except AttributeError:
        # ``find`` returned None (no matching element) or ``posting`` is None.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return null_value

def extract_job_title(posting, null_value=None):
    """Return the job-title text of one search-result posting.

    Args:
        posting: Parsed posting element exposing a BeautifulSoup-style
            ``find`` method.
        null_value: Value returned when the posting has no
            ``<a data-tn-element="jobTitle">`` element.

    Returns:
        The ``.text`` of the job-title link, or ``null_value`` when the
        element is missing.
    """
    try:
        return posting.find('a', attrs={'data-tn-element': 'jobTitle'}).text
    except AttributeError:
        # ``find`` returned None (no matching element) or ``posting`` is None.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return null_value
    
def extract_salary(posting, null_value=None):
    """Return the salary text of one search-result posting.

    Args:
        posting: Parsed posting element exposing a BeautifulSoup-style
            ``find`` method.
        null_value: Value returned when the posting has no
            ``<span class="no-wrap">`` element.

    Returns:
        The ``.text`` of the salary element, or ``null_value`` when the
        element is missing.
    """
    try:
        return posting.find(name="span", attrs={"class": "no-wrap"}).text
    except AttributeError:
        # ``find`` returned None (no matching element) or ``posting`` is None.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return null_value

def extract_summary(posting, null_value=None):
    """Return the concatenated summary text of one search-result posting.

    Args:
        posting: Parsed posting element exposing a BeautifulSoup-style
            ``find_all`` method.
        null_value: Value returned when ``posting`` cannot be searched
            (for example when it is None).

    Returns:
        The stripped ``.text`` of every ``<span class="summary">`` element,
        joined without a separator. A posting with no summary spans yields
        the empty string, not ``null_value``.
    """
    try:
        # ``find_all`` is the current Beautiful Soup name for the legacy
        # ``findAll`` alias and matches the spelling used elsewhere in this notebook.
        spans = posting.find_all('span', attrs={'class': 'summary'})
        return "".join(span.text.strip() for span in spans)
    except AttributeError:
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return null_value

def extract_url(posting, null_value=None):
    """Return the ``data-jk`` attribute (job key) of one search-result posting.

    The value is only the key; a later cell turns it into a full URL by
    prepending the ``viewjob?jk=`` address.

    Args:
        posting: Parsed posting element exposing a ``get(attribute)`` method.
        null_value: Value returned when the attribute is absent or the
            posting cannot be queried.

    Returns:
        The ``data-jk`` attribute value, or ``null_value`` when it is missing.
    """
    try:
        job_key = posting.get('data-jk')
    except AttributeError:
        # ``posting`` is None or otherwise has no ``get`` method.
        return null_value
    # ``get`` signals a missing attribute by returning None rather than
    # raising, so the fallback has to be applied explicitly.
    return null_value if job_key is None else job_key

In [36]:
# Indeed search URL; the placeholders are, in order:
# query (q), location (l) and result offset (start).
INDEED_URL_TEMPLATE = "http://www.indeed.com/jobs?q={}&l={}&start={}"

# Show what a filled-in search URL looks like
example_search_url = INDEED_URL_TEMPLATE.format("Software+Engineer", "New+York", 0)
print(example_search_url)

http://www.indeed.com/jobs?q=Software+Engineer&l=New+York&start=0


### Query Params in Indeed URL
- q= refers to the query, usually the job title and salary you want
- l= refers to the location, usually the city or state
- start= refers to the offset of the first result shown, e.g. with start=10 you are viewing results 11-20.

In [91]:
# Fetch search-result pages from Indeed and collect one record per posting.
query = "machine+learning"
cities = ["New+York%2C+NY"]
max_results_per_city = 100
null_value = "NA"
results_per_page = 10  # the `start` offset advances one page (10 results) at a time
request_timeout = 10   # seconds to wait for each search page before giving up

# Accumulate plain dicts and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and re-copying the frame
# for every single row is quadratic.
records = []
for city in cities:
    for start in range(0, max_results_per_city, results_per_page):
        url = INDEED_URL_TEMPLATE.format(query, city, start)
        html = requests.get(url, timeout=request_timeout).text
        soups = bs4.BeautifulSoup(html, "html.parser")
        rows = soups.find_all('div', attrs={'class': 'row'})
        for posting in rows:
            records.append({
                # A posting without a location falls back to the (URL-encoded) search city.
                "location": extract_location(posting, city),
                "company": extract_company(posting, null_value),
                "job_title": extract_job_title(posting, null_value),
                "salary": extract_salary(posting, null_value),
                "url": extract_url(posting, null_value)
#                 "summary": extract_summary(posting, null_value)
            })

# Naming the columns keeps the alphabetical order the row-by-row append used to
# produce and guarantees they exist even when no posting was scraped.
df = pd.DataFrame(records, columns=["company", "job_title", "location", "salary", "url"])

In [92]:
# Clean fetched data
df = df.drop_duplicates().reset_index(drop=True)  # drop duplicate postings, renumber rows

# Results are assigned back to the column instead of calling an in-place method
# on an attribute-accessed column: that chained form does not update `df` when
# pandas Copy-on-Write is active. Raw strings keep `\$` a valid regex escape.
df["company"] = df["company"].replace(to_replace=[r"\n", r"\r"], value="", regex=True)  # strip \n and \r from company
df["salary"] = df["salary"].replace(to_replace=[r"\n", r"\r", r"\$"], value="", regex=True)  # strip \n, \r and $ from salary

# Splitting up h and the rest of the url because url strings mess with jupyter notebook's formatting.
VIEWJOB_URL_PREFIX = "h" + "ttps://www.indeed.com/viewjob?jk="
# Only rows still holding a bare job key get the prefix, so re-running this
# cell does not prepend it a second time.
is_bare_key = ~df["url"].astype(str).str.startswith(VIEWJOB_URL_PREFIX)
df.loc[is_bare_key, "url"] = VIEWJOB_URL_PREFIX + df.loc[is_bare_key, "url"]
# df.summary.replace(regex=True, inplace=True, to_replace=['\.\.\.'], value="") #getting rid of elipses in summary
df.index.values

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91], dtype=int64)

Data is now cleaned and ready for processing

In [93]:
# Persist the cleaned results to a CSV named after the query plus a timestamp
timestr = time.strftime("%Y_%m_%d-%H%M%S")
filename = "{}-{}".format(query, timestr)
df.to_csv(filename + ".csv", sep=",", encoding="utf-8")

In [94]:
# Read `<filename>.csv` back in, using its first column as the index.
# (To work from an earlier run instead, pass that file's path here,
#  e.g. "Software+Engineer-2018_11_24-030232.csv".)
data = pd.read_csv(filename + ".csv", index_col=0)

## Getting Data Per Posting Page (WIP)

In [15]:
# Download and parse the page of the first posting listed in `data`
url = data['url'].iloc[0]
response = requests.get(url)
html = response.text
soups = bs4.BeautifulSoup(html, "html.parser")

https://www.indeed.com/viewjob?jk=19859f4e3e579ebb


In [83]:
# Locate the posting's main container and its job-description block.
main_content = soups.find(
    'div',
    attrs={'class': "jobsearch-JobComponent icl-u-xs-mt--sm jobsearch-JobComponent-bottomDivider"},
)
job_description = soups.find(
    'div',
    attrs={'class': "jobsearch-JobComponent-description icl-u-xs-mt--md"},
)
# Uncomment to show the job description as one string:
# job_description.get_text("  ", strip=True).strip()

"Desired:  Big Data  Ruby  MongoDB  Software Engineer-Ruby  We are a team of engineers focused on a building a performant, self-service ad platform that enables small and medium sized businesses to drive more sales and conversions through retargeting lost visitors on Facebook and across the web.  The perfect candidate to join our team would enjoy database design, application logic, and standards-based front ends. You will have a chance to explore cutting edge big data technologies in the context of a modern Rails web application.  This is a chance to work on every facet of a successful web app and grow your engineering skill set.  What we are looking for:  (2 Engineers)  3+ and 5+ years of engineering experience  Experience with: Rails, Ruby, MongoDB or other Database  Excellent at communicating and collaborating (ready to discuss features, bugs, architecture, etc.)  Knowledge and interest around database technologies and Big Data  Compensation :  This Is an on-site, full-time salary p

In [95]:
# Fetch every posting's description, write each one to its own text file,
# and record the file names in a `desc` column.
def indeed_posting_scraper(data, filename, timeout=10):
    """Download each posting's description and save it to its own text file.

    Args:
        data: DataFrame with a ``url`` column holding the posting URLs.
        filename: Prefix of the output files; the description of the row at
            position ``i`` is written to ``{filename}_{i}.txt``.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        DataFrame indexed like ``data`` with a single ``desc`` column holding
        the path of the text file written for each row. The column exists
        even when ``data`` has no rows.
    """
    description_attrs = {'class': "jobsearch-JobComponent-description icl-u-xs-mt--md"}
    desc_filenames = []
    for i, url in enumerate(data['url']):
        html = requests.get(url, timeout=timeout).text
        soups = bs4.BeautifulSoup(html, "html.parser")
        job_description = soups.find('div', description_attrs)
        if job_description is None:
            # The page has no description block: write an empty file so one
            # odd posting does not abort the whole scrape, and so every row
            # still points at a readable file.
            description = ""
        else:
            description = job_description.get_text("  ", strip=True).strip()
        desc_filename = f'{filename}_{i}.txt'
        with open(desc_filename, 'w', encoding='utf-8') as the_file:
            the_file.write(description)
        desc_filenames.append(desc_filename)
    # Reusing data's index lets the caller assign the column straight back
    # onto `data`, whatever labels it uses.
    return pd.DataFrame({'desc': desc_filenames}, index=data.index)

# Scrape every posting and attach the resulting file paths as the `desc` column.
# (To reuse the files of an earlier run, first set e.g.
#  filename = "Software+Engineer-2018_11_24-030232")
desc_dataframe = indeed_posting_scraper(data, filename)
data = data.assign(desc=desc_dataframe['desc'])

In [96]:
# Overwrite the CSV so it now includes the `desc` column
data.to_csv(filename + '.csv', sep=',', encoding='utf-8')

In [108]:
# pd.read_csv(f'{filename}.csv', index_col=0)

In [126]:
# df = df[df.columns.drop(list(df.filter(regex='summary')))]

## Count Vectorizer on File
Find the most frequent words across the job postings.

In [97]:
# filename = "machine+learning-2018_11_24-185749"
# Load the saved postings (first CSV column is the index) and display them
job_info = pd.read_csv(filename + '.csv', index_col=0)
job_info

Unnamed: 0,company,job_title,location,salary,url,desc
0,Darwin Recruitment,Machine Learning Engineer,"Manhattan, NY",,https://www.indeed.com/viewjob?jk=caba65932c24...,machine+learning-2018_11_24-194831_0.txt
1,WeWork,"Manager, Machine Learning","New York, NY 10013 (Tribeca area)",,https://www.indeed.com/viewjob?jk=ce624f6bd070...,machine+learning-2018_11_24-194831_1.txt
2,BlackRock,Data & Analytics Specialist - Portfolio Analyt...,"New York, NY 10261 (Murray Hill area)",,https://www.indeed.com/viewjob?jk=0c9ba400a4f8...,machine+learning-2018_11_24-194831_2.txt
3,Mei Messaging,NLP and Machine Learning - Recent Grad,New+York%2C+NY,25 - 35 an hour,https://www.indeed.com/viewjob?jk=d344f0aeb105...,machine+learning-2018_11_24-194831_3.txt
4,MINDPORTS AI,Quantitative Analysis and Machine Learning,New+York%2C+NY,"150,000 - 185,000 a year",https://www.indeed.com/viewjob?jk=439cc4603aab...,machine+learning-2018_11_24-194831_4.txt
5,Google,"Software Engineer, Machine Learning",New+York%2C+NY,,https://www.indeed.com/viewjob?jk=2f0f31925a78...,machine+learning-2018_11_24-194831_5.txt
6,Citi,"MQA – Mortgage Quantitative Analyst, Machine L...",New+York%2C+NY,,https://www.indeed.com/viewjob?jk=43c08c68cf10...,machine+learning-2018_11_24-194831_6.txt
7,Kasisto,Machine Learning Engineer,New+York%2C+NY,"140,000 - 160,000 a year",https://www.indeed.com/viewjob?jk=dcc8c92f934b...,machine+learning-2018_11_24-194831_7.txt
8,Clarifai,Applied Machine Learning Engineer Intern,New+York%2C+NY,,https://www.indeed.com/viewjob?jk=dd8b1c8d9a2e...,machine+learning-2018_11_24-194831_8.txt
9,Google,Research Intern,New+York%2C+NY,,https://www.indeed.com/viewjob?jk=d15cfed2417d...,machine+learning-2018_11_24-194831_9.txt


In [117]:
# Read every saved description file and keep its text lower-cased
descriptions = []
for desc_path in job_info['desc']:
    with open(desc_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    descriptions.append(text.lower())

In [183]:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
def get_top_n_words(corpus, stop_words=None, n=None):
    """
    List the words of a text corpus ordered by how often they occur.

    Args:
        corpus: Iterable of documents (strings).
        stop_words: Words to ignore; passed to ``CountVectorizer``. Any
            non-string collection (e.g. a frozenset) is converted to a list.
        n: Maximum number of (word, count) pairs to return; ``None`` returns
            every word.

    Returns:
        List of ``(word, count)`` tuples sorted by descending count. Words
        with equal counts keep the vectorizer's feature order.

    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) ->
    [('love', 2),
     ('python', 2),
     ('world', 2),
     ('hello', 1),
     ('is', 1),
     ('language', 1),
     ('programming', 1),
     ('the', 1)]
    """
    if stop_words is not None and not isinstance(stop_words, (str, list)):
        # Recent scikit-learn releases validate this parameter and accept
        # only a list (or "english" / None), not a set or frozenset.
        stop_words = list(stop_words)
    vectorizer = CountVectorizer(stop_words=stop_words)
    X = vectorizer.fit_transform(corpus)
    if hasattr(vectorizer, "get_feature_names_out"):
        # get_feature_names() was removed in scikit-learn 1.2; the replacement
        # returns a NumPy array, so convert it back to plain Python strings.
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        # Older scikit-learn releases only provide the original method.
        feature_names = vectorizer.get_feature_names()
    sum_words = X.sum(axis=0).tolist()[0]
    # sorted() is stable, also with reverse=True, so ties keep feature order.
    words_freq = sorted(zip(feature_names, sum_words), key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [207]:
%autoreload 2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from more_stop_words import more_stop_words
custom_stop_words = ENGLISH_STOP_WORDS.union(more_stop_words)
# from sklearn.model_selection import train_test_split

In [218]:
# print(descriptions)
# Count how often each non-stop-word occurs across all descriptions.
# Recent scikit-learn releases only accept a list (or "english" / None) for
# `stop_words`, so the frozenset is converted first.
vectorizer = CountVectorizer(stop_words=list(custom_stop_words))
X = vectorizer.fit_transform(descriptions)
if hasattr(vectorizer, "get_feature_names_out"):
    # get_feature_names() was removed in scikit-learn 1.2; the replacement
    # returns a NumPy array, so convert it back to plain Python strings.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    # Older scikit-learn releases only provide the original method.
    feature_names = vectorizer.get_feature_names()
sum_words = X.sum(axis=0).tolist()[0]
# (word, count) pairs, most frequent first
words_freq = sorted(zip(feature_names, sum_words), key=lambda x: -x[1])
final_map = [word for word, count in words_freq if count > 0]
# The original printed the undefined name `word_freq`, which raised NameError.
print(words_freq)
# vectorizer = CountVectorizer(stop_words=custom_stop_words)
# X = vectorizer.fit_transform(descriptions)
# feature_names = vectorizer.get_feature_names()
# sum_words = X.sum(axis=0).tolist()[0]
# words_freq = zip(feature_names, sum_words)
# words_freq =sorted(words_freq, key = lambda x: -x[1])
# words_freq

[('research', 172), ('deep', 93), ('models', 92), ('python', 83), ('ai', 82), ('ml', 79), ('ll', 77), ('analytics', 69), ('york', 68), ('language', 66), ('java', 62), ('scale', 58), ('intelligence', 54), ('financial', 50), ('natural', 47), ('applied', 42), ('insights', 42), ('real', 42), ('candidate', 41), ('client', 41), ('clients', 41), ('global', 41), ('modeling', 39), ('production', 39), ('quantitative', 38), ('statistics', 38), ('aws', 35), ('tensorflow', 35), ('artificial', 34), ('nlp', 34), ('scientists', 34), ('training', 33), ('candidates', 31), ('key', 31), ('media', 31), ('minimum', 31), ('ny', 31), ('sales', 31), ('amazon', 30), ('investment', 30), ('methods', 30), ('phd', 30), ('relevant', 30), ('include', 29), ('level', 29), ('scientist', 29), ('statistical', 29), ('background', 28), ('mission', 27), ('distributed', 26), ('lead', 26), ('record', 26), ('driven', 25), ('hadoop', 25), ('manage', 25), ('members', 25), ('professional', 25), ('program', 25), ('results', 25), ('