Trying to update the Webscraping Indeed Notebook to Python 3

In [2]:
%load_ext autoreload
%autoreload 2

In [237]:
# API Calls
import requests
# Parse HTML
import bs4
# Handle Dataframes (excel data)
import pandas as pd
# Plotting library
import matplotlib.pyplot as plt

## Fetching and Cleaning Indeed Search Data

In [15]:
from indeed_scraper import search_indeed, clean_data, save_data, posting_scraper
from utils import save_data

In [20]:
# Fetch Page Information for Indeed Search
# TODO: Add url encoding so you don't have to do it manually here.
query = "machine+learning"
cities = ["New+York%2C+NY", "San+Francisco%2C+CA", "Boston%2C+MA", "Greenwich%2C+CT"]
max_results_per_city = 100
null_value = "NA"

df = search_indeed(query, cities, max_results_per_city, null_value)
df = clean_data(df)
filename = save_data(df, query, path='./indeed_searches/')
print(filename)

./indeed_searches/machine+learning-2018_11_25-165725


## Getting Data Per Posting Page (WIP)

In [19]:
# # Getting one posting worth of data
# url = data.loc[:, 'url'].values[0]
# html = requests.get(url).text
# soups = bs4.BeautifulSoup(html, "html.parser")
# # Print out job description as one srting
# main_content = soups.find('div', {'class': "jobsearch-JobComponent icl-u-xs-mt--sm jobsearch-JobComponent-bottomDivider"})
# job_description = soups.find('div', {'class': "jobsearch-JobComponent-description icl-u-xs-mt--md"})
# # job_description.get_text("  ", strip=True).strip()

In [34]:
# # Print out job description as one srting
# main_content = soups.find('div', {'class': "jobsearch-JobComponent icl-u-xs-mt--sm jobsearch-JobComponent-bottomDivider"})
# job_description = soups.find('div', {'class': "jobsearch-JobComponent-description icl-u-xs-mt--md"})
# # job_description.get_text("  ", strip=True).strip()

In [21]:
## Getting Data Per Posting Page (WIP)
data = pd.read_csv(f"{filename}.csv", index_col=0)
desc_dataframe = posting_scraper(data, filename)
data['desc'] = desc_dataframe['desc']

data.to_csv(f'{filename}.csv', sep=',', encoding='utf-8')

## Count Vectorizer on File
Seeing most popular words in job posting.

In [22]:
# filename = "machine+learning-2018_11_24-185749"
job_info = pd.read_csv(f'{filename}.csv', index_col=0)

In [23]:
descriptions = []
for txtfile in job_info['desc']:
    with open(txtfile, 'r', encoding='utf-8') as the_file:
        descriptions.append(the_file.read().lower())

In [24]:
# https://medium.com/@cristhianboujon/how-to-list-the-most-common-words-from-text-corpus-using-scikit-learn-dad4d0cab41d
def get_top_n_words(corpus, stop_words=None):
    """
    List the top n words in a vocabulary according to occurrence in a text corpus.
    
    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) -> 
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    vectorizer = CountVectorizer(stop_words=stop_words)
    X = vectorizer.fit_transform(corpus)
    feature_names = vectorizer.get_feature_names()
    sum_words = X.sum(axis=0).tolist()[0]
    words_freq = zip(feature_names, sum_words)
    words_freq =sorted(words_freq, key = lambda x: -x[1])
    return words_freq

In [299]:
%autoreload 2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from more_stop_words import more_stop_words
custom_stop_words = ENGLISH_STOP_WORDS.union(more_stop_words)

In [300]:
# print(descriptions)
vectorizer = CountVectorizer(stop_words=custom_stop_words)
X = vectorizer.fit_transform(descriptions)

# Summing words along columns to find total amount of occurences per word.
feature_names = vectorizer.get_feature_names()
sum_words = X.sum(axis=0).tolist()[0]
words_freq = zip(feature_names, sum_words)
words_freq =sorted(words_freq, key = lambda x: -x[1])

In [301]:
# final_map = [word for word in words_freq if word[1] > 0]
# print(words_freq[:20])
# final_map[:50]

## TD-IDF Transformer and Clustering

In [302]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X_tf = transformer.fit_transform(X)
# print(X_tf.todense())

In [303]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
# Using 8 buckets, should use differnt amount.
km = KMeans(n_clusters=20, init='k-means++', max_iter=100, n_init=1)
km.fit(X_tf)

# k means determine k
# import time
# # 100-150 -> .70-.58
# distortions = []
# K = range(1, 394)
# for k in K:
#     start = time.time()
#     X_tf_arr = X_tf.toarray()
#     kmeanModel = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
#     kmeanModel.fit(X_tf_arr)
#     distortions.append(sum(np.min(cdist(X_tf_arr, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X_tf_arr.shape[0])
#     end = time.time()
#     print(k, end - start)
# Plot the elbow
# plt.plot(K, distortions, 'bx-')
# plt.xlabel('k')
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.show()


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=20, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [304]:
# # Plot the elbow
# start = 315
# step = 15
# plt.plot(K[start::step], distortions[start::step], 'bx-')
# plt.xlabel('k')
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.show()

In [305]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = feature_names
for i in range(len(order_centroids)):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0: trading investment financial quantitative markets statistical strategies quant firm electronic
Cluster 1: ai processing natural language deep artificial neural python java recognition
Cluster 2: twitter aws relevance recommendation francisco reviews distributed scala ipsy knowledgeable
Cluster 3: clinical computational biology regeneron genetic biological drug genetics datasets disease
Cluster 4: applicant privacy regional makes arrow linked browser confirm submitting agree
Cluster 5: analytics modeling statistical diabetes analytic hockey ascensia scientist consultant analytical
Cluster 6: ibm quantum font analytics chief analytical language eo prof power
Cluster 7: marketing riskmatch analytics honor mapping tableau sql visualization external attribution
Cluster 8: deep hardware accelerator publication temboo frameworks caffe tensorflow keras dolby
Cluster 9: language speech natural processing alexa nlp scientists applied modeling recognition
Cluster 10: students prime pat

In [306]:
# Print out all posting in that cluster
import numpy as np

def cluster_index(values, searchval):
    return np.where(values == searchval)[0]
for i in range(len(order_centroids)):
    print(i, cluster_index(km.labels_, i).shape)

0 (29,)
1 (22,)
2 (18,)
3 (28,)
4 (9,)
5 (45,)
6 (10,)
7 (22,)
8 (15,)
9 (38,)
10 (12,)
11 (6,)
12 (19,)
13 (21,)
14 (5,)
15 (10,)
16 (8,)
17 (14,)
18 (7,)
19 (56,)


In [320]:
pd.set_option('display.max_colwidth', -1)

# filtered_data = data.loc[ cluster_index(km.labels_, 8), ['company', 'job_title', 'url', 'location']]
# filtered_data = filtered_data[filtered_data['company'].str.contains('Adobe')]

# filtered_data.to_csv('most_promising_k_10.csv', sep=',', encoding='utf-8')
for i in range(len(order_centroids)):
    filtered_data = data.loc[ cluster_index(km.labels_, i), ['company', 'job_title', 'url', 'location']]
    filtered_data['company'] = filtered_data['company'].str.strip()
    filtered_data.to_csv(f'cluster_{i}_k_20.csv', sep=',', encoding='utf-8', index=False)

Next Steps are to sort these words into categories (job, technologies, companies)