Trying to update the Webscraping Indeed Notebook to Python 3

In [1]:
%load_ext autoreload
%autoreload 2

In [34]:
# API Calls
import requests
# Parse HTML
import bs4
# Handle Dataframes (excel data)
import pandas as pd
# Plotting library
import matplotlib.pyplot as plt

## Fetching and Cleaning Indeed Search Data

In [35]:
from indeed_scraper import search_indeed, clean_data, save_data, posting_scraper
from utils import save_data

In [57]:
# Fetch Page Information for Indeed Search
# TODO: Add url encoding so you don't have to do it manually here.
query = "machine+learning"
cities = ["New+York%2C+NY", "San+Francisco%2C+CA", "Dallas-Fort+Worth%2C+TX", "Boston%2C+MA", "Greenwich%2C+CT"]
max_results_per_city = 100
null_value = "NA"

df = search_indeed(query, cities, max_results_per_city, null_value)
df = clean_data(df)
filename = save_data(df, query, path='./indeed_searches/')
print(filename)

./indeed_searches/machine+learning-2018_11_26-200547


## Getting Data Per Posting Page (WIP)

In [58]:
# # Getting one posting worth of data
# url = data.loc[:, 'url'].values[0]
# html = requests.get(url).text
# soups = bs4.BeautifulSoup(html, "html.parser")
# # Print out job description as one srting
# main_content = soups.find('div', {'class': "jobsearch-JobComponent icl-u-xs-mt--sm jobsearch-JobComponent-bottomDivider"})
# job_description = soups.find('div', {'class': "jobsearch-JobComponent-description icl-u-xs-mt--md"})
# job_description.get_text("  ", strip=True).strip()

In [59]:
## Getting Data Per Posting Page (WIP)
data = pd.read_csv(f"{filename}.csv", index_col=0)
desc_dataframe = posting_scraper(data, filename, drop_old_postings=True)
data['desc'] = desc_dataframe['desc']
data.dropna(subset = ['desc'], inplace=True)
data.reset_index(drop=True, inplace=True)
data.to_csv(f'{filename}.csv', sep=',', encoding='utf-8')

## Count Vectorizer on File
Seeing most popular words in job posting.

In [60]:
# filename = "machine+learning-2018_11_24-185749"
job_info = pd.read_csv(f'{filename}.csv', index_col=0)

In [61]:
descriptions = []
for txtfile in job_info['desc']:
    with open(txtfile, 'r', encoding='utf-8') as the_file:
        descriptions.append(the_file.read().lower())

In [134]:
%autoreload 2
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from more_stop_words import more_stop_words
custom_stop_words = ENGLISH_STOP_WORDS.union(more_stop_words)

In [135]:
# print(descriptions)
vectorizer = CountVectorizer(stop_words=custom_stop_words)
X = vectorizer.fit_transform(descriptions)

# Summing words along columns to find total amount of occurences per word.
feature_names = vectorizer.get_feature_names()
sum_words = X.sum(axis=0).tolist()[0]
words_freq = zip(feature_names, sum_words)
words_freq =sorted(words_freq, key = lambda x: -x[1])

In [136]:
# final_map = [word for word in words_freq if word[1] > 0]
print(words_freq[:20])
# final_map[:50]

[('analytics', 235), ('python', 221), ('time', 187), ('deep', 161), ('ai', 155), ('language', 154), ('processing', 137), ('ml', 128), ('statistical', 112), ('java', 103), ('big', 102), ('modeling', 101), ('statistics', 101), ('natural', 99), ('advanced', 97), ('artificial', 93), ('sql', 84), ('mining', 80), ('scientists', 79), ('analytical', 75)]


## TD-IDF Transformer and Clustering

In [137]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X_tf = transformer.fit_transform(X)
# print(X_tf.todense())
X_tf.shape[0]//18

13

In [138]:
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
# Using 8 buckets, should use differnt amount.
km = KMeans(n_clusters=15, init='k-means++', max_iter=100, n_init=1)
km.fit(X_tf)

# k means determine k
# import time
# # 100-150 -> .70-.58
# distortions = []
# K = range(1, 394)
# for k in K:
#     start = time.time()
#     X_tf_arr = X_tf.toarray()
#     kmeanModel = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
#     kmeanModel.fit(X_tf_arr)
#     distortions.append(sum(np.min(cdist(X_tf_arr, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X_tf_arr.shape[0])
#     end = time.time()
#     print(k, end - start)
# Plot the elbow
# plt.plot(K, distortions, 'bx-')
# plt.xlabel('k')
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.show()


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=15, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [139]:
# # Plot the elbow
# start = 315
# step = 15
# plt.plot(K[start::step], distortions[start::step], 'bx-')
# plt.xlabel('k')
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.show()

In [140]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = feature_names
for i in range(len(order_centroids)):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0: analytics strategy consultant analytical healthcare ibm dooh sql bi time
Cluster 1: findmine nlp quantum ml pager deep williams sonoma values wear
Cluster 2: scientist statistical language modeling python speech boston recurly quantitative spoken
Cluster 3: automation rpa ai integratz chatbot dolby financial 150 time keywords
Cluster 4: processing language natural youtube artificial ai ca storage classification distributed
Cluster 5: ca scientists publishing papers researchers conferences language natural mining mountain
Cluster 6: sales marketing hand big architectures tpu trifacta ml ca organizations
Cluster 7: ibm conversational unstructured planet iris pluralsight 170 sources ai structured
Cluster 8: aws scala spark contract 60 java python hour apache distributed
Cluster 9: surveillance planet lose video security electronic eligible app capture secret
Cluster 10: signal internship analog processing algorithm intern stores nypa activities administrative
Cluster 11: ml con

In [148]:
# Print out all posting in that cluster
import numpy as np

def cluster_index(values, searchval):
    return np.where(values == searchval)[0]
for i in range(len(order_centroids)):
    print(i, cluster_index(km.labels_, i).shape)
filename

0 (38,)
1 (10,)
2 (20,)
3 (10,)
4 (22,)
5 (18,)
6 (13,)
7 (14,)
8 (14,)
9 (6,)
10 (9,)
11 (22,)
12 (14,)
13 (10,)
14 (17,)


'./indeed_searches/machine+learning-2018_11_26-200547'

In [149]:
pd.set_option('display.max_colwidth', -1)

filtered_data = data.loc[ cluster_index(km.labels_, 0), ['company', 'job_title', 'url', 'location']]
filtered_data = filtered_data[filtered_data['location'].str.contains('New|CT')]
# filtered_data.to_csv('most_promising_k_10.csv', sep=',', encoding='utf-8')
filtered_data

# for i in range(len(order_centroids)):
#     filtered_data = data.loc[ cluster_index(km.labels_, i), ['company', 'job_title', 'url', 'location']]
#     filtered_data['company'] = filtered_data['company'].str.strip()
#     filtered_data.to_csv(f'{filename}/cluster_{i}_k_20.csv', sep=',', encoding='utf-8', index=False)



Unnamed: 0,company,job_title,url,location
12,WeWork,"Manager, Machine Learning",https://www.indeed.com/viewjob?jk=ce624f6bd070ad13,"New York, NY 10013 (Tribeca area)"
25,Amenity Analytics,Graduate Training Program,https://www.indeed.com/viewjob?jk=2f8af56dd21494b5,New+York%2C+NY
29,New York State Office of the Attorney General (OAG...,Medicaid Fraud Control Unit - Research Analyst NYC (Ref# MFC...,https://www.indeed.com/viewjob?jk=f62ca9b2724952d3,New+York%2C+NY
30,ComplyAdvantage,"Customer Success Manager, North America",https://www.indeed.com/viewjob?jk=a30fda40a84fb929,New+York%2C+NY
32,WeWork,"Manager, Machine Learning",https://www.indeed.com/viewjob?jk=ce624f6bd070ad13,New+York%2C+NY
37,JP Morgan Chase,Asset & Wealth Management – Intelligent Digital Solutions –...,https://www.indeed.com/viewjob?jk=eb657621559137e5,New+York%2C+NY
44,Vettery,Talent Executive,https://www.indeed.com/viewjob?jk=589ae69563cca553,New+York%2C+NY
46,JP Morgan Chase,Asset & Wealth Management – Intelligent Digital Solutions –...,https://www.indeed.com/viewjob?jk=1d91f09325df5cce,New+York%2C+NY
50,SiriusXM,"Analyst, Music Metadata",https://www.indeed.com/viewjob?jk=e159229f91095483,New+York%2C+NY
58,Blackwood Seven,Strategist,https://www.indeed.com/viewjob?jk=598ffac9846f78c6,New+York%2C+NY


Next Steps are to sort these words into categories (job, technologies, companies)