#### Install and import libraries

In [0]:
!pip install azure-storage

In [0]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

import numpy as np
import re

import nltk
nltk.download
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import gensim 
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from google.colab import files

#### Upload files to azure cloud platform



In [0]:
from azure.storage.blob import BlockBlobService

account_name='bricen'
account_key='N0ZNhw3i1xpnCxq5luWE2Sn3ZN88eKJn1PYqR3XxXR3nmqIzsD0rGWcL4Ul8AiSYciTqYHwugJPIxqfxWvKB9A=='

service_blob = BlockBlobService(account_name, account_key) 

In [0]:
import glob

files_to_upload = glob.glob('/content/*.csv')
files_to_upload

['/content/applicants.csv', '/content/jobs.csv']

In [0]:
service_blob.create_container('nlpreco')
container_name = 'nlpreco'
for item in files_to_upload:  
  source_file_path = item
  destination_file_path = item.rpartition('/')[-1]
  service_blob.create_blob_from_path(container_name, destination_file_path, source_file_path)

#### Upload files from local disk

In [0]:
uploaded = files.upload()

#### Upload files from azure cloud platform

In [0]:
from azure.storage.blob import BlockBlobService

account_name='bricen'
account_key='N0ZNhw3i1xpnCxq5luWE2Sn3ZN88eKJn1PYqR3XxXR3nmqIzsD0rGWcL4Ul8AiSYciTqYHwugJPIxqfxWvKB9A=='

service_blob = BlockBlobService(account_name, account_key) 

In [0]:
container_name = 'nlpreco'
files_list = []

for blob in service_blob.list_blobs(container_name):
    files_list.append(blob.name)
files_list

In [0]:
for filename in files_list:
  source_file_path = filename
  destination_file_path = '/content/{}'.format(filename)
  
  service_blob.get_blob_to_path(container_name, source_file_path, destination_file_path)

- There are 5 datasets in total

  - The *Combined_Jobs_Final.csv*, *Experience.csv*, *Job_Views.csv*, *Positions_Of_Interest.csv* datasets were taken from this [Kaggle Challenge](https://kaggle.com/kandij/job-recommendation-datasets)
  - The *Jobs2.csv* dataset was taken from [here](https://https://www.kaggle.com/chadalee/text-analytics-explained-job-description-data/notebook)

- These datasets can be separated in two groups:
  - Jobs datasets (*Combined_Jobs_Final.csv*, *Jobs2.csv*)
  - Applicants datasets (*Experience.csv*, *Job_Views.csv*, *Positions_Of_Interest.csv*)

- The following steps will consist of cleaning the datasets and combine them if they are in the same group


#### We are going to use word embedding (Word2Vec) for job recommendation
Here are the preprocessing steps to follow for each dataframe:

- Drop unwanted columns
- Impute missing values
- Transform all text to lower case
- Remove punctuation signs
- Remove stop words

In [0]:
# Preprocessing function

def text_preproc(row):
  stop_words = set(stopwords.words('english'))
  row = re.sub(r'\W+', ' ', row)   # remove non aphanumeric characters
  row = row.replace('\r', '')      # remove \r
  row = row.replace('\n', '')      # remove \n
  row = row.replace('\d+', '')     # remove numbers
  row = re.sub(r'[^\w\s]','', row) # remove punctuations
  row = row.lower()                # put all words in lowcase
  return row

#### Clean the Combined_Jobs_Final dataframe

In [0]:
# csv file containing job offers info

df_jobs = pd.read_csv('Combined_Jobs_Final.csv')
df_jobs.head(2)

In [0]:
# Drop unecessaray columns

df_jobs.drop(columns=['Provider', 
                      'Status', 
                      'Slug', 
                      'State.Code', 
                      'Address', 
                      'Industry', 
                      'Requirements', 
                      'Salary', 
                      'Listing.Start', 
                      'Listing.End', 
                      'Created.At', 
                      'Updated.At'], inplace=True)

In [0]:
# Rename columns

df_jobs.rename(columns={'Job.ID':'Job_ID', 
                        'State.Name':'State', 
                        'Job.Description':'Description',
                        'Employment.Type':'Employment',
                        'Education.Required':'Education'}, inplace=True)

In [0]:
# Change value in Employment column to be consistent

df_jobs["Employment"]= df_jobs.Employment.replace('Temporary/seasonal', 'Seasonal/Temp')

In [0]:
# Impute missing values

df_jobs['Description'] = df_jobs.Description.fillna('')
df_jobs['Employment'] = df_jobs.Employment.fillna('')
df_jobs['Education'] = df_jobs.Education.fillna(df_jobs.Education.mode().iloc[0])
df_jobs['Company'] = df_jobs.Company.fillna('')
df_jobs['City'] = df_jobs.City.fillna('')
df_jobs['State'] = df_jobs.State.fillna('')

In [0]:
# Remove company name from the job title where applicable
df_jobs['Title'] = df_jobs.apply(lambda x: x.Title.replace(x.Company, ''), axis=1)

# Apply the text_preproc function to the columns with text

df_jobs['Title'] = df_jobs.Title.apply(text_preproc)
df_jobs['Position'] = df_jobs.Position.apply(text_preproc)
df_jobs['Company'] = df_jobs.Company.apply(text_preproc)
df_jobs['City'] = df_jobs.City.apply(text_preproc)
df_jobs['State'] = df_jobs.State.apply(text_preproc)
df_jobs['Description'] = df_jobs.Description.apply(text_preproc)
df_jobs['Employment'] = df_jobs.Employment.apply(text_preproc)
df_jobs['Education'] = df_jobs.Education.apply(text_preproc)

#### Clean the Jobs2 dataframe

In [0]:
# csv file containing job offers info

df_jobs2 = pd.read_csv('Jobs2.csv') 
df_jobs2.head(2)

In [0]:
# Drop unecessaray columns

df_jobs2.drop(columns=['LocationRaw',
                       'ContractTime',
                       'Category', 
                       'SalaryRaw', 
                       'SalaryNormalized', 
                       'SourceName'], inplace=True)

In [0]:
# Rename columns

df_jobs2.rename(columns={'Id':'Job_ID', 
                         'State.Name':'State', 
                         'FullDescription':'Description',
                         'LocationNormalized':'City',
                         'ContractType':'Employment'}, inplace=True)

In [0]:
# Create new columns in df_jobs2 dataset to match that in df_jobs

df_jobs2['Education'] = 'Not Specified'
df_jobs2['Latitude'] = ''
df_jobs2['Longitude'] = ''
df_jobs2['State'] = ''

# Change the Employment columns values of the df_jobs2 to match that of the df_jobs dataframe

df_jobs2["Employment"]= df_jobs2.Employment.replace('full_time', 'Full-Time')
df_jobs2["Employment"]= df_jobs2.Employment.replace('part_time', 'Part-Time')

# Impute missing values

df_jobs2_w2v['Employment'] = df_jobs2_w2v.Employment.fillna('')
df_jobs2_w2v['Company'] = df_jobs2_w2v.Company.fillna('')

# Impute a missing value based on the other columns

df_jobs2.Title[1588] = 'Quality Improvement Manager'

In [0]:
# Remove 'Position:' in the Job_Descriptions

df_jobs2['Description'] = df_jobs2.Description.apply(lambda row: row.replace('Position:', ''))

# Create the Position column
df_jobs2['Position'] = df_jobs2.apply(lambda x: x.Title.replace(x.City, ''), axis=1)

# Apply the text_preproc function to the columns with text

df_jobs2['Title'] = df_jobs2.Title.apply(text_preproc)
df_jobs2['Description'] = df_jobs2.Description.apply(text_preproc)
df_jobs2['Position'] = df_jobs2.Position.apply(text_preproc)
df_jobs2['City'] = df_jobs2.City.apply(text_preproc)
df_jobs2['Employment'] = df_jobs2.Employment.apply(text_preproc)
df_jobs2['Company'] = df_jobs2.Company.apply(text_preproc)
df_jobs2['Education'] = df_jobs2.Education.apply(text_preproc)

In [0]:
# Reorganize columns orders to match df_jobs

df_jobs2 = df_jobs2[['Job_ID', 'Title', 'Position', 'Company', 'City', 'State', 'Latitude', 'Longitude', 'Description', 'Employment', 'Education']]

#### Concatenate the two jobs datasets

In [0]:
jobs_dataset = pd.concat([df_jobs_w2v, df_jobs2_w2v], ignore_index=True)
jobs_dataset.drop(columns=['Title'], inplace=True)
jobs_dataset.to_csv("jobs_w2v.csv", index=False)

In [0]:
# download the job dataset for the Word2Vec model on local disk

files.download('jobs_w2v.csv')

In [0]:
# Upload the job dataset for the Word2Vec model to Azure cloud platform

from azure.storage.blob import BlockBlobService

account_name='bricen'
account_key='N0ZNhw3i1xpnCxq5luWE2Sn3ZN88eKJn1PYqR3XxXR3nmqIzsD0rGWcL4Ul8AiSYciTqYHwugJPIxqfxWvKB9A=='
service_blob = BlockBlobService(account_name, account_key)

import glob
files_to_upload = 'jobs_w2v.csv'

container_name = 'nlpreco'
for item in files_to_upload:  
  source_file_path = files_to_upload
  destination_file_path = 'jobs_w2v.csv'
  service_blob.create_blob_from_path(container_name, destination_file_path, source_file_path)
  service_blob.create_blob_from_path(container_name, destination_file_path, source_file_path)

#### Clean the *Experience* dataframe

In [0]:
# csv file containing the professional experience of the Applicants

df_exp = pd.read_csv('Experience.csv')
df_exp.head(2)

In [0]:
# Drop unecessaray columns

df_exp.drop(columns=['State.Code',
                     'Start.Date',
                     'End.Date',
                     'Salary', 
                     'Can.Contact.Employer', 
                     'Created.At', 
                     'Updated.At'], inplace=True)

In [0]:
# Rename columns

df_exp.rename(columns={'Applicant.ID':'Applicant_ID', 
                       'Employer.Name':'Company', 
                       'Position.Name':'Position', 
                       'Job.Description':'Description', 
                       'State.Name':'State'}, inplace=True)

In [0]:
# Impute missing values

df_exp['Position'] = df_exp.Position.fillna('')
df_exp['Company'] = df_exp.Company.fillna('')
df_exp['City'] = df_exp.City.fillna('')
df_exp['State'] = df_exp.State.fillna('')
df_exp['Description'] = df_exp.Description.fillna('')

In [0]:
# Apply the text_preproc function to the columns with text

df_exp['Position'] = df_exp.Position.apply(text_preproc)
df_exp['Company'] = df_exp.Company.apply(text_preproc)
df_exp['City'] = df_exp.City.apply(text_preproc)
df_exp['State'] = df_exp.State.apply(text_preproc)
df_exp['Description'] = df_exp.Description.apply(text_preproc)

In [0]:
# Add a suffix to the column names of the experience dataset

col_names = ['Position', 'Company', 'City', 'State', 'Description']
col_new_names = [name+'_before' for name in col_names]
   
for i in range(len(col_names)):
  df_exp_w2v.rename(columns={col_names[i]:col_new_names[i]}, inplace=True)

#### Clean the *Job_views* dataframe

In [0]:
# csv file containing info about the job viewed by the applicants

df_views = pd.read_csv('Job_Views.csv')
df_views.head(2)

In [0]:
# Drop unecessaray columns

df_views.drop(columns=['Title',
                       'Job.ID',
                       'State.Code',
                       'Industry',
                       'View.Start',
                       'View.End', 
                       'Created.At', 
                       'Updated.At'], inplace=True)

In [0]:
# Rename columns

df_views.rename(columns={'Applicant.ID':'Applicant_ID',
                         'State.Name':'State',
                         'View.Duration':'View_Duration'}, inplace=True)

In [0]:
# Impute missing values

df_views['Company'] = df_views.Company.fillna('')
df_views['State'] = df_views.State.fillna('')
df_views['View_Duration'] = df_views.View_Duration.fillna(df_views.View_Duration.median())

In [0]:
# Remove Company name from Position
df_views['Position'] = df_views.apply(lambda x: x.Position.replace(x.Company, ''), axis=1)

# Apply the text_preproc function to the columns with text

df_views['Position'] = df_views.Position.apply(text_preproc)
df_views['Company'] = df_views.Company.apply(text_preproc)
df_views['City'] = df_views.City.apply(text_preproc)
df_views['State'] = df_views.State.apply(text_preproc)

In [0]:
# Add a suffix to the column names of the df_views dataset

col_names = ['Position', 'Company', 'City', 'State']
col_new_names = [name+'_viewed' for name in col_names]
   
for i in range(len(col_names)):
  df_views_w2v.rename(columns={col_names[i]:col_new_names[i]}, inplace=True)

#### Cleaning the *Positions_Of_Interest* dataframe

In [0]:
# csv file containing the jobs positions of interest for the applicants

df_poi = pd.read_csv('Positions_Of_Interest.csv')
df_poi.head(2)

In [0]:
# Rename columns

df_poi.rename(columns={'Applicant.ID':'Applicant_ID',
                       'Position.Of.Interest':'POI'}, inplace=True)

# Drop columns

df_poi.drop(columns=['Created.At', 'Updated.At'], inplace=True)
df_poi['POI'] = df_poi.POI.fillna('')

In [0]:
# Apply the text_preproc function to the columns with text

df_poi['POI'] = df_poi.POI.apply(text_preproc)

#### Join the three applicants datasets (df_poi, df_exp, and df_views) by Applicant_ID column

In [0]:
df_applicants = pd.merge(pd.merge(df_exp, df_views, on='Applicant_ID'), df_poi, on='Applicant_ID')
df_applicants.head()

In [0]:
df_applicants.to_csv("applicants_w2v.csv", index=False)

In [0]:
# download the apllicant dataset for the Word2Vec model to store on local disk

files.download('df_applicants_w2v.csv')

In [0]:
# Upload the applicant dataset for the Word2Vec model to Azure cloud platform

from azure.storage.blob import BlockBlobService

account_name='bricen'
account_key='N0ZNhw3i1xpnCxq5luWE2Sn3ZN88eKJn1PYqR3XxXR3nmqIzsD0rGWcL4Ul8AiSYciTqYHwugJPIxqfxWvKB9A=='
service_blob = BlockBlobService(account_name, account_key)

import glob
files_to_upload = 'applicants_w2v.csv'
files_to_upload

container_name = 'nlpreco'
for item in files_to_upload:  
  source_file_path = files_to_upload
  destination_file_path = 'applicants_w2v.csv'
  service_blob.create_blob_from_path(container_name, destination_file_path, source_file_path)
  service_blob.create_blob_from_path(container_name, destination_file_path, source_file_path)

#### Upload the jobs and applicants datasets

In [4]:
# upload the files from local disk

uploaded = files.upload()

Saving applicants_w2v.csv to applicants_w2v.csv
Saving d2v.model to d2v.model
Saving jobs_w2v.csv to jobs_w2v.csv


In [0]:
# Upload files from azure cloud platform

from azure.storage.blob import BlockBlobService

account_name='bricen'
account_key='N0ZNhw3i1xpnCxq5luWE2Sn3ZN88eKJn1PYqR3XxXR3nmqIzsD0rGWcL4Ul8AiSYciTqYHwugJPIxqfxWvKB9A=='

service_blob = BlockBlobService(account_name, account_key) 

In [0]:
container_name = 'nlpreco'
files_list = ['jobs_w2v.csv', 'applicants_w2v.csv']

#for blob in service_blob.list_blobs(container_name):
#    files_list.append(blob.name)
#files_list

In [0]:
for filename in files_list:
  source_file_path = filename
  destination_file_path = '/content/{}'.format(filename)
  
  service_blob.get_blob_to_path(container_name, source_file_path, destination_file_path)

#### Make job recommandation model using Word2Vec
- We need a combined text column for both final datasets
- We will try different combinations to see how it affects the recommendations

In [0]:
jobs_w2v = pd.read_csv('jobs_w2v.csv', low_memory=False)
jobs_w2v.fillna('[]', inplace=True)
jobs_w2v.head(2)

In [0]:
# Create a new column to contain the combined jobs information

jobs_w2v['Combined_info'] = jobs_w2v['Position'] + jobs_w2v['Description']
jobs_w2v.head(2)

In [0]:
jobs_w2v.tail(2)

In [0]:
# Transform the created column to list, tokenize and create tags for each job

job_documents = jobs_info_w2v.Combined_info.to_list()
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(job_documents)]

In [0]:
# Train the Doc2Vec model on the jobs_w2v corpus for 10 epochs
# This is to determine word vectors for each word in the jobs corpus

max_epochs = 20
vec_size = 20
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha


iteration 0




iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19


In [0]:
# Saving the trained model

model.save("d2v.model")
print("Model Saved")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Model Saved


In [0]:
# Uploading the trained  model

uploaded = files.upload()

Saving d2v.model to d2v.model


In [7]:
# Loading the trained model

model= Doc2Vec.load("d2v.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
df_applicants_w2v = pd.read_csv('applicants_w2v.csv')
df_applicants_w2v.fillna('[]', inplace=True)
df_applicants_w2v.head(2)

In [0]:
# Create a new column to contain the applicant info that will be used to make the recommendations

# 1. Recommendation based on previous experience(s) only
# df_applicants_w2v['Combined_info'] = df_applicants_w2v['Position_before'] +' '+ df_applicants_w2v['Description_before']    

# 2. Recommendation based on job views only
# df_applicants_w2v['Combined_info'] = df_applicants_w2v['Position_viewed']

# 3. Recommendation based on position of interest only
# df_applicants_w2v['Combined_info'] = df_applicants_w2v['POI']   

# 4. Recommendation based on the combinations of previous information
df_applicants_w2v['Combined_info'] = df_applicants_w2v['Position_before'] +' '+\
                                     df_applicants_w2v['Description_before'] +' '+\
                                     df_applicants_w2v['POI'] +' '+\
                                     df_applicants_w2v['Position_viewed']

# 5. Recommendation based on the combinations of previous information
#    - A filter is used to include only job that were viewed for certain time
# df_applicants_w2v['Combined_info'] = df_applicants_w2v['Position_before'] +' '+\
#                                      df_applicants_w2v['Description_before'] +' '+\
#                                      df_applicants_w2v['POI'] +' '+\
#                                     df_applicants_w2v['Position_viewed'].where(df_applicants_w2v.View_Duration >= 30, '')

df_applicants_w2v.head(2)

In [0]:
df_applicants_w2v.head(2)

In [47]:
# Print the selected user info

user_profile = df_applicants_w2v.iloc[:,[0,1,5,6,-2,-1]][df_applicants_w2v.index == 1967]
user_profile

Unnamed: 0,Applicant_ID,Position_before,Description_before,Position_viewed,POI,Combined_info
1967,11883,ride operator,operating rides being friendly to guest being a team player,seasonal retail sales part time northridge ca northridge fashion center,barista,ride operator operating rides being friendly to guest being a team player barista seasonal retail sales part time northridge ca northridge fashion center


In [48]:
# Get and tokenize the applicant information

user = df_applicants_w2v.iloc[:,:][df_applicants_w2v.index == 1967]
user_data = user.iloc[0]['Combined_info']
user_data = word_tokenize(user_data)
user_data

['ride',
 'operator',
 'operating',
 'rides',
 'being',
 'friendly',
 'to',
 'guest',
 'being',
 'a',
 'team',
 'player',
 'barista',
 'seasonal',
 'retail',
 'sales',
 'part',
 'time',
 'northridge',
 'ca',
 'northridge',
 'fashion',
 'center']

In [0]:
# Get the applicant data vectors from the trained model

user_vector = model.infer_vector(user_data)

In [50]:
# Find the most similar documents between the applicant informationa and the jobs informations

doc2vec_similarity = model.docvecs.most_similar([user_vector])
doc2vec_similarity

  if np.issubdtype(vec.dtype, np.int):


[('95685', 0.6694272756576538),
 ('168660', 0.6593121290206909),
 ('186212', 0.6585068106651306),
 ('6067', 0.6450024843215942),
 ('168667', 0.6444485783576965),
 ('57808', 0.6400918960571289),
 ('51153', 0.6374245882034302),
 ('277268', 0.6358883380889893),
 ('277049', 0.6336578130722046),
 ('2099', 0.632948637008667)]

In [51]:
doc2vec_best10_indices = [item[0] for item in doc2vec_similarity]
doc2vec_similarity_scores = [item[1] for item in doc2vec_similarity]
doc2vec_best10_indices = [int(item) for item in doc2vec_best10_indices]
doc2vec_best10_indices

[95685, 168660, 186212, 6067, 168667, 57808, 51153, 277268, 277049, 2099]

In [52]:
# Create a datframe with the best matches

doc2vec_best10_jobs = jobs_w2v.iloc[doc2vec_best10_indices,[5, 8]]
doc2vec_best10_jobs['doc2vec_similarity_scores'] = doc2vec_similarity_scores
doc2vec_best10_jobs

Unnamed: 0,Job_ID,Position,doc2vec_similarity_scores
95685,66538253,bar team member,0.669427
168660,69077266,assistant manager,0.659312
186212,69551174,canteen assistant,0.658507
6067,145413,seasonal associate production and sales workers needed,0.645002
168667,69077395,assistant manager,0.644449
57808,287346,starbucks barista 0 5fte variable shifts starbucks issaquah,0.640092
51153,280696,opening restaurant crewmember cashier cook kitchen customer service drive thru,0.637425
277268,71764782,night porter,0.635888
277049,71761934,hotel housekeeper,0.633658
2099,140426,seasonal associate production and sales workers needed,0.632949
