# Job recommendation engine

* Recommend similar jobs based on the jobs title, description
* Recommend jobs based on similar user profiles

## Import dependencies

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import ast 
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import warnings; warnings.simplefilter('ignore')

## Load dataset

In [2]:
!dir ./input_data/*.tsv

Invalid switch - "input_data".


In [3]:
apps = pd.read_csv('./input_data/apps.tsv', delimiter='\t',encoding='utf-8')
user_history = pd.read_csv('./input_data/user_history.tsv', delimiter='\t',encoding='utf-8')
jobs = pd.read_csv('./input_data/jobs.tsv', delimiter='\t',encoding='utf-8', error_bad_lines=False)
users = pd.read_csv('./input_data/users.tsv' ,delimiter='\t',encoding='utf-8')
test_users = pd.read_csv('./input_data/test_users.tsv', delimiter='\t',encoding='utf-8')

b'Skipping line 122433: expected 11 fields, saw 12\n'
b'Skipping line 602576: expected 11 fields, saw 12\n'
b'Skipping line 990950: expected 11 fields, saw 12\n'


## Filtering Window 1 data for all datasets

In [4]:
apps = apps[apps['WindowID']==1]
apps.head()

Unnamed: 0,UserID,WindowID,Split,ApplicationDate,JobID
0,47,1,Train,2012-04-04 15:56:23.537,169528
1,47,1,Train,2012-04-06 01:03:00.003,284009
2,47,1,Train,2012-04-05 02:40:27.753,2121
3,47,1,Train,2012-04-05 02:37:02.673,848187
4,47,1,Train,2012-04-05 22:44:06.653,733748


In [5]:
apps.columns

Index(['UserID', 'WindowID', 'Split', 'ApplicationDate', 'JobID'], dtype='object')

In [6]:
apps.shape

(353582, 5)

In [7]:
# Information on apps file
apps.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 353582 entries, 0 to 353581
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   UserID           353582 non-null  int64 
 1   WindowID         353582 non-null  int64 
 2   Split            353582 non-null  object
 3   ApplicationDate  353582 non-null  object
 4   JobID            353582 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 16.2+ MB


In [8]:
user_history = user_history[user_history['WindowID']==1]
user_history.head()

Unnamed: 0,UserID,WindowID,Split,Sequence,JobTitle
0,47,1,Train,1,National Space Communication Programs-Special ...
1,47,1,Train,2,Detention Officer
2,47,1,Train,3,"Passenger Screener, TSA"
3,72,1,Train,1,"Lecturer, Department of Anthropology"
4,72,1,Train,2,Student Assistant


In [9]:
# replace all the NaN with space
user_history['JobTitle'] = user_history['JobTitle'].fillna('')

In [10]:
# users[users['UserID']==1472089]

In [11]:
user_history.columns

Index(['UserID', 'WindowID', 'Split', 'Sequence', 'JobTitle'], dtype='object')

In [12]:
user_history.shape

(348269, 5)

In [13]:
# user_history information
user_history.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 348269 entries, 0 to 348268
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   UserID    348269 non-null  int64 
 1   WindowID  348269 non-null  int64 
 2   Split     348269 non-null  object
 3   Sequence  348269 non-null  int64 
 4   JobTitle  348269 non-null  object
dtypes: int64(3), object(2)
memory usage: 15.9+ MB


In [14]:
jobs = jobs[jobs['WindowID']==1]
jobs.head()

Unnamed: 0,JobID,WindowID,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate
0,1,1,Security Engineer/Technical Lead,<p>Security Clearance Required:&nbsp; Top Secr...,<p>SKILL SET</p>\r<p>&nbsp;</p>\r<p>Network Se...,Washington,DC,US,20531.0,2012-03-07 13:17:01.643,2012-04-06 23:59:59
1,4,1,SAP Business Analyst / WM,<strong>NO Corp. to Corp resumes&nbsp;are bein...,<p><b>WHAT YOU NEED: </b></p>\r<p>Four year co...,Charlotte,NC,US,28217.0,2012-03-21 02:03:44.137,2012-04-20 23:59:59
2,7,1,P/T HUMAN RESOURCES ASSISTANT,<b> <b> P/T HUMAN RESOURCES ASSISTANT</b> <...,Please refer to the Job Description to view th...,Winter Park,FL,US,32792.0,2012-03-02 16:36:55.447,2012-04-01 23:59:59
3,8,1,Route Delivery Drivers,CITY BEVERAGES Come to work for the best in th...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:10.077,2012-04-02 23:59:59
4,9,1,Housekeeping,I make sure every part of their day is magica...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:11.88,2012-04-02 23:59:59


In [15]:
jobs.columns

Index(['JobID', 'WindowID', 'Title', 'Description', 'Requirements', 'City',
       'State', 'Country', 'Zip5', 'StartDate', 'EndDate'],
      dtype='object')

In [16]:
jobs.shape

(285091, 11)

In [17]:
# jobs information
jobs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 285091 entries, 0 to 285090
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   JobID         285091 non-null  int64 
 1   WindowID      285091 non-null  int64 
 2   Title         285091 non-null  object
 3   Description   285090 non-null  object
 4   Requirements  261659 non-null  object
 5   City          285091 non-null  object
 6   State         285091 non-null  object
 7   Country       285088 non-null  object
 8   Zip5          182469 non-null  object
 9   StartDate     285091 non-null  object
 10  EndDate       285087 non-null  object
dtypes: int64(2), object(9)
memory usage: 26.1+ MB


In [18]:
users = users[users['WindowID']==1]
users.head()

Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,47,1,Train,Paramount,CA,US,90723,High School,,1999-06-01 00:00:00,3,10.0,Yes,No,0
1,72,1,Train,La Mesa,CA,US,91941,Master's,Anthropology,2011-01-01 00:00:00,10,8.0,Yes,No,0
2,80,1,Train,Williamstown,NJ,US,8094,High School,Not Applicable,1985-06-01 00:00:00,5,11.0,Yes,Yes,5
3,98,1,Train,Astoria,NY,US,11105,Master's,Journalism,2007-05-01 00:00:00,3,3.0,Yes,No,0
4,123,1,Train,Baton Rouge,LA,US,70808,Bachelor's,Agricultural Business,2011-05-01 00:00:00,1,9.0,Yes,No,0


In [19]:
users.columns

Index(['UserID', 'WindowID', 'Split', 'City', 'State', 'Country', 'ZipCode',
       'DegreeType', 'Major', 'GraduationDate', 'WorkHistoryCount',
       'TotalYearsExperience', 'CurrentlyEmployed', 'ManagedOthers',
       'ManagedHowMany'],
      dtype='object')

In [20]:
users.shape

(77060, 15)

In [21]:
# users information
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77060 entries, 0 to 77059
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   UserID                77060 non-null  int64  
 1   WindowID              77060 non-null  int64  
 2   Split                 77060 non-null  object 
 3   City                  77060 non-null  object 
 4   State                 76952 non-null  object 
 5   Country               77060 non-null  object 
 6   ZipCode               76704 non-null  object 
 7   DegreeType            77060 non-null  object 
 8   Major                 58219 non-null  object 
 9   GraduationDate        53852 non-null  object 
 10  WorkHistoryCount      77060 non-null  int64  
 11  TotalYearsExperience  74212 non-null  float64
 12  CurrentlyEmployed     67033 non-null  object 
 13  ManagedOthers         77060 non-null  object 
 14  ManagedHowMany        77060 non-null  int64  
dtypes: float64(1), int6

In [22]:
test_users = test_users[test_users['WindowID']==1]
test_users.head()

Unnamed: 0,UserID,WindowID
0,767,1
1,769,1
2,861,1
3,1006,1
4,1192,1


In [23]:
test_users.columns

Index(['UserID', 'WindowID'], dtype='object')

In [24]:
test_users.shape

(5419, 2)

In [25]:
# test_users information
test_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5419 entries, 0 to 5418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   UserID    5419 non-null   int64
 1   WindowID  5419 non-null   int64
dtypes: int64(2)
memory usage: 127.0 KB


## Exploratory Data Analysis (EDA) and Pre-processing

### Split training and testing data based on column `split`

* Here, there are three datafiles/dataframes are having attribute split. 
    * apps
    * user_history
    * users
* This data attribute indicates that whether the data record can be used for training or testing so we need to filter out based on that. 
* We are generating training and testing dataframes 


In [26]:
# spliting apps data
apps_training = apps.loc[apps['Split'] == 'Train']

In [27]:
apps_training.shape

(303833, 5)

In [28]:
apps_training.tail()

Unnamed: 0,UserID,WindowID,Split,ApplicationDate,JobID
353577,1471976,1,Train,2012-04-02 13:53:18.88,702563
353578,1471976,1,Train,2012-04-02 13:52:16.327,1020868
353579,1471976,1,Train,2012-04-02 13:00:52.527,891097
353580,1471983,1,Train,2012-04-09 21:41:05.663,553373
353581,1471983,1,Train,2012-04-09 21:56:46.787,553371


In [29]:
apps_testing = apps.loc[apps['Split'] == 'Test']

In [30]:
apps_testing.shape

(49749, 5)

In [31]:
apps_testing.head()

Unnamed: 0,UserID,WindowID,Split,ApplicationDate,JobID
126,767,1,Test,2012-04-01 14:37:20.023,85377
127,769,1,Test,2012-04-16 22:36:52.48,853328
128,769,1,Test,2012-04-09 18:59:28.193,86106
129,769,1,Test,2012-04-09 18:59:31.127,327571
130,769,1,Test,2012-04-08 21:29:11.993,119161


In [32]:
# spliting user_history data
user_history_training = user_history.loc[user_history['Split'] =='Train']

In [33]:
user_history_training = user_history.loc[user_history['Split'] =='Train']
user_history_testing = user_history.loc[user_history['Split'] =='Test']
apps_training = apps.loc[apps['Split'] == 'Train']
apps_testing = apps.loc[apps['Split'] == 'Test']
users_training = users.loc[users['Split']=='Train']
users_testing = users.loc[users['Split']=='Test']

In [34]:
user_history_training.shape

(323851, 5)

In [35]:
user_history_training.head()

Unnamed: 0,UserID,WindowID,Split,Sequence,JobTitle
0,47,1,Train,1,National Space Communication Programs-Special ...
1,47,1,Train,2,Detention Officer
2,47,1,Train,3,"Passenger Screener, TSA"
3,72,1,Train,1,"Lecturer, Department of Anthropology"
4,72,1,Train,2,Student Assistant


In [36]:
user_history_testing = user_history.loc[user_history['Split'] =='Test']

In [37]:
user_history_testing.shape

(24418, 5)

In [38]:
user_history_testing.head()

Unnamed: 0,UserID,WindowID,Split,Sequence,JobTitle
144,767,1,Test,1,Claims Adjuster
145,767,1,Test,2,Professional Baseball Player
146,767,1,Test,3,Professional Baseball Player
147,767,1,Test,4,Professional Baseball Player
148,767,1,Test,5,Professional Baseball Player


In [39]:
# spliting user data
users_training = users.loc[users['Split']=='Train']

In [40]:
users_training.shape

(71641, 15)

In [41]:
users_training.head()

Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,47,1,Train,Paramount,CA,US,90723,High School,,1999-06-01 00:00:00,3,10.0,Yes,No,0
1,72,1,Train,La Mesa,CA,US,91941,Master's,Anthropology,2011-01-01 00:00:00,10,8.0,Yes,No,0
2,80,1,Train,Williamstown,NJ,US,8094,High School,Not Applicable,1985-06-01 00:00:00,5,11.0,Yes,Yes,5
3,98,1,Train,Astoria,NY,US,11105,Master's,Journalism,2007-05-01 00:00:00,3,3.0,Yes,No,0
4,123,1,Train,Baton Rouge,LA,US,70808,Bachelor's,Agricultural Business,2011-05-01 00:00:00,1,9.0,Yes,No,0


In [42]:
users_testing = users.loc[users['Split']=='Test']

In [43]:
users_testing.shape

(5419, 15)

In [44]:
users_testing.head()

Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
31,767,1,Test,Murrieta,CA,US,92562,Bachelor's,University Studies/Business,2008-05-01 00:00:00,5,16.0,No,No,0
32,769,1,Test,Roselle,IL,US,60172,Bachelor's,Radio-Television,2011-05-01 00:00:00,5,5.0,Yes,No,0
33,861,1,Test,Morris,IL,US,60450,High School,General Studies,1989-05-01 00:00:00,7,21.0,,No,0
38,1006,1,Test,West Chester,PA,US,19382,High School,Not Applicable,2008-06-01 00:00:00,3,6.0,Yes,No,0
44,1192,1,Test,Cincinnati,OH,US,45255,Bachelor's,Marketing,,5,6.0,Yes,No,0


### List down all training data records 

In [45]:
apps_training.head()

Unnamed: 0,UserID,WindowID,Split,ApplicationDate,JobID
0,47,1,Train,2012-04-04 15:56:23.537,169528
1,47,1,Train,2012-04-06 01:03:00.003,284009
2,47,1,Train,2012-04-05 02:40:27.753,2121
3,47,1,Train,2012-04-05 02:37:02.673,848187
4,47,1,Train,2012-04-05 22:44:06.653,733748


In [46]:
user_history_training.head()

Unnamed: 0,UserID,WindowID,Split,Sequence,JobTitle
0,47,1,Train,1,National Space Communication Programs-Special ...
1,47,1,Train,2,Detention Officer
2,47,1,Train,3,"Passenger Screener, TSA"
3,72,1,Train,1,"Lecturer, Department of Anthropology"
4,72,1,Train,2,Student Assistant


In [47]:
users_training.head(5).transpose()

Unnamed: 0,0,1,2,3,4
UserID,47,72,80,98,123
WindowID,1,1,1,1,1
Split,Train,Train,Train,Train,Train
City,Paramount,La Mesa,Williamstown,Astoria,Baton Rouge
State,CA,CA,NJ,NY,LA
Country,US,US,US,US,US
ZipCode,90723,91941,08094,11105,70808
DegreeType,High School,Master's,High School,Master's,Bachelor's
Major,,Anthropology,Not Applicable,Journalism,Agricultural Business
GraduationDate,1999-06-01 00:00:00,2011-01-01 00:00:00,1985-06-01 00:00:00,2007-05-01 00:00:00,2011-05-01 00:00:00


In [48]:
jobs.head()

Unnamed: 0,JobID,WindowID,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate
0,1,1,Security Engineer/Technical Lead,<p>Security Clearance Required:&nbsp; Top Secr...,<p>SKILL SET</p>\r<p>&nbsp;</p>\r<p>Network Se...,Washington,DC,US,20531.0,2012-03-07 13:17:01.643,2012-04-06 23:59:59
1,4,1,SAP Business Analyst / WM,<strong>NO Corp. to Corp resumes&nbsp;are bein...,<p><b>WHAT YOU NEED: </b></p>\r<p>Four year co...,Charlotte,NC,US,28217.0,2012-03-21 02:03:44.137,2012-04-20 23:59:59
2,7,1,P/T HUMAN RESOURCES ASSISTANT,<b> <b> P/T HUMAN RESOURCES ASSISTANT</b> <...,Please refer to the Job Description to view th...,Winter Park,FL,US,32792.0,2012-03-02 16:36:55.447,2012-04-01 23:59:59
3,8,1,Route Delivery Drivers,CITY BEVERAGES Come to work for the best in th...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:10.077,2012-04-02 23:59:59
4,9,1,Housekeeping,I make sure every part of their day is magica...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-03-03 09:01:11.88,2012-04-02 23:59:59


In [49]:
users.shape

(77060, 15)

## revised approach


###  Let's find out Similar jobs

In [50]:
jobs_base_line = jobs
jobs_base_line.columns

Index(['JobID', 'WindowID', 'Title', 'Description', 'Requirements', 'City',
       'State', 'Country', 'Zip5', 'StartDate', 'EndDate'],
      dtype='object')

In [51]:
# jobs_US.head().transpose()

In [52]:
jobs_base_line = jobs_base_line.iloc[0:10000,0:8]

In [53]:
# jobs_base_line.head()

In [54]:
jobs_base_line['Title'] = jobs_base_line['Title'].fillna('')
jobs_base_line['Description'] = jobs_base_line['Description'].fillna('')
#jobs_base_line['Requirements'] = jobs_base_line['Requirements'].fillna('')

jobs_base_line['Description'] = jobs_base_line['Title'] + jobs_base_line['Description']

In [55]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(jobs_base_line['Description'])

In [56]:
tfidf_matrix.shape

(10000, 515585)

In [57]:
# http://scikit-learn.org/stable/modules/metrics.html#linear-kernel
# cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim = cosine_similarity(tfidf_matrix)

In [58]:
cosine_sim[0]

array([1.        , 0.04301522, 0.00643905, ..., 0.03802139, 0.03802139,
       0.03802139])

In [59]:
jobs_base_line = jobs_base_line.reset_index()
titles = jobs_base_line['Title']
indices = pd.Series(jobs_base_line.index, index=jobs_base_line['Title'])
#indices.head(2)


In [60]:
def get_recommendations(title):
    idx = indices[title]
    #print (idx)
    sim_scores = list(enumerate(cosine_sim[idx]))
    #print (sim_scores)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    job_indices = [i[0] for i in sim_scores]
    return titles.iloc[job_indices]

In [61]:
get_recommendations('SAP Business Analyst / WM').head(10)

1                           SAP Business Analyst / WM
6054                    SAP FI/CO Business Consultant
5871                       SAP FI/CO Business Analyst
5162                          SAP Basis Administrator
5354    SAP Sales and Distribution Solution Architect
4799       Senior Specialist - SAP Configuration - SD
5120                       SAP Integration Specialist
5412            Senior Business Systems Analyst - SAP
5247                                 Business Analyst
4731           SAP ABAP Developer with PRA experience
Name: Title, dtype: object

In [62]:
get_recommendations('Security Engineer/Technical Lead').head(10)

0              Security Engineer/Technical Lead
5909                   Senior Security Engineer
3774                     Director of Admissions
6296    3 Network Architects needed - immediate
3560                          Assistant Manager
401          National Sales & Marketing Manager
2608               Inventory Analyst/ Scheduler
3760                        CLINICAL PHARMACIST
3481           Customer Service Representatives
3561                              Store Manager
Name: Title, dtype: object

In [63]:
get_recommendations('Immediate Opening').head(10)

13                           Immediate Opening
216                      Accounting/Bookkeeper
2874    Cable TV/Internet/Telephone Installers
8426                                    no job
4031                              Electricians
4032                              Electricians
4033                              Electricians
620                                     DENTAL
93                 A/C HEATING REFRIG MECHANIC
125                                   Optician
Name: Title, dtype: object

In [64]:
get_recommendations('EXPERIENCED ROOFERS').head(10)

26                                    EXPERIENCED ROOFERS
7952    Commercial Roofers  EXPERIENCED in  Hot Asphal...
51                                                 Driver
8015                                       OFFICE MANAGER
53                                                DRIVERS
33                         CNA OPENINGS AT TUSKAWILLA SNF
44                                   SALES REPRESENTATIVE
30                               Automotive Retail Dealer
60                                     Associate Attorney
59                                        SECURITY GUARDS
Name: Title, dtype: object

## Best approach

#### Find out similar users -- Find out for which jobs they have applied -- suggest those job to the other users who shared similar user profile.

We are finding put similar user profile based on their degree type, majors and total years of experience. 
* We will get to 10 similar users.
* We will find our which are the jobs for which these users have applied
* We take an union of these jobs and recommend the jobs all these user base

In [65]:
users_training.head()

Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,47,1,Train,Paramount,CA,US,90723,High School,,1999-06-01 00:00:00,3,10.0,Yes,No,0
1,72,1,Train,La Mesa,CA,US,91941,Master's,Anthropology,2011-01-01 00:00:00,10,8.0,Yes,No,0
2,80,1,Train,Williamstown,NJ,US,8094,High School,Not Applicable,1985-06-01 00:00:00,5,11.0,Yes,Yes,5
3,98,1,Train,Astoria,NY,US,11105,Master's,Journalism,2007-05-01 00:00:00,3,3.0,Yes,No,0
4,123,1,Train,Baton Rouge,LA,US,70808,Bachelor's,Agricultural Business,2011-05-01 00:00:00,1,9.0,Yes,No,0


In [66]:
user_based_approach = users_training

In [67]:
# for each in user_based_approach.index:
#     userid = user_based_approach.iloc[each].UserID
#     all_work = ''.join(list(user_history[user_history['UserID'] == userid]['JobTitle']))
#     user_based_approach.iloc[each]['WorkHistory'].replace('',all_work)
#     print(all_work)
#     break

In [68]:
# function for adding user_history data to the user data
def create_work_history(userid):
    return ''.join(list(user_history_training[user_history_training['UserID'] == userid]['JobTitle']))

In [69]:
# add user work history data
user_based_approach['WorkHistory'] = user_based_approach['UserID'].apply(lambda x: create_work_history(x))

In [70]:
user_based_approach.shape

(71641, 16)

In [71]:
user_based_approach = user_based_approach.iloc[0:20000,:]

In [72]:
user_based_approach.head()

Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany,WorkHistory
0,47,1,Train,Paramount,CA,US,90723,High School,,1999-06-01 00:00:00,3,10.0,Yes,No,0,National Space Communication Programs-Special ...
1,72,1,Train,La Mesa,CA,US,91941,Master's,Anthropology,2011-01-01 00:00:00,10,8.0,Yes,No,0,"Lecturer, Department of AnthropologyStudent As..."
2,80,1,Train,Williamstown,NJ,US,8094,High School,Not Applicable,1985-06-01 00:00:00,5,11.0,Yes,Yes,5,"Auto Publishing/Electro Mechanical Technician,..."
3,98,1,Train,Astoria,NY,US,11105,Master's,Journalism,2007-05-01 00:00:00,3,3.0,Yes,No,0,Editor-in-ChiefDeputy Sports & Website EditorA...
4,123,1,Train,Baton Rouge,LA,US,70808,Bachelor's,Agricultural Business,2011-05-01 00:00:00,1,9.0,Yes,No,0,Lead Hostess and Takeout Server


In [73]:
user_based_approach = user_based_approach.reset_index()
userid = user_based_approach['UserID']
indices = pd.Series(user_based_approach.index, index=user_based_approach['UserID'])
#indices.head(2)

In [74]:
user_based_approach.head()

Unnamed: 0,index,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany,WorkHistory
0,0,47,1,Train,Paramount,CA,US,90723,High School,,1999-06-01 00:00:00,3,10.0,Yes,No,0,National Space Communication Programs-Special ...
1,1,72,1,Train,La Mesa,CA,US,91941,Master's,Anthropology,2011-01-01 00:00:00,10,8.0,Yes,No,0,"Lecturer, Department of AnthropologyStudent As..."
2,2,80,1,Train,Williamstown,NJ,US,8094,High School,Not Applicable,1985-06-01 00:00:00,5,11.0,Yes,Yes,5,"Auto Publishing/Electro Mechanical Technician,..."
3,3,98,1,Train,Astoria,NY,US,11105,Master's,Journalism,2007-05-01 00:00:00,3,3.0,Yes,No,0,Editor-in-ChiefDeputy Sports & Website EditorA...
4,4,123,1,Train,Baton Rouge,LA,US,70808,Bachelor's,Agricultural Business,2011-05-01 00:00:00,1,9.0,Yes,No,0,Lead Hostess and Takeout Server


In [75]:
user_based_approach['DegreeType'] = user_based_approach['DegreeType'].fillna('')
user_based_approach['Major'] = user_based_approach['Major'].fillna('')
user_based_approach['TotalYearsExperience'] = str(user_based_approach['TotalYearsExperience'].fillna(''))

user_based_approach['DegreeType'] = user_based_approach['DegreeType'] + user_based_approach['Major'] + \
                                    user_based_approach['TotalYearsExperience'] + user_based_approach['WorkHistory']


In [76]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(user_based_approach['DegreeType'])

In [77]:
tfidf_matrix.shape

(20000, 173049)

In [78]:
# http://scikit-learn.org/stable/modules/metrics.html#linear-kernel
cosine_sim1 = linear_kernel(tfidf_matrix, tfidf_matrix)
# cosine_sim = cosine_similarity(tfidf_matrix)

In [79]:
cosine_sim.shape

(10000, 10000)

In [80]:
cosine_sim[0]

array([1.        , 0.04301522, 0.00643905, ..., 0.03802139, 0.03802139,
       0.03802139])

In [81]:
# a = [i for i in test_users.UserID if i in indices]
# print(a)
for i in indices.index:
    print(i)
    break

47


In [82]:
# Function for finding the index of similar user
def get_recommendations_userwise(userid,length=11):
    idx = indices[userid]
#     print (idx)
    sim_scores = list(enumerate(cosine_sim[idx]))
#     print (sim_scores[:10])
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
#     print(sim_scores[:10])
    user_indices = [i[0] for i in sim_scores]
#     print (user_indices[:10])
    return user_indices[0:length]

In [83]:
print ("-----Top 10 Similar users with userId: 72------")
get_recommendations_userwise(72,10)


-----Top 10 Similar users with userId: 72------


[1, 6054, 5871, 5162, 5354, 4799, 5120, 5412, 5247, 4731]

In [84]:
# Convertion of user indices to the userId
def convert_to_userID(user_index):
    user_idx = user_based_approach['index'].isin(user_index)
    df1 = pd.DataFrame(data = user_based_approach[user_idx], columns=['UserID'])
    return df1['UserID'].tolist()

In [85]:
# get the job list from the similar users
def get_job_id(usrid_list):
    jobs_userwise = apps_training['UserID'].isin(usrid_list) #
    df1 = pd.DataFrame(data = apps_training[jobs_userwise], columns=['JobID'])
    joblist = df1['JobID'].tolist()
    Job_list = jobs['JobID'].isin(joblist) #[1083186, 516837, 507614, 754917, 686406, 1058896, 335132])
    df_temp = pd.DataFrame(data = jobs[Job_list], columns=['JobID','Title','Description','City','State'])
    return df_temp[:11]

In [86]:
user_based_approach[user_based_approach['UserID']==123]

Unnamed: 0,index,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany,WorkHistory
4,4,123,1,Train,Baton Rouge,LA,US,70808,Bachelor'sAgricultural Business0 10\n1 ...,Agricultural Business,2011-05-01 00:00:00,1,0 10\n1 8\n2 11\n3 ...,Yes,No,0,Lead Hostess and Takeout Server


In [87]:
get_job_id(convert_to_userID(get_recommendations_userwise(123)))

Unnamed: 0,JobID,Title,Description,City,State
1653,6867,Mail Sorters - Part Time Evenings,<div>\r<div>\r<div>\r<p><strong>Mail Sorters -...,Secaucus,NJ
2504,10312,Receptionist,<span>To assist all departments in the dealers...,Arlington,TX
2854,11623,Receptionist/HR Assistant,"<p><span>Operates system switchboard, </span><...",Mansfield,TX
3955,15796,Maintenance Opportunities,"<p align=""center""><strong>Maintenance Opportun...",Green Bay,WI
20212,79199,Community Support Coordinator,"Community Options, Inc. is a national nonprofi...",Moorestown,NJ
21233,84219,Human Resources Business Process Analyst,<b>Job ID:</b> 80006\r\n\r\n<b>Position Descri...,Roanoke,VA
50179,196495,Set Up Technician,"<SPAN></SPAN>WPI, a manufacturer of plastic in...",Green Bay,WI
62581,246306,Front desk assistant,"F/t Personable, energetic, multitasker needed ...",Bedford,TX
63838,250360,Financial Representative,"<p><b><span style=""text-decoration: underline""...",Delray Beach,FL
76486,300459,FC Bkkp,"FC Bkkp - P/T, Exp. w/Accrual, Multiple Books...",Fort Lauderdale,FL


In [88]:
get_recommendations_userwise(123,10)

[4, 23, 3378, 6008, 6311, 41, 56, 9815, 7700, 2170]

In [89]:
print ("-----Top 10 Similar users with userId: 47------")
get_recommendations_userwise(47)

-----Top 10 Similar users with userId: 47------


[0, 5909, 3774, 6296, 3560, 401, 2608, 3760, 3481, 3561, 2603]

In [90]:
get_job_id(convert_to_userID(get_recommendations_userwise(47,10)))

Unnamed: 0,JobID,Title,Description,City,State
609,2121,MEDICAL- FRONT OFFICE,"Medical - Front Office\r\nIndustrial Clinic, ...",Los Angeles,CA
4388,17358,Data Entry - Customer Service Representative ...,"<div style=""text-align: center""><strong>\r<p s...",Greenville,SC
7820,28992,Customer Service Rep Experienced CSRs needed ...,Customer Service Rep\r\n Experienced CSRs...,Nashville,TN
10598,39361,Customer Service Representative,"<br>\r<p style=""text-align: center"" align=""cen...",Charlotte,NC
23873,92858,Compliance Specialist II — Advertising Review,Our unique culture of independence gives Raymo...,Saint Petersburg,FL
34021,132647,Purchasing Assistant,<div><p>&nbsp;</p>\r<p><b><span>ESSENTIAL DUTI...,Charlotte,NC
42812,169528,"Resort Host/Marketing Coordinator - Anaheim, CA","<P STYLE=""MARGIN-TOP: 0px; MARGIN-BOTTOM: 0px""...",Anaheim,CA
47531,186401,Tax Associate,Tax Associate\t\t\t20-32583576\n\nA large insu...,Hoffman Estates,IL
48693,190688,Medical Assistant/Per Diem,Kelly Healthcare Resources<BR> <BR><BR>Kelly H...,Belleville,IL
55344,217664,Office Administrator,FORT MILL MANUFACTURE SEEKING IMMEDIATE HIRE!\...,Fort Mill,SC


In [91]:
users.loc[users.UserID == 47]

Unnamed: 0,UserID,WindowID,Split,City,State,Country,ZipCode,DegreeType,Major,GraduationDate,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,47,1,Train,Paramount,CA,US,90723,High School,,1999-06-01 00:00:00,3,10.0,Yes,No,0


In [92]:
b = list(apps.loc[apps.UserID == 47]['JobID'])
b

[169528, 284009, 2121, 848187, 733748, 576958, 262470, 602298]

In [93]:
jobs.loc[jobs.JobID == 169528]

Unnamed: 0,JobID,WindowID,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate
42812,169528,1,"Resort Host/Marketing Coordinator - Anaheim, CA","<P STYLE=""MARGIN-TOP: 0px; MARGIN-BOTTOM: 0px""...",• Bachelor's Degree preferred;<BR> \r\n• Posse...,Anaheim,CA,US,92801,2012-04-03 22:31:26.41,2012-05-02 23:59:59


In [94]:
# jobs.at[853328,'Requirements']

In [95]:
# def find_accuracy(indices):
#     total = 0
#     for uid in indices.index:
#         app_job_ids=set(list(apps.loc[apps.UserID == uid]['JobID']))
#         len_app_job = len(app_job_ids)
#         if len_app_job != 0:
#             rec_job_ids=get_job_id(get_recommendations_userwise(uid, len_app_job))
#             total += len(app_job_ids.intersection(rec_job_ids))/len_app_job
#     return total/len(indices.index)