In [10]:
import numpy as np
import pandas as pd

In [11]:
job_df = pd.read_csv("Combined_Jobs_Final.csv")

In [12]:
job_df.head(2)

Unnamed: 0,Job.ID,Provider,Status,Slug,Title,Position,Company,City,State.Name,State.Code,...,Industry,Job.Description,Requirements,Salary,Listing.Start,Listing.End,Employment.Type,Education.Required,Created.At,Updated.At
0,111,1,open,palo-alto-ca-tacolicious-server,Server @ Tacolicious,Server,Tacolicious,Palo Alto,California,CA,...,Food and Beverages,Tacolicious' first Palo Alto store just opened...,,8.0,,,Part-Time,,2013-03-12 02:08:28 UTC,2014-08-16 15:35:36 UTC
1,113,1,open,san-francisco-ca-claude-lane-kitchen-staff-chef,Kitchen Staff/Chef @ Claude Lane,Kitchen Staff/Chef,Claude Lane,San Francisco,California,CA,...,Food and Beverages,\r\n\r\nNew French Brasserie in S.F. Financia...,,0.0,,,Part-Time,,2013-04-12 08:36:36 UTC,2014-08-16 15:35:36 UTC


In [13]:
print(len(job_df))
job_df = job_df[['Status', 'Title', 'Position', 'Company', 'Job.Description']]
print(len(job_df))

84090
84090


In [14]:
job_df.shape

(84090, 5)

In [15]:
job_df['Job.Description'][20]

'Hiring Event Details\r\nStore Associate\r\n\r\n$12.00 / Hour\r\nAdditional $1.00 Per Hour For ALL Sunday Shifts!\r\n50 Cent Wage Increases Beginning At 6 Months - Up to $13.50 At 2 Years\r\n\r\nMonday, December 15, 2014\r\n9am - 11am\r\n\r\nALDI\r\n3133 Market Place Dr\r\nOnalaska, WI 54650\r\n\r\n&nbsp;\r\nFor consideration, please apply in person at the hiring event only. Get started now by downloading our Store Employment Application.\r\n\r\nStore Associate - Retail Sales ( Customer Service )\r\n\r\nIf you are a customer service minded individual with a positive and energetic personality and you&rsquo;re interested in working for one of the best-known grocery stores in the nation, join the ALDI family! We are looking for motivated and reliable individuals to serve as a Store Associate. You will serve as the face of ALDI, providing customers with friendly and efficient check-out services. But that&rsquo;s just the beginning. You will also assist the store manager in a variety of rol

In [16]:
job_df.isnull().sum()
job_df.fillna('',inplace=True)
job_df.isnull().sum()

Status             0
Title              0
Position           0
Company            0
Job.Description    0
dtype: int64

In [17]:
job_df = job_df.sample(n=1000,random_state=42)

In [18]:
job_df.shape

(1000, 5)

# cleaning dataset
keeping all letters and digits                          
lover case                             
removing stopwords                            
tokenization                            
stemming                         

In [19]:
from nltk.corpus import stopwords
import nltk
import re
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [20]:
def cleaning(txt):
#     step 1
    txt = re.sub(r'[^a-zA-Z0-9\s]','',txt)
#     step 2
    tokens = nltk.word_tokenize(txt.lower())
    # step 3 and 5
    stemming = [ps.stem(w) for w in tokens if w not in stopwords.words('english')]
    return " ".join(stemming)
    

In [21]:
cleaning("\n\rhelo the master piece is my loving moving cat @9032#%$")

'helo master piec love move cat 9032'

In [22]:
job_df['Job.Description'] = job_df['Job.Description'].astype(str).apply(lambda x: cleaning(x))
job_df['Title'] = job_df['Title'].astype(str).apply(lambda x: cleaning(x))
job_df['Position'] = job_df['Position'].astype(str).apply(lambda x: cleaning(x))

In [23]:
job_df['clean_text'] = job_df['Job.Description']+" "+job_df['Title']+job_df['Position']

In [24]:
job_df['clean_text'][64119]

'job summari knowledg univers ku site director site leader inspir children teacher alik learn grow passion educ excel confid teach children adult use nation recogn curriculum framework creat uniqu engag classroom experi commit make site success know meaning relationship children famili team import success fulli engag enthusiast work eager share knowledg other job respons essenti function basic expect site director cours creativ new way meet exceed expect encourag long requir essenti function also met supervis children staff record keep licens record child file lesson plan implement mainten safe welcom classroom environ build relationship commun school recruit new student program applic must strong organiz skill site director knowledg universsite director'

# vectorizatoin

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
tfidf = TfidfVectorizer(stop_words='english')
matrix = tfidf.fit_transform(job_df['clean_text'])
similarity = cosine_similarity(matrix)

In [27]:
similarity

array([[1.        , 0.04322045, 0.02131549, ..., 0.05101897, 0.02496253,
        0.08044478],
       [0.04322045, 1.        , 0.02936862, ..., 0.03617188, 0.00565445,
        0.02613043],
       [0.02131549, 0.02936862, 1.        , ..., 0.05498817, 0.02959363,
        0.04218737],
       ...,
       [0.05101897, 0.03617188, 0.05498817, ..., 1.        , 0.05915732,
        0.11128466],
       [0.02496253, 0.00565445, 0.02959363, ..., 0.05915732, 1.        ,
        0.38569354],
       [0.08044478, 0.02613043, 0.04218737, ..., 0.11128466, 0.38569354,
        1.        ]])

In [28]:
sorted(list(enumerate(similarity[0])), key=lambda x: x[1], reverse=True)[1:20]

[(276, 0.9733772868873092),
 (730, 0.5181424118134745),
 (81, 0.49306292063609425),
 (917, 0.49306292063609425),
 (252, 0.2585811153172217),
 (128, 0.24712958824951992),
 (360, 0.23664048105861918),
 (825, 0.21870641716425473),
 (629, 0.2088881569389071),
 (38, 0.18257068622788095),
 (245, 0.1796380318454525),
 (114, 0.17263108553231282),
 (254, 0.16811344897738498),
 (298, 0.1446572732370542),
 (59, 0.14241842988596462),
 (940, 0.13972634715716156),
 (195, 0.1368714545586078),
 (284, 0.13625519998756502),
 (965, 0.13588092414113367)]

# Recommendation System

In [29]:
def recommend(title):

        indx = job_df[job_df['Title'] == title].index[0]
        indx = job_df.index.get_loc(indx)
        distances = sorted(list(enumerate(similarity[indx])), key=lambda x: x[1], reverse=True)[1:20]

        jobs = []
        for i in distances:
            jobs.append(job_df.iloc[i[0]].Title)
        return jobs
        

In [30]:
recommend('site director knowledg univers')

['site director knowledg univers educ llc',
 'teacher knowledg univers',
 'assist teacher knowledg univers',
 'assist teacher knowledg univers',
 'cook knowledg univers',
 'immedi open assist teacher la petit academi',
 'summer school age assist children courtyard',
 'hvac instructor vatterott educ center',
 'temporari coordin site oper macyscom maci',
 'fellowship program coordin connecticut children medic center',
 'medic offic assist instructor concord career colleg inc',
 'youth camp residenti assist activ coordin new york ny el educ servic',
 'pharmaci adjunct instructor brown macki colleg',
 'secur offic regular securita usa',
 'secur offic 100000 job coalit securita usa',
 'school day camp counselor ii ymca greenvil',
 'faculti call zenith educ group',
 'faculti support specialist zenith educ group',
 'handbag sell specialist part time bloomingdal chevi chase md bloomingdal']

In [33]:
import pickle
pickle.dump(job_df,open('df.pkl','wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))

In [34]:
import pandas as pd
print(pd.__version__)


1.4.4
