In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import sys

import warnings; warnings.simplefilter('ignore')

import os

working_directory = os.getcwd()

In [4]:
# Job postings used by every recommender in this notebook.
jobs = pd.read_csv("Sample_Datasets_Kaggles/Train_rev1.csv")

# Rating-style columns borrowed from a second file and attached to the jobs.
mock_variables = pd.read_csv("Sample_Datasets_Kaggles/md.csv")

# Column assignment aligns on the row index, so job i receives row i of
# `mock_variables`; jobs without a matching row get NaN in these columns.
jobs['vote_count'] = mock_variables['vote_count']
jobs['vote_average'] = mock_variables['vote_average']
jobs['start_date'] = mock_variables['release_date']
jobs['popularity'] = mock_variables['popularity']

# NOTE: the previous bare `jobs.dropna()` was removed. DataFrame.dropna()
# returns a new frame and its result was discarded, so it never changed `jobs`.
# Rows with missing values are therefore kept; downstream code filters nulls
# on the specific columns it needs.

jobs.head()

Unnamed: 0,Id,Title,FullDescription,LocationRaw,LocationNormalized,ContractType,ContractTime,Company,Category,SalaryRaw,SalaryNormalized,SourceName,vote_count,vote_average,start_date,popularity
0,12612628,Engineering Systems Analyst,Engineering Systems Analyst Dorking Surrey Sal...,"Dorking, Surrey, Surrey",Dorking,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk,5415.0,7.7,1995-10-30,21.946943
1,12612830,Stress Engineer Glasgow,Stress Engineer Glasgow Salary **** to **** We...,"Glasgow, Scotland, Scotland",Glasgow,,permanent,Gregory Martin International,Engineering Jobs,25000 - 35000/annum 25-35K,30000,cv-library.co.uk,2413.0,6.9,1995-12-15,17.015539
2,12612844,Modelling and simulation analyst,Mathematical Modeller / Simulation Analyst / O...,"Hampshire, South East, South East",Hampshire,,permanent,Gregory Martin International,Engineering Jobs,20000 - 40000/annum 20-40K,30000,cv-library.co.uk,92.0,6.5,1995-12-22,11.7129
3,12613049,Engineering Systems Analyst / Mathematical Mod...,Engineering Systems Analyst / Mathematical Mod...,"Surrey, South East, South East",Surrey,,permanent,Gregory Martin International,Engineering Jobs,25000 - 30000/annum 25K-30K negotiable,27500,cv-library.co.uk,34.0,6.1,1995-12-22,3.859495
4,12613647,"Pioneer, Miser Engineering Systems Analyst","Pioneer, Miser Engineering Systems Analyst Do...","Surrey, South East, South East",Surrey,,permanent,Gregory Martin International,Engineering Jobs,20000 - 30000/annum 20-30K,25000,cv-library.co.uk,173.0,5.7,1995-02-10,8.387519


In [124]:
#### Popularity Based Recommender System for Jobs ####

############################################

In [5]:
# Selects the jobs that have enough votes to be ranked by popularity
def get_qulified_jobs(data=None, quantile=0.95):
    """Return the jobs whose vote count reaches a quantile cutoff.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with 'Title', 'vote_count', 'vote_average' and 'popularity'
        columns. Defaults to the notebook-level `jobs` frame.
    quantile : float, optional
        Quantile of the non-null (integer-cast) vote counts used as the
        minimum vote count. Defaults to 0.95.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding 'Title', 'vote_count', 'vote_average' and
        'popularity' for rows with vote_count >= cutoff and a non-null
        vote_average. Both vote columns are cast to int, which truncates
        vote_average (7.7 -> 7).
    """
    source = jobs if data is None else data

    # Minimum number of votes a job needs in order to be listed.
    vote_counts = source['vote_count'].dropna().astype('int')
    m = vote_counts.quantile(quantile)

    # NaN >= m is False, so rows with a missing vote_count drop out here
    # without a separate null check.
    keep = (source['vote_count'] >= m) & source['vote_average'].notnull()

    # .copy() so the casts below write to an independent frame, not to a
    # view of `source`.
    qualified = source.loc[keep, ['Title', 'vote_count', 'vote_average', 'popularity']].copy()
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    return qualified

    
    

In [6]:
# Weighted rating that blends a job's own average with the overall mean

def weighted_rating(x, m=None, C=None):
    """Compute the weighted rating of one job row.

    The score is ``v/(v+m) * R + m/(v+m) * C`` where ``v`` is the row's
    'vote_count' and ``R`` its 'vote_average'.

    Parameters
    ----------
    x : mapping or pandas.Series
        Row providing 'vote_count' and 'vote_average'.
    m : number, optional
        Vote-count threshold. When omitted it is the 0.95 quantile of the
        non-null, integer-cast 'vote_count' column of the notebook-level
        `jobs` frame.
    C : number, optional
        Prior mean rating. When omitted it is the mean of the non-null,
        integer-cast 'vote_average' column of `jobs`.

    Returns
    -------
    float
        The weighted rating, or ``C`` when ``v + m`` is zero.

    Notes
    -----
    With ``DataFrame.apply(weighted_rating, axis=1)`` the defaults are
    recomputed for every row. Pass precomputed values to avoid that:
    ``df.apply(weighted_rating, axis=1, m=m, C=C)``.
    """
    if m is None:
        m = jobs['vote_count'].dropna().astype('int').quantile(0.95)
    if C is None:
        C = jobs['vote_average'].dropna().astype('int').mean()

    v = x['vote_count']
    R = x['vote_average']

    total = v + m
    if total == 0:
        # No votes and no threshold: the formula would be 0/0, so fall back
        # to the prior mean.
        return C

    return (v / total * R) + (m / total * C)

In [None]:
def build_chart_with_weights():
    """Return up to 250 qualified jobs ordered by weighted rating, best first.

    The qualified jobs come from `get_qulified_jobs()`; each row is scored
    with `weighted_rating` and the score is stored in a new 'wr' column.
    """
    chart = get_qulified_jobs()

    # Score every row and keep the result as the 'wr' column.
    chart['wr'] = chart.apply(weighted_rating, axis=1)

    # Highest weighted rating first, capped at 250 rows.
    ranked = chart.sort_values('wr', ascending=False)
    return ranked.head(250)

# TODO
def build_next10_with_weights_opt():
    """Return the first 30 rows of the weighted-rating chart."""
    return build_chart_with_weights().head(30)

# Top jobs: display the chart ranked by weighted rating (at most 250 rows)
build_chart_with_weights()


In [None]:
# Top 10 jobs: first 10 rows of the 30-row shortlist
build_next10_with_weights_opt().head(10)

In [None]:
#### Content Based Recommender using description ####

#####################################################

In [None]:
def get_recommendations_content_based(title, data=None, top_n=30):
    """Recommend jobs whose descriptions are most similar to the given job.

    Parameters
    ----------
    title : str
        Exact 'Title' of the job to find neighbours for. When several jobs
        share the title, the first one is used as the query.
    data : pandas.DataFrame, optional
        Frame with 'Title' and 'FullDescription' columns. When omitted, the
        postings are read from "Sample_Datasets_Kaggles/Train_rev1.csv", the
        same file the notebook's loading cell uses.
    top_n : int, optional
        Maximum number of recommendations to return. Defaults to 30.

    Returns
    -------
    pandas.Series
        Titles of the most similar jobs, most similar first, with the query
        job itself excluded.

    Raises
    ------
    KeyError
        If no job has the requested title.
    """
    if data is None:
        data = pd.read_csv("Sample_Datasets_Kaggles/Train_rev1.csv")

    # Positional row numbers must line up with the rows of the TF-IDF matrix.
    data = data.reset_index(drop=True)
    titles = data['Title']

    matches = np.flatnonzero((titles == title).values)
    if matches.size == 0:
        raise KeyError(title)
    idx = int(matches[0])

    # TF-IDF over unigrams and bigrams of every description. min_df=1 keeps
    # every term, which is what the former min_df=0 meant; recent scikit-learn
    # rejects an integer 0. Missing descriptions become empty documents.
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
    tfidf_matrix = tf.fit_transform(data['FullDescription'].fillna(''))

    # TF-IDF rows are L2-normalised by default, so the linear kernel is the
    # cosine similarity. Only the query row is compared, so no N x N matrix
    # is built.
    similarity = linear_kernel(tfidf_matrix[idx], tfidf_matrix).ravel()

    # Stable descending order; drop the query by position instead of assuming
    # it ranks first (an identical description could tie with it).
    order = np.argsort(-similarity, kind='stable')
    order = order[order != idx][:top_n]
    return titles.iloc[order]
    
# Example: show the first 10 description-based recommendations for one title
get_recommendations_content_based('Accounts Assistant').head(10)

In [None]:
#### MetaData Based Recommender with Credits and Keywords ####

############################################################


In [None]:
# Extra metadata tables, both keyed by 'Id'
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

# Give the join key an integer dtype in both metadata tables
for meta_table in (keywords, credits):
    meta_table['Id'] = meta_table['Id'].astype('int')

# English Snowball stemmer, used later to reduce keywords to their stems
stemmer = SnowballStemmer('english')

# The jobs frame needs the same integer key before it can be joined
jobs['Id'] = jobs['Id'].astype('int')

# Inner-join both tables onto the jobs; `jobs` is rebound to the merged frame
jobs = pd.merge(jobs, credits, on='Id')
jobs = pd.merge(jobs, keywords, on='Id')


In [None]:
def get_new_df_metadata_based():
    """Build the frame used by the metadata-based recommender.

    Takes the rows of the notebook-level `jobs` frame whose 'Id' is in
    `links_small`, then cleans the 'keywords' column and derives a 'Soup'
    column from it.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows where
        * 'keywords' is a list of stemmed, lower-cased, space-free keyword
          names that pass `filter_keywords`,
        * 'cast' is a list of lower-cased, space-free items,
        * 'Soup' is the keywords joined by single spaces.
    """
    # NOTE(review): `links_small` is not defined in the cells visible here;
    # it must exist in the kernel before this function is called.
    # .copy() so the column writes below go to an independent frame, not to
    # a view of `jobs`.
    smd = jobs[jobs['Id'].isin(links_small)].copy()

    # 'keywords' holds Python-literal text: parse it, then keep only each
    # entry's 'name' (anything that did not parse to a list becomes []).
    smd['keywords'] = smd['keywords'].apply(literal_eval)
    smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

    # NOTE(review): 'cast' is iterated as-is; if it still holds raw text this
    # yields single characters. Confirm its format against credits.csv.
    smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

    # Pass the helper itself to apply() and forward `smd` as its second
    # argument. The former `filter_keywords(args=(smd))` called the helper
    # immediately with the wrong arguments and raised a TypeError.
    smd['keywords'] = smd['keywords'].apply(filter_keywords, args=(smd,))
    smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
    smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

    # One space-separated string per job for the vectorizer.
    smd['Soup'] = smd['keywords'].apply(lambda x: ' '.join(x))

    return smd


In [None]:
def filter_keywords(x, smd):
    """Keep the keywords of `x` that occur more than once across `smd`.

    Parameters
    ----------
    x : iterable of str
        Keywords of a single job.
    smd : pandas.DataFrame
        Frame whose 'keywords' column holds one list of keywords per row;
        it supplies the corpus-wide frequencies.

    Returns
    -------
    list
        The items of `x`, in their original order, whose total count over
        ``smd['keywords']`` is greater than 1.
    """
    # Local import: Counter is not among the notebook's top-level imports.
    from collections import Counter

    # One pass over every keyword list; cells that are not lists (e.g. NaN)
    # contribute nothing.
    counts = Counter(
        keyword
        for row_keywords in smd['keywords']
        if isinstance(row_keywords, (list, tuple))
        for keyword in row_keywords
    )

    # Counter returns 0 for unseen keys, so unknown keywords are dropped.
    return [keyword for keyword in x if counts[keyword] > 1]

In [None]:
# Recommend jobs by the similarity of their keyword metadata
def get_recommendations_metadata_based(title, top_n=30):
    """Recommend jobs whose keyword 'Soup' is most similar to the given job.

    Parameters
    ----------
    title : str
        Exact 'Title' of the job to find neighbours for. When several jobs
        share the title, the first one is used as the query.
    top_n : int, optional
        Maximum number of recommendations to return. Defaults to 30.

    Returns
    -------
    pandas.Series
        Titles of the most similar jobs, most similar first, with the query
        job itself excluded.

    Raises
    ------
    KeyError
        If no job in the metadata frame has the requested title.
    """
    # Positional row numbers must line up with the rows of the count matrix.
    smd = get_new_df_metadata_based().reset_index(drop=True)
    titles = smd['Title']

    matches = np.flatnonzero((titles == title).values)
    if matches.size == 0:
        raise KeyError(title)
    idx = int(matches[0])

    # Unigram and bigram counts of the keyword soup. min_df=1 keeps every
    # term, which is what the former min_df=0 meant; recent scikit-learn
    # rejects an integer 0.
    count = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
    count_matrix = count.fit_transform(smd['Soup'])

    # Only the query row is compared, so no N x N matrix is built.
    similarity = cosine_similarity(count_matrix[idx], count_matrix).ravel()

    # Stable descending order; drop the query by position instead of assuming
    # it ranks first (an identical soup could tie with it).
    order = np.argsort(-similarity, kind='stable')
    order = order[order != idx][:top_n]
    return titles.iloc[order]


# Example: show the first 10 keyword-based recommendations for one title
get_recommendations_metadata_based('Accounts Assistant').head(10)