In [1]:
# import libraries  
import numpy as np
import pandas as pd
import nltk
import re, random, os
import string, pprint
import matplotlib.pyplot as plt
import seaborn as sns

# spacy for basic preprocessing, optional, can use nltk as well (lemmatisation etc.)
import spacy

import warnings
warnings.filterwarnings('ignore')

## 1. Read and inspect the data

In [2]:
data = pd.read_csv("potential-talents.csv")
data.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [3]:
data.fit.value_counts()

Series([], Name: count, dtype: int64)

In [4]:
data['fit'].unique()

array([nan])

We see that the fit column has all values as Nan, so we can safely drop it.

In [5]:
data.drop('fit', axis=1, inplace=True)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          104 non-null    int64 
 1   job_title   104 non-null    object
 2   location    104 non-null    object
 3   connection  104 non-null    object
dtypes: int64(1), object(3)
memory usage: 3.4+ KB


## 2. Data Cleaning and Preprocessing

In [7]:
# Checking for null values
data.isnull().sum()

id            0
job_title     0
location      0
connection    0
dtype: int64

In [8]:
# Finding unique values in each column
for col in data.columns:
    print(f'No. of unique values in {col} column = {data[col].nunique()}')

No. of unique values in id column = 104
No. of unique values in job_title column = 52
No. of unique values in location column = 41
No. of unique values in connection column = 33


Here we see that id column values are almost double the job title column values. So there is a **possibility for duplicate entries** which can be deleted.

In [9]:
# Check for duplicate entries without id column
temp = data.drop(['id'], axis=1)
temp.head()

Unnamed: 0,job_title,location,connection
0,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85
1,Native English Teacher at EPIK (English Progra...,Kanada,500+
2,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44
3,People Development Coordinator at Ryan,"Denton, Texas",500+
4,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+


In [10]:
temp.shape

(104, 3)

In [11]:
# Now finding duplicates and deleting them
temp.duplicated().sum()

51

As we see there are 51 columns which are duplicate of each other, we can delete these.

In [12]:
new_df = temp.drop_duplicates()
new_df.shape

(53, 3)

In [13]:
df = pd.concat([data['id'], new_df], axis=1).dropna()
df = df.reset_index(drop = True)


In [14]:
df.shape

(53, 4)

In [15]:
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+


#### a) Now taking the main query column that is JOB_TITLE and preprocessing it.

In [16]:
df['job_title'].value_counts()

job_title
Aspiring Human Resources Professional                                                                                    2
2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional                 1
Lead Official at Western Illinois University                                                                             1
Senior Human Resources Business Partner at Heil Environmental                                                            1
Aspiring Human Resources Professional | An energetic and Team-Focused Leader                                             1
HR Manager at Endemol Shine North America                                                                                1
Human Resources professional for the world leader in GIS software                                                        1
RRP Brand Portfolio Executive at JTI (Japan Tobacco International)                                                       1
Inform

From the above we can see a number of things for **text pre-processing**:
1. Expanding abbreviations and new lines and numbers
2. Convert to lower case and remove unwanted characters like punctuation symbols from the text. - what we see is characters like | or ! or \n or opening closing brackets which need to be handled.
3. Then we will remove **stop words** and go for stemming or/and lemmatization.

In [17]:
# Expanding abbreviations and removing \n and numbers
df.replace({'job_title' : { 'HR' : 'human resources', 'GIS': 'geographic information system',
                           'EY': 'Ernst & Young Global Limited', 'MES': 'Manufacturing execution systems',
                           'CHRO' : 'chief human resources officer', 'SVP' : 'senior vice president',
                           'CSR' : 'corporate social responsibility', 'GPHR' : 'global professional in human resources',
                           'SPHR' : 'strategic and policy-making certification', 'HRIS' : 'human resources management system',  
                           'C.T.' : '', 'EPIK': 'english program in korea',
                           '\n': '', '\w*\d\w*': ''
    }}, regex=True, inplace=True)


In [18]:
# Removing punctuation and changing to lowercase
punct_chars = set(string.punctuation)
df['job_title'] = df['job_title'].apply(lambda x: ''.join(char for char in x if char not in punct_chars))
df['job_title'] = df['job_title'].str.lower()

df['job_title'].value_counts()

job_title
aspiring human resources professional                                                                                                                                                     2
  bauer college of business graduate magna cum laude and aspiring human resources professional                                                                                            1
lead official at western illinois university                                                                                                                                              1
senior human resources business partner at heil environmental                                                                                                                             1
aspiring human resources professional  an energetic and teamfocused leader                                                                                                                1
human resources manager at endemol shine north ame

In [19]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [20]:
stop_nltk = set(stopwords.words('english'))
lemmatiser = WordNetLemmatizer()

In [21]:
def preprocess_text(dataframe, col):
    for i in range(len(dataframe)):
        tokens = word_tokenize(dataframe.iloc[i][col])
        tokens_no_stopwords = [word for word in tokens if word not in stop_nltk]
        final_sentence = []
        for word in tokens_no_stopwords:
            final_sentence.append(lemmatiser.lemmatize(word))
        dataframe[col][i]=' '.join([str(elem) for elem in final_sentence])

In [22]:
nltk.download('punkt')
nltk.download('wordnet')
preprocess_text(df, 'job_title')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\CHARU\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\CHARU\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,bauer college business graduate magna cum laud...,"Houston, Texas",85
1,2,native english teacher english program korea e...,Kanada,500+
2,3,aspiring human resource professional,"Raleigh-Durham, North Carolina Area",44
3,4,people development coordinator ryan,"Denton, Texas",500+
4,5,advisory board member celal bayar university,"İzmir, Türkiye",500+


### b) Now similar preprocessing for LOCATION

In [24]:
df['location'].value_counts()

location
Houston, Texas Area                    4
Raleigh-Durham, North Carolina Area    3
Greater New York City Area             3
Austin, Texas Area                     2
Amerika Birleşik Devletleri            2
Kanada                                 2
Greater Philadelphia Area              2
Greater Atlanta Area                   2
Torrance, California                   1
Highland, California                   1
Gaithersburg, Maryland                 1
Baltimore, Maryland                    1
Milpitas, California                   1
Greater Chicago Area                   1
Houston, Texas                         1
Long Beach, California                 1
Chattanooga, Tennessee Area            1
Bridgewater, Massachusetts             1
Lafayette, Indiana                     1
Kokomo, Indiana Area                   1
Las Vegas, Nevada Area                 1
Cape Girardeau, Missouri               1
Greater Los Angeles Area               1
Los Angeles, California                1
Dallas/

In [25]:
# Removing punctuations and correcting misspelt word(s)
punct_chars = punct_chars.difference(set([',']))
df['location'] = df['location'].apply(lambda x: ''.join(char for char in x if char not in punct_chars))
df['location'] = df['location'].str.lower()
df.replace({'location' : {'amerika birleşik devletleri' : 'united states of america',
                           'kanada' : 'canada', 'türkiye' : 'turkey',
    }}, regex=True, inplace=True)
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,bauer college business graduate magna cum laud...,"houston, texas",85
1,2,native english teacher english program korea e...,canada,500+
2,3,aspiring human resource professional,"raleighdurham, north carolina area",44
3,4,people development coordinator ryan,"denton, texas",500+
4,5,advisory board member celal bayar university,"i̇zmir, turkey",500+


In [26]:
preprocess_text(df, 'location')
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,bauer college business graduate magna cum laud...,"houston , texas",85
1,2,native english teacher english program korea e...,canada,500+
2,3,aspiring human resource professional,"raleighdurham , north carolina area",44
3,4,people development coordinator ryan,"denton , texas",500+
4,5,advisory board member celal bayar university,"i̇zmir , turkey",500+


### b) Preprocessing for CONNECTION

In [27]:
#We need to normalize the connections to be between 0-1. We will count 500+ as 500

# Function to normalize scores to between 0-1
def normalize_score(score):
    if score == '500+ ':
        score = 500
    return float(score)/500

df['connection'] = df['connection'].apply(normalize_score)
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,bauer college business graduate magna cum laud...,"houston , texas",0.17
1,2,native english teacher english program korea e...,canada,1.0
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088
3,4,people development coordinator ryan,"denton , texas",1.0
4,5,advisory board member celal bayar university,"i̇zmir , turkey",1.0


## 3. Word Embeddings

We will now convert the words or tokens into vectors , ie, continuous-valued vectors in a high-dimensional space to capture the semantic relationships between words. The methods we will use are :
1) **TF-IDF** is a classic metric that calculates the importance of a term in a document relative to a collection of documents.
2) **Word2Vec** It is a shallow neural network-based model that learns word embeddings by predicting words in their context (CBOW) or predicting context words given a target word (Skip-gram).
3) **Glove** It is a count-based model that learns word embeddings based on the co-occurrence statistics of words in a large corpus.
4) **FastText** It represents words as the sum of their constituent subword embeddings (character n-grams) for capturing more meaning.
5) **BERT** It is a contextual embedding model that pre-trains on massive text corpora and captures word meanings in context.

### Chosen Performance metric : Cosine Similarity

We need to find how fit the candidate is for the role. As stated in the problem statement, it should be a **number**, like probability between 0-1. So for this case **cosine similarity** seems the closest needed to fit our criterion. As it is apt for text-oriented searches through which we can gauge the **similarity between two vectors**, such as query and document vectors.
As our use case is one of ranking and performing **document search** , cosine similarity is the apt metric. 
<p><i>NOTE:</i> The <u>final ranking fitness</u> score will also take into account the starring feedback loop action.</p>

### 1) Beginning with TF-IDF

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer instantiation
tfidf = TfidfVectorizer()

# Create tfidf matrix
tf_matrix = tfidf.fit_transform(df['job_title'])

# Shape of matrix
print(tf_matrix.shape)

(53, 181)


In [29]:
data = pd.DataFrame(tf_matrix.toarray(), columns=tfidf.get_feature_names_out())
data

Unnamed: 0,administration,administrative,admission,advisory,always,america,analyst,analytics,army,art,...,vice,victoria,wellington,western,westfield,within,woodland,work,world,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.426332,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Function for cleaning and preprocessing of query terms
def preprocessing_keywords(keywords):
    keywords = re.sub('[^a-zA-Z]', ' ', keywords) # remove punctuations and numbers
    keywords = str(keywords).lower()              # lowercase
    tokens = word_tokenize(keywords)          
    keywords_no_stopwords = [word for word in tokens if word not in stop_nltk]
    keywords = [lemmatiser.lemmatize(word=word) for word in keywords_no_stopwords]
    keywords = ' '.join(keywords)
    return keywords

#### Let us start with our search term keywords as "Aspiring human resources" 

In [31]:
keyword = 'Aspiring human resources'

# Preprocess these words
keyword_cleaned = preprocessing_keywords(keyword)
print(f'{keyword_cleaned}')

aspiring human resource


In [32]:
tfidf_keyword = tfidf.transform([keyword_cleaned])
print(tfidf_keyword.shape)

(1, 181)


In [33]:
# Now computing cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
cos_sim = cosine_similarity(tf_matrix, tfidf_keyword)

cos_data = { 'tfidf_fit' : cos_sim.ravel()}
cos_df = pd.DataFrame(cos_data)
cos_df.head()

Unnamed: 0,tfidf_fit
0,0.268142
1,0.0
2,0.76765
3,0.0
4,0.0


In [35]:
## Creating a final dataset with the performance metrics appended
final_df = pd.concat([df, cos_df], axis=1)
final_df.head()

Unnamed: 0,id,job_title,location,connection,tfidf_fit
0,1,bauer college business graduate magna cum laud...,"houston , texas",0.17,0.268142
1,2,native english teacher english program korea e...,canada,1.0,0.0
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765
3,4,people development coordinator ryan,"denton , texas",1.0,0.0
4,5,advisory board member celal bayar university,"i̇zmir , turkey",1.0,0.0


In [36]:
# Let us sort the dataframe to have the job_titles which have closest fit to our keywords search query
sorted_df = final_df.sort_values(['tfidf_fit'], ascending=False)
sorted_df.head(6)

Unnamed: 0,id,job_title,location,connection,tfidf_fit
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406
21,73,aspiring human resource manager seeking intern...,"houston , texas area",0.014,0.612429
12,27,aspiring human resource management student see...,"houston , texas area",1.0,0.429728
20,72,business management major aspiring human resou...,"monroe , louisiana area",0.01,0.427448


In [37]:
sorted_df['job_title'].head(6).tolist()

['aspiring human resource professional',
 'aspiring human resource professional',
 'aspiring human resource specialist',
 'aspiring human resource manager seeking internship human resource',
 'aspiring human resource management student seeking internship',
 'business management major aspiring human resource manager']

Here we find the top 5 <b>job_titles</b> similar to <b><i>"Aspiring human resources"</b></i> are: 
* aspiring human resource professional with similarity 76.76%
* aspiring human resource specialist with similarity 67.64%
* aspiring human resource manager seeking internship human resource with similarity 61.2%
* aspiring human resource management student seeking internship with similarity 42.97%
* business management major aspiring human resource manager with similarity 42.74%

In [38]:
pd.set_option('display.max_colwidth', None)

In [39]:
## Function to do above all steps in one go
def top_talent(job_title):
    keyword_cleaned = preprocessing_keywords(job_title)
    tfidf_keyword = tfidf.transform([keyword_cleaned])
    cos_sim = cosine_similarity(tf_matrix, tfidf_keyword)
    cos_data = { 'tfidf_fit' : cos_sim.ravel()}
    cos_df = pd.DataFrame(cos_data)
    temp_df = pd.concat([df, cos_df], axis=1)
    return sorted_df

In [40]:
top_talent('Aspiring human resources').sort_values(['tfidf_fit'], ascending=False).head(5)

Unnamed: 0,id,job_title,location,connection,tfidf_fit
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728


#### Let us also check for keywords: "seeking human resources"

In [41]:
final_df_kw2 = top_talent('seeking human resources')
final_df_kw2.sort_values(['tfidf_fit'], ascending=False).head(5)

Unnamed: 0,id,job_title,location,connection,tfidf_fit
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728


### 2) Word2Vec

In [42]:
import gensim
import warnings
warnings.filterwarnings('ignore')
import multiprocessing
from gensim.models.phrases import Phrases, Phraser


In [43]:
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,bauer college business graduate magna cum laude aspiring human resource professional,"houston , texas",0.17
1,2,native english teacher english program korea english program korea,canada,1.0
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088
3,4,people development coordinator ryan,"denton , texas",1.0
4,5,advisory board member celal bayar university,"i̇zmir , turkey",1.0


In [44]:
corpus = [words.split() for words in df['job_title']]
corpus

[['bauer',
  'college',
  'business',
  'graduate',
  'magna',
  'cum',
  'laude',
  'aspiring',
  'human',
  'resource',
  'professional'],
 ['native',
  'english',
  'teacher',
  'english',
  'program',
  'korea',
  'english',
  'program',
  'korea'],
 ['aspiring', 'human', 'resource', 'professional'],
 ['people', 'development', 'coordinator', 'ryan'],
 ['advisory', 'board', 'member', 'celal', 'bayar', 'university'],
 ['aspiring', 'human', 'resource', 'specialist'],
 ['student',
  'humber',
  'college',
  'aspiring',
  'human',
  'resource',
  'generalist'],
 ['human', 'resource', 'senior', 'specialist'],
 ['seeking',
  'human',
  'resource',
  'human',
  'resourcesis',
  'generalist',
  'position'],
 ['student', 'chapman', 'university'],
 ['senior',
  'vice',
  'president',
  'chuman',
  'resourceso',
  'marketing',
  'communication',
  'corporate',
  'social',
  'responsibility',
  'officer',
  'engie',
  'houston',
  'woodland',
  'energy',
  'gphuman',
  'resource',
  'sphuman',


In [45]:
w2vmodel = gensim.models.Word2Vec(corpus, min_count=1, workers = multiprocessing.cpu_count(), 
                                  vector_size=100, epochs=30)

In [46]:
w2vmodel

<gensim.models.word2vec.Word2Vec at 0x166653e2ac0>

In [47]:
w2vmodel.wv["aspiring"]

array([-2.21384433e-03,  5.07073337e-03, -6.95561524e-03,  2.94341851e-04,
        7.75390398e-03,  3.52842221e-03, -2.67321314e-03,  6.81405794e-03,
       -1.15574645e-02,  6.01049978e-03, -4.79733804e-03, -6.11406891e-03,
        9.48811881e-03,  2.63728155e-03,  7.81554077e-03, -5.36438031e-03,
        5.88819245e-03,  6.00915169e-03, -9.92803834e-03, -9.92780644e-03,
       -5.54706110e-03, -3.40491580e-03, -3.87486839e-03, -1.02112200e-02,
        7.74142100e-03, -3.69489472e-03,  7.36223813e-03,  3.65609396e-03,
       -8.41954723e-03,  4.35816916e-03,  6.97632600e-03, -7.10925553e-03,
       -6.79161120e-03, -3.19447601e-03, -9.22233239e-03,  3.00661661e-03,
        1.31044909e-03,  2.91950838e-03,  1.83283677e-03, -3.52096767e-03,
       -5.64593729e-03, -1.58958440e-03, -3.74302920e-03,  6.38566352e-03,
        6.02450361e-03,  4.15194593e-03,  5.47890784e-04, -2.77629285e-03,
       -2.31837621e-03,  4.50074673e-04,  3.15941032e-03, -4.22517490e-03,
       -5.17041981e-03, -

In [48]:
def check_present_in_model(token, model):
    if model.wv.__contains__(token) == True:
        return model.wv[token]
    else:
        return np.zeros(100)

In [49]:
def calculate_vector(input, model):
    query_tokens = input.split()
    query_vector = check_present_in_model(query_tokens[0], model)
    
    for token in query_tokens[1:]:
        query_vector = query_vector + check_present_in_model(token, model)

    query_vector = query_vector / len(query_tokens)
    return query_vector

In [50]:
w2vmodel.wv['college']

array([-8.7073585e-03, -9.1235840e-04, -8.0804192e-03,  8.9676762e-03,
        6.0702991e-03,  2.3903321e-03,  9.2701092e-03,  2.5726387e-03,
       -1.2056711e-02,  9.0783825e-03, -4.8670354e-03,  5.8739623e-03,
        5.7388837e-03,  5.9800721e-05, -7.3451903e-03, -7.8591792e-04,
        6.8201590e-03, -1.0004920e-02,  2.5494376e-04, -1.1656916e-02,
        9.7782193e-03, -1.6266612e-03,  9.2604579e-03, -6.9672181e-03,
       -9.9078640e-03, -7.9289768e-03, -4.8611234e-03,  3.3800271e-03,
       -1.5858081e-03,  9.4573433e-03,  3.9221593e-03,  4.0529040e-03,
        3.5757334e-03,  7.8469068e-03, -3.0084215e-03,  1.1160766e-02,
       -8.1678657e-03,  2.5565471e-03, -3.7057896e-04, -1.5962666e-03,
        6.7856722e-03, -5.3728945e-03, -4.5289327e-03, -1.8327226e-04,
        1.1380890e-03, -3.9495160e-03,  5.5922484e-03, -6.6196970e-03,
        9.3545979e-03,  1.0165565e-03,  3.9174072e-03,  2.1714813e-03,
        9.9074154e-04,  1.2273814e-03, -2.4351210e-03,  5.6173322e-03,
      

In [51]:
query_vector = calculate_vector(preprocessing_keywords('aspiring human resources'), w2vmodel)
job_title_scores = {}
for job_title in final_df['job_title']:
    job_title_vector = calculate_vector(job_title, w2vmodel)
    similarity_score = w2vmodel.wv.cosine_similarities(query_vector, [job_title_vector])[0]
    job_title_scores[job_title] = similarity_score

for title, score in job_title_scores.items():
    print(title, score)

bauer college business graduate magna cum laude aspiring human resource professional 0.8619738
native english teacher english program korea english program korea 0.56780547
aspiring human resource professional 0.95190215
people development coordinator ryan 0.51234156
advisory board member celal bayar university 0.61254036
aspiring human resource specialist 0.94967794
student humber college aspiring human resource generalist 0.89007705
human resource senior specialist 0.8344201
seeking human resource human resourcesis generalist position 0.8804667
student chapman university 0.43617266
senior vice president chuman resourceso marketing communication corporate social responsibility officer engie houston woodland energy gphuman resource sphuman resource 0.8021904
human resource coordinator intercontinental buckhead atlanta 0.7947706
aspiring human resource management student seeking internship 0.85901463
seeking human resource opportunity 0.8841362
experienced retail manager aspiring human 

In [52]:
## Assigning these values to our final_df 

In [53]:
final_df['Word2Vec_Fit'] = final_df['job_title'].map(job_title_scores)
final_df.head()

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit
0,1,bauer college business graduate magna cum laude aspiring human resource professional,"houston , texas",0.17,0.268142,0.861974
1,2,native english teacher english program korea english program korea,canada,1.0,0.0,0.567805
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902
3,4,people development coordinator ryan,"denton , texas",1.0,0.0,0.512342
4,5,advisory board member celal bayar university,"i̇zmir , turkey",1.0,0.0,0.61254


#### Let us also check for keywords: "seeking human resources"

In [54]:
query_vector = calculate_vector(preprocessing_keywords('seeking human resources'), w2vmodel)
job_title_scores = {}
for job_title in final_df['job_title']:
    job_title_vector = calculate_vector(job_title, w2vmodel)
    similarity_score = w2vmodel.wv.cosine_similarities(query_vector, [job_title_vector])[0]
    job_title_scores[job_title] = similarity_score

In [55]:
# Adding these to our second keywords table as well
final_df_kw2['Word2Vec_Fit'] = final_df_kw2['job_title'].map(job_title_scores)
final_df_kw2.head()

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.859485
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.859485
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.842872
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.946601
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728,0.855958


#### Some <i><u>utility functions</i></u> which will be needed for multiple algorithms :

In [56]:
# Function to calculate cosine similarity for checking closeness of search query vector
def cosine_sim(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0  # Handle the case of zero vectors
    
    similarity = dot_product / (norm_vec1 * norm_vec2)
    return similarity

In [57]:
# Function to calculate average of the word vectors
def average_word_vectors(words, model, size):
    word_vectors = [model[word] if word in model else np.zeros(size) for word in words]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(size)

In [58]:
def calculate_similarity(model, query, size):
    if size == -1 :
        size = model.vector_size
    job_title_vectors = [average_word_vectors(job_title.split(), model, size) for job_title in final_df['job_title']]
    query_vector = np.mean([model[word] if word in model else np.zeros(size) for word in preprocessing_keywords(query).split()], axis=0)
    similarities = [cosine_sim(query_vector, title_vector) for title_vector in job_title_vectors]
    return similarities

### 2.b) Using Google's pretrained Word2vec model

Apart from our own created Word2vec model with limited context, we will also be trying with a  <b><u>pretrained word2vec model</b></u> which will be trained on a massive corpus and may capture the semantic language patterns better. So trying with Google's pretrained Word2Vec model as well.

In [59]:
from gensim.models import KeyedVectors

In [60]:
google_word2vec = 'GoogleNews-vectors-negative300.bin.gz'

In [61]:
google_model = KeyedVectors.load_word2vec_format(google_word2vec, binary=True)

In [62]:
google_similarities = calculate_similarity(google_model, 'aspiring human resources',-1)
final_df['Google_Word2vec_fit'] = google_similarities
final_df.head()

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit
0,1,bauer college business graduate magna cum laude aspiring human resource professional,"houston , texas",0.17,0.268142,0.861974,0.623142
1,2,native english teacher english program korea english program korea,canada,1.0,0.0,0.567805,0.219639
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395
3,4,people development coordinator ryan,"denton , texas",1.0,0.0,0.512342,0.253594
4,5,advisory board member celal bayar university,"i̇zmir , turkey",1.0,0.0,0.61254,0.206563


In [63]:
# Also trying for our second set of keywords
google_similarities_2 = calculate_similarity(google_model, 'seeking human resources', -1)
final_df_kw2['Google_Word2vec_fit'] = google_similarities_2
final_df_kw2.head()

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.859485,0.422934
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.859485,0.127382
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.842872,0.723763
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.946601,0.268558
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728,0.855958,0.237063


### 3) Now applying Glove

In [64]:
glove_file = 'glove.6B.300d.word2vec.txt'

In [65]:
# Load the GloVe word vectors into Gensim format
glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False)

In [66]:
title_similarities = calculate_similarity(glove_model, 'aspiring human resources', -1)
final_df['Glove_fit'] = title_similarities
final_df.head()

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit
0,1,bauer college business graduate magna cum laude aspiring human resource professional,"houston , texas",0.17,0.268142,0.861974,0.623142,0.552778
1,2,native english teacher english program korea english program korea,canada,1.0,0.0,0.567805,0.219639,0.376623
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892
3,4,people development coordinator ryan,"denton , texas",1.0,0.0,0.512342,0.253594,0.451536
4,5,advisory board member celal bayar university,"i̇zmir , turkey",1.0,0.0,0.61254,0.206563,0.262758


In [67]:
final_df.sort_values('Glove_fit', ascending=False).head(5)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.949678,0.912262,0.934877
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.951902,0.950395,0.924892
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.945208,0.875945,0.908395
22,74,human resource professional,greater boston area,0.032,0.40256,0.879294,0.874494,0.871642


In [68]:
# Also trying for our second set of keywords
title_similarities = calculate_similarity(glove_model, 'seeking human resources', -1)
final_df_kw2['Glove_fit'] = title_similarities
final_df_kw2.head()

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.859485,0.422934,0.459776
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.859485,0.127382,0.403659
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.842872,0.723763,0.792764
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.946601,0.268558,0.513656
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728,0.855958,0.237063,0.305832


### 4) Now applying FastText

In [69]:
import fasttext
import fasttext.util

In [70]:
fasttext.util.download_model('en', if_exists='ignore')  # English

'cc.en.300.bin'

In [71]:
fasttext_model = fasttext.load_model('cc.en.300.bin')



In [72]:
# Get embeddings for the query
search_query = 'aspiring human resources'
query_vector = fasttext_model.get_sentence_vector(preprocessing_keywords(search_query))

In [73]:
title_similarities = calculate_similarity(fasttext_model, 'aspiring human resources', 300)
final_df['Fasttext_Fit'] = title_similarities
final_df.head()

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit
0,1,bauer college business graduate magna cum laude aspiring human resource professional,"houston , texas",0.17,0.268142,0.861974,0.623142,0.552778,0.458525
1,2,native english teacher english program korea english program korea,canada,1.0,0.0,0.567805,0.219639,0.376623,0.301101
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892,0.984363
3,4,people development coordinator ryan,"denton , texas",1.0,0.0,0.512342,0.253594,0.451536,0.300589
4,5,advisory board member celal bayar university,"i̇zmir , turkey",1.0,0.0,0.61254,0.206563,0.262758,0.23651


In [74]:
final_df.sort_values('Fasttext_Fit', ascending=False).head(5)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892,0.984363
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.951902,0.950395,0.924892,0.984363
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.949678,0.912262,0.934877,0.969928
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.945208,0.875945,0.908395,0.938275
22,74,human resource professional,greater boston area,0.032,0.40256,0.879294,0.874494,0.871642,0.931923


In [75]:
# Also trying for our second set of keywords
title_similarities = calculate_similarity(fasttext_model, 'seeking human resources', 300)
final_df_kw2['Fasttext_Fit'] = title_similarities
final_df_kw2.head()

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.859485,0.422934,0.459776,0.388841
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.859485,0.127382,0.403659,0.252584
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.842872,0.723763,0.792764,0.877064
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.946601,0.268558,0.513656,0.281134
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728,0.855958,0.237063,0.305832,0.235586


In [76]:
final_df_kw2.sort_values('Fasttext_Fit', ascending=False).head(5)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit
13,28,seeking human resource opportunity,"chicago , illinois",0.78,0.271462,0.943784,0.904508,0.956014,0.977981
38,90,undergraduate research assistant styczynski lab,greater atlanta area,0.31,0.0,0.425043,0.918737,0.938362,0.959728
6,7,student humber college aspiring human resource generalist,canada,0.122,0.391946,0.801875,0.870324,0.914206,0.94745
16,68,human resource specialist luxottica,greater new york city area,1.0,0.227186,0.808715,0.842598,0.908195,0.931645
15,67,human resource staffing recruiting professional,"jackson , mississippi area",1.0,0.198735,0.795123,0.799299,0.855232,0.911936


### 5) Now applying BERT

In [77]:
from transformers import BertTokenizer, BertModel

In [78]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [79]:
import torch

# Encoding the job titles with BERT
embeddings = []
for job_title in final_df['job_title']:
    inputs = tokenizer(job_title, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        output = model(**inputs)
    embeddings.append(output.pooler_output.numpy())

In [80]:
query_1 = 'aspiring human resources'
query_2 = 'seeking human resources'

query1_inputs = tokenizer(query_1, return_tensors = 'pt', padding=True, truncation=True)
query2_inputs = tokenizer(query_2, return_tensors = 'pt', padding=True, truncation=True)

In [81]:
# Processing 
with torch.no_grad():
    query1_output = model(**query1_inputs)
    query2_output = model(**query2_inputs)
query1_embedding = query1_output.pooler_output.numpy()
query2_embedding = query2_output.pooler_output.numpy()

In [82]:
# Now calculating similarities for query 1
similarities = []
for embedding in embeddings:
    similarity = cosine_similarity(embedding, query1_embedding)
    similarities.append(similarity)
    
final_df['Bert_fit'] = np.vstack(similarities).ravel()
final_df.head()

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
0,1,bauer college business graduate magna cum laude aspiring human resource professional,"houston , texas",0.17,0.268142,0.861974,0.623142,0.552778,0.458525,0.899871
1,2,native english teacher english program korea english program korea,canada,1.0,0.0,0.567805,0.219639,0.376623,0.301101,0.87627
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594
3,4,people development coordinator ryan,"denton , texas",1.0,0.0,0.512342,0.253594,0.451536,0.300589,0.84262
4,5,advisory board member celal bayar university,"i̇zmir , turkey",1.0,0.0,0.61254,0.206563,0.262758,0.23651,0.839048


In [83]:
# Now calculating similarities for query 2
similarities2 = []
for embedding in embeddings:
    similarity = cosine_similarity(embedding, query2_embedding)
    similarities2.append(similarity)
    
final_df_kw2['Bert_fit'] = np.vstack(similarities2).ravel()
final_df_kw2.head()

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.859485,0.422934,0.459776,0.388841,0.888918
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.859485,0.127382,0.403659,0.252584,0.867738
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.842872,0.723763,0.792764,0.877064,0.924877
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.946601,0.268558,0.513656,0.281134,0.82769
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728,0.855958,0.237063,0.305832,0.235586,0.83297


### Finalizing what will be our FIT function.

Let us look at our applied algorithms by **visual table inspection** which is showing best current fit for the keyword searches of 
* a) Aspiring human resources 
* b) seeking human resources

### 1) For "Aspiring human resources"

In [84]:
# For Keyword set a)
final_df.head(5)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
0,1,bauer college business graduate magna cum laude aspiring human resource professional,"houston , texas",0.17,0.268142,0.861974,0.623142,0.552778,0.458525,0.899871
1,2,native english teacher english program korea english program korea,canada,1.0,0.0,0.567805,0.219639,0.376623,0.301101,0.87627
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594
3,4,people development coordinator ryan,"denton , texas",1.0,0.0,0.512342,0.253594,0.451536,0.300589,0.84262
4,5,advisory board member celal bayar university,"i̇zmir , turkey",1.0,0.0,0.61254,0.206563,0.262758,0.23651,0.839048


We will get first 10 sorted outputs by each algorithm to see which works better. So, 

#### a) For TFIDF

In [85]:
final_df.sort_values(['tfidf_fit'], ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.949678,0.912262,0.934877,0.969928,0.964565
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.945208,0.875945,0.908395,0.938275,0.922119
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728,0.859015,0.783973,0.815054,0.866789,0.936784
20,72,business management major aspiring human resource manager,"monroe , louisiana area",0.01,0.427448,0.915736,0.760062,0.756317,0.812208,0.931662
14,66,experienced retail manager aspiring human resource professional,"austin , texas area",0.114,0.40613,0.873018,0.779507,0.768465,0.833309,0.84477
22,74,human resource professional,greater boston area,0.032,0.40256,0.879294,0.874494,0.871642,0.931923,0.958796
6,7,student humber college aspiring human resource generalist,canada,0.122,0.391946,0.890077,0.829068,0.738099,0.797077,0.900687
30,82,aspiring human resource professional energetic teamfocused leader,"austin , texas area",0.348,0.375301,0.895872,0.827266,0.824088,0.885884,0.888631


#### b) For Word2Vec

In [86]:
final_df.sort_values(['Word2Vec_Fit'], ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.949678,0.912262,0.934877,0.969928,0.964565
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.945208,0.875945,0.908395,0.938275,0.922119
20,72,business management major aspiring human resource manager,"monroe , louisiana area",0.01,0.427448,0.915736,0.760062,0.756317,0.812208,0.931662
48,100,aspiring human resource manager graduating may seeking entrylevel human resource position st louis,"cape girardeau , missouri",0.206,0.373131,0.911749,0.79129,0.774596,0.584913,0.882259
30,82,aspiring human resource professional energetic teamfocused leader,"austin , texas area",0.348,0.375301,0.895872,0.827266,0.824088,0.885884,0.888631
47,99,seeking human resource position,"la vega , nevada area",0.096,0.280808,0.894779,0.728513,0.785119,0.850613,0.959093
27,79,liberal art major aspiring human resource analyst,"baton rouge , louisiana area",0.014,0.351478,0.890232,0.731783,0.760117,0.723149,0.899233
6,7,student humber college aspiring human resource generalist,canada,0.122,0.391946,0.890077,0.829068,0.738099,0.797077,0.900687


#### c) For Google Word2Vec

In [87]:
final_df.sort_values(['Google_Word2vec_fit'], ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.949678,0.912262,0.934877,0.969928,0.964565
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.945208,0.875945,0.908395,0.938275,0.922119
22,74,human resource professional,greater boston area,0.032,0.40256,0.879294,0.874494,0.871642,0.931923,0.958796
6,7,student humber college aspiring human resource generalist,canada,0.122,0.391946,0.890077,0.829068,0.738099,0.797077,0.900687
30,82,aspiring human resource professional energetic teamfocused leader,"austin , texas area",0.348,0.375301,0.895872,0.827266,0.824088,0.885884,0.888631
48,100,aspiring human resource manager graduating may seeking entrylevel human resource position st louis,"cape girardeau , missouri",0.206,0.373131,0.911749,0.79129,0.774596,0.584913,0.882259
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728,0.859015,0.783973,0.815054,0.866789,0.936784
26,78,human resource generalist schwans,united state america,1.0,0.231303,0.810355,0.780322,0.797933,0.889861,0.97699


#### d) For Glove

In [88]:
final_df.sort_values(['Glove_fit'], ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.949678,0.912262,0.934877,0.969928,0.964565
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.945208,0.875945,0.908395,0.938275,0.922119
22,74,human resource professional,greater boston area,0.032,0.40256,0.879294,0.874494,0.871642,0.931923,0.958796
13,28,seeking human resource opportunity,"chicago , illinois",0.78,0.271462,0.884136,0.756767,0.837561,0.877444,0.975341
30,82,aspiring human resource professional energetic teamfocused leader,"austin , texas area",0.348,0.375301,0.895872,0.827266,0.824088,0.885884,0.888631
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728,0.859015,0.783973,0.815054,0.866789,0.936784
8,10,seeking human resource human resourcesis generalist position,greater philadelphia area,1.0,0.262936,0.880467,0.76045,0.805695,0.8841,0.845393
26,78,human resource generalist schwans,united state america,1.0,0.231303,0.810355,0.780322,0.797933,0.889861,0.97699


#### e) For Fasttext

In [89]:
final_df.sort_values(['Fasttext_Fit'], ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.949678,0.912262,0.934877,0.969928,0.964565
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.945208,0.875945,0.908395,0.938275,0.922119
22,74,human resource professional,greater boston area,0.032,0.40256,0.879294,0.874494,0.871642,0.931923,0.958796
16,68,human resource specialist luxottica,greater new york city area,1.0,0.227186,0.826671,0.741441,0.704786,0.893584,0.940522
26,78,human resource generalist schwans,united state america,1.0,0.231303,0.810355,0.780322,0.797933,0.889861,0.97699
49,101,human resource generalist loparex,"raleighdurham , north carolina area",1.0,0.231303,0.809823,0.780322,0.797933,0.889861,0.98047
30,82,aspiring human resource professional energetic teamfocused leader,"austin , texas area",0.348,0.375301,0.895872,0.827266,0.824088,0.885884,0.888631
8,10,seeking human resource human resourcesis generalist position,greater philadelphia area,1.0,0.262936,0.880467,0.76045,0.805695,0.8841,0.845393


#### f) For BERT

In [90]:
final_df.sort_values(['Bert_fit'], ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
51,103,always set success,greater los angeles area,1.0,0.0,0.340498,0.213981,0.361773,0.282368,0.98185
49,101,human resource generalist loparex,"raleighdurham , north carolina area",1.0,0.231303,0.809823,0.780322,0.797933,0.889861,0.98047
26,78,human resource generalist schwans,united state america,1.0,0.231303,0.810355,0.780322,0.797933,0.889861,0.97699
13,28,seeking human resource opportunity,"chicago , illinois",0.78,0.271462,0.884136,0.756767,0.837561,0.877444,0.975341
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.949678,0.912262,0.934877,0.969928,0.964565
11,13,human resource coordinator intercontinental buckhead atlanta,"atlanta , georgia",1.0,0.153334,0.794771,0.57807,0.559741,0.654652,0.962325
7,8,human resource senior specialist,san francisco bay area,1.0,0.248041,0.83442,0.697906,0.76478,0.822855,0.961523
9,11,student chapman university,"lake forest , california",0.004,0.0,0.436173,0.244274,0.313866,0.335059,0.960531
47,99,seeking human resource position,"la vega , nevada area",0.096,0.280808,0.894779,0.728513,0.785119,0.850613,0.959093
22,74,human resource professional,greater boston area,0.032,0.40256,0.879294,0.874494,0.871642,0.931923,0.958796


### 2) For "Seeking human resources"

#### a) For TFIDF

In [91]:
final_df_kw2.sort_values(['tfidf_fit'], ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.859485,0.422934,0.459776,0.388841,0.888918
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.859485,0.127382,0.403659,0.252584,0.867738
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.842872,0.723763,0.792764,0.877064,0.924877
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.946601,0.268558,0.513656,0.281134,0.82769
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728,0.855958,0.237063,0.305832,0.235586,0.83297
20,72,business management major aspiring human resource manager,"monroe , louisiana area",0.01,0.427448,0.858703,0.759907,0.83647,0.884207,0.955644
14,66,experienced retail manager aspiring human resource professional,"austin , texas area",0.114,0.40613,0.814708,0.632909,0.600964,0.684716,0.888156
22,74,human resource professional,greater boston area,0.032,0.40256,0.869644,0.70652,0.801243,0.815128,0.951196
6,7,student humber college aspiring human resource generalist,canada,0.122,0.391946,0.801875,0.870324,0.914206,0.94745,0.842429
30,82,aspiring human resource professional energetic teamfocused leader,"austin , texas area",0.348,0.375301,0.82329,0.164599,0.298825,0.263629,0.954817


#### b) For Word2Vec

In [92]:
final_df_kw2.sort_values(['Word2Vec_Fit'], ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
47,99,seeking human resource position,"la vega , nevada area",0.096,0.280808,0.957151,0.718755,0.803569,0.845641,0.927217
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.946601,0.268558,0.513656,0.281134,0.82769
13,28,seeking human resource opportunity,"chicago , illinois",0.78,0.271462,0.943784,0.904508,0.956014,0.977981,0.982861
8,10,seeking human resource human resourcesis generalist position,greater philadelphia area,1.0,0.262936,0.903312,0.671721,0.748888,0.837362,0.88373
48,100,aspiring human resource manager graduating may seeking entrylevel human resource position st louis,"cape girardeau , missouri",0.206,0.373131,0.888368,0.599692,0.695179,0.568251,0.819739
36,88,human resource management major,"milpitas , california",0.036,0.248041,0.883509,0.57369,0.672508,0.60317,0.799135
23,75,nortia staffing seeking human resource payroll administrative professional,"san jose , california",1.0,0.141157,0.874903,0.646103,0.767653,0.628124,0.857442
22,74,human resource professional,greater boston area,0.032,0.40256,0.869644,0.70652,0.801243,0.815128,0.951196
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.859485,0.127382,0.403659,0.252584,0.867738
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.859485,0.422934,0.459776,0.388841,0.888918


#### c) For Google Word2Vec


In [93]:
final_df_kw2.sort_values(['Google_Word2vec_fit'], ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
38,90,undergraduate research assistant styczynski lab,greater atlanta area,0.31,0.0,0.425043,0.918737,0.938362,0.959728,0.963576
13,28,seeking human resource opportunity,"chicago , illinois",0.78,0.271462,0.943784,0.904508,0.956014,0.977981,0.982861
6,7,student humber college aspiring human resource generalist,canada,0.122,0.391946,0.801875,0.870324,0.914206,0.94745,0.842429
16,68,human resource specialist luxottica,greater new york city area,1.0,0.227186,0.808715,0.842598,0.908195,0.931645,0.912021
15,67,human resource staffing recruiting professional,"jackson , mississippi area",1.0,0.198735,0.795123,0.799299,0.855232,0.911936,0.949881
1,2,native english teacher english program korea english program korea,canada,1.0,0.0,0.527206,0.771333,0.812123,0.581721,0.870009
24,76,aspiring human resource professional passionate helping create inclusive engaging work environment,"new york , new york",0.424,0.25728,0.773791,0.770908,0.74011,0.901139,0.936163
20,72,business management major aspiring human resource manager,"monroe , louisiana area",0.01,0.427448,0.858703,0.759907,0.83647,0.884207,0.955644
31,83,human resource manager endemol shine north america,"los angeles , california",0.536,0.148305,0.815474,0.757564,0.794645,0.885821,0.97378
35,87,bachelor science biology victoria university wellington,"baltimore , maryland",0.08,0.0,0.569281,0.757564,0.794645,0.885821,0.977471


#### d) For Glove

In [94]:
final_df_kw2.sort_values(['Glove_fit'], ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
13,28,seeking human resource opportunity,"chicago , illinois",0.78,0.271462,0.943784,0.904508,0.956014,0.977981,0.982861
38,90,undergraduate research assistant styczynski lab,greater atlanta area,0.31,0.0,0.425043,0.918737,0.938362,0.959728,0.963576
6,7,student humber college aspiring human resource generalist,canada,0.122,0.391946,0.801875,0.870324,0.914206,0.94745,0.842429
16,68,human resource specialist luxottica,greater new york city area,1.0,0.227186,0.808715,0.842598,0.908195,0.931645,0.912021
15,67,human resource staffing recruiting professional,"jackson , mississippi area",1.0,0.198735,0.795123,0.799299,0.855232,0.911936,0.949881
44,96,student indiana university kokomo business management retail manager delphi hardware paint,"lafayette , indiana",0.038,0.0,0.676229,0.733012,0.853648,0.795586,0.934132
51,103,always set success,greater los angeles area,1.0,0.0,0.304366,0.754709,0.843359,0.793411,0.847278
20,72,business management major aspiring human resource manager,"monroe , louisiana area",0.01,0.427448,0.858703,0.759907,0.83647,0.884207,0.955644
1,2,native english teacher english program korea english program korea,canada,1.0,0.0,0.527206,0.771333,0.812123,0.581721,0.870009
47,99,seeking human resource position,"la vega , nevada area",0.096,0.280808,0.957151,0.718755,0.803569,0.845641,0.927217


#### e) For Fasttext

In [95]:
final_df_kw2.sort_values(['Fasttext_Fit'], ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
13,28,seeking human resource opportunity,"chicago , illinois",0.78,0.271462,0.943784,0.904508,0.956014,0.977981,0.982861
38,90,undergraduate research assistant styczynski lab,greater atlanta area,0.31,0.0,0.425043,0.918737,0.938362,0.959728,0.963576
6,7,student humber college aspiring human resource generalist,canada,0.122,0.391946,0.801875,0.870324,0.914206,0.94745,0.842429
16,68,human resource specialist luxottica,greater new york city area,1.0,0.227186,0.808715,0.842598,0.908195,0.931645,0.912021
15,67,human resource staffing recruiting professional,"jackson , mississippi area",1.0,0.198735,0.795123,0.799299,0.855232,0.911936,0.949881
24,76,aspiring human resource professional passionate helping create inclusive engaging work environment,"new york , new york",0.424,0.25728,0.773791,0.770908,0.74011,0.901139,0.936163
35,87,bachelor science biology victoria university wellington,"baltimore , maryland",0.08,0.0,0.569281,0.757564,0.794645,0.885821,0.977471
31,83,human resource manager endemol shine north america,"los angeles , california",0.536,0.148305,0.815474,0.757564,0.794645,0.885821,0.97378
20,72,business management major aspiring human resource manager,"monroe , louisiana area",0.01,0.427448,0.858703,0.759907,0.83647,0.884207,0.955644
40,92,seeking employment opportunity within customer service patient care,"torrance , california",0.128,0.0,0.6787,0.723763,0.792764,0.877064,0.924877


#### f) For BERT

In [96]:
final_df_kw2.sort_values(['Bert_fit'], ascending=False).head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit
13,28,seeking human resource opportunity,"chicago , illinois",0.78,0.271462,0.943784,0.904508,0.956014,0.977981,0.982861
33,85,rrp brand portfolio executive jti japan tobacco international,greater philadelphia area,1.0,0.0,0.694551,0.150533,0.434082,0.266974,0.978145
35,87,bachelor science biology victoria university wellington,"baltimore , maryland",0.08,0.0,0.569281,0.757564,0.794645,0.885821,0.977471
31,83,human resource manager endemol shine north america,"los angeles , california",0.536,0.148305,0.815474,0.757564,0.794645,0.885821,0.97378
27,79,liberal art major aspiring human resource analyst,"baton rouge , louisiana area",0.014,0.351478,0.828845,0.59403,0.592933,0.635967,0.964595
38,90,undergraduate research assistant styczynski lab,greater atlanta area,0.31,0.0,0.425043,0.918737,0.938362,0.959728,0.963576
20,72,business management major aspiring human resource manager,"monroe , louisiana area",0.01,0.427448,0.858703,0.759907,0.83647,0.884207,0.955644
30,82,aspiring human resource professional energetic teamfocused leader,"austin , texas area",0.348,0.375301,0.82329,0.164599,0.298825,0.263629,0.954817
22,74,human resource professional,greater boston area,0.032,0.40256,0.869644,0.70652,0.801243,0.815128,0.951196
15,67,human resource staffing recruiting professional,"jackson , mississippi area",1.0,0.198735,0.795123,0.799299,0.855232,0.911936,0.949881


## Goal1 : Predict how fit the candidate is based on their available information (variable fit)

**Ans:**
From the above visual inspection of the query results for the two different set of keywords , it was found that **Word2vec** algorithm trained on the above dataset worked better. The reason behind beating the renowned algorithms which have been trained on massive corpus seems to be the fact that the **dataset we have is small** with fewer number of records, due to which an algorithm trained and fitted on our specific dataset provides us with more relevant results. Going ahead with word2vec for calculating fitness score.

We will consider our <b><i>fitness score</i><b> as **F(x) = 0.9 * Word2Vec_fit + 0.1 * connections**

So appending the fit column back in our final datasets : 

In [97]:
final_df['fit'] = 0.9 * final_df['Word2Vec_Fit'] + 0.1 * final_df['connection']
final_df = final_df.sort_values(['fit'], ascending=False)
final_df.head()

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit,fit
8,10,seeking human resource human resourcesis generalist position,greater philadelphia area,1.0,0.262936,0.880467,0.76045,0.805695,0.8841,0.845393,0.89242
13,28,seeking human resource opportunity,"chicago , illinois",0.78,0.271462,0.884136,0.756767,0.837561,0.877444,0.975341,0.873723
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728,0.859015,0.783973,0.815054,0.866789,0.936784,0.873113
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594,0.870912
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594,0.865512


So here we see that candidates with ids 10,28,27,97 and 3 are most fit based on both connections and job title. But if we take a closer look we can star 97, 3 as higher than others. And 27 is least fit as he is a student. This will be taken care of in <i> reranking </i>

In [98]:
final_df_kw2['fit'] = 0.9 * final_df_kw2['Word2Vec_Fit'] + 0.1 * final_df_kw2['connection']
final_df_kw2 = final_df_kw2.sort_values(['fit'], ascending=False)
final_df_kw2.head()

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit,fit
13,28,seeking human resource opportunity,"chicago , illinois",0.78,0.271462,0.943784,0.904508,0.956014,0.977981,0.982861,0.927406
8,10,seeking human resource human resourcesis generalist position,greater philadelphia area,1.0,0.262936,0.903312,0.671721,0.748888,0.837362,0.88373,0.91298
23,75,nortia staffing seeking human resource payroll administrative professional,"san jose , california",1.0,0.141157,0.874903,0.646103,0.767653,0.628124,0.857442,0.887413
47,99,seeking human resource position,"la vega , nevada area",0.096,0.280808,0.957151,0.718755,0.803569,0.845641,0.927217,0.871036
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728,0.855958,0.237063,0.305832,0.235586,0.83297,0.870362


For "seeking human resources" keyword search query, we see that candidates with ids 10,28,75, 99 and 27 are most fit based on both connections and job title. And if we again take a closer look we can remove 27 as is least fit because seeking internship and not experienced. This will be taken care of in <i> reranking </i>

## Goal2: Re-rank candidates when a candidate is starred.

Ans: There are a number of **Learning to Rank Algorithms** which can take care of this set of problems. This is a special subset of machine learning problems that does not fall clearly as a regression or a classification alfgorithm and is taken care of in different ways. Some of the ways being:

1) *Pointwise Methods*:
    Pointwise methods treat the ranking problem as a regression or classification task. Each item is treated as an independent instance, and the model is trained to predict a relevance score or label for each item. Examples: Linear regression, logistic regression, support vector machines (SVM), and decision trees can be adapted for pointwise ranking.

2) *Pairwise Methods*:
    Pairwise methods consider pairs of items and learn to rank them relative to each other. The model is trained to differentiate between pairs of items, indicating which item in each pair is more relevant. Examples: RankNet, RankBoost, and RankBoost variants, such as AdaRank, are pairwise LTR algorithms.

3) *Listwise Methods*: 
    Listwise methods take into account the entire ranked list of items for a query. These methods optimize a ranking measure directly, aiming to maximize the quality of the entire ranked list. Examples: ListNet, ListMLE (Listwise Maximum Likelihood Estimation), and LambdaMART are examples of listwise LTR algorithms.
    
We will be going ahead with the Listwise Method which seems to appropriately fit our criterion as we have a list of job titles from the dataframe which needs to be ranked. And it should not be computationally as intensive as the size of the dataset is not large. So will be proceeding with **LambdaMART with LightGBM** algorithm first. 

First we will add a column titled <u><i>starred</u></i> in our final dataframe to keep track of whether a candidate is starred or not. Herein 1 means starred and 0 means not starred.

In [99]:
final_df['starred'] = 0

# Starring candidates 10,28,97,3
final_df.loc[final_df['id'] == 10, 'starred'] = 1
final_df.loc[final_df['id'] == 28, 'starred'] = 1
final_df.loc[final_df['id'] == 97, 'starred'] = 1
final_df.loc[final_df['id'] == 3, 'starred'] = 1

final_df = final_df.sort_values(['fit'], ascending=False)
final_df.head(10)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit,fit,starred
8,10,seeking human resource human resourcesis generalist position,greater philadelphia area,1.0,0.262936,0.880467,0.76045,0.805695,0.8841,0.845393,0.89242,1
13,28,seeking human resource opportunity,"chicago , illinois",0.78,0.271462,0.884136,0.756767,0.837561,0.877444,0.975341,0.873723,1
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728,0.859015,0.783973,0.815054,0.866789,0.936784,0.873113,0
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594,0.870912,1
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594,0.865512,1
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.949678,0.912262,0.934877,0.969928,0.964565,0.85491,0
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.945208,0.875945,0.908395,0.938275,0.922119,0.852087,0
7,8,human resource senior specialist,san francisco bay area,1.0,0.248041,0.83442,0.697906,0.76478,0.822855,0.961523,0.850978,0
19,71,human resource generalist scottmadden inc,"raleighdurham , north carolina area",1.0,0.185021,0.833427,0.672638,0.737717,0.566922,0.927363,0.850084,0
29,81,senior human resource business partner heil environmental,"chattanooga , tennessee area",0.91,0.145662,0.837309,0.648945,0.718803,0.640661,0.867658,0.844578,0


### Implementing Ranking algorithm LambdaMART with lightgbm

In [100]:
import lightgbm as lgb

In [101]:
# A ranking algorithm which takes as input the df, feature columns, target and returns the ranked df according to columns
def ranking(dataframe, features, target):
    # Create an augmented dataset with manually selected candidates
    augmented_df = dataframe.copy()

    # Feature engineering 
    X = augmented_df[features]
    y = augmented_df[target]

    # Train a LambdaMART model on the augmented dataset
    lgb_dataset = lgb.Dataset(X, label=y, group=[len(augmented_df)])  # Group all instances together
    params = {"objective": "lambdarank", "metric": "ndcg"}  # LambdaMART parameters
    model = lgb.train(params, lgb_dataset, num_boost_round=100)

    # Make predictions for all candidates 
    augmented_df["Predicted_Score"] = model.predict(X)

    # Sort candidates based on predicted scores
    sorted_candidates = augmented_df.sort_values(by="Predicted_Score", ascending=False)
    return sorted_candidates

In [102]:
# A ranking algorithm which takes as input the df, feature columns, output and returns the ranked df according to columns
# def ranking(dataframe, features, y):
#     # Create an augmented dataset with manually selected candidates
#     augmented_df = datframe.copy()

#     # Feature engineering 
#     X = augmented_df[["connection", "Word2Vec_Fit", "fit"]]
#     y = augmented_df["starred"]

#     # Train a LambdaMART model on the augmented dataset
#     lgb_dataset = lgb.Dataset(X, label=y, group=[len(augmented_df)])  # Group all instances together
#     params = {"objective": "lambdarank", "metric": "ndcg"}  # LambdaMART parameters
#     model = lgb.train(params, lgb_dataset, num_boost_round=100)

#     # Make predictions for all candidates 
#     augmented_df["Predicted_Score"] = model.predict(X)

#     # Sort candidates based on predicted scores
#     sorted_candidates = augmented_df.sort_values(by="Predicted_Score", ascending=False)
#     return sorted_candidates
sorted_candidates = ranking(final_df, ["connection", "Word2Vec_Fit", "fit"], "starred")

# Print the re-ranked list of candidates
sorted_candidates[['id', 'job_title','location', 'Word2Vec_Fit','fit','starred','Predicted_Score']].head(20)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000785 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 51
[LightGBM] [Info] Number of data points in the train set: 53, number of used features: 3


Unnamed: 0,id,job_title,location,Word2Vec_Fit,fit,starred,Predicted_Score
13,28,seeking human resource opportunity,"chicago , illinois",0.884136,0.873723,1,3.80041
45,97,aspiring human resource professional,"kokomo , indiana area",0.951902,0.870912,1,2.60169
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.951902,0.865512,1,2.60169
8,10,seeking human resource human resourcesis generalist position,greater philadelphia area,0.880467,0.89242,1,1.475732
12,27,aspiring human resource management student seeking internship,"houston , texas area",0.859015,0.873113,0,0.162425
27,79,liberal art major aspiring human resource analyst,"baton rouge , louisiana area",0.890232,0.802609,0,-1.521993
6,7,student humber college aspiring human resource generalist,canada,0.890077,0.813269,0,-1.521993
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.945208,0.852087,0,-1.525083
20,72,business management major aspiring human resource manager,"monroe , louisiana area",0.915736,0.825162,0,-1.525083
5,6,aspiring human resource specialist,greater new york city area,0.949678,0.85491,0,-1.525083


#### Let us star more candidates and see what happens to the ranking.

In [103]:
# Starring more candidates 79,73,72,6,99,82,66
final_df.loc[final_df['id'] == 79, 'starred'] = 1
final_df.loc[final_df['id'] == 73, 'starred'] = 1
final_df.loc[final_df['id'] == 72, 'starred'] = 1
final_df.loc[final_df['id'] == 6, 'starred'] = 1
final_df.loc[final_df['id'] == 99, 'starred'] = 1
final_df.loc[final_df['id'] == 82, 'starred'] = 1
final_df.loc[final_df['id'] == 66, 'starred'] = 1

final_df = final_df.sort_values(['fit'], ascending=False)
final_df.head(20)

Unnamed: 0,id,job_title,location,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit,fit,starred
8,10,seeking human resource human resourcesis generalist position,greater philadelphia area,1.0,0.262936,0.880467,0.76045,0.805695,0.8841,0.845393,0.89242,1
13,28,seeking human resource opportunity,"chicago , illinois",0.78,0.271462,0.884136,0.756767,0.837561,0.877444,0.975341,0.873723,1
12,27,aspiring human resource management student seeking internship,"houston , texas area",1.0,0.429728,0.859015,0.783973,0.815054,0.866789,0.936784,0.873113,0
45,97,aspiring human resource professional,"kokomo , indiana area",0.142,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594,0.870912,1
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.088,0.76765,0.951902,0.950395,0.924892,0.984363,0.933594,0.865512,1
5,6,aspiring human resource specialist,greater new york city area,0.002,0.676406,0.949678,0.912262,0.934877,0.969928,0.964565,0.85491,1
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.014,0.612429,0.945208,0.875945,0.908395,0.938275,0.922119,0.852087,1
7,8,human resource senior specialist,san francisco bay area,1.0,0.248041,0.83442,0.697906,0.76478,0.822855,0.961523,0.850978,0
19,71,human resource generalist scottmadden inc,"raleighdurham , north carolina area",1.0,0.185021,0.833427,0.672638,0.737717,0.566922,0.927363,0.850084,0
29,81,senior human resource business partner heil environmental,"chattanooga , tennessee area",0.91,0.145662,0.837309,0.648945,0.718803,0.640661,0.867658,0.844578,0


In [104]:
sorted_candidates = ranking(final_df, ["connection", "Word2Vec_Fit", "fit"], "starred")

# Print the re-ranked list of candidates
sorted_candidates[['id', 'job_title','location', 'Word2Vec_Fit','fit','starred','Predicted_Score']].head(20)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000075 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51
[LightGBM] [Info] Number of data points in the train set: 53, number of used features: 3


Unnamed: 0,id,job_title,location,Word2Vec_Fit,fit,starred,Predicted_Score
2,3,aspiring human resource professional,"raleighdurham , north carolina area",0.951902,0.865512,1,3.881954
5,6,aspiring human resource specialist,greater new york city area,0.949678,0.85491,1,3.881954
21,73,aspiring human resource manager seeking internship human resource,"houston , texas area",0.945208,0.852087,1,3.881954
20,72,business management major aspiring human resource manager,"monroe , louisiana area",0.915736,0.825162,1,2.43295
45,97,aspiring human resource professional,"kokomo , indiana area",0.951902,0.870912,1,2.008225
47,99,seeking human resource position,"la vega , nevada area",0.894779,0.814901,1,1.685873
13,28,seeking human resource opportunity,"chicago , illinois",0.884136,0.873723,1,1.374621
27,79,liberal art major aspiring human resource analyst,"baton rouge , louisiana area",0.890232,0.802609,1,1.151765
8,10,seeking human resource human resourcesis generalist position,greater philadelphia area,0.880467,0.89242,1,0.597765
30,82,aspiring human resource professional energetic teamfocused leader,"austin , texas area",0.895872,0.841085,1,0.579789


## Bonus(es):

### Ques.1. We are interested in a robust algorithm, tell us how your solution works and show us how your ranking gets better with each starring action.


**Ans**. The solution is pretty robust because after a lot of comparisons (TF-IDF, Word2Vec, Google Word2Vec, Glove, Fasttext and BERT), I was able to find an **NLP embedding algorithm which is particularly well suited to our specific use case data** , ie. the Word2Vec algorithm trained on the words in this dataset. The candidates in the dataset were ranked based on a **fitness score which was based on the cosine similarity metric and normalized connections.** The solution also handles manual starring of candidates with reranking based on **Learning To Rank algorithm (LambaMart with LightGBM)** in which our final predicted_score is better for the candidates that were starred.

Future enhancements: To implement this on large scale, Fasttext or BERT can be used in the above solution. 

Plus, the solution above becomes **better with each starring**. This is because as and when the **starred candidate sample size increases**, it helps our LTR algorithm to learn the patterns in the data better. And hence the better results (which can be seen when number of starred candidates was increased to 10).

### Ques.2. How can we filter out candidates which in the first place should not be in this list?

**Ans**. The candidates which should not be in the list can be **filtered out by setting a threshold** for our fitness score. We can take a safe limit of fitness score threshold as 0.40 as not related or candidates who should not be in the list. This is because we can see our min is 0.207 and max is 0.892. And we can cross check with the dataset to confirm.

In [108]:
final_df.sort_values(['fit'], ascending=False).describe()

Unnamed: 0,id,connection,tfidf_fit,Word2Vec_Fit,Google_Word2vec_fit,Glove_fit,Fasttext_Fit,Bert_fit,fit,starred
count,53.0,53.0,53.0,53.0,53.0,53.0,53.0,53.0,53.0,53.0
mean,65.132075,0.483811,0.194236,0.743555,0.566251,0.624114,0.630714,0.903649,0.71758,0.207547
std,35.117954,0.43413,0.204105,0.175023,0.239992,0.200834,0.237342,0.048524,0.162684,0.409432
min,1.0,0.002,0.0,0.229144,0.206563,0.202625,0.186951,0.807109,0.20703,0.0
25%,28.0,0.088,0.0,0.622602,0.307312,0.427824,0.379228,0.863759,0.59316,0.0
50%,78.0,0.31,0.148305,0.811338,0.623142,0.700032,0.654652,0.899871,0.79481,0.0
75%,91.0,1.0,0.271462,0.873018,0.760062,0.774596,0.833309,0.946331,0.833233,0.0
max,104.0,1.0,0.76765,0.951902,0.950395,0.934877,0.984363,0.98185,0.89242,1.0


#### Ques.3. Can we determine a cut-off point that would work for other roles without losing high potential candidates?

### Ques.4. Do you have any ideas that we should explore so that we can even automate this procedure to prevent human bias?

Ans. Human bias can be prevented by updating the dataset with keeping a track of list of candidates which actually get selected for specific queries and updating the score to give higher precedence to selected candidates. The ranking algorithm can be reranked and **starred candidates can be substituted to selected candidates**. This pattern can be learnt and applied to later searches after a generous amount of queries and selected candidates are recorded. Then this procedure will be automated and starring will not be needed, hence no human intervention.

Further enhancements can also be to improve our feature set by including years of experience, educational qualifications, query related skill set, awards or certificates attained and many more.