#### Rank Candidates with the GloVe Model

#### Load libraries

In [1]:
import pandas as pd
from gensim.models import KeyedVectors
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

#### Load the cleaned dataset

In [3]:
# Read the cleaned candidate table from an absolute path on the author's machine.
# NOTE(review): this folder is spelled "Potential_Talent", while the GloVe cells
# further down use "Potential Talent" (with a space) — confirm both folders exist.
file_path = r"C:\Users\USER\Documents\Potential_Talent\cleaned_dataset.csv"
df = pd.read_csv(file_path)
df.head()  # preview the first rows

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 ct bauer college of business graduate mag...,houston texas,0.168337,
1,2,native english teacher at epik english program...,kanada,1.0,
2,3,aspiring human resources professional,raleighdurham north carolina area,0.086172,
3,4,people development coordinator at ryan,denton texas,1.0,
4,5,advisory board member at celal bayar university,izmir trkiye,1.0,


#### Drop fit column

In [5]:
# Discard the 'fit' column and preview the remaining columns.
df = df.drop(labels="fit", axis="columns")
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,2019 ct bauer college of business graduate mag...,houston texas,0.168337
1,2,native english teacher at epik english program...,kanada,1.0
2,3,aspiring human resources professional,raleighdurham north carolina area,0.086172
3,4,people development coordinator at ryan,denton texas,1.0
4,5,advisory board member at celal bayar university,izmir trkiye,1.0


#### Load the pretrained GloVe model

In [24]:
# Path to your unzipped GloVe file
glove_file = r"C:\Users\USER\Documents\Potential Talent\wiki_giga_2024_100_MFT20_vectors_seed_2024_alpha_0.75_eta_0.05.050_combined.txt"

# Load GloVe vectors into Gensim
# binary=False: the file is parsed as plain text.
# no_header=True: the file has no leading "<vocab_size> <vector_size>" line,
# which the word2vec text format would otherwise expect.
glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

print("GloVe loaded successfully")

GloVe loaded successfully


#### Save the loaded model for easy loading

In [26]:
# Save the loaded model with KeyedVectors.save() so it can be reloaded with
# KeyedVectors.load() instead of parsing the GloVe text file again.
save_path = r"C:\Users\USER\Documents\Potential Talent\glove_100_word2vec.kv"
glove_model.save(save_path)


#### Load the saved model

In [7]:
# Reload the vectors from the .kv file written by glove_model.save(...) above.
glove_model = KeyedVectors.load(r"C:\Users\USER\Documents\Potential Talent\glove_100_word2vec.kv")
print("GloVe loaded successfully")

GloVe loaded successfully


#### Define a function to tokenize job_title and vectorize it

In [9]:
def get_glove_vector(text, model, vector_size=100):
    """Return the mean embedding of the whitespace-separated tokens in *text*.

    Args:
        text: Job title to embed. Tokens are looked up exactly as written (no
            lower-casing), so the text is expected to be normalised already.
            Values that are not strings (e.g. ``None`` or the float NaN that
            pandas uses for a missing cell) are treated as having no tokens.
        model: Word-vector lookup supporting ``token in model`` and
            ``model[token]``, such as a gensim ``KeyedVectors``.
        vector_size: Length of the all-zero fallback vector; it should match
            the dimensionality of *model*.

    Returns:
        The element-wise mean of the vectors of all tokens found in *model*,
        or ``np.zeros(vector_size)`` when no token is found.
    """
    # A missing title arrives as None/NaN; calling .split() on it would raise
    # AttributeError and abort the whole DataFrame.apply().
    if not isinstance(text, str):
        return np.zeros(vector_size)

    vectors = [model[token] for token in text.split() if token in model]
    if not vectors:
        return np.zeros(vector_size)  # fallback: no token is in the vocabulary
    return np.mean(vectors, axis=0)

# Apply to job_title column.
# Take the embedding width from the loaded model instead of hard-coding it, so
# the all-zero fallback vector always has the same length as real embeddings.
vector_size = glove_model.vector_size
df['glove_vector'] = df['job_title'].apply(
    lambda title: get_glove_vector(title, glove_model, vector_size)
)

#### Convert the vectorised job_title column to a pandas DataFrame

In [11]:
# Spread each embedding across one column per dimension, keeping df's row index.
glove_df = pd.DataFrame(data=df["glove_vector"].tolist(), index=df.index)
glove_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.459754,0.006152,-0.155833,-0.221236,-0.256272,0.205457,-0.300256,0.033459,-0.023397,-0.101417,...,-0.257249,-0.089633,0.369736,-0.243235,-0.018758,-0.39287,0.202076,0.185653,0.448418,-0.058369
1,0.330251,0.100468,-0.036782,0.012773,0.136644,-0.097561,-0.147439,0.249057,0.204614,-0.166419,...,-0.064068,-0.161566,0.161229,-0.201916,-0.320825,-0.302057,-0.147689,0.197886,0.197342,-0.125275
2,0.534294,-0.180405,-0.604273,-0.130981,-0.484982,0.306137,-0.333915,0.162578,0.404458,-0.046744,...,0.164977,0.202905,0.466124,-0.222518,-0.252643,-0.832752,0.100228,0.485862,0.577425,-0.33849
3,0.205151,0.420787,0.087748,0.093096,-0.225512,0.131477,-0.307568,-0.00157,0.257949,0.189696,...,0.321081,-0.350858,0.127341,0.006582,-0.29679,-0.628727,-0.169661,-0.092067,0.308878,0.232731
4,0.132313,0.105128,0.135264,0.022658,-0.578859,0.542097,-0.118011,0.258626,-0.149068,0.390992,...,-0.051858,-0.259557,0.478615,-0.217018,-0.621216,-0.582815,0.427113,0.330779,0.286989,0.401508


#### Rename the vectorised columns from 0, 1, 2, ... to glove_0, glove_1, glove_2, ...

In [13]:
# Label every embedding column by its position: glove_0, glove_1, ...
glove_df.columns = ["glove_" + str(position) for position in range(len(glove_df.columns))]
glove_df.head()

Unnamed: 0,glove_0,glove_1,glove_2,glove_3,glove_4,glove_5,glove_6,glove_7,glove_8,glove_9,...,glove_90,glove_91,glove_92,glove_93,glove_94,glove_95,glove_96,glove_97,glove_98,glove_99
0,0.459754,0.006152,-0.155833,-0.221236,-0.256272,0.205457,-0.300256,0.033459,-0.023397,-0.101417,...,-0.257249,-0.089633,0.369736,-0.243235,-0.018758,-0.39287,0.202076,0.185653,0.448418,-0.058369
1,0.330251,0.100468,-0.036782,0.012773,0.136644,-0.097561,-0.147439,0.249057,0.204614,-0.166419,...,-0.064068,-0.161566,0.161229,-0.201916,-0.320825,-0.302057,-0.147689,0.197886,0.197342,-0.125275
2,0.534294,-0.180405,-0.604273,-0.130981,-0.484982,0.306137,-0.333915,0.162578,0.404458,-0.046744,...,0.164977,0.202905,0.466124,-0.222518,-0.252643,-0.832752,0.100228,0.485862,0.577425,-0.33849
3,0.205151,0.420787,0.087748,0.093096,-0.225512,0.131477,-0.307568,-0.00157,0.257949,0.189696,...,0.321081,-0.350858,0.127341,0.006582,-0.29679,-0.628727,-0.169661,-0.092067,0.308878,0.232731
4,0.132313,0.105128,0.135264,0.022658,-0.578859,0.542097,-0.118011,0.258626,-0.149068,0.390992,...,-0.051858,-0.259557,0.478615,-0.217018,-0.621216,-0.582815,0.427113,0.330779,0.286989,0.401508


#### Add the renamed columns back to df

In [15]:
 concatenate with your original dataframe
df = pd.concat([df.reset_index(drop=True), glove_df], axis=1)
df.head()

Unnamed: 0,id,job_title,location,connection,glove_vector,glove_0,glove_1,glove_2,glove_3,glove_4,...,glove_90,glove_91,glove_92,glove_93,glove_94,glove_95,glove_96,glove_97,glove_98,glove_99
0,1,2019 ct bauer college of business graduate mag...,houston texas,0.168337,"[0.45975354, 0.0061516697, -0.15583293, -0.221...",0.459754,0.006152,-0.155833,-0.221236,-0.256272,...,-0.257249,-0.089633,0.369736,-0.243235,-0.018758,-0.39287,0.202076,0.185653,0.448418,-0.058369
1,2,native english teacher at epik english program...,kanada,1.0,"[0.3302506, 0.1004681, -0.036781553, 0.0127731...",0.330251,0.100468,-0.036782,0.012773,0.136644,...,-0.064068,-0.161566,0.161229,-0.201916,-0.320825,-0.302057,-0.147689,0.197886,0.197342,-0.125275
2,3,aspiring human resources professional,raleighdurham north carolina area,0.086172,"[0.5342938, -0.18040474, -0.60427326, -0.13098...",0.534294,-0.180405,-0.604273,-0.130981,-0.484982,...,0.164977,0.202905,0.466124,-0.222518,-0.252643,-0.832752,0.100228,0.485862,0.577425,-0.33849
3,4,people development coordinator at ryan,denton texas,1.0,"[0.2051512, 0.420787, 0.08774841, 0.09309599, ...",0.205151,0.420787,0.087748,0.093096,-0.225512,...,0.321081,-0.350858,0.127341,0.006582,-0.29679,-0.628727,-0.169661,-0.092067,0.308878,0.232731
4,5,advisory board member at celal bayar university,izmir trkiye,1.0,"[0.13231285, 0.10512829, 0.13526429, 0.0226577...",0.132313,0.105128,0.135264,0.022658,-0.578859,...,-0.051858,-0.259557,0.478615,-0.217018,-0.621216,-0.582815,0.427113,0.330779,0.286989,0.401508


#### Define the keywords

In [17]:
# Target phrases that candidate job titles are compared against.
keywords = [
    "aspiring human resources",
    "seeking human resources",
]

#### Define a function to normalise, tokenise and vectorise the keywords

In [19]:
def get_sentence_vector(text, glove_model):
    """
    Embed a phrase as the mean of its in-vocabulary GloVe word vectors.

    The phrase is lower-cased and split on whitespace; words missing from
    ``glove_model`` are ignored. If no word is found, a zero vector of
    length ``glove_model.vector_size`` is returned.
    """
    known_vectors = []
    for token in text.lower().split():
        if token in glove_model:
            known_vectors.append(glove_model[token])

    if known_vectors:
        return np.mean(known_vectors, axis=0)
    return np.zeros(glove_model.vector_size)

# Embed every keyword phrase and stack the results, one row per keyword.
keyword_vecs = np.array([get_sentence_vector(phrase, glove_model) for phrase in keywords])

#### Compute the cosine similarity between each candidate vector and the keyword vectors

In [21]:
# Cosine similarity of every candidate embedding (rows) against every
# keyword embedding (columns): shape is (num_candidates, num_keywords).
similarities = cosine_similarity(glove_df, keyword_vecs)

# A candidate's score is its best match over all keywords.
df["glove_similarity_score"] = np.max(similarities, axis=1)

# Order candidates from most to least similar.
df_sorted = df.sort_values(by="glove_similarity_score", ascending=False)

# Show the ten best-matching candidates.
df_sorted.loc[:, ["id", "job_title", "glove_similarity_score"]].head(10)

Unnamed: 0,id,job_title,glove_similarity_score
27,28,seeking human resources opportunities,0.976651
29,30,seeking human resources opportunities,0.976651
98,99,seeking human resources position,0.971908
5,6,aspiring human resources specialist,0.966006
35,36,aspiring human resources specialist,0.966006
59,60,aspiring human resources specialist,0.966006
48,49,aspiring human resources specialist,0.966006
23,24,aspiring human resources specialist,0.966006
20,21,aspiring human resources professional,0.95735
2,3,aspiring human resources professional,0.95735
