#### Predicting and Ranking Candidates Using the SBERT Model

#### Import libraries

#### Load the Cleaned dataset

In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
# Read the cleaned candidate dataset from its CSV file and preview the first rows.
file_path = r"C:\Users\USER\Documents\Potential_Talent\cleaned_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(n=5)

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 ct bauer college of business graduate mag...,houston texas,0.168337,
1,2,native english teacher at epik english program...,kanada,1.0,
2,3,aspiring human resources professional,raleighdurham north carolina area,0.086172,
3,4,people development coordinator at ryan,denton texas,1.0,
4,5,advisory board member at celal bayar university,izmir trkiye,1.0,


#### Combine relevant columns to form a text column with longer sentences, as required by SBERT

The code below combines job_title, location and connection to create the text column.

In [5]:
# Build the free-text field that is later encoded by SBERT: job title, location
# and connection value joined by single spaces.
# Missing values are replaced with "" *before* the string conversion. Calling
# astype(str) first would turn NaN into the literal text "nan", which a later
# fillna("") can no longer detect and which would leak into the embedded text.
df["text"] = (
    df["job_title"].fillna("") + " "
    + df["location"].fillna("") + " "
    + df["connection"].fillna("").astype(str)
)

df.head()

Unnamed: 0,id,job_title,location,connection,fit,text
0,1,2019 ct bauer college of business graduate mag...,houston texas,0.168337,,2019 ct bauer college of business graduate mag...
1,2,native english teacher at epik english program...,kanada,1.0,,native english teacher at epik english program...
2,3,aspiring human resources professional,raleighdurham north carolina area,0.086172,,aspiring human resources professional raleighd...
3,4,people development coordinator at ryan,denton texas,1.0,,people development coordinator at ryan denton ...
4,5,advisory board member at celal bayar university,izmir trkiye,1.0,,advisory board member at celal bayar universit...


The result above shows that the text column has been created.

#### Drop the fit column since it is empty

In [7]:
# Remove the 'fit' column from the candidate table.
# errors="ignore" keeps this cell re-runnable: once 'fit' is gone, running the
# cell again is a no-op instead of raising KeyError.
df = df.drop(columns=['fit'], errors="ignore")

In [9]:
# Preview the first five rows of the current candidate table.
df.head(n=5)

Unnamed: 0,id,job_title,location,connection,text
0,1,2019 ct bauer college of business graduate mag...,houston texas,0.168337,2019 ct bauer college of business graduate mag...
1,2,native english teacher at epik english program...,kanada,1.0,native english teacher at epik english program...
2,3,aspiring human resources professional,raleighdurham north carolina area,0.086172,aspiring human resources professional raleighd...
3,4,people development coordinator at ryan,denton texas,1.0,people development coordinator at ryan denton ...
4,5,advisory board member at celal bayar university,izmir trkiye,1.0,advisory board member at celal bayar universit...


#### Load pretrained SBERT

In [11]:
# Instantiate the pretrained sentence-embedding model from its checkpoint name.
sbert_checkpoint = 'all-MiniLM-L6-v2'
model = SentenceTransformer(sbert_checkpoint)

**Vectorize the text column using embeddings from SBERT**

In [13]:
# Encode the combined text of every row with the SBERT model, showing a progress bar.
candidate_texts = df['text'].tolist()
embeddings = model.encode(candidate_texts, show_progress_bar=True)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

#### Convert the embeddings to a pandas DataFrame

In [15]:
# Wrap the embedding matrix in a DataFrame and preview the first rows.
embedding_df = pd.DataFrame(data=embeddings)
embedding_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.080647,-0.01359,0.025604,0.045382,-0.054794,0.027669,-0.092094,-0.024126,-0.01879,-0.041619,...,0.035383,-0.039114,0.014456,-0.021426,-0.080527,0.010414,0.059675,-0.064236,0.023765,0.004547
1,0.013974,0.002556,0.05864,0.008864,-0.03688,-0.017276,0.064799,0.008673,-0.059727,0.091424,...,0.033897,0.073916,-0.029386,-0.065074,-0.066251,-0.024638,0.049792,0.022195,-0.08559,0.064956
2,-0.03288,0.0207,-0.03608,0.101873,-0.036267,0.003642,-0.01897,-0.053405,-0.043835,0.06315,...,0.098044,0.006575,0.034576,0.0171,-0.052237,0.079505,0.025924,-0.034924,-0.026293,0.028292
3,-0.077126,-0.016689,-0.015529,0.038083,-0.028124,-0.031241,0.033025,-0.060054,-0.041687,0.004289,...,0.054908,-0.012562,0.026102,-0.046238,-0.028634,-0.014254,-0.07107,0.003787,-0.003871,0.008868
4,-0.101833,0.050485,-0.035294,0.002843,-0.078087,-0.100537,0.073505,0.023797,-0.031542,0.004224,...,0.017321,0.074921,-0.010447,0.072482,-0.072462,-0.046297,0.093996,-0.018277,-0.092328,-0.028491


#### Rename the embedding columns 0, 1, 2, 3, ... to embedding_0, embedding_1, ...

In [17]:
# Give every embedding column a descriptive, position-based label
# (embedding_0, embedding_1, ...), then preview the result.
embedding_df.columns = [
    "embedding_" + str(position) for position, _ in enumerate(embedding_df.columns)
]
embedding_df.head(n=5)

Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_374,embedding_375,embedding_376,embedding_377,embedding_378,embedding_379,embedding_380,embedding_381,embedding_382,embedding_383
0,-0.080647,-0.01359,0.025604,0.045382,-0.054794,0.027669,-0.092094,-0.024126,-0.01879,-0.041619,...,0.035383,-0.039114,0.014456,-0.021426,-0.080527,0.010414,0.059675,-0.064236,0.023765,0.004547
1,0.013974,0.002556,0.05864,0.008864,-0.03688,-0.017276,0.064799,0.008673,-0.059727,0.091424,...,0.033897,0.073916,-0.029386,-0.065074,-0.066251,-0.024638,0.049792,0.022195,-0.08559,0.064956
2,-0.03288,0.0207,-0.03608,0.101873,-0.036267,0.003642,-0.01897,-0.053405,-0.043835,0.06315,...,0.098044,0.006575,0.034576,0.0171,-0.052237,0.079505,0.025924,-0.034924,-0.026293,0.028292
3,-0.077126,-0.016689,-0.015529,0.038083,-0.028124,-0.031241,0.033025,-0.060054,-0.041687,0.004289,...,0.054908,-0.012562,0.026102,-0.046238,-0.028634,-0.014254,-0.07107,0.003787,-0.003871,0.008868
4,-0.101833,0.050485,-0.035294,0.002843,-0.078087,-0.100537,0.073505,0.023797,-0.031542,0.004224,...,0.017321,0.074921,-0.010447,0.072482,-0.072462,-0.046297,0.093996,-0.018277,-0.092328,-0.028491


#### Add the embedding columns back to the original DataFrame

In [19]:
# Attach the embedding columns to the candidate table, aligned by row position
# (the candidate index is reset so both frames share a 0..n-1 index).
# Embedding columns already present in df (from a previous run of this cell) are
# dropped first, so re-running the cell does not append duplicate columns.
df = pd.concat(
    [
        df.reset_index(drop=True).drop(columns=embedding_df.columns, errors="ignore"),
        embedding_df,
    ],
    axis=1,
)

df.head()

Unnamed: 0,id,job_title,location,connection,text,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,...,embedding_374,embedding_375,embedding_376,embedding_377,embedding_378,embedding_379,embedding_380,embedding_381,embedding_382,embedding_383
0,1,2019 ct bauer college of business graduate mag...,houston texas,0.168337,2019 ct bauer college of business graduate mag...,-0.080647,-0.01359,0.025604,0.045382,-0.054794,...,0.035383,-0.039114,0.014456,-0.021426,-0.080527,0.010414,0.059675,-0.064236,0.023765,0.004547
1,2,native english teacher at epik english program...,kanada,1.0,native english teacher at epik english program...,0.013974,0.002556,0.05864,0.008864,-0.03688,...,0.033897,0.073916,-0.029386,-0.065074,-0.066251,-0.024638,0.049792,0.022195,-0.08559,0.064956
2,3,aspiring human resources professional,raleighdurham north carolina area,0.086172,aspiring human resources professional raleighd...,-0.03288,0.0207,-0.03608,0.101873,-0.036267,...,0.098044,0.006575,0.034576,0.0171,-0.052237,0.079505,0.025924,-0.034924,-0.026293,0.028292
3,4,people development coordinator at ryan,denton texas,1.0,people development coordinator at ryan denton ...,-0.077126,-0.016689,-0.015529,0.038083,-0.028124,...,0.054908,-0.012562,0.026102,-0.046238,-0.028634,-0.014254,-0.07107,0.003787,-0.003871,0.008868
4,5,advisory board member at celal bayar university,izmir trkiye,1.0,advisory board member at celal bayar universit...,-0.101833,0.050485,-0.035294,0.002843,-0.078087,...,0.017321,0.074921,-0.010447,0.072482,-0.072462,-0.046297,0.093996,-0.018277,-0.092328,-0.028491


#### Define keywords: "aspiring human resources", "seeking human resources"

In [21]:
# Target phrases that candidate profiles are ranked against.
keywords = [
    "aspiring human resources",
    "seeking human resources",
]

#### Compute embeddings for the keywords

In [23]:
# Encode the target phrases with the same model, requesting normalized vectors.
# The result has one row per keyword: shape (2, embedding_dim).
keywords_embeddings = model.encode(
    keywords,
    normalize_embeddings=True,
)

#### Compute cosine similarity between the vectorized text column and the keywords

In [25]:
# Score every candidate embedding against every target-keyword embedding.
# Rows follow the candidates, columns follow the keywords: shape (num_candidates, 2).
similarities = cosine_similarity(X=embeddings, Y=keywords_embeddings)

In [27]:
# Keep, for each candidate, the highest similarity across the target keywords.
df['similarity_score'] = np.max(similarities, axis=1)

In [35]:
# Rank candidates from most to least similar and display the top 20.
ranking_columns = ['id', 'job_title', 'similarity_score']
ranked_candidates = df[ranking_columns].sort_values(by='similarity_score', ascending=False)
ranked_candidates.head(20)

Unnamed: 0,id,job_title,similarity_score
98,99,seeking human resources position,0.757853
5,6,aspiring human resources specialist,0.754887
35,36,aspiring human resources specialist,0.754887
23,24,aspiring human resources specialist,0.754887
48,49,aspiring human resources specialist,0.754887
59,60,aspiring human resources specialist,0.754887
29,30,seeking human resources opportunities,0.740008
27,28,seeking human resources opportunities,0.740008
20,21,aspiring human resources professional,0.734231
2,3,aspiring human resources professional,0.734231
