#### Ranking with TF-IDF

#### Load libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


#### Load the cleaned dataset

In [3]:
file_path = r"C:\Users\USER\Documents\Potential_Talent\cleaned_dataset.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 ct bauer college of business graduate mag...,houston texas,0.168337,
1,2,native english teacher at epik english program...,kanada,1.0,
2,3,aspiring human resources professional,raleighdurham north carolina area,0.086172,
3,4,people development coordinator at ryan,denton texas,1.0,
4,5,advisory board member at celal bayar university,izmir trkiye,1.0,


#### Drop fit column

In [5]:
df = df.drop(columns= ['fit'])
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,2019 ct bauer college of business graduate mag...,houston texas,0.168337
1,2,native english teacher at epik english program...,kanada,1.0
2,3,aspiring human resources professional,raleighdurham north carolina area,0.086172
3,4,people development coordinator at ryan,denton texas,1.0
4,5,advisory board member at celal bayar university,izmir trkiye,1.0


#### Initialize TF-IDF vectorizer

In [7]:
tfidf = TfidfVectorizer(stop_words='english')
print('TF-IDF initialization successful')

TF-IDF initialization successful


#### Vectorise job_title

In [9]:
tfidf_matrix = tfidf.fit_transform(df["job_title"].fillna(""))

#print("Vectorization successful.")
print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (104, 178)


#### convert tfidf sparse array to dense array

In [11]:
# Convert sparse TF-IDF matrix to dense array
tfidf_dense = tfidf_matrix.toarray()

# Check shape to confirm
print("Shape:", tfidf_dense.shape)

Shape: (104, 178)


#### Convert the dense array to pandas dataframe

In [13]:
tfidf_df = pd.DataFrame(tfidf_dense)
tfidf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,168,169,170,171,172,173,174,175,176,177
0,0.321105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423208,0.0,0.0,...,0.0,0.0,0.323226,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Rename the vectorised columns from 0,1,2... to tfidf_0, ...

In [15]:
# Number of TF-IDF features
num_features = tfidf_df.shape[1]

# Rename columns to tfidf_0, tfidf_1, ..., tfidf_n
tfidf_df.columns = [f"tfidf_{i}" for i in range(num_features)]

# Check the first few rows
tfidf_df.head()

Unnamed: 0,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,tfidf_8,tfidf_9,...,tfidf_168,tfidf_169,tfidf_170,tfidf_171,tfidf_172,tfidf_173,tfidf_174,tfidf_175,tfidf_176,tfidf_177
0,0.321105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.423208,0.0,0.0,...,0.0,0.0,0.323226,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Add it bact to df

In [17]:
# Concatenate with original df
df = pd.concat([df.reset_index(drop=True), tfidf_df], axis=1)

df.head()

Unnamed: 0,id,job_title,location,connection,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,...,tfidf_168,tfidf_169,tfidf_170,tfidf_171,tfidf_172,tfidf_173,tfidf_174,tfidf_175,tfidf_176,tfidf_177
0,1,2019 ct bauer college of business graduate mag...,houston texas,0.168337,0.321105,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,native english teacher at epik english program...,kanada,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,aspiring human resources professional,raleighdurham north carolina area,0.086172,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,people development coordinator at ryan,denton texas,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,advisory board member at celal bayar university,izmir trkiye,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.323226,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Define the keywords and vectorise it

In [19]:
# Keep keywords as separate phrases
keywords = [
    "aspiring human resources","seeking human resources"]

# Vectorize each keyword separately
keyword_vecs = tfidf.transform(keywords)

#### compute similarity between tfidf_dense and the keyword_vec using cosine method.

In [21]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity between candidates (rows) and each keyword (columns)
similarities = cosine_similarity(tfidf_dense, keyword_vecs.toarray())
# similarities shape = (num_candidates, 2)

# Take the maximum similarity for each candidate across both keywords
df['similarity_score'] = similarities.max(axis=1)

# Sort by similarity
df_sorted = df.sort_values('similarity_score', ascending=False)

# View top candidates
df_sorted[['id', 'job_title', 'similarity_score']].head(20)

Unnamed: 0,id,job_title,similarity_score
2,3,aspiring human resources professional,0.753591
32,33,aspiring human resources professional,0.753591
16,17,aspiring human resources professional,0.753591
20,21,aspiring human resources professional,0.753591
57,58,aspiring human resources professional,0.753591
96,97,aspiring human resources professional,0.753591
45,46,aspiring human resources professional,0.753591
23,24,aspiring human resources specialist,0.695679
59,60,aspiring human resources specialist,0.695679
5,6,aspiring human resources specialist,0.695679
