#### Ranking with FastText Model.

#### Load libraries

In [1]:
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors 
from sklearn.metrics.pairwise import cosine_similarity

#### Load the cleaned dataset

In [3]:
# Read the cleaned candidate dataset from disk and preview its first rows.
# NOTE(review): this folder is spelled "Potential_Talent" while the model paths
# used later in the notebook use "Potential Talent" — confirm which is correct.
file_path  = r"C:\Users\USER\Documents\Potential_Talent\cleaned_dataset.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 ct bauer college of business graduate mag...,houston texas,0.168337,
1,2,native english teacher at epik english program...,kanada,1.0,
2,3,aspiring human resources professional,raleighdurham north carolina area,0.086172,
3,4,people development coordinator at ryan,denton texas,1.0,
4,5,advisory board member at celal bayar university,izmir trkiye,1.0,


#### Drop the fit column 

In [5]:
# The 'fit' column is empty in the preview above, so it is removed here.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because 'fit' has already been dropped.
df = df.drop(columns=['fit'], errors='ignore')
df.head()

Unnamed: 0,id,job_title,location,connection
0,1,2019 ct bauer college of business graduate mag...,houston texas,0.168337
1,2,native english teacher at epik english program...,kanada,1.0
2,3,aspiring human resources professional,raleighdurham north carolina area,0.086172
3,4,people development coordinator at ryan,denton texas,1.0
4,5,advisory board member at celal bayar university,izmir trkiye,1.0


#### Load the pretrained FastText model

In [11]:
# Location of the pretrained FastText vectors (.vec file).
fasttext_path = r"C:\Users\USER\Documents\Potential Talent\wiki-news-300d-1M-subword.vec"

# binary=False: the file is read as the text (non-binary) word2vec format.
fasttext_model = KeyedVectors.load_word2vec_format(fasttext_path, binary=False)

# Report success and the dimensionality of the loaded vectors.
print("FastText model loaded successfully!")
print("Vector size:", fasttext_model.vector_size)

Loading FastText .vec file... (this may take 1–3 minutes)
FastText model loaded successfully!
Vector size: 300


#### Save the loaded model for easy loading

In [15]:
# Save the loaded model to disk so later sessions can reload it from this file.
save_path = r"C:\Users\USER\Documents\Potential Talent\FastText_model.kv"
fasttext_model.save(save_path)
print('model saved')

model saved


#### Load the saved model

In [7]:
# Reload the model from the same .kv path it was saved to in the previous step.
fasttext_model = KeyedVectors.load(r"C:\Users\USER\Documents\Potential Talent\FastText_model.kv")
print("model loaded")

model loaded


#### Define a function to convert a job_title into an averaged FastText vector

In [9]:

def get_fasttext_vector(text, fasttext_model, vector_size=300):
    """
    Compute the average FastText embedding for a sentence.

    Parameters
    ----------
    text : str
        Sentence to embed; it is split on whitespace. Non-string values
        (e.g. ``None`` or the float ``NaN`` of a missing cell) are treated
        as empty text instead of raising ``AttributeError``.
    fasttext_model : mapping-like
        Any object supporting ``word in fasttext_model`` and
        ``fasttext_model[word]``; the notebook passes a gensim ``KeyedVectors``.
    vector_size : int, default 300
        Length of the zero vector returned when no word can be embedded.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all words found in the model.
        Words not in the model are skipped. If nothing can be embedded,
        ``np.zeros(vector_size)`` is returned.
    """
    # Guard against missing values: a float NaN or None has no .split().
    if not isinstance(text, str):
        return np.zeros(vector_size)

    words = text.split()
    vectors = [fasttext_model[word] for word in words if word in fasttext_model]

    # Empty text or only out-of-vocabulary words: fall back to the zero vector.
    if not vectors:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

#### Apply the function on job_title for vectorization

In [11]:
# Embed every job title as the mean of its word vectors.
# The dimensionality is read from the loaded model rather than hard-coded, so
# the zero-vector fallback always has the same length as the real embeddings.
vector_size = fasttext_model.vector_size
df['fasttext_vector'] = df['job_title'].apply(lambda x: get_fasttext_vector(x, fasttext_model, vector_size))

#### Convert the vectorized job_title to a pandas DataFrame

In [13]:
# Expand the per-row embedding arrays into a table: one row per candidate,
# one column per embedding dimension.
fasttext_df = pd.DataFrame(data=df['fasttext_vector'].tolist())

fasttext_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.016067,-0.030327,0.01256,0.00472,-0.005633,-0.026827,0.003613,-0.040633,0.025627,-0.012087,...,0.00148,0.002453,-0.007773,0.0013,0.0051,0.00656,-0.007107,-0.011267,0.002533,0.003007
1,0.005312,-0.033212,0.017525,-0.010325,0.006437,-0.00775,-0.012575,-0.065237,0.01105,-0.00025,...,-0.009887,0.008738,-0.001087,-0.005863,0.004062,0.0068,0.003412,-0.012538,0.00735,-0.006713
2,-0.0027,-0.024025,0.00745,-0.00055,-0.0068,-0.01105,-0.005875,-0.03365,0.02375,0.004875,...,0.0021,-0.002225,-0.020775,-0.004625,-0.005375,0.012775,0.001975,0.008925,-0.0088,0.003125
3,0.00488,-0.05786,0.00516,0.00296,0.00798,0.0022,0.0071,-0.0489,0.01514,0.00278,...,0.0027,0.00206,-0.0201,0.00226,0.00206,0.0016,-0.01628,-0.00222,-0.00492,-0.01078
4,-0.00294,-0.06316,0.01494,0.00658,0.01034,-0.0219,-0.01204,-0.05992,0.01038,-0.00024,...,0.00176,0.00092,-0.00018,-0.00802,-0.0023,0.02178,-0.02386,-0.00514,-0.00962,-0.01388


#### Rename the vector columns from 0, 1, 2, ... to fasttext_0, fasttext_1, fasttext_2, ...

In [15]:
# Label each embedding dimension by position: fasttext_0, fasttext_1, ...
fasttext_df.columns = ['fasttext_' + str(i) for i in range(len(fasttext_df.columns))]
fasttext_df.head()

Unnamed: 0,fasttext_0,fasttext_1,fasttext_2,fasttext_3,fasttext_4,fasttext_5,fasttext_6,fasttext_7,fasttext_8,fasttext_9,...,fasttext_290,fasttext_291,fasttext_292,fasttext_293,fasttext_294,fasttext_295,fasttext_296,fasttext_297,fasttext_298,fasttext_299
0,-0.016067,-0.030327,0.01256,0.00472,-0.005633,-0.026827,0.003613,-0.040633,0.025627,-0.012087,...,0.00148,0.002453,-0.007773,0.0013,0.0051,0.00656,-0.007107,-0.011267,0.002533,0.003007
1,0.005312,-0.033212,0.017525,-0.010325,0.006437,-0.00775,-0.012575,-0.065237,0.01105,-0.00025,...,-0.009887,0.008738,-0.001087,-0.005863,0.004062,0.0068,0.003412,-0.012538,0.00735,-0.006713
2,-0.0027,-0.024025,0.00745,-0.00055,-0.0068,-0.01105,-0.005875,-0.03365,0.02375,0.004875,...,0.0021,-0.002225,-0.020775,-0.004625,-0.005375,0.012775,0.001975,0.008925,-0.0088,0.003125
3,0.00488,-0.05786,0.00516,0.00296,0.00798,0.0022,0.0071,-0.0489,0.01514,0.00278,...,0.0027,0.00206,-0.0201,0.00226,0.00206,0.0016,-0.01628,-0.00222,-0.00492,-0.01078
4,-0.00294,-0.06316,0.01494,0.00658,0.01034,-0.0219,-0.01204,-0.05992,0.01038,-0.00024,...,0.00176,0.00092,-0.00018,-0.00802,-0.0023,0.02178,-0.02386,-0.00514,-0.00962,-0.01388


#### Add the renamed vector columns back to df

In [17]:
# Append the embedding columns to the main DataFrame.
# Embedding columns left over from an earlier run of this cell are dropped
# first (errors='ignore' makes that a no-op on the first run); otherwise every
# re-run would append a duplicate set of columns with the same names.
df = pd.concat([df.drop(columns=fasttext_df.columns, errors='ignore'), fasttext_df], axis=1)
df.head()

Unnamed: 0,id,job_title,location,connection,fasttext_vector,fasttext_0,fasttext_1,fasttext_2,fasttext_3,fasttext_4,...,fasttext_290,fasttext_291,fasttext_292,fasttext_293,fasttext_294,fasttext_295,fasttext_296,fasttext_297,fasttext_298,fasttext_299
0,1,2019 ct bauer college of business graduate mag...,houston texas,0.168337,"[-0.016066665, -0.030326666, 0.012559998, 0.00...",-0.016067,-0.030327,0.01256,0.00472,-0.005633,...,0.00148,0.002453,-0.007773,0.0013,0.0051,0.00656,-0.007107,-0.011267,0.002533,0.003007
1,2,native english teacher at epik english program...,kanada,1.0,"[0.0053124996, -0.033212498, 0.017525, -0.0103...",0.005312,-0.033212,0.017525,-0.010325,0.006437,...,-0.009887,0.008738,-0.001087,-0.005863,0.004062,0.0068,0.003412,-0.012538,0.00735,-0.006713
2,3,aspiring human resources professional,raleighdurham north carolina area,0.086172,"[-0.0027, -0.024025, 0.0074499995, -0.00054999...",-0.0027,-0.024025,0.00745,-0.00055,-0.0068,...,0.0021,-0.002225,-0.020775,-0.004625,-0.005375,0.012775,0.001975,0.008925,-0.0088,0.003125
3,4,people development coordinator at ryan,denton texas,1.0,"[0.00488, -0.05786, 0.0051599997, 0.0029600002...",0.00488,-0.05786,0.00516,0.00296,0.00798,...,0.0027,0.00206,-0.0201,0.00226,0.00206,0.0016,-0.01628,-0.00222,-0.00492,-0.01078
4,5,advisory board member at celal bayar university,izmir trkiye,1.0,"[-0.00294, -0.063159995, 0.014939999, 0.006580...",-0.00294,-0.06316,0.01494,0.00658,0.01034,...,0.00176,0.00092,-0.00018,-0.00802,-0.0023,0.02178,-0.02386,-0.00514,-0.00962,-0.01388


#### Define the keywords

In [19]:


# Target phrases, one list entry per phrase.
keywords = [
    "aspiring human resources",
    "seeking human resources",
]

#### Define a function to convert a keyword phrase into an averaged FastText vector

In [21]:

def get_sentence_vector_fasttext(text, fasttext_model, vector_size=300):
    """
    Compute the average FastText embedding for a sentence.

    This is a thin wrapper around ``get_fasttext_vector`` (defined earlier in
    this notebook), which implements exactly the same averaging. Delegating
    keeps job titles and keyword phrases embedded by a single code path instead
    of two copies that could drift apart.

    Parameters
    ----------
    text : str
        Sentence to embed; it is split on whitespace.
    fasttext_model : mapping-like
        Object supporting ``word in fasttext_model`` and ``fasttext_model[word]``.
    vector_size : int, default 300
        Length of the zero vector returned when no word is found in the model.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the words found in the model, or a zero vector
        of length ``vector_size`` when none are found.
    """
    return get_fasttext_vector(text, fasttext_model, vector_size)

#### Apply the function on the keywords

In [23]:

# Embed each keyword phrase; rows follow the order of `keywords`, giving an
# array of shape (num_keywords, vector_size).
keyword_vecs_fasttext = np.array(
    [get_sentence_vector_fasttext(phrase, fasttext_model) for phrase in keywords]
)

#### Compute the cosine similarity between the candidate vectors and the keyword vectors

In [25]:
# Candidate embeddings as a plain numpy matrix (one row per candidate).
candidate_vectors = fasttext_df.to_numpy()

# Pairwise cosine similarity, shape (num_candidates, num_keywords).
similarities = cosine_similarity(candidate_vectors, keyword_vecs_fasttext)

# A candidate's score is its similarity to the best-matching keyword phrase.
df['fasttext_similarity_score'] = np.max(similarities, axis=1)

# Rank candidates from most to least similar.
df_sorted = df.sort_values(by='fasttext_similarity_score', ascending=False)

# Show the ten highest-ranked candidates.
df_sorted[['id', 'job_title', 'fasttext_similarity_score']].head(10)

Unnamed: 0,id,job_title,fasttext_similarity_score
2,3,aspiring human resources professional,0.988642
32,33,aspiring human resources professional,0.988642
16,17,aspiring human resources professional,0.988642
20,21,aspiring human resources professional,0.988642
57,58,aspiring human resources professional,0.988642
96,97,aspiring human resources professional,0.988642
45,46,aspiring human resources professional,0.988642
27,28,seeking human resources opportunities,0.985051
29,30,seeking human resources opportunities,0.985051
23,24,aspiring human resources specialist,0.976215
