In [1]:
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
# Load the CSV files
candidates_df = pd.read_excel('/kaggle/input/word2vec-lightcast/df-dropna.xlsx')
titles_df = pd.read_csv('/kaggle/input/word2vec-lightcast/lightcast-titles.csv')

In [3]:
candidates_df.sample(5)

Unnamed: 0,status,experience,present_ctc,expected_ctc,notice_period,candidate_profiles,job_categories,designation,Relevant,exp_min,exp_max,ctc_fixed_min,ctc_fixed_max
1198,ACCEPTED,2.42,6.0,9.0,15,Software Developer,Backend Developer,Backend Developer,1,1.0,3.0,8.0,12.0
4053,ACCEPTED,6.0,1100000.0,550000.0,2,Assistant Sales Manager,Inside Sales (Outbound),Inside Sales,1,1.0,5.0,4.0,6.0
10011,ACCEPTED,4.0,7.0,13.0,15,Associate Software Developer|||Consultant|||So...,Developer,Senior Software Engineer,1,2.0,3.0,11.5,15.0
13158,REJECTED,3.0,4.45,7.5,15,Clinical Researcher,Frontend Developer,Medical Officer,0,0.0,2.0,6.0,10.0
10192,ACCEPTED,6.0,10.0,17.0,25,Software Programmer|||senior Software Engineer,Developer,Java Developer,1,3.0,8.0,9.0,24.0


In [4]:
import re

def clean_text_columns(df, columns):
    clean_pattern = r'[!@#$%^&*()_+{}\[\-]:;<>,.?~\\/|"]'
    for col in columns:
        df[col] = df[col].apply(lambda x: re.sub(clean_pattern, ' ', str(x)))
        df[col] = df[col].str.replace(r'\s+', ' ', regex=True).str.strip()
        df[col] = df[col].str.lower()  # Convert to lowercase
    return df

# List of columns to clean
columns_to_clean = ['candidate_profiles','designation']

# Call the clean_text_columns function to clean the specified columns
candidates_df = clean_text_columns(candidates_df, columns_to_clean)
candidates_df.sample(10)

Unnamed: 0,status,experience,present_ctc,expected_ctc,notice_period,candidate_profiles,job_categories,designation,Relevant,exp_min,exp_max,ctc_fixed_min,ctc_fixed_max
3419,ACCEPTED,4.0,4.0,0.0,0,business development associate(business develo...,Business development,business development executive - telugu,1,0.5,3.0,3.0,5.5
10186,ACCEPTED,7.0,5.5,7.5,15,area manager,Business development,area sales manager - north,1,3.0,8.0,5.2,7.8
5951,ACCEPTED,6.5,4.65,6.0,30,physics teacher,Marketing,academic qc specialist,1,2.0,5.0,4.0,6.0
6292,ACCEPTED,2.5,22.0,33.0,60,software developer intern|||software developme...,Backend Developer,backend engineer ii- golang,1,2.0,5.0,25.0,35.0
9904,ACCEPTED,6.0,6.0,9.0,15,sales officer,Key account manager/ Customer success manager,key account executive ã¢â‚¬â€œ modern trade,1,1.0,4.0,6.0,7.0
2405,ACCEPTED,0.0,0.0,2.4,0,fresher,Business development,business development executive,1,0.0,2.0,3.0,3.6
10326,ACCEPTED,8.0,17.3,30.0,30,senior software engineer|||senior systems engi...,Backend Developer,java developer -3,1,5.0,7.0,30.0,50.0
8012,ACCEPTED,4.6,28.0,35.0,30,software developer - ii (android),Backend Developer,sde- ii backend,1,2.0,5.0,25.0,40.0
3996,ACCEPTED,4.4,13.0,25.0,20,software engineer,Backend Developer,software development engineer (sde) - ii,1,3.0,5.0,15.0,25.0
9459,ACCEPTED,5.5,5.0,6.5,30,payroll - senior hr executive,HR,payroll specialist,1,2.0,5.0,5.0,7.0


In [5]:
titles_df.sample(5)

Unnamed: 0,ID,Title
68120,ETA943ED7CE4056B2D,Test Project Manager
8326,ET323079C786BDAFA3,Business and Information Technology Teacher
10455,ETAE5CD9845EA897F9,Change Management Specialist
53807,ET4549476C8D0CC277,Project Development Associate
21059,ET9C160EB30DEC177F,Director of Global Regulatory Affairs


In [6]:
titles_df['Title'] = titles_df['Title'].str.lower()

In [7]:
# Train word2vec model on titles 
titles_model = Word2Vec(titles_df['Title'], vector_size=200, window=5, min_count=5)

In [8]:
# Get embeddings matrix  
title_embeddings = titles_model.wv.vectors

# Calculate embeddings for candidate profiles
candidate_embeddings = []
for profile in candidates_df['candidate_profiles']:
    word_vectors = []
    
    for word in profile.split():
        if word in titles_model.wv:
            word_vectors.append(titles_model.wv[word])
            
    if len(word_vectors) > 0:  
        # Average the word vectors 
        profile_vector = np.mean(word_vectors, axis=0) 
        candidate_embeddings.append(profile_vector)
    else:
        candidate_embeddings.append(np.zeros(200))

In [9]:
# Calculate embeddings for designations
designation_embeddings = []
for d in candidates_df['designation']:
    word_vectors = []
    
    for word in d.split():
        if word in titles_model.wv:
            word_vectors.append(titles_model.wv[word])
            
    if len(word_vectors) > 0:  
        # Average the word vectors 
        designation_vector = np.mean(word_vectors, axis=0) 
        designation_embeddings.append(designation_vector)
    else:
        designation_embeddings.append(np.zeros(200))

In [10]:
# Calculate cosine similarity
c_cos_sim = cosine_similarity(candidate_embeddings, title_embeddings)
d_cos_sim = cosine_similarity(designation_embeddings, title_embeddings)


# Get index of closest match
c_idx = c_cos_sim.argmax(1)
d_idx = d_cos_sim.argmax(1)

In [11]:
# Add normalized title column
candidates_df['normalized_candidate_profiles'] = titles_df.loc[c_idx, 'Title'].values
candidates_df['normalized_jd_designation'] = titles_df.loc[d_idx, 'Title'].values

In [12]:
candidates_df.sample(5)

Unnamed: 0,status,experience,present_ctc,expected_ctc,notice_period,candidate_profiles,job_categories,designation,Relevant,exp_min,exp_max,ctc_fixed_min,ctc_fixed_max,normalized_candidate_profiles,normalized_jd_designation
1110,ACCEPTED,2.6,1.95,2.5,10,documentation executive,Operations,documentation,1,2.0,7.0,1.5,3.0,.net application architect,.net application architect
3520,ACCEPTED,3.67,6.2,9.0,15,key accounts manager|||territory sales manager,Business development,sales manager - mumbai,1,2.0,12.0,12.0,17.0,.net application architect,.net technical architect
13304,REJECTED,6.0,6.0,7.0,30,academic counselor,Digital marketing,professor,0,4.0,7.0,6.0,9.0,.net application architect,.net application architect
9845,ACCEPTED,6.0,6.0,7.8,15,area sales manager,Business development,area sales manager - andhra pradesh/ telangana,1,3.0,8.0,5.2,7.8,.net application architect,.net technical architect
8796,ACCEPTED,10.0,25.0,30.0,60,finance controller,Finance,finance controller,1,8.0,10.0,24.0,30.0,.net application architect,.net application architect


In [13]:
# Reorder columns
columns_reordered = ['status', 'experience', 'present_ctc', 'expected_ctc', 'notice_period',
                     'candidate_profiles', 'normalized_candidate_profiles', 'job_categories', 
                     'designation', 'normalized_jd_designation', 'Relevant', 'exp_min', 'exp_max', 'ctc_fixed_min', 
                     'ctc_fixed_max']

# Reindex the DataFrame with the reordered columns
candidates_df = candidates_df.reindex(columns=columns_reordered)

candidates_df.head()  # Displaying the reordered DataFrame

Unnamed: 0,status,experience,present_ctc,expected_ctc,notice_period,candidate_profiles,normalized_candidate_profiles,job_categories,designation,normalized_jd_designation,Relevant,exp_min,exp_max,ctc_fixed_min,ctc_fixed_max
0,ACCEPTED,2.0,3.6,4.14,15,senior sales executive,.net application architect,"Inside Sales (Outbound),Sales",inside sales,.net application architect,1,0.5,3.0,3.0,5.0
1,ACCEPTED,11.0,3.0,4.0,1,marketing executive,.net application architect,"Inside Sales (Outbound),Sales",inside sales,.net application architect,1,0.5,3.0,3.0,5.0
2,ACCEPTED,2.5,5.0,5.75,15,android developer,.net application architect,Frontend Developer,react native developer,.net application architect,1,2.0,4.0,6.0,10.0
3,ACCEPTED,4.08,7.0,8.05,15,android mobile application developer|||android...,.net application architect,Frontend Developer,react native developer,.net application architect,1,2.0,4.0,6.0,10.0
4,ACCEPTED,3.5,4.0,8.0,1,devops engineer,.net application architect,Developer Operations,developer operations,.net application architect,1,1.0,5.0,6.0,10.0


In [14]:
candidates_df.to_csv('w2v-norm-profiles.csv', index=False)

In [15]:
# Sample titles
title1 = "Software Engineer"  
title2 = "Senior Software Developer"

# Get word vectors 
try:
    vector1 = titles_model.wv[title1]
    vector2 = titles_model.wv[title2]
except KeyError:
    # Word not in vocabulary
    vector1 = vector2 = np.zeros(100)
    
# Calculate cosine similarity
cos_sim = cosine_similarity(vector1.reshape(1, -1), vector2.reshape(1, -1))

print(f"Cosine similarity between {title1} and {title2}: {cos_sim[0][0]:.3f}")

# Check if vectors are too close
if cos_sim > 0.9:
    print(f"Similarity between {title1} and {title2} is too high. Model needs further tuning.")
else: 
    print(f"Vectors for {title1} and {title2} seem sufficiently distinct.")

Cosine similarity between Software Engineer and Senior Software Developer: 0.000
Vectors for Software Engineer and Senior Software Developer seem sufficiently distinct.


In [16]:
vector1

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [17]:
vector2

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])