# Siamese Network for Resume-Job Description Matching

## Phase 1: Data Preparation

In [1]:
import pandas as pd
import os
import random
from tqdm import tqdm

tqdm.pandas()

In [2]:
# Define paths (all relative, so they resolve against the notebook's working directory)
RESUME_PATH = '../data/raw/parsed_resumes.csv'  # CSV of parsed resumes, read with pd.read_csv
JD_FOLDER_PATH = '../data/job_descriptions/'  # folder whose files are each read as one job description
OUTPUT_PATH = '../data/processed/triplet_training_data.csv'  # destination CSV for the generated triplets

# Define the target job description for positive examples
# This must match a file name inside JD_FOLDER_PATH: it is used as the lookup key
# into the dictionary of loaded job descriptions.
TARGET_JD_FILENAME = 'Web-Developer-job-description.txt'

In [26]:
# Load Resumes (one row per parsed resume)
resumes_df = pd.read_csv(RESUME_PATH)

# Concatenate relevant fields to create Resume_str.
# The list order is the order in which the fields appear in the joined text.
fields_to_concat = [
    'Person Name', 'Work Experience', 'Skills', 'Education', 'Certifications', 'Projects', 'Summary', 'Contact Information'
]

# Only use fields that exist in the dataframe
fields_to_concat = [f for f in fields_to_concat if f in resumes_df.columns]
if not fields_to_concat:
    # Without any text column there is nothing to embed; fail here with a clear message
    # instead of producing an empty/invalid Resume_str further down.
    raise ValueError(f'None of the expected resume text columns were found in {RESUME_PATH}')

# Missing cells become '' and every remaining value is cast to str, so that
# ' '.join cannot raise TypeError when pandas parsed a column as numeric.
resumes_df['Resume_str'] = (
    resumes_df[fields_to_concat]
    .fillna('')
    .astype(str)
    .agg(' '.join, axis=1)
)

print(f'Loaded {len(resumes_df)} resumes. Created Resume_str by concatenating: {fields_to_concat}')

Loaded 2437 resumes. Created Resume_str by concatenating: ['Person Name', 'Work Experience', 'Skills', 'Education']


In [5]:
# Load Job Descriptions
# os.listdir returns every directory entry in arbitrary order, including
# sub-directories and hidden entries (e.g. .ipynb_checkpoints, .DS_Store) that
# cannot be opened or decoded as UTF-8 text. Keep only regular, non-hidden files
# and sort them so the list is identical on every platform and run.
jd_files = sorted(
    entry
    for entry in os.listdir(JD_FOLDER_PATH)
    if not entry.startswith('.') and os.path.isfile(os.path.join(JD_FOLDER_PATH, entry))
)

# Map file name -> full text of that job description.
job_descriptions = {}
for file_name in jd_files:
    with open(os.path.join(JD_FOLDER_PATH, file_name), 'r', encoding='utf-8') as f:
        job_descriptions[file_name] = f.read()

print(f'Loaded {len(job_descriptions)} job descriptions.')

Loaded 5 job descriptions.


### Generate Triplets
We will create one triplet for each resume. The resume is the **anchor**. The **positive** example is always the target job description ('Web-Developer-job-description.txt'), regardless of the resume's content. The **negative** example is one of the other job descriptions, chosen at random for each resume.

In [8]:
# Fail early, with a readable message, if the target JD was not loaded.
if TARGET_JD_FILENAME not in job_descriptions:
    raise KeyError(f'Target job description {TARGET_JD_FILENAME!r} was not found in {JD_FOLDER_PATH}')
positive_jd = job_descriptions[TARGET_JD_FILENAME]

# Candidate negatives: every other loaded JD. Sorted so that the seeded sampling
# below does not depend on the order in which the files were listed.
negative_jd_files = sorted(f for f in job_descriptions if f != TARGET_JD_FILENAME)
if not negative_jd_files:
    raise ValueError('At least one job description other than the target is required to sample negatives.')

# A dedicated, seeded generator makes the triplets identical on every
# Restart & Run All without touching the global `random` state.
TRIPLET_SEED = 42
triplet_rng = random.Random(TRIPLET_SEED)

triplets = []
for anchor in tqdm(resumes_df['Resume_str'], total=len(resumes_df)):
    # Anchor is the resume text, positive is always the target JD,
    # negative is a randomly chosen different JD.
    negative_filename = triplet_rng.choice(negative_jd_files)
    triplets.append({
        'anchor': anchor,
        'positive': positive_jd,
        'negative': job_descriptions[negative_filename],
    })

triplets_df = pd.DataFrame(triplets)
print(f'Generated {len(triplets_df)} triplets.')
triplets_df.head()

100%|██████████| 2437/2437 [00:00<00:00, 62584.30it/s]

Generated 2437 triplets.





Unnamed: 0,anchor,positive,negative
0,A senior systems administrator trico products ...,\nJob Title: Web Developer\nCompany: Not speci...,Position: Data Scientist\nExperience: 2-4 Year...
1,B systems administrator bios technologies - me...,\nJob Title: Web Developer\nCompany: Not speci...,Position: Software Engineer\nExperience: 1-3 Y...
2,C systems administrator nord gear corporation ...,\nJob Title: Web Developer\nCompany: Not speci...,Position: Project Manager\nExperience: 3-6 Yea...
3,"D roti mediterranean grill - north bethesda, m...",\nJob Title: Web Developer\nCompany: Not speci...,Position: Data Scientist\nExperience: 2-4 Year...
4,E systems administrator bex realty - boca rato...,\nJob Title: Web Developer\nCompany: Not speci...,Position: Data Analyst\nExperience: 2-5 Years\...


In [9]:
# Save the triplets to a new CSV file
# to_csv does not create missing folders, so make sure the destination exists first
# (same approach as the model-saving cell). `or '.'` covers a bare file name.
os.makedirs(os.path.dirname(OUTPUT_PATH) or '.', exist_ok=True)
triplets_df.to_csv(OUTPUT_PATH, index=False)
print(f'Triplet data saved to {OUTPUT_PATH}')

Triplet data saved to ../data/processed/triplet_training_data.csv


## Phase 2 & 3: Model Architecture & Training

In [10]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import torch.nn.functional as F

### Configuration

In [11]:
MODEL_NAME = 'distilbert-base-uncased'  # checkpoint name passed to from_pretrained for both tokenizer and encoder
MAX_LENGTH = 256  # token length every text is padded/truncated to when tokenized by the dataset
BATCH_SIZE = 16  # number of triplets per DataLoader batch
EPOCHS = 1  # number of passes over the triplet dataset
LEARNING_RATE = 2e-5  # learning rate handed to AdamW
MARGIN = 0.5  # margin used by the triplet loss

### Create a PyTorch Dataset

In [12]:
class TripletDataset(Dataset):
    """Map-style dataset that tokenizes (anchor, positive, negative) text triplets.

    Args:
        dataframe: frame with 'anchor', 'positive' and 'negative' text columns.
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length', truncation=True, return_tensors='pt')``.
        max_length: length passed to the tokenizer for padding/truncation.

    Each item is a dict keyed by 'anchor', 'positive' and 'negative'; every value is
    a dict with the flattened 'input_ids' and 'attention_mask' tensors for that text.
    """

    # Column names, in the order they are tokenized and returned.
    _ROLES = ('anchor', 'positive', 'negative')

    def __init__(self, dataframe, tokenizer, max_length):
        self.df = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Tokenize one text and drop the leading batch axis from both tensors."""
        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        return {role: self._encode(row[role]) for role in self._ROLES}

### Define the Siamese Network Architecture

In [13]:
class SiameseNetwork(nn.Module):
    """Single DistilBERT encoder applied to anchor, positive and negative inputs.

    The same encoder weights are used for all three inputs, so the three
    embeddings live in one shared space.
    """

    def __init__(self, model_name):
        super().__init__()
        self.encoder = DistilBertModel.from_pretrained(model_name)

    def forward_once(self, input_ids, attention_mask):
        """Encode one batch and return the mean of the last hidden state over the sequence axis."""
        hidden_states = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        # NOTE(review): this is a plain mean over every position, so padded
        # positions are averaged in as well. Inference pools the same way;
        # any change here has to be mirrored there.
        return hidden_states.mean(dim=1)

    def forward(self, anchor, positive, negative):
        """Return the (anchor, positive, negative) embeddings as a 3-tuple.

        Each argument is a dict providing 'input_ids' and 'attention_mask'.
        """
        return tuple(
            self.forward_once(encoded['input_ids'], encoded['attention_mask'])
            for encoded in (anchor, positive, negative)
        )

### Define the Triplet Loss

In [14]:
class TripletLoss(nn.Module):
    """Margin-based triplet loss on Euclidean (L2) distances.

    For each sample the loss is ``max(d(a, p) - d(a, n) + margin, 0)``;
    the mean over the batch is returned.
    """

    def __init__(self, margin):
        super().__init__()
        self.margin = margin

    def forward(self, anchor, positive, negative):
        # L2 distance from the anchor to the positive and to the negative embedding.
        distance_positive, distance_negative = (
            F.pairwise_distance(anchor, other, p=2) for other in (positive, negative)
        )
        # Positive where the negative is not at least `margin` farther away than the positive.
        violation = distance_positive - distance_negative + self.margin
        return F.relu(violation).mean()

### Training Setup

In [15]:
# Tokenizer for the pretrained checkpoint; the dataset applies it to all three triplet fields.
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
# Wraps the in-memory triplets; texts are tokenized per item when the dataset is indexed.
train_dataset = TripletDataset(triplets_df, tokenizer, MAX_LENGTH)
# shuffle=True: batches are drawn in random order.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SiameseNetwork(MODEL_NAME).to(device)
loss_fn = TripletLoss(margin=MARGIN)
# Every model parameter is handed to the optimizer, i.e. the whole encoder is fine-tuned.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

print(f'Training on {device}')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Training on cpu


### Training Loop

In [16]:
# Fine-tune the shared encoder on the triplet objective.
model.train()  # training mode for the whole run
for epoch in range(EPOCHS):
    total_loss = 0
    progress = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{EPOCHS}')
    for batch in progress:
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Move the three tokenized inputs of this batch onto the training device.
        anchor, positive, negative = (
            {name: tensor.to(device) for name, tensor in batch[role].items()}
            for role in ('anchor', 'positive', 'negative')
        )

        anchor_embedding, positive_embedding, negative_embedding = model(anchor, positive, negative)
        loss = loss_fn(anchor_embedding, positive_embedding, negative_embedding)

        # Backpropagate and update the encoder weights.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{EPOCHS}, Average Loss: {avg_loss:.4f}')

Epoch 1/1: 100%|██████████| 153/153 [32:12<00:00, 12.63s/it]

Epoch 1/1, Average Loss: 0.0009





### Save the Model

In [25]:
# Persist only the fine-tuned encoder (not the SiameseNetwork wrapper) together with
# its tokenizer, so the folder can later be reloaded with from_pretrained.
ENCODER_SAVE_PATH = '../models/siamese_encoder'
os.makedirs(ENCODER_SAVE_PATH, exist_ok=True)
# NOTE(review): safe_serialization=False was presumably chosen to avoid the safetensors
# write error shown in this cell's output — confirm, and check that no stale
# model.safetensors file is left in the folder.
model.encoder.save_pretrained(ENCODER_SAVE_PATH, safe_serialization=False)
tokenizer.save_pretrained(ENCODER_SAVE_PATH)
print(f'Model encoder saved to {ENCODER_SAVE_PATH}')

SafetensorError: Error while serializing: IoError(Os { code: 1224, kind: Uncategorized, message: "The requested operation cannot be performed on a file with a user-mapped section open." })

## Phase 4: Inference and Ranking

In [18]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

### Load the Fine-Tuned Encoder

In [19]:
ENCODER_PATH = '../models/siamese_encoder'  # folder holding the saved encoder and tokenizer
RANKING_OUTPUT_PATH = '../data/results/siamese_ranking_results.csv'  # destination CSV for the ranked resumes

# Reload tokenizer and encoder from disk; from here on `tokenizer` refers to the reloaded one.
tokenizer = DistilBertTokenizer.from_pretrained(ENCODER_PATH)
encoder = DistilBertModel.from_pretrained(ENCODER_PATH)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
encoder.to(device)
# Evaluation mode: disables training-only behaviour such as dropout.
encoder.eval()
print('Model loaded and in evaluation mode.')

Model loaded and in evaluation mode.


### Function to Generate Embeddings

In [20]:
def get_embedding(text, tokenizer, model, device, max_length=256):
    """Embed `text` as the mean of the model's last hidden state over the sequence axis.

    Args:
        text: input passed straight to the tokenizer.
        tokenizer: callable returning a mapping of tensors (padded/truncated to `max_length`).
        model: callable accepting the tokenizer output as keyword arguments and
            returning an object with a `last_hidden_state` tensor.
        device: device the inputs are moved to before the forward pass.
        max_length: padding/truncation length handed to the tokenizer.

    Returns:
        The pooled embedding, computed without gradient tracking and moved to the CPU.
    """
    encoded = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**model_inputs).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled.cpu()

### Generate Embeddings for Job Description and Resumes

In [21]:
# Get the target job description text
target_jd_text = job_descriptions[TARGET_JD_FILENAME]

# Generate embedding for the JD
jd_embedding = get_embedding(target_jd_text, tokenizer, encoder, device)
print('Generated embedding for the target job description.')

Generated embedding for the target job description.


In [22]:
# Generate embeddings for all resumes
resumes_df['embedding'] = resumes_df['Resume_str'].progress_apply(
    lambda x: get_embedding(x, tokenizer, encoder, device)
)
print(f'Generated embeddings for {len(resumes_df)} resumes.')

100%|██████████| 2437/2437 [03:01<00:00, 13.39it/s]

Generated embeddings for 2437 resumes.





### Calculate Similarity and Rank

In [23]:
# Calculate cosine similarity
resumes_df['similarity_score'] = resumes_df['embedding'].progress_apply(
    lambda x: F.cosine_similarity(x, jd_embedding).item()
)

# Sort by similarity score
ranked_resumes = resumes_df.sort_values(by='similarity_score', ascending=False)

100%|██████████| 2437/2437 [00:00<00:00, 45513.32it/s]



In [28]:
# Save the results
final_ranking = ranked_resumes[['Resume_str', 'similarity_score']]
final_ranking.to_csv(RANKING_OUTPUT_PATH, index=False)

print(f'Ranking complete. Results saved to {RANKING_OUTPUT_PATH}')
final_ranking.head()

Ranking complete. Results saved to ../data/results/siamese_ranking_results.csv


Unnamed: 0,Resume_str,similarity_score
797,"ADR front-end software developer ecra group, i...",0.928677
643,XT front-end developer hack illinois august 20...,0.926955
1807,"BQN ? ? extensive ms office tools (word, exce...",0.923196
634,XK senior front end web developer oogloo.com f...,0.92107
794,ADO front end ui developer nwea/hitachi - port...,0.920999
