# Fine-Tune 'all-mpnet-base-v2' Sentence Transformer

In [113]:
# Hugging Face Hub id of the pretrained sentence encoder this notebook fine-tunes.
modelName = "sentence-transformers/all-mpnet-base-v2"

In [114]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn, optim
from sentence_transformers import SentenceTransformer, models, losses, InputExample, models
from sentence_transformers.evaluation import LabelAccuracyEvaluator
from transformers import AutoTokenizer
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [115]:
# Read the resume corpus and keep only the text column and its category.
# Replace the path with the actual location of the CSV.
df = pd.read_csv("datasets/resume.csv").loc[:, ["Resume_str", "Category"]]

# Encode each category name as an integer class id. The fitted encoder stays
# in scope because `label_encoder.classes_` is read later to size the model head.
label_encoder = LabelEncoder()
df = df.assign(label=label_encoder.fit_transform(df["Category"]))

# Plain Python lists: one resume text and one integer label per row.
texts = df["Resume_str"].tolist()
labels = df["label"].tolist()

In [116]:
class ResumeDataset(Dataset):
    """Map-style dataset pairing pre-tokenized texts with their labels.

    All texts are tokenized in one call at construction time, with padding
    and truncation enabled and PyTorch ("pt") tensors requested.
    """

    def __init__(self, texts, labels, tokenizer):
        """Tokenize ``texts`` once and store ``labels`` alongside the result.

        Args:
            texts: Sequence of raw strings handed to ``tokenizer``.
            labels: Sequence of labels, index-aligned with ``texts``.
            tokenizer: Callable invoked as
                ``tokenizer(texts, padding=True, truncation=True, return_tensors="pt")``.
                Its result must support lookup by ``"input_ids"`` and
                ``"attention_mask"``.
        """
        encoding = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        self.tokenized = encoding
        self.labels = labels

    def __len__(self):
        """Return the number of stored labels (one per sample)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the token ids, attention mask and label stored at ``idx``."""
        sample = {key: self.tokenized[key][idx] for key in ("input_ids", "attention_mask")}
        sample["label"] = self.labels[idx]
        return sample

# Load the tokenizer from `modelName`, the same checkpoint id the encoder is
# built from, so tokenizer and encoder cannot drift apart if the model is swapped.
tokenizer = AutoTokenizer.from_pretrained(modelName)

# Tokenize the whole corpus once, then serve it in shuffled mini-batches of 16.
dataset = ResumeDataset(texts, labels, tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [117]:
# Step 1: Assemble a SentenceTransformer-style encoder from two modules:
# the transformer backbone, followed by a pooling layer sized to the
# backbone's word-embedding dimension.
word_embedding_model = models.Transformer(modelName)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)
base_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Step 2: Define classification head
class SentenceClassifier(nn.Module):
    """Sentence encoder followed by a small feed-forward classification head.

    Attributes:
        base_model: Encoder module. It must expose
            ``get_sentence_embedding_dimension()`` and, when called with a
            feature dict, return a mapping containing ``"sentence_embedding"``.
        classifier: ``Linear -> ReLU -> Dropout(0.5) -> Linear`` head that maps
            an embedding to ``num_classes`` logits.
    """

    def __init__(self, base_model, num_classes):
        """Wrap ``base_model`` and build the classification head.

        Args:
            base_model: Encoder to wrap (a SentenceTransformer in this notebook).
            num_classes: Number of output logits.
        """
        super().__init__()
        self.base_model = base_model  # SentenceTransformer model
        embedding_dim = base_model.get_sentence_embedding_dimension()
        self.classifier = nn.Sequential(
            nn.Linear(embedding_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, features):
        """Encode ``features`` and return unnormalized class logits.

        Args:
            features: Feature dict passed to the encoder unchanged.

        Returns:
            The head's output for the encoder's ``"sentence_embedding"``.
        """
        # Call the module itself rather than `.forward()` so that any
        # registered forward hooks run. No `torch.no_grad()` is used here, so
        # gradients reach the encoder whenever its parameters are trainable.
        embeddings = self.base_model(features)["sentence_embedding"]
        return self.classifier(embeddings)

# Step 3: Build the classifier with one output logit per encoded category.
num_classes = len(label_encoder.classes_)
model = SentenceClassifier(base_model, num_classes=num_classes)

In [118]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

import os  # cell-local import, like the other imports in this cell

# Early Stopping Setup
best_loss = float('inf')  # lowest average training loss seen so far
patience = 6              # non-improving epochs tolerated before stopping
counter = 0               # consecutive non-improving epochs

# Training config
baseEpochs = 200

# Select the compute device. The availability checks must be *called*: a bare
# `torch.cuda.is_available` is a function object, which is always truthy, so
# the un-called form picks "cuda" even on machines without a GPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print("Using device:", device)
model = model.to(device)

# Freeze base model initially, so only the classification head is trained
for param in model.base_model.parameters():
    param.requires_grad = False

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-4)

# Learning Rate Scheduler (optional for frozen phase): halves the learning
# rate after 3 epochs without a lower average loss.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.5, verbose=True)

# Make sure the checkpoint directory exists, then seed it with the initial
# weights so the per-epoch reload below always finds a file.
os.makedirs("models", exist_ok=True)
torch.save(model.state_dict(), "models/best_model.pt")
for epoch in range(baseEpochs):
    # Every epoch restarts from the best checkpoint on disk, so an epoch that
    # did not lower the loss is rolled back. The optimizer and scheduler state
    # are not rolled back.
    # NOTE(review): confirm this per-epoch rollback is intentional.
    model.load_state_dict(torch.load("models/best_model.pt"))
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}", leave=False)

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # The DataLoader has already collated the labels into a tensor, so it
        # is moved directly; wrapping it in `torch.tensor(...)` again triggers
        # a copy-construct warning. A distinct name is used so the module-level
        # `labels` list that feeds the dataset is not overwritten.
        batch_labels = batch["label"].to(device)

        features = {"input_ids": input_ids, "attention_mask": attention_mask}
        outputs = model(features)

        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

    # Mean training loss over this epoch's batches; no held-out data is evaluated.
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} complete. Avg Loss: {avg_loss:.4f}")

    # Step the scheduler
    scheduler.step(avg_loss)

    # Early Stopping Check
    if avg_loss < best_loss:
        best_loss = avg_loss
        counter = 0
        print("‚úÖ Improvement detected ‚Äî saving model")
        torch.save(model.state_dict(), "models/best_model.pt")
    else:
        counter += 1
        print(f"‚ö†Ô∏è No improvement. Patience left: {patience - counter}")
        if counter >= patience:
            print("‚èπÔ∏è Early stopping triggered.")
            break

Using device: cuda


  labels = torch.tensor(batch["label"]).to(device)


Epoch 1 complete. Avg Loss: 2.7746
‚úÖ Improvement detected ‚Äî saving model




Epoch 2 complete. Avg Loss: 1.8635
‚úÖ Improvement detected ‚Äî saving model




Epoch 3 complete. Avg Loss: 1.4548
‚úÖ Improvement detected ‚Äî saving model




Epoch 4 complete. Avg Loss: 1.3172
‚úÖ Improvement detected ‚Äî saving model




Epoch 5 complete. Avg Loss: 1.1880
‚úÖ Improvement detected ‚Äî saving model




Epoch 6 complete. Avg Loss: 1.1233
‚úÖ Improvement detected ‚Äî saving model




Epoch 7 complete. Avg Loss: 1.0585
‚úÖ Improvement detected ‚Äî saving model




Epoch 8 complete. Avg Loss: 1.0096
‚úÖ Improvement detected ‚Äî saving model




Epoch 9 complete. Avg Loss: 0.9687
‚úÖ Improvement detected ‚Äî saving model




Epoch 10 complete. Avg Loss: 0.9285
‚úÖ Improvement detected ‚Äî saving model




Epoch 11 complete. Avg Loss: 0.9002
‚úÖ Improvement detected ‚Äî saving model




Epoch 12 complete. Avg Loss: 0.8594
‚úÖ Improvement detected ‚Äî saving model




Epoch 13 complete. Avg Loss: 0.8571
‚úÖ Improvement detected ‚Äî saving model




Epoch 14 complete. Avg Loss: 0.8391
‚úÖ Improvement detected ‚Äî saving model




Epoch 15 complete. Avg Loss: 0.8122
‚úÖ Improvement detected ‚Äî saving model




Epoch 16 complete. Avg Loss: 0.7847
‚úÖ Improvement detected ‚Äî saving model




Epoch 17 complete. Avg Loss: 0.7690
‚úÖ Improvement detected ‚Äî saving model




Epoch 18 complete. Avg Loss: 0.7427
‚úÖ Improvement detected ‚Äî saving model




Epoch 19 complete. Avg Loss: 0.7218
‚úÖ Improvement detected ‚Äî saving model




Epoch 20 complete. Avg Loss: 0.7276
‚ö†Ô∏è No improvement. Patience left: 5




Epoch 21 complete. Avg Loss: 0.7038
‚úÖ Improvement detected ‚Äî saving model




Epoch 22 complete. Avg Loss: 0.6996
‚úÖ Improvement detected ‚Äî saving model




Epoch 23 complete. Avg Loss: 0.6970
‚úÖ Improvement detected ‚Äî saving model




Epoch 24 complete. Avg Loss: 0.6606
‚úÖ Improvement detected ‚Äî saving model




Epoch 25 complete. Avg Loss: 0.6581
‚úÖ Improvement detected ‚Äî saving model




Epoch 26 complete. Avg Loss: 0.6411
‚úÖ Improvement detected ‚Äî saving model




Epoch 27 complete. Avg Loss: 0.6520
‚ö†Ô∏è No improvement. Patience left: 5




Epoch 28 complete. Avg Loss: 0.6291
‚úÖ Improvement detected ‚Äî saving model




Epoch 29 complete. Avg Loss: 0.6187
‚úÖ Improvement detected ‚Äî saving model




Epoch 30 complete. Avg Loss: 0.6021
‚úÖ Improvement detected ‚Äî saving model




Epoch 31 complete. Avg Loss: 0.5922
‚úÖ Improvement detected ‚Äî saving model




Epoch 32 complete. Avg Loss: 0.5715
‚úÖ Improvement detected ‚Äî saving model




Epoch 33 complete. Avg Loss: 0.5674
‚úÖ Improvement detected ‚Äî saving model




Epoch 34 complete. Avg Loss: 0.5524
‚úÖ Improvement detected ‚Äî saving model




Epoch 35 complete. Avg Loss: 0.5519
‚úÖ Improvement detected ‚Äî saving model




Epoch 36 complete. Avg Loss: 0.5314
‚úÖ Improvement detected ‚Äî saving model




Epoch 37 complete. Avg Loss: 0.5060
‚úÖ Improvement detected ‚Äî saving model




Epoch 38 complete. Avg Loss: 0.5204
‚ö†Ô∏è No improvement. Patience left: 5




Epoch 39 complete. Avg Loss: 0.5093
‚ö†Ô∏è No improvement. Patience left: 4




Epoch 40 complete. Avg Loss: 0.5145
‚ö†Ô∏è No improvement. Patience left: 3




Epoch 41 complete. Avg Loss: 0.5129
‚ö†Ô∏è No improvement. Patience left: 2




Epoch 42 complete. Avg Loss: 0.5013
‚úÖ Improvement detected ‚Äî saving model




Epoch 43 complete. Avg Loss: 0.5023
‚ö†Ô∏è No improvement. Patience left: 5




Epoch 44 complete. Avg Loss: 0.4975
‚úÖ Improvement detected ‚Äî saving model




Epoch 45 complete. Avg Loss: 0.4886
‚úÖ Improvement detected ‚Äî saving model




Epoch 46 complete. Avg Loss: 0.4874
‚úÖ Improvement detected ‚Äî saving model




Epoch 47 complete. Avg Loss: 0.4707
‚úÖ Improvement detected ‚Äî saving model




Epoch 48 complete. Avg Loss: 0.4699
‚úÖ Improvement detected ‚Äî saving model




Epoch 49 complete. Avg Loss: 0.4593
‚úÖ Improvement detected ‚Äî saving model




Epoch 50 complete. Avg Loss: 0.4516
‚úÖ Improvement detected ‚Äî saving model




Epoch 51 complete. Avg Loss: 0.4622
‚ö†Ô∏è No improvement. Patience left: 5




Epoch 52 complete. Avg Loss: 0.4481
‚úÖ Improvement detected ‚Äî saving model




Epoch 53 complete. Avg Loss: 0.4542
‚ö†Ô∏è No improvement. Patience left: 5




Epoch 54 complete. Avg Loss: 0.4686
‚ö†Ô∏è No improvement. Patience left: 4




Epoch 55 complete. Avg Loss: 0.4572
‚ö†Ô∏è No improvement. Patience left: 3




Epoch 56 complete. Avg Loss: 0.4508
‚ö†Ô∏è No improvement. Patience left: 2




Epoch 57 complete. Avg Loss: 0.4566
‚ö†Ô∏è No improvement. Patience left: 1


                                                           

Epoch 58 complete. Avg Loss: 0.4499
‚ö†Ô∏è No improvement. Patience left: 0
‚èπÔ∏è Early stopping triggered.




In [119]:
tuneEpochs = 200

# Unfreeze base model so the encoder is updated together with the head
for param in model.base_model.parameters():
    param.requires_grad = True

# New optimizer & scheduler for fine-tuning, with a much smaller learning rate
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.5, verbose=True)

# Early stopping state. The best loss starts at infinity, so the first
# fine-tuning epoch always overwrites the checkpoint on disk.
tune_best_loss = float('inf')
tune_counter = 0
tune_patience = 6

# Epoch numbers continue from `baseEpochs`, regardless of how many epochs the
# frozen phase actually ran before stopping.
for epoch in range(baseEpochs, baseEpochs + tuneEpochs):
    # Every epoch restarts from the best checkpoint on disk; on the first
    # iteration this loads the best weights saved by the frozen phase.
    # NOTE(review): confirm the per-epoch rollback is intentional.
    model.load_state_dict(torch.load("models/best_model.pt"))
    model.train()
    total_loss = 0
    progress_bar = tqdm(dataloader, desc=f"Tune Epoch {epoch+1}", leave=False)

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # The DataLoader has already collated the labels into a tensor, so it
        # is moved directly; wrapping it in `torch.tensor(...)` again triggers
        # a copy-construct warning. A distinct name is used so the module-level
        # `labels` list that feeds the dataset is not overwritten.
        batch_labels = batch["label"].to(device)

        features = {"input_ids": input_ids, "attention_mask": attention_mask}
        outputs = model(features)

        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    # Mean training loss over this epoch's batches; no held-out data is evaluated.
    avg_loss = total_loss / len(dataloader)
    print(f"üõ†Ô∏è Epoch {epoch+1} complete. Avg Loss: {avg_loss:.4f}")

    scheduler.step(avg_loss)

    # Early stopping
    if avg_loss < tune_best_loss:
        tune_best_loss = avg_loss
        tune_counter = 0
        print("‚úÖ Improvement detected ‚Äî saving model")
        torch.save(model.state_dict(), "models/best_model.pt")
    else:
        tune_counter += 1
        print(f"‚ö†Ô∏è No improvement. Patience left: {tune_patience - tune_counter}")
        if tune_counter >= tune_patience:
            print("‚èπÔ∏è Early stopping triggered.")
            break

  labels = torch.tensor(batch["label"]).to(device)


üõ†Ô∏è Epoch 201 complete. Avg Loss: 0.4883
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 202 complete. Avg Loss: 0.3108
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 203 complete. Avg Loss: 0.2218
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 204 complete. Avg Loss: 0.1292
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 205 complete. Avg Loss: 0.0820
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 206 complete. Avg Loss: 0.0604
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 207 complete. Avg Loss: 0.0549
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 208 complete. Avg Loss: 0.0377
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 209 complete. Avg Loss: 0.0362
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 210 complete. Avg Loss: 0.0234
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 211 complete. Avg Loss: 0.0234
‚ö†Ô∏è No improvement. Patience left: 5




üõ†Ô∏è Epoch 212 complete. Avg Loss: 0.0588
‚ö†Ô∏è No improvement. Patience left: 4




üõ†Ô∏è Epoch 213 complete. Avg Loss: 0.0233
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 214 complete. Avg Loss: 0.0586
‚ö†Ô∏è No improvement. Patience left: 5




üõ†Ô∏è Epoch 215 complete. Avg Loss: 0.0227
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 216 complete. Avg Loss: 0.0239
‚ö†Ô∏è No improvement. Patience left: 5




üõ†Ô∏è Epoch 217 complete. Avg Loss: 0.0198
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 218 complete. Avg Loss: 0.0089
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 219 complete. Avg Loss: 0.0045
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 220 complete. Avg Loss: 0.0046
‚ö†Ô∏è No improvement. Patience left: 5




üõ†Ô∏è Epoch 221 complete. Avg Loss: 0.0073
‚ö†Ô∏è No improvement. Patience left: 4




üõ†Ô∏è Epoch 222 complete. Avg Loss: 0.0409
‚ö†Ô∏è No improvement. Patience left: 3




üõ†Ô∏è Epoch 223 complete. Avg Loss: 0.0061
‚ö†Ô∏è No improvement. Patience left: 2




üõ†Ô∏è Epoch 224 complete. Avg Loss: 0.0029
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 225 complete. Avg Loss: 0.0033
‚ö†Ô∏è No improvement. Patience left: 5




üõ†Ô∏è Epoch 226 complete. Avg Loss: 0.0030
‚ö†Ô∏è No improvement. Patience left: 4




üõ†Ô∏è Epoch 227 complete. Avg Loss: 0.0038
‚ö†Ô∏è No improvement. Patience left: 3




üõ†Ô∏è Epoch 228 complete. Avg Loss: 0.0026
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 229 complete. Avg Loss: 0.0022
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 230 complete. Avg Loss: 0.0039
‚ö†Ô∏è No improvement. Patience left: 5




üõ†Ô∏è Epoch 231 complete. Avg Loss: 0.0016
‚úÖ Improvement detected ‚Äî saving model




üõ†Ô∏è Epoch 232 complete. Avg Loss: 0.0018
‚ö†Ô∏è No improvement. Patience left: 5




üõ†Ô∏è Epoch 233 complete. Avg Loss: 0.0024
‚ö†Ô∏è No improvement. Patience left: 4




üõ†Ô∏è Epoch 234 complete. Avg Loss: 0.0024
‚ö†Ô∏è No improvement. Patience left: 3




üõ†Ô∏è Epoch 235 complete. Avg Loss: 0.0022
‚ö†Ô∏è No improvement. Patience left: 2




üõ†Ô∏è Epoch 236 complete. Avg Loss: 0.0056
‚ö†Ô∏è No improvement. Patience left: 1


                                                                 

üõ†Ô∏è Epoch 237 complete. Avg Loss: 0.0037
‚ö†Ô∏è No improvement. Patience left: 0
‚èπÔ∏è Early stopping triggered.




In [120]:
# Restore the best checkpoint into the full classifier, then export only the
# encoder's weights (the classification head is left out of the second file).
best_checkpoint_path = "models/best_model.pt"
encoder_weights_path = "models/model1.pt"
model.load_state_dict(torch.load(best_checkpoint_path))
torch.save(model.base_model.state_dict(), encoder_weights_path)

# TEST

In [136]:
# Baseline: the pretrained encoder, with no fine-tuned weights loaded.
encoder = SentenceTransformer(modelName)

resume = "Master's in Computer Science"
jd = "Bachelor's in Healthcare or related fields"

# Embed each text as a one-element batch and read the single similarity score.
resumeEmbeddings = encoder.encode([resume])
jdEmbeddings = encoder.encode([jd])
similarity = cosine_similarity(resumeEmbeddings, jdEmbeddings)[0][0]
print("Similarity : ", similarity)

# Detached copy of the first layer's query-projection weights, kept so it can
# be compared with the fine-tuned encoder's weights afterwards.
original_weights = (
    encoder[0].auto_model.encoder.layer[0].attention.attn.q.weight
    .clone()
    .detach()
)

Similarity :  0.45878172


In [138]:
# Same architecture as the baseline, with the exported fine-tuned encoder
# weights loaded on top of the pretrained ones.
encoder = SentenceTransformer(modelName)
encoder.load_state_dict(torch.load("models/model1.pt"))

resume = "Master's in Computer Science"
jd = "Bachelor's in Healthcare or related fields"

# Embed each text as a one-element batch and read the single similarity score.
resumeEmbeddings = encoder.encode([resume])
jdEmbeddings = encoder.encode([jd])
similarity = cosine_similarity(resumeEmbeddings, jdEmbeddings)[0][0]
print("Similarity : ", similarity)

# First layer's query-projection weights of the fine-tuned encoder
# (the live parameter, not a copy).
finetuned_weights = (
    encoder[0].auto_model.encoder.layer[0].attention.attn.q.weight
)

Similarity :  0.3088787


In [139]:
# torch.equal is True only when both tensors match in size and in every
# element, so its negation reports whether fine-tuning altered this matrix.
is_same = torch.equal(original_weights, finetuned_weights)
print("Weights changed?", not is_same)

Weights changed? True
