In [32]:
import re
import shutil

from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer
import pandas as pd
from datasets import Dataset
from sentence_transformers.losses import TripletLoss
from sentence_transformers.training_args import SentenceTransformerTrainingArguments, BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator
import tensorflow as tf
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [74]:
# Load the pretrained 'all-mpnet-base-v2' checkpoint; this same object is
# evaluated, partially frozen and fine-tuned by the cells below.
model = SentenceTransformer('all-mpnet-base-v2')

In [78]:
# Quick sanity check: encode two related phrases and compare them with
# cosine similarity before touching the real data.
embedding = model.encode(["Deep Learning", "Machine Learning"])
first_vector, second_vector = embedding[0], embedding[1]

# cosine_similarity works on 2-D inputs, so each vector is wrapped in a list;
# [0] takes the single row of the resulting 1x1 matrix.
pairwise_scores = cosine_similarity([first_vector], [second_vector])
similarities = pairwise_scores[0]
print(similarities)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[0.77652353]


In [35]:
# Keep only the first three columns of each file; the preview of
# df_preprocessed below shows them as Job-Description, Resume-matched and
# Resume-unmatched.
#
# dev.csv is parsed once. df_original stays untouched so it can be compared
# against the cleaned text later, while df_preprocessed is an explicit,
# independent copy that the cleaning cell is free to overwrite.
df_original = pd.read_csv("/kaggle/input/rjdb-csv-format/dev.csv").iloc[:, 0:3]
df_preprocessed = df_original.copy()

# Held-out file; it is split into validation and test sets further down.
# .copy() again makes the frame independent of the parsed file so later
# column assignments are unambiguous.
df_preprocessed_test = pd.read_csv("/kaggle/input/rjdb-csv-format/test.csv").iloc[:, 0:3].copy()

In [36]:
# Print the first row / first column (a job description) as plain text so the
# markdown headings and line breaks in the raw data are visible.
print(df_preprocessed.iloc[0,0])



## Job Title
SOLR Search Lead

## Job Summary
The appointed SOLR Search Lead will tackle complex information retrieval challenges, mentor team members and work in coordination with different units of the organization to deliver top-notch search solutions.

## Required Skills
Applicants should be proficient in using JUnit, an important unit testing tool for the runtime environment inspired by the Java programming language, to maintain process finesse and ensure optimal functionality of the implemented solutions.

## Required Experience
The desirable candidates should possess relevant practical job experience of 3 years as a SOLR Search Lead and a substantial service of multiplying effect to the extent of five years in capacity as a Systems Engineer (DevOps). Deep understanding of Junit is expected as it remains a crucial part of personnel selection process.

## Responsibilities
• Implementing effective strategies to maximize search engine utility.
• Providing mentorship and guidance t

In [37]:
# Preview the first rows before cleaning (text still contains \r\n and '##').
df_preprocessed.head()

Unnamed: 0,Job-Description,Resume-matched,Resume-unmatched
0,\r\n\r\n## Job Title\r\nSOLR Search Lead\r\n\r...,\r\n## Personal Information\r\nName: Mia O'Con...,\r\n\r\n## Personal Information\r\nName: Mia O...
1,\r\n\r\n## Job title\r\nSecurity and Fire Alar...,\r\n\r\n## Personal Information\r\nName: Watee...,\r\n\r\n## Personal Information\r\nName: Watee...
2,\r\n\r\n## Job Title\r\nSAP FICO Lead | Projec...,\r\n\r\n## Personal Information\r\nName: Carlo...,\r\n\r\n## Personal Information\r\nName: Carlo...
3,\r\n\r\n## Job title\r\nClient Engagement Mana...,\r\n\r\n## Personal Information\r\nName: Ecrin...,\r\n\r\n## Personal Information\r\nName: Ecrin...
4,\r\n\r\n## Job Title\r\n.NET Developer \r\n\r\...,\r\n\r\n## Personal Information\r\nName: Hosse...,\r\n\r\n## Personal Information\r\nName: Hosse...


In [38]:
def clean_text(text):
    """Flatten a markdown-formatted document into one line of plain text.

    Markdown heading markers (a run of '#' at the start of a line, optionally
    indented) are dropped, then every run of whitespace, including CR/LF line
    breaks, is collapsed to a single space and the ends are trimmed.

    '#' characters that appear inside a line are preserved, so skill names
    such as "C#" or "F#" are not mangled into "C" / "F".

    Args:
        text: The raw document string.

    Returns:
        The cleaned, single-line string.
    """
    # Only strip '#' where it acts as a heading marker (line start); MULTILINE
    # makes '^' match after every newline, not just at the start of the string.
    text = re.sub(r'^[ \t]*#+[ \t]*', '', text, flags=re.MULTILINE)
    # '\s+' already covers '\r', '\n' and repeated spaces in one pass.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [39]:
# Run clean_text over every text column of both frames, overwriting the
# columns in place (same six assignments as before, expressed as a loop).
text_columns = ("Job-Description", "Resume-matched", "Resume-unmatched")

for frame in (df_preprocessed, df_preprocessed_test):
    for column in text_columns:
        frame[column] = frame[column].apply(clean_text)

In [40]:
# Preview the same rows after cleaning: headings and line breaks are gone.
df_preprocessed.head()

Unnamed: 0,Job-Description,Resume-matched,Resume-unmatched
0,Job Title SOLR Search Lead Job Summary The app...,Personal Information Name: Mia O'Connor Email:...,Personal Information Name: Mia O'Connor Email:...
1,Job title Security and Fire Alarm Systems Inst...,Personal Information Name: Wateen Smith Email:...,Personal Information Name: Wateen Smith Email:...
2,Job Title SAP FICO Lead | Project Manager Job ...,Personal Information Name: Carlos Rodriguez Em...,Personal Information Name: Carlos Rodriguez Em...
3,Job title Client Engagement Manager Job Summar...,Personal Information Name: Ecrin Boomfield Ema...,Personal Information Name: Ecrin Boomfield Ema...
4,Job Title .NET Developer Job Summary We are in...,Personal Information Name: Hossein Farsi Email...,Personal Information Name: Hossein Farsi Email...


In [41]:
# Raw job description at row 200, kept for a before/after comparison.
df_original.iloc[200,0]

'\r\n\r\n## Job title\r\nSoftware Engineer III\r\n\r\n## Job Summary\r\nIn this key role as a Software Engineer III, you will be responsible for developing embedded software and systems within IoT environment, ensuring compatibility with hardware design and interfacing with real-time video streaming and documentary systems. Using your essential skills in business intelligence, project management and analysis, you will drive complex software projects, aligning objectives with company vision and strategies.\r\n\r\n## Required Skills\r\n- Strong abilities in analysis and problem solving in complex software systems.\r\n- Business intelligence proficiency to decode project metrics into comprehendible strategies.\r\n- Experience in embedded systems coding.\r\n- Understanding and knowledge of hardware systems.\r\n- Capability to supervise and manage project teams and essentials.\r\n- Proven experience in project development and management.\r\n- Familiarity with video technology, encoding and 

In [42]:
# The same row (200) after clean_text has been applied.
df_preprocessed.iloc[200,0]

'Job title Software Engineer III Job Summary In this key role as a Software Engineer III, you will be responsible for developing embedded software and systems within IoT environment, ensuring compatibility with hardware design and interfacing with real-time video streaming and documentary systems. Using your essential skills in business intelligence, project management and analysis, you will drive complex software projects, aligning objectives with company vision and strategies. Required Skills - Strong abilities in analysis and problem solving in complex software systems. - Business intelligence proficiency to decode project metrics into comprehendible strategies. - Experience in embedded systems coding. - Understanding and knowledge of hardware systems. - Capability to supervise and manage project teams and essentials. - Proven experience in project development and management. - Familiarity with video technology, encoding and streaming. Required Experience - At least 5 years of relev

In [43]:
# Build the triplet lists directly from the columns instead of walking the
# frame row by row: iterrows() creates a Series per row and the original did
# that six times over. Selecting by column name (the same names the cleaning
# cell uses) also avoids relying on column position.
anchors = df_preprocessed["Job-Description"].tolist()
positives = df_preprocessed["Resume-matched"].tolist()
negatives = df_preprocessed["Resume-unmatched"].tolist()

anchors_test = df_preprocessed_test["Job-Description"].tolist()
positives_test = df_preprocessed_test["Resume-matched"].tolist()
negatives_test = df_preprocessed_test["Resume-unmatched"].tolist()

In [44]:
# Column names used for every triplet dataset; the evaluator cells read the
# data back through these same keys.
triplet_columns = ("anchor", "positive", "negative")

# Training triplets (built from dev.csv).
train_dataset = Dataset.from_dict(
    dict(zip(triplet_columns, (anchors, positives, negatives)))
)

# Held-out triplets (built from test.csv).
dataset_test = Dataset.from_dict(
    dict(zip(triplet_columns, (anchors_test, positives_test, negatives_test)))
)

# Shuffled, seeded split of the held-out triplets: the 'train' part (60%)
# serves as the validation set and the 'test' part (40%) as the final test set.
dataset_split = dataset_test.train_test_split(shuffle=True, seed=42, test_size=0.4)
eval_dataset, test_dataset = dataset_split['train'], dataset_split['test']

In [63]:
# Triplet loss bound to the model, constructed with the library's default
# settings (no distance metric or margin is passed here).
# NOTE(review): the logged training/validation loss stays at ~4.9 for the
# whole run — presumably dominated by the default margin; confirm that the
# default metric/margin suit this model's embeddings.
loss = TripletLoss(model)

In [64]:
# Optimisation schedule. A single epoch is used here; the author's earlier
# note recommended 2-3 epochs for ~2.5k examples.
schedule_options = dict(
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_ratio=0.1,
)

# Evaluate and checkpoint on the same 100-step cadence, keeping at most the
# two most recent checkpoints on disk.
checkpoint_options = dict(
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
)

# Log every 10 steps (the run is short) and disable external reporters.
logging_options = dict(
    logging_steps=10,
    report_to=[],
    run_name="mpnet-base-triplet-finetune",
)

args = SentenceTransformerTrainingArguments(
    output_dir="/kaggle/working/",
    # Batch sampler that avoids duplicate samples within a batch.
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    **schedule_options,
    **checkpoint_options,
    **logging_options,
)

In [48]:
# Validation evaluator: built from the validation triplets and reused by the
# trainer further down. Calling it here records the score of the model before
# any fine-tuning.
validation_columns = {
    role: eval_dataset[role] for role in ("anchor", "positive", "negative")
}
triplet_evaluator = TripletEvaluator(
    name="Evaluation mpnet-v2",
    anchors=validation_columns["anchor"],
    positives=validation_columns["positive"],
    negatives=validation_columns["negative"],
)
triplet_evaluator(model)

{'Evaluation mpnet-v2_cosine_accuracy': 0.7266666889190674}

In [49]:
# Baseline on the test split: score the model before training so the
# post-training test result has something to be compared against.
test_columns = {
    role: test_dataset[role] for role in ("anchor", "positive", "negative")
}
test_evaluator = TripletEvaluator(
    name="Evaluation mpnet-v2",
    anchors=test_columns["anchor"],
    positives=test_columns["positive"],
    negatives=test_columns["negative"],
)
test_evaluator(model)

{'Evaluation mpnet-v2_cosine_accuracy': 0.7475000023841858}

In [65]:
# Grab the Hugging Face transformer wrapped by the first module of the model.
transformer = model._first_module().auto_model

# Start from a fully frozen transformer ...
transformer.requires_grad_(False)

# ... then re-enable gradients for the last `unfreeze_layers` encoder layers,
# counting back from the end of the stack (currently only the final layer).
unfreeze_layers = 1
for offset in range(unfreeze_layers):
    transformer.encoder.layer[-(offset + 1)].requires_grad_(True)


In [66]:
# Access the base transformer model
transformer = model._first_module().auto_model

# Check number of encoder layers
print(f"\nTransformer has {len(transformer.encoder.layer)} encoder layers.")

# Print which layers are frozen or not
for i, layer in enumerate(transformer.encoder.layer):
    status = all(not p.requires_grad for p in layer.parameters())
    print(f"Layer {i}: {'❄️ Frozen' if status else '🔥 Trainable'}")



Transformer has 12 encoder layers.
Layer 0: ❄️ Frozen
Layer 1: ❄️ Frozen
Layer 2: ❄️ Frozen
Layer 3: ❄️ Frozen
Layer 4: ❄️ Frozen
Layer 5: ❄️ Frozen
Layer 6: ❄️ Frozen
Layer 7: ❄️ Frozen
Layer 8: ❄️ Frozen
Layer 9: ❄️ Frozen
Layer 10: ❄️ Frozen
Layer 11: 🔥 Trainable


In [67]:
def count_parameters(model):
    """Print how many parameters of ``model`` are trainable and how many are frozen.

    Args:
        model: Any object exposing ``parameters()`` whose items provide
            ``numel()`` and a ``requires_grad`` flag.

    Returns:
        None. Three lines are printed: total, trainable and frozen counts,
        each formatted with thousands separators.
    """
    total = 0
    trainable = 0
    # Single pass: every parameter adds to the total, and only those that
    # require gradients add to the trainable count.
    for param in model.parameters():
        size = param.numel()
        total += size
        if param.requires_grad:
            trainable += size

    print(f"Total parameters: {total:,}")
    print(f"Trainable parameters: {trainable:,}")
    print(f"Frozen parameters: {total - trainable:,}")


In [68]:
# Report total / trainable / frozen parameter counts for the model after the
# freezing cell has run.
count_parameters(model)

Total parameters: 109,486,464
Trainable parameters: 7,087,872
Frozen parameters: 102,398,592


In [69]:
# Wire model, arguments, data, loss and the validation evaluator into the
# trainer, then launch fine-tuning. train() stays the last expression so its
# TrainOutput summary is shown as the cell result.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    loss=loss,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    evaluator=triplet_evaluator,
)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Evaluation mpnet-v2 Cosine Accuracy
100,4.8998,4.891697,0.8


TrainOutput(global_step=154, training_loss=4.901254554847618, metrics={'train_runtime': 154.8242, 'train_samples_per_second': 15.85, 'train_steps_per_second': 0.995, 'total_flos': 0.0, 'train_loss': 4.901254554847618, 'epoch': 1.0})

In [70]:
# Post-training check: rebuild the test-split evaluator and score the
# fine-tuned model, for comparison with the pre-training test score.
test_columns = {
    role: test_dataset[role] for role in ("anchor", "positive", "negative")
}
test_evaluator = TripletEvaluator(
    name="Evaluation mpnet-v2",
    anchors=test_columns["anchor"],
    positives=test_columns["positive"],
    negatives=test_columns["negative"],
)
test_evaluator(model)

{'Evaluation mpnet-v2_cosine_accuracy': 0.8100000023841858}

In [71]:
# Write the fine-tuned model to its own directory under the working folder;
# the archive cell below zips this directory.
trainer.save_model("/kaggle/working/fine_tuned_mpnet")

In [72]:
# Zip the saved model directory (shutil is imported with the other imports in
# the first cell). The return value, the path of the created archive, is left
# as the cell output.
shutil.make_archive("/kaggle/working/fine_tuned_mpnet 4", 'zip', "/kaggle/working/fine_tuned_mpnet")

'/kaggle/working/fine_tuned_mpnet 4.zip'