In [1]:
!pip install transformers torch sentence-transformers datasets -q

In [2]:
# ==============================================================================
# Step 2: Imports and the Corrected, Definitive Custom Model
# ==============================================================================
import math
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, InputExample, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
# This is the correct base class to inherit from for full compatibility
from sentence_transformers.models import Transformer
from torch.utils.data import DataLoader

# Confirmation message; it is only reached if every import statement above succeeded.
print("Libraries installed and imported successfully.")

Libraries installed and imported successfully.


In [3]:
def normalize_embeddings(embeddings):
    """
    L2-normalize embeddings along their last (feature) dimension.

    Args:
        embeddings: A tensor whose last dimension holds the embedding features,
            e.g. a single vector of shape (dim,) or a batch of shape (batch, dim).

    Returns:
        A tensor of the same shape in which every embedding vector has unit L2 norm
        (all-zero vectors stay zero because `F.normalize` guards the division).
    """
    # dim=-1 is identical to the previous dim=1 for 2-D batches, but also works
    # for a single 1-D embedding, which used to raise an IndexError.
    return F.normalize(embeddings, p=2, dim=-1)

In [4]:
class CustomBGETransformer(Transformer):
    """
    Transformer module that also pools, optionally projects, and L2-normalizes.

    It subclasses `sentence_transformers.models.Transformer` and, in `forward`,
    adds a `sentence_embedding` entry to the feature dictionary, so it can be
    used as the only module of a `SentenceTransformer`.

    Persistence: the custom options (`pooling_strategy`, `projection_dim`) are
    registered in `config_keys` so they are written with the module config, and
    the projection head weights are written next to the model by `save`. When
    the module is constructed from a local directory containing those weights,
    they are restored in `__init__`.
    NOTE(review): restoring the options on reload relies on sentence-transformers
    passing the saved config entries back to `__init__` as keyword arguments --
    verify against the installed version.

    Args:
        model_name_or_path: Hugging Face model id or local directory.
        pooling_strategy: 'cls' (first token) or 'mean' (mask-aware average).
        projection_dim: Output size of the optional projection head, or None
            to disable the head.
        **kwargs: Forwarded unchanged to the parent `Transformer`.

    Raises:
        ValueError: If `pooling_strategy` is unknown or `projection_dim` is not positive.
    """

    # File name, inside a saved model directory, that stores the projection head weights.
    PROJECTION_HEAD_FILE = "projection_head.pt"

    def __init__(self, model_name_or_path, pooling_strategy='mean', projection_dim=None, **kwargs):
        # Validate first so a bad argument fails before the (slow) model load.
        if pooling_strategy not in ('cls', 'mean'):
            raise ValueError("pooling_strategy must be one of 'cls' or 'mean'")
        if projection_dim is not None and projection_dim <= 0:
            raise ValueError("projection_dim must be a positive integer or None")

        # Initialize the parent Transformer class. This handles loading the
        # underlying Hugging Face model and tokenizer.
        super().__init__(model_name_or_path, **kwargs)

        self.pooling_strategy = pooling_strategy
        self.projection_dim = projection_dim

        # Register the custom options so they are included in the saved module
        # config; otherwise a reloaded model silently falls back to the defaults.
        base_keys = list(getattr(self, 'config_keys', []))
        extra_keys = ['pooling_strategy', 'projection_dim']
        self.config_keys = base_keys + [key for key in extra_keys if key not in base_keys]

        hidden_size = self.get_word_embedding_dimension()

        # Define our optional, custom projection head
        if self.projection_dim is not None:
            self.projection_head = nn.Sequential(
                nn.Linear(hidden_size, hidden_size),
                nn.GELU(),
                nn.Linear(hidden_size, self.projection_dim)
            )
            # Restore previously saved head weights when loading from a local directory.
            head_path = os.path.join(str(model_name_or_path), self.PROJECTION_HEAD_FILE)
            if os.path.isfile(head_path):
                # weights_only=True: the file only contains tensors, so avoid
                # unpickling arbitrary objects.
                head_state = torch.load(head_path, map_location='cpu', weights_only=True)
                self.projection_head.load_state_dict(head_state)
        else:
            self.projection_head = None

        print(f"\nCustomBGETransformer initialized with:")
        print(f"  - Base Model: {model_name_or_path}")
        print(f"  - Pooling Strategy: {self.pooling_strategy}")
        print(f"  - Projection Head: {'Yes (' + str(self.projection_dim) + '-dim)' if self.projection_head is not None else 'No'}")

    def _mean_pooling(self, last_hidden_state, attention_mask):
        """Average the token embeddings over the sequence axis, ignoring masked-out positions."""
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        # Clamp so a row whose mask is all zeros does not divide by zero.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def forward(self, features, **kwargs):
        """
        Run the parent Transformer, then pool, project (optional) and normalize.

        Args:
            features: Tokenized input dictionary expected by the parent `Transformer`.
            **kwargs: Extra keyword arguments, forwarded to the parent `forward`.

        Returns:
            The feature dictionary with an added 'sentence_embedding' entry.
        """
        # Run the standard forward pass of the parent Transformer class.
        features = super().forward(features, **kwargs)

        last_hidden_state = features['token_embeddings']
        attention_mask = features['attention_mask']

        # Apply our custom pooling logic
        if self.pooling_strategy == 'cls':
            pooled_embeddings = last_hidden_state[:, 0]
        else:  # 'mean'
            pooled_embeddings = self._mean_pooling(last_hidden_state, attention_mask)

        # Explicit None check instead of relying on nn.Sequential truthiness.
        if self.projection_head is not None:
            pooled_embeddings = self.projection_head(pooled_embeddings)

        # Normalize the final embeddings and expose them as the sentence embedding.
        features['sentence_embedding'] = normalize_embeddings(pooled_embeddings)
        return features

    def get_sentence_embedding_dimension(self):
        """Return the size of the vectors stored under 'sentence_embedding'."""
        if self.projection_dim is not None:
            return self.projection_dim
        # Without a projection head, both pooling modes keep the hidden size.
        # Use get_word_embedding_dimension(), which __init__ already relies on,
        # rather than assuming the parent defines get_sentence_embedding_dimension().
        return self.get_word_embedding_dimension()

    def save(self, output_path, *args, **kwargs):
        """
        Save the parent Transformer and, if present, the projection head weights.

        Args:
            output_path: Directory the module is written to.
            *args, **kwargs: Forwarded unchanged to the parent `save`.
        """
        super().save(output_path, *args, **kwargs)
        if self.projection_head is not None:
            head_path = os.path.join(output_path, self.PROJECTION_HEAD_FILE)
            torch.save(self.projection_head.state_dict(), head_path)

In [6]:
# ==============================================================================
# Step 3: Load and Prepare the MTEB Dataset (STSBenchmark)
# ==============================================================================
print("\nLoading STSBenchmark dataset from MTEB...")
stsb_dataset = load_dataset("mteb/stsbenchmark-sts")

train_samples, dev_samples, test_samples = [], [], []

# Route each known dataset split to the list that collects its examples.
samples_by_split = {
    'train': train_samples,
    'validation': dev_samples,
    'test': test_samples,
}

for split_name in stsb_dataset:
    target_samples = samples_by_split.get(split_name)
    if target_samples is None:
        # Splits other than train/validation/test are not used.
        continue
    target_samples.extend(
        InputExample(
            texts=[row['sentence1'], row['sentence2']],
            label=float(row['score']) / 5.0,  # Normalize score to [0, 1]
        )
        for row in stsb_dataset[split_name]
    )

print(f"Dataset loaded. Train: {len(train_samples)}, Dev: {len(dev_samples)}, Test: {len(test_samples)}")


Loading STSBenchmark dataset from MTEB...
Dataset loaded. Train: 5749, Dev: 1500, Test: 1379


In [18]:
# ==============================================================================
# Step 4: Configure the Model and Training
# ==============================================================================
MODEL_NAME = 'BAAI/bge-small-en-v1.5'

# --- CHOOSE YOUR CONFIGURATION HERE ---
# Edit the two constants below instead of commenting code in and out.
# Scores noted by the original author for each configuration:
#   POOLING_STRATEGY='mean', PROJECTION_DIM=None  -> 90% (recommended starting point)
#   POOLING_STRATEGY='cls',  PROJECTION_DIM=None  -> 87%
#   POOLING_STRATEGY='mean', PROJECTION_DIM=256   -> 86%
POOLING_STRATEGY = 'mean'
PROJECTION_DIM = None

custom_transformer_module = CustomBGETransformer(
    MODEL_NAME,
    pooling_strategy=POOLING_STRATEGY,
    projection_dim=PROJECTION_DIM,
)

# Wrap the custom module so it can be trained and evaluated as a SentenceTransformer.
model = SentenceTransformer(modules=[custom_transformer_module])


CustomBGETransformer initialized with:
  - Base Model: BAAI/bge-small-en-v1.5
  - Pooling Strategy: mean
  - Projection Head: Yes (256-dim)


In [19]:
# --- Training Parameters ---
model_save_path = 'output/bge-small-finetuned-stsb'
num_epochs = 2
train_batch_size = 16

# Training data is served in shuffled batches; the loss compares the cosine
# similarity of each sentence pair with its normalized gold score.
train_dataloader = DataLoader(train_samples, batch_size=train_batch_size, shuffle=True)
train_loss = losses.CosineSimilarityLoss(model=model)

# The dev split is used for the periodic evaluation during training.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Warm up the learning rate over the first 10% of all training steps.
total_train_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_train_steps * 0.1)

In [20]:
# ==============================================================================
# Step 5: Fine-Tune the Model
# ==============================================================================
print("\nStarting model fine-tuning...")

# Gather the training options in one place so the fit call stays short.
fit_options = {
    'evaluator': evaluator,
    'epochs': num_epochs,
    'evaluation_steps': 500,
    'warmup_steps': warmup_steps,
    'output_path': model_save_path,
    'show_progress_bar': True,
}
model.fit(train_objectives=[(train_dataloader, train_loss)], **fit_options)

print(f"\nFine-tuning complete. Best model saved to: {model_save_path}")


Starting model fine-tuning...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss,Validation Loss,Sts-dev Pearson Cosine,Sts-dev Spearman Cosine
360,No log,No log,0.886214,0.888489
500,0.026700,No log,0.886812,0.889321
720,0.026700,No log,0.887885,0.890387



Fine-tuning complete. Best model saved to: output/bge-small-finetuned-stsb


In [22]:
# ==============================================================================
# Step 6: Evaluate the Fine-Tuned Model on the Test Set
# ==============================================================================
print("\nEvaluating the fine-tuned model on the test set...")

# Replace the in-memory model with the one that training wrote to disk.
model = SentenceTransformer(model_save_path)

# Build the evaluator from the held-out test pairs and run it once;
# the evaluator receives the model directory as its output path.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    name='sts-test',
)
test_evaluator(model, output_path=model_save_path)

print("\nEvaluation on test set complete. Results are saved in the output directory.")


Evaluating the fine-tuned model on the test set...

CustomBGETransformer initialized with:
  - Base Model: output/bge-small-finetuned-stsb
  - Pooling Strategy: mean
  - Projection Head: No

Evaluation on test set complete. Results are saved in the output directory.


In [23]:
# Run the test-set evaluator on the current `model` and print the metrics it returns.
print("\nEvaluation Results:")
# NOTE(review): this runs the evaluator again; `model`, `test_evaluator` and
# `model_save_path` must already exist from earlier cells.
results = test_evaluator(model, output_path=model_save_path)
print(results)


Evaluation Results:
{'sts-test_pearson_cosine': 0.8675687030074057, 'sts-test_spearman_cosine': 0.8731718078957199}
