In [1]:
import wandb
wandb.login(key="bad6b8a0ac53c6665bbf6201ac36a3ab180041b7")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshravang[0m ([33mshravang-iiit-hyderabad[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import os
import numpy as np
import torch
from datasets import load_dataset, Dataset, load_from_disk
from transformers import (
    Trainer,
    TrainingArguments,
    TrainerCallback
)
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr, spearmanr
from tqdm.auto import tqdm

##############################################
# Custom Epoch-Level Progress Callback       #
##############################################
class SingleEpochProgressCallback(TrainerCallback):
    def on_epoch_begin(self, args, state, control, **kwargs):
        print(f"\nStarting epoch {state.epoch:.0f}/{args.num_train_epochs}")
        # Calculate steps per epoch if available.
        if state.max_steps and args.num_train_epochs:
            self.steps_per_epoch = int(state.max_steps / args.num_train_epochs)
        else:
            self.steps_per_epoch = 0
        self.progress_bar = tqdm(total=self.steps_per_epoch, desc=f"Epoch {state.epoch:.0f}")
    def on_step_end(self, args, state, control, **kwargs):
        if hasattr(self, "progress_bar"):
            self.progress_bar.update(1)
    def on_epoch_end(self, args, state, control, **kwargs):
        if hasattr(self, "progress_bar"):
            self.progress_bar.close()

##############################################
# Device and Dataset Paths                   #
##############################################
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
languages = ['de','en','es','fr','it','nl','pl','pt','ru','zh']
train_path = "/kaggle/working/distilbert_combined_train"
val_path   = "/kaggle/working/distilbert_combined_dev"

##############################################
# Dataset Loading & Combining                #
##############################################
def load_and_combine_split(split_name):
    print(f"Loading split '{split_name}' for all languages...")
    combined_examples = []
    for lang in languages:
        print(f"Loading language: {lang}")
        ds = load_dataset("PhilipMay/stsb_multi_mt", lang, split=split_name)
        print(f"Number of examples for {lang}: {len(ds)}")
        combined_examples.extend(ds)
    print("Creating combined dataset...")
    return Dataset.from_dict({
        "sentence1": [ex["sentence1"] for ex in combined_examples],
        "sentence2": [ex["sentence2"] for ex in combined_examples],
        "similarity_score": [ex["similarity_score"] for ex in combined_examples],
    })

if os.path.exists(train_path):
    train_dataset = load_from_disk(train_path)
    print("Loaded combined train dataset from disk.")
else:
    print("Loading and combining training split...")
    train_dataset = load_and_combine_split("train")
    train_dataset.save_to_disk(train_path)
    print("Saved combined train dataset to disk.")

if os.path.exists(val_path):
    val_dataset = load_from_disk(val_path)
    print("Loaded combined validation dataset from disk.")
else:
    print("Loading and combining validation split...")
    val_dataset = load_and_combine_split("dev")
    val_dataset.save_to_disk(val_path)
    print("Saved combined validation dataset to disk.")

# Optionally set dataset format to torch.
train_dataset.set_format("torch")
val_dataset.set_format("torch")
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))

##############################################
# Model and Tokenizer                        #
##############################################

model_name = "distilbert-base-multilingual-cased"
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

def model_init():
    model = DistilBertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=1,                # For regression
        problem_type="regression"    # Ensure proper handling of regression tasks
    )
    model.config.use_cache = False
    model.to(device)
    return model

##############################################
# Preprocessing & Tokenization               #
##############################################
def preprocess_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True, max_length=128)

print("Tokenizing train dataset...")
train_dataset = train_dataset.map(preprocess_function, batched=True)
print("Tokenizing validation dataset...")
val_dataset = val_dataset.map(preprocess_function, batched=True)

def set_labels(example):
    example["labels"] = float(example["similarity_score"])
    return example

print("Setting labels for train dataset...")
train_dataset = train_dataset.map(set_labels)
print("Setting labels for validation dataset...")
val_dataset = val_dataset.map(set_labels)

##############################################
# Metrics Calculation                        #
##############################################
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.flatten()
    pearson_corr = pearsonr(predictions, labels)[0]
    spearman_corr = spearmanr(predictions, labels)[0]
    mse = mean_squared_error(labels, predictions)
    pred_array = np.array(predictions)
    label_array = np.array(labels)
    dot = np.dot(pred_array, label_array)
    norm_pred = np.linalg.norm(pred_array)
    norm_label = np.linalg.norm(label_array)
    cosine_sim = dot / (norm_pred * norm_label) if norm_pred and norm_label else 0.0
    return {
        "pearson": pearson_corr,
        "spearman": spearman_corr,
        "mse": mse,
        "cosine": cosine_sim,
        "avg_corr": (pearson_corr + spearman_corr) / 2
    }

##############################################
# Training Arguments                         #
##############################################
training_args = TrainingArguments(
    output_dir="./distilbert_sts_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    save_total_limit=2,
    dataloader_num_workers=4
)

print("Starting training for DistilBERT...")
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.add_callback(SingleEpochProgressCallback())

trainer.train()
eval_results = trainer.evaluate()
print("DistilBERT Evaluation results:", eval_results)

# Save the final model after training.
final_model = trainer.model
final_model.save_pretrained("./distilbert_sts_finetuned")
print("Final model saved at './distilbert_sts_finetuned'.")


Using device: cuda
Loading and combining training split...
Loading split 'train' for all languages...
Loading language: de


README.md:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/537k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/123k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/163k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Number of examples for de: 5749
Loading language: en


train-00000-of-00001.parquet:   0%|          | 0.00/470k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/108k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/142k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Number of examples for en: 5749
Loading language: es


train-00000-of-00001.parquet:   0%|          | 0.00/528k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/119k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/157k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Number of examples for es: 5749
Loading language: fr


train-00000-of-00001.parquet:   0%|          | 0.00/542k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/123k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/163k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Number of examples for fr: 5749
Loading language: it


train-00000-of-00001.parquet:   0%|          | 0.00/532k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/122k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/159k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Number of examples for it: 5749
Loading language: nl


train-00000-of-00001.parquet:   0%|          | 0.00/517k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/116k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/153k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Number of examples for nl: 5749
Loading language: pl


train-00000-of-00001.parquet:   0%|          | 0.00/546k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/123k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/164k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Number of examples for pl: 5749
Loading language: pt


train-00000-of-00001.parquet:   0%|          | 0.00/523k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/119k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/158k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Number of examples for pt: 5749
Loading language: ru


train-00000-of-00001.parquet:   0%|          | 0.00/721k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/158k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/209k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Number of examples for ru: 5749
Loading language: zh


train-00000-of-00001.parquet:   0%|          | 0.00/468k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/107k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/140k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

Number of examples for zh: 5749
Creating combined dataset...


Saving the dataset (0/1 shards):   0%|          | 0/57490 [00:00<?, ? examples/s]

Saved combined train dataset to disk.
Loading and combining validation split...
Loading split 'dev' for all languages...
Loading language: de
Number of examples for de: 1500
Loading language: en
Number of examples for en: 1500
Loading language: es
Number of examples for es: 1500
Loading language: fr
Number of examples for fr: 1500
Loading language: it
Number of examples for it: 1500
Loading language: nl
Number of examples for nl: 1500
Loading language: pl
Number of examples for pl: 1500
Loading language: pt
Number of examples for pt: 1500
Loading language: ru
Number of examples for ru: 1500
Loading language: zh
Number of examples for zh: 1500
Creating combined dataset...


Saving the dataset (0/1 shards):   0%|          | 0/15000 [00:00<?, ? examples/s]

Saved combined validation dataset to disk.
Train dataset size: 57490
Validation dataset size: 15000


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Tokenizing train dataset...


Map:   0%|          | 0/57490 [00:00<?, ? examples/s]

Tokenizing validation dataset...


Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Setting labels for train dataset...


Map:   0%|          | 0/57490 [00:00<?, ? examples/s]

Setting labels for validation dataset...


Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Starting training for DistilBERT...


  trainer = Trainer(


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting epoch 0/3


Epoch 0:   0%|          | 0/3594 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss,Pearson,Spearman,Mse,Cosine,Avg Corr
1,0.7958,0.76468,0.818078,0.818142,0.76468,0.950611,0.81811
2,0.3658,0.754525,0.816555,0.814199,0.754525,0.950853,0.815377
3,0.2225,0.791248,0.80987,0.80743,0.791248,0.949145,0.80865



Starting epoch 1/3


Epoch 1:   0%|          | 0/3594 [00:00<?, ?it/s]


Starting epoch 2/3


Epoch 2:   0%|          | 0/3594 [00:00<?, ?it/s]

DistilBERT Evaluation results: {'eval_loss': 0.7912477254867554, 'eval_pearson': 0.8098701242675328, 'eval_spearman': 0.8074295370780157, 'eval_mse': 0.7912477254867554, 'eval_cosine': 0.9491446614265442, 'eval_avg_corr': 0.8086498306727743, 'eval_runtime': 16.5014, 'eval_samples_per_second': 909.013, 'eval_steps_per_second': 56.844, 'epoch': 3.0}
Final model saved at './distilbert_sts_finetuned'.


In [8]:
import os
import numpy as np
import torch
import pandas as pd
from datasets import load_from_disk, load_dataset, concatenate_datasets, Dataset
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments,
    DistilBertConfig
)
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm
import logging

# Suppress transformer warnings.
logging.getLogger("transformers").setLevel(logging.ERROR)

##############################################
# Define Languages and Concatenation Function#
##############################################
languages = ['de','en','es','fr','it','nl','pl','pt','ru','zh']

def load_and_concatenate_split(split_name):
    print(f"Loading split '{split_name}' for all languages...")
    datasets_list = []
    for lang in languages:
        print(f"Loading language {lang}...")
        ds = load_dataset("PhilipMay/stsb_multi_mt", lang, split=split_name)
        datasets_list.append(ds)
    print("Concatenating datasets from all languages...")
    combined_dataset = concatenate_datasets(datasets_list)
    return combined_dataset

##############################################
# Matching Configuration                     #
##############################################
config = DistilBertConfig.from_pretrained("distilbert-base-multilingual-cased")
config.num_labels = 1
config.problem_type = "regression"
config.use_cache = False

##############################################
# Load the Fine-Tuned Model (Non-Quantized)    #
##############################################
model_path = "/kaggle/working/distilbert_sts_finetuned"  # Use your non-quantized DistilBERT checkpoint.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained(model_path, config=config)
model.to(device)
model.eval()

##############################################
# Load the Tokenizer                         #
##############################################
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased")

##############################################
# Load and Prepare the Test Dataset          #
##############################################
test_path = "/kaggle/working/combined_test"
if os.path.exists(test_path):
    test_dataset = load_from_disk(test_path)
    print("Loaded combined test dataset from disk.")
else:
    print("Combined test dataset not found on disk; generating concatenated test dataset on the fly...")
    test_dataset = load_and_concatenate_split("test")
    test_dataset.save_to_disk(test_path)

# Tokenize test data if needed.
if "input_ids" not in test_dataset.column_names:
    def preprocess_function(example):
        return tokenizer(example["sentence1"], example["sentence2"], truncation=True, max_length=128)
    test_dataset = test_dataset.map(preprocess_function, batched=True, desc="Tokenizing")

# Ensure the labels are correctly set.
if "labels" not in test_dataset.column_names:
    def set_labels(example):
        example["labels"] = float(example["similarity_score"])
        return example
    test_dataset = test_dataset.map(set_labels, desc="Setting labels")

test_dataset.set_format("torch")

##############################################
# Setup Trainer (Using GPU)                  #
##############################################
training_args = TrainingArguments(
    output_dir="./eval_results",
    per_device_eval_batch_size=16,
    logging_strategy="no",
    report_to=None
)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

##############################################
# Manual Evaluation Loop with Progress Bar   #
##############################################
eval_dataloader = trainer.get_eval_dataloader()
all_preds = []

for batch in tqdm(eval_dataloader, desc="Evaluating"):
    # Move batch to GPU.
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    all_preds.append(outputs.logits.cpu().numpy())

preds = np.concatenate(all_preds).flatten()

# Also get gold scores and original sentences.
gold_scores = np.array(test_dataset["similarity_score"])
sentences1 = test_dataset["sentence1"]
sentences2 = test_dataset["sentence2"]

##############################################
# Generate CSV with Predictions              #
##############################################
df = pd.DataFrame({
    "sentence1": sentences1,
    "sentence2": sentences2,
    "gold_similarity_score": gold_scores,
    "predicted_similarity_score": preds
})

output_csv_path = "./evaluation_results.csv"
df.to_csv(output_csv_path, index=False)
print(f"CSV file with evaluation results saved at: {output_csv_path}")

##############################################
# Compute Metrics (Optional)                 #
##############################################
def compute_metrics_from_preds(preds, gold):
    pearson_corr = pearsonr(preds, gold)[0]
    spearman_corr = spearmanr(preds, gold)[0]
    mse = mean_squared_error(gold, preds)
    dot = np.dot(preds, gold)
    norm_pred = np.linalg.norm(preds)
    norm_gold = np.linalg.norm(gold)
    cosine_sim = dot / (norm_pred * norm_gold) if norm_pred and norm_gold else 0.0
    avg_corr = (pearson_corr + spearman_corr) / 2.0
    return {
        "pearson": pearson_corr,
        "spearman": spearman_corr,
        "mse": mse,
        "cosine": cosine_sim,
        "avg_corr": avg_corr
    }

metrics = compute_metrics_from_preds(preds, gold_scores)
print("Evaluation Metrics:")
print(metrics)


Combined test dataset not found on disk; generating concatenated test dataset on the fly...
Loading split 'test' for all languages...
Loading language de...
Loading language en...
Loading language es...
Loading language fr...
Loading language it...
Loading language nl...
Loading language pl...
Loading language pt...
Loading language ru...
Loading language zh...
Concatenating datasets from all languages...


Saving the dataset (0/1 shards):   0%|          | 0/13790 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/13790 [00:00<?, ? examples/s]

Setting labels:   0%|          | 0/13790 [00:00<?, ? examples/s]

  trainer = Trainer(


Evaluating:   0%|          | 0/862 [00:00<?, ?it/s]

CSV file with evaluation results saved at: ./evaluation_results.csv
Evaluation Metrics:
{'pearson': 0.77267361810789, 'spearman': 0.7631519898382821, 'mse': 0.94900113, 'cosine': 0.9467342, 'avg_corr': 0.767912803973086}


In [3]:
!zip -r full_model.zip /kaggle/working

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/wandb/ (stored 0%)
  adding: kaggle/working/wandb/debug.log (deflated 67%)
  adding: kaggle/working/wandb/debug-internal.log (deflated 68%)
  adding: kaggle/working/wandb/latest-run/ (stored 0%)
  adding: kaggle/working/wandb/latest-run/files/ (stored 0%)
  adding: kaggle/working/wandb/latest-run/files/wandb-metadata.json (deflated 47%)
  adding: kaggle/working/wandb/latest-run/files/output.log (deflated 43%)
  adding: kaggle/working/wandb/latest-run/files/requirements.txt (deflated 56%)
  adding: kaggle/working/wandb/latest-run/tmp/ (stored 0%)
  adding: kaggle/working/wandb/latest-run/tmp/code/ (stored 0%)
  adding: kaggle/working/wandb/latest-run/logs/ (stored 0%)
  adding: kaggle/working/wandb/latest-run/logs/debug.log (deflated 67%)
  adding: kaggle/working/wandb/latest-run/logs/debug-internal.log (deflated 68%)
  adding: kaggle/working/wandb/latest-run/logs/debug-core.log (deflated 58%)
  adding: kaggle/working/wandb/