In [1]:
import os

import wandb

# Read the Weights & Biases API key from the environment (e.g. exported from a
# Kaggle notebook secret) so that no credential is stored in the notebook.
# The key that used to be hardcoded on this line must be treated as leaked and
# revoked in the W&B account settings.
# When WANDB_API_KEY is unset, key=None lets wandb fall back to its own lookup
# (existing netrc entry or an interactive prompt).
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshravang[0m ([33mshravang-iiit-hyderabad[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# XLM-R training without PEFT and without permutation

In [3]:
import os
import numpy as np
import torch
from datasets import load_dataset, Dataset, load_from_disk, concatenate_datasets
from transformers import (
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizerFast,
    Trainer,
    TrainingArguments,
    TrainerCallback
)
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr, spearmanr
from tqdm.auto import tqdm

##############################################
# Custom Epoch-Level Progress Callback       #
##############################################
class SingleEpochProgressCallback(TrainerCallback):
    """Trainer callback that prints a header and shows one tqdm bar per epoch.

    The bar is opened in ``on_epoch_begin``, advanced by one on every
    ``on_step_end`` and closed in ``on_epoch_end`` (or in ``on_train_end`` when
    training stops before an epoch finishes).
    """

    def on_epoch_begin(self, args, state, control, **kwargs):
        """Print the 1-based epoch header and open a fresh progress bar."""
        # state.epoch holds the number of epochs already completed (the recorded
        # run printed "Starting epoch 0/3" for the first epoch), so add 1 to get
        # a label that lines up with the total. A missing value is treated as 0
        # instead of failing inside the format call.
        epoch_number = int(state.epoch or 0) + 1
        print(f"\nStarting epoch {epoch_number}/{args.num_train_epochs}")
        # Calculate steps per epoch if available.
        if state.max_steps and args.num_train_epochs:
            self.steps_per_epoch = int(state.max_steps / args.num_train_epochs)
        else:
            self.steps_per_epoch = 0
        # Never leave a bar from an interrupted epoch open.
        self._close_progress_bar()
        # total=None makes tqdm show a plain counter when the step count is unknown.
        self.progress_bar = tqdm(
            total=self.steps_per_epoch or None,
            desc=f"Epoch {epoch_number}",
        )

    def on_step_end(self, args, state, control, **kwargs):
        """Advance the current epoch's bar by one step, if a bar is open."""
        bar = getattr(self, "progress_bar", None)
        if bar is not None:
            bar.update(1)

    def on_epoch_end(self, args, state, control, **kwargs):
        """Close the bar of the epoch that just finished."""
        self._close_progress_bar()

    def on_train_end(self, args, state, control, **kwargs):
        """Close a bar that is still open when training ends."""
        self._close_progress_bar()

    def _close_progress_bar(self):
        """Close and forget the current bar; a no-op when none is open."""
        bar = getattr(self, "progress_bar", None)
        if bar is not None:
            bar.close()
            self.progress_bar = None

##############################################
# Device and Dataset Paths                   #
##############################################
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Language codes; each one is passed to load_dataset() as the configuration name.
languages = [
    'de', 'en', 'es', 'fr', 'it',
    'nl', 'pl', 'pt', 'ru', 'zh',
]

# On-disk cache locations of the combined (all-language) splits.
train_path, val_path = "/kaggle/working/combined_train", "/kaggle/working/combined_dev"

##############################################
# Dataset Loading & Combining                #
##############################################
def load_and_combine_split(split_name):
    """Load ``split_name`` of PhilipMay/stsb_multi_mt for every language and concatenate.

    Args:
        split_name: Split identifier forwarded to ``load_dataset``.

    Returns:
        The concatenation of the per-language datasets, in ``languages`` order.
    """
    print(f"Loading split '{split_name}' for all languages...")
    per_language = []
    for code in languages:
        print(f"Loading language {code}...")
        per_language.append(
            load_dataset("PhilipMay/stsb_multi_mt", code, split=split_name)
        )
    print("Concatenating datasets from all languages...")
    return concatenate_datasets(per_language)

# Build-once cache: a combined split is created and saved only when its
# directory is missing; otherwise the saved copy is read back from disk.
if not os.path.exists(train_path):
    print("Loading and combining training split...")
    train_dataset = load_and_combine_split("train")
    train_dataset.save_to_disk(train_path)
    print("Saved combined train dataset to disk.")
else:
    train_dataset = load_from_disk(train_path)
    print("Loaded combined train dataset from disk.")

if not os.path.exists(val_path):
    print("Loading and combining validation split...")
    val_dataset = load_and_combine_split("dev")
    val_dataset.save_to_disk(val_path)
    print("Saved combined validation dataset to disk.")
else:
    val_dataset = load_from_disk(val_path)
    print("Loaded combined validation dataset from disk.")

# Switch both splits to the "torch" output format, then report their sizes.
train_dataset.set_format("torch")
val_dataset.set_format("torch")
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))

##############################################
# Model and Tokenizer                        #
##############################################
# Checkpoint identifier used for the tokenizer here and for the model in model_init().
model_name = "xlm-roberta-base"
# Fast tokenizer loaded from the same checkpoint; used by preprocess_function and the Trainer.
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_name)

def model_init():
    """Create a fresh XLM-R sequence-classification model set up for regression.

    The model is loaded from ``model_name`` with a single output label, has
    ``config.use_cache`` switched off, and is moved to the module-level ``device``.

    Returns:
        The initialised model.
    """
    regressor = XLMRobertaForSequenceClassification.from_pretrained(
        model_name, num_labels=1, problem_type="regression"
    )
    # Disable cache to avoid warnings.
    regressor.config.use_cache = False
    regressor.to(device)
    return regressor

##############################################
# Preprocessing & Tokenization               #
##############################################
def preprocess_function(example):
    """Tokenize ``sentence1``/``sentence2`` as a pair, truncated to at most 128 tokens."""
    first, second = example["sentence1"], example["sentence2"]
    return tokenizer(first, second, truncation=True, max_length=128)

# Tokenize both splits. batched=True makes map() call preprocess_function on
# batches of rows; the returned tokenizer fields are added to each dataset.
print("Tokenizing train dataset...")
train_dataset = train_dataset.map(preprocess_function, batched=True)
print("Tokenizing validation dataset...")
val_dataset = val_dataset.map(preprocess_function, batched=True)

def set_labels(example):
    """Store the row's ``similarity_score`` as a Python float under ``labels``.

    The row is modified in place and returned.
    """
    score = example["similarity_score"]
    example["labels"] = float(score)
    return example

# Add the float "labels" column to both splits (set_labels runs once per row).
print("Setting labels for train dataset...")
train_dataset = train_dataset.map(set_labels)
print("Setting labels for validation dataset...")
val_dataset = val_dataset.map(set_labels)

##############################################
# Metrics Calculation                        #
##############################################
def compute_metrics(eval_pred):
    """Compute regression metrics for a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Two-item sequence of predictions and labels; the predictions
            are flattened before use.

    Returns:
        dict with ``pearson``, ``spearman``, ``mse``, ``cosine`` (cosine
        similarity between the prediction vector and the label vector, 0.0 if
        either has zero norm) and ``avg_corr`` (mean of Pearson and Spearman).
    """
    raw_preds, gold = eval_pred
    scores = raw_preds.flatten()

    pearson_corr = pearsonr(scores, gold)[0]
    spearman_corr = spearmanr(scores, gold)[0]
    mse = mean_squared_error(gold, scores)

    # Cosine similarity of the two score vectors, guarded against zero norms.
    score_vec, gold_vec = np.array(scores), np.array(gold)
    score_norm = np.linalg.norm(score_vec)
    gold_norm = np.linalg.norm(gold_vec)
    if score_norm and gold_norm:
        cosine_sim = np.dot(score_vec, gold_vec) / (score_norm * gold_norm)
    else:
        cosine_sim = 0.0

    return {
        "pearson": pearson_corr,
        "spearman": spearman_corr,
        "mse": mse,
        "cosine": cosine_sim,
        "avg_corr": (pearson_corr + spearman_corr) / 2,
    }

##############################################
# Training Arguments                         #
##############################################
# Hyperparameters for the fine-tuning run: evaluate, checkpoint and log once per
# epoch, keeping at most two checkpoints in output_dir.
training_args = TrainingArguments(
    output_dir="./xlmroberta_sts_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Mixed-precision training needs an accelerator. The notebook explicitly
    # supports a CPU fallback for `device`, so only request fp16 when CUDA is
    # present instead of failing at argument validation on CPU.
    fp16=torch.cuda.is_available(),
    save_total_limit=2,
    dataloader_num_workers=4
)

print("Starting training for XLM-R...")
# The Trainer is given model_init instead of a model instance, so it creates
# the model itself by calling model_init().
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
# Register the per-epoch tqdm progress bar.
trainer.add_callback(SingleEpochProgressCallback())

# Fine-tune, then run one more evaluation pass on the validation split.
trainer.train()
eval_results = trainer.evaluate()
print("XLM-R Evaluation results:", eval_results)

# Save the final fine-tuned model without quantization.
# NOTE(review): this writes into the same directory as TrainingArguments.output_dir,
# and this call saves the model only; the evaluation cells reload the tokenizer
# from "xlm-roberta-base".
final_model = trainer.model
final_model.eval()
final_model.save_pretrained("./xlmroberta_sts_finetuned")
print("Final fine-tuned XLM-R model saved at './xlmroberta_sts_finetuned'.")


Using device: cuda
Loaded combined train dataset from disk.
Loaded combined validation dataset from disk.
Train dataset size: 57490
Validation dataset size: 15000
Tokenizing train dataset...


Map:   0%|          | 0/57490 [00:00<?, ? examples/s]

Tokenizing validation dataset...


Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Setting labels for train dataset...


Map:   0%|          | 0/57490 [00:00<?, ? examples/s]

Setting labels for validation dataset...


Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Starting training for XLM-R...


  trainer = Trainer(
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting epoch 0/3


Epoch 0:   0%|          | 0/3594 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss,Pearson,Spearman,Mse,Cosine,Avg Corr
1,0.7656,0.580416,0.869315,0.86495,0.580416,0.964246,0.867132
2,0.3216,0.583129,0.866318,0.864279,0.583129,0.962234,0.865298
3,0.1925,0.574017,0.86922,0.866355,0.574017,0.964183,0.867787



Starting epoch 1/3


Epoch 1:   0%|          | 0/3594 [00:00<?, ?it/s]


Starting epoch 2/3


Epoch 2:   0%|          | 0/3594 [00:00<?, ?it/s]

XLM-R Evaluation results: {'eval_loss': 0.5740170478820801, 'eval_pearson': 0.869220029663228, 'eval_spearman': 0.8663548553431225, 'eval_mse': 0.5740170478820801, 'eval_cosine': 0.9641826152801514, 'eval_avg_corr': 0.8677874425031753, 'eval_runtime': 30.4101, 'eval_samples_per_second': 493.257, 'eval_steps_per_second': 30.845, 'epoch': 3.0}
Final fine-tuned XLM-R model saved at './xlmroberta_sts_finetuned'.


# Analysis on the test set (without PEFT, without permutation), with CSV generation

In [6]:
import os
import numpy as np
import torch
import pandas as pd
from datasets import load_from_disk, load_dataset, concatenate_datasets, Dataset
from transformers import (
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizerFast,
    Trainer,
    TrainingArguments,
    XLMRobertaConfig
)
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm
import logging

# Raise the "transformers" logger to ERROR so its warnings are not shown.
logging.getLogger("transformers").setLevel(logging.ERROR)

##############################################
# Define Languages and Concatenation Function#
##############################################
# Each code is passed to load_dataset() as the dataset configuration name.
languages = ['de','en','es','fr','it','nl','pl','pt','ru','zh']

def load_and_concatenate_split(split_name):
    """Load ``split_name`` of PhilipMay/stsb_multi_mt for every language and concatenate.

    Args:
        split_name: Split identifier forwarded to ``load_dataset``.

    Returns:
        The concatenation of the per-language datasets, in ``languages`` order.
    """
    print(f"Loading split '{split_name}' for all languages...")
    per_language = []
    for code in languages:
        print(f"Loading language {code}...")
        per_language.append(
            load_dataset("PhilipMay/stsb_multi_mt", code, split=split_name)
        )
    print("Concatenating datasets from all languages...")
    return concatenate_datasets(per_language)

##############################################
# Matching Configuration                     #
##############################################
# Start from the base checkpoint's configuration and override the fields below:
# a single regression output and no cache.
config = XLMRobertaConfig.from_pretrained("xlm-roberta-base")
config.num_labels = 1
config.problem_type = "regression"
config.use_cache = False
# Set these to match your training checkpoint dimensions.
config.vocab_size = 250002
config.max_position_embeddings = 514
config.type_vocab_size = 1

##############################################
# Load the Fine-Tuned Model (Non-Quantized)    #
##############################################
# IMPORTANT: To use CUDA, load a non-quantized version of your model.
model_path = "/kaggle/working/xlmroberta_sts_finetuned"  # use the non-quantized checkpoint here
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the fine-tuned checkpoint with the explicit config built above, move it
# to the selected device and put it in evaluation mode.
model = XLMRobertaForSequenceClassification.from_pretrained(model_path, config=config)
model.to(device)
model.eval()

##############################################
# Load the Tokenizer                         #
##############################################
# The tokenizer comes from the base checkpoint name, not from model_path.
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

##############################################
# Load and Prepare the Test Dataset          #
##############################################
# Combined test split: build and cache it when missing, otherwise reuse the saved copy.
test_path = "/kaggle/working/combined_test"
if not os.path.exists(test_path):
    print("Combined test dataset not found on disk; generating concatenated test dataset on the fly...")
    test_dataset = load_and_concatenate_split("test")
    test_dataset.save_to_disk(test_path)
else:
    test_dataset = load_from_disk(test_path)
    print("Loaded combined test dataset from disk.")

# Tokenize only when the dataset does not already carry token ids.
if "input_ids" not in test_dataset.column_names:
    def preprocess_function(example):
        """Tokenize the sentence pair(s), truncated to at most 128 tokens."""
        first, second = example["sentence1"], example["sentence2"]
        return tokenizer(first, second, truncation=True, max_length=128)
    test_dataset = test_dataset.map(preprocess_function, batched=True, desc="Tokenizing")

# Add a "labels" column (a one-element list per row) unless it already exists.
if "labels" not in test_dataset.column_names:
    def set_labels(example):
        """Store the similarity score as a single-element float list under ``labels``."""
        score = float(example["similarity_score"])
        example["labels"] = [score]
        return example
    test_dataset = test_dataset.map(set_labels, desc="Setting labels")

test_dataset.set_format("torch")

##############################################
# Setup Trainer (Using GPU)                  #
##############################################
training_args = TrainingArguments(
    output_dir="./eval_results",
    per_device_eval_batch_size=16,
    logging_strategy="no",
    # The string "none" disables every reporting integration. The Python value
    # None does not: it is treated as "unset", which in transformers 4.x falls
    # back to reporting to all installed integrations (e.g. W&B).
    report_to="none",
)

# The Trainer is used here only to build the batched evaluation dataloader
# (trainer.get_eval_dataloader() below); no training is started in this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

##############################################
# Manual Evaluation Loop with Progress Bar   #
##############################################
eval_dataloader = trainer.get_eval_dataloader()
all_preds = []

# Gradients are never needed here, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # Move batch to GPU.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        all_preds.append(outputs.logits.cpu().numpy())

# Stack the per-batch logits and flatten them into one 1-D prediction array.
preds = np.concatenate(all_preds).flatten()

# Gold scores and the original sentence pairs, taken straight from the dataset.
gold_scores = np.array(test_dataset["similarity_score"])
sentences1, sentences2 = test_dataset["sentence1"], test_dataset["sentence2"]

##############################################
# Generate CSV with Predictions              #
##############################################
# Destination of the per-pair results table.
output_csv_path = "./evaluation_results.csv"

# One row per sentence pair: both sentences, the gold score and the prediction.
df = pd.DataFrame(
    {
        "sentence1": sentences1,
        "sentence2": sentences2,
        "gold_similarity_score": gold_scores,
        "predicted_similarity_score": preds,
    }
)
df.to_csv(output_csv_path, index=False)
print(f"CSV file with evaluation results saved at: {output_csv_path}")

##############################################
# Compute Metrics (Optional)                 #
##############################################
def compute_metrics_from_preds(preds, gold):
    """Compute regression metrics between predicted and gold scores.

    Args:
        preds: Predicted scores.
        gold: Gold scores, aligned with ``preds``.

    Returns:
        dict with ``pearson``, ``spearman``, ``mse``, ``cosine`` (cosine
        similarity of the two vectors, 0.0 if either has zero norm) and
        ``avg_corr`` (mean of Pearson and Spearman).
    """
    pearson_corr = pearsonr(preds, gold)[0]
    spearman_corr = spearmanr(preds, gold)[0]

    # Cosine similarity of the two score vectors, guarded against zero norms.
    preds_norm, gold_norm = np.linalg.norm(preds), np.linalg.norm(gold)
    if preds_norm and gold_norm:
        cosine_sim = np.dot(preds, gold) / (preds_norm * gold_norm)
    else:
        cosine_sim = 0.0

    return {
        "pearson": pearson_corr,
        "spearman": spearman_corr,
        "mse": mean_squared_error(gold, preds),
        "cosine": cosine_sim,
        "avg_corr": (pearson_corr + spearman_corr) / 2.0,
    }

# Score the test-set predictions against the gold scores and print the metric dict.
metrics = compute_metrics_from_preds(preds, gold_scores)
print("Evaluation Metrics:")
print(metrics)


Combined test dataset not found on disk; generating concatenated test dataset on the fly...
Loading split 'test' for all languages...
Loading language de...
Loading language en...
Loading language es...
Loading language fr...
Loading language it...
Loading language nl...
Loading language pl...
Loading language pt...
Loading language ru...
Loading language zh...
Concatenating datasets from all languages...


Saving the dataset (0/1 shards):   0%|          | 0/13790 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/13790 [00:00<?, ? examples/s]

Setting labels:   0%|          | 0/13790 [00:00<?, ? examples/s]

  trainer = Trainer(


Evaluating:   0%|          | 0/862 [00:00<?, ?it/s]

CSV file with evaluation results saved at: ./evaluation_results.csv
Evaluation Metrics:
{'pearson': 0.8272604417102964, 'spearman': 0.8168667922807934, 'mse': 0.7521109, 'cosine': 0.9588413, 'avg_corr': 0.8220636169955449}


# Analysis on the dev set (without PEFT, without permutation), without CSV generation

In [7]:
import os
import numpy as np
import torch
from datasets import load_from_disk, load_dataset, concatenate_datasets, Dataset
from transformers import (
    XLMRobertaForSequenceClassification,
    XLMRobertaTokenizerFast,
    Trainer,
    TrainingArguments,
    XLMRobertaConfig
)
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm
import logging

# Raise the "transformers" logger to ERROR so its warnings are not shown.
logging.getLogger("transformers").setLevel(logging.ERROR)

##############################################
# Define Languages and Concatenation Function#
##############################################
# Each code is passed to load_dataset() as the dataset configuration name.
languages = ['de','en','es','fr','it','nl','pl','pt','ru','zh']

def load_and_concatenate_split(split_name):
    """Load ``split_name`` of PhilipMay/stsb_multi_mt for every language and concatenate.

    Args:
        split_name: Split identifier forwarded to ``load_dataset``.

    Returns:
        The concatenation of the per-language datasets, in ``languages`` order.
    """
    print(f"Loading split '{split_name}' for all languages...")
    per_language = []
    for code in languages:
        print(f"Loading language {code}...")
        per_language.append(
            load_dataset("PhilipMay/stsb_multi_mt", code, split=split_name)
        )
    print("Concatenating datasets from all languages...")
    return concatenate_datasets(per_language)

##############################################
# Matching Configuration                     #
##############################################
# Start from the base checkpoint's configuration and override the fields below:
# a single regression output and no cache.
config = XLMRobertaConfig.from_pretrained("xlm-roberta-base")
config.num_labels = 1
config.problem_type = "regression"
config.use_cache = False
# Set these to match your training checkpoint dimensions.
config.vocab_size = 250002
config.max_position_embeddings = 514
config.type_vocab_size = 1

##############################################
# Load the Fine-Tuned Model (Non-Quantized)    #
##############################################
# IMPORTANT: To use CUDA, load a non-quantized version of your model.
model_path = "/kaggle/working/xlmroberta_sts_finetuned"  # use the non-quantized checkpoint here
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the fine-tuned checkpoint with the explicit config built above, move it
# to the selected device and put it in evaluation mode.
model = XLMRobertaForSequenceClassification.from_pretrained(model_path, config=config)
model.to(device)
model.eval()

##############################################
# Load the Tokenizer                         #
##############################################
# The tokenizer comes from the base checkpoint name, not from model_path.
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

##############################################
# Load and Prepare the Dev Dataset           #
##############################################
# Combined dev split: build and cache it when missing, otherwise reuse the saved copy.
dev_path = "/kaggle/working/combined_dev"
if not os.path.exists(dev_path):
    print("Combined dev dataset not found on disk; generating concatenated dev dataset on the fly...")
    dev_dataset = load_and_concatenate_split("dev")
    dev_dataset.save_to_disk(dev_path)
else:
    dev_dataset = load_from_disk(dev_path)
    print("Loaded combined dev dataset from disk.")

# Tokenize only when the dataset does not already carry token ids.
if "input_ids" not in dev_dataset.column_names:
    def preprocess_function(example):
        """Tokenize the sentence pair(s), truncated to at most 128 tokens."""
        first, second = example["sentence1"], example["sentence2"]
        return tokenizer(first, second, truncation=True, max_length=128)
    dev_dataset = dev_dataset.map(preprocess_function, batched=True, desc="Tokenizing")

# Add a "labels" column (a one-element list per row) unless it already exists.
if "labels" not in dev_dataset.column_names:
    def set_labels(example):
        """Store the similarity score as a single-element float list under ``labels``."""
        score = float(example["similarity_score"])
        example["labels"] = [score]
        return example
    dev_dataset = dev_dataset.map(set_labels, desc="Setting labels")

dev_dataset.set_format("torch")

##############################################
# Setup Trainer (Using GPU)                  #
##############################################
training_args = TrainingArguments(
    output_dir="./eval_results",
    per_device_eval_batch_size=16,
    logging_strategy="no",
    # The string "none" disables every reporting integration. The Python value
    # None does not: it is treated as "unset", which in transformers 4.x falls
    # back to reporting to all installed integrations (e.g. W&B).
    report_to="none",
)

# The Trainer is used here only to build the batched evaluation dataloader
# (trainer.get_eval_dataloader() below); no training is started in this cell.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer
)

##############################################
# Manual Evaluation Loop with Progress Bar   #
##############################################
eval_dataloader = trainer.get_eval_dataloader()
all_preds = []

# Gradients are never needed here, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # Move batch to GPU.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        all_preds.append(outputs.logits.cpu().numpy())

# Stack the per-batch logits and flatten them into one 1-D prediction array.
preds = np.concatenate(all_preds).flatten()

# Gold scores, taken straight from the dataset column.
gold_scores = np.array(dev_dataset["similarity_score"])

##############################################
# Compute Metrics                            #
##############################################
def compute_metrics_from_preds(preds, gold):
    """Compute regression metrics between predicted and gold scores.

    Args:
        preds: Predicted scores.
        gold: Gold scores, aligned with ``preds``.

    Returns:
        dict with ``pearson``, ``spearman``, ``mse``, ``cosine`` (cosine
        similarity of the two vectors, 0.0 if either has zero norm) and
        ``avg_corr`` (mean of Pearson and Spearman).
    """
    pearson_corr = pearsonr(preds, gold)[0]
    spearman_corr = spearmanr(preds, gold)[0]

    # Cosine similarity of the two score vectors, guarded against zero norms.
    preds_norm, gold_norm = np.linalg.norm(preds), np.linalg.norm(gold)
    if preds_norm and gold_norm:
        cosine_sim = np.dot(preds, gold) / (preds_norm * gold_norm)
    else:
        cosine_sim = 0.0

    return {
        "pearson": pearson_corr,
        "spearman": spearman_corr,
        "mse": mean_squared_error(gold, preds),
        "cosine": cosine_sim,
        "avg_corr": (pearson_corr + spearman_corr) / 2.0,
    }

# Score the dev-set predictions against the gold scores and print the metric dict.
metrics = compute_metrics_from_preds(preds, gold_scores)
print("Evaluation Metrics on Dev Set:")
print(metrics)


Loaded combined dev dataset from disk.


Tokenizing:   0%|          | 0/15000 [00:00<?, ? examples/s]

Setting labels:   0%|          | 0/15000 [00:00<?, ? examples/s]

  trainer = Trainer(


Evaluating:   0%|          | 0/938 [00:00<?, ?it/s]

Evaluation Metrics on Dev Set:
{'pearson': 0.8691959427411435, 'spearman': 0.8663298933496116, 'mse': 0.5742456, 'cosine': 0.9641784, 'avg_corr': 0.8677629180453776}


In [4]:
# Recursively archive everything under /kaggle/working into XLM_trained.zip.
# NOTE(review): per the recorded output this also packs the wandb/ run logs and
# the cached datasets — confirm that is intended before sharing the archive.
!zip -r XLM_trained.zip /kaggle/working

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/.virtual_documents/ (stored 0%)
  adding: kaggle/working/combined_dev/ (stored 0%)
  adding: kaggle/working/combined_dev/state.json (deflated 41%)
  adding: kaggle/working/combined_dev/data-00000-of-00001.arrow (deflated 60%)
  adding: kaggle/working/combined_dev/cache-c349c6dabdf0f66e.arrow (deflated 65%)
  adding: kaggle/working/combined_dev/dataset_info.json (deflated 58%)
  adding: kaggle/working/combined_dev/cache-b05a34031e2bc56c.arrow (deflated 65%)
  adding: kaggle/working/wandb/ (stored 0%)
  adding: kaggle/working/wandb/latest-run/ (stored 0%)
  adding: kaggle/working/wandb/latest-run/run-lcjjyjoi.wandb (deflated 79%)
  adding: kaggle/working/wandb/latest-run/logs/ (stored 0%)
  adding: kaggle/working/wandb/latest-run/logs/debug-core.log (deflated 58%)
  adding: kaggle/working/wandb/latest-run/logs/debug-internal.log (deflated 71%)
  adding: kaggle/working/wandb/latest-run/logs/debug.log (deflated 80%)
  adding: k