# Transfer Learning

Inspired by: https://github.com/openai/openai-cookbook/blob/main/examples/Customizing_embeddings.ipynb

In [None]:
# imports

from typing import List, Tuple  # for type hints

import numpy as np  # for manipulating arrays
import pandas as pd  # for manipulating data in dataframes
import plotly.express as px  # for plots
import random  # for generating run IDs
from sklearn.model_selection import train_test_split  # for splitting train & test data
import torch  # for matrix optimization
import torch.nn as nn 
import torch.nn.functional as F
import torchmetrics
import os
import datasets
from livelossplot import PlotLosses
from sentence_transformers.util import pairwise_angle_sim

from mteb import MTEB

# Seed every random number generator used in this notebook with the same value
# so that repeated runs draw the same random numbers.
seed = 42
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(seed)

In [None]:
# Select the device used for every tensor and model in this notebook.
# CUDA availability is detected, but execution is currently pinned to the CPU;
# set FORCE_CPU to False to use the GPU when one is present. The override is
# an explicit flag so the detection is no longer silently overwritten, and
# `device` is always a torch.device rather than a bare string.
FORCE_CPU = True
use_cuda = torch.cuda.is_available() and not FORCE_CPU
device = torch.device('cuda' if use_cuda else 'cpu')
print('Using device:', device)

## 1. Load and process input data

In [None]:
def load_dataset_for_task(task_name: str):
    """Load every non-test split of an MTEB task into one DataFrame.

    The task is resolved through ``MTEB`` with ``task_langs=["en"]``, its data
    is loaded, and each split other than ``'test'`` is converted to pandas and
    concatenated in iteration order. The name of every kept split is printed.
    """
    evaluation = MTEB(tasks=[task_name], task_langs=["en"])
    task = evaluation.tasks[0]
    task.load_data()
    assert task.data_loaded, "Data was not loaded"

    # The test split is deliberately left out of the returned data.
    kept_splits = [name for name in task.dataset if name != 'test']
    frames = []
    for name in kept_splits:
        print(f"Split: {name}")
        frames.append(task.dataset[name].to_pandas())
    return pd.concat(frames)

def process_dataset(train_df: pd.DataFrame) -> pd.DataFrame:
    """Keep the sentence-pair columns and min-max scale ``score`` to [0, 1].

    Args:
        train_df: Frame holding at least the columns ``sentence1``,
            ``sentence2`` and ``score``.

    Returns:
        A new frame with only those three columns, where ``score`` is rescaled
        so its smallest value maps to 0 and its largest to 1. The input frame
        is not modified.

    Note:
        If every score is identical the range is zero and the rescaled column
        comes out as NaN.
    """
    # Copy the column selection explicitly: assigning into a bare selection is
    # what triggers pandas' SettingWithCopyWarning.
    processed = train_df[['sentence1', 'sentence2', 'score']].copy()
    min_score = processed['score'].min()
    max_score = processed['score'].max()
    processed['score'] = (processed['score'] - min_score) / (max_score - min_score)
    return processed

In [None]:
# MTEB task names to load. Each needs at least one non-test split, because
# load_dataset_for_task skips the 'test' split.
TASKS = [ # Only ones with train or val data
    "STS12",
    "STSBenchmark",
]

# Processed frame (sentence1, sentence2, normalised score) keyed by task name.
dfs = {}
for task_name in TASKS:
    print("Loading task:", task_name)
    dataset_df = load_dataset_for_task(task_name)
    df_task = process_dataset(dataset_df)
    dfs[task_name] = df_task



## 2. Generate Synthetic Data

In [None]:
# TODO: Look at generating dissimilar sentence pairs, because all of the existing pairs seem to be similar.

## 3. Get Embeddings and Cosine Similarities

In [None]:
from model_factory import model_factory
# NOTE(review): presumably a '$'-separated list of eight embedding models that
# model_factory combines into a single encoder — confirm against model_factory.
model_name = 'angle$cohere$flag-embedding$gist$gte-large$llmrails$mixed-bread$voyage'

In [None]:
def generate_embedding(df: pd.DataFrame, model_name: str, task_name: str):
    """Add ``sentence1_embedding`` and ``sentence2_embedding`` columns to ``df``.

    An encoder is built with ``model_factory(model_name, task_name)`` and each
    sentence column is handed to its ``encode`` method as a plain list. ``df``
    is modified in place and nothing is returned.
    """
    encoder = model_factory(model_name, task_name)
    for text_column in ('sentence1', 'sentence2'):
        sentences = df[text_column].tolist()
        df[f"{text_column}_embedding"] = encoder.encode(sentences)

def get_cosine_similarity(a: np.ndarray, b: np.ndarray):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector.
        b: Second vector, with the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``, or ``0.0`` when either vector has
        zero norm. The ratio is undefined in that case and would otherwise
        evaluate to NaN.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Guard the 0/0 case so a degenerate embedding does not inject NaN
        # into the similarity column.
        return 0.0
    return np.dot(a, b) / denominator

def generate_cosine_similarity(df: pd.DataFrame):
    """Add a ``cosine_similarity`` column computed row by row.

    Each row's ``sentence1_embedding`` and ``sentence2_embedding`` are passed
    to ``get_cosine_similarity``. ``df`` is modified in place.
    """
    def _row_similarity(row):
        return get_cosine_similarity(row['sentence1_embedding'], row['sentence2_embedding'])

    df['cosine_similarity'] = df.apply(_row_similarity, axis=1)

In [None]:
# Add the embedding columns and the cosine-similarity column to every task
# frame; both helpers modify the frame in place.
for task_name, df in dfs.items():
    generate_embedding(df, model_name, task_name)
    generate_cosine_similarity(df)


In [None]:
# Merge the per-task frames into one and shuffle the rows reproducibly.
df = (
    pd.concat(dfs.values(), ignore_index=True)
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)

# Hold out 20% of the rows for validation and renumber both parts from zero.
df_train, df_val = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=seed)
)

# Display the training frame as the cell output.
df_train

## 4. Evaluate The Baseline Performance

We use the Spearman and Pearson correlation coefficients to evaluate performance.

The Pearson correlation measures the strength of the linear relationship between two variables. It ranges from -1 to 1.

The Spearman correlation measures the strength of the monotonic relationship between two variables. It also ranges from -1 to 1.

We want both these values to be close to 1. 


In [None]:
def get_correlation(df: pd.DataFrame, column_name: str = 'cosine_similarity'):
    """Correlate a prediction column with the ``score`` column.

    Returns a ``(spearman, pearson)`` tuple, each computed with ``Series.corr``
    between ``df[column_name]`` and ``df['score']``.
    """
    predicted, target = df[column_name], df['score']
    spearman = predicted.corr(target, method='spearman')
    pearson = predicted.corr(target, method='pearson')
    return spearman, pearson

# Baseline: correlation between the untransformed cosine similarity and the
# `score` column, reported for the training and the validation split.
spearmans, pearsons = get_correlation(df_train)
print(f"Train: Spearman: {spearmans}, Pearson: {pearsons}")

spearmans, pearsons = get_correlation(df_val)
print(f"Validation: Spearman: {spearmans}, Pearson: {pearsons}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_correlation(df: pd.DataFrame, title: str):
    """Show a scatter plot of ``score`` against ``cosine_similarity``.

    ``title`` becomes the plot title; the axes are labelled
    'Cosine Similarity' (x) and 'Score' (y).
    """
    axes = sns.scatterplot(data=df, x='cosine_similarity', y='score')
    axes.set_title(title)
    axes.set_xlabel('Cosine Similarity')
    axes.set_ylabel('Score')
    plt.show()

# Visualise the baseline relationship on the training split.
plot_correlation(df_train, 'Relation Between Score and Cosine Similarity (Train)')

## 5. Preprocessing Before Training



In [None]:
def get_tensors(df):
    """Turn the embedding and score columns of ``df`` into float tensors.

    Returns ``(X1, X2, Y)``: the stacked ``sentence1_embedding`` values, the
    stacked ``sentence2_embedding`` values, and the ``score`` column, each
    converted with ``.float()``.
    """
    def _stacked(column):
        # One row per sample: stack the per-row arrays into a single matrix.
        return torch.from_numpy(np.stack(df[column].values)).float()

    first = _stacked('sentence1_embedding')
    second = _stacked('sentence2_embedding')
    labels = torch.from_numpy(df['score'].values).float()
    return first, second, labels
    
# Build the tensors for each split and place every one of them on `device`.
X1_train, X2_train, Y_train = (t.to(device) for t in get_tensors(df_train))
X1_val, X2_val, Y_val = (t.to(device) for t in get_tensors(df_val))

## 6. Transfer Learning

Defining our Model

In [None]:
from tl_models import LinearTransformationModel, ElementwiseProductModel, StackWiseProductModel

In [None]:
class CosineSimilarityModel(torch.nn.Module):
    """Cosine similarity between two embeddings after a shared transformation.

    The same ``transformation_model`` is applied to both inputs, and the
    row-wise cosine similarity (``dim=1``) of the results is returned.
    """

    def __init__(self, transformation_model: torch.nn.Module):
        super().__init__()
        self.transformation_model = transformation_model

    def forward(self, embeddings1: torch.Tensor, embeddings2: torch.Tensor):
        # Transform the first input before the second, one call per input.
        transformed1, transformed2 = (
            self.transformation_model(batch) for batch in (embeddings1, embeddings2)
        )
        return torch.nn.functional.cosine_similarity(transformed1, transformed2, dim=1)

class AnglEModel(torch.nn.Module):
    """Score embedding pairs with ``pairwise_angle_sim`` after a shared transformation.

    ``scale`` is stored on the module but is not applied inside ``forward``.
    """

    def __init__(self, transformation_model: torch.nn.Module, scale: float = 20.0):
        super().__init__()
        self.scale = scale
        self.transformation_model = transformation_model

    def forward(self, embeddings1: torch.Tensor, embeddings2: torch.Tensor):
        transformed1 = self.transformation_model(embeddings1)
        transformed2 = self.transformation_model(embeddings2)
        return pairwise_angle_sim(transformed1, transformed2)


Possible Loss Functions

In [None]:
def mse_loss(cosine_similarity: torch.Tensor, scores: torch.Tensor):
    """Return the mean squared error between predictions and target scores."""
    return torch.nn.functional.mse_loss(cosine_similarity, scores)

def spearman_correlation(cosine_similarity: torch.Tensor, scores: torch.Tensor):
    """Return the Spearman correlation between predictions and target scores.

    A fresh ``torchmetrics.SpearmanCorrCoef`` is created on every call and
    moved to the notebook-level ``device`` before it is evaluated.
    """
    metric = torchmetrics.SpearmanCorrCoef()
    metric = metric.to(device)
    return metric(cosine_similarity, scores)

def pearson_correlation(cosine_similarity: torch.Tensor, scores: torch.Tensor):
    """Return the Pearson correlation between predictions and target scores.

    A fresh ``torchmetrics.PearsonCorrCoef`` is created on every call and
    moved to the notebook-level ``device`` before it is evaluated.
    """
    metric = torchmetrics.PearsonCorrCoef()
    metric = metric.to(device)
    return metric(cosine_similarity, scores)

def angle_loss(scores: torch.Tensor, labels: torch.Tensor, scale: float = 20.0):
    """Pairwise ranking loss over one batch of predicted scores.

    For every ordered pair ``(i, j)`` with ``labels[i] < labels[j]`` the term
    ``scale * (scores[i] - scores[j])`` is collected, and the loss is
    ``log(1 + sum(exp(term)))``. A pair therefore adds to the loss when the
    sample with the smaller label receives the larger score.

    Args:
        scores: 1-D tensor of predicted similarities, one per sample.
        labels: 1-D tensor of target scores, same length as ``scores``.
        scale: Factor applied to ``scores`` before the pairwise differences.

    Returns:
        A scalar tensor; it is 0 when no pair satisfies ``labels[i] < labels[j]``.
    """
    scaled = scores * scale
    pairwise_diff = scaled[:, None] - scaled[None, :]

    # 1.0 where labels[i] < labels[j], 0.0 everywhere else.
    wanted = (labels[:, None] < labels[None, :]).float()

    # Push every unwanted pair to -1e12 so its exp() vanishes in the logsumexp.
    pairwise_diff = pairwise_diff - (1 - wanted) * 1e12

    # The leading zero logit supplies the "1 +" inside the log. It is created
    # on the same device and dtype as `scores` instead of a global device, so
    # the loss works wherever its inputs live.
    zero_logit = torch.zeros(1, device=scores.device, dtype=scores.dtype)
    logits = torch.cat((zero_logit, pairwise_diff.view(-1)), dim=0)
    return torch.logsumexp(logits, dim=0)

Training Loop

In [None]:
def train_one_epoch(model, training_loader, optimizer, loss_function = None):
    """Run one optimisation pass over ``training_loader``.

    Args:
        model: Callable as ``model(X1, X2)``, returning predictions for a batch.
        training_loader: Iterable of ``(X1, X2, Y)`` batches that supports
            ``len()``.
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.
        loss_function: Callable ``loss_function(predictions, Y)`` returning a
            scalar tensor. Mean squared error is used when it is omitted.

    Returns:
        The mean of the per-batch loss values.

    Raises:
        ZeroDivisionError: If ``training_loader`` yields no batches.
    """
    if loss_function is None:
        # The advertised default used to be called as-is and failed with
        # "'NoneType' object is not callable"; fall back to MSE instead.
        loss_function = torch.nn.functional.mse_loss

    running_loss = 0.

    for X1, X2, Y in training_loader:
        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()

        # Forward pass and loss for this batch.
        predictions = model(X1, X2)
        loss = loss_function(predictions, Y)

        # Backward pass, then one optimizer update.
        loss.backward()
        optimizer.step()

        # Record the loss as a plain float.
        running_loss += loss.item()

    return running_loss / len(training_loader)


In [None]:
# Directory that receives the model checkpoints. exist_ok=True makes the call
# a no-op when the directory already exists and avoids the check-then-create
# race of testing os.path.exists first.
out_dir = 'test_results'
os.makedirs(out_dir, exist_ok=True)

# Hyperparameters
max_epochs = 2000  # number of passes over the training loader
lr = 1e-5 # 1e-4 also works but bounces around
batch_size = 300
momentum = 0.9  # passed to the SGD optimizer below

# Loss function: the training loop calls it as loss_function(predictions, Y).
# loss_function = mse_loss
loss_function = angle_loss

# Model
# NOTE(review): 1024 * 8 presumably matches eight concatenated 1024-dimensional
# embeddings — confirm against model_factory and tl_models.
transformation_model = LinearTransformationModel(1024 * 8, 1024 * 8, dropout_rate=0.5)  # NOTE: dropout of 0.5 seems to work better than 0.1
# transformation_model = ElementwiseProductModel(1024 * 8)
# transformation_model = StackWiseProductModel(8)

# model = CosineSimilarityModel(transformation_model)
# model = model.to(device)

# Wrap the transformation so that forward() returns pairwise_angle_sim scores.
model = AnglEModel(transformation_model)
model = model.to(device)

# Optimizer
# optimizer = torch.optim.Adam(model.parameters(), lr=lr)
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)

# Data Loader: reshuffled batches of (X1, X2, Y) from the training tensors.
train_dataset = torch.utils.data.TensorDataset(X1_train, X2_train, Y_train)
training_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

# Keep track of losses
plotlosses = PlotLosses()

# Train for max_epochs epochs. After each epoch the model is evaluated with
# gradients disabled, the best validation Spearman is checkpointed, and the
# live plot is refreshed.
model.train()
best_accuracy = -float('inf')  # best validation Spearman seen so far
for _ in range(max_epochs):
    epoch_loss = train_one_epoch(model, training_loader, optimizer, loss_function)

    # Additional metrics for performance tracking
    model.eval()
    with torch.no_grad():
        # Generate the validation loss on the whole validation set at once
        val_predictions = model(X1_val, X2_val)
        val_loss = loss_function(val_predictions, Y_val).item()

        # Compute the correlations (the training one uses the full training set)
        train_spearman = spearman_correlation(model(X1_train, X2_train), Y_train).item()
        val_spearman = spearman_correlation(val_predictions, Y_val).item()

        # Save locally if it is the best
        if val_spearman > best_accuracy:
            best_accuracy = val_spearman
            # This stores the transformation module object itself, not its state_dict.
            torch.save({'transformation_model': model.transformation_model}, f"{out_dir}/best_model.pth")

    # Switch back to training mode for the next epoch.
    model.train()

    # 'acc' and 'val_acc' carry Spearman correlations, not a classification accuracy.
    plotlosses.update({'loss': epoch_loss, 'val_loss': val_loss, 'acc': train_spearman, 'val_acc': val_spearman})
    plotlosses.send()

# Also keep the transformation module as it is after the last epoch.
torch.save({'transformation_model': model.transformation_model}, f"{out_dir}/final_model.pth")

## 7. Evaluation

Load the best checkpoint from disk

In [None]:
# The training cell saves the transformation module object itself (not just a
# state_dict), so the full unpickler is required: weights_only=False. Recent
# torch releases default to weights_only=True and refuse such a file.
# SECURITY: unpickling can execute arbitrary code — only load checkpoints that
# this notebook produced.
# map_location lets a checkpoint written on another device load onto `device`.
state_dict = torch.load(
    "test_results/best_model.pth", map_location=device, weights_only=False
)
transformation_model = state_dict["transformation_model"].to(device)

# Evaluate with plain cosine similarity on top of the trained transformation.
model = CosineSimilarityModel(transformation_model)

In [None]:
# Score the loaded model on the validation split in eval mode with gradients
# disabled, and report both correlation metrics.
model.eval()
with torch.no_grad():
    val_predictions = model(X1_val, X2_val)
    val_spearman = spearman_correlation(val_predictions, Y_val).item()
    val_pearson = pearson_correlation(val_predictions, Y_val).item()

print(f"Validation Spearman: {val_spearman}, Validation Pearson: {val_pearson}")