# Transfer Learning

Inspired by: https://github.com/openai/openai-cookbook/blob/main/examples/Customizing_embeddings.ipynb

In [None]:
# imports
from typing import List, Tuple  # for type hints

import numpy as np  # for manipulating arrays
import pandas as pd  # for manipulating data in dataframes
import pickle  # for saving the embeddings cache
import plotly.express as px  # for plots
import random  # for generating run IDs
from sklearn.model_selection import train_test_split  # for splitting train & test data
import torch  # for matrix optimization
import os
import datasets

from mteb import MTEB

# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch CPU and CUDA) with the same value so that reruns are reproducible.
seed = 42
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(seed)


## 1. Load and process input data

In [None]:
def load_dataset_for_task(task_name: str, split: str = 'train'):
    """Load an MTEB task and return one split of its dataset.

    An ``MTEB`` benchmark restricted to ``task_name`` (English only) is built,
    the data of its first task is loaded, and ``task.dataset[split]`` is
    returned.

    Args:
        task_name: Name of the MTEB task to load.
        split: Key of the split to return.

    Raises:
        AssertionError: If the task does not report its data as loaded.
    """
    benchmark = MTEB(tasks=[task_name], task_langs=["en"])
    selected_task = benchmark.tasks[0]
    selected_task.load_data()
    assert selected_task.data_loaded, "Data was not loaded"

    # Splits have the form train, validation, test (test is used for MTEB).
    available_splits = selected_task.dataset
    return available_splits[split]

def process_dataset(dataset: datasets.Dataset) -> pd.DataFrame:
    """Convert a sentence-pair dataset to a DataFrame with scores scaled to [0, 1].

    Args:
        dataset: Object exposing ``to_pandas()`` whose frame has the columns
            ``sentence1``, ``sentence2`` and ``score``.

    Returns:
        A new DataFrame holding only ``sentence1``, ``sentence2`` and ``score``,
        where ``score`` has been min-max normalised over the given dataset.
        If every score is identical the range is zero, so all scores are set
        to 0.0 instead of producing NaN from a 0/0 division.

    Raises:
        KeyError: If one of the three required columns is missing.
    """
    # .copy() gives an independent frame, so writing the normalised column
    # below does not go through pandas' chained-assignment (SettingWithCopy) path.
    pairs_df = dataset.to_pandas()[['sentence1', 'sentence2', 'score']].copy()

    # Normalise between 0 and 1 (maybe better to do -1 and 1).
    min_score = pairs_df['score'].min()
    score_range = pairs_df['score'].max() - min_score
    if score_range == 0:
        pairs_df['score'] = 0.0
    else:
        pairs_df['score'] = (pairs_df['score'] - min_score) / score_range
    return pairs_df

In [None]:
task_name = "STSBenchmark"

# Fetch the two labelled splits of the task (train first, then validation).
train_dataset, val_dataset = (
    load_dataset_for_task(task_name, split=split_name)
    for split_name in ('train', 'validation')
)

# Keep only the sentence pairs and the normalised score of each split.
df_train, df_val = process_dataset(train_dataset), process_dataset(val_dataset)

# Pool both splits and shuffle the rows with a fixed seed.
df = (
    pd.concat([df_train, df_val])
    .sample(frac=1, random_state=seed)
    .reset_index(drop=True)
)

# Re-split the pooled data 80/20 and give each part a fresh 0..n-1 index.
df_train, df_val = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=seed)
)

df_train.head()

## 2. Generate Synthetic Data

In [None]:
# TODO: Look at generating dissimilar data, because all of the pairs seem to be similar.

## 3. Get Embeddings and Cosine Similarities

In [None]:
# Project-local factory; generate_embedding calls it with (model_name, task_name)
# and uses the returned object's `encode` method.
from model_factory import model_factory
# Name of the embedding model handed to model_factory; it is also part of the
# results file name written by optimize_matrix.
model_name = 'voyage'

In [None]:
def generate_embedding(df: pd.DataFrame, model_name: str, task_name: str):
    """Add ``sentence1_embedding`` and ``sentence2_embedding`` columns to ``df`` in place.

    A model is built with ``model_factory(model_name, task_name)`` and its
    ``encode`` method is called once per sentence column with the column's
    values as a list.

    Args:
        df: Frame with ``sentence1`` and ``sentence2`` columns; it is mutated.
        model_name: Passed through to ``model_factory``.
        task_name: Passed through to ``model_factory``.

    Returns:
        None. The embeddings are stored on ``df``.
    """
    model = model_factory(model_name, task_name)
    for column in ['sentence1', 'sentence2']:
        # NOTE(review): `encode` is project code not visible here; it is assumed
        # to return one embedding per input sentence, in input order - confirm.
        embeddings = model.encode(df[column].tolist())
        # Store exactly one ndarray per row. A 2-D array cannot be assigned to a
        # single column, and plain Python lists would break the later
        # torch.from_numpy() call, so every row is normalised to an ndarray.
        df[f"{column}_embedding"] = [np.asarray(vector) for vector in embeddings]

def get_cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero norm
        the similarity is undefined; 0.0 is returned instead of dividing by
        zero (which would yield NaN and a RuntimeWarning).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

def generate_cosine_similarity(df: pd.DataFrame):
    """Store the cosine similarity of each row's two embeddings on ``df``.

    Reads ``sentence1_embedding`` and ``sentence2_embedding`` and writes the
    result to a ``cosine_similarity`` column. ``df`` is modified in place and
    nothing is returned.
    """
    def _row_similarity(row):
        # Similarity between the two sentence embeddings of a single pair.
        return get_cosine_similarity(row['sentence1_embedding'], row['sentence2_embedding'])

    df['cosine_similarity'] = df.apply(_row_similarity, axis=1)

In [None]:
# Training split: embed both sentence columns, then score each pair.
generate_embedding(df_train, model_name=model_name, task_name=task_name)
generate_cosine_similarity(df_train)

# Validation split: same two steps.
generate_embedding(df_val, model_name=model_name, task_name=task_name)
generate_cosine_similarity(df_val)

df_train.head()

## 4. Evaluate The Baseline Performance

We use the Spearman and Pearson correlations to evaluate performance.

Pearson correlation measures the strength of the linear relationship between two variables. It ranges from -1 to 1.

Spearman correlation measures the strength of the monotonic relationship between two variables, i.e. how well their rankings agree. It also ranges from -1 to 1.

We want both of these values to be as close to 1 as possible.


In [None]:
def get_correlation(df: pd.DataFrame, column_name: str = 'cosine_similarity'):
    """Correlate one column of ``df`` with its ``score`` column.

    Args:
        df: Frame containing ``column_name`` and ``score``.
        column_name: Column holding the predicted similarity.

    Returns:
        A ``(spearman, pearson)`` tuple of correlation coefficients.
    """
    predicted = df[column_name]
    target = df['score']
    spearman, pearson = (
        predicted.corr(target, method=method) for method in ('spearman', 'pearson')
    )
    return spearman, pearson

# Baseline on the training split: raw cosine similarity against the labels.
spearmans, pearsons = get_correlation(df_train, column_name='cosine_similarity')
print(f"Train: Spearman: {spearmans}, Pearson: {pearsons}")

# Same baseline on the validation split.
spearmans, pearsons = get_correlation(df_val, column_name='cosine_similarity')
print(f"Validation: Spearman: {spearmans}, Pearson: {pearsons}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_correlation(df: pd.DataFrame, title: str):
    """Show a scatter plot of ``score`` against ``cosine_similarity``.

    Args:
        df: Frame with ``cosine_similarity`` and ``score`` columns.
        title: Title placed above the plot.
    """
    # scatterplot returns the Axes it drew on, so label that object directly.
    axes = sns.scatterplot(data=df, x='cosine_similarity', y='score')
    axes.set_title(title)
    axes.set_xlabel('Cosine Similarity')
    axes.set_ylabel('Score')
    plt.show()

# Visualise the baseline relationship on the training split.
plot_correlation(
    df_train,
    title='Relation Between Score and Cosine Similarity (Train)',
)

## 5. Transfer Learning

In [None]:
def optimize_matrix(
    new_length: int = 2048,
    batch_size: int = 200,
    max_epochs: int = 100,
    lr: float = 100,
    p: float = 0.1,
    print_every: int = 10,
):
    """Train a projection matrix so that cosine similarity tracks the labels.

    A randomly initialised ``(embedding_dim, new_length)`` matrix is optimised
    with plain mini-batch gradient descent. The prediction for a sentence pair
    is the cosine similarity of the two embeddings after multiplying them by
    the matrix; the loss is the MSE between the rescaled prediction and the
    normalised ``score``.

    The function reads the notebook-level ``df_train``, ``df_val``,
    ``task_name`` and ``model_name``. Both frames must already contain the
    ``sentence1_embedding`` / ``sentence2_embedding`` columns. After every
    epoch the columns ``custom_1``, ``custom_2`` and
    ``custom_cosine_similarity`` are (re)written on both frames.

    Args:
        new_length: Number of columns of the matrix, i.e. the dimension of the
            projected embeddings.
        batch_size: Mini-batch size of the training loader.
        max_epochs: Number of passes over the training data.
        lr: Step size of the gradient-descent update.
        p: Dropout probability applied to the input embeddings while training.
        print_every: Print metrics every this many epochs; a falsy value
            disables printing.

    Returns:
        A DataFrame with one row per (epoch, split) and the columns ``epoch``,
        ``type`` ("train" or "val"), ``loss``, ``accuracy`` (the Spearman
        correlation of ``custom_cosine_similarity`` with ``score``) and
        ``matrix`` (a NumPy copy of the matrix after that epoch). The same
        frame is written to
        ``tlresults/{task_name}_{model_name}_matrix_optimization.csv``.
    """
    def get_tensors(df):
        # Stack the per-row embeddings into 2-D float tensors, one row per pair.
        X1 = torch.from_numpy(np.stack(df['sentence1_embedding'].values)).float()
        X2 = torch.from_numpy(np.stack(df['sentence2_embedding'].values)).float()
        Y = torch.from_numpy(df['score'].values).float()
        return X1, X2, Y

    X1_train, X2_train, Y_train = get_tensors(df_train)
    X1_val, X2_val, Y_val = get_tensors(df_val)

    train_dataset = torch.utils.data.TensorDataset(X1_train, X2_train, Y_train)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    def model(X1, X2, matrix, p=p, training=True):
        # F.dropout defaults to training=True, so it must be switched off
        # explicitly when the model is used for evaluation.
        e1 = torch.nn.functional.dropout(X1, p=p, training=training)
        e2 = torch.nn.functional.dropout(X2, p=p, training=training)
        return torch.nn.functional.cosine_similarity(e1 @ matrix, e2 @ matrix)

    def mse_loss(predictions, targets):
        # Rescale so that a cosine similarity of 0.5 maps to 0 and 1.0 maps to
        # 1; anything outside that band is clamped into [0, 1].
        real_preds = torch.clamp(2 * (predictions - 0.5), 0, 1)
        return torch.nn.functional.mse_loss(real_preds, targets)

    epochs, types, losses, accuracies, matrices = [], [], [], [], []

    matrix = torch.randn(X1_train.shape[1], new_length, requires_grad=True)

    for epoch in range(max_epochs):
        epoch_loss = 0.0
        for X1, X2, Y in train_loader:
            predictions = model(X1, X2, matrix)
            loss = mse_loss(predictions, Y)
            loss.backward()
            with torch.no_grad():
                matrix -= lr * matrix.grad
                matrix.grad.zero_()
            epoch_loss += loss.item()
        # Report the mean loss over all batches of the epoch rather than the
        # loss of whichever batch happened to come last.
        train_loss = epoch_loss / len(train_loader)

        with torch.no_grad():
            val_preds = model(X1_val, X2_val, matrix, training=False)
            val_loss = mse_loss(val_preds, Y_val).item()

            # One snapshot per epoch, shared by the train and val rows.
            snapshot = matrix.detach().clone().numpy()

            for split_name, df, X1_split, X2_split, split_loss in (
                ("train", df_train, X1_train, X2_train, train_loss),
                ("val", df_val, X1_val, X2_val, val_loss),
            ):
                # Project the whole split with one matrix product each instead
                # of converting and multiplying row by row.
                projected_1 = X1_split @ matrix
                projected_2 = X2_split @ matrix
                df["custom_1"] = list(projected_1.numpy())
                df["custom_2"] = list(projected_2.numpy())
                df["custom_cosine_similarity"] = torch.nn.functional.cosine_similarity(
                    projected_1, projected_2
                ).numpy()

                spearmans, _ = get_correlation(df, 'custom_cosine_similarity')
                accuracies.append(spearmans)
                losses.append(split_loss)
                types.append(split_name)
                epochs.append(epoch)
                matrices.append(snapshot)

                if print_every and epoch % print_every == 0:
                    print(f"Epoch {epoch+1}/{max_epochs} ({split_name}) - Loss: {split_loss}, Spearman: {spearmans}")

    data = pd.DataFrame({
        "epoch": epochs,
        "type": types,
        "loss": losses,
        "accuracy": accuracies,
        "matrix": matrices,
    })
    result_dir = 'tlresults'
    os.makedirs(result_dir, exist_ok=True)
    # NOTE(review): the "matrix" column holds NumPy arrays, which to_csv writes
    # as their string form; use the returned frame if the matrices are needed.
    data.to_csv(f"{result_dir}/{task_name}_{model_name}_matrix_optimization.csv")
    return data
            

In [None]:
# Short 20-epoch run that prints the metrics after every epoch.
optimize_matrix(max_epochs=20, print_every=1)

In [None]:
# Reload the per-epoch metrics written by optimize_matrix; the first CSV
# column is the saved row index.
df = pd.read_csv(
    f"tlresults/{task_name}_{model_name}_matrix_optimization.csv",
    index_col=0,
)
df.head()

In [None]:
# Loss per epoch, one line for each value of `type` (train / val).
sns.lineplot(data=df, x='epoch', y='loss', hue='type').set(
    title='Loss Over Time',
    xlabel='Epoch',
    ylabel='Loss',
)
plt.show()

In [None]:
# Spearman correlation per epoch (stored in the `accuracy` column), one line
# for each value of `type` (train / val).
sns.lineplot(data=df, x='epoch', y='accuracy', hue='type').set(
    title='Spearmans Over Time',
    xlabel='Epoch',
    ylabel='Spearmans',
)
plt.show()