In [17]:
# imports

from typing import List, Tuple  # for type hints

import numpy as np  # for manipulating arrays
import pandas as pd  # for manipulating data in dataframes
import plotly.express as px  # for plots
import random  # for generating run IDs
from sklearn.model_selection import train_test_split  # for splitting train & test data
import torch  # for matrix optimization
import torch.nn as nn 
import torch.nn.functional as F
import torchmetrics
import os
import datasets
from livelossplot import PlotLosses
from model_factory import model_factory
import itertools
from tl_models import LinearTransformationModel, ElementwiseProductModel, StackWiseProductModel
from sentence_transformers.util import pairwise_angle_sim

from mteb import MTEB

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [18]:
# setting device on GPU if available, else CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


# Computing Transfer Learning Model

In [19]:
TASKS = [
    "STS12",
    "STSBenchmark",
]

In [20]:
BASIC_MODELS = os.listdir("data")
BASIC_MODELS.remove("sentences")
BASIC_MODELS.remove("cohere-large")

## Retrieving and processing dataset

In [21]:
def load_dataset_for_task(task_name: str):
    # Note, we load all splits except the test set
    mteb = MTEB(tasks=[task_name], task_langs=["en"])
    task = mteb.tasks[0]
    task.load_data()
    assert task.data_loaded, "Data was not loaded"
    lis_dfs = []
    for split in task.dataset:
        if split == 'test':
            continue
        print(f"Split: {split}")
        lis_dfs.append(task.dataset[split].to_pandas())
    return pd.concat(lis_dfs)

def process_dataset(train_df: datasets.Dataset) -> pd.DataFrame:
    train_df = train_df[['sentence1', 'sentence2', 'score']]
    def normalize(x, min_score, max_score): # Norm between 0 and 1
        zero_one = (x - min_score) / (max_score - min_score)
        scaled_down = zero_one 
        return scaled_down 
    train_df['score'] = normalize(train_df['score'], train_df['score'].min(), train_df['score'].max())
    return train_df

dfs = {}
for task_name in TASKS:
    print("Loading task:", task_name)
    dataset_df = load_dataset_for_task(task_name)
    df_task = process_dataset(dataset_df)
    dfs[task_name] = df_task

Loading task: STS12
Split: train
Loading task: STSBenchmark


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['score'] = normalize(train_df['score'], train_df['score'].min(), train_df['score'].max())


Split: train
Split: validation


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['score'] = normalize(train_df['score'], train_df['score'].min(), train_df['score'].max())


In [22]:
class CosineSimilarityModel(torch.nn.Module):
    def __init__(self, transformation_model: torch.nn.Module):
        super(CosineSimilarityModel, self).__init__()
        self.transformation_model = transformation_model
    
    def forward(self, embeddings1: torch.Tensor, embeddings2: torch.Tensor):
        embedding1_custom = self.transformation_model(embeddings1)
        embedding2_custom = self.transformation_model(embeddings2)
        return torch.nn.functional.cosine_similarity(embedding1_custom, embedding2_custom, dim=1)

class AnglEModel(torch.nn.Module):
    def __init__(self, transformation_model: torch.nn.Module, scale: float = 20.0):
        super(AnglEModel, self).__init__()
        self.transformation_model = transformation_model
        self.scale = scale
    
    def forward(self, embeddings1: torch.Tensor, embeddings2: torch.Tensor):
        scores = pairwise_angle_sim(self.transformation_model(embeddings1), self.transformation_model(embeddings2))
        return scores

In [23]:
def mse_loss(cosine_similarity: torch.Tensor, scores: torch.Tensor):
    lossF = torch.nn.MSELoss()
    return lossF(cosine_similarity, scores)

def spearman_correlation(cosine_similarity: torch.Tensor, scores: torch.Tensor):
    spearman_corr = torchmetrics.SpearmanCorrCoef().to(device)
    return spearman_corr(cosine_similarity, scores)

def pearson_correlation(cosine_similarity: torch.Tensor, scores: torch.Tensor):
    pearson_corr = torchmetrics.PearsonCorrCoef().to(device)
    return pearson_corr(cosine_similarity, scores)

def angle_loss(scores: torch.Tensor, labels: torch.Tensor, scale: float = 20.0):
    scores = scores * scale
    scores = scores[:, None] - scores[None, :]

    labels = labels[:, None] < labels[None, :]
    labels = labels.float()

    scores = scores - (1 - labels) * 1e12

    scores = torch.cat((torch.zeros(1).to(device), scores.view(-1)), dim=0)
    loss = torch.logsumexp(scores, dim=0)
    return loss

In [24]:
def train_one_epoch(model, training_loader, optimizer, loss_function = None):
    running_loss = 0.

    for X1, X2, Y in training_loader:
        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Make predictions for this batch
        cosine_sim = model(X1, X2)
        loss = loss_function(cosine_sim, Y)
        
        # Compute the loss and its gradients
        loss.backward()

        # Adjust learning weights
        optimizer.step()

        # Record the loss
        running_loss += loss.item()

    average_loss = running_loss / len(training_loader)
    return average_loss


In [25]:
def get_cosine_similarity(a: np.array, b: np.array):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

# generating transfer learning model
for r in range(7, len(BASIC_MODELS) + 1):
    combinations_object = itertools.combinations(BASIC_MODELS, r)
    combinations_list = [sorted(list(combination)) for combination in combinations_object]
    
    # generating model for each combination
    for combination in combinations_list:
        model_name = "$".join(combination)
        
        # check if the model is already trained
        if os.path.exists(f"pca_tl/{model_name}/final_model.pth"):
            print(f"Model {model_name} already trained")
            continue
        
        model_name += "-pca"
        
        # load a copy of the dataframes
        dfs_copy = dfs.copy()
        
        # generating emebeddings for each task
        for task_name in TASKS:
            df = dfs_copy[task_name]
            # generating embeddings
            model = model_factory(model_name, task_name)
            for column in ['sentence1', 'sentence2']:
                embs = model.encode(df[column].tolist())
                df[f"{column}_embedding"] = list(embs)
        
            # computing cosine similarity
            df["cosine_similarity"] = df.apply(lambda x: get_cosine_similarity(x['sentence1_embedding'], x['sentence2_embedding']), axis=1)

        # Combine all the dataframes
        df = pd.concat(dfs.values(), ignore_index=True)

        # Shuffle the data
        df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

        # Split the data
        df_train, df_val = train_test_split(df, test_size=0.2, random_state=seed)

        # Reset the index
        df_train = df_train.reset_index(drop=True)
        df_val = df_val.reset_index(drop=True)

        def get_tensors(df):
            df_x1 = np.stack(df['sentence1_embedding'].values)
            df_x2 = np.stack(df['sentence2_embedding'].values)
            df_y = df['score'].values

            X1 = torch.from_numpy(df_x1).float()
            X2 = torch.from_numpy(df_x2).float()
            Y = torch.from_numpy(df_y).float()
            return X1, X2, Y
            
        X1_train, X2_train, Y_train = get_tensors(df_train)
        X1_val, X2_val, Y_val = get_tensors(df_val)

        # Move everything to the device
        X1_train = X1_train.to(device)
        X2_train = X2_train.to(device)
        Y_train = Y_train.to(device)

        X1_val = X1_val.to(device)
        X2_val = X2_val.to(device)
        Y_val = Y_val.to(device)

        # train model
        model_name = model_name.replace("-pca", "")
        output_dir = f"pca_tl/{model_name}"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            
        # Hyperparameters
        max_epochs = 3_000
        lr = 1e-4 # 1e-5 is the smoothest
        batch_size = 300
        momentum = 0.9
        
        loss_function = angle_loss
        
        transformation_model = LinearTransformationModel(1024, 1024, dropout_rate=0.4)
        
        model = AnglEModel(transformation_model)
        model = model.to(device)
        
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)

        # Data Loader
        train_dataset = torch.utils.data.TensorDataset(X1_train, X2_train, Y_train)
        training_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

        # Keep track of losses
        plotlosses = PlotLosses()
        
        model.train()
        best_accuracy = 0.9 # Should be better then this
        epochs_saved = []
        for epoch_num in range(max_epochs):
            epoch_loss = train_one_epoch(model, training_loader, optimizer, loss_function)

            # Additional metrics for performance tracking
            model.eval()
            with torch.no_grad():
                # Generate the validation loss
                val_predictions = model(X1_val, X2_val)
                val_loss = loss_function(val_predictions, Y_val).item()

                # Compute the correlations
                train_spearman = spearman_correlation(model(X1_train, X2_train), Y_train).item()
                val_spearman = spearman_correlation(val_predictions, Y_val).item()

                # Save locally if it is the best
                if val_spearman > best_accuracy: # Only save if it does not change much
                    epochs_saved.append(epoch_num)
                    best_accuracy = val_spearman
                    torch.save({'transformation_model': model.transformation_model}, f"{output_dir}/best_model.pth")

            model.train()
            
            plotlosses.update({'loss': epoch_loss, 'val_loss': val_loss, 'acc': train_spearman, 'val_acc': val_spearman})
            plotlosses.send()
        
        torch.save({"transformation_model": model.transformation_model}, f"{output_dir}/final_model.pth")
        

Model angle$flag-embedding$gist$gte-large$llmrails$mixed-bread$voyage already trained
Model angle$cohere$flag-embedding$gist$gte-large$llmrails$mixed-bread already trained
Model angle$cohere$flag-embedding$gist$gte-large$mixed-bread$voyage already trained
Model angle$cohere$flag-embedding$gist$llmrails$mixed-bread$voyage already trained
Model angle$cohere$flag-embedding$gte-large$llmrails$mixed-bread$voyage already trained
Model angle$cohere$flag-embedding$gist$gte-large$llmrails$voyage already trained
Model angle$cohere$gist$gte-large$llmrails$mixed-bread$voyage already trained
Model cohere$flag-embedding$gist$gte-large$llmrails$mixed-bread$voyage already trained
Model angle$cohere$flag-embedding$gist$gte-large$llmrails$mixed-bread$voyage already trained


# Evaluating Transfer Learning Models

In [None]:
TRANFER_LEARNING_MODELS = os.listdir("pca_tl")

In [None]:
import os
from run_utils import run_on_tasks 

for model in TRANFER_LEARNING_MODELS:
    model_name = model + "-transfer"
    run_on_tasks(model_name)