In [None]:
import numpy as np
import itertools
import datasets
import pandas as pd
import os
from tqdm import tqdm
from abc import ABC, abstractmethod
from results_to_csv import main as convert_to_csv
from mteb import MTEB
from model_factory import model_factory

In [None]:
# Base models are the sub-directories of `data/`. `data/sentences` is excluded
# because it holds the per-task sentence datasets rather than model embeddings.
# Hidden entries and plain files are ignored, and the result is sorted so the
# processing order is the same on every run (os.listdir order is arbitrary).
BASIC_MODELS = sorted(
    entry
    for entry in os.listdir('data')
    if entry != "sentences"
    and not entry.startswith(".")
    and os.path.isdir(os.path.join('data', entry))
)

In [None]:
# Task names processed by this notebook. Each name is used both as a
# sub-directory name for the stored embeddings (data/{model}/{task},
# data_pca/{ndims}/{model}/{task}) and as the task passed to MTEB for evaluation.
TASK_LIST_STS = [
    "SICK-R",
    "STS12",
    "STS13",
    "STS14",
    "STS15",
    "STS16",
    "STS17",
    "STS22",
    "STSBenchmark",
    "BIOSSES",
]

In [None]:
def PCA2(data, dims_rescaled_data=2):
    """
    Project `data` onto its leading principal components.

    The input is never modified: the computation runs on a mean-centred,
    floating-point copy, so integer arrays are accepted as well.

    Args:
        data: 2D array-like of shape (n_samples, n_features).
        dims_rescaled_data: number of principal components to keep.

    Returns:
        A tuple `(projected, evals, evecs)`:
          - projected: the centred data projected onto the kept components,
            shape (n_samples, k) with k = min(dims_rescaled_data, n_features).
          - evals: all eigenvalues of the covariance matrix, in decreasing order.
          - evecs: the kept eigenvectors as columns, shape (n_features, k).

    Raises:
        ValueError: if `data` is not 2-dimensional.
    """
    from scipy import linalg as LA

    data = np.asarray(data, dtype=float)
    if data.ndim != 2:
        raise ValueError(f"PCA2 expects a 2D array, got {data.ndim} dimension(s)")

    # Mean-centre into a new array; an in-place `-=` would silently alter the
    # caller's data and fails outright on integer arrays.
    centered = data - data.mean(axis=0)

    # Covariance between features (columns). atleast_2d keeps the single-feature
    # case, where np.cov returns a scalar, usable by eigh.
    covariance = np.atleast_2d(np.cov(centered, rowvar=False))

    # 'eigh' rather than 'eig': the covariance matrix is symmetric, which makes
    # the decomposition substantially faster.
    evals, evecs = LA.eigh(covariance)

    # eigh returns eigenvalues in ascending order; reorder both outputs so the
    # components explaining the most variance come first.
    order = np.argsort(evals)[::-1]
    evals = evals[order]
    evecs = evecs[:, order][:, :dims_rescaled_data]

    return np.dot(evecs.T, centered.T).T, evals, evecs

In [None]:
def generate_pca(ndims):
    """
    Build PCA-reduced embeddings for every combination of base models.

    For each non-empty subset of BASIC_MODELS and each task in TASK_LIST_STS,
    the per-model embeddings are L2-normalised, concatenated feature-wise,
    reduced with PCA2 to `ndims` components and saved to
    data_pca/{ndims}/{model names joined by "$"}/{task}.

    Outputs that already exist on disk are left untouched, and tasks with fewer
    than `ndims` samples are skipped.
    """
    for subset_size in range(1, len(BASIC_MODELS) + 1):
        for members in itertools.combinations(BASIC_MODELS, subset_size):
            # Sorted so that a given set of models always maps to the same name.
            combination = sorted(members)
            concat_model = "$".join(combination)

            for task in TASK_LIST_STS:
                path = f"data_pca/{ndims}/{concat_model}/{task}"
                # Already produced by an earlier run.
                if os.path.exists(path):
                    continue

                # One dataset per model, each holding a uniquely named embeddings column.
                task_embeddings = []
                for model in combination:
                    model_dataset = datasets.load_from_disk(f"data/{model}/{task}")
                    # Scale every embedding to unit L2 norm.
                    normalised = model_dataset.map(lambda x: {'embeddings': x['embeddings'] / np.linalg.norm(x['embeddings'])})
                    # Rename so the column-wise concatenation below has no duplicate names.
                    task_embeddings.append(normalised.rename_column("embeddings", f"embeddings_{model}"))

                stacked = datasets.concatenate_datasets(task_embeddings, axis = 1).to_pandas()
                columns = [f"embeddings_{model}" for model in combination]
                # One long vector per row: the models' embeddings laid end to end.
                vectors = stacked.apply(lambda row: np.concatenate([row[column] for column in columns]), axis = 1)

                # number of samples must be at least the number of dimensions
                if len(vectors) < ndims:
                    continue

                data = np.array(vectors.tolist())
                reduced = PCA2(data, dims_rescaled_data=ndims)[0]

                dataset = datasets.Dataset.from_dict({"embeddings": reduced})
                dataset.save_to_disk(path, max_shard_size="75MB")

In [None]:
# Target dimensionalities for the PCA-reduced embeddings; each value is passed to
# generate_pca and becomes a data_pca/{ndims}/ output folder.
ndims_list = [256, 512, 768, 896, 1024]

In [None]:
# Generate the PCA-reduced embeddings for every target dimensionality.
# generate_pca skips outputs that already exist on disk, so re-running this
# cell resumes where a previous run stopped.
for ndims in ndims_list:
    generate_pca(ndims)

## Computing Results

In [None]:
# Models to evaluate, taken from the folders produced for the 1024-dimension run.
# Only directories count as models (embeddings are stored as
# data_pca/{ndims}/{model}/{task} folders); hidden entries and stray files are
# ignored, and the list is sorted because os.listdir order is arbitrary.
# NOTE(review): presumably every model of interest exists under data_pca/1024 —
# confirm, since models generated only for other dimensionalities are not listed.
MODELS = sorted(
    entry
    for entry in os.listdir('data_pca/1024')
    if not entry.startswith(".")
    and os.path.isdir(os.path.join('data_pca/1024', entry))
)

PCA Model Class

In [None]:
class AbstractModel(ABC):
    """Common base for models exposing an `encode` method for evaluation."""

    def __init__(self, model_name: str, task_name: str):
        # The task name is stored next to the model name because caching needs it.
        self.model_name, self.task_name = model_name, task_name

    @abstractmethod
    def encode(self, sentences, batch_size=32, **kwargs):
        """
        Compute one embedding per input sentence.

        NOTE: implementations should return vectors normalised to unit length
        (L2 norm).

        Args:
            sentences (`List[str]`): sentences to encode.
            batch_size (`int`): batch size to use while encoding.

        Returns:
            `List[np.ndarray]` or `List[tensor]`: the embeddings, one per sentence.
        """

In [None]:
class PCAModel(AbstractModel):
    """Model that serves precomputed PCA-reduced embeddings for a single task."""

    def __init__(self, model_name: str, task_name: str, ndims: int):
        """
        Load the task's sentences and their reduced embeddings from disk.

        Args:
            model_name: name of the (stacked) model folder under data_pca/{ndims}/.
            task_name: task whose sentences and embeddings are loaded.
            ndims: dimensionality of the reduced embeddings to load.
        """
        super().__init__(model_name, task_name)
        sentences_ds: datasets.Dataset = datasets.load_from_disk(f"data/sentences/{self.task_name}")
        reduced_ds: datasets.Dataset = datasets.load_from_disk(f"data_pca/{ndims}/{model_name}/{self.task_name}")

        # Join the two datasets column-wise, then index the table by sentence
        # text, keeping the first occurrence of each repeated sentence.
        combined = datasets.concatenate_datasets([sentences_ds, reduced_ds], axis=1).to_pandas()
        self.df = combined.drop_duplicates(subset=["text"]).set_index("text")

    def encode(self, sentences, batch_size=32, **kwargs):
        """
        Look up the stored embeddings of `sentences`.

        `batch_size` and any extra keyword arguments are accepted for interface
        compatibility and are not used.

        Returns:
            A 2D array with one row per requested sentence, in request order.
        """
        rows = self.df.loc[sentences]
        return np.vstack(rows["embeddings"].values)

Custom Logging

In [None]:
import logging

def setup_logger(name, log_file, level=logging.INFO):
    """
    Return the logger called `name`, configured to write to `log_file`.

    The parent directory of `log_file` is created when needed. Loggers are
    process-wide objects keyed by name, so any file handler attached by an
    earlier call with the same name is closed and replaced; otherwise every
    message would also be appended to the previously configured log files.

    Args:
        name: logger name.
        log_file: path of the file to log to.
        level: minimum level recorded by the logger (default: logging.INFO).

    Returns:
        The configured `logging.Logger`.
    """
    # dirname is '' for a bare file name; os.makedirs('') would raise.
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Create the new handler first so a failure leaves the logger untouched.
    handler = logging.FileHandler(log_file)

    logger = logging.getLogger(name)
    logger.setLevel(level)

    for stale in list(logger.handlers):
        if isinstance(stale, logging.FileHandler):
            logger.removeHandler(stale)
            stale.close()
    logger.addHandler(handler)

    return logger

Computing results: per-model evaluation functions

In [None]:
def evaluate_model(model_name, ndims):
    """
    Evaluate one model on every task in TASK_LIST_STS at a given dimensionality.

    Tasks whose result file already exists, or whose reduced embeddings are
    missing from data_pca/, are skipped. Results are written by MTEB to
    results_pca/{ndims}/{model_name}/ and then converted to CSV. Progress is
    logged to logs_pca/{ndims}/{model_name}_log.txt.

    Args:
        model_name: model folder name; names containing "$" denote stacked
            models and are served by PCAModel, any other name is built with
            model_factory.
        ndims: dimensionality of the reduced embeddings (used in all paths).
    """
    script_logger = setup_logger(model_name, f"logs_pca/{ndims}/{model_name}_log.txt")
    try:
        script_logger.info(f"Starting evaluation for {model_name} reduced to {ndims} dimensions")

        for task in TASK_LIST_STS:
            # if results have already been computed, skip
            if os.path.exists(f"results_pca/{ndims}/{model_name}/{task}.json"):
                script_logger.info(f"Skipping {task} as it already exists")
                continue

            script_logger.info(f"Running Task: {task}")

            # if embeddings have not been computed, skip
            if not os.path.exists(f"data_pca/{ndims}/{model_name}/{task}"):
                script_logger.info(f"Skipping {task} as embeddings have not been computed")
                continue

            # "$" separates the names of stacked models, whose reduced embeddings
            # are read back from disk; a plain name is a single base model.
            if "$" in model_name:
                model = PCAModel(model_name, task, ndims)
            else:
                model = model_factory(model_name, task)

            evaluation = MTEB(tasks=[task], task_langs=["en"])
            evaluation.run(model, output_folder=f"results_pca/{ndims}/{model_name}", eval_splits= ["test"])

        if os.path.exists(f"results_pca/{ndims}/{model_name}"):
            script_logger.info("Converting the results to a CSV file...")
            convert_to_csv(f"results_pca/{ndims}/{model_name}")

        script_logger.info("-- DONE --")
    finally:
        # Release this run's log file even when a task fails. logging.shutdown()
        # is meant for application exit and leaves the closed handlers attached,
        # so a later run for the same model name would also write to this file.
        for handler in list(script_logger.handlers):
            if isinstance(handler, logging.FileHandler):
                script_logger.removeHandler(handler)
                handler.close()

def evaluate_models(ndims):
    """Run evaluate_model for every entry of MODELS at the given dimensionality."""
    print(f"Num models to be evaluated: {len(MODELS)}")
    for candidate in MODELS:
        evaluate_model(candidate, ndims)

In [None]:
# Dimensionalities to evaluate: the numerically named folders under data_pca/.
# Anything else stored there (e.g. ".git", "push.sh") is ignored without
# requiring it to exist. Names are kept as strings, as returned by os.listdir,
# and ordered by their numeric value.
ndims_list = sorted(
    (
        entry
        for entry in os.listdir('data_pca')
        if entry.isdigit() and os.path.isdir(os.path.join('data_pca', entry))
    ),
    key=int,
)

In [None]:
# Evaluate all models at every dimensionality listed in ndims_list.
for ndims in ndims_list:
    evaluate_models(ndims)