In [4]:
import numpy as np
import itertools
import datasets
import pandas as pd
import os
from tqdm import tqdm
from abc import ABC, abstractmethod
from results_to_csv import main as convert_to_csv
from mteb import MTEB
from model_factory import model_factory
from sklearn.decomposition import PCA
import pickle as pk

In [5]:
import logging

def setup_logger(name, log_file, level=logging.INFO):
    """Return the logger called ``name``, attaching a file handler on first use.

    Args:
        name: Name passed to ``logging.getLogger``.
        log_file: Path of the file the logger writes to. Missing parent
            directories are created.
        level: Logging level applied when the handler is first attached.

    Returns:
        The ``logging.Logger`` registered under ``name``.

    Note:
        A logger that already has a handler is returned untouched, so calling
        this again with the same ``name`` but a different ``log_file`` or
        ``level`` keeps the original file and level.
    """
    log_dir = os.path.dirname(log_file)
    # A bare file name has an empty dirname and os.makedirs("") raises, so only
    # create a directory when there is one. exist_ok avoids the
    # check-then-create race of a separate os.path.exists test.
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logger = logging.getLogger(name)
    if not logger.handlers:  # Only add handler if there are no existing handlers
        handler = logging.FileHandler(log_file)
        logger.setLevel(level)
        logger.addHandler(handler)

    return logger

## Model List

In [6]:
# Every entry of data/ is used as a base model name, except the "sentences"
# entry and hidden entries (e.g. .ipynb_checkpoints, .DS_Store).
# os.listdir returns entries in arbitrary order, so the list is sorted to keep
# the model-combination sweep in the same order on every run and machine.
# Filtering (rather than list.remove) also avoids a ValueError when data/ has
# no "sentences" entry.
BASIC_MODELS = sorted(
    entry
    for entry in os.listdir('data')
    if entry != "sentences" and not entry.startswith(".")
)

## Task List

In [7]:
# Task names, grouped by task type. Commented-out names are disabled: they are
# kept for reference but are not part of TASK_LIST.

# --- Classification ---
TASK_LIST_CLASSIFICATION = [
    "AmazonCounterfactualClassification",
    # "AmazonPolarityClassification",
    # "AmazonReviewsClassification",
    "Banking77Classification",
    "EmotionClassification",
    # "ImdbClassification",
    # "MassiveIntentClassification",
    # "MassiveScenarioClassification",
    # "MTOPDomainClassification",
    # "MTOPIntentClassification",
    # "ToxicConversationsClassification",
    # "TweetSentimentExtractionClassification",
]

# --- Clustering ---
TASK_LIST_CLUSTERING = [
    # "ArxivClusteringP2P",
    "ArxivClusteringS2S",
    # "BiorxivClusteringP2P",
    # "BiorxivClusteringS2S",
    # "MedrxivClusteringP2P",
    # "MedrxivClusteringS2S",
    "RedditClustering",
    # "RedditClusteringP2P",
    # "StackExchangeClustering",
    # "StackExchangeClusteringP2P",
    # "TwentyNewsgroupsClustering",
]

# --- Pair classification ---
TASK_LIST_PAIR_CLASSIFICATION = [
    # "SprintDuplicateQuestions",
    "TwitterSemEval2015",
    # "TwitterURLCorpus",
]

# --- Reranking ---
TASK_LIST_RERANKING = [
    "AskUbuntuDupQuestions",
    # "MindSmallReranking",
    # "SciDocsRR",
    # "StackOverflowDupQuestions",
]

# --- Retrieval ---
TASK_LIST_RETRIEVAL = [
    "ArguAna",
    # "ClimateFEVER",
    # "CQADupstackAndroidRetrieval",
    # "CQADupstackEnglishRetrieval",
    # "CQADupstackGamingRetrieval",
    # "CQADupstackGisRetrieval",
    # "CQADupstackMathematicaRetrieval",
    # "CQADupstackPhysicsRetrieval",
    # "CQADupstackProgrammersRetrieval",
    # "CQADupstackStatsRetrieval",
    # "CQADupstackTexRetrieval",
    # "CQADupstackUnixRetrieval",
    # "CQADupstackWebmastersRetrieval",
    # "CQADupstackWordpressRetrieval",
    # "DBPedia",
    # "FEVER",
    # "FiQA2018",
    # "HotpotQA",
    # "MSMARCO",
    # "NFCorpus",
    # "NQ",
    # "QuoraRetrieval",
    # "SCIDOCS",
    "SciFact",
    # "Touche2020",
    # "TRECCOVID",
]

# --- Semantic textual similarity ---
TASK_LIST_STS = [
    "SICK-R",
    "STS12",
    "STS13",
    "STS14",
    "STS15",
    "STS16",
    "STS17",
    "STS22",
    "STSBenchmark",
    # "BIOSSES",
]

# All enabled tasks, in the order the groups are declared above.
TASK_LIST = [
    *TASK_LIST_CLASSIFICATION,
    *TASK_LIST_CLUSTERING,
    *TASK_LIST_PAIR_CLASSIFICATION,
    *TASK_LIST_RERANKING,
    *TASK_LIST_RETRIEVAL,
    *TASK_LIST_STS,
]

In [8]:
# PCA output sizes to sweep. Each value is used as PCA(n_components=...) and
# gets its own pca/<ndims>/ and results_pca/<ndims>/ directory.
ndims_list = [256, 512, 768, 896, 1024]

## Generating PCA Functions

In [None]:
%%capture 
# supressing output

all_sentences = {}

for dataset in TASK_LIST:
    mteb = MTEB(tasks = [dataset], task_langs=["en"], trust_remote_code=True)
    task = mteb.tasks[0]
    task.load_data()
    if task.dataset is None:
        print(f"{dataset} has no dataset. Skipping...")
        continue
    print(f"{dataset} has these splits: {task.dataset.keys()}")

    if (train:= "train" not in task.dataset.keys()) and (validation := "validation" not in task.dataset.keys()):
        print(f"{dataset} has no train or validation split. Skipping...")
        continue
    
    all_sentences[dataset] = []
    if not train:
        if "text" in task.dataset["train"].column_names: 
            all_sentences[dataset] += task.dataset["train"]["text"]
        if "sentence1" in task.dataset["train"].column_names:
            all_sentences[dataset] += task.dataset["train"]["sentence1"]
        if "sentence2" in task.dataset["train"].column_names:
            all_sentences[dataset] += task.dataset["train"]["sentence2"]
    if not validation:
        if "text" in task.dataset["validation"].column_names: 
            all_sentences[dataset] += task.dataset["validation"]["text"]
        if "sentence1" in task.dataset["validation"].column_names:
            all_sentences[dataset] += task.dataset["validation"]["sentence1"]
        if "sentence2" in task.dataset["validation"].column_names:
            all_sentences[dataset] += task.dataset["validation"]["sentence2"]

In [None]:
%%capture
# suppresing output

script_logger = setup_logger("Generating PCA", f"pca/logs.txt")

for ndims in ndims_list:
    script_logger.info(f"\n\nStarting PCA for {ndims} dimensions")
    
    # creating directory
    path = f"pca/{ndims}"
    if not os.path.exists(path):
        os.makedirs(path)
    
    # generating PCA for all combinations of models
    for r in range(1, len(BASIC_MODELS) + 1):
        combinations_object = itertools.combinations(BASIC_MODELS, r)
        combinations_list = [sorted(list(combination)) for combination in combinations_object]
        
        # generating PCA for each combination
        for combination in combinations_list:
            model_name = "$".join(combination)
            script_logger.info(f"<--> Starting PCA for {model_name}!")
            
            # if model alr exists
            if os.path.exists(f"{path}/{model_name}.pkl"):
                script_logger.info(f"PCA for {model_name} already exists. Skipping...")
                continue
            
            # retrieving embeddings
            all_embeddings = []
            for task in all_sentences.keys():
                model = model_factory(model_name, task)
                embeddings = model.encode(all_sentences[task])
                all_embeddings += embeddings
            script_logger.info(f"Retrieved embeddings for {model_name}")
            
            # PCA
            script_logger.info(f"Fitting PCA for {model_name}")
            pca = PCA(n_components= ndims)
            pca.fit_transform(all_embeddings)
            
            # Saving PCA
            script_logger.info(f"Saving PCA for {model_name}")
            pk.dump(pca, open(f"{path}/{model_name}.pkl", "wb"))

## Evaluating Stacked Model with PCA

In [12]:
def evaluate_model(model_name: str, ndims: int):
    """Run every task in TASK_LIST for one PCA model and export the scores.

    Tasks whose result file results_pca/<ndims>/<model_name>/<task>.json already
    exists are skipped. After the task loop, the result folder is converted to
    CSV unless results_pca/<ndims>/<model_name>_results.csv already exists.

    Args:
        model_name: Model identifier; "-pca" is appended before it is handed to
            ``model_factory``.
        ndims: PCA dimensionality; selects the results_pca/<ndims>/ output tree.

    Returns:
        None. Results are written to disk and progress is logged to
        results_pca/<ndims>/logs.txt.
    """
    results_root = f"results_pca/{ndims}"
    output_folder = f"{results_root}/{model_name}"

    # Loggers are global per name and setup_logger attaches a file handler only
    # the first time a name is used. The name therefore has to include ndims;
    # a fixed name would keep writing to the log file of the first ndims
    # evaluated.
    script_logger = setup_logger(f"eval_models_{ndims}", f"{results_root}/logs.txt")

    for task in TASK_LIST:
        # if model has already been evaluated
        if os.path.exists(f"{output_folder}/{task}.json"):
            script_logger.info(f"{model_name} has already been evaluated for {task}. Skipping...")
            continue

        # get stacked model
        # NOTE(review): ndims is not passed to model_factory, so it is not
        # visible here how the "-pca" model picks the PCA fitted for this
        # dimensionality; confirm against model_factory.
        model = model_factory(model_name + "-pca", task)

        script_logger.info(f"Evaluating {model_name} on {task}")
        evaluation = MTEB(tasks=[task], task_langs=["en"])
        evaluation.run(model, output_folder=output_folder, eval_splits=["test"])

    if os.path.exists(f"{output_folder}_results.csv"):
        script_logger.info(f"Results for {model_name} already converted to csv. Skipping...")
    elif os.path.exists(output_folder):
        script_logger.info(f"Converting results to csv for {model_name}")
        convert_to_csv(output_folder)
    else:
        script_logger.info(f"No results found for {model_name}. Skipping...")

    script_logger.info(f"Finished evaluating {model_name} for all tasks\n\n")

In [13]:
def evaluate_models(ndims: int):
    """Evaluate every non-empty combination of BASIC_MODELS at one PCA size.

    Combinations are visited by increasing size. Each one is named by joining
    its sorted members with "$" and passed to ``evaluate_model``.

    Args:
        ndims: PCA dimensionality forwarded to ``evaluate_model``; also selects
            the results_pca/<ndims>/logs.txt log file.
    """
    script_logger = setup_logger(f"eval_models_{ndims}", f"results_pca/{ndims}/logs.txt")

    for subset_size in range(1, len(BASIC_MODELS) + 1):
        for members in itertools.combinations(BASIC_MODELS, subset_size):
            model_name = "$".join(sorted(members))
            script_logger.info(f"Starting evaluation for {model_name}")
            evaluate_model(model_name, ndims)

In [14]:
# Run the full model-combination sweep once per PCA dimensionality.
for ndims in ndims_list:
    evaluate_models(ndims)

Creating model cohere$gist$llmrails$voyage-pca for task AskUbuntuDupQuestions
Loading cohere from cache for AskUbuntuDupQuestions...
Loading gist from cache for AskUbuntuDupQuestions...
Loading llmrails from cache for AskUbuntuDupQuestions...
Loading voyage from cache for AskUbuntuDupQuestions...


Creating model cohere$gist$llmrails$voyage-pca for task ArguAna
Loading cohere from cache for ArguAna...
Loading gist from cache for ArguAna...
Loading llmrails from cache for ArguAna...
Loading voyage from cache for ArguAna...


Creating model cohere$gist$llmrails$voyage-pca for task SciFact
Loading cohere from cache for SciFact...
Loading gist from cache for SciFact...
Loading llmrails from cache for SciFact...
Loading voyage from cache for SciFact...


INFO:eval_models:Evaluating for cohere$gist$llmrails$voyage on SciFact


Creating model cohere$gist$llmrails$voyage-pca for task SICK-R
Loading cohere from cache for SICK-R...
Loading gist from cache for SICK-R...
Loading llmrails from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:eval_models:Evaluating for cohere$gist$llmrails$voyage on SICK-R


Creating model cohere$gist$llmrails$voyage-pca for task STS12
Loading cohere from cache for STS12...
Loading gist from cache for STS12...
Loading llmrails from cache for STS12...


INFO:eval_models:Evaluating for cohere$gist$llmrails$voyage on STS12


Loading voyage from cache for STS12...


INFO:eval_models:Evaluating for cohere$gist$llmrails$voyage on STS13


Creating model cohere$gist$llmrails$voyage-pca for task STS13
Loading cohere from cache for STS13...
Loading gist from cache for STS13...
Loading llmrails from cache for STS13...
Loading voyage from cache for STS13...


Creating model cohere$gist$llmrails$voyage-pca for task STS14
Loading cohere from cache for STS14...
Loading gist from cache for STS14...
Loading llmrails from cache for STS14...


INFO:eval_models:Evaluating for cohere$gist$llmrails$voyage on STS14


Loading voyage from cache for STS14...


Creating model cohere$gist$llmrails$voyage-pca for task STS15
Loading cohere from cache for STS15...
Loading gist from cache for STS15...
Loading llmrails from cache for STS15...
Loading voyage from cache for STS15...


INFO:eval_models:Evaluating for cohere$gist$llmrails$voyage on STS15


INFO:eval_models:Evaluating for cohere$gist$llmrails$voyage on STS16


Creating model cohere$gist$llmrails$voyage-pca for task STS16
Loading cohere from cache for STS16...
Loading gist from cache for STS16...
Loading llmrails from cache for STS16...
Loading voyage from cache for STS16...


INFO:eval_models:Evaluating for cohere$gist$llmrails$voyage on STS17


Creating model cohere$gist$llmrails$voyage-pca for task STS17
Loading cohere from cache for STS17...
Loading gist from cache for STS17...
Loading llmrails from cache for STS17...
Loading voyage from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for cohere$gist$llmrails$voyage on STS22


Creating model cohere$gist$llmrails$voyage-pca for task STS22
Loading cohere from cache for STS22...
Loading gist from cache for STS22...
Loading llmrails from cache for STS22...
Loading voyage from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Creating model cohere$gist$llmrails$voyage-pca for task STSBenchmark
Loading cohere from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:eval_models:Evaluating for cohere$gist$llmrails$voyage on STSBenchmark


INFO:eval_models:Converting results to csv for cohere$gist$llmrails$voyage


Using model name cohere$gist$llmrails$voyage
Converting results_pca/256/cohere$gist$llmrails$voyage to results_pca/256/cohere$gist$llmrails$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
Medrxi

INFO:eval_models:Finished evaluating cohere$gist$llmrails$voyage for all tasks


INFO:eval_models:Starting evaluation for angle$cohere$gist$llmrails$voyage


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model angle$cohere$gist$llmrails$voyage-pca for task AmazonCounterfactualClassification
Loading angle from cache for AmazonCounterfactualClassification..

INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on AmazonCounterfactualClassification


Loading voyage from cache for AmazonCounterfactualClassification...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.


Creating model angle$cohere$gist$llmrails$voyage-pca for task Banking77Classification
Loading angle from cache for Banking77Classification...
Loading cohere from cache for Banking77Classification...
Loading gist from cache for Banking77Classification...
Loading llmrails from cache for Banking77Classification...
Loading voyage from cache for Banking77Classification...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on Banking77Classification


Creating model angle$cohere$gist$llmrails$voyage-pca for task EmotionClassification
Loading angle from cache for EmotionClassification...
Loading cohere from cache for EmotionClassification...
Loading gist from cache for EmotionClassification...
Loading llmrails from cache for EmotionClassification...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on EmotionClassification


Loading voyage from cache for EmotionClassification...


Repo card metadata block was not found. Setting CardData to empty.


Creating model angle$cohere$gist$llmrails$voyage-pca for task ArxivClusteringS2S
Loading angle from cache for ArxivClusteringS2S...
Loading cohere from cache for ArxivClusteringS2S...
Loading gist from cache for ArxivClusteringS2S...
Loading llmrails from cache for ArxivClusteringS2S...
Loading voyage from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [03:18<00:00,  6.41s/it]


Creating model angle$cohere$gist$llmrails$voyage-pca for task RedditClustering
Loading angle from cache for RedditClustering...
Loading cohere from cache for RedditClustering...
Loading gist from cache for RedditClustering...
Loading llmrails from cache for RedditClustering...
Loading voyage from cache for RedditClustering...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on RedditClustering


Clustering: 100%|██████████| 25/25 [01:40<00:00,  4.02s/it]


Creating model angle$cohere$gist$llmrails$voyage-pca for task TwitterSemEval2015
Loading angle from cache for TwitterSemEval2015...
Loading cohere from cache for TwitterSemEval2015...
Loading gist from cache for TwitterSemEval2015...
Loading llmrails from cache for TwitterSemEval2015...
Loading voyage from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on TwitterSemEval2015


Creating model angle$cohere$gist$llmrails$voyage-pca for task AskUbuntuDupQuestions
Loading angle from cache for AskUbuntuDupQuestions...
Loading cohere from cache for AskUbuntuDupQuestions...
Loading gist from cache for AskUbuntuDupQuestions...
Loading llmrails from cache for AskUbuntuDupQuestions...
Loading voyage from cache for AskUbuntuDupQuestions...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on AskUbuntuDupQuestions


Creating model angle$cohere$gist$llmrails$voyage-pca for task ArguAna
Loading angle from cache for ArguAna...
Loading cohere from cache for ArguAna...
Loading gist from cache for ArguAna...
Loading llmrails from cache for ArguAna...
Loading voyage from cache for ArguAna...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on ArguAna


Creating model angle$cohere$gist$llmrails$voyage-pca for task SciFact
Loading angle from cache for SciFact...
Loading cohere from cache for SciFact...
Loading gist from cache for SciFact...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on SciFact


Loading llmrails from cache for SciFact...
Loading voyage from cache for SciFact...


Creating model angle$cohere$gist$llmrails$voyage-pca for task SICK-R
Loading angle from cache for SICK-R...
Loading cohere from cache for SICK-R...
Loading gist from cache for SICK-R...
Loading llmrails from cache for SICK-R...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on SICK-R


Loading voyage from cache for SICK-R...


Creating model angle$cohere$gist$llmrails$voyage-pca for task STS12
Loading angle from cache for STS12...
Loading cohere from cache for STS12...
Loading gist from cache for STS12...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on STS12


Loading llmrails from cache for STS12...
Loading voyage from cache for STS12...


Creating model angle$cohere$gist$llmrails$voyage-pca for task STS13
Loading angle from cache for STS13...
Loading cohere from cache for STS13...
Loading gist from cache for STS13...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on STS13


Loading llmrails from cache for STS13...
Loading voyage from cache for STS13...


Creating model angle$cohere$gist$llmrails$voyage-pca for task STS14
Loading angle from cache for STS14...
Loading cohere from cache for STS14...
Loading gist from cache for STS14...
Loading llmrails from cache for STS14...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on STS14


Loading voyage from cache for STS14...


Creating model angle$cohere$gist$llmrails$voyage-pca for task STS15
Loading angle from cache for STS15...
Loading cohere from cache for STS15...
Loading gist from cache for STS15...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on STS15


Loading llmrails from cache for STS15...
Loading voyage from cache for STS15...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on STS16


Creating model angle$cohere$gist$llmrails$voyage-pca for task STS16
Loading angle from cache for STS16...
Loading cohere from cache for STS16...
Loading gist from cache for STS16...
Loading llmrails from cache for STS16...
Loading voyage from cache for STS16...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on STS17


Creating model angle$cohere$gist$llmrails$voyage-pca for task STS17
Loading angle from cache for STS17...
Loading cohere from cache for STS17...
Loading gist from cache for STS17...
Loading llmrails from cache for STS17...
Loading voyage from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on STS22


Creating model angle$cohere$gist$llmrails$voyage-pca for task STS22
Loading angle from cache for STS22...
Loading cohere from cache for STS22...
Loading gist from cache for STS22...
Loading llmrails from cache for STS22...
Loading voyage from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Creating model angle$cohere$gist$llmrails$voyage-pca for task STSBenchmark
Loading angle from cache for STSBenchmark...
Loading cohere from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...


INFO:eval_models:Evaluating for angle$cohere$gist$llmrails$voyage on STSBenchmark


Loading voyage from cache for STSBenchmark...


INFO:eval_models:Converting results to csv for angle$cohere$gist$llmrails$voyage


Using model name angle$cohere$gist$llmrails$voyage
Converting results_pca/256/angle$cohere$gist$llmrails$voyage to results_pca/256/angle$cohere$gist$llmrails$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test se

INFO:eval_models:Finished evaluating angle$cohere$gist$llmrails$voyage for all tasks


INFO:eval_models:Starting evaluation for angle
INFO:eval_models:Evaluating for angle on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model angle-pca for task AmazonCounterfactualClassification
Loading angle from cache for AmazonCounterfactualClassification...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.
INFO:eval_models:Evaluating for angle on Banking77Classification


Creating model angle-pca for task Banking77Classification
Loading angle from cache for Banking77Classification...


INFO:eval_models:Evaluating for angle on EmotionClassification


Creating model angle-pca for task EmotionClassification
Loading angle from cache for EmotionClassification...


Repo card metadata block was not found. Setting CardData to empty.


Creating model angle-pca for task ArxivClusteringS2S
Loading angle from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for angle on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [00:43<00:00,  1.40s/it]


Creating model angle-pca for task RedditClustering
Loading angle from cache for RedditClustering...


INFO:eval_models:Evaluating for angle on RedditClustering


Clustering: 100%|██████████| 25/25 [00:23<00:00,  1.08it/s]


Creating model angle-pca for task TwitterSemEval2015
Loading angle from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for angle on TwitterSemEval2015


INFO:eval_models:Evaluating for angle on AskUbuntuDupQuestions


Creating model angle-pca for task AskUbuntuDupQuestions
Loading angle from cache for AskUbuntuDupQuestions...


INFO:eval_models:Evaluating for angle on ArguAna


Creating model angle-pca for task ArguAna
Loading angle from cache for ArguAna...


INFO:eval_models:Evaluating for angle on SciFact


Creating model angle-pca for task SciFact
Loading angle from cache for SciFact...


INFO:eval_models:Evaluating for angle on SICK-R


Creating model angle-pca for task SICK-R
Loading angle from cache for SICK-R...


INFO:eval_models:Evaluating for angle on STS12


Creating model angle-pca for task STS12
Loading angle from cache for STS12...


INFO:eval_models:Evaluating for angle on STS13


Creating model angle-pca for task STS13
Loading angle from cache for STS13...


INFO:eval_models:Evaluating for angle on STS14


Creating model angle-pca for task STS14
Loading angle from cache for STS14...


INFO:eval_models:Evaluating for angle on STS15


Creating model angle-pca for task STS15
Loading angle from cache for STS15...


INFO:eval_models:Evaluating for angle on STS16


Creating model angle-pca for task STS16
Loading angle from cache for STS16...


INFO:eval_models:Evaluating for angle on STS17


Creating model angle-pca for task STS17
Loading angle from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for angle on STS22


Creating model angle-pca for task STS22
Loading angle from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for angle on STSBenchmark


Creating model angle-pca for task STSBenchmark
Loading angle from cache for STSBenchmark...


INFO:eval_models:Converting results to csv for angle


Using model name angle
Converting results_pca/512/angle to results_pca/512/angle_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClusteringS2S - test se

INFO:eval_models:Finished evaluating angle for all tasks


INFO:eval_models:Starting evaluation for cohere
INFO:eval_models:Evaluating for cohere on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model cohere-pca for task AmazonCounterfactualClassification
Loading cohere from cache for AmazonCounterfactualClassification...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.
INFO:eval_models:Evaluating for cohere on Banking77Classification


Creating model cohere-pca for task Banking77Classification
Loading cohere from cache for Banking77Classification...


INFO:eval_models:Evaluating for cohere on EmotionClassification


Creating model cohere-pca for task EmotionClassification
Loading cohere from cache for EmotionClassification...


Repo card metadata block was not found. Setting CardData to empty.


Creating model cohere-pca for task ArxivClusteringS2S
Loading cohere from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for cohere on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [00:42<00:00,  1.38s/it]


Creating model cohere-pca for task RedditClustering
Loading cohere from cache for RedditClustering...


INFO:eval_models:Evaluating for cohere on RedditClustering


Clustering: 100%|██████████| 25/25 [00:25<00:00,  1.02s/it]


Creating model cohere-pca for task TwitterSemEval2015
Loading cohere from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for cohere on TwitterSemEval2015


INFO:eval_models:Evaluating for cohere on AskUbuntuDupQuestions


Creating model cohere-pca for task AskUbuntuDupQuestions
Loading cohere from cache for AskUbuntuDupQuestions...


INFO:eval_models:Evaluating for cohere on ArguAna


Creating model cohere-pca for task ArguAna
Loading cohere from cache for ArguAna...


INFO:eval_models:Evaluating for cohere on SciFact


Creating model cohere-pca for task SciFact
Loading cohere from cache for SciFact...


INFO:eval_models:Evaluating for cohere on SICK-R


Creating model cohere-pca for task SICK-R
Loading cohere from cache for SICK-R...


INFO:eval_models:Evaluating for cohere on STS12


Creating model cohere-pca for task STS12
Loading cohere from cache for STS12...


INFO:eval_models:Evaluating for cohere on STS13


Creating model cohere-pca for task STS13
Loading cohere from cache for STS13...


INFO:eval_models:Evaluating for cohere on STS14


Creating model cohere-pca for task STS14
Loading cohere from cache for STS14...


INFO:eval_models:Evaluating for cohere on STS15


Creating model cohere-pca for task STS15
Loading cohere from cache for STS15...


INFO:eval_models:Evaluating for cohere on STS16


Creating model cohere-pca for task STS16
Loading cohere from cache for STS16...


INFO:eval_models:Evaluating for cohere on STS17


Creating model cohere-pca for task STS17
Loading cohere from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for cohere on STS22


Creating model cohere-pca for task STS22
Loading cohere from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for cohere on STSBenchmark


Creating model cohere-pca for task STSBenchmark
Loading cohere from cache for STSBenchmark...


INFO:eval_models:Converting results to csv for cohere


Using model name cohere
Converting results_pca/512/cohere to results_pca/512/cohere_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClusteringS2S - test

INFO:eval_models:Finished evaluating cohere for all tasks


INFO:eval_models:Starting evaluation for gist
INFO:eval_models:Evaluating for gist on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model gist-pca for task AmazonCounterfactualClassification
Loading gist from cache for AmazonCounterfactualClassification...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.
INFO:eval_models:Evaluating for gist on Banking77Classification


Creating model gist-pca for task Banking77Classification
Loading gist from cache for Banking77Classification...


INFO:eval_models:Evaluating for gist on EmotionClassification


Creating model gist-pca for task EmotionClassification
Loading gist from cache for EmotionClassification...


Repo card metadata block was not found. Setting CardData to empty.


Creating model gist-pca for task ArxivClusteringS2S
Loading gist from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for gist on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [00:44<00:00,  1.44s/it]


Creating model gist-pca for task RedditClustering
Loading gist from cache for RedditClustering...


INFO:eval_models:Evaluating for gist on RedditClustering


Clustering: 100%|██████████| 25/25 [00:24<00:00,  1.02it/s]


Creating model gist-pca for task TwitterSemEval2015
Loading gist from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for gist on TwitterSemEval2015


INFO:eval_models:Evaluating for gist on AskUbuntuDupQuestions


Creating model gist-pca for task AskUbuntuDupQuestions
Loading gist from cache for AskUbuntuDupQuestions...


INFO:eval_models:Evaluating for gist on ArguAna


Creating model gist-pca for task ArguAna
Loading gist from cache for ArguAna...


INFO:eval_models:Evaluating for gist on SciFact


Creating model gist-pca for task SciFact
Loading gist from cache for SciFact...


INFO:eval_models:Evaluating for gist on SICK-R


Creating model gist-pca for task SICK-R
Loading gist from cache for SICK-R...


INFO:eval_models:Evaluating for gist on STS12


Creating model gist-pca for task STS12
Loading gist from cache for STS12...


INFO:eval_models:Evaluating for gist on STS13


Creating model gist-pca for task STS13
Loading gist from cache for STS13...


INFO:eval_models:Evaluating for gist on STS14


Creating model gist-pca for task STS14
Loading gist from cache for STS14...


INFO:eval_models:Evaluating for gist on STS15


Creating model gist-pca for task STS15
Loading gist from cache for STS15...


INFO:eval_models:Evaluating for gist on STS16


Creating model gist-pca for task STS16
Loading gist from cache for STS16...


INFO:eval_models:Evaluating for gist on STS17


Creating model gist-pca for task STS17
Loading gist from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for gist on STS22


Creating model gist-pca for task STS22
Loading gist from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for gist on STSBenchmark


Creating model gist-pca for task STSBenchmark
Loading gist from cache for STSBenchmark...


INFO:eval_models:Converting results to csv for gist


Using model name gist
Converting results_pca/512/gist to results_pca/512/gist_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClusteringS2S - test set n

INFO:eval_models:Finished evaluating gist for all tasks


INFO:eval_models:Starting evaluation for llmrails
INFO:eval_models:Evaluating for llmrails on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model llmrails-pca for task AmazonCounterfactualClassification
Loading llmrails from cache for AmazonCounterfactualClassification...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.
INFO:eval_models:Evaluating for llmrails on Banking77Classification


Creating model llmrails-pca for task Banking77Classification
Loading llmrails from cache for Banking77Classification...


INFO:eval_models:Evaluating for llmrails on EmotionClassification


Creating model llmrails-pca for task EmotionClassification
Loading llmrails from cache for EmotionClassification...


Repo card metadata block was not found. Setting CardData to empty.


Creating model llmrails-pca for task ArxivClusteringS2S
Loading llmrails from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for llmrails on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [00:45<00:00,  1.48s/it]


Creating model llmrails-pca for task RedditClustering
Loading llmrails from cache for RedditClustering...


INFO:eval_models:Evaluating for llmrails on RedditClustering


Clustering: 100%|██████████| 25/25 [00:24<00:00,  1.03it/s]
INFO:eval_models:Evaluating for llmrails on TwitterSemEval2015


Creating model llmrails-pca for task TwitterSemEval2015
Loading llmrails from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for llmrails on AskUbuntuDupQuestions


Creating model llmrails-pca for task AskUbuntuDupQuestions
Loading llmrails from cache for AskUbuntuDupQuestions...


INFO:eval_models:Evaluating for llmrails on ArguAna


Creating model llmrails-pca for task ArguAna
Loading llmrails from cache for ArguAna...


INFO:eval_models:Evaluating for llmrails on SciFact


Creating model llmrails-pca for task SciFact
Loading llmrails from cache for SciFact...


INFO:eval_models:Evaluating for llmrails on SICK-R


Creating model llmrails-pca for task SICK-R
Loading llmrails from cache for SICK-R...


INFO:eval_models:Evaluating for llmrails on STS12


Creating model llmrails-pca for task STS12
Loading llmrails from cache for STS12...


INFO:eval_models:Evaluating for llmrails on STS13


Creating model llmrails-pca for task STS13
Loading llmrails from cache for STS13...


INFO:eval_models:Evaluating for llmrails on STS14


Creating model llmrails-pca for task STS14
Loading llmrails from cache for STS14...


INFO:eval_models:Evaluating for llmrails on STS15


Creating model llmrails-pca for task STS15
Loading llmrails from cache for STS15...


INFO:eval_models:Evaluating for llmrails on STS16


Creating model llmrails-pca for task STS16
Loading llmrails from cache for STS16...


INFO:eval_models:Evaluating for llmrails on STS17


Creating model llmrails-pca for task STS17
Loading llmrails from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for llmrails on STS22


Creating model llmrails-pca for task STS22
Loading llmrails from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for llmrails on STSBenchmark


Creating model llmrails-pca for task STSBenchmark
Loading llmrails from cache for STSBenchmark...


INFO:eval_models:Converting results to csv for llmrails


Using model name llmrails
Converting results_pca/512/llmrails to results_pca/512/llmrails_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClusteringS2S 

INFO:eval_models:Finished evaluating llmrails for all tasks


INFO:eval_models:Starting evaluation for voyage
INFO:eval_models:Evaluating for voyage on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model voyage-pca for task AmazonCounterfactualClassification
Loading voyage from cache for AmazonCounterfactualClassification...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.
INFO:eval_models:Evaluating for voyage on Banking77Classification


Creating model voyage-pca for task Banking77Classification
Loading voyage from cache for Banking77Classification...


INFO:eval_models:Evaluating for voyage on EmotionClassification


Creating model voyage-pca for task EmotionClassification
Loading voyage from cache for EmotionClassification...


Repo card metadata block was not found. Setting CardData to empty.


Creating model voyage-pca for task ArxivClusteringS2S
Loading voyage from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for voyage on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [00:45<00:00,  1.46s/it]


Creating model voyage-pca for task RedditClustering
Loading voyage from cache for RedditClustering...


INFO:eval_models:Evaluating for voyage on RedditClustering


Clustering: 100%|██████████| 25/25 [00:25<00:00,  1.03s/it]


Creating model voyage-pca for task TwitterSemEval2015
Loading voyage from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for voyage on TwitterSemEval2015


INFO:eval_models:Evaluating for voyage on AskUbuntuDupQuestions


Creating model voyage-pca for task AskUbuntuDupQuestions
Loading voyage from cache for AskUbuntuDupQuestions...


INFO:eval_models:Evaluating for voyage on ArguAna


Creating model voyage-pca for task ArguAna
Loading voyage from cache for ArguAna...


INFO:eval_models:Evaluating for voyage on SciFact


Creating model voyage-pca for task SciFact
Loading voyage from cache for SciFact...


INFO:eval_models:Evaluating for voyage on SICK-R


Creating model voyage-pca for task SICK-R
Loading voyage from cache for SICK-R...


INFO:eval_models:Evaluating for voyage on STS12


Creating model voyage-pca for task STS12
Loading voyage from cache for STS12...


INFO:eval_models:Evaluating for voyage on STS13


Creating model voyage-pca for task STS13
Loading voyage from cache for STS13...


INFO:eval_models:Evaluating for voyage on STS14


Creating model voyage-pca for task STS14
Loading voyage from cache for STS14...


INFO:eval_models:Evaluating for voyage on STS15


Creating model voyage-pca for task STS15
Loading voyage from cache for STS15...


INFO:eval_models:Evaluating for voyage on STS16


Creating model voyage-pca for task STS16
Loading voyage from cache for STS16...


INFO:eval_models:Evaluating for voyage on STS17


Creating model voyage-pca for task STS17
Loading voyage from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for voyage on STS22


Creating model voyage-pca for task STS22
Loading voyage from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for voyage on STSBenchmark


Creating model voyage-pca for task STSBenchmark
Loading voyage from cache for STSBenchmark...


INFO:eval_models:Converting results to csv for voyage


Using model name voyage
Converting results_pca/512/voyage to results_pca/512/voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClusteringS2S - test

INFO:eval_models:Finished evaluating voyage for all tasks


INFO:eval_models:Starting evaluation for angle$cohere
INFO:eval_models:Evaluating for angle$cohere on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model angle$cohere-pca for task AmazonCounterfactualClassification
Loading angle from cache for AmazonCounterfactualClassification...
Loading cohere from

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.


Creating model angle$cohere-pca for task Banking77Classification
Loading angle from cache for Banking77Classification...
Loading cohere from cache for Banking77Classification...


INFO:eval_models:Evaluating for angle$cohere on Banking77Classification


Creating model angle$cohere-pca for task EmotionClassification
Loading angle from cache for EmotionClassification...
Loading cohere from cache for EmotionClassification...


INFO:eval_models:Evaluating for angle$cohere on EmotionClassification


Repo card metadata block was not found. Setting CardData to empty.


Creating model angle$cohere-pca for task ArxivClusteringS2S
Loading angle from cache for ArxivClusteringS2S...
Loading cohere from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for angle$cohere on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [01:05<00:00,  2.12s/it]


Creating model angle$cohere-pca for task RedditClustering
Loading angle from cache for RedditClustering...
Loading cohere from cache for RedditClustering...


INFO:eval_models:Evaluating for angle$cohere on RedditClustering


Clustering: 100%|██████████| 25/25 [00:36<00:00,  1.45s/it]


Creating model angle$cohere-pca for task TwitterSemEval2015
Loading angle from cache for TwitterSemEval2015...
Loading cohere from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for angle$cohere on TwitterSemEval2015


INFO:eval_models:Evaluating for angle$cohere on AskUbuntuDupQuestions


Creating model angle$cohere-pca for task AskUbuntuDupQuestions
Loading angle from cache for AskUbuntuDupQuestions...
Loading cohere from cache for AskUbuntuDupQuestions...


INFO:eval_models:Evaluating for angle$cohere on ArguAna


Creating model angle$cohere-pca for task ArguAna
Loading angle from cache for ArguAna...
Loading cohere from cache for ArguAna...


INFO:eval_models:Evaluating for angle$cohere on SciFact


Creating model angle$cohere-pca for task SciFact
Loading angle from cache for SciFact...
Loading cohere from cache for SciFact...


Creating model angle$cohere-pca for task SICK-R
Loading angle from cache for SICK-R...
Loading cohere from cache for SICK-R...


INFO:eval_models:Evaluating for angle$cohere on SICK-R


INFO:eval_models:Evaluating for angle$cohere on STS12


Creating model angle$cohere-pca for task STS12
Loading angle from cache for STS12...
Loading cohere from cache for STS12...


INFO:eval_models:Evaluating for angle$cohere on STS13


Creating model angle$cohere-pca for task STS13
Loading angle from cache for STS13...
Loading cohere from cache for STS13...


INFO:eval_models:Evaluating for angle$cohere on STS14


Creating model angle$cohere-pca for task STS14
Loading angle from cache for STS14...
Loading cohere from cache for STS14...


INFO:eval_models:Evaluating for angle$cohere on STS15


Creating model angle$cohere-pca for task STS15
Loading angle from cache for STS15...
Loading cohere from cache for STS15...


INFO:eval_models:Evaluating for angle$cohere on STS16


Creating model angle$cohere-pca for task STS16
Loading angle from cache for STS16...
Loading cohere from cache for STS16...


INFO:eval_models:Evaluating for angle$cohere on STS17


Creating model angle$cohere-pca for task STS17
Loading angle from cache for STS17...
Loading cohere from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for angle$cohere on STS22


Creating model angle$cohere-pca for task STS22
Loading angle from cache for STS22...
Loading cohere from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Creating model angle$cohere-pca for task STSBenchmark
Loading angle from cache for STSBenchmark...
Loading cohere from cache for STSBenchmark...


INFO:eval_models:Evaluating for angle$cohere on STSBenchmark


INFO:eval_models:Converting results to csv for angle$cohere


Using model name angle$cohere
Converting results_pca/512/angle$cohere to results_pca/512/angle$cohere_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivCl

INFO:eval_models:Finished evaluating angle$cohere for all tasks


INFO:eval_models:Starting evaluation for angle$gist
INFO:eval_models:Evaluating for angle$gist on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model angle$gist-pca for task AmazonCounterfactualClassification
Loading angle from cache for AmazonCounterfactualClassification...
Loading gist from cac

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.
INFO:eval_models:Evaluating for angle$gist on Banking77Classification


Creating model angle$gist-pca for task Banking77Classification
Loading angle from cache for Banking77Classification...
Loading gist from cache for Banking77Classification...


Creating model angle$gist-pca for task EmotionClassification
Loading angle from cache for EmotionClassification...
Loading gist from cache for EmotionClassification...


INFO:eval_models:Evaluating for angle$gist on EmotionClassification


Repo card metadata block was not found. Setting CardData to empty.


Creating model angle$gist-pca for task ArxivClusteringS2S
Loading angle from cache for ArxivClusteringS2S...
Loading gist from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for angle$gist on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [00:59<00:00,  1.93s/it]


Creating model angle$gist-pca for task RedditClustering
Loading angle from cache for RedditClustering...
Loading gist from cache for RedditClustering...


INFO:eval_models:Evaluating for angle$gist on RedditClustering


Clustering: 100%|██████████| 25/25 [00:32<00:00,  1.31s/it]


Creating model angle$gist-pca for task TwitterSemEval2015
Loading angle from cache for TwitterSemEval2015...
Loading gist from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for angle$gist on TwitterSemEval2015


INFO:eval_models:Evaluating for angle$gist on AskUbuntuDupQuestions


Creating model angle$gist-pca for task AskUbuntuDupQuestions
Loading angle from cache for AskUbuntuDupQuestions...
Loading gist from cache for AskUbuntuDupQuestions...


INFO:eval_models:Evaluating for angle$gist on ArguAna


Creating model angle$gist-pca for task ArguAna
Loading angle from cache for ArguAna...
Loading gist from cache for ArguAna...


INFO:eval_models:Evaluating for angle$gist on SciFact


Creating model angle$gist-pca for task SciFact
Loading angle from cache for SciFact...
Loading gist from cache for SciFact...


Creating model angle$gist-pca for task SICK-R
Loading angle from cache for SICK-R...
Loading gist from cache for SICK-R...


INFO:eval_models:Evaluating for angle$gist on SICK-R


INFO:eval_models:Evaluating for angle$gist on STS12


Creating model angle$gist-pca for task STS12
Loading angle from cache for STS12...
Loading gist from cache for STS12...


INFO:eval_models:Evaluating for angle$gist on STS13


Creating model angle$gist-pca for task STS13
Loading angle from cache for STS13...
Loading gist from cache for STS13...


INFO:eval_models:Evaluating for angle$gist on STS14


Creating model angle$gist-pca for task STS14
Loading angle from cache for STS14...
Loading gist from cache for STS14...


INFO:eval_models:Evaluating for angle$gist on STS15


Creating model angle$gist-pca for task STS15
Loading angle from cache for STS15...
Loading gist from cache for STS15...


INFO:eval_models:Evaluating for angle$gist on STS16


Creating model angle$gist-pca for task STS16
Loading angle from cache for STS16...
Loading gist from cache for STS16...


INFO:eval_models:Evaluating for angle$gist on STS17


Creating model angle$gist-pca for task STS17
Loading angle from cache for STS17...
Loading gist from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for angle$gist on STS22


Creating model angle$gist-pca for task STS22
Loading angle from cache for STS22...
Loading gist from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Creating model angle$gist-pca for task STSBenchmark
Loading angle from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...


INFO:eval_models:Evaluating for angle$gist on STSBenchmark


INFO:eval_models:Converting results to csv for angle$gist


Using model name angle$gist
Converting results_pca/512/angle$gist to results_pca/512/angle$gist_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClusteri

INFO:eval_models:Finished evaluating angle$gist for all tasks


INFO:eval_models:Starting evaluation for angle$llmrails
INFO:eval_models:Evaluating for angle$llmrails on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model angle$llmrails-pca for task AmazonCounterfactualClassification
Loading angle from cache for AmazonCounterfactualClassification...
Loading llmrails 

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.
INFO:eval_models:Evaluating for angle$llmrails on Banking77Classification


Creating model angle$llmrails-pca for task Banking77Classification
Loading angle from cache for Banking77Classification...
Loading llmrails from cache for Banking77Classification...


Creating model angle$llmrails-pca for task EmotionClassification
Loading angle from cache for EmotionClassification...
Loading llmrails from cache for EmotionClassification...


INFO:eval_models:Evaluating for angle$llmrails on EmotionClassification


Repo card metadata block was not found. Setting CardData to empty.


Creating model angle$llmrails-pca for task ArxivClusteringS2S
Loading angle from cache for ArxivClusteringS2S...
Loading llmrails from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for angle$llmrails on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [01:00<00:00,  1.95s/it]


Creating model angle$llmrails-pca for task RedditClustering
Loading angle from cache for RedditClustering...
Loading llmrails from cache for RedditClustering...


INFO:eval_models:Evaluating for angle$llmrails on RedditClustering


Clustering: 100%|██████████| 25/25 [00:31<00:00,  1.26s/it]


Creating model angle$llmrails-pca for task TwitterSemEval2015
Loading angle from cache for TwitterSemEval2015...
Loading llmrails from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for angle$llmrails on TwitterSemEval2015


INFO:eval_models:Evaluating for angle$llmrails on AskUbuntuDupQuestions


Creating model angle$llmrails-pca for task AskUbuntuDupQuestions
Loading angle from cache for AskUbuntuDupQuestions...
Loading llmrails from cache for AskUbuntuDupQuestions...


INFO:eval_models:Evaluating for angle$llmrails on ArguAna


Creating model angle$llmrails-pca for task ArguAna
Loading angle from cache for ArguAna...
Loading llmrails from cache for ArguAna...


INFO:eval_models:Evaluating for angle$llmrails on SciFact


Creating model angle$llmrails-pca for task SciFact
Loading angle from cache for SciFact...
Loading llmrails from cache for SciFact...


Creating model angle$llmrails-pca for task SICK-R
Loading angle from cache for SICK-R...
Loading llmrails from cache for SICK-R...


INFO:eval_models:Evaluating for angle$llmrails on SICK-R


INFO:eval_models:Evaluating for angle$llmrails on STS12


Creating model angle$llmrails-pca for task STS12
Loading angle from cache for STS12...
Loading llmrails from cache for STS12...


INFO:eval_models:Evaluating for angle$llmrails on STS13


Creating model angle$llmrails-pca for task STS13
Loading angle from cache for STS13...
Loading llmrails from cache for STS13...


INFO:eval_models:Evaluating for angle$llmrails on STS14


Creating model angle$llmrails-pca for task STS14
Loading angle from cache for STS14...
Loading llmrails from cache for STS14...


INFO:eval_models:Evaluating for angle$llmrails on STS15


Creating model angle$llmrails-pca for task STS15
Loading angle from cache for STS15...
Loading llmrails from cache for STS15...


INFO:eval_models:Evaluating for angle$llmrails on STS16


Creating model angle$llmrails-pca for task STS16
Loading angle from cache for STS16...
Loading llmrails from cache for STS16...


INFO:eval_models:Evaluating for angle$llmrails on STS17


Creating model angle$llmrails-pca for task STS17
Loading angle from cache for STS17...
Loading llmrails from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for angle$llmrails on STS22


Creating model angle$llmrails-pca for task STS22
Loading angle from cache for STS22...
Loading llmrails from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Creating model angle$llmrails-pca for task STSBenchmark
Loading angle from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...


INFO:eval_models:Evaluating for angle$llmrails on STSBenchmark


INFO:eval_models:Converting results to csv for angle$llmrails


Using model name angle$llmrails
Converting results_pca/512/angle$llmrails to results_pca/512/angle$llmrails_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
Med

INFO:eval_models:Finished evaluating angle$llmrails for all tasks


INFO:eval_models:Starting evaluation for angle$voyage
INFO:eval_models:Evaluating for angle$voyage on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model angle$voyage-pca for task AmazonCounterfactualClassification
Loading angle from cache for AmazonCounterfactualClassification...
Loading voyage from

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.
INFO:eval_models:Evaluating for angle$voyage on Banking77Classification


Creating model angle$voyage-pca for task Banking77Classification
Loading angle from cache for Banking77Classification...
Loading voyage from cache for Banking77Classification...


Creating model angle$voyage-pca for task EmotionClassification
Loading angle from cache for EmotionClassification...
Loading voyage from cache for EmotionClassification...


INFO:eval_models:Evaluating for angle$voyage on EmotionClassification


Repo card metadata block was not found. Setting CardData to empty.


Creating model angle$voyage-pca for task ArxivClusteringS2S
Loading angle from cache for ArxivClusteringS2S...
Loading voyage from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for angle$voyage on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [01:08<00:00,  2.21s/it]


Creating model angle$voyage-pca for task RedditClustering
Loading angle from cache for RedditClustering...
Loading voyage from cache for RedditClustering...


INFO:eval_models:Evaluating for angle$voyage on RedditClustering


Clustering: 100%|██████████| 25/25 [00:37<00:00,  1.48s/it]


Creating model angle$voyage-pca for task TwitterSemEval2015
Loading angle from cache for TwitterSemEval2015...
Loading voyage from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for angle$voyage on TwitterSemEval2015


INFO:eval_models:Evaluating for angle$voyage on AskUbuntuDupQuestions


Creating model angle$voyage-pca for task AskUbuntuDupQuestions
Loading angle from cache for AskUbuntuDupQuestions...
Loading voyage from cache for AskUbuntuDupQuestions...


Creating model angle$voyage-pca for task ArguAna
Loading angle from cache for ArguAna...
Loading voyage from cache for ArguAna...


INFO:eval_models:Evaluating for angle$voyage on ArguAna


INFO:eval_models:Evaluating for angle$voyage on SciFact


Creating model angle$voyage-pca for task SciFact
Loading angle from cache for SciFact...
Loading voyage from cache for SciFact...


Creating model angle$voyage-pca for task SICK-R
Loading angle from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:eval_models:Evaluating for angle$voyage on SICK-R


INFO:eval_models:Evaluating for angle$voyage on STS12


Creating model angle$voyage-pca for task STS12
Loading angle from cache for STS12...
Loading voyage from cache for STS12...


INFO:eval_models:Evaluating for angle$voyage on STS13


Creating model angle$voyage-pca for task STS13
Loading angle from cache for STS13...
Loading voyage from cache for STS13...


INFO:eval_models:Evaluating for angle$voyage on STS14


Creating model angle$voyage-pca for task STS14
Loading angle from cache for STS14...
Loading voyage from cache for STS14...


INFO:eval_models:Evaluating for angle$voyage on STS15


Creating model angle$voyage-pca for task STS15
Loading angle from cache for STS15...
Loading voyage from cache for STS15...


INFO:eval_models:Evaluating for angle$voyage on STS16


Creating model angle$voyage-pca for task STS16
Loading angle from cache for STS16...
Loading voyage from cache for STS16...


INFO:eval_models:Evaluating for angle$voyage on STS17


Creating model angle$voyage-pca for task STS17
Loading angle from cache for STS17...
Loading voyage from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for angle$voyage on STS22


Creating model angle$voyage-pca for task STS22
Loading angle from cache for STS22...
Loading voyage from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Creating model angle$voyage-pca for task STSBenchmark
Loading angle from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:eval_models:Evaluating for angle$voyage on STSBenchmark


INFO:eval_models:Converting results to csv for angle$voyage


Using model name angle$voyage
Converting results_pca/512/angle$voyage to results_pca/512/angle$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivCl

INFO:eval_models:Finished evaluating angle$voyage for all tasks


INFO:eval_models:Starting evaluation for cohere$gist
INFO:eval_models:Evaluating for cohere$gist on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model cohere$gist-pca for task AmazonCounterfactualClassification
Loading cohere from cache for AmazonCounterfactualClassification...
Loading gist from c

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.


Creating model cohere$gist-pca for task Banking77Classification
Loading cohere from cache for Banking77Classification...
Loading gist from cache for Banking77Classification...


INFO:eval_models:Evaluating for cohere$gist on Banking77Classification


Creating model cohere$gist-pca for task EmotionClassification
Loading cohere from cache for EmotionClassification...
Loading gist from cache for EmotionClassification...


INFO:eval_models:Evaluating for cohere$gist on EmotionClassification


Repo card metadata block was not found. Setting CardData to empty.


Creating model cohere$gist-pca for task ArxivClusteringS2S
Loading cohere from cache for ArxivClusteringS2S...
Loading gist from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for cohere$gist on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [01:06<00:00,  2.16s/it]


Creating model cohere$gist-pca for task RedditClustering
Loading cohere from cache for RedditClustering...
Loading gist from cache for RedditClustering...


INFO:eval_models:Evaluating for cohere$gist on RedditClustering


Clustering: 100%|██████████| 25/25 [00:37<00:00,  1.48s/it]


Creating model cohere$gist-pca for task TwitterSemEval2015
Loading cohere from cache for TwitterSemEval2015...
Loading gist from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for cohere$gist on TwitterSemEval2015


Creating model cohere$gist-pca for task AskUbuntuDupQuestions
Loading cohere from cache for AskUbuntuDupQuestions...
Loading gist from cache for AskUbuntuDupQuestions...


INFO:eval_models:Evaluating for cohere$gist on AskUbuntuDupQuestions


Creating model cohere$gist-pca for task ArguAna
Loading cohere from cache for ArguAna...
Loading gist from cache for ArguAna...


INFO:eval_models:Evaluating for cohere$gist on ArguAna


INFO:eval_models:Evaluating for cohere$gist on SciFact


Creating model cohere$gist-pca for task SciFact
Loading cohere from cache for SciFact...
Loading gist from cache for SciFact...


Creating model cohere$gist-pca for task SICK-R
Loading cohere from cache for SICK-R...
Loading gist from cache for SICK-R...


INFO:eval_models:Evaluating for cohere$gist on SICK-R


INFO:eval_models:Evaluating for cohere$gist on STS12


Creating model cohere$gist-pca for task STS12
Loading cohere from cache for STS12...
Loading gist from cache for STS12...


INFO:eval_models:Evaluating for cohere$gist on STS13


Creating model cohere$gist-pca for task STS13
Loading cohere from cache for STS13...
Loading gist from cache for STS13...


INFO:eval_models:Evaluating for cohere$gist on STS14


Creating model cohere$gist-pca for task STS14
Loading cohere from cache for STS14...
Loading gist from cache for STS14...


INFO:eval_models:Evaluating for cohere$gist on STS15


Creating model cohere$gist-pca for task STS15
Loading cohere from cache for STS15...
Loading gist from cache for STS15...


INFO:eval_models:Evaluating for cohere$gist on STS16


Creating model cohere$gist-pca for task STS16
Loading cohere from cache for STS16...
Loading gist from cache for STS16...


INFO:eval_models:Evaluating for cohere$gist on STS17


Creating model cohere$gist-pca for task STS17
Loading cohere from cache for STS17...
Loading gist from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for cohere$gist on STS22


Creating model cohere$gist-pca for task STS22
Loading cohere from cache for STS22...
Loading gist from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Creating model cohere$gist-pca for task STSBenchmark
Loading cohere from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...


INFO:eval_models:Evaluating for cohere$gist on STSBenchmark


INFO:eval_models:Converting results to csv for cohere$gist


Using model name cohere$gist
Converting results_pca/512/cohere$gist to results_pca/512/cohere$gist_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClust

INFO:eval_models:Finished evaluating cohere$gist for all tasks


INFO:eval_models:Starting evaluation for cohere$llmrails
INFO:eval_models:Evaluating for cohere$llmrails on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model cohere$llmrails-pca for task AmazonCounterfactualClassification
Loading cohere from cache for AmazonCounterfactualClassification...
Loading llmrail

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.


Creating model cohere$llmrails-pca for task Banking77Classification
Loading cohere from cache for Banking77Classification...
Loading llmrails from cache for Banking77Classification...


INFO:eval_models:Evaluating for cohere$llmrails on Banking77Classification


Creating model cohere$llmrails-pca for task EmotionClassification
Loading cohere from cache for EmotionClassification...
Loading llmrails from cache for EmotionClassification...


INFO:eval_models:Evaluating for cohere$llmrails on EmotionClassification


Repo card metadata block was not found. Setting CardData to empty.


Creating model cohere$llmrails-pca for task ArxivClusteringS2S
Loading cohere from cache for ArxivClusteringS2S...
Loading llmrails from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for cohere$llmrails on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [01:05<00:00,  2.13s/it]


Creating model cohere$llmrails-pca for task RedditClustering
Loading cohere from cache for RedditClustering...
Loading llmrails from cache for RedditClustering...


INFO:eval_models:Evaluating for cohere$llmrails on RedditClustering


Clustering: 100%|██████████| 25/25 [00:38<00:00,  1.54s/it]


Creating model cohere$llmrails-pca for task TwitterSemEval2015
Loading cohere from cache for TwitterSemEval2015...
Loading llmrails from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for cohere$llmrails on TwitterSemEval2015


INFO:eval_models:Evaluating for cohere$llmrails on AskUbuntuDupQuestions


Creating model cohere$llmrails-pca for task AskUbuntuDupQuestions
Loading cohere from cache for AskUbuntuDupQuestions...
Loading llmrails from cache for AskUbuntuDupQuestions...


INFO:eval_models:Evaluating for cohere$llmrails on ArguAna


Creating model cohere$llmrails-pca for task ArguAna
Loading cohere from cache for ArguAna...
Loading llmrails from cache for ArguAna...


INFO:eval_models:Evaluating for cohere$llmrails on SciFact


Creating model cohere$llmrails-pca for task SciFact
Loading cohere from cache for SciFact...
Loading llmrails from cache for SciFact...


Creating model cohere$llmrails-pca for task SICK-R
Loading cohere from cache for SICK-R...
Loading llmrails from cache for SICK-R...


INFO:eval_models:Evaluating for cohere$llmrails on SICK-R


INFO:eval_models:Evaluating for cohere$llmrails on STS12


Creating model cohere$llmrails-pca for task STS12
Loading cohere from cache for STS12...
Loading llmrails from cache for STS12...


INFO:eval_models:Evaluating for cohere$llmrails on STS13


Creating model cohere$llmrails-pca for task STS13
Loading cohere from cache for STS13...
Loading llmrails from cache for STS13...


INFO:eval_models:Evaluating for cohere$llmrails on STS14


Creating model cohere$llmrails-pca for task STS14
Loading cohere from cache for STS14...
Loading llmrails from cache for STS14...


INFO:eval_models:Evaluating for cohere$llmrails on STS15


Creating model cohere$llmrails-pca for task STS15
Loading cohere from cache for STS15...
Loading llmrails from cache for STS15...


INFO:eval_models:Evaluating for cohere$llmrails on STS16


Creating model cohere$llmrails-pca for task STS16
Loading cohere from cache for STS16...
Loading llmrails from cache for STS16...


INFO:eval_models:Evaluating for cohere$llmrails on STS17


Creating model cohere$llmrails-pca for task STS17
Loading cohere from cache for STS17...
Loading llmrails from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for cohere$llmrails on STS22


Creating model cohere$llmrails-pca for task STS22
Loading cohere from cache for STS22...
Loading llmrails from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Creating model cohere$llmrails-pca for task STSBenchmark
Loading cohere from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...


INFO:eval_models:Evaluating for cohere$llmrails on STSBenchmark


INFO:eval_models:Converting results to csv for cohere$llmrails


Using model name cohere$llmrails
Converting results_pca/512/cohere$llmrails to results_pca/512/cohere$llmrails_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found


INFO:eval_models:Finished evaluating cohere$llmrails for all tasks


INFO:eval_models:Starting evaluation for cohere$voyage
INFO:eval_models:Evaluating for cohere$voyage on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model cohere$voyage-pca for task AmazonCounterfactualClassification
Loading cohere from cache for AmazonCounterfactualClassification...
Loading voyage fr

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.


Creating model cohere$voyage-pca for task Banking77Classification
Loading cohere from cache for Banking77Classification...
Loading voyage from cache for Banking77Classification...


INFO:eval_models:Evaluating for cohere$voyage on Banking77Classification


Creating model cohere$voyage-pca for task EmotionClassification
Loading cohere from cache for EmotionClassification...
Loading voyage from cache for EmotionClassification...


INFO:eval_models:Evaluating for cohere$voyage on EmotionClassification


Repo card metadata block was not found. Setting CardData to empty.


Creating model cohere$voyage-pca for task ArxivClusteringS2S
Loading cohere from cache for ArxivClusteringS2S...
Loading voyage from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for cohere$voyage on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [01:17<00:00,  2.49s/it]


Creating model cohere$voyage-pca for task RedditClustering
Loading cohere from cache for RedditClustering...
Loading voyage from cache for RedditClustering...


INFO:eval_models:Evaluating for cohere$voyage on RedditClustering


Clustering: 100%|██████████| 25/25 [00:39<00:00,  1.58s/it]


Creating model cohere$voyage-pca for task TwitterSemEval2015
Loading cohere from cache for TwitterSemEval2015...
Loading voyage from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for cohere$voyage on TwitterSemEval2015


INFO:eval_models:Evaluating for cohere$voyage on AskUbuntuDupQuestions


Creating model cohere$voyage-pca for task AskUbuntuDupQuestions
Loading cohere from cache for AskUbuntuDupQuestions...
Loading voyage from cache for AskUbuntuDupQuestions...


Creating model cohere$voyage-pca for task ArguAna
Loading cohere from cache for ArguAna...
Loading voyage from cache for ArguAna...


INFO:eval_models:Evaluating for cohere$voyage on ArguAna


INFO:eval_models:Evaluating for cohere$voyage on SciFact


Creating model cohere$voyage-pca for task SciFact
Loading cohere from cache for SciFact...
Loading voyage from cache for SciFact...


Creating model cohere$voyage-pca for task SICK-R
Loading cohere from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:eval_models:Evaluating for cohere$voyage on SICK-R


Creating model cohere$voyage-pca for task STS12
Loading cohere from cache for STS12...
Loading voyage from cache for STS12...


INFO:eval_models:Evaluating for cohere$voyage on STS12


INFO:eval_models:Evaluating for cohere$voyage on STS13


Creating model cohere$voyage-pca for task STS13
Loading cohere from cache for STS13...
Loading voyage from cache for STS13...


INFO:eval_models:Evaluating for cohere$voyage on STS14


Creating model cohere$voyage-pca for task STS14
Loading cohere from cache for STS14...
Loading voyage from cache for STS14...


INFO:eval_models:Evaluating for cohere$voyage on STS15


Creating model cohere$voyage-pca for task STS15
Loading cohere from cache for STS15...
Loading voyage from cache for STS15...


INFO:eval_models:Evaluating for cohere$voyage on STS16


Creating model cohere$voyage-pca for task STS16
Loading cohere from cache for STS16...
Loading voyage from cache for STS16...


INFO:eval_models:Evaluating for cohere$voyage on STS17


Creating model cohere$voyage-pca for task STS17
Loading cohere from cache for STS17...
Loading voyage from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for cohere$voyage on STS22


Creating model cohere$voyage-pca for task STS22
Loading cohere from cache for STS22...
Loading voyage from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Creating model cohere$voyage-pca for task STSBenchmark
Loading cohere from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:eval_models:Evaluating for cohere$voyage on STSBenchmark


INFO:eval_models:Converting results to csv for cohere$voyage


Using model name cohere$voyage
Converting results_pca/512/cohere$voyage to results_pca/512/cohere$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
Medrxi

INFO:eval_models:Finished evaluating cohere$voyage for all tasks


INFO:eval_models:Starting evaluation for gist$llmrails
INFO:eval_models:Evaluating for gist$llmrails on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model gist$llmrails-pca for task AmazonCounterfactualClassification
Loading gist from cache for AmazonCounterfactualClassification...
Loading llmrails fr

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.
INFO:eval_models:Evaluating for gist$llmrails on Banking77Classification


Creating model gist$llmrails-pca for task Banking77Classification
Loading gist from cache for Banking77Classification...
Loading llmrails from cache for Banking77Classification...


Creating model gist$llmrails-pca for task EmotionClassification
Loading gist from cache for EmotionClassification...
Loading llmrails from cache for EmotionClassification...


INFO:eval_models:Evaluating for gist$llmrails on EmotionClassification


Repo card metadata block was not found. Setting CardData to empty.


Creating model gist$llmrails-pca for task ArxivClusteringS2S
Loading gist from cache for ArxivClusteringS2S...
Loading llmrails from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for gist$llmrails on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [01:00<00:00,  1.97s/it]


Creating model gist$llmrails-pca for task RedditClustering
Loading gist from cache for RedditClustering...
Loading llmrails from cache for RedditClustering...


INFO:eval_models:Evaluating for gist$llmrails on RedditClustering


Clustering: 100%|██████████| 25/25 [00:32<00:00,  1.30s/it]


Creating model gist$llmrails-pca for task TwitterSemEval2015
Loading gist from cache for TwitterSemEval2015...
Loading llmrails from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for gist$llmrails on TwitterSemEval2015


INFO:eval_models:Evaluating for gist$llmrails on AskUbuntuDupQuestions


Creating model gist$llmrails-pca for task AskUbuntuDupQuestions
Loading gist from cache for AskUbuntuDupQuestions...
Loading llmrails from cache for AskUbuntuDupQuestions...


INFO:eval_models:Evaluating for gist$llmrails on ArguAna


Creating model gist$llmrails-pca for task ArguAna
Loading gist from cache for ArguAna...
Loading llmrails from cache for ArguAna...


INFO:eval_models:Evaluating for gist$llmrails on SciFact


Creating model gist$llmrails-pca for task SciFact
Loading gist from cache for SciFact...
Loading llmrails from cache for SciFact...


Creating model gist$llmrails-pca for task SICK-R
Loading gist from cache for SICK-R...
Loading llmrails from cache for SICK-R...


INFO:eval_models:Evaluating for gist$llmrails on SICK-R


INFO:eval_models:Evaluating for gist$llmrails on STS12


Creating model gist$llmrails-pca for task STS12
Loading gist from cache for STS12...
Loading llmrails from cache for STS12...


INFO:eval_models:Evaluating for gist$llmrails on STS13


Creating model gist$llmrails-pca for task STS13
Loading gist from cache for STS13...
Loading llmrails from cache for STS13...


INFO:eval_models:Evaluating for gist$llmrails on STS14


Creating model gist$llmrails-pca for task STS14
Loading gist from cache for STS14...
Loading llmrails from cache for STS14...


INFO:eval_models:Evaluating for gist$llmrails on STS15


Creating model gist$llmrails-pca for task STS15
Loading gist from cache for STS15...
Loading llmrails from cache for STS15...


INFO:eval_models:Evaluating for gist$llmrails on STS16


Creating model gist$llmrails-pca for task STS16
Loading gist from cache for STS16...
Loading llmrails from cache for STS16...


INFO:eval_models:Evaluating for gist$llmrails on STS17


Creating model gist$llmrails-pca for task STS17
Loading gist from cache for STS17...
Loading llmrails from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for gist$llmrails on STS22


Creating model gist$llmrails-pca for task STS22
Loading gist from cache for STS22...
Loading llmrails from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Creating model gist$llmrails-pca for task STSBenchmark
Loading gist from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...


INFO:eval_models:Evaluating for gist$llmrails on STSBenchmark


INFO:eval_models:Converting results to csv for gist$llmrails


Using model name gist$llmrails
Converting results_pca/512/gist$llmrails to results_pca/512/gist$llmrails_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
Medrxi

INFO:eval_models:Finished evaluating gist$llmrails for all tasks


INFO:eval_models:Starting evaluation for gist$voyage
INFO:eval_models:Evaluating for gist$voyage on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model gist$voyage-pca for task AmazonCounterfactualClassification
Loading gist from cache for AmazonCounterfactualClassification...
Loading voyage from c

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.
INFO:eval_models:Evaluating for gist$voyage on Banking77Classification


Creating model gist$voyage-pca for task Banking77Classification
Loading gist from cache for Banking77Classification...
Loading voyage from cache for Banking77Classification...


Creating model gist$voyage-pca for task EmotionClassification
Loading gist from cache for EmotionClassification...
Loading voyage from cache for EmotionClassification...


INFO:eval_models:Evaluating for gist$voyage on EmotionClassification


Repo card metadata block was not found. Setting CardData to empty.


Creating model gist$voyage-pca for task ArxivClusteringS2S
Loading gist from cache for ArxivClusteringS2S...
Loading voyage from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for gist$voyage on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [01:09<00:00,  2.23s/it]


Creating model gist$voyage-pca for task RedditClustering
Loading gist from cache for RedditClustering...
Loading voyage from cache for RedditClustering...


INFO:eval_models:Evaluating for gist$voyage on RedditClustering


Clustering: 100%|██████████| 25/25 [00:38<00:00,  1.54s/it]


Creating model gist$voyage-pca for task TwitterSemEval2015
Loading gist from cache for TwitterSemEval2015...
Loading voyage from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for gist$voyage on TwitterSemEval2015


INFO:eval_models:Evaluating for gist$voyage on AskUbuntuDupQuestions


Creating model gist$voyage-pca for task AskUbuntuDupQuestions
Loading gist from cache for AskUbuntuDupQuestions...
Loading voyage from cache for AskUbuntuDupQuestions...


Creating model gist$voyage-pca for task ArguAna
Loading gist from cache for ArguAna...
Loading voyage from cache for ArguAna...


INFO:eval_models:Evaluating for gist$voyage on ArguAna


INFO:eval_models:Evaluating for gist$voyage on SciFact


Creating model gist$voyage-pca for task SciFact
Loading gist from cache for SciFact...
Loading voyage from cache for SciFact...


Creating model gist$voyage-pca for task SICK-R
Loading gist from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:eval_models:Evaluating for gist$voyage on SICK-R


INFO:eval_models:Evaluating for gist$voyage on STS12


Creating model gist$voyage-pca for task STS12
Loading gist from cache for STS12...
Loading voyage from cache for STS12...


INFO:eval_models:Evaluating for gist$voyage on STS13


Creating model gist$voyage-pca for task STS13
Loading gist from cache for STS13...
Loading voyage from cache for STS13...


INFO:eval_models:Evaluating for gist$voyage on STS14


Creating model gist$voyage-pca for task STS14
Loading gist from cache for STS14...
Loading voyage from cache for STS14...


INFO:eval_models:Evaluating for gist$voyage on STS15


Creating model gist$voyage-pca for task STS15
Loading gist from cache for STS15...
Loading voyage from cache for STS15...


INFO:eval_models:Evaluating for gist$voyage on STS16


Creating model gist$voyage-pca for task STS16
Loading gist from cache for STS16...
Loading voyage from cache for STS16...


INFO:eval_models:Evaluating for gist$voyage on STS17


Creating model gist$voyage-pca for task STS17
Loading gist from cache for STS17...
Loading voyage from cache for STS17...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:eval_models:Evaluating for gist$voyage on STS22


Creating model gist$voyage-pca for task STS22
Loading gist from cache for STS22...
Loading voyage from cache for STS22...


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Creating model gist$voyage-pca for task STSBenchmark
Loading gist from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:eval_models:Evaluating for gist$voyage on STSBenchmark


INFO:eval_models:Converting results to csv for gist$voyage


Using model name gist$voyage
Converting results_pca/512/gist$voyage to results_pca/512/gist$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
de & accuracy not found for task AmazonCounterfactualClassification.
en-ext & accuracy not found for task AmazonCounterfactualClassification.
ja & accuracy not found for task AmazonCounterfactualClassification.
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClust

INFO:eval_models:Finished evaluating gist$voyage for all tasks


INFO:eval_models:Starting evaluation for llmrails$voyage
INFO:eval_models:Evaluating for llmrails$voyage on AmazonCounterfactualClassification


SummEval - test set not found
Not found: 'BUCC','Tatoeba','AmazonPolarityClassification','AmazonReviewsClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterURLCorpus','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','Touche2020','TRECCOVID','BIOSSES','SummEval' 40
Creating model llmrails$voyage-pca for task AmazonCounterfactualClassification
Loading llmrails from cache for AmazonCounterfactualClassification...
Loading voyag

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.


Creating model llmrails$voyage-pca for task Banking77Classification
Loading llmrails from cache for Banking77Classification...
Loading voyage from cache for Banking77Classification...


INFO:eval_models:Evaluating for llmrails$voyage on Banking77Classification


Creating model llmrails$voyage-pca for task EmotionClassification
Loading llmrails from cache for EmotionClassification...
Loading voyage from cache for EmotionClassification...


INFO:eval_models:Evaluating for llmrails$voyage on EmotionClassification


Repo card metadata block was not found. Setting CardData to empty.


Creating model llmrails$voyage-pca for task ArxivClusteringS2S
Loading llmrails from cache for ArxivClusteringS2S...
Loading voyage from cache for ArxivClusteringS2S...


INFO:eval_models:Evaluating for llmrails$voyage on ArxivClusteringS2S


Clustering: 100%|██████████| 31/31 [01:08<00:00,  2.21s/it]


Creating model llmrails$voyage-pca for task RedditClustering
Loading llmrails from cache for RedditClustering...
Loading voyage from cache for RedditClustering...


INFO:eval_models:Evaluating for llmrails$voyage on RedditClustering


Clustering: 100%|██████████| 25/25 [00:37<00:00,  1.50s/it]


Creating model llmrails$voyage-pca for task TwitterSemEval2015
Loading llmrails from cache for TwitterSemEval2015...
Loading voyage from cache for TwitterSemEval2015...


INFO:eval_models:Evaluating for llmrails$voyage on TwitterSemEval2015


Creating model llmrails$voyage-pca for task AskUbuntuDupQuestions
Loading llmrails from cache for AskUbuntuDupQuestions...
Loading voyage from cache for AskUbuntuDupQuestions...


INFO:eval_models:Evaluating for llmrails$voyage on AskUbuntuDupQuestions


INFO:eval_models:Evaluating for llmrails$voyage on ArguAna


Creating model llmrails$voyage-pca for task ArguAna
Loading llmrails from cache for ArguAna...
Loading voyage from cache for ArguAna...


INFO:eval_models:Evaluating for llmrails$voyage on SciFact


Creating model llmrails$voyage-pca for task SciFact
Loading llmrails from cache for SciFact...
Loading voyage from cache for SciFact...


Creating model llmrails$voyage-pca for task SICK-R
Loading llmrails from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:eval_models:Evaluating for llmrails$voyage on SICK-R


KeyboardInterrupt: 

## Example

In [None]:
# Reload a pickled PCA object and apply it to the stacked-model embeddings of
# every STS task, printing the embedding shape before and after the transform.
model_name = "angle$cohere"

# The pickle path is derived from model_name so the PCA and the stacked model
# cannot silently refer to different model pairs.
# NOTE(review): "256" is presumably the PCA output dimension -- confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project.
with open(f"pca/256/{model_name}.pkl", "rb") as pca_file:
    # The context manager closes the file handle as soon as loading finishes
    # (the bare open() call previously left it open).
    pca_reloaded = pk.load(pca_file)

for task in TASK_LIST_STS:
    # Sentences for the task are read from the on-disk dataset's 'text' column.
    sentences = datasets.Dataset.load_from_disk(f"data/sentences/{task}")['text']
    stacked_model = model_factory(model_name, task)
    embeddings = stacked_model.encode(sentences)

    # Shape of the embeddings as returned by the stacked model.
    print(np.array(embeddings).shape)
    pca_embs = pca_reloaded.transform(embeddings)
    # Shape after applying the reloaded PCA transform.
    print(np.array(pca_embs).shape)