# OpenNLP Experiments

In [5]:
# !pip install mteb
# !pip install openai
# !pip install cohere

# !pip install -U voyageai
# !pip install mteb[beir]

import itertools
import os
from typing import List, Optional

import numpy as np
from tqdm import tqdm

In [6]:
# Hugging Face access token, read from the HF_TOKEN environment variable so that
# no secret is stored in the notebook. None means no token is configured.
# NOTE(review): the token that used to be hard-coded on this line has been
# exposed and should be revoked.
access_token = os.environ.get("HF_TOKEN")

"""Example script for benchmarking all datasets constituting the MTEB English leaderboard & average scores"""

import logging

# Logger used by this notebook's own code; the root logger is configured at
# INFO level so that informational records are emitted.
logger = logging.getLogger("main")
logging.basicConfig(level=logging.INFO)


# MTEB task names, grouped by task category. An entry that is commented out is
# simply not part of its list; the reason for each exclusion is not recorded
# in this notebook.

# Classification tasks.
TASK_LIST_CLASSIFICATION = [
    "AmazonCounterfactualClassification",
    # "AmazonPolarityClassification",
    "AmazonReviewsClassification",
    "Banking77Classification",
    "EmotionClassification",
    "ImdbClassification",
    "MassiveIntentClassification",
    "MassiveScenarioClassification",
    "MTOPDomainClassification",
    "MTOPIntentClassification",
    "ToxicConversationsClassification",
    "TweetSentimentExtractionClassification",
]

# Clustering tasks.
TASK_LIST_CLUSTERING = [
    # "ArxivClusteringP2P",
    "ArxivClusteringS2S",
    # "BiorxivClusteringP2P",
    "BiorxivClusteringS2S",
    "MedrxivClusteringP2P",
    "MedrxivClusteringS2S",
    "RedditClustering",
    # "RedditClusteringP2P",
    "StackExchangeClustering",
    "StackExchangeClusteringP2P",
    "TwentyNewsgroupsClustering",
]

# Pair-classification tasks.
TASK_LIST_PAIR_CLASSIFICATION = [
    "SprintDuplicateQuestions",
    "TwitterSemEval2015",
    # "TwitterURLCorpus",
]

# Reranking tasks.
TASK_LIST_RERANKING = [
    "AskUbuntuDupQuestions",
    # "MindSmallReranking",
    "SciDocsRR",
    "StackOverflowDupQuestions",
]

# Retrieval tasks. CQADupstack is listed as its individual sub-tasks rather
# than under a single combined name.
TASK_LIST_RETRIEVAL = [
    "ArguAna",
    # "ClimateFEVER",
    "CQADupstackAndroidRetrieval",
    "CQADupstackEnglishRetrieval",
    "CQADupstackGamingRetrieval",
    "CQADupstackGisRetrieval",
    "CQADupstackMathematicaRetrieval",
    "CQADupstackPhysicsRetrieval",
    "CQADupstackProgrammersRetrieval",
    "CQADupstackStatsRetrieval",
    "CQADupstackTexRetrieval",
    "CQADupstackUnixRetrieval",
    "CQADupstackWebmastersRetrieval",
    "CQADupstackWordpressRetrieval",
    # "DBPedia",
    # "FEVER",
    "FiQA2018",
    # "HotpotQA",
    # "MSMARCO",
    "NFCorpus",
    # "NQ",
    # "QuoraRetrieval",
    "SCIDOCS",
    "SciFact",
    # "Touche2020",
    # "TRECCOVID",
]

# Semantic textual similarity (STS) tasks. This is the list that run_on_tasks
# iterates over in the evaluation cell further down.
TASK_LIST_STS = [
    "BIOSSES",
    "SICK-R",
    "STS12",
    "STS13",
    "STS14",
    "STS15",
    "STS16",
    "STS17",
    "STS22",
    "STSBenchmark",
    # "SummEval",
]


# Every enabled task across all categories, in category order
# (classification, clustering, pair classification, reranking, retrieval, STS).
TASK_LIST = [
    *TASK_LIST_CLASSIFICATION,
    *TASK_LIST_CLUSTERING,
    *TASK_LIST_PAIR_CLASSIFICATION,
    *TASK_LIST_RERANKING,
    *TASK_LIST_RETRIEVAL,
    *TASK_LIST_STS,
]


## Evaluate Our Models

In [7]:
from model_factory import BASIC_MODELS, model_factory

### Generate List of Models

In [8]:
from model_factory import BASIC_MODELS

# Upper bound on how many basic models may be stacked together.
max_stack_size = len(BASIC_MODELS)

# A stack can never contain more members than there are basic models.
largest_stack = min(max_stack_size, len(BASIC_MODELS))

# Names of all stacked models with 2..largest_stack members. The members of each
# stack are sorted before being joined with "$", so a given set of models always
# maps to the same name.
ALL_COMBINATIONS = [
    '$'.join(sorted(members))
    for stack_size in range(2, largest_stack + 1)
    for members in itertools.combinations(BASIC_MODELS, stack_size)
]


# Single models first, then every stacked combination.
MODELS = BASIC_MODELS + ALL_COMBINATIONS

## Evaluate The Models

In [19]:
import os
from results_to_csv import main as convert_to_csv

def run_on_tasks(model_name, tasks=None):
    """Evaluate one model (single or stacked) on a list of MTEB tasks.

    For every task the function:
      * skips it when ``results/<model_name>/<task>.json`` already exists;
      * skips it when ``data/<name>/<task>`` is missing for any component model;
      * otherwise builds the model with ``model_factory`` and runs MTEB on the
        "test" split ("dev" for MSMARCO), writing into ``results/<model_name>``.

    Afterwards, if ``results/<model_name>`` exists, it is passed to
    ``convert_to_csv``.

    Args:
        model_name: A basic model name, or several basic model names joined
            with "$" to denote a stacked model (e.g. "angle$cohere").
        tasks: Iterable of MTEB task names to evaluate. Defaults to
            ``TASK_LIST_STS``, the list this function used before the
            parameter was introduced.

    Returns:
        None.
    """
    from mteb import MTEB # Import MTEB here to avoid concurrency warning

    if tasks is None:
        tasks = TASK_LIST_STS

    print(f"Evaluating the model {model_name}...")

    # Loop-invariant values: the component models of a stacked name and the
    # folder that receives this model's results.
    model_names = model_name.split("$")
    results_dir = f"results/{model_name}"

    for task in tasks:
        if os.path.exists(f"{results_dir}/{task}.json"):
            print(f"Skipping {task} as it already exists")
            continue

        # TODO: check the below condition as everything should exist
        has_all_data = all(
            os.path.exists(f"data/{model_name_}/{task}") for model_name_ in model_names
        )
        if not has_all_data:
            print(f"Skipping {task} as it doesn't have the required data for model(s) {model_names}")
            continue

        logger.info(f"Running task: {task}")
        model = model_factory(model_name, task)
        eval_splits = ["dev"] if task == "MSMARCO" else ["test"]
        evaluation = MTEB(tasks=[task], task_langs=["en"])  # Remove "en" for running all languages
        evaluation.run(model, output_folder=results_dir, eval_splits=eval_splits)

    if os.path.exists(results_dir):
        print("Converting the results to a CSV file...")
        convert_to_csv(results_dir)

    print("--DONE--")

def evaluate_model(model_name):
    """Evaluate a single model name by delegating to run_on_tasks.

    This is the callable that evaluate_models applies to every model name,
    both in its sequential loop and through pool.map.
    """
    outcome = run_on_tasks(model_name)
    return outcome

def evaluate_models(parallel=False, compute_used=0.75, models=None):
    """Evaluate every model name, sequentially or in a process pool.

    Args:
        parallel: When True, distribute the models over a multiprocessing.Pool;
            otherwise evaluate them one after another in this process.
        compute_used: Fraction of the machine's CPU cores to use as worker
            processes in parallel mode. At least one worker is always used.
        models: List of model names to evaluate. Defaults to MODELS.

    Returns:
        None.
    """
    if models is None:
        models = MODELS

    print(f"Num models to be evaluated: {len(models)}")
    print(f"Models to be evaluated: {models}")
    if parallel:
        import multiprocessing
        num_cores = multiprocessing.cpu_count()
        print("Number of cores: ", num_cores)

        # int() truncates, so e.g. 1 core * 0.75 would yield 0 workers and
        # Pool(processes=0) raises ValueError; always keep at least one worker.
        num_processes = max(1, int(num_cores * compute_used)) # Use 75% of the cores by default
        print(f"Running in parallel with {num_processes} processes")
        # The context manager releases the worker processes even if map() raises.
        with multiprocessing.Pool(processes=num_processes) as pool:
            pool.map(evaluate_model, models)
    else:
        for model_name in models:
            evaluate_model(model_name)

## Run

In [20]:
evaluate_models(parallel=False)

Num models to be evaluated: 255
Models to be evaluated: ['angle', 'cohere', 'flag-embedding', 'gist', 'gte-large', 'llmrails', 'mixed-bread', 'voyage', 'angle$cohere', 'angle$flag-embedding', 'angle$gist', 'angle$gte-large', 'angle$llmrails', 'angle$mixed-bread', 'angle$voyage', 'cohere$flag-embedding', 'cohere$gist', 'cohere$gte-large', 'cohere$llmrails', 'cohere$mixed-bread', 'cohere$voyage', 'flag-embedding$gist', 'flag-embedding$gte-large', 'flag-embedding$llmrails', 'flag-embedding$mixed-bread', 'flag-embedding$voyage', 'gist$gte-large', 'gist$llmrails', 'gist$mixed-bread', 'gist$voyage', 'gte-large$llmrails', 'gte-large$mixed-bread', 'gte-large$voyage', 'llmrails$mixed-bread', 'llmrails$voyage', 'mixed-bread$voyage', 'angle$cohere$flag-embedding', 'angle$cohere$gist', 'angle$cohere$gte-large', 'angle$cohere$llmrails', 'angle$cohere$mixed-bread', 'angle$cohere$voyage', 'angle$flag-embedding$gist', 'angle$flag-embedding$gte-large', 'angle$flag-embedding$llmrails', 'angle$flag-embed

INFO:main:Running task: STS16
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8546195285405633, 'spearman': 0.8682409112251157}, 'manhattan': {'pearson': 0.859693335655636, 'spearman': 0.867923777273595}, 'euclidean': {'pearson': 0.8601160570752957, 'spearman': 0.8682409112251157}, 'evaluation_time': 0.03}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading llmrails from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.01 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.8806351068729286, 'spearman': 0.8790311138579117}, 'manhattan': {'pearson': 0.8869205939269971, 'spearman': 0.8803280678212351}, 'euclidean': {'pearson': 0.8853723226837864, 'spearman': 0.8790311138579117}}, 'evaluation_time'

Loading llmrails from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6816566998736999, 'spearman': 0.6719307636861983}, 'manhattan': {'pearson': 0.6945296491283194, 'spearman': 0.673918221277809}, 'euclidean': {'pearson': 0.6952117004081915, 'spearman': 0.6719307636861983}}, 'evaluation_time': 0.03}

Loading llmrails from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8674035069875136, 'spearman': 0.8776870861049149}, 'manhattan': {'pearson': 0.8710889645280399, 'spearman': 0.8772408707674758}, 'euclidean': {'pearson': 0.8711306703236905, 'spearman': 0.8776870861049149}, 'evaluation_time': 0.04}


Converting the results to a CSV file...
Using model name llmrails
Converting results/llmrails to results/llmrails_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClusteringS2S - t

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.01 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8859328600020908, 'spearman': 0.8667787862872268}, 'manhattan': {'pearson': 0.8749078747435275, 'spearman': 0.8626943558713114}, 'euclidean': {'pearson': 0.8769642758254219, 'spearman': 0.8667787862872268}, 'evaluation_time': 0.01}
INFO:main:Running task: SICK-R


Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 0.37 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8311787880233297, 'spearman': 0.7626194213826961}, 'manhattan': {'pearson': 0.8054441284586997, 'spearman': 0.7630605400576247}, 'euclidean': {'pearson': 0.8051086473164781, 'spearman': 0.7626194213826961}, 'evaluation_time': 0.37}
INFO:main:Running task: STS12


Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.13 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.729184360678129, 'spearman': 0.6835965689799975}, 'manhattan': {'pearson': 0.6992629581724723, 'spearman': 0.684289851507658}, 'euclidean': {'pearson': 0.698927569592614, 'spearman': 0.6835965689799975}, 'evaluation_time': 0.13}
INFO:main:Running task: STS13


Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.07 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8165208017886731, 'spearman': 0.8226281160055834}, 'manhattan': {'pearson': 0.8188596095871379, 'spearman': 0.8224749518016917}, 'euclidean': {'pearson': 0.8189958404765078, 'spearman': 0.8226281160055834}, 'evaluation_time': 0.07}
INFO:main:Running task: STS14


Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.13 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.793511652548444, 'spearman': 0.7488873417046866}, 'manhattan': {'pearson': 0.7823931431926552, 'spearman': 0.7484313905250602}, 'euclidean': {'pearson': 0.7827082417170659, 'spearman': 0.7488873417046866}, 'evaluation_time': 0.13}
INFO:main:Running task: STS15


Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.11 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8307856362754787, 'spearman': 0.8428647085531656}, 'manhattan': {'pearson': 0.836161214998303, 'spearman': 0.8424325362979461}, 'euclidean': {'pearson': 0.836696528259346, 'spearman': 0.8428647085531656}, 'evaluation_time': 0.11}
INFO:main:Running task: STS16


Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.05 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8186971640614678, 'spearman': 0.8376672814265603}, 'manhattan': {'pearson': 0.8338750138273515, 'spearman': 0.8384220379029413}, 'euclidean': {'pearson': 0.8334107174040893, 'spearman': 0.8376672814265603}, 'evaluation_time': 0.05}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.01 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.8774284649992389, 'spearman': 0.8817956938532354}, 'manhattan': {'pearson': 0.8764976423685331, 'spearman': 0.8804898987692372}, 'euclidean': {'pearson': 0.877437433498088, 'spearman': 0.8817956938532354}}, 'evaluation_time':

Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6376448149547632, 'spearman': 0.6454264459432041}, 'manhattan': {'pearson': 0.6553248863008131, 'spearman': 0.6486460946870255}, 'euclidean': {'pearson': 0.6558399843551653, 'spearman': 0.6454264459432041}}, 'evaluation_time': 0.03

Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.05 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8357664562533647, 'spearman': 0.8347909627723197}, 'manhattan': {'pearson': 0.8378288062693194, 'spearman': 0.8347580795518295}, 'euclidean': {'pearson': 0.8378885861571144, 'spearman': 0.8347909627723197}, 'evaluation_time': 0.05}


Converting the results to a CSV file...
Using model name voyage
Converting results/voyage to results/voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClusteringS2S - test se

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8764579080377759, 'spearman': 0.8563146924340629}, 'manhattan': {'pearson': 0.8560640362349622, 'spearman': 0.8516579598212274}, 'euclidean': {'pearson': 0.8579908639676834, 'spearman': 0.8563146924340629}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading cohere from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 1.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8700412162186232, 'spearman': 0.8251597472686164}, 'manhattan': {'pearson': 0.8420766941469868, 'spearman': 0.8247690918422333}, 'euclidean': {'pearson': 0.8430582264836717, 'spearman': 0.8251597472686164}, 'evaluation_time': 1.04}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading cohere from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.28 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8658173314616611, 'spearman': 0.7765578262622401}, 'manhattan': {'pearson': 0.8363631828260711, 'spearman': 0.7783701137217786}, 'euclidean': {'pearson': 0.8352415595621193, 'spearman': 0.7765578262622401}, 'evaluation_time': 0.28}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading cohere from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.14 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8806352546912593, 'spearman': 0.8831052064621221}, 'manhattan': {'pearson': 0.8780368079733815, 'spearman': 0.8852156758383696}, 'euclidean': {'pearson': 0.876080976257338, 'spearman': 0.8831052064621221}, 'evaluation_time': 0.14}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading cohere from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.34 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8679823483894417, 'spearman': 0.838647457979168}, 'manhattan': {'pearson': 0.8576295237525775, 'spearman': 0.8399070027542562}, 'euclidean': {'pearson': 0.8576098787570373, 'spearman': 0.838647457979168}, 'evaluation_time': 0.34}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading cohere from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.27 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8956467087693278, 'spearman': 0.903312067438125}, 'manhattan': {'pearson': 0.8959092996702536, 'spearman': 0.9032613223885422}, 'euclidean': {'pearson': 0.8959408383519474, 'spearman': 0.903312067438125}, 'evaluation_time': 0.27}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading cohere from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.10 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8620098383952745, 'spearman': 0.873462716342824}, 'manhattan': {'pearson': 0.8658906179962358, 'spearman': 0.8731092515748518}, 'euclidean': {'pearson': 0.8664589251145449, 'spearman': 0.873462716342824}, 'evaluation_time': 0.1}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading angle from cache for STS17...
Loading cohere from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9061118616149886, 'spearman': 0.9049186839865356}, 'manhattan': {'pearson': 0.9085075520521075, 'spearman': 0.9060307432087925}, 'euclidean': {'pearson': 0.9086142401785782, 'spearman': 0.9049186839865356}}, 'evaluation_time'

Loading angle from cache for STS22...
Loading cohere from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6822897527813888, 'spearman': 0.6838359764831191}, 'manhattan': {'pearson': 0.6985520920258437, 'spearman': 0.68355763475653}, 'euclidean': {'pearson': 0.6986570539757598, 'spearman': 0.6838359764831191}}, 'evaluation_time': 0.04}


Loading angle from cache for STSBenchmark...
Loading cohere from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.13 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8826363778966889, 'spearman': 0.8966002477651973}, 'manhattan': {'pearson': 0.8945923185234854, 'spearman': 0.8966007104897211}, 'euclidean': {'pearson': 0.8945281582221256, 'spearman': 0.8966002477651973}, 'evaluation_time': 0.13}


Converting the results to a CSV file...
Using model name angle$cohere
Converting results/angle$cohere to results/angle$cohere_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClust

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.01 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.9003398813820975, 'spearman': 0.8867732177775265}, 'manhattan': {'pearson': 0.8781898682414133, 'spearman': 0.8840261672323091}, 'euclidean': {'pearson': 0.8811537890063653, 'spearman': 0.8867732177775265}, 'evaluation_time': 0.01}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading gist from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 0.55 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8755526064484902, 'spearman': 0.8286136586791635}, 'manhattan': {'pearson': 0.8501573195227068, 'spearman': 0.8284920791550819}, 'euclidean': {'pearson': 0.8501601891019864, 'spearman': 0.8286136260541603}, 'evaluation_time': 0.55}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading gist from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.17 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8730515258911355, 'spearman': 0.7830724054120972}, 'manhattan': {'pearson': 0.8483161074629066, 'spearman': 0.7829728443911249}, 'euclidean': {'pearson': 0.8482348052251674, 'spearman': 0.7830725533823167}, 'evaluation_time': 0.17}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading gist from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.08 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8906859725378293, 'spearman': 0.8971056083653802}, 'manhattan': {'pearson': 0.8898214797072995, 'spearman': 0.8969844390075606}, 'euclidean': {'pearson': 0.8898886108871398, 'spearman': 0.8971056083653802}, 'evaluation_time': 0.08}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading gist from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.21 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8705776375371563, 'spearman': 0.8465898104718269}, 'manhattan': {'pearson': 0.8632778862103021, 'spearman': 0.8465789080761331}, 'euclidean': {'pearson': 0.8634445514815521, 'spearman': 0.8465898104718269}, 'evaluation_time': 0.21}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading gist from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.16 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8869218020914317, 'spearman': 0.8981128840690155}, 'manhattan': {'pearson': 0.8914638395080119, 'spearman': 0.8981395943127921}, 'euclidean': {'pearson': 0.8914922888765514, 'spearman': 0.8981129637212493}, 'evaluation_time': 0.16}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading gist from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.08 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8544506787304045, 'spearman': 0.8674693137115043}, 'manhattan': {'pearson': 0.8610132578526501, 'spearman': 0.8675027367492141}, 'euclidean': {'pearson': 0.8611992887124744, 'spearman': 0.8674693137115043}, 'evaluation_time': 0.08}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading angle from cache for STS17...
Loading gist from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.8967555883744535, 'spearman': 0.8948970815034875}, 'manhattan': {'pearson': 0.9018958695359081, 'spearman': 0.8946057089177736}, 'euclidean': {'pearson': 0.9016373806876762, 'spearman': 0.8948970815034875}}, 'evaluation_time'

Loading angle from cache for STS22...
Loading gist from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6924012622675255, 'spearman': 0.6966331849886414}, 'manhattan': {'pearson': 0.71039440723528, 'spearman': 0.6994955471302036}, 'euclidean': {'pearson': 0.7094087177802503, 'spearman': 0.6966331849886414}}, 'evaluation_time': 0.04}


Loading angle from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.08 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8757592579622784, 'spearman': 0.8892242493430351}, 'manhattan': {'pearson': 0.8875548023838363, 'spearman': 0.8890500450134052}, 'euclidean': {'pearson': 0.8876133493672852, 'spearman': 0.8892242493430351}, 'evaluation_time': 0.08}


Converting the results to a CSV file...
Using model name angle$gist
Converting results/angle$gist to results/angle$gist_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClusteringS

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.01 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8740813262119844, 'spearman': 0.8619112054965784}, 'manhattan': {'pearson': 0.8503280297899123, 'spearman': 0.8577605085104932}, 'euclidean': {'pearson': 0.8533454432939424, 'spearman': 0.8619112054965784}, 'evaluation_time': 0.01}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading llmrails from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 0.54 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8687951324130269, 'spearman': 0.8276207935539898}, 'manhattan': {'pearson': 0.8407071895559026, 'spearman': 0.8275500449169495}, 'euclidean': {'pearson': 0.8408150461090937, 'spearman': 0.8276208171890161}, 'evaluation_time': 0.54}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading llmrails from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.17 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8766145672055414, 'spearman': 0.7910836214227172}, 'manhattan': {'pearson': 0.843073886047077, 'spearman': 0.7911190399126511}, 'euclidean': {'pearson': 0.8432295336086979, 'spearman': 0.7910836713359038}, 'evaluation_time': 0.17}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading llmrails from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.08 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.876984905758651, 'spearman': 0.8853434488042965}, 'manhattan': {'pearson': 0.8776027180100734, 'spearman': 0.8852712415259919}, 'euclidean': {'pearson': 0.8776256568859278, 'spearman': 0.8853434488042965}, 'evaluation_time': 0.08}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading llmrails from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.20 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8584628209210815, 'spearman': 0.8445766363592351}, 'manhattan': {'pearson': 0.8500396948378897, 'spearman': 0.8442415332838598}, 'euclidean': {'pearson': 0.8503114410836126, 'spearman': 0.8445767205687229}, 'evaluation_time': 0.2}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading llmrails from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.16 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8839897817647707, 'spearman': 0.8931739425027367}, 'manhattan': {'pearson': 0.8867121584475651, 'spearman': 0.8933066902379464}, 'euclidean': {'pearson': 0.8867150079169909, 'spearman': 0.8931739425027367}, 'evaluation_time': 0.16}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading llmrails from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.07 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8568947461676559, 'spearman': 0.8698067774406727}, 'manhattan': {'pearson': 0.8612893829702674, 'spearman': 0.8692663355676261}, 'euclidean': {'pearson': 0.861790221890993, 'spearman': 0.8698067774406727}, 'evaluation_time': 0.07}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading angle from cache for STS17...
Loading llmrails from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.894752534047568, 'spearman': 0.8941755691771219}, 'manhattan': {'pearson': 0.899301196535077, 'spearman': 0.8956205158126088}, 'euclidean': {'pearson': 0.8981772817492211, 'spearman': 0.8941755691771219}}, 'evaluation_time': 

Loading angle from cache for STS22...
Loading llmrails from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6853883096865592, 'spearman': 0.6862718735231235}, 'manhattan': {'pearson': 0.7008449904837435, 'spearman': 0.6859512108322575}, 'euclidean': {'pearson': 0.7006024998938579, 'spearman': 0.6862718735231235}}, 'evaluation_time': 0.04

Loading angle from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.08 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8763908675553004, 'spearman': 0.8890674796488042}, 'manhattan': {'pearson': 0.8841185761200518, 'spearman': 0.8890791060468983}, 'euclidean': {'pearson': 0.8840438526851095, 'spearman': 0.8890674796488042}, 'evaluation_time': 0.08}


Converting the results to a CSV file...
Using model name angle$llmrails
Converting results/angle$llmrails to results/angle$llmrails_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
Medrxi

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.9026567523508565, 'spearman': 0.8912853687827191}, 'manhattan': {'pearson': 0.8882311830226239, 'spearman': 0.8885684394057607}, 'euclidean': {'pearson': 0.888861937270814, 'spearman': 0.8912853687827191}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 0.93 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8773411208084132, 'spearman': 0.8269527465681227}, 'manhattan': {'pearson': 0.8535498013223455, 'spearman': 0.8232737488714652}, 'euclidean': {'pearson': 0.8543153711860323, 'spearman': 0.8269527465681227}, 'evaluation_time': 0.93}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.28 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8697705082242454, 'spearman': 0.7821399467707333}, 'manhattan': {'pearson': 0.8248584821336777, 'spearman': 0.7767547031990761}, 'euclidean': {'pearson': 0.8352537376030773, 'spearman': 0.7821399467707333}, 'evaluation_time': 0.28}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.14 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8835440333550035, 'spearman': 0.8889445737183714}, 'manhattan': {'pearson': 0.8795639009296511, 'spearman': 0.8850654566550702}, 'euclidean': {'pearson': 0.8830358885528884, 'spearman': 0.8889445737183714}, 'evaluation_time': 0.14}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.34 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8654750439452288, 'spearman': 0.8384639999450848}, 'manhattan': {'pearson': 0.8526143365581278, 'spearman': 0.8315975388508208}, 'euclidean': {'pearson': 0.8571672255013565, 'spearman': 0.8384639999450848}, 'evaluation_time': 0.34}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.25 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8912561249462041, 'spearman': 0.8995259808479725}, 'manhattan': {'pearson': 0.8898103627137585, 'spearman': 0.8969506657447526}, 'euclidean': {'pearson': 0.8923660290007323, 'spearman': 0.8995259808479725}, 'evaluation_time': 0.25}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.12 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8585603441821907, 'spearman': 0.872805273771566}, 'manhattan': {'pearson': 0.8649123152048211, 'spearman': 0.87163009335124}, 'euclidean': {'pearson': 0.8654528225710884, 'spearman': 0.872805273771566}, 'evaluation_time': 0.12}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading angle from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9008545843235017, 'spearman': 0.901155441751522}, 'manhattan': {'pearson': 0.9048590310607951, 'spearman': 0.9024489361458858}, 'euclidean': {'pearson': 0.9045233218247799, 'spearman': 0.901155441751522}}, 'evaluation_time': 

Loading angle from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.678533965453411, 'spearman': 0.6792710093941167}, 'manhattan': {'pearson': 0.6952415825162055, 'spearman': 0.6774446970126367}, 'euclidean': {'pearson': 0.6982222993099391, 'spearman': 0.6792710093941167}}, 'evaluation_time': 0.04}

Loading angle from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.12 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.882759105927938, 'spearman': 0.8916602167842199}, 'manhattan': {'pearson': 0.8889055128062456, 'spearman': 0.8891821666092402}, 'euclidean': {'pearson': 0.8906232673149121, 'spearman': 0.8916602167842199}, 'evaluation_time': 0.12}


Converting the results to a CSV file...
Using model name angle$voyage
Converting results/angle$voyage to results/angle$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClust

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8961048100477851, 'spearman': 0.8741765452116281}, 'manhattan': {'pearson': 0.8770087986755146, 'spearman': 0.8749175259507986}, 'euclidean': {'pearson': 0.8774420383863064, 'spearman': 0.8741765452116281}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading cohere from cache for SICK-R...
Loading gist from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 0.92 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8714785393524618, 'spearman': 0.8255958639676884}, 'manhattan': {'pearson': 0.84259886243759, 'spearman': 0.825170159449083}, 'euclidean': {'pearson': 0.8435945032994765, 'spearman': 0.8255958639676884}, 'evaluation_time': 0.92}
INFO:main:Running task: STS12


Loading cohere from cache for STS12...
Loading gist from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.27 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8565195137827195, 'spearman': 0.7656633054749225}, 'manhattan': {'pearson': 0.826503079570349, 'spearman': 0.7666737682738676}, 'euclidean': {'pearson': 0.8262810625997645, 'spearman': 0.7656633054749225}, 'evaluation_time': 0.27}
INFO:main:Running task: STS13


Loading cohere from cache for STS13...
Loading gist from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.14 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8769504006949582, 'spearman': 0.8791747713028322}, 'manhattan': {'pearson': 0.8735819290316869, 'spearman': 0.8801793418496908}, 'euclidean': {'pearson': 0.8723789968898161, 'spearman': 0.8791747713028322}, 'evaluation_time': 0.14}
INFO:main:Running task: STS14


Loading cohere from cache for STS14...
Loading gist from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.34 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8650575318275695, 'spearman': 0.8316593039060999}, 'manhattan': {'pearson': 0.8551473053157164, 'spearman': 0.8325144443728435}, 'euclidean': {'pearson': 0.8550880119993547, 'spearman': 0.8316593039060999}, 'evaluation_time': 0.34}
INFO:main:Running task: STS15


Loading cohere from cache for STS15...
Loading gist from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.26 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8941723153965621, 'spearman': 0.9022745548148434}, 'manhattan': {'pearson': 0.8955593560050631, 'spearman': 0.9025826326794115}, 'euclidean': {'pearson': 0.8952214840841135, 'spearman': 0.9022745548148434}, 'evaluation_time': 0.26}
INFO:main:Running task: STS16


Loading cohere from cache for STS16...
Loading gist from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.11 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8597110923048208, 'spearman': 0.8720545414858405}, 'manhattan': {'pearson': 0.8657235959930079, 'spearman': 0.8715000751441815}, 'euclidean': {'pearson': 0.8661410870915508, 'spearman': 0.8720545414858405}, 'evaluation_time': 0.11}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading cohere from cache for STS17...
Loading gist from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9083863385971291, 'spearman': 0.9074926032925266}, 'manhattan': {'pearson': 0.90932114932805, 'spearman': 0.9076083066543998}, 'euclidean': {'pearson': 0.9091173993957058, 'spearman': 0.9074926032925266}}, 'evaluation_time': 

Loading cohere from cache for STS22...
Loading gist from cache for STS22...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6868413425213724, 'spearman': 0.6900832019021212}, 'manhattan': {'pearson': 0.7027702827083578, 'spearman': 0.6887891570328907}, 'euclidean': {'pearson': 0.7028924338683807, 'spearman': 0.6900832019021212}}, 'evaluation_time': 0.04

Loading cohere from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.13 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8804228137063954, 'spearman': 0.8918328359387485}, 'manhattan': {'pearson': 0.8915021497410557, 'spearman': 0.8915291328824904}, 'euclidean': {'pearson': 0.8916239111964424, 'spearman': 0.8918328359387485}, 'evaluation_time': 0.13}


Converting the results to a CSV file...
Using model name cohere$gist
Converting results/cohere$gist to results/cohere$gist_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClusteri

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8745048326380069, 'spearman': 0.85704362470593}, 'manhattan': {'pearson': 0.8509696786600179, 'spearman': 0.8548929732922399}, 'euclidean': {'pearson': 0.8533371986365555, 'spearman': 0.85704362470593}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading cohere from cache for SICK-R...
Loading llmrails from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 0.94 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8623660965575786, 'spearman': 0.8231889903759038}, 'manhattan': {'pearson': 0.830952621783903, 'spearman': 0.8222663770588637}, 'euclidean': {'pearson': 0.8329299799978155, 'spearman': 0.8231889903759038}, 'evaluation_time': 0.94}
INFO:main:Running task: STS12


Loading cohere from cache for STS12...
Loading llmrails from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.27 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8592387174650422, 'spearman': 0.7742621167159857}, 'manhattan': {'pearson': 0.8204073024565246, 'spearman': 0.7754914006395739}, 'euclidean': {'pearson': 0.8207345071636825, 'spearman': 0.7742621167159857}, 'evaluation_time': 0.27}
INFO:main:Running task: STS13


Loading cohere from cache for STS13...
Loading llmrails from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.14 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8704604509946123, 'spearman': 0.8729515050420267}, 'manhattan': {'pearson': 0.8658502662253235, 'spearman': 0.8729386359248522}, 'euclidean': {'pearson': 0.8656014810520674, 'spearman': 0.8729515050420267}, 'evaluation_time': 0.14}
INFO:main:Running task: STS14


Loading cohere from cache for STS14...
Loading llmrails from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.34 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8590609438134296, 'spearman': 0.8323379027200857}, 'manhattan': {'pearson': 0.8452884714623194, 'spearman': 0.8325657506822635}, 'euclidean': {'pearson': 0.8466476774947467, 'spearman': 0.8323379027200857}, 'evaluation_time': 0.34}
INFO:main:Running task: STS15


Loading cohere from cache for STS15...
Loading llmrails from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.26 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8947760823594291, 'spearman': 0.9008178688775351}, 'manhattan': {'pearson': 0.8932298637051768, 'spearman': 0.9005173451280707}, 'euclidean': {'pearson': 0.8935019779531528, 'spearman': 0.9008178688775351}, 'evaluation_time': 0.26}
INFO:main:Running task: STS16


Loading cohere from cache for STS16...
Loading llmrails from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.11 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8624925543123428, 'spearman': 0.8739613474951905}, 'manhattan': {'pearson': 0.8665335670687333, 'spearman': 0.8737765995655381}, 'euclidean': {'pearson': 0.8668818348457072, 'spearman': 0.8739613474951905}, 'evaluation_time': 0.11}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading cohere from cache for STS17...
Loading llmrails from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9103407500395598, 'spearman': 0.9083355849290317}, 'manhattan': {'pearson': 0.9089244247166315, 'spearman': 0.9084585918253754}, 'euclidean': {'pearson': 0.9090543978502437, 'spearman': 0.9083355849290317}}, 'evaluation_time'

Loading cohere from cache for STS22...
Loading llmrails from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.05 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6793534105577788, 'spearman': 0.6774438831479389}, 'manhattan': {'pearson': 0.6925330334778366, 'spearman': 0.6747988228806439}, 'euclidean': {'pearson': 0.6939052020036409, 'spearman': 0.6774438831479389}}, 'evaluation_time': 0.05

Loading cohere from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.13 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8818883460628203, 'spearman': 0.892614198983624}, 'manhattan': {'pearson': 0.8884542399363256, 'spearman': 0.892050059904798}, 'euclidean': {'pearson': 0.8888867593481973, 'spearman': 0.892614198983624}, 'evaluation_time': 0.13}


Converting the results to a CSV file...
Using model name cohere$llmrails
Converting results/cohere$llmrails to results/cohere$llmrails_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
Med

INFO:main:Running task: BIOSSES


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8847593720015356, 'spearman': 0.866887222492959}, 'manhattan': {'pearson': 0.8738875769677059, 'spearman': 0.8739235273982529}, 'euclidean': {'pearson': 0.8696844770742109, 'spearman': 0.866887222492959}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading cohere from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 1.00 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8640121875093089, 'spearman': 0.8160328996596856}, 'manhattan': {'pearson': 0.8375556287826832, 'spearman': 0.8132193208074321}, 'euclidean': {'pearson': 0.8372006114552097, 'spearman': 0.8160328996596856}, 'evaluation_time': 1.0}
INFO:main:Running task: STS12


Loading cohere from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.28 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8232419067859442, 'spearman': 0.7412692346823067}, 'manhattan': {'pearson': 0.7767103238075166, 'spearman': 0.7367255189403076}, 'euclidean': {'pearson': 0.7877710157061415, 'spearman': 0.7412692346823067}, 'evaluation_time': 0.28}
INFO:main:Running task: STS13


Loading cohere from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.14 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8492501902880276, 'spearman': 0.8516599670727906}, 'manhattan': {'pearson': 0.8440583361001968, 'spearman': 0.849406186410488}, 'euclidean': {'pearson': 0.8457420847831665, 'spearman': 0.8516599670727906}, 'evaluation_time': 0.14}
INFO:main:Running task: STS14


Loading cohere from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.34 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.844277492575068, 'spearman': 0.8057053935627759}, 'manhattan': {'pearson': 0.8284294504106657, 'spearman': 0.8009045228159534}, 'euclidean': {'pearson': 0.8329819911104203, 'spearman': 0.8057053935627759}, 'evaluation_time': 0.34}
INFO:main:Running task: STS15


Loading cohere from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.26 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8836698442448395, 'spearman': 0.8898884776324923}, 'manhattan': {'pearson': 0.8796537787883966, 'spearman': 0.8862327201025152}, 'euclidean': {'pearson': 0.8831425803089191, 'spearman': 0.8898884776324923}, 'evaluation_time': 0.26}
INFO:main:Running task: STS16


Loading cohere from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.11 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8556636135805189, 'spearman': 0.8674008878767433}, 'manhattan': {'pearson': 0.8616645284992702, 'spearman': 0.86686257302378}, 'euclidean': {'pearson': 0.8625790506564277, 'spearman': 0.8674008878767433}, 'evaluation_time': 0.11}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading cohere from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9078199251004786, 'spearman': 0.9079884748434118}, 'manhattan': {'pearson': 0.9077786022993419, 'spearman': 0.907390353809941}, 'euclidean': {'pearson': 0.9089935042395625, 'spearman': 0.9079884748434118}}, 'evaluation_time':

Loading cohere from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.05 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6622392191273226, 'spearman': 0.6648680458401717}, 'manhattan': {'pearson': 0.6745429182773758, 'spearman': 0.659006592287846}, 'euclidean': {'pearson': 0.6786688656473496, 'spearman': 0.6648680458401717}}, 'evaluation_time': 0.05}

Loading cohere from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.13 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.878399135375005, 'spearman': 0.8846296788705146}, 'manhattan': {'pearson': 0.8832319263710944, 'spearman': 0.881740353641073}, 'euclidean': {'pearson': 0.8859214055075775, 'spearman': 0.8846296788705146}, 'evaluation_time': 0.13}


Converting the results to a CSV file...
Using model name cohere$voyage
Converting results/cohere$voyage to results/cohere$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivCl

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.01 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8975366462750217, 'spearman': 0.8795079919934645}, 'manhattan': {'pearson': 0.87332177479918, 'spearman': 0.8812008016496182}, 'euclidean': {'pearson': 0.8757457026677973, 'spearman': 0.8795079919934645}, 'evaluation_time': 0.01}
INFO:main:Running task: SICK-R


Loading gist from cache for SICK-R...
Loading llmrails from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 0.53 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8710210276051696, 'spearman': 0.8283403000841321}, 'manhattan': {'pearson': 0.8415745465087463, 'spearman': 0.8281638407157976}, 'euclidean': {'pearson': 0.841857592463412, 'spearman': 0.8283402859227427}, 'evaluation_time': 0.53}
INFO:main:Running task: STS12


Loading gist from cache for STS12...
Loading llmrails from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.17 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8708099210111337, 'spearman': 0.7834707551202027}, 'manhattan': {'pearson': 0.8368153772313522, 'spearman': 0.7832595968901254}, 'euclidean': {'pearson': 0.8372100927661502, 'spearman': 0.7834706580497988}, 'evaluation_time': 0.17}
INFO:main:Running task: STS13


Loading gist from cache for STS13...
Loading llmrails from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.09 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8832621362745681, 'spearman': 0.8889923923897692}, 'manhattan': {'pearson': 0.8804329321216939, 'spearman': 0.8880560120702882}, 'euclidean': {'pearson': 0.881189489504506, 'spearman': 0.888992261698343}, 'evaluation_time': 0.09}
INFO:main:Running task: STS14


Loading gist from cache for STS14...
Loading llmrails from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.20 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8633329637756585, 'spearman': 0.8417379516588805}, 'manhattan': {'pearson': 0.8531604712684883, 'spearman': 0.8413295966277324}, 'euclidean': {'pearson': 0.8538318713552329, 'spearman': 0.8417380239208857}, 'evaluation_time': 0.2}
INFO:main:Running task: STS15


Loading gist from cache for STS15...
Loading llmrails from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.18 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8869179123167378, 'spearman': 0.8963329919414278}, 'manhattan': {'pearson': 0.8898564966821684, 'spearman': 0.8964126348839471}, 'euclidean': {'pearson': 0.8897606894831298, 'spearman': 0.8963329919414278}, 'evaluation_time': 0.18}
INFO:main:Running task: STS16


Loading gist from cache for STS16...
Loading llmrails from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.07 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8574054693628335, 'spearman': 0.8709294317749924}, 'manhattan': {'pearson': 0.8636587445617853, 'spearman': 0.870607277618266}, 'euclidean': {'pearson': 0.8639336880959241, 'spearman': 0.8709294317749924}, 'evaluation_time': 0.07}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading gist from cache for STS17...
Loading llmrails from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.901495652604416, 'spearman': 0.8990508706343924}, 'manhattan': {'pearson': 0.9031073633849132, 'spearman': 0.8992315370133971}, 'euclidean': {'pearson': 0.9022075217294598, 'spearman': 0.8990508706343924}}, 'evaluation_time':

Loading gist from cache for STS22...
Loading llmrails from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6907011801816403, 'spearman': 0.6912958603015888}, 'manhattan': {'pearson': 0.7063609868945617, 'spearman': 0.6925125880245445}, 'euclidean': {'pearson': 0.7060092339990769, 'spearman': 0.6912958603015888}}, 'evaluation_time': 0.04

Loading gist from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.11 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8780397703332457, 'spearman': 0.8879027137002877}, 'manhattan': {'pearson': 0.8844067610026233, 'spearman': 0.8874728393112042}, 'euclidean': {'pearson': 0.88458752854946, 'spearman': 0.8879028043131394}, 'evaluation_time': 0.11}


Converting the results to a CSV file...
Using model name gist$llmrails
Converting results/gist$llmrails to results/gist$llmrails_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivCl

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.9138034099560781, 'spearman': 0.8972975539672083}, 'manhattan': {'pearson': 0.8996319131288183, 'spearman': 0.8953758234322864}, 'euclidean': {'pearson': 0.9008212936521327, 'spearman': 0.8972975539672083}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading gist from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 1.00 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8773522046958858, 'spearman': 0.825728198685261}, 'manhattan': {'pearson': 0.8522376109289777, 'spearman': 0.8213368014235495}, 'euclidean': {'pearson': 0.8535983772087763, 'spearman': 0.825728198685261}, 'evaluation_time': 1.0}
INFO:main:Running task: STS12


Loading gist from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.27 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8544979670715414, 'spearman': 0.7643725288318837}, 'manhattan': {'pearson': 0.8108206821894568, 'spearman': 0.7604435271618657}, 'euclidean': {'pearson': 0.8207253114399915, 'spearman': 0.7643725288318837}, 'evaluation_time': 0.27}
INFO:main:Running task: STS13


Loading gist from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.13 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8809339594818054, 'spearman': 0.8840026245256057}, 'manhattan': {'pearson': 0.8746317413426887, 'spearman': 0.879816901457414}, 'euclidean': {'pearson': 0.878811007859426, 'spearman': 0.8840026245256057}, 'evaluation_time': 0.13}
INFO:main:Running task: STS14


Loading gist from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.31 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8625033544120484, 'spearman': 0.8281049910109122}, 'manhattan': {'pearson': 0.8493434080161373, 'spearman': 0.8223168063173252}, 'euclidean': {'pearson': 0.8540767754142758, 'spearman': 0.8281049910109122}, 'evaluation_time': 0.31}
INFO:main:Running task: STS15


Loading gist from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.25 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8888258703410591, 'spearman': 0.8974440122748362}, 'manhattan': {'pearson': 0.8881881346114018, 'spearman': 0.894669007579811}, 'euclidean': {'pearson': 0.8909747466864515, 'spearman': 0.8974440122748362}, 'evaluation_time': 0.25}
INFO:main:Running task: STS16
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading gist from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.12 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8538093791816602, 'spearman': 0.8688285926454026}, 'manhattan': {'pearson': 0.8633055733493322, 'spearman': 0.8680123496004676}, 'euclidean': {'pearson': 0.8639735678861098, 'spearman': 0.8688285926454026}, 'evaluation_time': 0.12}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading gist from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9069190610951878, 'spearman': 0.9056202076922455}, 'manhattan': {'pearson': 0.9077938872785183, 'spearman': 0.9068275972591687}, 'euclidean': {'pearson': 0.9080145485496828, 'spearman': 0.9056202076922455}}, 'evaluation_time'

Loading gist from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6846023048615941, 'spearman': 0.6901116871665383}, 'manhattan': {'pearson': 0.7000013502896063, 'spearman': 0.6845790349520546}, 'euclidean': {'pearson': 0.7036831081376095, 'spearman': 0.6901116871665383}}, 'evaluation_time': 0.04

Loading gist from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.12 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8780721502601535, 'spearman': 0.8830955867240672}, 'manhattan': {'pearson': 0.8829203211699591, 'spearman': 0.8810370306906353}, 'euclidean': {'pearson': 0.8846461071077283, 'spearman': 0.8830955867240672}, 'evaluation_time': 0.12}


Converting the results to a CSV file...
Using model name gist$voyage
Converting results/gist$voyage to results/gist$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
MedrxivClusteri

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8970685291502821, 'spearman': 0.8831767502874064}, 'manhattan': {'pearson': 0.8821731141625437, 'spearman': 0.889020256929645}, 'euclidean': {'pearson': 0.8800385853982133, 'spearman': 0.8831767502874064}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading llmrails from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 0.92 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.859078197735385, 'spearman': 0.817890756390503}, 'manhattan': {'pearson': 0.8347238214228405, 'spearman': 0.8140565885534623}, 'euclidean': {'pearson': 0.8336773587751347, 'spearman': 0.817890756390503}, 'evaluation_time': 0.92}
INFO:main:Running task: STS12


Loading llmrails from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.26 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8540156312585466, 'spearman': 0.7782905992180923}, 'manhattan': {'pearson': 0.8009688748187527, 'spearman': 0.7728679990690276}, 'euclidean': {'pearson': 0.8086024222157041, 'spearman': 0.7782905992180923}, 'evaluation_time': 0.26}
INFO:main:Running task: STS13
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading llmrails from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.13 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8639660131331912, 'spearman': 0.8696753164392599}, 'manhattan': {'pearson': 0.8624625814772828, 'spearman': 0.8676125706323566}, 'euclidean': {'pearson': 0.8638901947951633, 'spearman': 0.8696753164392599}, 'evaluation_time': 0.13}
INFO:main:Running task: STS14


Loading llmrails from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.32 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8489585524088743, 'spearman': 0.8270245346322668}, 'manhattan': {'pearson': 0.835609793260486, 'spearman': 0.8214375748102896}, 'euclidean': {'pearson': 0.8377456455788677, 'spearman': 0.8270245346322668}, 'evaluation_time': 0.32}
INFO:main:Running task: STS15


Loading llmrails from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.25 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8883962504951354, 'spearman': 0.894810497654876}, 'manhattan': {'pearson': 0.8862343988321655, 'spearman': 0.893913083371832}, 'euclidean': {'pearson': 0.8872518216694592, 'spearman': 0.894810497654876}, 'evaluation_time': 0.25}
INFO:main:Running task: STS16


Loading llmrails from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.10 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8582068361344009, 'spearman': 0.8732850937480312}, 'manhattan': {'pearson': 0.8646410959560384, 'spearman': 0.8718441525662279}, 'euclidean': {'pearson': 0.8654852026603848, 'spearman': 0.8732850937480312}, 'evaluation_time': 0.1}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading llmrails from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.896962034327069, 'spearman': 0.8968156046898975}, 'manhattan': {'pearson': 0.9006459170123587, 'spearman': 0.8995786470990168}, 'euclidean': {'pearson': 0.8991490474650754, 'spearman': 0.8968156046898975}}, 'evaluation_time':

Loading llmrails from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.05 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6748343787049773, 'spearman': 0.6702761767559182}, 'manhattan': {'pearson': 0.6889061235873887, 'spearman': 0.6709565676431362}, 'euclidean': {'pearson': 0.6922733025658843, 'spearman': 0.6702761767559182}}, 'evaluation_time': 0.05

Loading llmrails from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.12 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8769810661608437, 'spearman': 0.8828592444467525}, 'manhattan': {'pearson': 0.8784687095694266, 'spearman': 0.880959941243115}, 'euclidean': {'pearson': 0.8791145695532055, 'spearman': 0.8828592444467525}, 'evaluation_time': 0.12}


Converting the results to a CSV file...
Using model name llmrails$voyage
Converting results/llmrails$voyage to results/llmrails$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not found
Med

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.893496594059791, 'spearman': 0.8737307519213955}, 'manhattan': {'pearson': 0.87319914212027, 'spearman': 0.8717849244518664}, 'euclidean': {'pearson': 0.8741880434547791, 'spearman': 0.8737307519213955}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading cohere from cache for SICK-R...
Loading gist from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 2.21 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8745895903808968, 'spearman': 0.8284927673508724}, 'manhattan': {'pearson': 0.8470997614483057, 'spearman': 0.8279877097792708}, 'euclidean': {'pearson': 0.8481565047398594, 'spearman': 0.8284927673508724}, 'evaluation_time': 2.21}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading cohere from cache for STS12...
Loading gist from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.54 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8719228295713733, 'spearman': 0.7825764929416429}, 'manhattan': {'pearson': 0.8421572358190006, 'spearman': 0.7810357230627712}, 'euclidean': {'pearson': 0.8440913835831119, 'spearman': 0.7825764929416429}, 'evaluation_time': 0.54}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading cohere from cache for STS13...
Loading gist from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.26 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8885874703695091, 'spearman': 0.8927476469689892}, 'manhattan': {'pearson': 0.8848104509209277, 'spearman': 0.8918425932055892}, 'euclidean': {'pearson': 0.885409101715428, 'spearman': 0.8927476469689892}, 'evaluation_time': 0.26}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading cohere from cache for STS14...
Loading gist from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.60 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8719711510591781, 'spearman': 0.8452367700535259}, 'manhattan': {'pearson': 0.8623041489649395, 'spearman': 0.8440587298468498}, 'euclidean': {'pearson': 0.8631634606736538, 'spearman': 0.8452367700535259}, 'evaluation_time': 0.6}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading cohere from cache for STS15...
Loading gist from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.64 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8937310543377166, 'spearman': 0.9030503899281768}, 'manhattan': {'pearson': 0.8961699398402181, 'spearman': 0.903231615511711}, 'euclidean': {'pearson': 0.8959190105551023, 'spearman': 0.9030503899281768}, 'evaluation_time': 0.64}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading cohere from cache for STS16...
Loading gist from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.19 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.860770803294941, 'spearman': 0.872991612405775}, 'manhattan': {'pearson': 0.865902660795013, 'spearman': 0.8728369784105463}, 'euclidean': {'pearson': 0.8660440134972508, 'spearman': 0.872991612405775}, 'evaluation_time': 0.19}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading angle from cache for STS17...
Loading cohere from cache for STS17...
Loading gist from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.06 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9032584312180859, 'spearman': 0.9017562535608505}, 'manhattan': {'pearson': 0.9072815889329832, 'spearman': 0.9039092586434155}, 'euclidean': {'pearson': 0.9064974940164945, 'spearman': 0.9017562535608505}}, 'evaluation_time'

Loading angle from cache for STS22...
Loading cohere from cache for STS22...
Loading gist from cache for STS22...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.07 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6889124202315657, 'spearman': 0.6921430934518209}, 'manhattan': {'pearson': 0.7058658942886542, 'spearman': 0.6911290180385747}, 'euclidean': {'pearson': 0.7057950901470593, 'spearman': 0.6921430934518209}}, 'evaluation_time': 0.07

Loading angle from cache for STSBenchmark...
Loading cohere from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.22 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8810733837292617, 'spearman': 0.8950620850173698}, 'manhattan': {'pearson': 0.8931620828505636, 'spearman': 0.8946230716893506}, 'euclidean': {'pearson': 0.893091502406258, 'spearman': 0.8950620850173698}, 'evaluation_time': 0.22}


Converting the results to a CSV file...
Using model name angle$cohere$gist
Converting results/angle$cohere$gist to results/angle$cohere$gist_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not fou

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8804442513018818, 'spearman': 0.8628750828808651}, 'manhattan': {'pearson': 0.8566138364661187, 'spearman': 0.8607726253363895}, 'euclidean': {'pearson': 0.8601712263870457, 'spearman': 0.8628750828808651}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading cohere from cache for SICK-R...
Loading llmrails from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 1.87 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.871965430121567, 'spearman': 0.8286741667424542}, 'manhattan': {'pearson': 0.8418707571309965, 'spearman': 0.8277928103463549}, 'euclidean': {'pearson': 0.84424244937505, 'spearman': 0.8286741667424542}, 'evaluation_time': 1.87}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading cohere from cache for STS12...
Loading llmrails from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.51 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8745006253360759, 'spearman': 0.7871408681046363}, 'manhattan': {'pearson': 0.8394985050368472, 'spearman': 0.7864835464180907}, 'euclidean': {'pearson': 0.8421687257536613, 'spearman': 0.7871408681046363}, 'evaluation_time': 0.51}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading cohere from cache for STS13...
Loading llmrails from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.24 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8845182591726356, 'spearman': 0.8894633902554276}, 'manhattan': {'pearson': 0.8798292812219841, 'spearman': 0.8875738347924439}, 'euclidean': {'pearson': 0.8813763516804143, 'spearman': 0.8894633902554276}, 'evaluation_time': 0.24}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading cohere from cache for STS14...
Loading llmrails from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.58 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8681293496643823, 'spearman': 0.8456430132021612}, 'manhattan': {'pearson': 0.8562701606160363, 'spearman': 0.8441004128844896}, 'euclidean': {'pearson': 0.8584062237502985, 'spearman': 0.8456430132021612}, 'evaluation_time': 0.58}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading cohere from cache for STS15...
Loading llmrails from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.48 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.893790312889291, 'spearman': 0.9021198086712945}, 'manhattan': {'pearson': 0.8947512975014758, 'spearman': 0.9019476405967749}, 'euclidean': {'pearson': 0.8949061234180444, 'spearman': 0.9021198086712945}, 'evaluation_time': 0.48}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading cohere from cache for STS16...
Loading llmrails from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.19 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8622842481965751, 'spearman': 0.874336250290625}, 'manhattan': {'pearson': 0.8663916865093773, 'spearman': 0.8742310923148684}, 'euclidean': {'pearson': 0.8665317534865713, 'spearman': 0.874336250290625}, 'evaluation_time': 0.19}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading angle from cache for STS17...
Loading cohere from cache for STS17...
Loading llmrails from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.05 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9051288170515928, 'spearman': 0.9031193237309586}, 'manhattan': {'pearson': 0.9079051282242805, 'spearman': 0.9049748058829923}, 'euclidean': {'pearson': 0.9070283503734109, 'spearman': 0.9031193237309586}}, 'evaluation_time'

Loading angle from cache for STS22...
Loading cohere from cache for STS22...
Loading llmrails from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.07 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6852301029123573, 'spearman': 0.686136771983317}, 'manhattan': {'pearson': 0.7000278273284852, 'spearman': 0.6845855458696355}, 'euclidean': {'pearson': 0.7012877917381838, 'spearman': 0.686136771983317}}, 'evaluation_time': 0.07}


Loading angle from cache for STSBenchmark...
Loading cohere from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.22 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.882456885154732, 'spearman': 0.8958880414201892}, 'manhattan': {'pearson': 0.8920013941236247, 'spearman': 0.8955288114210763}, 'euclidean': {'pearson': 0.8921752069116017, 'spearman': 0.8958880414201892}, 'evaluation_time': 0.22}


Converting the results to a CSV file...
Using model name angle$cohere$llmrails
Converting results/angle$cohere$llmrails to results/angle$cohere$llmrails_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8922722790690027, 'spearman': 0.8741223271087619}, 'manhattan': {'pearson': 0.8774216714484759, 'spearman': 0.8768754018876311}, 'euclidean': {'pearson': 0.8752161775126861, 'spearman': 0.8741223271087619}, 'evaluation_time': 0.03}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading cohere from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 1.86 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8752229120182698, 'spearman': 0.8281367100768345}, 'manhattan': {'pearson': 0.8497252323914393, 'spearman': 0.8268071163746364}, 'euclidean': {'pearson': 0.8494851611646577, 'spearman': 0.8281367100768345}, 'evaluation_time': 1.86}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading cohere from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.52 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8693370880411129, 'spearman': 0.7811104999150806}, 'manhattan': {'pearson': 0.8273216781824781, 'spearman': 0.7759970305168273}, 'euclidean': {'pearson': 0.8373087113354485, 'spearman': 0.7811104999150806}, 'evaluation_time': 0.52}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading cohere from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.24 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8841513368151761, 'spearman': 0.8875731728301113}, 'manhattan': {'pearson': 0.8767802384447339, 'spearman': 0.8826879726716108}, 'euclidean': {'pearson': 0.8808361226001331, 'spearman': 0.8875731728301113}, 'evaluation_time': 0.24}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading cohere from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.60 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8695827346326507, 'spearman': 0.8411446807750531}, 'manhattan': {'pearson': 0.8556539052496617, 'spearman': 0.8350886698588184}, 'euclidean': {'pearson': 0.8600464601234674, 'spearman': 0.8411446807750531}, 'evaluation_time': 0.6}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading cohere from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.48 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8958364179619607, 'spearman': 0.9035349420821054}, 'manhattan': {'pearson': 0.8948124579911378, 'spearman': 0.9019667443092753}, 'euclidean': {'pearson': 0.8963211349390451, 'spearman': 0.9035349420821054}, 'evaluation_time': 0.48}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading cohere from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.19 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8631120330456202, 'spearman': 0.8754381926300666}, 'manhattan': {'pearson': 0.8683450832875212, 'spearman': 0.8754271525567002}, 'euclidean': {'pearson': 0.8681419972265486, 'spearman': 0.8754381926300666}, 'evaluation_time': 0.19}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading angle from cache for STS17...
Loading cohere from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.05 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.905993607142755, 'spearman': 0.9056701792438853}, 'manhattan': {'pearson': 0.910160420069525, 'spearman': 0.9077036369990662}, 'euclidean': {'pearson': 0.9090101130694215, 'spearman': 0.9056701792438853}}, 'evaluation_time': 

Loading angle from cache for STS22...
Loading cohere from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.08 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6817199813388726, 'spearman': 0.6852887249683873}, 'manhattan': {'pearson': 0.695645429479602, 'spearman': 0.6781942663991531}, 'euclidean': {'pearson': 0.6996006457020172, 'spearman': 0.6852887249683873}}, 'evaluation_time': 0.08}

Loading angle from cache for STSBenchmark...
Loading cohere from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.26 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8847275805303604, 'spearman': 0.896816564607923}, 'manhattan': {'pearson': 0.8944999688283924, 'spearman': 0.8952731400866739}, 'euclidean': {'pearson': 0.8951144668192031, 'spearman': 0.896816564607923}, 'evaluation_time': 0.26}


Converting the results to a CSV file...
Using model name angle$cohere$voyage
Converting results/angle$cohere$voyage to results/angle$cohere$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set n

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8908163298901028, 'spearman': 0.8763753904945324}, 'manhattan': {'pearson': 0.8684938693512755, 'spearman': 0.8745922173336018}, 'euclidean': {'pearson': 0.8702796721379447, 'spearman': 0.8763753904945324}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading gist from cache for SICK-R...
Loading llmrails from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 0.93 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8740367440604522, 'spearman': 0.8293982408761909}, 'manhattan': {'pearson': 0.8461718885293942, 'spearman': 0.8293824068954592}, 'euclidean': {'pearson': 0.8469575702531696, 'spearman': 0.8293981937544661}, 'evaluation_time': 0.93}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading gist from cache for STS12...
Loading llmrails from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.28 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8762905962636974, 'spearman': 0.7883561119315888}, 'manhattan': {'pearson': 0.845578297659108, 'spearman': 0.7875836283867698}, 'euclidean': {'pearson': 0.8472830349266033, 'spearman': 0.7883559326639147}, 'evaluation_time': 0.28}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading gist from cache for STS13...
Loading llmrails from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.14 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8864233842416258, 'spearman': 0.8936498037574502}, 'manhattan': {'pearson': 0.8848884581942831, 'spearman': 0.8926512104457393}, 'euclidean': {'pearson': 0.8858657651757305, 'spearman': 0.8936498037574502}, 'evaluation_time': 0.14}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading gist from cache for STS14...
Loading llmrails from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.33 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8668047458530697, 'spearman': 0.84742722456712}, 'manhattan': {'pearson': 0.8578917022123259, 'spearman': 0.8462881900915267}, 'euclidean': {'pearson': 0.8589935488162691, 'spearman': 0.8474272493671172}, 'evaluation_time': 0.33}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading gist from cache for STS15...
Loading llmrails from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.27 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8869362857519105, 'spearman': 0.8970592873449624}, 'manhattan': {'pearson': 0.8905214113500942, 'spearman': 0.8972235843438333}, 'euclidean': {'pearson': 0.8904711837721757, 'spearman': 0.8970592873449624}, 'evaluation_time': 0.27}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading gist from cache for STS16...
Loading llmrails from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.11 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8569477030049104, 'spearman': 0.8699758627678924}, 'manhattan': {'pearson': 0.8625957056184377, 'spearman': 0.8698870861303428}, 'euclidean': {'pearson': 0.862551344526008, 'spearman': 0.8699758627678924}, 'evaluation_time': 0.11}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading angle from cache for STS17...
Loading gist from cache for STS17...
Loading llmrails from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.8982989233383333, 'spearman': 0.8979611064120978}, 'manhattan': {'pearson': 0.9028366250303576, 'spearman': 0.8988175419278904}, 'euclidean': {'pearson': 0.9017456786431932, 'spearman': 0.8979611064120978}}, 'evaluation_time'

Loading angle from cache for STS22...
Loading gist from cache for STS22...
Loading llmrails from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.06 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6895656035950217, 'spearman': 0.6928129040979696}, 'manhattan': {'pearson': 0.7063618522985032, 'spearman': 0.6935193386555119}, 'euclidean': {'pearson': 0.7056893476186665, 'spearman': 0.6928129040979696}}, 'evaluation_time': 0.06

Loading angle from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.13 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.877851209057702, 'spearman': 0.8908371443921702}, 'manhattan': {'pearson': 0.8872257434592623, 'spearman': 0.8902086888952525}, 'euclidean': {'pearson': 0.8875611198745037, 'spearman': 0.8908371443921702}, 'evaluation_time': 0.13}


Converting the results to a CSV file...
Using model name angle$gist$llmrails
Converting results/angle$gist$llmrails to results/angle$gist$llmrails_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set n

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.9035840958162744, 'spearman': 0.8906046303800667}, 'manhattan': {'pearson': 0.8895526979474258, 'spearman': 0.8898275042389855}, 'euclidean': {'pearson': 0.8868362434802972, 'spearman': 0.8906046303800667}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading gist from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 1.70 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8774417567361029, 'spearman': 0.8292011601890411}, 'manhattan': {'pearson': 0.8546607479016375, 'spearman': 0.8286025177821363}, 'euclidean': {'pearson': 0.8530632039994737, 'spearman': 0.8292011601890411}, 'evaluation_time': 1.7}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading gist from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.50 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8734575721605479, 'spearman': 0.7844328282201012}, 'manhattan': {'pearson': 0.837525072402153, 'spearman': 0.780712424426811}, 'euclidean': {'pearson': 0.844973388444163, 'spearman': 0.7844328282201012}, 'evaluation_time': 0.5}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading gist from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.22 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8895212865885554, 'spearman': 0.8953043234440765}, 'manhattan': {'pearson': 0.886437961800913, 'spearman': 0.8925319557961717}, 'euclidean': {'pearson': 0.88865774723991, 'spearman': 0.8953043234440765}, 'evaluation_time': 0.22}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading gist from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.55 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8702859065245809, 'spearman': 0.8451507035802281}, 'manhattan': {'pearson': 0.8605498279830985, 'spearman': 0.8406808617873825}, 'euclidean': {'pearson': 0.8627632601258285, 'spearman': 0.8451507035802281}, 'evaluation_time': 0.55}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading gist from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.43 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8904302412753525, 'spearman': 0.9002101838911554}, 'manhattan': {'pearson': 0.8934122277149179, 'spearman': 0.9003320522178541}, 'euclidean': {'pearson': 0.89339189272144, 'spearman': 0.9002101838911554}, 'evaluation_time': 0.43}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading gist from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.18 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8576587887035076, 'spearman': 0.8710453963261989}, 'manhattan': {'pearson': 0.8654015477538846, 'spearman': 0.8721398849743358}, 'euclidean': {'pearson': 0.8642935911077905, 'spearman': 0.8710453963261989}, 'evaluation_time': 0.18}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading angle from cache for STS17...
Loading gist from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.05 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.8997938145088964, 'spearman': 0.8976920288263461}, 'manhattan': {'pearson': 0.906209726251252, 'spearman': 0.9024573928700095}, 'euclidean': {'pearson': 0.9042438938152133, 'spearman': 0.8976920288263461}}, 'evaluation_time':

Loading angle from cache for STS22...
Loading gist from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.06 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.687697013737536, 'spearman': 0.6897495173760932}, 'manhattan': {'pearson': 0.7047602562012842, 'spearman': 0.6898650861631564}, 'euclidean': {'pearson': 0.7058889396256316, 'spearman': 0.6897495173760932}}, 'evaluation_time': 0.06}

Loading angle from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.21 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8798108409418048, 'spearman': 0.8915556845654371}, 'manhattan': {'pearson': 0.8899652224761023, 'spearman': 0.8903894652875495}, 'euclidean': {'pearson': 0.8901784447827406, 'spearman': 0.8915556845654371}, 'evaluation_time': 0.21}


Converting the results to a CSV file...
Using model name angle$gist$voyage
Converting results/angle$gist$voyage to results/angle$gist$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not fou

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8909297098212849, 'spearman': 0.8803815058729746}, 'manhattan': {'pearson': 0.8757331512845151, 'spearman': 0.8811767047150112}, 'euclidean': {'pearson': 0.8729942364932892, 'spearman': 0.8803815058729746}, 'evaluation_time': 0.03}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading llmrails from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 1.64 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8741838562967831, 'spearman': 0.8290371366462641}, 'manhattan': {'pearson': 0.8484086011673402, 'spearman': 0.827540362374174}, 'euclidean': {'pearson': 0.8481733721111724, 'spearman': 0.8290371366462641}, 'evaluation_time': 1.64}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading llmrails from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.47 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8761521789364335, 'spearman': 0.7902031477831964}, 'manhattan': {'pearson': 0.8342279356313471, 'spearman': 0.7873903327313886}, 'euclidean': {'pearson': 0.8423278447644249, 'spearman': 0.7902031477831964}, 'evaluation_time': 0.47}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading llmrails from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.23 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8825749898312361, 'spearman': 0.8897088786305519}, 'manhattan': {'pearson': 0.8799158791962196, 'spearman': 0.8865308526356122}, 'euclidean': {'pearson': 0.8824339789488973, 'spearman': 0.8897088786305519}, 'evaluation_time': 0.23}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading llmrails from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.57 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8643748665008274, 'spearman': 0.8449790036905444}, 'manhattan': {'pearson': 0.8533092603849399, 'spearman': 0.8403640619763967}, 'euclidean': {'pearson': 0.8560793577549879, 'spearman': 0.8449790036905444}, 'evaluation_time': 0.57}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading llmrails from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.44 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8901244303667488, 'spearman': 0.8989237829490876}, 'manhattan': {'pearson': 0.8921422810918533, 'spearman': 0.8993929626278067}, 'euclidean': {'pearson': 0.8918300497148444, 'spearman': 0.8989237829490876}, 'evaluation_time': 0.44}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading llmrails from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.18 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8594806926906815, 'spearman': 0.8729541068029306}, 'manhattan': {'pearson': 0.866104137234398, 'spearman': 0.8739071321976937}, 'euclidean': {'pearson': 0.8649635478117504, 'spearman': 0.8729541068029306}, 'evaluation_time': 0.18}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading angle from cache for STS17...
Loading llmrails from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9000569954922364, 'spearman': 0.8994729380474715}, 'manhattan': {'pearson': 0.905301898607051, 'spearman': 0.9029513424381395}, 'euclidean': {'pearson': 0.903513727312971, 'spearman': 0.8994729380474715}}, 'evaluation_time': 

Loading angle from cache for STS22...
Loading llmrails from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.07 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6834842510185669, 'spearman': 0.6841118766156153}, 'manhattan': {'pearson': 0.6986574269387646, 'spearman': 0.6802159063080642}, 'euclidean': {'pearson': 0.7007754453148309, 'spearman': 0.6841118766156153}}, 'evaluation_time': 0.07

Loading angle from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.22 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8811266834236569, 'spearman': 0.8925346057841013}, 'manhattan': {'pearson': 0.8887322235557927, 'spearman': 0.8915912937373299}, 'euclidean': {'pearson': 0.8890078022397936, 'spearman': 0.8925346057841013}, 'evaluation_time': 0.22}


Converting the results to a CSV file...
Using model name angle$llmrails$voyage
Converting results/angle$llmrails$voyage to results/angle$llmrails$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8881769777353928, 'spearman': 0.8675197670263973}, 'manhattan': {'pearson': 0.8697276644968379, 'spearman': 0.8726765110323319}, 'euclidean': {'pearson': 0.8678114291068738, 'spearman': 0.8675197670263973}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading cohere from cache for SICK-R...
Loading gist from cache for SICK-R...
Loading llmrails from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 2.18 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8686831711772592, 'spearman': 0.8255932572697272}, 'manhattan': {'pearson': 0.8395571760640507, 'spearman': 0.8264371175943837}, 'euclidean': {'pearson': 0.8395973361262565, 'spearman': 0.8255932572697272}, 'evaluation_time': 2.18}
INFO:main:Running task: STS12


Loading cohere from cache for STS12...
Loading gist from cache for STS12...
Loading llmrails from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.43 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8591693480662017, 'spearman': 0.7710741452236979}, 'manhattan': {'pearson': 0.828217469401616, 'spearman': 0.7756182535666056}, 'euclidean': {'pearson': 0.8249105873215069, 'spearman': 0.7710741452236979}, 'evaluation_time': 0.43}
INFO:main:Running task: STS13


Loading cohere from cache for STS13...
Loading gist from cache for STS13...
Loading llmrails from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.22 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8753609557726459, 'spearman': 0.8774705848671113}, 'manhattan': {'pearson': 0.8742567811932285, 'spearman': 0.8812638087588297}, 'euclidean': {'pearson': 0.8703807862897865, 'spearman': 0.8774705848671113}, 'evaluation_time': 0.22}
INFO:main:Running task: STS14


Loading cohere from cache for STS14...
Loading gist from cache for STS14...
Loading llmrails from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.53 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8637044456238652, 'spearman': 0.8332942680188694}, 'manhattan': {'pearson': 0.8536392037685087, 'spearman': 0.8366214664833135}, 'euclidean': {'pearson': 0.8524515904971937, 'spearman': 0.8332942680188694}, 'evaluation_time': 0.53}
INFO:main:Running task: STS15


Loading cohere from cache for STS15...
Loading gist from cache for STS15...
Loading llmrails from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.42 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8956904848625692, 'spearman': 0.9026486852550959}, 'manhattan': {'pearson': 0.8958851238737839, 'spearman': 0.9030987527690333}, 'euclidean': {'pearson': 0.8954582029359524, 'spearman': 0.9026486852550959}, 'evaluation_time': 0.42}
INFO:main:Running task: STS16


Loading cohere from cache for STS16...
Loading gist from cache for STS16...
Loading llmrails from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.17 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8621421380239014, 'spearman': 0.8737363831978189}, 'manhattan': {'pearson': 0.8674952414529471, 'spearman': 0.8741333734367964}, 'euclidean': {'pearson': 0.8673913297394941, 'spearman': 0.8737363831978189}, 'evaluation_time': 0.17}
INFO:main:Running task: STS17


Loading cohere from cache for STS17...
Loading gist from cache for STS17...
Loading llmrails from cache for STS17...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9115045632642604, 'spearman': 0.9093131053584127}, 'manhattan': {'pearson': 0.9105010466783028, 'spearman': 0.909548740444221}, 'euclidean': {'pearson': 0.9105989658290508, 'spearman': 0.9093131053584127}}, 'evaluation_time':

Loading cohere from cache for STS22...
Loading gist from cache for STS22...
Loading llmrails from cache for STS22...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.07 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6837833758550852, 'spearman': 0.6834086975168636}, 'manhattan': {'pearson': 0.7005723223152738, 'spearman': 0.6835299633568105}, 'euclidean': {'pearson': 0.6990949215168663, 'spearman': 0.6834086975168636}}, 'evaluation_time': 0.07

Loading cohere from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.20 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8827657786155687, 'spearman': 0.8936736640487286}, 'manhattan': {'pearson': 0.89148454008245, 'spearman': 0.8934904434630315}, 'euclidean': {'pearson': 0.8918414177619085, 'spearman': 0.8936736640487286}, 'evaluation_time': 0.2}


Converting the results to a CSV file...
Using model name cohere$gist$llmrails
Converting results/cohere$gist$llmrails to results/cohere$gist$llmrails_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test se

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8938486060861954, 'spearman': 0.8723391539478315}, 'manhattan': {'pearson': 0.8850386018760288, 'spearman': 0.8842309911764701}, 'euclidean': {'pearson': 0.8769126641612885, 'spearman': 0.8723391539478315}, 'evaluation_time': 0.03}
INFO:main:Running task: SICK-R


Loading cohere from cache for SICK-R...
Loading gist from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 1.88 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8700842072286148, 'spearman': 0.8233027145311181}, 'manhattan': {'pearson': 0.8459139057456059, 'spearman': 0.8239876309052767}, 'euclidean': {'pearson': 0.842617996554139, 'spearman': 0.8233027145311181}, 'evaluation_time': 1.88}
INFO:main:Running task: STS12


Loading cohere from cache for STS12...
Loading gist from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.50 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8465673660863504, 'spearman': 0.7580703295818536}, 'manhattan': {'pearson': 0.8104024652090374, 'spearman': 0.7592033615833503}, 'euclidean': {'pearson': 0.8133563382637732, 'spearman': 0.7580703295818536}, 'evaluation_time': 0.5}
INFO:main:Running task: STS13


Loading cohere from cache for STS13...
Loading gist from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.24 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8676057629002112, 'spearman': 0.869693385163788}, 'manhattan': {'pearson': 0.8659094264882936, 'spearman': 0.8719814312741421}, 'euclidean': {'pearson': 0.8632143104144379, 'spearman': 0.869693385163788}, 'evaluation_time': 0.24}
INFO:main:Running task: STS14


Loading cohere from cache for STS14...
Loading gist from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.61 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8588542623255573, 'spearman': 0.8231262443586583}, 'manhattan': {'pearson': 0.8484600216462895, 'spearman': 0.8231977383871418}, 'euclidean': {'pearson': 0.8483174992983609, 'spearman': 0.8231262443586583}, 'evaluation_time': 0.61}
INFO:main:Running task: STS15


Loading cohere from cache for STS15...
Loading gist from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.47 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8927343774694524, 'spearman': 0.8996450472434552}, 'manhattan': {'pearson': 0.8922675947321024, 'spearman': 0.8992111191380314}, 'euclidean': {'pearson': 0.8926214892657927, 'spearman': 0.8996450472434552}, 'evaluation_time': 0.47}
INFO:main:Running task: STS16


Loading cohere from cache for STS16...
Loading gist from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.19 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8599322315249781, 'spearman': 0.8719484093859631}, 'manhattan': {'pearson': 0.8670317206680336, 'spearman': 0.8727605917561932}, 'euclidean': {'pearson': 0.8664122717533238, 'spearman': 0.8719484093859631}, 'evaluation_time': 0.19}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading cohere from cache for STS17...
Loading gist from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.05 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9101103709752365, 'spearman': 0.908753039583498}, 'manhattan': {'pearson': 0.911370669550276, 'spearman': 0.9104082511324223}, 'euclidean': {'pearson': 0.9107536186388938, 'spearman': 0.908753039583498}}, 'evaluation_time': 0

Loading cohere from cache for STS22...
Loading gist from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.07 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6783467680448472, 'spearman': 0.6808962971952822}, 'manhattan': {'pearson': 0.6943580383033037, 'spearman': 0.678690723864707}, 'euclidean': {'pearson': 0.694804907169986, 'spearman': 0.6808962971952822}}, 'evaluation_time': 0.07}


Loading cohere from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.22 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8816809862540996, 'spearman': 0.8909953686907209}, 'manhattan': {'pearson': 0.8907054771916652, 'spearman': 0.889824086931657}, 'euclidean': {'pearson': 0.8914064200166186, 'spearman': 0.8909953686907209}, 'evaluation_time': 0.22}


Converting the results to a CSV file...
Using model name cohere$gist$voyage
Converting results/cohere$gist$voyage to results/cohere$gist$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test set not 

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8832380331747777, 'spearman': 0.8654414064165289}, 'manhattan': {'pearson': 0.8718045880363292, 'spearman': 0.8747849928104592}, 'euclidean': {'pearson': 0.8647667575688622, 'spearman': 0.8654414064165289}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading cohere from cache for SICK-R...
Loading llmrails from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 1.84 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8649160000634099, 'spearman': 0.8219682414082158}, 'manhattan': {'pearson': 0.8381558673474356, 'spearman': 0.8217702234802462}, 'euclidean': {'pearson': 0.8365881363408463, 'spearman': 0.8219682414082158}, 'evaluation_time': 1.84}
INFO:main:Running task: STS12


Loading cohere from cache for STS12...
Loading llmrails from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.49 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8475973900301141, 'spearman': 0.7634599821017798}, 'manhattan': {'pearson': 0.8059437917218855, 'spearman': 0.7657305065318349}, 'euclidean': {'pearson': 0.809783980230552, 'spearman': 0.7634599821017798}, 'evaluation_time': 0.49}
INFO:main:Running task: STS13


Loading cohere from cache for STS13...
Loading llmrails from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.24 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8641191029625117, 'spearman': 0.8661722046576462}, 'manhattan': {'pearson': 0.8612329245158399, 'spearman': 0.8670470572793592}, 'euclidean': {'pearson': 0.8596557669769862, 'spearman': 0.8661722046576462}, 'evaluation_time': 0.24}
INFO:main:Running task: STS14


Loading cohere from cache for STS14...
Loading llmrails from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.59 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8560874451605629, 'spearman': 0.823939664427776}, 'manhattan': {'pearson': 0.8427134123345277, 'spearman': 0.8237950513253167}, 'euclidean': {'pearson': 0.8440692160577163, 'spearman': 0.823939664427776}, 'evaluation_time': 0.59}
INFO:main:Running task: STS15


Loading cohere from cache for STS15...
Loading llmrails from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.46 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8941050865763783, 'spearman': 0.8998080819608583}, 'manhattan': {'pearson': 0.8920613135684199, 'spearman': 0.8995727384226874}, 'euclidean': {'pearson': 0.8924429623693111, 'spearman': 0.8998080819608583}, 'evaluation_time': 0.46}
INFO:main:Running task: STS16


Loading cohere from cache for STS16...
Loading llmrails from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.18 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8616282195431654, 'spearman': 0.8731302007160513}, 'manhattan': {'pearson': 0.8678535568229993, 'spearman': 0.8746058017448408}, 'euclidean': {'pearson': 0.8669449258305534, 'spearman': 0.8731302007160513}, 'evaluation_time': 0.18}
INFO:main:Running task: STS17


Loading cohere from cache for STS17...
Loading llmrails from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.05 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9124778202634012, 'spearman': 0.9111063152691726}, 'manhattan': {'pearson': 0.9116684864485762, 'spearman': 0.9127415381974409}, 'euclidean': {'pearson': 0.9118664806561625, 'spearman': 0.9111063152691726}}, 'evaluation_time'

Loading cohere from cache for STS22...
Loading llmrails from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.07 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6735339647587084, 'spearman': 0.67179973146988}, 'manhattan': {'pearson': 0.6876500394339601, 'spearman': 0.6686940237837268}, 'euclidean': {'pearson': 0.6893113070501564, 'spearman': 0.67179973146988}}, 'evaluation_time': 0.07}
IN

Loading cohere from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.21 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8829947297027733, 'spearman': 0.8919086471670397}, 'manhattan': {'pearson': 0.8893609026678737, 'spearman': 0.8907707938187465}, 'euclidean': {'pearson': 0.8903642213717491, 'spearman': 0.8919086471670397}, 'evaluation_time': 0.21}


Converting the results to a CSV file...
Using model name cohere$llmrails$voyage
Converting results/cohere$llmrails$voyage to results/cohere$llmrails$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - t

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.02 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.9088580397167556, 'spearman': 0.8968216395087166}, 'manhattan': {'pearson': 0.8903103371634311, 'spearman': 0.8915142896614873}, 'euclidean': {'pearson': 0.8908347879333874, 'spearman': 0.8968216395087166}, 'evaluation_time': 0.02}
INFO:main:Running task: SICK-R


Loading gist from cache for SICK-R...
Loading llmrails from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 1.68 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8755443032445576, 'spearman': 0.8288886997182481}, 'manhattan': {'pearson': 0.8481614073919828, 'spearman': 0.8268173665535267}, 'euclidean': {'pearson': 0.8484864573743844, 'spearman': 0.8288886997182481}, 'evaluation_time': 1.68}
INFO:main:Running task: STS12


Loading gist from cache for STS12...
Loading llmrails from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.45 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8666355056807474, 'spearman': 0.7785851051844316}, 'manhattan': {'pearson': 0.8260088202998656, 'spearman': 0.7772479305252974}, 'euclidean': {'pearson': 0.832983915598493, 'spearman': 0.7785851051844316}, 'evaluation_time': 0.45}
INFO:main:Running task: STS13


Loading gist from cache for STS13...
Loading llmrails from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.23 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.885049497864915, 'spearman': 0.8894019167319343}, 'manhattan': {'pearson': 0.8792781926208317, 'spearman': 0.8855508921399868}, 'euclidean': {'pearson': 0.8827042815399513, 'spearman': 0.8894019167319343}, 'evaluation_time': 0.23}
INFO:main:Running task: STS14


Loading gist from cache for STS14...
Loading llmrails from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.55 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8660513759952165, 'spearman': 0.8387802459626184}, 'manhattan': {'pearson': 0.8535654658176408, 'spearman': 0.8354650741105225}, 'euclidean': {'pearson': 0.85690921146952, 'spearman': 0.8387802459626184}, 'evaluation_time': 0.55}
INFO:main:Running task: STS15


Loading gist from cache for STS15...
Loading llmrails from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.44 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8909236557826761, 'spearman': 0.899811779833881}, 'manhattan': {'pearson': 0.892921455539764, 'spearman': 0.8999363833632604}, 'euclidean': {'pearson': 0.893070072681164, 'spearman': 0.899811779833881}, 'evaluation_time': 0.44}
INFO:main:Running task: STS16


Loading gist from cache for STS16...
Loading llmrails from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.21 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8578378376298117, 'spearman': 0.8719957547369467}, 'manhattan': {'pearson': 0.8665900925286827, 'spearman': 0.8728965896989579}, 'euclidean': {'pearson': 0.8656740195973429, 'spearman': 0.8719957547369467}, 'evaluation_time': 0.21}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading gist from cache for STS17...
Loading llmrails from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.906439674649727, 'spearman': 0.9037812545919078}, 'manhattan': {'pearson': 0.9081490341414107, 'spearman': 0.9065785082940727}, 'euclidean': {'pearson': 0.9071599559124713, 'spearman': 0.9037812545919078}}, 'evaluation_time':

Loading gist from cache for STS22...
Loading llmrails from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.06 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6890153964211962, 'spearman': 0.690101106925469}, 'manhattan': {'pearson': 0.7031425264688624, 'spearman': 0.6863304717813526}, 'euclidean': {'pearson': 0.7061367688194846, 'spearman': 0.690101106925469}}, 'evaluation_time': 0.06}


Loading gist from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.21 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8803360151684518, 'spearman': 0.8881321873968734}, 'manhattan': {'pearson': 0.8866892424051107, 'spearman': 0.8874604831919897}, 'euclidean': {'pearson': 0.8870542001353642, 'spearman': 0.8881321873968734}, 'evaluation_time': 0.21}


Converting the results to a CSV file...
Using model name gist$llmrails$voyage
Converting results/gist$llmrails$voyage to results/gist$llmrails$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP2P - test se

INFO:main:Running task: BIOSSES


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8928454831649659, 'spearman': 0.878568211543785}, 'manhattan': {'pearson': 0.8697696940783188, 'spearman': 0.8751464468295668}, 'euclidean': {'pearson': 0.8720664693523265, 'spearman': 0.878568211543785}, 'evaluation_time': 0.03}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading cohere from cache for SICK-R...
Loading gist from cache for SICK-R...
Loading llmrails from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 2.63 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8731951761443993, 'spearman': 0.829225182214024}, 'manhattan': {'pearson': 0.8441007864316483, 'spearman': 0.8288565375052802}, 'euclidean': {'pearson': 0.844865951982962, 'spearman': 0.829225182214024}, 'evaluation_time': 2.63}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading cohere from cache for STS12...
Loading gist from cache for STS12...
Loading llmrails from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.78 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.870882249176649, 'spearman': 0.7828969015541773}, 'manhattan': {'pearson': 0.8389084402898784, 'spearman': 0.7835580250597293}, 'euclidean': {'pearson': 0.8386307963865465, 'spearman': 0.7828969015541773}, 'evaluation_time': 0.78}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading cohere from cache for STS13...
Loading gist from cache for STS13...
Loading llmrails from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.34 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8850616976570924, 'spearman': 0.8891220863916071}, 'manhattan': {'pearson': 0.8817690343919058, 'spearman': 0.8894158517508215}, 'euclidean': {'pearson': 0.8813019286448043, 'spearman': 0.8891220863916071}, 'evaluation_time': 0.34}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading cohere from cache for STS14...
Loading gist from cache for STS14...
Loading llmrails from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.87 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8688748256813532, 'spearman': 0.8433078615568645}, 'manhattan': {'pearson': 0.8584116355106992, 'spearman': 0.8435316794244463}, 'euclidean': {'pearson': 0.8589564542403686, 'spearman': 0.8433078615568645}, 'evaluation_time': 0.87}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading cohere from cache for STS15...
Loading gist from cache for STS15...
Loading llmrails from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.69 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8944098927487286, 'spearman': 0.902822909680659}, 'manhattan': {'pearson': 0.8955080867871639, 'spearman': 0.9026036621774538}, 'euclidean': {'pearson': 0.895724415269392, 'spearman': 0.902822909680659}, 'evaluation_time': 0.69}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading cohere from cache for STS16...
Loading gist from cache for STS16...
Loading llmrails from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.27 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8622216001712073, 'spearman': 0.8741925359713997}, 'manhattan': {'pearson': 0.8667850452227646, 'spearman': 0.8737718675843231}, 'euclidean': {'pearson': 0.8673296863040566, 'spearman': 0.8741925359713997}, 'evaluation_time': 0.27}
INFO:main:Running task: STS17


Loading angle from cache for STS17...
Loading cohere from cache for STS17...
Loading gist from cache for STS17...
Loading llmrails from cache for STS17...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.06 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9075910234116579, 'spearman': 0.9050182426932638}, 'manhattan': {'pearson': 0.9084425447448106, 'spearman': 0.9056621069163125}, 'euclidean': {'pearson': 0.908249257879477, 'spearman': 0.9050182426932638}}, 'evaluation_time':

Loading angle from cache for STS22...
Loading cohere from cache for STS22...
Loading gist from cache for STS22...
Loading llmrails from cache for STS22...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.09 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6879843577800525, 'spearman': 0.6883049075378014}, 'manhattan': {'pearson': 0.7039182754965613, 'spearman': 0.6890129698247387}, 'euclidean': {'pearson': 0.703959377593659, 'spearman': 0.6883049075378014}}, 'evaluation_time': 0.09}

Loading angle from cache for STSBenchmark...
Loading cohere from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.32 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8826731329802971, 'spearman': 0.8945500803317957}, 'manhattan': {'pearson': 0.891802813519075, 'spearman': 0.8941726185283024}, 'euclidean': {'pearson': 0.8919960340159819, 'spearman': 0.8945500803317957}, 'evaluation_time': 0.32}


Converting the results to a CSV file...
Using model name angle$cohere$gist$llmrails
Converting results/angle$cohere$gist$llmrails to results/angle$cohere$gist$llmrails_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClust

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.9025296520424542, 'spearman': 0.8828273447356023}, 'manhattan': {'pearson': 0.8859564672420053, 'spearman': 0.8831406048854956}, 'euclidean': {'pearson': 0.8854061028586233, 'spearman': 0.8828273447356023}, 'evaluation_time': 0.04}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading cohere from cache for SICK-R...
Loading gist from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 3.76 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8761987174142484, 'spearman': 0.8285039221955587}, 'manhattan': {'pearson': 0.850813090616238, 'spearman': 0.8280080317567412}, 'euclidean': {'pearson': 0.8499617695052438, 'spearman': 0.8285039221955587}, 'evaluation_time': 3.76}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading cohere from cache for STS12...
Loading gist from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.79 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8639521391279211, 'spearman': 0.7748556837889371}, 'manhattan': {'pearson': 0.8286024462519068, 'spearman': 0.7740965428594113}, 'euclidean': {'pearson': 0.8321859326399719, 'spearman': 0.7748556837889371}, 'evaluation_time': 0.79}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading cohere from cache for STS13...
Loading gist from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.35 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8829004497629773, 'spearman': 0.8856118282639522}, 'manhattan': {'pearson': 0.878930638057638, 'spearman': 0.8849165969858086}, 'euclidean': {'pearson': 0.8792429049490869, 'spearman': 0.8856118282639522}, 'evaluation_time': 0.35}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading cohere from cache for STS14...
Loading gist from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.92 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8687380000763596, 'spearman': 0.8374765510239803}, 'manhattan': {'pearson': 0.8577398068693263, 'spearman': 0.8357995423473643}, 'euclidean': {'pearson': 0.8593349918752674, 'spearman': 0.8374765510239803}, 'evaluation_time': 0.92}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading cohere from cache for STS15...
Loading gist from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.73 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8954260111823948, 'spearman': 0.9033171345807015}, 'manhattan': {'pearson': 0.8955742612052551, 'spearman': 0.902604522530708}, 'euclidean': {'pearson': 0.8963536388196738, 'spearman': 0.9033171345807015}, 'evaluation_time': 0.73}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading cohere from cache for STS16...
Loading gist from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.29 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8620807274604054, 'spearman': 0.8749747357497841}, 'manhattan': {'pearson': 0.8682099692687597, 'spearman': 0.8747092887511324}, 'euclidean': {'pearson': 0.8682852434684607, 'spearman': 0.8749747357497841}, 'evaluation_time': 0.29}
INFO:main:Running task: STS17


Loading angle from cache for STS17...
Loading cohere from cache for STS17...
Loading gist from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.07 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9083095581385054, 'spearman': 0.9070140295864395}, 'manhattan': {'pearson': 0.9103917819426894, 'spearman': 0.9084632045839883}, 'euclidean': {'pearson': 0.9100833840777028, 'spearman': 0.9070140295864395}}, 'evaluation_time'

Loading angle from cache for STS22...
Loading cohere from cache for STS22...
Loading gist from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.10 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.684714690079876, 'spearman': 0.6875447579102156}, 'manhattan': {'pearson': 0.7008282457957171, 'spearman': 0.6848394716552958}, 'euclidean': {'pearson': 0.7023685276238469, 'spearman': 0.6875447579102156}}, 'evaluation_time': 0.1}


Loading angle from cache for STSBenchmark...
Loading cohere from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.34 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8837666989434638, 'spearman': 0.8941944696631166}, 'manhattan': {'pearson': 0.8932146816277504, 'spearman': 0.8931776966621203}, 'euclidean': {'pearson': 0.893747794459035, 'spearman': 0.8941944696631166}, 'evaluation_time': 0.34}


Converting the results to a CSV file...
Using model name angle$cohere$gist$voyage
Converting results/angle$cohere$gist$voyage to results/angle$cohere$gist$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClusteringP

INFO:main:Running task: BIOSSES
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8913998100264161, 'spearman': 0.8740681090058959}, 'manhattan': {'pearson': 0.8740239755580557, 'spearman': 0.8758874275687373}, 'euclidean': {'pearson': 0.8727199094941536, 'spearman': 0.8740681090058959}, 'evaluation_time': 0.03}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading cohere from cache for SICK-R...
Loading llmrails from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 2.71 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8721599423983546, 'spearman': 0.8279302076147137}, 'manhattan': {'pearson': 0.8453255208436226, 'spearman': 0.8270788740767793}, 'euclidean': {'pearson': 0.8445956529440799, 'spearman': 0.8279302076147137}, 'evaluation_time': 2.71}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading cohere from cache for STS12...
Loading llmrails from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.79 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8664084566840555, 'spearman': 0.7807568764644216}, 'manhattan': {'pearson': 0.8257336740809008, 'spearman': 0.7799082289374633}, 'euclidean': {'pearson': 0.8293437453921647, 'spearman': 0.7807568764644216}, 'evaluation_time': 0.79}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading cohere from cache for STS13...
Loading llmrails from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.36 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.878676405464993, 'spearman': 0.8818798501237417}, 'manhattan': {'pearson': 0.8747151483700053, 'spearman': 0.8809095183329048}, 'euclidean': {'pearson': 0.8750582708629598, 'spearman': 0.8818798501237417}, 'evaluation_time': 0.36}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading cohere from cache for STS14...
Loading llmrails from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.90 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.865144710591132, 'spearman': 0.8382607246025318}, 'manhattan': {'pearson': 0.8526658618410576, 'spearman': 0.8362967959028212}, 'euclidean': {'pearson': 0.8543943916864605, 'spearman': 0.8382607246025318}, 'evaluation_time': 0.9}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading cohere from cache for STS15...
Loading llmrails from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.72 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8963678324817785, 'spearman': 0.903144859962953}, 'manhattan': {'pearson': 0.8951375114277484, 'spearman': 0.9025414565018556}, 'euclidean': {'pearson': 0.8958202269985571, 'spearman': 0.903144859962953}, 'evaluation_time': 0.72}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading cohere from cache for STS16...
Loading llmrails from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.29 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.864095708447068, 'spearman': 0.8766725866626655}, 'manhattan': {'pearson': 0.868940182435777, 'spearman': 0.8763770914008789}, 'euclidean': {'pearson': 0.8690423080164662, 'spearman': 0.8766725866626655}, 'evaluation_time': 0.29}
INFO:main:Running task: STS17
INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


Loading angle from cache for STS17...
Loading cohere from cache for STS17...
Loading llmrails from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.07 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9100624408372278, 'spearman': 0.9080822676018739}, 'manhattan': {'pearson': 0.9107498053751549, 'spearman': 0.9096836636336478}, 'euclidean': {'pearson': 0.910646269323119, 'spearman': 0.9080822676018739}}, 'evaluation_time':

Loading angle from cache for STS22...
Loading cohere from cache for STS22...
Loading llmrails from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.09 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6803724000031001, 'spearman': 0.680086501821141}, 'manhattan': {'pearson': 0.6954575061563223, 'spearman': 0.6770548558224722}, 'euclidean': {'pearson': 0.6972253755776037, 'spearman': 0.680086501821141}}, 'evaluation_time': 0.09}


Loading angle from cache for STSBenchmark...
Loading cohere from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.34 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.885503574257826, 'spearman': 0.8955603820616054}, 'manhattan': {'pearson': 0.8924412291469711, 'spearman': 0.8946213124198739}, 'euclidean': {'pearson': 0.892947778790786, 'spearman': 0.8955603820616054}, 'evaluation_time': 0.34}


Converting the results to a CSV file...
Using model name angle$cohere$llmrails$voyage
Converting results/angle$cohere$llmrails$voyage to results/angle$cohere$llmrails$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
Medrxi

INFO:main:Running task: BIOSSES


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.9024910184853756, 'spearman': 0.8868876782169106}, 'manhattan': {'pearson': 0.8840434918440997, 'spearman': 0.8885383182375018}, 'euclidean': {'pearson': 0.8841579314447515, 'spearman': 0.8868876782169106}, 'evaluation_time': 0.03}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading gist from cache for SICK-R...
Loading llmrails from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 2.72 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8754805555679854, 'spearman': 0.8295237641959206}, 'manhattan': {'pearson': 0.8498011613180825, 'spearman': 0.8286796396514127}, 'euclidean': {'pearson': 0.8488480642526135, 'spearman': 0.8295237641959206}, 'evaluation_time': 2.72}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading gist from cache for STS12...
Loading llmrails from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.77 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8720848048996026, 'spearman': 0.785150208231002}, 'manhattan': {'pearson': 0.8344813195146163, 'spearman': 0.7836117887496128}, 'euclidean': {'pearson': 0.8382988305253825, 'spearman': 0.785150208231002}, 'evaluation_time': 0.77}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading gist from cache for STS13...
Loading llmrails from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.35 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8850344237676135, 'spearman': 0.8905590090945071}, 'manhattan': {'pearson': 0.8824592468465416, 'spearman': 0.8891684451085078}, 'euclidean': {'pearson': 0.8835294162692858, 'spearman': 0.8905590090945071}, 'evaluation_time': 0.35}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading gist from cache for STS14...
Loading llmrails from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.86 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8664429127898862, 'spearman': 0.8428622154687644}, 'manhattan': {'pearson': 0.8564054579342617, 'spearman': 0.8405450541374963}, 'euclidean': {'pearson': 0.8576008784503846, 'spearman': 0.8428622154687644}, 'evaluation_time': 0.86}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading gist from cache for STS15...
Loading llmrails from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.67 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8912842989209255, 'spearman': 0.9001219445221906}, 'manhattan': {'pearson': 0.8933528365232414, 'spearman': 0.9004026369956867}, 'euclidean': {'pearson': 0.8931696493374555, 'spearman': 0.9001219445221906}, 'evaluation_time': 0.67}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading gist from cache for STS16...
Loading llmrails from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.30 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.859545381725349, 'spearman': 0.8732509921070015}, 'manhattan': {'pearson': 0.8664565324751847, 'spearman': 0.8735770099245656}, 'euclidean': {'pearson': 0.8660546876913764, 'spearman': 0.8732509921070015}, 'evaluation_time': 0.3}
INFO:main:Running task: STS17


Loading angle from cache for STS17...
Loading gist from cache for STS17...
Loading llmrails from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.07 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9040280040063552, 'spearman': 0.9014383576131126}, 'manhattan': {'pearson': 0.9070576345477948, 'spearman': 0.9044097429529138}, 'euclidean': {'pearson': 0.9059374700701144, 'spearman': 0.9014383576131126}}, 'evaluation_time'

Loading angle from cache for STS22...
Loading gist from cache for STS22...
Loading llmrails from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.09 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6867493719183996, 'spearman': 0.6857428614696643}, 'manhattan': {'pearson': 0.7030227606766792, 'spearman': 0.685214663280903}, 'euclidean': {'pearson': 0.7039465021418093, 'spearman': 0.6857428614696643}}, 'evaluation_time': 0.09}

Loading angle from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.32 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8815893961679444, 'spearman': 0.8911670601054863}, 'manhattan': {'pearson': 0.8888969308515111, 'spearman': 0.890632867549925}, 'euclidean': {'pearson': 0.8889027670210661, 'spearman': 0.8911670601054863}, 'evaluation_time': 0.32}


Converting the results to a CSV file...
Using model name angle$gist$llmrails$voyage
Converting results/angle$gist$llmrails$voyage to results/angle$gist$llmrails$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivClust

INFO:main:Running task: BIOSSES


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.03 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.902068906256329, 'spearman': 0.8833153076613977}, 'manhattan': {'pearson': 0.8842317502362458, 'spearman': 0.886556345366062}, 'euclidean': {'pearson': 0.8832224599544868, 'spearman': 0.8833153076613977}, 'evaluation_time': 0.03}
INFO:main:Running task: SICK-R


Loading cohere from cache for SICK-R...
Loading gist from cache for SICK-R...
Loading llmrails from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 2.97 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8728324657920584, 'spearman': 0.8277428605763455}, 'manhattan': {'pearson': 0.8451597756511766, 'spearman': 0.8266319531508496}, 'euclidean': {'pearson': 0.8446759466550162, 'spearman': 0.8277428605763455}, 'evaluation_time': 2.97}
INFO:main:Running task: STS12


Loading cohere from cache for STS12...
Loading gist from cache for STS12...
Loading llmrails from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 0.81 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8608834330975482, 'spearman': 0.7745422784955762}, 'manhattan': {'pearson': 0.8204064464610623, 'spearman': 0.7734093296213127}, 'euclidean': {'pearson': 0.8244275965193357, 'spearman': 0.7745422784955762}, 'evaluation_time': 0.81}
INFO:main:Running task: STS13


Loading cohere from cache for STS13...
Loading gist from cache for STS13...
Loading llmrails from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.38 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8780548412558931, 'spearman': 0.8804844637777391}, 'manhattan': {'pearson': 0.8732044415546495, 'spearman': 0.879390329015502}, 'euclidean': {'pearson': 0.8740146447678392, 'spearman': 0.8804844637777391}, 'evaluation_time': 0.38}
INFO:main:Running task: STS14


Loading cohere from cache for STS14...
Loading gist from cache for STS14...
Loading llmrails from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 0.93 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8646329819404941, 'spearman': 0.8347126908047604}, 'manhattan': {'pearson': 0.8520469747515732, 'spearman': 0.8327629999046228}, 'euclidean': {'pearson': 0.8539036247802785, 'spearman': 0.8347126908047604}, 'evaluation_time': 0.93}
INFO:main:Running task: STS15


Loading cohere from cache for STS15...
Loading gist from cache for STS15...
Loading llmrails from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 0.75 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8958951090382712, 'spearman': 0.9028650013737256}, 'manhattan': {'pearson': 0.8950841347871976, 'spearman': 0.9023499932039379}, 'euclidean': {'pearson': 0.8956839442361757, 'spearman': 0.9028650013737256}, 'evaluation_time': 0.75}
INFO:main:Running task: STS16


Loading cohere from cache for STS16...
Loading gist from cache for STS16...
Loading llmrails from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.31 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8629960470219902, 'spearman': 0.8759885399225414}, 'manhattan': {'pearson': 0.8689770508398238, 'spearman': 0.875587842904139}, 'euclidean': {'pearson': 0.8691043509097831, 'spearman': 0.8759885399225414}, 'evaluation_time': 0.31}
INFO:main:Running task: STS17


Loading cohere from cache for STS17...
Loading gist from cache for STS17...
Loading llmrails from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.07 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9120602095312327, 'spearman': 0.9101853011327993}, 'manhattan': {'pearson': 0.9116549849338026, 'spearman': 0.9115948832855874}, 'euclidean': {'pearson': 0.9114093902101122, 'spearman': 0.9101853011327993}}, 'evaluation_time'

Loading cohere from cache for STS22...
Loading gist from cache for STS22...
Loading llmrails from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.10 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6834983923027987, 'spearman': 0.683124658737391}, 'manhattan': {'pearson': 0.6981798409030401, 'spearman': 0.6816328447466368}, 'euclidean': {'pearson': 0.7001547136579196, 'spearman': 0.683124658737391}}, 'evaluation_time': 0.1}
I

Loading cohere from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.36 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8845281747756357, 'spearman': 0.8932145634368028}, 'manhattan': {'pearson': 0.8909011449632938, 'spearman': 0.8919789308989154}, 'euclidean': {'pearson': 0.8916331630303775, 'spearman': 0.8932145634368028}, 'evaluation_time': 0.36}


Converting the results to a CSV file...
Using model name cohere$gist$llmrails$voyage
Converting results/cohere$gist$llmrails$voyage to results/cohere$gist$llmrails$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set not found
MedrxivCl

INFO:main:Running task: BIOSSES


Not found: 'BUCC','Tatoeba','AmazonCounterfactualClassification','AmazonPolarityClassification','AmazonReviewsClassification','Banking77Classification','EmotionClassification','ImdbClassification','MassiveIntentClassification','MassiveScenarioClassification','MTOPDomainClassification','MTOPIntentClassification','ToxicConversationsClassification','TweetSentimentExtractionClassification','ArxivClusteringP2P','ArxivClusteringS2S','BiorxivClusteringP2P','BiorxivClusteringS2S','MedrxivClusteringP2P','MedrxivClusteringS2S','RedditClustering','RedditClusteringP2P','StackExchangeClustering','StackExchangeClusteringP2P','TwentyNewsgroupsClustering','SprintDuplicateQuestions','TwitterSemEval2015','TwitterURLCorpus','AskUbuntuDupQuestions','MindSmallReranking','SciDocsRR','StackOverflowDupQuestions','ArguAna','ClimateFEVER','CQADupstackRetrieval','DBPedia','FEVER','FiQA2018','HotpotQA','MSMARCO','NFCorpus','NQ','QuoraRetrieval','SCIDOCS','SciFact','Touche2020','TRECCOVID','SummEval' 48
--DONE--
E

INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating BIOSSES **********************
INFO:mteb.evaluation.MTEB:Loading dataset for BIOSSES
INFO:mteb.abstasks.AbsTaskSTS:
Task: BIOSSES, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 100 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for BIOSSES on test took 0.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8983269731774247, 'spearman': 0.8783151937304096}, 'manhattan': {'pearson': 0.8798572896921963, 'spearman': 0.8807128387238231}, 'euclidean': {'pearson': 0.8792331752501344, 'spearman': 0.8783151937304096}, 'evaluation_time': 0.04}
INFO:main:Running task: SICK-R


Loading angle from cache for SICK-R...
Loading cohere from cache for SICK-R...
Loading gist from cache for SICK-R...
Loading llmrails from cache for SICK-R...
Loading voyage from cache for SICK-R...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating SICK-R **********************
INFO:mteb.evaluation.MTEB:Loading dataset for SICK-R
INFO:mteb.abstasks.AbsTaskSTS:
Task: SICK-R, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 9927 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for SICK-R on test took 4.18 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8749605811868715, 'spearman': 0.8292228986109901}, 'manhattan': {'pearson': 0.8479708469867344, 'spearman': 0.8290555401946768}, 'euclidean': {'pearson': 0.8475821000801335, 'spearman': 0.8292228986109901}, 'evaluation_time': 4.18}
INFO:main:Running task: STS12


Loading angle from cache for STS12...
Loading cohere from cache for STS12...
Loading gist from cache for STS12...
Loading llmrails from cache for STS12...
Loading voyage from cache for STS12...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS12 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS12
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS12, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3108 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS12 on test took 1.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8682058743387977, 'spearman': 0.7797342352253419}, 'manhattan': {'pearson': 0.8331264778878771, 'spearman': 0.7801270670877979}, 'euclidean': {'pearson': 0.8361776715237077, 'spearman': 0.7797342352253419}, 'evaluation_time': 1.04}
INFO:main:Running task: STS13


Loading angle from cache for STS13...
Loading cohere from cache for STS13...
Loading gist from cache for STS13...
Loading llmrails from cache for STS13...
Loading voyage from cache for STS13...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS13 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS13
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS13, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1500 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS13 on test took 0.53 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8846494460683345, 'spearman': 0.8879172935927823}, 'manhattan': {'pearson': 0.8803892138775383, 'spearman': 0.8871936157290263}, 'euclidean': {'pearson': 0.8808803201377117, 'spearman': 0.8879172935927823}, 'evaluation_time': 0.53}
INFO:main:Running task: STS14


Loading angle from cache for STS14...
Loading cohere from cache for STS14...
Loading gist from cache for STS14...
Loading llmrails from cache for STS14...
Loading voyage from cache for STS14...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS14 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS14
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS14, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3750 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS14 on test took 1.27 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8694540477544168, 'spearman': 0.8412732889961586}, 'manhattan': {'pearson': 0.8580080488402744, 'spearman': 0.8400150437335128}, 'euclidean': {'pearson': 0.859758313230245, 'spearman': 0.8412732889961586}, 'evaluation_time': 1.27}
INFO:main:Running task: STS15


Loading angle from cache for STS15...
Loading cohere from cache for STS15...
Loading gist from cache for STS15...
Loading llmrails from cache for STS15...
Loading voyage from cache for STS15...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS15 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS15
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS15, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 3000 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS15 on test took 1.04 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8955773225517052, 'spearman': 0.9037034394198733}, 'manhattan': {'pearson': 0.8962652903465607, 'spearman': 0.9034388162899286}, 'euclidean': {'pearson': 0.8966186332067334, 'spearman': 0.9037034394198733}, 'evaluation_time': 1.04}
INFO:main:Running task: STS16


Loading angle from cache for STS16...
Loading cohere from cache for STS16...
Loading gist from cache for STS16...
Loading llmrails from cache for STS16...
Loading voyage from cache for STS16...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS16 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS16
INFO:mteb.abstasks.AbsTaskSTS:
Task: STS16, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1186 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS16 on test took 0.38 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.8625455936197435, 'spearman': 0.874960025381042}, 'manhattan': {'pearson': 0.8683652943065561, 'spearman': 0.8755438650309207}, 'euclidean': {'pearson': 0.8681176390797851, 'spearman': 0.874960025381042}, 'evaluation_time': 0.38}
INFO:main:Running task: STS17


Loading angle from cache for STS17...
Loading cohere from cache for STS17...
Loading gist from cache for STS17...
Loading llmrails from cache for STS17...
Loading voyage from cache for STS17...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS17 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS17
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS17, split: test, language: en-en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 250 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS17 on test took 0.09 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en-en': {'cos_sim': {'pearson': 0.9085451183073564, 'spearman': 0.9063128902772808}, 'manhattan': {'pearson': 0.9103538829503438, 'spearman': 0.9093526982031732}, 'euclidean': {'pearson': 0.9096969645623989, 'spearman': 0.9063128902772808}}, 'evaluation_time'

Loading angle from cache for STS22...
Loading cohere from cache for STS22...
Loading gist from cache for STS22...
Loading llmrails from cache for STS22...
Loading voyage from cache for STS22...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STS22 **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STS22
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
INFO:mteb.abstasks.AbsTaskSTS:Task: STS22, split: test, language: en. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 197 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STS22 on test took 0.12 seconds
INFO:mteb.evaluation.MTEB:Scores: {'en': {'cos_sim': {'pearson': 0.6866864266199815, 'spearman': 0.6878963474595914}, 'manhattan': {'pearson': 0.7019771748178814, 'spearman': 0.6857306534991999}, 'euclidean': {'pearson': 0.7034921706547612, 'spearman': 0.6878963474595914}}, 'evaluation_time': 0.12

Loading angle from cache for STSBenchmark...
Loading cohere from cache for STSBenchmark...
Loading gist from cache for STSBenchmark...
Loading llmrails from cache for STSBenchmark...
Loading voyage from cache for STSBenchmark...


INFO:mteb.evaluation.MTEB:

## Evaluating 1 tasks:


INFO:mteb.evaluation.MTEB:

********************** Evaluating STSBenchmark **********************
INFO:mteb.evaluation.MTEB:Loading dataset for STSBenchmark
INFO:mteb.abstasks.AbsTaskSTS:
Task: STSBenchmark, split: test. Running...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences1...
INFO:mteb.evaluation.evaluators.STSEvaluator:Encoding 1379 sentences2...
INFO:mteb.evaluation.evaluators.STSEvaluator:Evaluating...
INFO:mteb.evaluation.MTEB:Evaluation for STSBenchmark on test took 0.46 seconds
INFO:mteb.evaluation.MTEB:Scores: {'cos_sim': {'pearson': 0.883634247259111, 'spearman': 0.8948418304347612}, 'manhattan': {'pearson': 0.8928743301916746, 'spearman': 0.89416202396928}, 'euclidean': {'pearson': 0.8933075996272707, 'spearman': 0.8948418304347612}, 'evaluation_time': 0.46}


Converting the results to a CSV file...
Using model name angle$cohere$gist$llmrails$voyage
Converting results/angle$cohere$gist$llmrails$voyage to results/angle$cohere$gist$llmrails$voyage_results.csv
BUCC - test set not found
Tatoeba - test set not found
AmazonCounterfactualClassification - test set not found
AmazonPolarityClassification - test set not found
AmazonReviewsClassification - test set not found
Banking77Classification - test set not found
EmotionClassification - test set not found
ImdbClassification - test set not found
MassiveIntentClassification - test set not found
MassiveScenarioClassification - test set not found
MTOPDomainClassification - test set not found
MTOPIntentClassification - test set not found
ToxicConversationsClassification - test set not found
TweetSentimentExtractionClassification - test set not found
ArxivClusteringP2P - test set not found
ArxivClusteringS2S - test set not found
BiorxivClusteringP2P - test set not found
BiorxivClusteringS2S - test set n