In [32]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from sklearn.linear_model import LogisticRegression

import pandas as pd

In [33]:
statistics_df = pd.DataFrame(columns=['dataset', 'transformer', 'accuracy', 'f1', 'precision', 'recall'])

In [34]:
datasets = {
    'SST-2': load_dataset("nyu-mll/glue", "sst2"),
    'CoLA': load_dataset("nyu-mll/glue", "cola")
}

In [35]:
#split the training set into a training and validation set 95% training, 5% validation
def get_train_val_split(dataset):
    split = dataset["train"].train_test_split(test_size=0.05)
    
    train_dataset = split["train"]
    val_dataset = split["test"]
    return train_dataset, val_dataset

In [36]:
sentence_transformers = {
    "paraphrase-MiniLM-L6-v2": SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2'),
    "all-mpnet-base-v2": SentenceTransformer("all-mpnet-base-v2")
}




In [37]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

#Train a logistic regression model on the training sets for each sentence transformer
for dataset_name, dataset in datasets.items():
    train_dataset, val_dataset = get_train_val_split(dataset)
    
    for transformer_name, transformer in sentence_transformers.items():
        print(f"Training model for {transformer_name} on {dataset_name}")
        train_embeddings = transformer.encode(train_dataset["sentence"])
        val_embeddings = transformer.encode(val_dataset["sentence"])

        model = LogisticRegression(max_iter=1000)
        model.fit(train_embeddings, train_dataset["label"])
        
        val_predictions = model.predict(val_embeddings)
        
        #compute the accuracy, f1, precision, and recall
        accuracy = accuracy_score(val_dataset["label"], val_predictions)
        f1 = f1_score(val_dataset["label"], val_predictions)
        precision = precision_score(val_dataset["label"], val_predictions)
        recall = recall_score(val_dataset["label"], val_predictions)
        
        #store the results in the statistics dataframe
        statistics_df = statistics_df._append(
            {
                'dataset': dataset_name, 
                'transformer': transformer_name,
                'accuracy': accuracy,
                'f1': f1,
                'precision': precision,
                'recall': recall
            }, 
            ignore_index=True
        )
        
        

Training model for paraphrase-MiniLM-L6-v2 on SST-2


  statistics_df = statistics_df._append(


Training model for all-mpnet-base-v2 on SST-2
Training model for paraphrase-MiniLM-L6-v2 on CoLA
Training model for all-mpnet-base-v2 on CoLA


In [38]:
statistics_df

Unnamed: 0,dataset,transformer,accuracy,f1,precision,recall
0,SST-2,paraphrase-MiniLM-L6-v2,0.851841,0.8631,0.869541,0.856754
1,SST-2,all-mpnet-base-v2,0.897862,0.906318,0.906318,0.906318
2,CoLA,paraphrase-MiniLM-L6-v2,0.714953,0.819527,0.730871,0.93266
3,CoLA,all-mpnet-base-v2,0.696262,0.816901,0.702179,0.976431
