In [34]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [35]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nanoGPT.chat import init_model as init_nanoGPT
from  nanoGPT.chat import respond as get_respond_nanoGPT
import torch
from bert_score import score
import tiktoken

### Small Validation dataset  - With emotion tag

In [81]:
data_path = '../../data/emotion/validation/100_validation.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,Situation,grouped_emotion,empathetic_dialogues,labels
0,Last night I heard strange noises coming from ...,afraid,In the middle of the night I heard some weird ...,Should have grabbed the gun.
1,My mom and sister threw me a baby shower when ...,excited,that was very nice of them congratulations,"Thank you! It was so nice, I had no idea it w..."
2,I just applied for a new job. After the inter...,grateful,"Oh really, do you feel like you did a great job?",I do! I'm feeling very optimistic about it
3,I loaned some money to my friend at work. Turn...,annoyed,Wow! What a jerk for him to up and leave with ...,"It was a medium amount of money but still, he ..."
4,I was out walking late last night and seen som...,,Oh my god. What happened?,"Well, I started walking much faster. It looked..."


### Big Validation dataset - Withouth emotion tag
(Facebook data + Chat GPT generated data)

-> already saved with model's outputs in data/emotion/validation/final.csv because takes a lot of time to run

In [65]:
data_path = '../../data/emotion/validation/final.csv'
test_df = pd.read_csv(data_path)

In [68]:
selected_columns = ['empathetic_dialogues', 'labels']
test_df = test_df[selected_columns]
test_df.head()

Unnamed: 0,empathetic_dialogues,labels
0,it felt good to get approved for a vacation,That's great! Where ya headed?
1,to disneyworld!,That sounds like a lot of fun! Who ya going with?
2,That's great! Where ya headed?,to disneyworld!
3,I applied for an internship with a law office,Very nice! I bet your more than excited about it!
4,I am actually terrified. They probably laughed...,Oh no! Dont be anxious or worried! I'm sure yo...


## Get model response from small dataset

In [82]:
model_list = {
    'default': 'block_size=64/withoutemotion/singleConversation',
    'rope':'Rope/withoutemotion/singleConversation',
    'relative:': 'Relative/withoutemotion/singleConversation'
}

for model_type, model_path in model_list.items():
    model_list[model_type] = init_nanoGPT(model_path)

Loading model from: ../trained-saved/block_size=64/withoutemotion/singleConversation/ckpt.pt
number of parameters: 3.42M
Loading model from: ../trained-saved/Rope/withoutemotion/singleConversation/ckpt.pt


  checkpoint = torch.load(ckpt_path, map_location=device)


number of parameters: 3.42M
Loading model from: ../trained-saved/Relative/withoutemotion/singleConversation/ckpt.pt
number of parameters: 3.42M


In [83]:
def get_response_from_nanoGPT(row,model):
    situation = row['Situation']
    emotion = row['grouped_emotion']
    human = row['empathetic_dialogues']
    start = '<bot> ' + human + '<human>'
    response, new_emotion, new_context = get_respond_nanoGPT(start, 1, model=model, enable_print=False)
    return response #, new_emotion, new_context

In [85]:
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    df[label] = df.apply(lambda row: get_response_from_nanoGPT(row, model), axis=1)

# df.to_csv('./evaluation_result.csv', index=False)
df.head()

Unnamed: 0,Situation,grouped_emotion,empathetic_dialogues,labels,new_label_default,new_label_rope,new_label_relative:
0,Last night I heard strange noises coming from ...,afraid,In the middle of the night I heard some weird ...,Should have grabbed the gun.,I was scared of the meal.\n,\n,\n sure gun gunences itepal me toeive no one! ...
1,My mom and sister threw me a baby shower when ...,excited,that was very nice of them congratulations,"Thank you! It was so nice, I had no idea it w...",I am so happy that I can't wait for the switch,I am! I hope you get a good look in it.,much of fast you fault theyw thenk
2,I just applied for a new job. After the inter...,grateful,"Oh really, do you feel like you did a great job?",I do! I'm feeling very optimistic about it,"I am, it was a good day. I think it will be a...",Well I am just really nervous about it. I'm p...,anyway teachers youay chemistry path ofately ...
3,I loaned some money to my friend at work. Turn...,annoyed,Wow! What a jerk for him to up and leave with ...,"It was a medium amount of money but still, he ...",It was a lot of money. It was nice to find yo...,"It was, I was very angry about it.",it more times for McDonald of pets and day fo...
4,I was out walking late last night and seen som...,,Oh my god. What happened?,"Well, I started walking much faster. It looked...",It was a huge shock to do is always such a te...,I went to the police station. I was so scared!,so knife than thing of shoes you door sure th...


## Get model response from big dataset

In [69]:
def get_response_from_nanoGPT_big(row,model):
    # situation = row['Situation']
    # emotion = row['grouped_emotion']
    human = row['empathetic_dialogues']
    start = '<bot> ' + human + '<human>'
    response, new_emotion, new_context = get_respond_nanoGPT(start, 1, model=model, enable_print=False)
    return response #, new_emotion, new_context

In [71]:
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    test_df[label] = test_df.apply(lambda row: get_response_from_nanoGPT_big(row, model), axis=1)

# df.to_csv('./evaluation_result.csv', index=False)
test_df.head()

Unnamed: 0,empathetic_dialogues,labels,new_label_default,new_label_rope,new_label_relative:
0,it felt good to get approved for a vacation,That's great! Where ya headed?,That's great! Did you go to school?,I know I'm sure you will be fine!,much at. of no you now I\nle I\nle together y...
1,to disneyworld!,That sounds like a lot of fun! Who ya going with?,I am glad to hear that.,wow that's awesome!,- yard we house than that you night ofos you n...
2,That's great! Where ya headed?,to disneyworld!,"I haven't, I'm in a few days!",I am going to Las Vegas.,no.! that have list celebrate day meast day f...
3,I applied for an internship with a law office,Very nice! I bet your more than excited about it!,That sounds great! How did you do?,I am sure you will do good.,! Ik position. of dollars that know! for them!...
4,I am actually terrified. They probably laughed...,Oh no! Dont be anxious or worried! I'm sure yo...,I'm sure you will do it!,I am sorry to hear that.,. I I night..... day it because I buyi stillik...


In [72]:
test_df.to_csv('./pe_evaluation_result_big.csv', index=False)

## BLEU

In [86]:
def get_bleu(data, compared_column):
    bleu_scores = []
    smoothing_function = SmoothingFunction().method1  # To avoid 0 scores due to short sentences
    for _, row in data.iterrows():
        for ref, output in zip(row['labels'], row[compared_column]):
            # Tokenize each sentence (split by words)
            reference_tokens = [ref.split()]  # BLEU expects a list of lists for references
            output_tokens = output.split()
            
            # Calculate BLEU score
            bleu = sentence_bleu(reference_tokens, output_tokens, smoothing_function=smoothing_function)
            bleu_scores.append(bleu)
    return bleu_scores

In [87]:
def bleu_dataset(data):
    bleu_scores = {}
    bleu_scores_average = {}
    for model_type, model in model_list.items():
        label = 'new_label_' + model_type
        # print(label)
        bleu_scores[model_type] = get_bleu(data,label)
        bleu_scores_average[model_type] = sum(bleu_scores[model_type]) / len(bleu_scores[model_type])
    
    return bleu_scores_average

In [88]:
small_bleu_score_avg = bleu_dataset(df)
small_bleu_score_avg

{'default': 0.006376490258100479,
 'rope': 0.006449718067498683,
 'relative:': 0.006187011514139771}

In [89]:
# Find the best-performing model
best_model = max(small_bleu_score_avg, key=small_bleu_score_avg.get)
best_score = small_bleu_score_avg[best_model]

# Print the results
print("====================================================")
print("BEST MODEL: ")
print(f"{best_model}: with a BLEU score of {best_score:.6f}.")
print("====================================================")

BEST MODEL: 
rope: with a BLEU score of 0.006450.


**BLEU with big test dataset**

In [90]:
big_bleu_score_avg = bleu_dataset(test_df)
big_bleu_score_avg

{'default': 0.006052025366630673,
 'rope': 0.005886990900708465,
 'relative:': 0.005992347214885741}

In [91]:
# Find the best-performing model
best_model = max(big_bleu_score_avg, key=big_bleu_score_avg.get)
best_score = big_bleu_score_avg[best_model]

# Print the results
print("====================================================")
print("BEST MODEL: ")
print(f"{best_model}: with a BLEU score of {best_score:.6f}.")
print("====================================================")

BEST MODEL: 
default: with a BLEU score of 0.006052.


## BertScore

In [92]:
def calculate_bert_score(data,compared_column):
    # Check for MPS device
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")

    model_outputs = data['labels']
    reference_sentences = data[compared_column]

    if len(model_outputs) != len(reference_sentences):
        raise ValueError("Mismatch in lengths: model_outputs and reference_sentences must be of the same length.")
    # Convert model outputs and reference sentences to strings
    model_outputs = [str(output) for output in model_outputs]
    reference_sentences = [str(ref) for ref in reference_sentences]
    # Calculate precision, recall, and F1 for each pair of reference and output
    P, R, F1 = score(model_outputs, reference_sentences, lang='en', verbose=True, device = device)
    return P, R, F1

In [93]:
def bert_score_all_models(data):
    bert_scores = {}

    for model_type, model in model_list.items():
        label = 'new_label_' + model_type
        bert_scores[model_type] = {}
        
        # Calculate BERT score and assign it to the dictionary
        bert_scores[model_type]['P'], bert_scores[model_type]['R'], bert_scores[model_type]['F1'] = calculate_bert_score(data, label)

    return bert_scores

In [94]:
def print_bert_scores(bert_scores):

    avg_f1_scores ={}

    for model_type, model in model_list.items():
        label = 'new_label_' + model_type
        # P avarage
        P_average = sum(bert_scores[model_type]['P']) / len(bert_scores[model_type]['P'])
        # R avarage
        R_average = sum(bert_scores[model_type]['R']) / len(bert_scores[model_type]['R'])
        # F1 avarage
        F1_average = sum(bert_scores[model_type]['F1']) / len(bert_scores[model_type]['F1'])
        print("--------------------------------------------------")
        print(f"Model: {model_type}")
        print(f"Average Precision: {P_average}")
        print(f"Average Recall: {R_average}")
        print(f"Average F1: {F1_average}")

        avg_f1_scores[model_type] = F1_average

    return avg_f1_scores

In [95]:
small_bert_scores = bert_score_all_models(data=df)
small_bert_f1 = print_bert_scores(small_bert_scores)

Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]



done in 6.12 seconds, 16.34 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 1.72 seconds, 58.12 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 2.15 seconds, 46.49 sentences/sec
--------------------------------------------------
Model: default
Average Precision: 0.8497958183288574
Average Recall: 0.8566957712173462
Average F1: 0.8530194163322449
--------------------------------------------------
Model: rope
Average Precision: 0.5225028991699219
Average Recall: 0.5253137350082397
Average F1: 0.5237810611724854
--------------------------------------------------
Model: relative:
Average Precision: 0.8232231736183167
Average Recall: 0.7937153577804565
Average F1: 0.8079032897949219




In [96]:
# Identify the best model by highest F1 average
best_model = max(small_bert_f1, key=small_bert_f1.get)
best_f1_score = small_bert_f1[best_model]

# Print the best model
print("--------------------------------------------------")
print(f"Best Model: {best_model}")
print(f"Highest Average F1 Score: {best_f1_score:.4f}")
print("--------------------------------------------------")

--------------------------------------------------
Best Model: default
Highest Average F1 Score: 0.8530
--------------------------------------------------


**BERT Score for Big Dataset**

In [97]:
big_bert_scores = bert_score_all_models(data=test_df)
big_bert_f1 = print_bert_scores(big_bert_scores)

Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/352 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/187 [00:00<?, ?it/s]



done in 162.03 seconds, 73.85 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/286 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/187 [00:00<?, ?it/s]



done in 164.19 seconds, 72.88 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/360 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/187 [00:00<?, ?it/s]



done in 240.56 seconds, 49.75 sentences/sec
--------------------------------------------------
Model: default
Average Precision: 0.8403091430664062
Average Recall: 0.8542397618293762
Average F1: 0.847074568271637
--------------------------------------------------
Model: rope
Average Precision: 0.47583577036857605
Average Recall: 0.48094046115875244
Average F1: 0.4782959818840027
--------------------------------------------------
Model: relative:
Average Precision: 0.7887973785400391
Average Recall: 0.7663851380348206
Average F1: 0.7772354483604431


In [98]:
# Identify the best model by highest F1 average
best_model = max(big_bert_f1, key=big_bert_f1.get)
best_f1_score = big_bert_f1[best_model]

# Print the best model
print("--------------------------------------------------")
print(f"Best Model: {best_model}")
print(f"Highest Average F1 Score: {best_f1_score:.4f}")

--------------------------------------------------
Best Model: default
Highest Average F1 Score: 0.8471


## # GLUE - Sentiment Analysis Evaluation (SST-2)

In [99]:
from transformers import pipeline
def evaluate_sentiment(data, compared_column):
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # Load multi-class sentiment or emotion pipeline
    sentiment_pipeline = pipeline(
        "text-classification", 
        model="bhadresh-savani/distilbert-base-uncased-emotion", 
        device=0 if device == "mps" else -1
    )
    
    scores = []
    model_outputs = data['labels']
    reference_sentences = data[compared_column]

    if len(model_outputs) != len(reference_sentences):
        raise ValueError("Mismatch in lengths: model_outputs and reference_sentences must be of the same length.")
    
    # Convert model outputs and reference sentences to strings
    model_outputs = [str(output) for output in model_outputs]
    reference_sentences = [str(ref) for ref in reference_sentences]

    for i, (output, reference) in enumerate(zip(model_outputs, reference_sentences), start=1):
        output_sentiment = sentiment_pipeline(output)[0]['label']
        reference_sentiment = sentiment_pipeline(reference)[0]['label']
        
        score = 1 if output_sentiment == reference_sentiment else 0
        scores.append(score)

    return scores

In [100]:
def glue_all_models(data):
    sentiment_scores = {}

    for model_type, _ in model_list.items():
        label = 'new_label_' + model_type
        sentiment_scores[model_type] = {}
        
        sentiment_scores[model_type] = evaluate_sentiment(data, label)
    return sentiment_scores

In [101]:
def print_glue_scores(data):
    
    sentiment_scores = glue_all_models(data)

    avg_glue_scores = {}
    
    for model_type, _ in model_list.items():
        GLUE_average = sum(sentiment_scores[model_type]) / len(sentiment_scores[model_type])
        avg_glue_scores[model_type] = GLUE_average

        print("--------------------------------------------------")
        print(f"Model: {model_type}")
        print(f"Average Sentiment Score: {GLUE_average}")

    return avg_glue_scores        

In [102]:
small_glue_scores = print_glue_scores(df)

# Identify the best model based on the highest GLUE score
best_model = max(small_glue_scores, key=small_glue_scores.get)
best_glue_score = small_glue_scores[best_model]

# Print the best model
print("--------------------------------------------------")
print(f"Best Model: {best_model}")
print(f"Highest Average GLUE Score: {best_glue_score:.4f}")
print("--------------------------------------------------")

Using device: mps
Using device: mps
Using device: mps
--------------------------------------------------
Model: default
Average Sentiment Score: 0.4
--------------------------------------------------
Model: rope
Average Sentiment Score: 0.36
--------------------------------------------------
Model: relative:
Average Sentiment Score: 0.42
--------------------------------------------------
Best Model: relative:
Highest Average GLUE Score: 0.4200
--------------------------------------------------


**GLUE Score for big dataset**

In [103]:
big_glue_scores = print_glue_scores(test_df)

# Identify the best model based on the highest GLUE score
best_model = max(big_glue_scores, key=big_glue_scores.get)
best_glue_score = big_glue_scores[best_model]

# Print the best model
print("--------------------------------------------------")
print(f"Best Model: {best_model}")
print(f"Highest Average GLUE Score: {best_glue_score:.4f}")
print("--------------------------------------------------")

Using device: mps
Using device: mps
Using device: mps
--------------------------------------------------
Model: default
Average Sentiment Score: 0.43018300325896214
--------------------------------------------------
Model: rope
Average Sentiment Score: 0.32974011865964736
--------------------------------------------------
Model: relative:
Average Sentiment Score: 0.36667502297986126
--------------------------------------------------
Best Model: default
Highest Average GLUE Score: 0.4302
--------------------------------------------------


# Perplexity

In [104]:
tokenizer = tiktoken.get_encoding("gpt2")


def get_token_probabilities(reference_text, output_text, model):
    try:
        # Tokenize the input
        input_ids = tokenizer.encode(output_text)  # List of token IDs
        input_ids = torch.tensor([input_ids], dtype=torch.long)  # Convert to PyTorch tensor

        # Pass the tokenized input to the model
        logits, _ = model(input_ids)

        # Convert logits to probabilities
        probs = torch.nn.functional.softmax(logits, dim=-1)

        # Handle length mismatch
        max_length = min(len(input_ids[0]), probs.size(1))  # to deal when reference has difference size of the model output

        # Extract probabilities for the predicted tokens
        token_probs = []
        for i, token_id in enumerate(input_ids[0][:max_length]):
            token_probs.append(probs[0, i, token_id].item())

        return token_probs
    except Exception as e:
        print(f"Error fetching token probabilities: {e}")
        return []

In [105]:
def calculate_sentence_perplexity(token_probs):
    """
    Calculate sentence perplexity based on token probabilities.
    """
    if not token_probs:  # Handle empty token probabilities
        return float('inf')

    log_probs = np.log(token_probs)
    avg_log_prob = np.mean(log_probs)
    return np.exp(-avg_log_prob)

In [106]:

def get_perplexity(data, compared_column, model):
    """
    Calculate sentence-level perplexity using token probabilities.
    """
    perplexities = []
    
    for _, row in data.iterrows():
        reference_text = row['labels']
        output_text = row[compared_column]
        
        # Query model for token probabilities
        token_probs = get_token_probabilities(reference_text, output_text, model)
        
        # Calculate sentence-level perplexity
        sentence_perplexity = calculate_sentence_perplexity(token_probs)
        
        perplexities.append(sentence_perplexity)
    
    print(f"Completed token-based perplexity calculations for column: {compared_column}")
    return perplexities

In [107]:
def perplexity_scores_average(data):
    # Compute perplexities for each model
    perplexity_scores = {}
    perplexity_scores_average = {}
    
    for model_type, model in model_list.items():
        label = 'new_label_' + model_type
        perplexity_scores[model_type] = get_perplexity(data,label, model)
        perplexity_scores_average[model_type] = sum(perplexity_scores[model_type]) / len(perplexity_scores[model_type])

        print(f"Average Perplexity for {model_type}: {perplexity_scores_average[model_type]}")

    return perplexity_scores_average

**Perplexity Score for small dataset**

In [108]:
small_perplexity = perplexity_scores_average(df)
small_perplexity

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Completed token-based perplexity calculations for column: new_label_default
Average Perplexity for default: 251835562.7811293


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Completed token-based perplexity calculations for column: new_label_rope
Average Perplexity for rope: 138019.7266265564
Completed token-based perplexity calculations for column: new_label_relative:
Average Perplexity for relative:: 44209813.24339561


{'default': 251835562.7811293,
 'rope': 138019.7266265564,
 'relative:': 44209813.24339561}

In [109]:
# Find the best-performing model
best_model = min(small_perplexity, key=small_perplexity.get)
best_score = small_perplexity[best_model]

# Print the results
print("====================================================")
print("BEST MODEL: ")
print(f"{best_model}: with a Perplexity Score of {best_score:.6f}.")
print("====================================================")

BEST MODEL: 
rope: with a Perplexity Score of 138019.726627.


**Perplexity Score for big dataset**

In [111]:
big_perplexity = perplexity_scores_average(test_df)
big_perplexity

Completed token-based perplexity calculations for column: new_label_default
Average Perplexity for default: 1043911092.461109
Completed token-based perplexity calculations for column: new_label_rope
Average Perplexity for rope: 3776063.144329027
Error fetching token probabilities: Cannot forward sequence of length 92, block size is only 64


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Completed token-based perplexity calculations for column: new_label_relative:
Average Perplexity for relative:: inf


{'default': 1043911092.461109, 'rope': 3776063.144329027, 'relative:': inf}

In [112]:
# Find the best-performing model
best_model = min(big_perplexity, key=big_perplexity.get)
best_score = big_perplexity[best_model]

# Print the results
print("====================================================")
print("BEST MODEL: ")
print(f"{best_model}: with a Perplexity Score of {best_score:.6f}.")
print("====================================================")

BEST MODEL: 
rope: with a Perplexity Score of 3776063.144329.
