In [3]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [34]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from chat import init_model as init_nanoGPT
from chat import respond as get_respond_nanoGPT
import torch
from bert_score import score
import tiktoken

In [8]:
data_path = '../../data/emotion/validation/100_validation.csv'
df = pd.read_csv(data_path)

In [9]:
df.head()

Unnamed: 0,Situation,grouped_emotion,empathetic_dialogues,labels
0,Last night I heard strange noises coming from ...,afraid,In the middle of the night I heard some weird ...,Should have grabbed the gun.
1,My mom and sister threw me a baby shower when ...,excited,that was very nice of them congratulations,"Thank you! It was so nice, I had no idea it w..."
2,I just applied for a new job. After the inter...,grateful,"Oh really, do you feel like you did a great job?",I do! I'm feeling very optimistic about it
3,I loaned some money to my friend at work. Turn...,annoyed,Wow! What a jerk for him to up and leave with ...,"It was a medium amount of money but still, he ..."
4,I was out walking late last night and seen som...,,Oh my god. What happened?,"Well, I started walking much faster. It looked..."


## Get model response

In [11]:
model_list = {
    'default': 'block_size=64/withoutemotion/singleConversation',
    'rope':'Rope/withoutemotion/singleConversation',
    'relative:': 'Relative/withoutemotion/singleConversation'
}

for model_type, model_path in model_list.items():
    model_list[model_type] = init_nanoGPT(model_path)

Loading model from: ../trained-saved/block_size=64/withoutemotion/singleConversation/ckpt.pt
number of parameters: 3.42M


  checkpoint = torch.load(ckpt_path, map_location=device)


Loading model from: ../trained-saved/Rope/withoutemotion/singleConversation/ckpt.pt
number of parameters: 3.42M
Loading model from: ../trained-saved/Relative/withoutemotion/singleConversation/ckpt.pt


  checkpoint = torch.load(ckpt_path, map_location=device)


number of parameters: 3.42M


In [12]:
def get_response_from_nanoGPT(row,model):
    situation = row['Situation']
    emotion = row['grouped_emotion']
    human = row['empathetic_dialogues']
    start = '<bot> ' + human + '<human>'
    response, new_emotion, new_context = get_respond_nanoGPT(start, 1, model=model, enable_print=False)
    return response #, new_emotion, new_context

In [13]:
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    df[label] = df.apply(lambda row: get_response_from_nanoGPT(row, model), axis=1)

In [None]:
# df.to_csv('./evaluation_result.csv', index=False)

In [15]:
df.head()

Unnamed: 0,Situation,grouped_emotion,empathetic_dialogues,labels,new_label_default,new_label_rope,new_label_relative:
0,Last night I heard strange noises coming from ...,afraid,In the middle of the night I heard some weird ...,Should have grabbed the gun.,I was really mad at the store but it was a mi...,That is a good idea!,I don't know. I have a new job.
1,My mom and sister threw me a baby shower when ...,excited,that was very nice of them congratulations,"Thank you! It was so nice, I had no idea it w...",It was! I was so happy.,Thank you! I appreciate you!,much of no! they okay you sure for and future...
2,I just applied for a new job. After the inter...,grateful,"Oh really, do you feel like you did a great job?",I do! I'm feeling very optimistic about it,"I hope so, I am very happy for the job I did ...","I did, but I was so nervous and prepared for ...",I am so sorry to hear that. I hope you do not...
3,I loaned some money to my friend at work. Turn...,annoyed,Wow! What a jerk for him to up and leave with ...,"It was a medium amount of money but still, he ...",I was really happy to see him,I was so upset. He just wanted to take care o...,it alot i out of resc than were with me perso...
4,I was out walking late last night and seen som...,,Oh my god. What happened?,"Well, I started walking much faster. It looked...",I was in a car accident and I was so mad.,I was so scared,? up you if you got to im you by by by by by by


## BLEU

In [16]:
def get_bleu(compared_column):
    bleu_scores = []
    smoothing_function = SmoothingFunction().method1  # To avoid 0 scores due to short sentences
    for _, row in df.iterrows():
        for ref, output in zip(row['labels'], row[compared_column]):
            # Tokenize each sentence (split by words)
            reference_tokens = [ref.split()]  # BLEU expects a list of lists for references
            output_tokens = output.split()
            
            # Calculate BLEU score
            bleu = sentence_bleu(reference_tokens, output_tokens, smoothing_function=smoothing_function)
            bleu_scores.append(bleu)
    return bleu_scores

In [17]:
bleu_scores = {}
bleu_scores_average = {}
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    # print(label)
    bleu_scores[model_type] = get_bleu(label)
    bleu_scores_average[model_type] = sum(bleu_scores[model_type]) / len(bleu_scores[model_type])

In [18]:
bleu_scores_average

{'default': 0.0075534911087161045,
 'rope': 0.005971290402260085,
 'relative:': 0.006887701940291574}

## BertScore

In [21]:
def calculate_bert_score(compared_column):
    # Check for MPS device
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")

    model_outputs = df['labels']
    reference_sentences = df[compared_column]

    if len(model_outputs) != len(reference_sentences):
        raise ValueError("Mismatch in lengths: model_outputs and reference_sentences must be of the same length.")
    # Convert model outputs and reference sentences to strings
    model_outputs = [str(output) for output in model_outputs]
    reference_sentences = [str(ref) for ref in reference_sentences]
    # Calculate precision, recall, and F1 for each pair of reference and output
    P, R, F1 = score(model_outputs, reference_sentences, lang='en', verbose=True, device = device)
    return P, R, F1

In [22]:
bert_scores = {}

for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    bert_scores[model_type] = {}
    
    # Calculate BERT score and assign it to the dictionary
    bert_scores[model_type]['P'], bert_scores[model_type]['R'], bert_scores[model_type]['F1'] = calculate_bert_score(label)

Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:01<00:00,  2.63it/s]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 24.92it/s]


done in 1.61 seconds, 62.30 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:01<00:00,  2.77it/s]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 22.23it/s]


done in 1.54 seconds, 64.94 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:01<00:00,  2.71it/s]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 24.84it/s]

done in 1.56 seconds, 64.00 sentences/sec





In [23]:
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    # P avarage
    P_average = sum(bert_scores[model_type]['P']) / len(bert_scores[model_type]['P'])
    # R avarage
    R_average = sum(bert_scores[model_type]['R']) / len(bert_scores[model_type]['R'])
    # F1 avarage
    F1_average = sum(bert_scores[model_type]['F1']) / len(bert_scores[model_type]['F1'])
    print("--------------------------------------------------")
    print(f"Model: {model_type}")
    print(f"Average Precision: {P_average}")
    print(f"Average Recall: {R_average}")
    print(f"Average F1: {F1_average}")
    

--------------------------------------------------
Model: default
Average Precision: 0.857781171798706
Average Recall: 0.8652021884918213
Average F1: 0.861308217048645
--------------------------------------------------
Model: rope
Average Precision: 0.8573615550994873
Average Recall: 0.8608891367912292
Average F1: 0.8589116930961609
--------------------------------------------------
Model: relative:
Average Precision: 0.8318263292312622
Average Recall: 0.8209909796714783
Average F1: 0.8260389566421509


## # GLUE - Sentiment Analysis Evaluation (SST-2)

In [24]:
from transformers import pipeline
def evaluate_sentiment(compared_column):
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # Load multi-class sentiment or emotion pipeline
    sentiment_pipeline = pipeline(
        "text-classification", 
        model="bhadresh-savani/distilbert-base-uncased-emotion", 
        device=0 if device == "mps" else -1
    )
    
    scores = []
    model_outputs = df['labels']
    reference_sentences = df[compared_column]

    if len(model_outputs) != len(reference_sentences):
        raise ValueError("Mismatch in lengths: model_outputs and reference_sentences must be of the same length.")
    # Convert model outputs and reference sentences to strings
    model_outputs = [str(output) for output in model_outputs]
    reference_sentences = [str(ref) for ref in reference_sentences]

    for i, (output, reference) in enumerate(zip(model_outputs, reference_sentences), start=1):
        output_sentiment = sentiment_pipeline(output)[0]['label']
        reference_sentiment = sentiment_pipeline(reference)[0]['label']
        
        score = 1 if output_sentiment == reference_sentiment else 0
        scores.append(score)

    return scores


In [25]:
sentiment_scores = {}

for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    sentiment_scores[model_type] = {}
    
    sentiment_scores[model_type] = evaluate_sentiment(label)

Using device: mps
Using device: mps
Using device: mps


In [26]:
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    GLUE_average = sum(sentiment_scores[model_type]) / len(sentiment_scores[model_type])
    print("--------------------------------------------------")
    print(f"Model: {model_type}")
    print(f"Average Sentiment Score: {GLUE_average}")
    

--------------------------------------------------
Model: default
Average Sentiment Score: 0.43
--------------------------------------------------
Model: rope
Average Sentiment Score: 0.42
--------------------------------------------------
Model: relative:
Average Sentiment Score: 0.4


Perplexity

In [29]:
tokenizer = tiktoken.get_encoding("gpt2")


def get_token_probabilities(reference_text, output_text, model):
    try:
        # Tokenize the input
        input_ids = tokenizer.encode(output_text)  # List of token IDs
        input_ids = torch.tensor([input_ids], dtype=torch.long)  # Convert to PyTorch tensor

        # Pass the tokenized input to the model
        logits, _ = model(input_ids)

        # Convert logits to probabilities
        probs = torch.nn.functional.softmax(logits, dim=-1)

        # Handle length mismatch
        max_length = min(len(input_ids[0]), probs.size(1))  # to deal when reference has difference size of the model output

        # Extract probabilities for the predicted tokens
        token_probs = []
        for i, token_id in enumerate(input_ids[0][:max_length]):
            token_probs.append(probs[0, i, token_id].item())

        return token_probs
    except Exception as e:
        print(f"Error fetching token probabilities: {e}")
        return []

In [30]:
def calculate_sentence_perplexity(token_probs):
    """
    Calculate sentence perplexity based on token probabilities.
    """
    if not token_probs:  # Handle empty token probabilities
        return float('inf')

    log_probs = np.log(token_probs)
    avg_log_prob = np.mean(log_probs)
    return np.exp(-avg_log_prob)

In [31]:

def get_perplexity(data, compared_column, model):
    """
    Calculate sentence-level perplexity using token probabilities.
    """
    perplexities = []
    
    for _, row in data.iterrows():
        reference_text = row['labels']
        output_text = row[compared_column]
        
        # Query model for token probabilities
        token_probs = get_token_probabilities(reference_text, output_text, model)
        
        # Calculate sentence-level perplexity
        sentence_perplexity = calculate_sentence_perplexity(token_probs)
        
        perplexities.append(sentence_perplexity)
    
    print(f"Completed token-based perplexity calculations for column: {compared_column}")
    return perplexities

In [32]:
def perplexity_scores_average(data):
    # Compute perplexities for each model
    perplexity_scores = {}
    perplexity_scores_average = {}
    
    for model_type, model in model_list.items():
        label = 'new_label_' + model_type
        perplexity_scores[model_type] = get_perplexity(data,label, model)
        perplexity_scores_average[model_type] = sum(perplexity_scores[model_type]) / len(perplexity_scores[model_type])

        print(f"Average Perplexity for {model_type}: {perplexity_scores_average[model_type]}")

    return perplexity_scores_average

In [35]:
small_perplexity = perplexity_scores_average(df)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Completed token-based perplexity calculations for column: new_label_default
Average Perplexity for default: 7517.115005763853


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Completed token-based perplexity calculations for column: new_label_rope
Average Perplexity for rope: 2884.6339049980215


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Completed token-based perplexity calculations for column: new_label_relative:
Average Perplexity for relative:: 375210408.8260035


In [36]:
small_perplexity

{'default': 7517.115005763853,
 'rope': 2884.6339049980215,
 'relative:': 375210408.8260035}

In [37]:
# Find the best-performing model
best_model = min(small_perplexity, key=small_perplexity.get)
best_score = small_perplexity[best_model]

# Print the results
print("====================================================")
print("BEST MODEL: ")
print(f"{best_model}: with a Perplexity Score of {best_score:.6f}.")
print("====================================================")

BEST MODEL: 
rope: with a Perplexity Score of 2884.633905.
