In [2]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [18]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nanoGPT.chat import init_model as init_nanoGPT
from  nanoGPT.chat import respond as get_respond_nanoGPT
import torch
from bert_score import score
import tiktoken
from transformers import pipeline

### Big Validation dataset - Withouth emotion tag
**(Facebook data + Chat GPT generated data)**
- 6k pais of data
- file already saved with model's outputs in data/emotion/validation/final.csv because takes a lot of time to run

## Get model response

In [23]:
model_list = {
    'single_conversation': 'block_size=64/withoutemotion/singleConversation',
    'whole_conversation':'block_size=64/withoutemotion/wholeConversation',
    'single_conversation_withemotion':'block_size=64/withemotion',
    'single_conversation_withcontext': 'block_size=64/withcontext',
    'single_conversation_withGPTdata': 'block_size=64/withoutemotion/singleConversation_withGPTdata',
    'single_conversation_withGPTdata_bs256': 'block_size=256/singleConversation_withGPTdata',
}

for model_type, model_path in model_list.items():
    model_list[model_type] = init_nanoGPT(model_path)


Loading model from: ../trained-saved/block_size=64/withoutemotion/singleConversation/ckpt.pt
number of parameters: 3.42M
Loading model from: ../trained-saved/block_size=64/withoutemotion/wholeConversation/ckpt.pt


  checkpoint = torch.load(ckpt_path, map_location=device)


number of parameters: 3.42M
Loading model from: ../trained-saved/block_size=64/withemotion/ckpt.pt
number of parameters: 3.42M
Loading model from: ../trained-saved/block_size=64/withcontext/ckpt.pt
number of parameters: 3.42M
Loading model from: ../trained-saved/block_size=64/withoutemotion/singleConversation_withGPTdata/ckpt.pt
number of parameters: 3.42M
Loading model from: ../trained-saved/block_size=256/singleConversation_withGPTdata/ckpt.pt
number of parameters: 3.45M


**For big validation dataset - without emotion tag**

In [5]:
# Processing model's output, because we get (response, emotion, context), and we only want the response.

test_file_path = '../../data/emotion/validation/final.csv'  
test_df = pd.read_csv(test_file_path)


In [6]:
test_df.head()

Unnamed: 0,empathetic_dialogues,labels,new_label_single_conversation,new_label_whole_conversation,new_label_single_conversation_withemotion,new_label_single_conversation_withcontext,new_label_single_conversation_withGPTdata,new_label_single_conversation_withGPTdata_bs256
0,it felt good to get approved for a vacation,That's great! Where ya headed?,That's great! You are prepared and prepared.,"Yeah, I think I was a good job, I didn't get ...",oh no. I hope it turns out well for you!,That's awesome! I hope you do well!,That sounds great! A fun experience are alway...,That's great to hear! Where are you planning?
1,to disneyworld!,That sounds like a lot of fun! Who ya going with?,I am glad you are very excited! I'm sure you ...,I am! I never win anything.,"I love to get a lot of fun, but I'm happy for...",I bet you are so excited!,That sounds exciting! What do you enjoy most ...,That sounds exciting! Enjoy theney films!
2,That's great! Where ya headed?,to disneyworld!,I've never seen that song but I've never watc...,We are going to the beach.,I am going to Las Vegas for my birthday.,"I have a big meeting with my girlfriend, I ha...",I'm here to assist you with any questions or ...,I'm here to assist you! What do you need help...
3,I applied for an internship with a law office,Very nice! I bet your more than excited about it!,That's great! I bet your worked hard for it!,I hope so. I am hoping to get a raise.,That's great! Did you get the money back?,That's awesome! I hope you guys get it!,That is awesome. What kind of job?,That's awesome. What position do you get?
4,I am actually terrified. They probably laughed...,Oh no! Dont be anxious or worried! I'm sure yo...,I would be so nervous about it.,I am sure it will be worth it. I'm sure they ...,"I know what you mean, that's a good idea.",I bet you are so envious!,It's understandable to feel scared. Stay calm...,It's understandable to feel scared. Stay safe!


## AVG(BLEU) and BLEU-1,-2,-3,-4

**(AVG)BLEU with small test dataset**

In [56]:
def get_bleu(data, compared_column):
    bleu_scores = []
    smoothing_function = SmoothingFunction().method1  # To avoid 0 scores due to short sentences
    for _, row in data.iterrows():
        for ref, output in zip(row['labels'], row[compared_column]):
            # Tokenize each sentence (split by words)
            reference_tokens = [ref.split()]  # BLEU expects a list of lists for references
            output_tokens = output.split()
            
            # Calculate BLEU score
            bleu = sentence_bleu(reference_tokens, output_tokens, smoothing_function=smoothing_function)
            bleu_scores.append(bleu)
    return bleu_scores

In [22]:
def bleu_small_dataset(data):
    bleu_scores = {}
    bleu_scores_average = {}
    for model_type, model in model_list.items():
        label = 'new_label_' + model_type
        # print(label)
        bleu_scores[model_type] = get_bleu(data,label)
        bleu_scores_average[model_type] = sum(bleu_scores[model_type]) / len(bleu_scores[model_type])
    
    return bleu_scores_average

In [31]:
small_bleu_score_avg = bleu_small_dataset(df)

small_bleu_score_avg

{'withoutemotion_single': 0.006294341508914749,
 'withoutemotion_whole': 0.005549381274013914,
 'withemotion': 0.005371129646648176,
 'withcontext': 0.0056097142272521225,
 'gpt_withoutemotion': 0.006452090556086916,
 'gpt_blocksize_256': 0.00661592114219667}

In [32]:
# Find the best-performing model
best_model = max(small_bleu_score_avg, key=small_bleu_score_avg.get)
best_score = small_bleu_score_avg[best_model]

# Print the results
print("====================================================")
print("BEST MODEL: ")
print(f"{best_model}: with a BLEU score of {best_score:.6f}.")
print("====================================================")

BEST MODEL: 
gpt_blocksize_256: with a BLEU score of 0.006616.


In [None]:
big_bleu_score_avg = bleu_small_dataset(test_df)

big_bleu_score_avg

In [None]:
# Find the best-performing model
best_model = max(big_bleu_score_avg, key=big_bleu_score_avg.get)
best_score = big_bleu_score_avg[best_model]

# Print the results
print("====================================================")
print("BEST MODEL: ")
print(f"{best_model}: with a BLEU score of {best_score:.6f}.")
print("====================================================")

**AVG(BLEU) with big test dataset**

In [11]:
def get_bleu(data, compared_column, weights):
    bleu_scores = []
    smoothing_function = SmoothingFunction().method1  # Avoid 0 scores due to short sentences

    for _, row in data.iterrows():
        # Extract reference and candidate sentences
        reference_sentences = row['labels']
        candidate_sentence = row[compared_column]

        for ref, output in zip(reference_sentences, candidate_sentence):
            # Tokenize sentences (split by words)
            reference_tokens = [ref.split()]  # BLEU expects a list of lists for references
            output_tokens = output.split()

            # Calculate BLEU score
            bleu = sentence_bleu(
                reference_tokens,
                output_tokens,
                weights=weights,
                smoothing_function=smoothing_function
            )
            bleu_scores.append(bleu)

    return bleu_scores

In [12]:
def bleu_calculation(data, weights):
    bleu_scores = {}
    bleu_scores_average = {}

    for model_type, _ in model_list.items():
        # Dynamically construct the label column name
        label = 'new_label_' + model_type

        # Ensure the column exists in the DataFrame
        if label not in data.columns:
            raise KeyError(f"Column '{label}' does not exist in the DataFrame!")

        # Calculate BLEU scores for this model type
        bleu_scores[model_type] = get_bleu(data, label, weights)
        bleu_scores_average[model_type] = sum(bleu_scores[model_type]) / len(bleu_scores[model_type])

    return bleu_scores_average

**BLEU-1**

In [14]:
weights = (1,0,0,0)

bleu_1 = bleu_calculation(test_df, weights)

bleu_1

{'single_conversation': 0.03303577372298825,
 'whole_conversation': 0.031149590503226302,
 'single_conversation_withemotion': 0.03287725085675706,
 'single_conversation_withcontext': 0.03263109583247091,
 'single_conversation_withGPTdata': 0.036170623502072514,
 'single_conversation_withGPTdata_bs256': 0.034916404327927836}

**BLEU-2**

In [76]:
weights = (0.5, 0.5, 0, 0)

bleu_2 = bleu_calculation(test_df,weights)

bleu_2  

{'single_conversation': 0.010446828923058333,
 'whole_conversation': 0.009850365417174566,
 'single_conversation_withemotion': 0.010396699591207457,
 'single_conversation_withcontext': 0.010318858537783629,
 'single_conversation_withGPTdata': 0.011438155465496527,
 'single_conversation_withGPTdata_bs256': 0.011041536537961272}

**BLEU-3**

In [78]:
weights = (1/3, 1/3, 1/3, 0)

bleu_3 = bleu_calculation(test_df, weights)

bleu_3

{'single_conversation': 0.007117341692084967,
 'whole_conversation': 0.006710975836043849,
 'single_conversation_withemotion': 0.007083188975867789,
 'single_conversation_withcontext': 0.007030156483523018,
 'single_conversation_withGPTdata': 0.007792724603294758,
 'single_conversation_withGPTdata_bs256': 0.007522511273526714}

**BLEU-4**

In [79]:
weights = (1/4, 1/4, 1/4, 1/4)

bleu_4 = bleu_calculation(test_df, weights)

bleu_4

{'single_conversation': 0.00587468362062949,
 'whole_conversation': 0.005539267542303131,
 'single_conversation_withemotion': 0.005846493825725561,
 'single_conversation_withcontext': 0.005802720584588993,
 'single_conversation_withGPTdata': 0.006432147502200551,
 'single_conversation_withGPTdata_bs256': 0.0062091122888948}

## BertScore

In [70]:
def calculate_bert_score(data,compared_column):
    # Check for MPS device
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")

    model_outputs = data['labels']
    reference_sentences = data[compared_column]

    if len(model_outputs) != len(reference_sentences):
        raise ValueError("Mismatch in lengths: model_outputs and reference_sentences must be of the same length.")
    # Convert model outputs and reference sentences to strings
    model_outputs = [str(output) for output in model_outputs]
    reference_sentences = [str(ref) for ref in reference_sentences]
    # Calculate precision, recall, and F1 for each pair of reference and output
    P, R, F1 = score(model_outputs, reference_sentences, lang='en', verbose=True, device = device)
    return P, R, F1

In [71]:
def bert_score_all_models(data):
    bert_scores = {}

    for model_type, model in model_list.items():
        label = 'new_label_' + model_type
        bert_scores[model_type] = {}
        
        # Calculate BERT score and assign it to the dictionary
        bert_scores[model_type]['P'], bert_scores[model_type]['R'], bert_scores[model_type]['F1'] = calculate_bert_score(data, label)

    return bert_scores

In [72]:
def print_bert_scores(bert_scores):

    avg_f1_scores ={}

    for model_type, model in model_list.items():
        label = 'new_label_' + model_type
        # P avarage
        P_average = sum(bert_scores[model_type]['P']) / len(bert_scores[model_type]['P'])
        # R avarage
        R_average = sum(bert_scores[model_type]['R']) / len(bert_scores[model_type]['R'])
        # F1 avarage
        F1_average = sum(bert_scores[model_type]['F1']) / len(bert_scores[model_type]['F1'])
        print("--------------------------------------------------")
        print(f"Model: {model_type}")
        print(f"Average Precision: {P_average}")
        print(f"Average Recall: {R_average}")
        print(f"Average F1: {F1_average}")

        avg_f1_scores[model_type] = F1_average

    return avg_f1_scores
        

**BERT Score for Big Dataset**

In [60]:
big_bert_scores = bert_score_all_models(data=test_df)

big_bert_f1 = print_bert_scores(big_bert_scores)

Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/355 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/187 [00:00<?, ?it/s]



done in 135.93 seconds, 88.04 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/335 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/187 [00:00<?, ?it/s]



done in 181.02 seconds, 66.11 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/357 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/187 [00:00<?, ?it/s]



done in 180.65 seconds, 66.24 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/351 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/187 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


done in 129.50 seconds, 92.41 sentences/sec
Using device: mps
calculating scores...
computing bert embedding.


  0%|          | 0/339 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/187 [00:00<?, ?it/s]

done in 120.92 seconds, 98.96 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/273 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/187 [00:00<?, ?it/s]



done in 105.99 seconds, 112.91 sentences/sec
--------------------------------------------------
Model: withoutemotion_single
Average Precision: 0.8516004681587219
Average Recall: 0.8623329401016235
Average F1: 0.8568010926246643
--------------------------------------------------
Model: withoutemotion_whole
Average Precision: 0.8422025442123413
Average Recall: 0.8542978167533875
Average F1: 0.8480852246284485
--------------------------------------------------
Model: withemotion
Average Precision: 0.8510995507240295
Average Recall: 0.8639686107635498
Average F1: 0.8573451042175293
--------------------------------------------------
Model: withcontext
Average Precision: 0.8292639851570129
Average Recall: 0.8429385423660278
Average F1: 0.8359043002128601
--------------------------------------------------
Model: gpt_withoutemotion
Average Precision: 0.8533097505569458
Average Recall: 0.8622663617134094
Average F1: 0.8576485514640808
--------------------------------------------------
Model: g

In [61]:
# Identify the best model by highest F1 average
best_model = max(big_bert_f1, key=big_bert_f1.get)
best_f1_score = big_bert_f1[best_model]

# Print the best model
print("--------------------------------------------------")
print(f"Best Model: {best_model}")
print(f"Highest Average F1 Score: {best_f1_score:.4f}")
print("--------------------------------------------------")

--------------------------------------------------
Best Model: gpt_withoutemotion
Highest Average F1 Score: 0.8576
--------------------------------------------------


## GLUE - Sentiment Analysis Evaluation 

**MODEL USED:** "bhadresh-savani/distilbert-base-uncased-emotion"

Supported Emotions: Joy, Anger, Sadness, Fear, Surprise, Love, Neutral

In [19]:
def evaluate_sentiment(data, compared_column):
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # Load multi-class sentiment or emotion pipeline
    sentiment_pipeline = pipeline(
        "text-classification", 
        model="bhadresh-savani/distilbert-base-uncased-emotion", 
        device=0 if device == "mps" else -1
    )
    
    scores = []
    model_outputs = data['labels']
    reference_sentences = data[compared_column]

    if len(model_outputs) != len(reference_sentences):
        raise ValueError("Mismatch in lengths: model_outputs and reference_sentences must be of the same length.")
    
    # Convert model outputs and reference sentences to strings
    model_outputs = [str(output) for output in model_outputs]
    reference_sentences = [str(ref) for ref in reference_sentences]

    for i, (output, reference) in enumerate(zip(model_outputs, reference_sentences), start=1):
        output_sentiment = sentiment_pipeline(output)[0]['label']
        reference_sentiment = sentiment_pipeline(reference)[0]['label']
        
        score = 1 if output_sentiment == reference_sentiment else 0
        scores.append(score)

    return scores


In [20]:
def glue_all_models(data):
    sentiment_scores = {}

    for model_type, _ in model_list.items():
        label = 'new_label_' + model_type
        sentiment_scores[model_type] = {}
        
        sentiment_scores[model_type] = evaluate_sentiment(data, label)
    return sentiment_scores

In [22]:
def print_glue_scores(data):
    
    sentiment_scores = glue_all_models(data)

    avg_glue_scores = {}
    
    for model_type, _ in model_list.items():
        GLUE_average = sum(sentiment_scores[model_type]) / len(sentiment_scores[model_type])
        avg_glue_scores[model_type] = GLUE_average

        print("--------------------------------------------------")
        print(f"Model: {model_type}")
        print(f"Average Sentiment Score: {GLUE_average}")

    return avg_glue_scores
        

**GLUE Score for big dataset**

In [89]:
big_glue_scores = print_glue_scores(test_df)

Using device: mps
Using device: mps
Using device: mps
Using device: mps
Using device: mps
Using device: mps
model_type withoutemotion_single
--------------------------------------------------
Model: withoutemotion_single
Average Sentiment Score: 0.4727166374195705
model_type withoutemotion_whole
--------------------------------------------------
Model: withoutemotion_whole
Average Sentiment Score: 0.43603242249519514
model_type withemotion
--------------------------------------------------
Model: withemotion
Average Sentiment Score: 0.4721316954959472
model_type withcontext
--------------------------------------------------
Model: withcontext
Average Sentiment Score: 0.45884515751650373
model_type gpt_withoutemotion
--------------------------------------------------
Model: gpt_withoutemotion
Average Sentiment Score: 0.47171387983621627
model_type gpt_blocksize_256
--------------------------------------------------
Model: gpt_blocksize_256
Average Sentiment Score: 0.34519929806969163


In [90]:
# Identify the best model based on the highest GLUE score
best_model = max(big_glue_scores, key=big_glue_scores.get)
best_glue_score = big_glue_scores[best_model]

# Print the best model
print("--------------------------------------------------")
print(f"Best Model: {best_model}")
print(f"Highest Average GLUE Score: {best_glue_score:.4f}")
print("--------------------------------------------------")

--------------------------------------------------
Best Model: withoutemotion_single
Highest Average GLUE Score: 0.4727
--------------------------------------------------


# Perplexity

In [91]:
tokenizer = tiktoken.get_encoding("gpt2")


def get_token_probabilities(reference_text, output_text, model):
    try:
        # Tokenize the input
        input_ids = tokenizer.encode(output_text)  # List of token IDs
        input_ids = torch.tensor([input_ids], dtype=torch.long)  # Convert to PyTorch tensor

        # Pass the tokenized input to the model
        logits, _ = model(input_ids)

        # Convert logits to probabilities
        probs = torch.nn.functional.softmax(logits, dim=-1)

        # Handle length mismatch
        max_length = min(len(input_ids[0]), probs.size(1))  # to deal when reference has difference size of the model output

        # Extract probabilities for the predicted tokens
        token_probs = []
        for i, token_id in enumerate(input_ids[0][:max_length]):
            token_probs.append(probs[0, i, token_id].item())

        return token_probs
    except Exception as e:
        print(f"Error fetching token probabilities: {e}")
        return []

In [92]:
def calculate_sentence_perplexity(token_probs):
    """
    Calculate sentence perplexity based on token probabilities.
    """
    if not token_probs:  # Handle empty token probabilities
        return float('inf')

    log_probs = np.log(token_probs)
    avg_log_prob = np.mean(log_probs)
    return np.exp(-avg_log_prob)

In [96]:
def get_perplexity(data, compared_column, model):
    """
    Calculate sentence-level perplexity using token probabilities.
    """
    perplexities = []
    
    for _, row in data.iterrows():
        reference_text = row['labels']
        output_text = row[compared_column]
        
        # Query model for token probabilities
        token_probs = get_token_probabilities(reference_text, output_text, model)
        
        # Calculate sentence-level perplexity
        sentence_perplexity = calculate_sentence_perplexity(token_probs)
        
        perplexities.append(sentence_perplexity)
    
    print(f"Completed token-based perplexity calculations for column: {compared_column}")
    return perplexities

In [97]:
def perplexity_scores_average(data):
    # Compute perplexities for each model
    perplexity_scores = {}
    perplexity_scores_average = {}
    
    for model_type, model in model_list.items():
        label = 'new_label_' + model_type
        perplexity_scores[model_type] = get_perplexity(data,label, model)
        perplexity_scores_average[model_type] = sum(perplexity_scores[model_type]) / len(perplexity_scores[model_type])

        print(f"Average Perplexity for {model_type}: {perplexity_scores_average[model_type]}")

    return perplexity_scores_average

**Perplexity Score for big dataset**

In [101]:
big_perplexity = perplexity_scores_average(test_df)

Calculating token-based perplexity for column: new_label_withoutemotion_single
Completed token-based perplexity calculations for column: new_label_withoutemotion_single
Average Perplexity for withoutemotion_single: 84066.95248548205
Calculating token-based perplexity for column: new_label_withoutemotion_whole
Completed token-based perplexity calculations for column: new_label_withoutemotion_whole
Average Perplexity for withoutemotion_whole: 29549.33700155131
Calculating token-based perplexity for column: new_label_withemotion
Completed token-based perplexity calculations for column: new_label_withemotion
Average Perplexity for withemotion: 2008198.4398578384
Calculating token-based perplexity for column: new_label_withcontext
Completed token-based perplexity calculations for column: new_label_withcontext
Average Perplexity for withcontext: 63035.03316265359
Calculating token-based perplexity for column: new_label_gpt_withoutemotion
Completed token-based perplexity calculations for colu

In [102]:
# Find the best-performing model
best_model = min(big_perplexity, key=big_perplexity.get)
best_score = big_perplexity[best_model]

# Print the results
print("====================================================")
print("BEST MODEL: ")
print(f"{best_model}: with a Perplexity Score of {best_score:.6f}.")
print("====================================================")

BEST MODEL: 
gpt_withoutemotion: with a Perplexity Score of 28233.374812.
