In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nanoGPT.chat import init_model as init_nanoGPT
from  nanoGPT.chat import respond as get_respond_nanoGPT
import torch
from bert_score import score
import tiktoken

loading GPT-2 encodings...


In [3]:
data_path = '../../data/emotion/validation/100_validation.csv'
df = pd.read_csv(data_path)

In [4]:
df.head()

Unnamed: 0,Situation,grouped_emotion,empathetic_dialogues,labels
0,Last night I heard strange noises coming from ...,afraid,In the middle of the night I heard some weird ...,Should have grabbed the gun.
1,My mom and sister threw me a baby shower when ...,excited,that was very nice of them congratulations,"Thank you! It was so nice, I had no idea it w..."
2,I just applied for a new job. After the inter...,grateful,"Oh really, do you feel like you did a great job?",I do! I'm feeling very optimistic about it
3,I loaned some money to my friend at work. Turn...,annoyed,Wow! What a jerk for him to up and leave with ...,"It was a medium amount of money but still, he ..."
4,I was out walking late last night and seen som...,,Oh my god. What happened?,"Well, I started walking much faster. It looked..."


## Get model response

In [5]:
model_list = {
    'withoutemotion_single': 'block_size=64/withoutemotion/singleConversation',
    'withoutemotion_whole':'block_size=64/withoutemotion/wholeConversation',
    'withemotion':'block_size=64/withemotion',
    'withcontext': 'block_size=64/withcontext',
    'gpt_withoutemotion': 'block_size=64/withoutemotion/singleConversation_withGPTdata',
    'gpt_blocksize_256': 'block_size=256/singleConversation_withGPTdata',
}
for model_type, model_path in model_list.items():
    model_list[model_type] = init_nanoGPT(model_path)


Loading model from: ../trained-saved/block_size=64/withoutemotion/singleConversation/ckpt.pt
number of parameters: 3.42M


  checkpoint = torch.load(ckpt_path, map_location=device)


Loading model from: ../trained-saved/block_size=64/withoutemotion/wholeConversation/ckpt.pt
number of parameters: 3.42M
Loading model from: ../trained-saved/block_size=64/withemotion/ckpt.pt


  checkpoint = torch.load(ckpt_path, map_location=device)


number of parameters: 3.42M
Loading model from: ../trained-saved/block_size=64/withcontext/ckpt.pt
number of parameters: 3.42M
Loading model from: ../trained-saved/block_size=64/withoutemotion/singleConversation_withGPTdata/ckpt.pt
number of parameters: 3.42M
Loading model from: ../trained-saved/block_size=256/singleConversation_withGPTdata/ckpt.pt
number of parameters: 3.42M


In [6]:
def get_response_from_nanoGPT(row,model):
    situation = row['Situation']
    emotion = row['grouped_emotion']
    human = row['empathetic_dialogues']
    start = '<bot> ' + human + '<human>'
    response, new_emotion, new_context = get_respond_nanoGPT(start, 1, model=model, enable_print=False)
    return response #, new_emotion, new_context

In [7]:
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    df[label] = df.apply(lambda row: get_response_from_nanoGPT(row, model), axis=1)

In [None]:
# df.to_csv('./evaluation_result.csv', index=False)

In [8]:
df.head()

Unnamed: 0,Situation,grouped_emotion,empathetic_dialogues,labels,new_label_withoutemotion_single,new_label_withoutemotion_whole,new_label_withemotion,new_label_withcontext,new_label_gpt_withoutemotion,new_label_gpt_blocksize_256
0,Last night I heard strange noises coming from ...,afraid,In the middle of the night I heard some weird ...,Should have grabbed the gun.,Oh no! I hate to hear that. Did you end up ge...,"He is, he is so old now. He is a very close f...",I'm sorry to hear that. I'm sure you'll get it!,I hope that there was a big storm.,That sounds like a surprising moment! Did you...,
1,My mom and sister threw me a baby shower when ...,excited,that was very nice of them congratulations,"Thank you! It was so nice, I had no idea it w...",It was nice to have some good friends.,Thank you. I am happy with your friend.,Yeah I was so happy for them. I was so glad I...,I was shocked to see it.,Thank you! How can I assist you today?,That was very nice of them.
2,I just applied for a new job. After the inter...,grateful,"Oh really, do you feel like you did a great job?",I do! I'm feeling very optimistic about it,"I did, I was really happy. My best friend was...","Yes, I am so proud of him, I have a lot of fa...","I do, it's a lot better than I am. I'm not su...",I did. I just got a new one and I was so happy.,Thank you! I'm here to assist you with any qu...,I did and I was so proud of myself for my job.
3,I loaned some money to my friend at work. Turn...,annoyed,Wow! What a jerk for him to up and leave with ...,"It was a medium amount of money but still, he ...",I was so happy to see him.,"Yes, I was so shocked. I was so excited to se...","Not really, but I didn't know what to do thou...",I was so mad! I was so mad.,"No, he's just a bit mad about it.","Yes, he had a lot of money."
4,I was out walking late last night and seen som...,,Oh my god. What happened?,"Well, I started walking much faster. It looked...",I was a little sad and scared to tell someone...,I hope so. I'm a little bummed out.,I went to the hospital to go to a new city an...,I got to go to work and I had to go to work.,I'm here to help! What happened?,I'm here to listen. What happened?


In [36]:
print(f"Number of rows: {df.shape[0]}")

Number of rows: 100


## BLEU

In [24]:
def get_bleu(compared_column):
    bleu_scores = []
    smoothing_function = SmoothingFunction().method1  # To avoid 0 scores due to short sentences
    for _, row in df.iterrows():
        for ref, output in zip(row['labels'], row[compared_column]):
            # Tokenize each sentence (split by words)
            reference_tokens = [ref.split()]  # BLEU expects a list of lists for references
            output_tokens = output.split()
            
            # Calculate BLEU score
            bleu = sentence_bleu(reference_tokens, output_tokens, smoothing_function=smoothing_function)
            bleu_scores.append(bleu)
    return bleu_scores

In [26]:
bleu_scores = {}
bleu_scores_average = {}
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    # print(label)
    bleu_scores[model_type] = get_bleu(label)
    bleu_scores_average[model_type] = sum(bleu_scores[model_type]) / len(bleu_scores[model_type])

In [27]:
bleu_scores_average

{'withoutemotion_single': 0.005812953669427137,
 'withoutemotion_whole': 0.0051150978040450336,
 'withemotion': 0.0058044728517590806,
 'withcontext': 0.00634356982528503,
 'gpt_withoutemotion': 0.006235954271153653,
 'gpt_blocksize_256': 0.006639256948168738}

## BertScore

In [28]:
def calculate_bert_score(compared_column):
    # Check for MPS device
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")

    model_outputs = df['labels']
    reference_sentences = df[compared_column]

    if len(model_outputs) != len(reference_sentences):
        raise ValueError("Mismatch in lengths: model_outputs and reference_sentences must be of the same length.")
    # Convert model outputs and reference sentences to strings
    model_outputs = [str(output) for output in model_outputs]
    reference_sentences = [str(ref) for ref in reference_sentences]
    # Calculate precision, recall, and F1 for each pair of reference and output
    P, R, F1 = score(model_outputs, reference_sentences, lang='en', verbose=True, device = device)
    return P, R, F1

In [29]:
bert_scores = {}

for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    bert_scores[model_type] = {}
    
    # Calculate BERT score and assign it to the dictionary
    bert_scores[model_type]['P'], bert_scores[model_type]['R'], bert_scores[model_type]['F1'] = calculate_bert_score(label)

Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 13.27 seconds, 7.54 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]



done in 2.73 seconds, 36.57 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 1.73 seconds, 57.82 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 1.17 seconds, 85.14 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 1.33 seconds, 75.38 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 1.39 seconds, 71.75 sentences/sec




In [26]:
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    # P avarage
    P_average = sum(bert_scores[model_type]['P']) / len(bert_scores[model_type]['P'])
    # R avarage
    R_average = sum(bert_scores[model_type]['R']) / len(bert_scores[model_type]['R'])
    # F1 avarage
    F1_average = sum(bert_scores[model_type]['F1']) / len(bert_scores[model_type]['F1'])
    print("--------------------------------------------------")
    print(f"Model: {model_type}")
    print(f"Average Precision: {P_average}")
    print(f"Average Recall: {R_average}")
    print(f"Average F1: {F1_average}")
    

--------------------------------------------------
Model: withoutemotion_single
Average Precision: 0.8582502603530884
Average Recall: 0.8634032607078552
Average F1: 0.8606156706809998
--------------------------------------------------
Model: withoutemotion_whole
Average Precision: 0.7520891427993774
Average Recall: 0.7587206959724426
Average F1: 0.7552400231361389
--------------------------------------------------
Model: withemotion
Average Precision: 0.857521653175354
Average Recall: 0.8639740943908691
Average F1: 0.8604713678359985
--------------------------------------------------
Model: withcontext
Average Precision: 0.8555678725242615
Average Recall: 0.8625133633613586
Average F1: 0.8588142991065979
--------------------------------------------------
Model: gpt_withoutemotion
Average Precision: 0.8563526272773743
Average Recall: 0.8623169660568237
Average F1: 0.8591411113739014
--------------------------------------------------
Model: gpt_blocksize_256
Average Precision: 0.56293547

## # GLUE - Sentiment Analysis Evaluation (SST-2)

In [30]:
from transformers import pipeline
def evaluate_sentiment(compared_column):
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # Load multi-class sentiment or emotion pipeline
    sentiment_pipeline = pipeline(
        "text-classification", 
        model="bhadresh-savani/distilbert-base-uncased-emotion", 
        device=0 if device == "mps" else -1
    )
    
    scores = []
    model_outputs = df['labels']
    reference_sentences = df[compared_column]

    if len(model_outputs) != len(reference_sentences):
        raise ValueError("Mismatch in lengths: model_outputs and reference_sentences must be of the same length.")
    # Convert model outputs and reference sentences to strings
    model_outputs = [str(output) for output in model_outputs]
    reference_sentences = [str(ref) for ref in reference_sentences]

    for i, (output, reference) in enumerate(zip(model_outputs, reference_sentences), start=1):
        output_sentiment = sentiment_pipeline(output)[0]['label']
        reference_sentiment = sentiment_pipeline(reference)[0]['label']
        
        score = 1 if output_sentiment == reference_sentiment else 0
        scores.append(score)

    return scores


In [28]:
sentiment_scores = {}

for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    sentiment_scores[model_type] = {}
    
    sentiment_scores[model_type] = evaluate_sentiment(label)

Using device: mps
Using device: mps
Using device: mps
Using device: mps
Using device: mps
Using device: mps


In [29]:
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    GLUE_average = sum(sentiment_scores[model_type]) / len(sentiment_scores[model_type])
    print("--------------------------------------------------")
    print(f"Model: {model_type}")
    print(f"Average Sentiment Score: {GLUE_average}")
    

--------------------------------------------------
Model: withoutemotion_single
Average Sentiment Score: 0.53
--------------------------------------------------
Model: withoutemotion_whole
Average Sentiment Score: 0.44
--------------------------------------------------
Model: withemotion
Average Sentiment Score: 0.52
--------------------------------------------------
Model: withcontext
Average Sentiment Score: 0.47
--------------------------------------------------
Model: gpt_withoutemotion
Average Sentiment Score: 0.42
--------------------------------------------------
Model: gpt_blocksize_256
Average Sentiment Score: 0.41


# Perplexity

In [30]:
tokenizer = tiktoken.get_encoding("gpt2")


def get_token_probabilities(reference_text, output_text, model):
    try:
        # Tokenize the input
        input_ids = tokenizer.encode(output_text)  # List of token IDs
        input_ids = torch.tensor([input_ids], dtype=torch.long)  # Convert to PyTorch tensor

        # Pass the tokenized input to the model
        logits, _ = model(input_ids)

        # Convert logits to probabilities
        probs = torch.nn.functional.softmax(logits, dim=-1)

        # Handle length mismatch
        max_length = min(len(input_ids[0]), probs.size(1))  # to deal when reference has difference size of the model output

        # Extract probabilities for the predicted tokens
        token_probs = []
        for i, token_id in enumerate(input_ids[0][:max_length]):
            token_probs.append(probs[0, i, token_id].item())

        return token_probs
    except Exception as e:
        print(f"Error fetching token probabilities: {e}")
        return []

In [31]:
def calculate_sentence_perplexity(token_probs):
    """
    Calculate sentence perplexity based on token probabilities.
    """
    if not token_probs:  # Handle empty token probabilities
        return float('inf')

    log_probs = np.log(token_probs)
    avg_log_prob = np.mean(log_probs)
    return np.exp(-avg_log_prob)

In [33]:
def get_perplexity(compared_column, model):
    """
    Calculate sentence-level perplexity using token probabilities.
    """
    perplexities = []
    print(f"Calculating token-based perplexity for column: {compared_column}")
    
    for index, row in df.iterrows():
        print(f"\nProcessing row {index}...")
        reference_text = row['labels']
        output_text = row[compared_column]
        
        print(f"Reference Text: {reference_text}")
        print(f"Output Text: {output_text}")
        
        # Query model for token probabilities
        token_probs = get_token_probabilities(reference_text, output_text, model)
        print(f"Token Probabilities: {token_probs}")
        
        # Calculate sentence-level perplexity
        sentence_perplexity = calculate_sentence_perplexity(token_probs)
        print(f"Sentence Perplexity for row {index}: {sentence_perplexity}")
        
        perplexities.append(sentence_perplexity)
    
    print(f"Completed token-based perplexity calculations for column: {compared_column}")
    return perplexities

In [34]:
def perplexity_scores_average():
    # Compute perplexities for each model
    perplexity_scores = {}
    perplexity_scores_average = {}

    print("")
    print("Columns in the DataFrame:")
    print(df.columns)
    print("")
    
    for model_type, model in model_list.items():
        label = 'new_label_' + model_type
        print(label)
        perplexity_scores[model_type] = get_perplexity(label, model)
        perplexity_scores_average[model_type] = sum(perplexity_scores[model_type]) / len(perplexity_scores[model_type])

        print(f"Average Perplexity for {model_type}: {perplexity_scores_average[model_type]}")

    return perplexity_scores_average

In [35]:
perplexity_scores_average()


Columns in the DataFrame:
Index(['Situation', 'grouped_emotion', 'empathetic_dialogues', 'labels',
       'new_label_withoutemotion_single', 'new_label_withoutemotion_whole',
       'new_label_withemotion', 'new_label_withcontext',
       'new_label_gpt_withoutemotion', 'new_label_gpt_blocksize_256'],
      dtype='object')

new_label_withoutemotion_single
Calculating token-based perplexity for column: new_label_withoutemotion_single

Processing row 0...
Reference Text: Should have grabbed the gun.
Output Text:  Oh no! I hate to hear that. Did you end up getting it? 
Token Probabilities: [8.637574501335621e-05]
Sentence Perplexity for row 0: 11577.324164849406

Processing row 1...
Reference Text: Thank you!  It was so nice, I had no idea it was happening.  They're not usually so good with secrets haha
Output Text:  It was nice to have some good friends. 
Token Probabilities: [0.17217420041561127]
Sentence Perplexity for row 1: 5.80807111394216

Processing row 2...
Reference Text: I do!

{'withoutemotion_single': 1061.468651178604,
 'withoutemotion_whole': inf,
 'withemotion': 1587.2087837246113,
 'withcontext': 511.2836617769946,
 'gpt_withoutemotion': 404.2289543040273,
 'gpt_blocksize_256': 504.67186487360993}