In [16]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from chat import init_model as init_nanoGPT
from chat import respond as get_respond_nanoGPT
import torch
from bert_score import score

In [17]:
data_path = '../../../data/emotion/validation/100_validation.csv'
df = pd.read_csv(data_path)

In [30]:
df.head()

Unnamed: 0,Situation,grouped_emotion,empathetic_dialogues,labels
0,Last night I heard strange noises coming from ...,afraid,In the middle of the night I heard some weird ...,Should have grabbed the gun.
1,My mom and sister threw me a baby shower when ...,excited,that was very nice of them congratulations,"Thank you! It was so nice, I had no idea it w..."
2,I just applied for a new job. After the inter...,grateful,"Oh really, do you feel like you did a great job?",I do! I'm feeling very optimistic about it
3,I loaned some money to my friend at work. Turn...,annoyed,Wow! What a jerk for him to up and leave with ...,"It was a medium amount of money but still, he ..."
4,I was out walking late last night and seen som...,,Oh my god. What happened?,"Well, I started walking much faster. It looked..."


## Get model response

In [18]:
model_list = {
    'withoutemotion_single': 'block_size=64/withoutemotion/singleConversation',
    'withoutemotion_whole':'block_size=64/withoutemotion/wholeConversation',
    'withemotion':'block_size=64/withemotion',
    'withcontext': 'block_size=64/withcontext',
    'gpt_withoutemotion': 'block_size=64/withoutemotion/singleConversation_withGPTdata',
    'gpt_blocksize_256': 'block_size=256/GPT_without_emotion',
}
for model_type, model_path in model_list.items():
    model_list[model_type] = init_nanoGPT(model_path)


Loading model from: ../trained-saved/block_size=64/withoutemotion/singleConversation/ckpt.pt
number of parameters: 3.42M


  checkpoint = torch.load(ckpt_path, map_location=device)


Loading model from: ../trained-saved/block_size=64/withoutemotion/wholeConversation/ckpt.pt
number of parameters: 3.42M
Loading model from: ../trained-saved/block_size=64/withemotion/ckpt.pt
number of parameters: 3.42M
Loading model from: ../trained-saved/block_size=64/withcontext/ckpt.pt
number of parameters: 3.42M
Loading model from: ../trained-saved/block_size=64/withoutemotion/singleConversation_withGPTdata/ckpt.pt
number of parameters: 3.42M
Loading model from: ../trained-saved/block_size=256/GPT_without_emotion/ckpt.pt
number of parameters: 3.42M


In [19]:
def get_response_from_nanoGPT(row,model):
    situation = row['Situation']
    emotion = row['grouped_emotion']
    human = row['empathetic_dialogues']
    start = '<bot> ' + human + '<human>'
    response, new_emotion, new_context = get_respond_nanoGPT(start, 1, model=model, enable_print=False)
    return response #, new_emotion, new_context

In [20]:
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    df[label] = df.apply(lambda row: get_response_from_nanoGPT(row, model), axis=1)

In [None]:
# df.to_csv('./evaluation_result.csv', index=False)

In [6]:
df.head()

Unnamed: 0,Situation,grouped_emotion,empathetic_dialogues,labels,new_label_withoutemotion_single,new_label_withoutemotion_whole,new_label_withemotion,new_label_withcontext,new_label_gpt_withoutemotion,new_label_gpt_blocksize_256
0,Last night I heard strange noises coming from ...,afraid,In the middle of the night I heard some weird ...,Should have grabbed the gun.,That's great! I hope you're able to do some g...,"Yes, it was so mortifying on the news.",Did you get the chance to go?,That's cool.,Sounds like a fun movie! What did you find?,
1,My mom and sister threw me a baby shower when ...,excited,that was very nice of them congratulations,"Thank you! It was so nice, I had no idea it w...",I hope they are happy,I bet. I've been there for a few years. I'v...,It was. I was so happy that they had a great ...,Thank you so much! It was really nice to see ...,Thank you! How can I assist you today?,Thank you! I'm here to help with any question...
2,I just applied for a new job. After the inter...,grateful,"Oh really, do you feel like you did a great job?",I do! I'm feeling very optimistic about it,"I did, I was able to stay with my friends.",I feel so bad for you.,"I did, but I'm not sure I will do it again.",I really hope so. I am feeling a little happy...,I'm here to help! What do you need assistance...,I'm here to help! What do you need assistance...
3,I loaned some money to my friend at work. Turn...,annoyed,Wow! What a jerk for him to up and leave with ...,"It was a medium amount of money but still, he ...",I was so shocked by that. He had a great job ...,It was a brand new car. I was so upset.,I'm not sure about it.,I was so mad. He had a huge nail and we got t...,It sounds like you had a great time! What hap...,"Yes, it wasn't a kind thing. I don't know wha..."
4,I was out walking late last night and seen som...,,Oh my god. What happened?,"Well, I started walking much faster. It looked...",I was driving in the middle of the night and ...,I was so upset! I was so upset.,I was driving a new car and I was so mad!,I got to see a dead skunk in my yard.,I'm here to listen. What happened?,I'm here for you. What happened?


## BLEU

In [21]:
def get_bleu(compared_column):
    bleu_scores = []
    smoothing_function = SmoothingFunction().method1  # To avoid 0 scores due to short sentences
    for _, row in df.iterrows():
        for ref, output in zip(row['labels'], row[compared_column]):
            # Tokenize each sentence (split by words)
            reference_tokens = [ref.split()]  # BLEU expects a list of lists for references
            output_tokens = output.split()
            
            # Calculate BLEU score
            bleu = sentence_bleu(reference_tokens, output_tokens, smoothing_function=smoothing_function)
            bleu_scores.append(bleu)
    return bleu_scores

In [22]:
bleu_scores = {}
bleu_scores_average = {}
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    # print(label)
    bleu_scores[model_type] = get_bleu(label)
    bleu_scores_average[model_type] = sum(bleu_scores[model_type]) / len(bleu_scores[model_type])

In [23]:
bleu_scores_average

{'withoutemotion_single': 0.005745652374923808,
 'withoutemotion_whole': 0.005187762489436854,
 'withemotion': 0.004634877185739742,
 'withcontext': 0.006159774135493186,
 'gpt_withoutemotion': 0.0063584190169655086,
 'gpt_blocksize_256': 0.006069653759755217}

## BertScore

In [24]:
def calculate_bert_score(compared_column):
    # Check for MPS device
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")

    model_outputs = df['labels']
    reference_sentences = df[compared_column]

    if len(model_outputs) != len(reference_sentences):
        raise ValueError("Mismatch in lengths: model_outputs and reference_sentences must be of the same length.")
    # Convert model outputs and reference sentences to strings
    model_outputs = [str(output) for output in model_outputs]
    reference_sentences = [str(ref) for ref in reference_sentences]
    # Calculate precision, recall, and F1 for each pair of reference and output
    P, R, F1 = score(model_outputs, reference_sentences, lang='en', verbose=True, device = device)
    return P, R, F1

In [25]:
bert_scores = {}

for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    bert_scores[model_type] = {}
    
    # Calculate BERT score and assign it to the dictionary
    bert_scores[model_type]['P'], bert_scores[model_type]['R'], bert_scores[model_type]['F1'] = calculate_bert_score(label)

Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 2.66 seconds, 37.52 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]



done in 1.68 seconds, 59.35 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 1.30 seconds, 77.03 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 1.28 seconds, 78.31 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/4 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 1.78 seconds, 56.03 sentences/sec
Using device: mps


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 1.32 seconds, 75.99 sentences/sec




In [26]:
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    # P avarage
    P_average = sum(bert_scores[model_type]['P']) / len(bert_scores[model_type]['P'])
    # R avarage
    R_average = sum(bert_scores[model_type]['R']) / len(bert_scores[model_type]['R'])
    # F1 avarage
    F1_average = sum(bert_scores[model_type]['F1']) / len(bert_scores[model_type]['F1'])
    print("--------------------------------------------------")
    print(f"Model: {model_type}")
    print(f"Average Precision: {P_average}")
    print(f"Average Recall: {R_average}")
    print(f"Average F1: {F1_average}")
    

--------------------------------------------------
Model: withoutemotion_single
Average Precision: 0.8582502603530884
Average Recall: 0.8634032607078552
Average F1: 0.8606156706809998
--------------------------------------------------
Model: withoutemotion_whole
Average Precision: 0.7520891427993774
Average Recall: 0.7587206959724426
Average F1: 0.7552400231361389
--------------------------------------------------
Model: withemotion
Average Precision: 0.857521653175354
Average Recall: 0.8639740943908691
Average F1: 0.8604713678359985
--------------------------------------------------
Model: withcontext
Average Precision: 0.8555678725242615
Average Recall: 0.8625133633613586
Average F1: 0.8588142991065979
--------------------------------------------------
Model: gpt_withoutemotion
Average Precision: 0.8563526272773743
Average Recall: 0.8623169660568237
Average F1: 0.8591411113739014
--------------------------------------------------
Model: gpt_blocksize_256
Average Precision: 0.56293547

## # GLUE - Sentiment Analysis Evaluation (SST-2)

In [27]:
from transformers import pipeline
def evaluate_sentiment(compared_column):
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # Load multi-class sentiment or emotion pipeline
    sentiment_pipeline = pipeline(
        "text-classification", 
        model="bhadresh-savani/distilbert-base-uncased-emotion", 
        device=0 if device == "mps" else -1
    )
    
    scores = []
    model_outputs = df['labels']
    reference_sentences = df[compared_column]

    if len(model_outputs) != len(reference_sentences):
        raise ValueError("Mismatch in lengths: model_outputs and reference_sentences must be of the same length.")
    # Convert model outputs and reference sentences to strings
    model_outputs = [str(output) for output in model_outputs]
    reference_sentences = [str(ref) for ref in reference_sentences]

    for i, (output, reference) in enumerate(zip(model_outputs, reference_sentences), start=1):
        output_sentiment = sentiment_pipeline(output)[0]['label']
        reference_sentiment = sentiment_pipeline(reference)[0]['label']
        
        score = 1 if output_sentiment == reference_sentiment else 0
        scores.append(score)

    return scores


In [28]:
sentiment_scores = {}

for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    sentiment_scores[model_type] = {}
    
    sentiment_scores[model_type] = evaluate_sentiment(label)

Using device: mps
Using device: mps
Using device: mps
Using device: mps
Using device: mps
Using device: mps


In [29]:
for model_type, model in model_list.items():
    label = 'new_label_' + model_type
    GLUE_average = sum(sentiment_scores[model_type]) / len(sentiment_scores[model_type])
    print("--------------------------------------------------")
    print(f"Model: {model_type}")
    print(f"Average Sentiment Score: {GLUE_average}")
    

--------------------------------------------------
Model: withoutemotion_single
Average Sentiment Score: 0.53
--------------------------------------------------
Model: withoutemotion_whole
Average Sentiment Score: 0.44
--------------------------------------------------
Model: withemotion
Average Sentiment Score: 0.52
--------------------------------------------------
Model: withcontext
Average Sentiment Score: 0.47
--------------------------------------------------
Model: gpt_withoutemotion
Average Sentiment Score: 0.42
--------------------------------------------------
Model: gpt_blocksize_256
Average Sentiment Score: 0.41
