### Code to create the appropriate test dataset

In [None]:
import os
import sys
# Put the parent of the current working directory on the import path so that
# packages located one level up can be imported (presumably the nanoGPT
# package used in the cells below -- confirm against the repo layout).
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
import pandas as pd
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nanoGPT.chat import init_model as init_nanoGPT
from  nanoGPT.chat import respond as get_respond_nanoGPT
import torch
from bert_score import score
import tiktoken

In [None]:
def txt_to_dataframe_cleaned(file_path):
    """Load a tagged dialogue text file into a DataFrame of cleaned messages.

    Every line of the file is stripped of the ``<bot>``, ``<human>`` and
    ``<endOfText>`` tags and of surrounding whitespace. Lines that are empty
    after cleaning are skipped; every other line becomes one row.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A DataFrame with the columns ``speaker`` and ``message``. ``speaker``
        is ``"bot"`` or ``"human"`` depending on which tag the raw line
        contained, or missing when it contained neither. Both columns exist
        even when the file yields no rows, so callers can always index them.
        Returns ``None`` (after printing the error) when the file cannot be
        opened or decoded.
    """
    tags = ("<bot>", "<human>", "<endOfText>")
    data = []

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Remove tags and strip whitespace
                line_cleaned = line
                for tag in tags:
                    line_cleaned = line_cleaned.replace(tag, "")
                line_cleaned = line_cleaned.strip()
                if not line_cleaned:  # Skip empty lines
                    continue

                # The speaker is read from the raw line, since the tags have
                # already been removed from the cleaned text. "<bot>" wins
                # if a line happens to carry both tags.
                if "<bot>" in line:
                    speaker = "bot"
                elif "<human>" in line:
                    speaker = "human"
                else:
                    speaker = None

                data.append({'speaker': speaker, 'message': line_cleaned})
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: undecodable bytes
        # (UnicodeDecodeError) or an invalid path string.
        print(f"Error reading file: {e}")
        return None

    # Explicit columns keep the schema stable when no rows were collected;
    # without them an empty file would produce a column-less DataFrame.
    return pd.DataFrame(data, columns=['speaker', 'message'])

In [None]:
def transform_dataframe(df):
    """Pair bot and human messages side by side in a two-column DataFrame.

    The messages of each speaker are collected in their original order and
    matched up by position; surplus messages of the longer side are dropped.

    Args:
        df: DataFrame with a ``speaker`` column (``"bot"`` / ``"human"``) and
            a ``message`` column.

    Returns:
        A DataFrame whose ``empathetic_dialogues`` column holds the bot
        messages and whose ``label`` column holds the human messages.
    """
    def _messages_of(who):
        # Renumber from 0 so both sides line up by position.
        return df.loc[df['speaker'] == who, 'message'].reset_index(drop=True)

    from_bot = _messages_of('bot')
    from_human = _messages_of('human')

    # Trim both sides to the length of the shorter one.
    n_pairs = min(len(from_bot), len(from_human))

    return pd.DataFrame({
        'empathetic_dialogues': from_bot.iloc[:n_pairs],
        'label': from_human.iloc[:n_pairs],
    })

In [None]:
# Relative path of the raw, tag-annotated validation dialogue file.
file_path = "../../data/emotion/validation/validation_data.txt"

# Parse the file into speaker/message rows, then pair bot and human messages.
val_df = transform_dataframe(txt_to_dataframe_cleaned(file_path))

# Show the first rows of the paired data as the cell output.
val_df.head()

In [None]:
# The stored model outputs have the form (response, emotion, context); only
# the response text is wanted, so each listed column is reduced to the first
# quoted string it contains.

input_file_path = '../../data/emotion/validation/val_df_with_labels.csv'
output_file_path = '../../data/emotion/validation/final_val.csv'

val_df = pd.read_csv(input_file_path)

# Columns holding raw model outputs that need to be reduced to the response.
columns_to_modify = [
    'new_label_withoutemotion_single',
    'new_label_withoutemotion_whole',
    'new_label_withemotion',
    'new_label_withcontext',
    'new_label_gpt_withoutemotion',
    'new_label_gpt_blocksize_256',
]

for column in columns_to_modify:
    # Group 0 captures a double-quoted string, group 1 a single-quoted one;
    # at most one of them is filled for any given row.
    captured = val_df[column].str.extract(r'"([^"]+)"|\'([^\']+)\'')
    # Take whichever group matched; rows without any quoted text become ''.
    val_df[column] = captured[0].fillna(captured[1]).fillna('')

val_df.to_csv(output_file_path, index=False)

print(f"Processed file saved at: {output_file_path}")
