In [1]:
import ast

import numpy as np
import pandas as pd
import torch
from prettytable import PrettyTable
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer
from torcheval.metrics import MultilabelAccuracy


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the saved outputs of the four prompt variants, one frame per CSV.
df1, df2, df3, df4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "results/phi3_prompting/prompt1.csv",
        "results/phi3_prompting/prompt2.csv",
        "results/phi3_prompting/prompt3.csv",
        "results/phi3_prompting/prompt4.csv",
    )
)

In [3]:
# Base frame for the comparison: the text, the gold annotations and the
# prompt_1 output, all taken from the first results file.
# .copy() makes df an independent frame instead of a selection of df1, so the
# column assignments in the following cells write to df unambiguously and do
# not trigger SettingWithCopyWarning.
df = df1[["text", "category", "polarity", "joint", "category_labels", "polarity_labels", "joint_labels", "true_labels", "prompt_1"]].copy()

In [4]:
# Attach the remaining prompt outputs as extra columns of df.
# Series assignment aligns on the index, so each frame is assumed to share
# df's row index -- TODO confirm the four CSVs list the same rows in the same order.
for prompt_column, prompt_frame in (("prompt_2", df2), ("prompt_3", df3), ("prompt_4", df4)):
    df[prompt_column] = prompt_frame[prompt_column]

In [5]:
# Preview the first two rows to confirm the four prompt columns sit next to the gold labels.
df.head(2)

Unnamed: 0,text,category,polarity,joint,category_labels,polarity_labels,joint_labels,true_labels,prompt_1,prompt_2,prompt_3,prompt_4
0,bread top notch well,['food'],['positive'],['food#positive'],[0 1 0 0 0],[0 0 1],[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0],"{'ambience': None, 'food': 'positive', 'other'...","{'quality':'high','food':'positive'}","{'quality':'high','food':'positive'}","{'quality':'high','food':'positive'}","{'quality':'high','food':'positive'}"
1,say one fastest delivery times city,['service'],['positive'],['service#positive'],[0 0 0 0 1],[0 0 1],[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1],"{'ambience': None, 'food': None, 'other': None...",The fastest delivery times in the city typica...,The fastest delivery times in the city typica...,The fastest delivery times in the city typica...,The fastest delivery times in the city typica...


In [6]:
def _parse_label_vector(raw):
    """Turn a stored indicator vector such as "[0 1 0 0 0]" into a list of ints.

    Splitting on arbitrary whitespace (rather than a single space) tolerates
    repeated spaces and line breaks inside the stored string; commas are
    accepted as separators as well. A value that is no longer a string (the
    cell was already run once) is passed through element-wise, so the cell
    can be re-executed safely.
    """
    if not isinstance(raw, str):
        return [int(flag) for flag in raw]
    return [int(flag) for flag in raw.strip("[]").replace(",", " ").split()]


# The CSV stores the gold indicator vectors as strings; convert them to lists of ints.
for _label_column in ("category_labels", "polarity_labels", "joint_labels"):
    df[_label_column] = df[_label_column].apply(_parse_label_vector)

In [7]:
# Exceptions ast.literal_eval can raise on text that is not a valid Python literal.
_UNPARSEABLE = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)


def _parse_label_dict(raw):
    """Parse one model output into a dict, falling back to {} when that is impossible.

    ast.literal_eval only accepts Python literals, so free-form model text is
    never executed (eval would run whatever expression the model produced).
    A value that is already a dict is returned unchanged, which keeps the
    cell safe to re-run after the columns have been replaced by parsed dicts.
    """
    if isinstance(raw, dict):
        return raw
    try:
        parsed = ast.literal_eval(raw)
    except _UNPARSEABLE:
        return {}
    return parsed if isinstance(parsed, dict) else {}


# One list of parsed dicts per column, in row order.
filtered = {
    "true_labels": [],
    "prompt_1": [],
    "prompt_2": [],
    "prompt_3": [],
    "prompt_4": [],
}

for _, row in df.iterrows():
    # Gold labels are parsed strictly: a malformed gold entry should fail loudly.
    gold = row["true_labels"]
    if isinstance(gold, str):
        gold = ast.literal_eval(gold)
    # Keep only the aspects that actually carry a sentiment.
    filtered["true_labels"].append({k: v for k, v in gold.items() if v is not None})

    for shot in ['prompt_1', 'prompt_2', 'prompt_3', 'prompt_4']:
        filtered[shot].append(_parse_label_dict(row[shot]))

In [8]:
# Replace the raw string columns with their parsed-dict counterparts.
for column_name, parsed_rows in filtered.items():
    df[column_name] = parsed_rows

In [9]:
# Vocabulary the predictions are filtered against:
# aspect names are matched against dict keys, sentiment names against dict values.
allowed_keys = [
    "food",
    "service",
    "ambience",
    "price",
    "other",
]
allowed_values = [
    "positive",
    "neutral",
    "negative",
]


In [10]:
def get_singular_cols(df, col, allowed_aspects=None, allowed_sentiments=None):
    """Extract the recognised aspects and sentiments from a column of label dicts.

    Parameters
    ----------
    df : DataFrame whose column ``col`` holds one dict per row.
    col : name of that column.
    allowed_aspects : collection of accepted dict keys; when omitted, the
        module-level ``allowed_keys`` list is used.
    allowed_sentiments : collection of accepted dict values; when omitted, the
        module-level ``allowed_values`` list is used.

    Returns
    -------
    (aspects, sentiments) : two lists with one entry per row. Each entry is a
        sorted, de-duplicated list of the row's accepted keys / values; rows
        with nothing accepted yield an empty list.
    """
    if allowed_aspects is None:
        allowed_aspects = allowed_keys
    if allowed_sentiments is None:
        allowed_sentiments = allowed_values

    aspects = []
    sentiments = []

    # Only the dict column is needed, so iterate over it directly.
    for label_dict in df[col]:
        aspects.append(sorted({key for key in label_dict if key in allowed_aspects}))
        # Several aspects may share a sentiment; the set collapses the repeats.
        sentiments.append(
            sorted({value for value in label_dict.values() if value in allowed_sentiments})
        )

    return aspects, sentiments

In [11]:
# For every prompt variant, store the recognised aspects and sentiments
# as two new list-valued columns.
for shot in ('prompt_1', 'prompt_2', 'prompt_3', 'prompt_4'):
    extracted = get_singular_cols(df, shot)
    df[f"aspect_{shot}"] = extracted[0]
    df[f"sentiment_{shot}"] = extracted[1]

In [12]:
# Check the parsed dicts and the derived aspect_*/sentiment_* columns on the first two rows.
df.head(2)

Unnamed: 0,text,category,polarity,joint,category_labels,polarity_labels,joint_labels,true_labels,prompt_1,prompt_2,prompt_3,prompt_4,aspect_prompt_1,sentiment_prompt_1,aspect_prompt_2,sentiment_prompt_2,aspect_prompt_3,sentiment_prompt_3,aspect_prompt_4,sentiment_prompt_4
0,bread top notch well,['food'],['positive'],['food#positive'],"[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{'food': 'positive'},"{'quality': 'high', 'food': 'positive'}","{'quality': 'high', 'food': 'positive'}","{'quality': 'high', 'food': 'positive'}","{'quality': 'high', 'food': 'positive'}",[food],[positive],[food],[positive],[food],[positive],[food],[positive]
1,say one fastest delivery times city,['service'],['positive'],['service#positive'],"[0, 0, 0, 0, 1]","[0, 0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",{'service': 'positive'},{},{},{},{},[],[],[],[],[],[],[],[]


# Multi-label binarizer

In [13]:
# One binarizer per label space, fitted on the full vocabulary so the output
# columns are the same for every prompt variant.
category_mlb = MultiLabelBinarizer()
sentiment_mlb = MultiLabelBinarizer()

category_mlb.fit([["food", "service", "ambience", "price", "other"]])
sentiment_mlb.fit([["positive", "neutral", "negative"]])

# Column index -> class name. These depend only on the fitted binarizers,
# so they are built once rather than on every loop iteration.
aspect_idx_to_text = dict(enumerate(category_mlb.classes_))
sentiment_idx_to_text = dict(enumerate(sentiment_mlb.classes_))
# Legacy name: this variable has always held the sentiment mapping.
# Kept as an alias in case another cell still reads it.
category_idx_to_text = sentiment_idx_to_text

for shot in ['prompt_1', 'prompt_2', 'prompt_3', 'prompt_4']:
    # Indicator vectors for the predicted aspects / sentiments of this prompt.
    df[f"aspect_{shot}_label"] = category_mlb.transform(df[f"aspect_{shot}"]).tolist()
    df[f"sentiment_{shot}_label"] = sentiment_mlb.transform(df[f"sentiment_{shot}"]).tolist()

    print(shot)
    print(sentiment_idx_to_text)
    print(aspect_idx_to_text)
    print()


prompt_1
{0: 'negative', 1: 'neutral', 2: 'positive'}
{0: 'ambience', 1: 'food', 2: 'other', 3: 'price', 4: 'service'}

prompt_2
{0: 'negative', 1: 'neutral', 2: 'positive'}
{0: 'ambience', 1: 'food', 2: 'other', 3: 'price', 4: 'service'}

prompt_3
{0: 'negative', 1: 'neutral', 2: 'positive'}
{0: 'ambience', 1: 'food', 2: 'other', 3: 'price', 4: 'service'}

prompt_4
{0: 'negative', 1: 'neutral', 2: 'positive'}
{0: 'ambience', 1: 'food', 2: 'other', 3: 'price', 4: 'service'}



## Accuracies

In [14]:
# Inspect the frame now that the binarized *_label columns have been added.
df.head()

Unnamed: 0,text,category,polarity,joint,category_labels,polarity_labels,joint_labels,true_labels,prompt_1,prompt_2,...,aspect_prompt_4,sentiment_prompt_4,aspect_prompt_1_label,sentiment_prompt_1_label,aspect_prompt_2_label,sentiment_prompt_2_label,aspect_prompt_3_label,sentiment_prompt_3_label,aspect_prompt_4_label,sentiment_prompt_4_label
0,bread top notch well,['food'],['positive'],['food#positive'],"[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{'food': 'positive'},"{'quality': 'high', 'food': 'positive'}","{'quality': 'high', 'food': 'positive'}",...,[food],[positive],"[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 1, 0, 0, 0]","[0, 0, 1]"
1,say one fastest delivery times city,['service'],['positive'],['service#positive'],"[0, 0, 0, 0, 1]","[0, 0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",{'service': 'positive'},{},{},...,[],[],"[0, 0, 0, 0, 0]","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0]"
2,food always fresh ready eat,['food'],['positive'],['food#positive'],"[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{'food': 'positive'},"{'food': 'positive', 'ready_to_eat': 'positive'}","{'food': 'positive', 'ready_to_eat': 'positive'}",...,[food],[positive],"[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 1, 0, 0, 0]","[0, 0, 1]"
3,mention coffee outstanding,['food'],['positive'],['food#positive'],"[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{'food': 'positive'},{'coffee': 'positive'},{'coffee': 'positive'},...,[],[positive],"[0, 0, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0]","[0, 0, 1]"
4,trust people go sushi never disappoints,['other'],['positive'],['other#positive'],"[0, 0, 1, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",{'other': 'positive'},"{'trust': 'positive', 'sushi': 'positive', 'ne...","{'trust': 'positive', 'sushi': 'positive', 'ne...",...,[],[positive],"[0, 0, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0]","[0, 0, 1]"


In [15]:
def model_metrics(test_labels, predictions):
    """Compute multi-label classification metrics for one set of predictions.

    Parameters
    ----------
    test_labels : gold indicator matrix (one row per sample, one 0/1 column per label).
    predictions : predicted indicator matrix with the same layout.

    Returns
    -------
    dict with the keys 'accuracy', 'exact_match_accuracy', 'overlap_accuracy',
    'macro_f1' and 'micro_f1', each mapped to a Python/NumPy float.

    Note: in the recorded runs of this notebook 'accuracy' and
    'exact_match_accuracy' agree up to float precision.
    """
    accuracy = accuracy_score(test_labels, predictions)

    # torcheval works on tensors; convert once and reuse for both criteria.
    target_tensor = torch.Tensor(test_labels)
    input_tensor = torch.Tensor(predictions)

    exact_accuracy = MultilabelAccuracy(criteria='exact_match')
    exact_accuracy.update(target=target_tensor, input=input_tensor)

    overlap_accuracy = MultilabelAccuracy(criteria='overlap')
    overlap_accuracy.update(target=target_tensor, input=input_tensor)

    macro_f1 = f1_score(test_labels, predictions, average='macro')
    micro_f1 = f1_score(test_labels, predictions, average='micro')

    return {
        'accuracy' : accuracy,
        'exact_match_accuracy': exact_accuracy.compute().detach().item(),
        'overlap_accuracy': overlap_accuracy.compute().detach().item(),
        'macro_f1': macro_f1,
        'micro_f1': micro_f1
    }

def pretty_table(dict):
    """Print a two-column (metric, value) table built from a mapping.

    The parameter keeps its original name for compatibility with keyword
    callers; note that it shadows the ``dict`` builtin inside this function.
    """
    report = PrettyTable()
    report.field_names = ['metric', 'value']
    for metric_name in dict:
        report.add_row([metric_name, dict[metric_name]])
    print(report)


In [16]:
# List every column now available to the evaluation cells.
df.columns

Index(['text', 'category', 'polarity', 'joint', 'category_labels',
       'polarity_labels', 'joint_labels', 'true_labels', 'prompt_1',
       'prompt_2', 'prompt_3', 'prompt_4', 'aspect_prompt_1',
       'sentiment_prompt_1', 'aspect_prompt_2', 'sentiment_prompt_2',
       'aspect_prompt_3', 'sentiment_prompt_3', 'aspect_prompt_4',
       'sentiment_prompt_4', 'aspect_prompt_1_label',
       'sentiment_prompt_1_label', 'aspect_prompt_2_label',
       'sentiment_prompt_2_label', 'aspect_prompt_3_label',
       'sentiment_prompt_3_label', 'aspect_prompt_4_label',
       'sentiment_prompt_4_label'],
      dtype='object')

In [17]:
# Aspect detection, prompt 1: gold category vectors vs. binarized predictions.
true = "category_labels"
pred = "aspect_prompt_1_label"
print(pred, true)
# Keyword arguments put gold labels and predictions in the slots model_metrics expects.
pretty_table(model_metrics(
    test_labels=np.array(df[true].tolist()),
    predictions=np.array(df[pred].tolist()),
))

aspect_prompt_1_label category_labels
+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       | 0.13351134846461948 |
| exact_match_accuracy | 0.13351134955883026 |
|   overlap_accuracy   |  0.3044058680534363 |
|       macro_f1       | 0.32543116354991175 |
|       micro_f1       | 0.42097902097902096 |
+----------------------+---------------------+


In [18]:
# Aspect detection, prompt 2: gold category vectors vs. binarized predictions.
true = "category_labels"
pred = "aspect_prompt_2_label"
print(pred, true)
# Keyword arguments put gold labels and predictions in the slots model_metrics expects.
pretty_table(model_metrics(
    test_labels=np.array(df[true].tolist()),
    predictions=np.array(df[pred].tolist()),
))

aspect_prompt_2_label category_labels
+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       | 0.13351134846461948 |
| exact_match_accuracy | 0.13351134955883026 |
|   overlap_accuracy   |  0.3044058680534363 |
|       macro_f1       | 0.32543116354991175 |
|       micro_f1       | 0.42097902097902096 |
+----------------------+---------------------+


In [19]:
# Aspect detection, prompt 3: gold category vectors vs. binarized predictions.
true = "category_labels"
pred = "aspect_prompt_3_label"
print(pred, true)
# Keyword arguments put gold labels and predictions in the slots model_metrics expects.
pretty_table(model_metrics(
    test_labels=np.array(df[true].tolist()),
    predictions=np.array(df[pred].tolist()),
))

aspect_prompt_3_label category_labels
+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       | 0.13351134846461948 |
| exact_match_accuracy | 0.13351134955883026 |
|   overlap_accuracy   |  0.3044058680534363 |
|       macro_f1       | 0.32543116354991175 |
|       micro_f1       | 0.42097902097902096 |
+----------------------+---------------------+


In [20]:
# Aspect detection, prompt 4: gold category vectors vs. binarized predictions.
true = "category_labels"
pred = "aspect_prompt_4_label"
print(pred, true)
# Keyword arguments put gold labels and predictions in the slots model_metrics expects.
pretty_table(model_metrics(
    test_labels=np.array(df[true].tolist()),
    predictions=np.array(df[pred].tolist()),
))

aspect_prompt_4_label category_labels
+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       | 0.13351134846461948 |
| exact_match_accuracy | 0.13351134955883026 |
|   overlap_accuracy   |  0.3044058680534363 |
|       macro_f1       | 0.32543116354991175 |
|       micro_f1       | 0.42097902097902096 |
+----------------------+---------------------+


In [21]:
# Sentiment detection, prompt 1: gold polarity vectors vs. binarized predictions.
true = "polarity_labels"
pred = "sentiment_prompt_1_label"
print(pred, true)
# Keyword arguments put gold labels and predictions in the slots model_metrics expects.
pretty_table(model_metrics(
    test_labels=np.array(df[true].tolist()),
    predictions=np.array(df[pred].tolist()),
))

sentiment_prompt_1_label polarity_labels
+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       | 0.40587449933244324 |
| exact_match_accuracy | 0.40587449073791504 |
|   overlap_accuracy   |  0.5180240273475647 |
|       macro_f1       |  0.4588083545899923 |
|       micro_f1       |  0.6064318529862175 |
+----------------------+---------------------+


In [22]:
# Sentiment detection, prompt 2: gold polarity vectors vs. binarized predictions.
true = "polarity_labels"
pred = "sentiment_prompt_2_label"
print(pred, true)
# Keyword arguments put gold labels and predictions in the slots model_metrics expects.
pretty_table(model_metrics(
    test_labels=np.array(df[true].tolist()),
    predictions=np.array(df[pred].tolist()),
))

sentiment_prompt_2_label polarity_labels
+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       | 0.40587449933244324 |
| exact_match_accuracy | 0.40587449073791504 |
|   overlap_accuracy   |  0.5180240273475647 |
|       macro_f1       |  0.4588083545899923 |
|       micro_f1       |  0.6064318529862175 |
+----------------------+---------------------+


In [23]:
# Sentiment detection, prompt 3: gold polarity vectors vs. binarized predictions.
true = "polarity_labels"
pred = "sentiment_prompt_3_label"
print(pred, true)
# Keyword arguments put gold labels and predictions in the slots model_metrics expects.
pretty_table(model_metrics(
    test_labels=np.array(df[true].tolist()),
    predictions=np.array(df[pred].tolist()),
))

sentiment_prompt_3_label polarity_labels
+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       | 0.40587449933244324 |
| exact_match_accuracy | 0.40587449073791504 |
|   overlap_accuracy   |  0.5180240273475647 |
|       macro_f1       |  0.4588083545899923 |
|       micro_f1       |  0.6064318529862175 |
+----------------------+---------------------+


In [24]:
# Sentiment detection, prompt 4: gold polarity vectors vs. binarized predictions.
true = "polarity_labels"
pred = "sentiment_prompt_4_label"
print(pred, true)
# Keyword arguments put gold labels and predictions in the slots model_metrics expects.
pretty_table(model_metrics(
    test_labels=np.array(df[true].tolist()),
    predictions=np.array(df[pred].tolist()),
))

sentiment_prompt_4_label polarity_labels
+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       | 0.40587449933244324 |
| exact_match_accuracy | 0.40587449073791504 |
|   overlap_accuracy   |  0.5180240273475647 |
|       macro_f1       |  0.4588083545899923 |
|       micro_f1       |  0.6064318529862175 |
+----------------------+---------------------+


## LLM metrics

In [25]:
def calculate_exact_match_accuracy(df, pred_col_name):
    """Return the fraction of rows whose predicted dict equals the gold dict.

    The gold dict is read from the 'true_labels' column and the prediction
    from ``pred_col_name``; equality is plain dict equality (same keys, same values).
    """
    def is_exact_match(row):
        return row['true_labels'] == row[pred_col_name]

    return df.apply(is_exact_match, axis=1).mean()

def calculate_partial_accuracy(df, pred_col_name):
    """Return the mean share of gold keys that also appear in the prediction.

    For each row the score is |gold keys ∩ predicted keys| / |gold keys|;
    a row without gold keys scores 0. Only keys are compared, not the
    sentiment values attached to them.
    """
    def partial_match(row):
        gold_keys = set(row['true_labels'].keys())
        predicted_keys = set(row[pred_col_name].keys())
        if not gold_keys:
            return 0
        shared = gold_keys.intersection(predicted_keys)
        return len(shared) / len(gold_keys)

    row_scores = df.apply(partial_match, axis=1)
    return row_scores.mean()

def count_extra_keys(df, pred_col_name):
    """Return the total number of predicted keys that are absent from the gold dict.

    The count is taken per row (predicted keys minus 'true_labels' keys)
    and summed over the whole frame.
    """
    def extra_key_count(row):
        gold_keys = set(row['true_labels'].keys())
        predicted_keys = set(row[pred_col_name].keys())
        return len(predicted_keys.difference(gold_keys))

    return df.apply(extra_key_count, axis=1).sum()

def count_rows_with_extra_keys(df, pred_col_name):
    """Return how many rows predict at least one key that the gold dict lacks."""
    def has_extra_keys(row):
        gold_keys = set(row['true_labels'].keys())
        predicted_keys = set(row[pred_col_name].keys())
        # Any predicted key outside the gold set means the prediction is not a subset.
        return not predicted_keys.issubset(gold_keys)

    flagged_rows = df.apply(has_extra_keys, axis=1)
    return flagged_rows.sum()

def count_empty_predicted_labels(df, pred_col):
    """Return the number of rows whose entry in ``pred_col`` has length zero."""
    def is_empty(prediction):
        return len(prediction) == 0

    return df[pred_col].apply(is_empty).sum()

In [26]:
# Dict-level report for prompt 1: raw parsed predictions vs. gold dicts.
pred_col = "prompt_1"
print(pred_col)

exact_match_accuracy = calculate_exact_match_accuracy(df, pred_col)
partial_accuracy = calculate_partial_accuracy(df, pred_col)
extra_key_count = count_extra_keys(df, pred_col)
rows_with_extra_keys = count_rows_with_extra_keys(df, pred_col)
empty_predicted_labels_count = count_empty_predicted_labels(df, pred_col)

# One line per metric, followed by the total row count for scale.
print(
    f'Exact Match Accuracy: {exact_match_accuracy}',
    f'Partial Accuracy: {partial_accuracy}',
    f'Count of Extra Keys: {extra_key_count}',
    f'Number of Rows with Extra Keys: {rows_with_extra_keys}',
    f'Number of Rows with Empty Predicted Labels: {empty_predicted_labels_count}',
    f'Total: {df.shape[0]}',
    sep="\n",
)

prompt_1
Exact Match Accuracy: 0.03471295060080107
Partial Accuracy: 0.26045838896306184
Count of Extra Keys: 1284
Number of Rows with Extra Keys: 510
Number of Rows with Empty Predicted Labels: 207
Total: 749


In [27]:
# Dict-level report for prompt 2: raw parsed predictions vs. gold dicts.
pred_col = "prompt_2"
print(pred_col)

exact_match_accuracy = calculate_exact_match_accuracy(df, pred_col)
partial_accuracy = calculate_partial_accuracy(df, pred_col)
extra_key_count = count_extra_keys(df, pred_col)
rows_with_extra_keys = count_rows_with_extra_keys(df, pred_col)
empty_predicted_labels_count = count_empty_predicted_labels(df, pred_col)

# One line per metric, followed by the total row count for scale.
print(
    f'Exact Match Accuracy: {exact_match_accuracy}',
    f'Partial Accuracy: {partial_accuracy}',
    f'Count of Extra Keys: {extra_key_count}',
    f'Number of Rows with Extra Keys: {rows_with_extra_keys}',
    f'Number of Rows with Empty Predicted Labels: {empty_predicted_labels_count}',
    f'Total: {df.shape[0]}',
    sep="\n",
)

prompt_2
Exact Match Accuracy: 0.03471295060080107
Partial Accuracy: 0.26045838896306184
Count of Extra Keys: 1284
Number of Rows with Extra Keys: 510
Number of Rows with Empty Predicted Labels: 207
Total: 749


In [28]:
# Dict-level report for prompt 3: raw parsed predictions vs. gold dicts.
pred_col = "prompt_3"
print(pred_col)

exact_match_accuracy = calculate_exact_match_accuracy(df, pred_col)
partial_accuracy = calculate_partial_accuracy(df, pred_col)
extra_key_count = count_extra_keys(df, pred_col)
rows_with_extra_keys = count_rows_with_extra_keys(df, pred_col)
empty_predicted_labels_count = count_empty_predicted_labels(df, pred_col)

# One line per metric, followed by the total row count for scale.
print(
    f'Exact Match Accuracy: {exact_match_accuracy}',
    f'Partial Accuracy: {partial_accuracy}',
    f'Count of Extra Keys: {extra_key_count}',
    f'Number of Rows with Extra Keys: {rows_with_extra_keys}',
    f'Number of Rows with Empty Predicted Labels: {empty_predicted_labels_count}',
    f'Total: {df.shape[0]}',
    sep="\n",
)

prompt_3
Exact Match Accuracy: 0.03471295060080107
Partial Accuracy: 0.26045838896306184
Count of Extra Keys: 1284
Number of Rows with Extra Keys: 510
Number of Rows with Empty Predicted Labels: 207
Total: 749


In [29]:
# Dict-level report for prompt 4: raw parsed predictions vs. gold dicts.
pred_col = "prompt_4"
print(pred_col)

exact_match_accuracy = calculate_exact_match_accuracy(df, pred_col)
partial_accuracy = calculate_partial_accuracy(df, pred_col)
extra_key_count = count_extra_keys(df, pred_col)
rows_with_extra_keys = count_rows_with_extra_keys(df, pred_col)
empty_predicted_labels_count = count_empty_predicted_labels(df, pred_col)

# One line per metric, followed by the total row count for scale.
print(
    f'Exact Match Accuracy: {exact_match_accuracy}',
    f'Partial Accuracy: {partial_accuracy}',
    f'Count of Extra Keys: {extra_key_count}',
    f'Number of Rows with Extra Keys: {rows_with_extra_keys}',
    f'Number of Rows with Empty Predicted Labels: {empty_predicted_labels_count}',
    f'Total: {df.shape[0]}',
    sep="\n",
)

prompt_4
Exact Match Accuracy: 0.03471295060080107
Partial Accuracy: 0.26045838896306184
Count of Extra Keys: 1284
Number of Rows with Extra Keys: 510
Number of Rows with Empty Predicted Labels: 207
Total: 749
