In [26]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torcheval.metrics import MultilabelAccuracy
import torch
from prettytable import PrettyTable


In [2]:
df1 = pd.read_csv("results/phi3_n_shot/phi3_one_shot.csv")
df2 = pd.read_csv("results/phi3_n_shot/phi3_two_shot.csv")
df3 = pd.read_csv("results/phi3_n_shot/phi3_three_shot.csv")
df5 = pd.read_csv("results/phi3_n_shot/phi3_five_shot.csv")

In [3]:
df = df1[["text", "category", "polarity", "joint", "category_labels", "polarity_labels", "joint_labels", "true_labels", "one_shot_result"]]

In [4]:
df["two_shot_result"] = df2["two_shot_result"]
df["three_shot_result"] = df3["three_shot_result"]
df["five_shot_result"] = df5["five_shot_result"]

In [5]:
df.head()

Unnamed: 0,text,category,polarity,joint,category_labels,polarity_labels,joint_labels,true_labels,one_shot_result,two_shot_result,three_shot_result,five_shot_result
0,bread top notch well,['food'],['positive'],['food#positive'],[0 1 0 0 0],[0 0 1],[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0],"{'ambience': None, 'food': 'positive', 'other'...","{'quality':'high','food':'positive'}",{'bread':'positive'},{'food':'positive'},{'food':'positive'}
1,say one fastest delivery times city,['service'],['positive'],['service#positive'],[0 0 0 0 1],[0 0 1],[0 0 0 0 0 0 0 0 0 0 0 0 0 0 1],"{'ambience': None, 'food': None, 'other': None...",The fastest delivery times in the city typica...,The fastest delivery times in the city typica...,The fastest delivery times in the city typica...,The fastest delivery times in the city typica...
2,food always fresh ready eat,['food'],['positive'],['food#positive'],[0 1 0 0 0],[0 0 1],[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0],"{'ambience': None, 'food': 'positive', 'other'...","{'food':'positive','ready_to_eat':'positive'}",{'food':'positive'},{'food':'positive'},{'food':'positive'}
3,mention coffee outstanding,['food'],['positive'],['food#positive'],[0 1 0 0 0],[0 0 1],[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0],"{'ambience': None, 'food': 'positive', 'other'...",{'coffee':'positive'},{'coffee':'positive'},The coffee is outstanding.,"{'food':'not mentioned','service':'not mentio..."
4,trust people go sushi never disappoints,['other'],['positive'],['other#positive'],[0 0 1 0 0],[0 0 1],[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0],"{'ambience': None, 'food': None, 'other': 'pos...","{'trust':'positive','sushi':'positive','never...","{'trust':'positive','sushi':'positive'}",{'food':'positive'},{'food':'positive'}


In [9]:
df["category_labels"] = df["category_labels"].apply(lambda x: [int(y) for y in x.strip("[").strip("]").split(" ")])
df["polarity_labels"] = df["polarity_labels"].apply(lambda x: [int(y) for y in x.strip("[").strip("]").split(" ")])
df["joint_labels"]    =    df["joint_labels"].apply(lambda x: [int(y) for y in x.strip("[").strip("]").split(" ")])

In [10]:
filtered = {
    "true_labels": [],
    "one_shot_result": [],
    "two_shot_result": [],
    "three_shot_result":  [],
    "five_shot_result": []
}
for i, x in df.iterrows():
    filtered["true_labels"].append({k:v for k,v in eval(x["true_labels"]).items() if v is not None})

    for shot in ['one_shot_result', 'two_shot_result', 'three_shot_result', 'five_shot_result']: 
        try:
            _dict = eval(x[shot])
            if not isinstance(_dict, dict):
                _dict = {}
        except:
            _dict = {}

        filtered[shot].append(_dict)

In [11]:
for k,v in filtered.items():
    df[k] = filtered[k]

In [12]:
# Allowed keys and values
allowed_keys = ["food", "service", "ambience", "price", "anecdotes/miscellaneous"]
allowed_values = ["positive", "neutral", "negative"]


In [13]:
def get_singular_cols(df, col):
    aspects =  []
    sentiments = []

    # Iterate through the DataFrame and process each row
    for index, row in df.iterrows():
        label_dict = row[col]
    
        # Filter keys and values
        filtered_keys = list(set([key for key in label_dict.keys() if key in allowed_keys]))
        filtered_values = list(set([value for value in label_dict.values() if value in allowed_values]))

        filtered_keys.sort()
        filtered_values.sort()
    
        aspects.append(filtered_keys)
        sentiments.append(filtered_values)
    
    return aspects, sentiments

In [14]:
for shot in ['one_shot_result', 'two_shot_result', 'three_shot_result', 'five_shot_result']:
    aspects, sentiments = get_singular_cols(df, shot)
    
    df[f"aspect_{shot}"] = aspects
    df[f"sentiment_{shot}"] = sentiments

In [15]:
df.head(2)

Unnamed: 0,text,category,polarity,joint,category_labels,polarity_labels,joint_labels,true_labels,one_shot_result,two_shot_result,three_shot_result,five_shot_result,aspect_one_shot_result,sentiment_one_shot_result,aspect_two_shot_result,sentiment_two_shot_result,aspect_three_shot_result,sentiment_three_shot_result,aspect_five_shot_result,sentiment_five_shot_result
0,bread top notch well,['food'],['positive'],['food#positive'],"[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{'food': 'positive'},"{'quality': 'high', 'food': 'positive'}",{'bread': 'positive'},{'food': 'positive'},{'food': 'positive'},[food],[positive],[],[positive],[food],[positive],[food],[positive]
1,say one fastest delivery times city,['service'],['positive'],['service#positive'],"[0, 0, 0, 0, 1]","[0, 0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",{'service': 'positive'},{},{},{},{},[],[],[],[],[],[],[],[]


# Multi label binarizer

In [16]:
category_mlb = MultiLabelBinarizer()
sentiment_mlb = MultiLabelBinarizer()

category_mlb.fit([["food", "service", "ambience", "price", "anecdotes/miscellaneous"]])
sentiment_mlb.fit([["positive", "neutral", "negative"]])

for shot in ['one_shot_result', 'two_shot_result', 'three_shot_result', 'five_shot_result']:
    df[f"aspect_{shot}_label"] = category_mlb.transform(df[f"aspect_{shot}"]).tolist()
    df[f"sentiment_{shot}_label"] = sentiment_mlb.transform(df[f"sentiment_{shot}"]).tolist()

    aspect_idx_to_text = dict(zip(range(len(category_mlb.classes_)), category_mlb.classes_))
    category_idx_to_text = dict(zip(range(len(sentiment_mlb.classes_)), sentiment_mlb.classes_))
    print(shot)
    print(category_idx_to_text)
    print(aspect_idx_to_text)
    print()


one_shot_result
{0: 'negative', 1: 'neutral', 2: 'positive'}
{0: 'ambience', 1: 'anecdotes/miscellaneous', 2: 'food', 3: 'price', 4: 'service'}

two_shot_result
{0: 'negative', 1: 'neutral', 2: 'positive'}
{0: 'ambience', 1: 'anecdotes/miscellaneous', 2: 'food', 3: 'price', 4: 'service'}

three_shot_result
{0: 'negative', 1: 'neutral', 2: 'positive'}
{0: 'ambience', 1: 'anecdotes/miscellaneous', 2: 'food', 3: 'price', 4: 'service'}

five_shot_result
{0: 'negative', 1: 'neutral', 2: 'positive'}
{0: 'ambience', 1: 'anecdotes/miscellaneous', 2: 'food', 3: 'price', 4: 'service'}



## Accuracies

In [17]:
df.head()

Unnamed: 0,text,category,polarity,joint,category_labels,polarity_labels,joint_labels,true_labels,one_shot_result,two_shot_result,...,aspect_five_shot_result,sentiment_five_shot_result,aspect_one_shot_result_label,sentiment_one_shot_result_label,aspect_two_shot_result_label,sentiment_two_shot_result_label,aspect_three_shot_result_label,sentiment_three_shot_result_label,aspect_five_shot_result_label,sentiment_five_shot_result_label
0,bread top notch well,['food'],['positive'],['food#positive'],"[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{'food': 'positive'},"{'quality': 'high', 'food': 'positive'}",{'bread': 'positive'},...,[food],[positive],"[0, 0, 1, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0]","[0, 0, 1]","[0, 0, 1, 0, 0]","[0, 0, 1]","[0, 0, 1, 0, 0]","[0, 0, 1]"
1,say one fastest delivery times city,['service'],['positive'],['service#positive'],"[0, 0, 0, 0, 1]","[0, 0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]",{'service': 'positive'},{},{},...,[],[],"[0, 0, 0, 0, 0]","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0]"
2,food always fresh ready eat,['food'],['positive'],['food#positive'],"[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{'food': 'positive'},"{'food': 'positive', 'ready_to_eat': 'positive'}",{'food': 'positive'},...,[food],[positive],"[0, 0, 1, 0, 0]","[0, 0, 1]","[0, 0, 1, 0, 0]","[0, 0, 1]","[0, 0, 1, 0, 0]","[0, 0, 1]","[0, 0, 1, 0, 0]","[0, 0, 1]"
3,mention coffee outstanding,['food'],['positive'],['food#positive'],"[0, 1, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",{'food': 'positive'},{'coffee': 'positive'},{'coffee': 'positive'},...,"[food, service]",[positive],"[0, 0, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0]","[0, 0, 0]","[0, 0, 1, 0, 1]","[0, 0, 1]"
4,trust people go sushi never disappoints,['other'],['positive'],['other#positive'],"[0, 0, 1, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]",{'other': 'positive'},"{'trust': 'positive', 'sushi': 'positive', 'ne...","{'trust': 'positive', 'sushi': 'positive'}",...,[food],[positive],"[0, 0, 0, 0, 0]","[0, 0, 1]","[0, 0, 0, 0, 0]","[0, 0, 1]","[0, 0, 1, 0, 0]","[0, 0, 1]","[0, 0, 1, 0, 0]","[0, 0, 1]"


In [18]:
def model_metrics(test_labels, predictions):
    accuracy = accuracy_score(test_labels, predictions)

    exact_accuracy = MultilabelAccuracy(criteria='exact_match')
    exact_accuracy.update(target=torch.Tensor(test_labels), input=torch.Tensor(predictions))

    overlap_accuracy = MultilabelAccuracy(criteria='overlap')
    overlap_accuracy.update(target=torch.Tensor(test_labels), input=torch.Tensor(predictions))

    macro_f1 = f1_score(test_labels, predictions, average='macro')
    micro_f1 = f1_score(test_labels, predictions, average='micro')

    metrics = {}
    metrics = {
        'accuracy' : accuracy,
        'exact_match_accuracy': exact_accuracy.compute().detach().item(),
        'overlap_accuracy': overlap_accuracy.compute().detach().item(),
        'macro_f1': macro_f1,
        'micro_f1': micro_f1
    }

    return metrics

def pretty_table(dict):
    table = PrettyTable()
    table.field_names = ['metric', 'value']
    for k,v in dict.items():
        table.add_row([k, v])
    print(table)


In [29]:
true = "category_labels"
pred = "aspect_one_shot_result_label"
print(pred, true)
pretty_table(model_metrics(np.array(df[pred].tolist()), np.array(df[true].tolist())))

aspect_one_shot_result_label category_labels
+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       | 0.04138851802403204 |
| exact_match_accuracy | 0.04138851910829544 |
|   overlap_accuracy   | 0.19893190264701843 |
|       macro_f1       | 0.24078617105508995 |
|       micro_f1       | 0.23216783216783216 |
+----------------------+---------------------+


In [30]:
true = "category_labels"
pred = "aspect_two_shot_result_label"
print(pred, true)
pretty_table(model_metrics(np.array(df[pred].tolist()), np.array(df[true].tolist())))

aspect_two_shot_result_label category_labels
+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       | 0.06408544726301736 |
| exact_match_accuracy |  0.0640854462981224 |
|   overlap_accuracy   | 0.22963951528072357 |
|       macro_f1       |  0.3220310892414962 |
|       micro_f1       |  0.2565880721220527 |
+----------------------+---------------------+


In [31]:
true = "category_labels"
pred = "aspect_three_shot_result_label"
print(pred, true)
pretty_table(model_metrics(np.array(df[pred].tolist()), np.array(df[true].tolist())))

aspect_three_shot_result_label category_labels
+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       | 0.10947930574098798 |
| exact_match_accuracy | 0.10947930812835693 |
|   overlap_accuracy   |  0.2563417851924896 |
|       macro_f1       |  0.3383466485929315 |
|       micro_f1       | 0.27278865631330185 |
+----------------------+---------------------+


In [32]:
true = "category_labels"
pred = "aspect_five_shot_result_label"
print(pred, true)
pretty_table(model_metrics(np.array(df[pred].tolist()), np.array(df[true].tolist())))

aspect_five_shot_result_label category_labels
+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       |  0.1081441922563418 |
| exact_match_accuracy | 0.10814419388771057 |
|   overlap_accuracy   | 0.30974632501602173 |
|       macro_f1       | 0.36887399086628386 |
|       micro_f1       |  0.2930232558139535 |
+----------------------+---------------------+


In [33]:
true = "polarity_labels"
pred = "sentiment_one_shot_result_label"
print(pred, true)
pretty_table(model_metrics(np.array(df[pred].tolist()), np.array(df[true].tolist())))

sentiment_one_shot_result_label polarity_labels
+----------------------+---------------------+
|        metric        |        value        |
+----------------------+---------------------+
|       accuracy       | 0.40587449933244324 |
| exact_match_accuracy | 0.40587449073791504 |
|   overlap_accuracy   |  0.5180240273475647 |
|       macro_f1       |  0.4588083545899923 |
|       micro_f1       |  0.6064318529862175 |
+----------------------+---------------------+


In [34]:
true = "polarity_labels"
pred = "sentiment_two_shot_result_label"
print(pred, true)
pretty_table(model_metrics(np.array(df[pred].tolist()), np.array(df[true].tolist())))

sentiment_two_shot_result_label polarity_labels
+----------------------+--------------------+
|        metric        |       value        |
+----------------------+--------------------+
|       accuracy       | 0.5660881174899867 |
| exact_match_accuracy | 0.5660881400108337 |
|   overlap_accuracy   | 0.672897219657898  |
|       macro_f1       | 0.5491899808854452 |
|       micro_f1       | 0.708507670850767  |
+----------------------+--------------------+


In [35]:
true = "polarity_labels"
pred = "sentiment_three_shot_result_label"
print(pred, true)
pretty_table(model_metrics(np.array(df[pred].tolist()), np.array(df[true].tolist())))

sentiment_three_shot_result_label polarity_labels
+----------------------+--------------------+
|        metric        |       value        |
+----------------------+--------------------+
|       accuracy       | 0.6421895861148198 |
| exact_match_accuracy | 0.6421895623207092 |
|   overlap_accuracy   | 0.6982643604278564 |
|       macro_f1       | 0.5494570887882732 |
|       micro_f1       | 0.7407407407407407 |
+----------------------+--------------------+


In [36]:
true = "polarity_labels"
pred = "sentiment_five_shot_result_label"
print(pred, true)
pretty_table(model_metrics(np.array(df[pred].tolist()), np.array(df[true].tolist())))

sentiment_five_shot_result_label polarity_labels
+----------------------+--------------------+
|        metric        |       value        |
+----------------------+--------------------+
|       accuracy       | 0.6728971962616822 |
| exact_match_accuracy | 0.672897219657898  |
|   overlap_accuracy   | 0.7556742429733276 |
|       macro_f1       | 0.6322621093278159 |
|       micro_f1       | 0.756578947368421  |
+----------------------+--------------------+


## LLM metrics

In [37]:
def calculate_exact_match_accuracy(df, pred_col_name):
    exact_matches = df.apply(lambda row: row['true_labels'] == row[pred_col_name], axis=1)
    exact_match_accuracy = exact_matches.mean()
    return exact_match_accuracy

def calculate_partial_accuracy(df, pred_col_name):
    def partial_match(row):
        true_keys = set(row['true_labels'].keys())
        pred_keys = set(row[pred_col_name].keys())
        matching_keys = true_keys & pred_keys
        if len(true_keys) == 0:
            return 0
        return len(matching_keys) / len(true_keys)
    
    partial_accuracies = df.apply(partial_match, axis=1)
    partial_accuracy = partial_accuracies.mean()
    return partial_accuracy

def count_extra_keys(df, pred_col_name):
    def extra_key_count(row):
        true_keys = set(row['true_labels'].keys())
        pred_keys = set(row[pred_col_name].keys())
        extra_keys = pred_keys - true_keys
        return len(extra_keys)
    
    extra_keys_counts = df.apply(extra_key_count, axis=1)
    return extra_keys_counts.sum()

def count_rows_with_extra_keys(df, pred_col_name):
    def has_extra_keys(row):
        true_keys = set(row['true_labels'].keys())
        pred_keys = set(row[pred_col_name].keys())
        extra_keys = pred_keys - true_keys
        return len(extra_keys) > 0
    
    rows_with_extra_keys = df.apply(has_extra_keys, axis=1)
    return rows_with_extra_keys.sum()

def count_empty_predicted_labels(df, pred_col):
    empty_predicted_labels = df[pred_col].apply(lambda x: len(x) == 0)
    return empty_predicted_labels.sum()

In [38]:
pred_col = "one_shot_result"
print(pred_col)

exact_match_accuracy = calculate_exact_match_accuracy(df, pred_col)
partial_accuracy = calculate_partial_accuracy(df, pred_col)
extra_key_count = count_extra_keys(df, pred_col)
rows_with_extra_keys = count_rows_with_extra_keys(df, pred_col)
empty_predicted_labels_count = count_empty_predicted_labels(df, pred_col)


print(f'Exact Match Accuracy: {exact_match_accuracy}')
print(f'Partial Accuracy: {partial_accuracy}')
print(f'Count of Extra Keys: {extra_key_count}')
print(f'Number of Rows with Extra Keys: {rows_with_extra_keys}')
print(f'Number of Rows with Empty Predicted Labels: {empty_predicted_labels_count}')
print(f'Total: {df.shape[0]}')

one_shot_result
Exact Match Accuracy: 0.03471295060080107
Partial Accuracy: 0.26045838896306184
Count of Extra Keys: 1284
Number of Rows with Extra Keys: 510
Number of Rows with Empty Predicted Labels: 207
Total: 749


In [39]:
pred_col = "two_shot_result"
print(pred_col)

exact_match_accuracy = calculate_exact_match_accuracy(df, pred_col)
partial_accuracy = calculate_partial_accuracy(df, pred_col)
extra_key_count = count_extra_keys(df, pred_col)
rows_with_extra_keys = count_rows_with_extra_keys(df, pred_col)
empty_predicted_labels_count = count_empty_predicted_labels(df, pred_col)

print(f'Exact Match Accuracy: {exact_match_accuracy}')
print(f'Partial Accuracy: {partial_accuracy}')
print(f'Count of Extra Keys: {extra_key_count}')
print(f'Number of Rows with Extra Keys: {rows_with_extra_keys}')
print(f'Number of Rows with Empty Predicted Labels: {empty_predicted_labels_count}')
print(f'Total: {df.shape[0]}')

two_shot_result
Exact Match Accuracy: 0.14419225634178906
Partial Accuracy: 0.3835113484646195
Count of Extra Keys: 924
Number of Rows with Extra Keys: 480
Number of Rows with Empty Predicted Labels: 128
Total: 749


In [40]:
pred_col = "three_shot_result"
print(pred_col)

exact_match_accuracy = calculate_exact_match_accuracy(df, pred_col)
partial_accuracy = calculate_partial_accuracy(df, pred_col)
extra_key_count = count_extra_keys(df, pred_col)
rows_with_extra_keys = count_rows_with_extra_keys(df, pred_col)
empty_predicted_labels_count = count_empty_predicted_labels(df, pred_col)

print(f'Exact Match Accuracy: {exact_match_accuracy}')
print(f'Partial Accuracy: {partial_accuracy}')
print(f'Count of Extra Keys: {extra_key_count}')
print(f'Number of Rows with Extra Keys: {rows_with_extra_keys}')
print(f'Number of Rows with Empty Predicted Labels: {empty_predicted_labels_count}')
print(f'Total: {df.shape[0]}')

three_shot_result
Exact Match Accuracy: 0.2923898531375167
Partial Accuracy: 0.46517578994214503
Count of Extra Keys: 491
Number of Rows with Extra Keys: 331
Number of Rows with Empty Predicted Labels: 115
Total: 749


In [41]:
pred_col = "five_shot_result"
print(pred_col)

exact_match_accuracy = calculate_exact_match_accuracy(df, pred_col)
partial_accuracy = calculate_partial_accuracy(df, pred_col)
extra_key_count = count_extra_keys(df, pred_col)
rows_with_extra_keys = count_rows_with_extra_keys(df, pred_col)
empty_predicted_labels_count = count_empty_predicted_labels(df, pred_col)

print(f'Exact Match Accuracy: {exact_match_accuracy}')
print(f'Partial Accuracy: {partial_accuracy}')
print(f'Count of Extra Keys: {extra_key_count}')
print(f'Number of Rows with Extra Keys: {rows_with_extra_keys}')
print(f'Number of Rows with Empty Predicted Labels: {empty_predicted_labels_count}')
print(f'Total: {df.shape[0]}')

five_shot_result
Exact Match Accuracy: 0.31108144192256343
Partial Accuracy: 0.4939919893190921
Count of Extra Keys: 534
Number of Rows with Extra Keys: 430
Number of Rows with Empty Predicted Labels: 28
Total: 749
