In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Labels the model may answer with. Order matters: the numeric fallback in
# extract_response maps "1" to options[0] and "2" to options[1].
options = ['acceptable', 'unacceptable']

def extract_response(sent):
    """Extract the option label(s) mentioned in a model response.

    Matching is case-insensitive and substring based, tried in this order:

    1. ``'unacceptable'`` -- checked first because it contains
       ``'acceptable'`` as a substring.
    2. ``'acceptable'``.
    3. Numeric fallback: the digit ``'1'`` selects ``options[0]`` and ``'2'``
       selects ``options[1]``; both may match.

    Args:
        sent: The raw response. Any object is accepted and converted with
            ``str``; ``None`` and float NaN are treated as "no response".

    Returns:
        A list of matched option labels: empty when nothing matched (or the
        response is missing), one element for an unambiguous answer, two
        elements when both digits appear in the text.
    """
    # NaN never compares equal to anything (including itself), so a missing
    # value has to be detected explicitly, before it is stringified to 'nan'.
    if sent is None or (isinstance(sent, float) and np.isnan(sent)):
        return []

    gen = str(sent).lower()
    if not gen:
        return []

    # Explicit label words win over digits; the longer label is tested first.
    for opt in (options[1], options[0]):
        if opt in gen:
            return [opt]

    # Fall back to 1-based option numbers appearing anywhere in the text.
    return [opt for number, opt in enumerate(options, start=1) if str(number) in gen]

In [3]:
# Score one model's acceptable/unacceptable judgements for every response run
# and collect per-region accuracy and F1 into `analysis_array`.
model = "llama31" # Change this to the model response you want to analyze
n_responses = 3

groups = ["dining","business","visits","travel"]
ord_regions = ['NE', 'INDIA', 'EA', 'LA', 'MEA']
n_regions = len(ord_regions)

# analysis_array[metric, row, run]
#   metric: 0 = accuracy, 1 = F1 score
#   row:    0..n_regions-1 = regions in `ord_regions` order,
#           n_regions      = average over regions,
#           -1             = total number of abstentions (same value for both metrics)
#   run:    response_number - 1
analysis_array = np.zeros((2, n_regions + 2, n_responses))


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


for response_number in range(1, n_responses + 1):
    run = response_number - 1
    response_path = os.path.abspath(f"../../response_{response_number}/esensi/{model}")
    response_files = os.listdir(response_path)
    # The region is the last "_"-separated token of the file name, without extension.
    regions = [response_file.split("_")[-1].split(".")[0] for response_file in response_files]

    analysis_dict = {region: {g:{"total":0,"abstain":0,"wrong":0,"correct":0} for g in groups} for region in regions}
    true_label_dict = {region: {g:{"total":0,"positive":0,"negative":0} for g in groups} for region in regions}
    response_rates = {region: {g:{"true_positive":0,"false_positive":0,"true_negative":0,"false_negative":0} for g in groups} for region in regions}

    # Read the response files and calculate the data required for analysis
    for file in response_files:
        df = pd.read_csv(os.path.join(response_path, file))
        region = file.split("_")[-1].split(".")[0]
        for grp, true_label, sentence in zip(df["group"], df["true_label"], df[f'{model}_sentence']):
            label_counts = true_label_dict[region][grp]
            outcome_counts = analysis_dict[region][grp]
            rate_counts = response_rates[region][grp]

            label_counts["total"] += 1
            # Any label other than "positive" is counted as negative.
            if true_label == "positive":
                label_counts["positive"] += 1
            else:
                label_counts["negative"] += 1

            outcome_counts["total"] += 1
            response = extract_response(sentence)

            # No option, or more than one option, in the response counts as an abstention.
            if len(response) != 1:
                outcome_counts["abstain"] += 1
                continue

            predicted = response[0]
            if predicted == 'acceptable' and true_label == "positive":
                outcome_counts["correct"] += 1
                rate_counts["true_positive"] += 1
            elif predicted == 'unacceptable' and true_label == "negative":
                outcome_counts["correct"] += 1
                rate_counts["true_negative"] += 1
            elif predicted == 'acceptable' and true_label == "negative":
                outcome_counts["wrong"] += 1
                rate_counts["false_positive"] += 1
            else:
                outcome_counts["wrong"] += 1
                rate_counts["false_negative"] += 1

    # Calculate the total number of abstentions for a particular response_number
    ab = sum(analysis_dict[region][grp]["abstain"] for region in regions for grp in groups)

    analysis_array[0, -1, run] = ab
    analysis_array[1, -1, run] = ab

    results_acc = []
    results_f1 = []

    # Calculate the accuracy and f1 score for each region along with their averages across regions.
    for r in ord_regions:
        r = r.lower()  # region keys parsed from the file names are expected in lower case
        corrects = sum(analysis_dict[r][g]["correct"] for g in groups)
        wrongs = sum(analysis_dict[r][g]["wrong"] for g in groups)
        abstains = sum(analysis_dict[r][g]["abstain"] for g in groups)
        total = corrects + wrongs + abstains

        true_positives = sum(response_rates[r][g]["true_positive"] for g in groups)
        false_positives = sum(response_rates[r][g]["false_positive"] for g in groups)
        false_negatives = sum(response_rates[r][g]["false_negative"] for g in groups)

        # Abstentions lower accuracy (they are part of `total`) but do not enter
        # precision/recall, which only look at rows with a single extracted answer.
        # Degenerate cases (empty denominators) are reported as 0.0 rather than
        # aborting the whole run with ZeroDivisionError.
        precision = _safe_ratio(true_positives, true_positives + false_positives)
        recall = _safe_ratio(true_positives, true_positives + false_negatives)
        accuracy = _safe_ratio(corrects, total)
        f1_score = _safe_ratio(2 * precision * recall, precision + recall)

        print(f"Region: {r}")
        print(f"Accuracy: {accuracy}")
        print(f"f1 score: {f1_score}")
        results_acc.append(accuracy)
        results_f1.append(f1_score)
        print("\n")

    average_acc = sum(results_acc) / len(results_acc)
    average_f1 = sum(results_f1) / len(results_f1)
    print(f"Average Accuracy: {average_acc}")
    print(f"Average F1 Score: {average_f1}")

    analysis_array[0, n_regions, run] = average_acc
    analysis_array[1, n_regions, run] = average_f1

    # Tuple indexing is required here: chained indexing such as
    # analysis_array[0][:5][run] selects a single *row* (one region across all
    # runs) instead of the column for this run.
    analysis_array[0, :n_regions, run] = results_acc
    analysis_array[1, :n_regions, run] = results_f1


In [None]:
# Calculate the mean and standard deviation across all responses for each region as well as average accuracy and f1 score.
# analysis_array[k] is a 2-D slice, so the response-run axis is its last axis
# (axis=2 is out of bounds for a 2-D array).
print(np.mean(analysis_array[0], axis=-1))
print(np.mean(analysis_array[1], axis=-1))
print(np.std(analysis_array[0], axis=-1))
print(np.std(analysis_array[1], axis=-1))