In [1]:
import sys
import os
sys.path.append("../src")
import llm_utils
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import ast
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

classes = ["War/Terror", "Conspiracy Theory", "Education", "Election Campaign", "Environment", 
              "Government/Public", "Health", "Immigration/Integration", 
              "Justice/Crime", "Labor/Employment", 
              "Macroeconomics/Economic Regulation", "Media/Journalism", "Religion", "Science/Technology"]


def calculate_binary_metrics_lora(df, class_full, extraction_function):
    prediction_per_class = None
    # Iterate through class labels and extract binary predictions
    pred_column_name = f"response"
    pred_column_df = df[df[pred_column_name].notna()].copy()
    pred_column_df[pred_column_name] = pred_column_df[pred_column_name].apply(extraction_function)
    prediction_per_class = pred_column_df

    confusion_matrices = {}
    classification_reports = {}
    pred_column_name = f"response"

    current_df = prediction_per_class
    
    # Ignore rows with NaN or invalid values in the predictions
    try:
        valid_rows = current_df[pred_column_name].notna()
        
        y_true = current_df.loc[valid_rows, 'annotations'].apply(lambda x: int(class_full in x))
        y_pred = current_df.loc[valid_rows, pred_column_name].astype(int)
    except KeyError:
        y_true = []
        y_pred = []
    except TypeError:
        y_true = []
        y_pred = []
    cm = confusion_matrix(y_true, y_pred)
    confusion_matrices[class_full] = cm
    cr = classification_report(y_true, y_pred, output_dict=True)
    classification_reports[class_full] = cr
    return prediction_per_class, confusion_matrices, classification_reports

def load_model(path, class_full):
    binary_df = pd.read_csv(path)
    extraction_function = llm_utils.get_extraction_function("extract_using_class_token", 1)
    _, confusion_matrices, classification_reports = calculate_binary_metrics_lora(binary_df, class_full, extraction_function)
    return {"confusion_matrices": confusion_matrices, "classification_reports": classification_reports}

binary_war_lora = load_model("../data/vicuna_4bit/lora/binary_war_v01/test_generic_test_0.csv", "War/Terror")
binary_conspiracy_theory_lora = load_model("../data/vicuna_4bit/lora/binary_conspiracy_theory_v01/test_generic_test_0.csv", "Conspiracy Theory")
binary_education_lora = load_model("../data/vicuna_4bit/lora/binary_education_v01/test_generic_test_0.csv", "Education")
binary_election_campaign_lora = load_model("../data/vicuna_4bit/lora/binary_election_campaign_v01/test_generic_test_0.csv", "Election Campaign")
binary_environment_lora = load_model("../data/vicuna_4bit/lora/binary_environment_v01/test_generic_test_0.csv", "Environment")
binary_government_public_lora = load_model("../data/vicuna_4bit/lora/binary_government_public_v01/test_generic_test_0.csv", "Government/Public")
binary_health_lora = load_model("../data/vicuna_4bit/lora/binary_health_v01/test_generic_test_0.csv", "Health")
binary_immigration_integration_lora = load_model("../data/vicuna_4bit/lora/binary_immigration_integration_v01/test_generic_test_0.csv", "Immigration/Integration")
binary_justice_crime_lora = load_model("../data/vicuna_4bit/lora/binary_justice_crime_v01/test_generic_test_0.csv", "Justice/Crime")
binary_labor_employment_lora = load_model("../data/vicuna_4bit/lora/binary_labor_employment_v01/test_generic_test_0.csv", "Labor/Employment")
binary_macroeconomics_economic_regulation_lora = load_model("../data/vicuna_4bit/lora/binary_macroeconomics_economic_regulation_v01/test_generic_test_0.csv", "Macroeconomics/Economic Regulation")
binary_media_journalism_lora = load_model("../data/vicuna_4bit/lora/binary_media_journalism_v01/test_generic_test_0.csv", "Media/Journalism")
binary_religion_lora = load_model("../data/vicuna_4bit/lora/binary_religion_v01/test_generic_test_0.csv", "Religion")
binary_science_technology_lora = load_model("../data/vicuna_4bit/lora/binary_science_technology_v01/test_generic_test_0.csv", "Science/Technology")
binary_others_lora = load_model("../data/vicuna_4bit/lora/binary_others_v01/test_generic_test_0.csv", "Others")

In [2]:
data_list = [binary_war_lora, binary_conspiracy_theory_lora, binary_education_lora, binary_election_campaign_lora, binary_environment_lora, 
             binary_government_public_lora, binary_health_lora, binary_immigration_integration_lora, binary_justice_crime_lora, binary_labor_employment_lora, 
             binary_macroeconomics_economic_regulation_lora, binary_media_journalism_lora, binary_religion_lora, binary_science_technology_lora, binary_others_lora]

In [3]:
llm_utils.classification_reports_to_df(data_list)

AttributeError: 'list' object has no attribute 'items'

In [4]:
# Extracting metrics and constructing DataFrame
rows = []
for data in data_list:
    for label, report in data['classification_reports'].items():
        row = {
            'label': label,
            'f1_score_macro': report['macro avg']['f1-score'],
            'precision_macro': report['macro avg']['precision'],
            'recall_macro': report['macro avg']['recall'],
            'support_macro': report['macro avg']['support'],
            'f1_score_class_0': report['0']['f1-score'],
            'support_class_0': report['0']['support'],
            'f1_score_class_1': report['1']['f1-score'],
            'support_class_1': report['1']['support'],
            'precision_class_0': report['0']['precision'],
            'recall_class_0': report['0']['recall'],
            'precision_class_1': report['1']['precision'],
            'recall_class_1': report['1']['recall']
        }
        rows.append(row)

df = pd.DataFrame(rows)
print(df)

                                 label  f1_score_macro  precision_macro  recall_macro  support_macro  f1_score_class_0  support_class_0  f1_score_class_1  support_class_1  precision_class_0  recall_class_0  precision_class_1  recall_class_1
0                           War/Terror        0.934538         0.932357      0.936781           1000          0.966353              745          0.902724              255           0.968961        0.963758           0.895753        0.909804
1                    Conspiracy Theory        0.571270         0.572070      0.797906           1000          0.881671              955          0.260870               45           0.988296        0.795812           0.155844        0.800000
2                            Education        0.455351         0.512493      0.678953            949          0.849633              936          0.061069               13           0.992857        0.742521           0.032129        0.615385
3                    Election Campai

In [24]:
# Calculate TP, FP, and FN for class 1 for each label
df['TP_class_1'] = df['precision_class_1'] * df['recall_class_1'] * df['support_class_1'] / (df['precision_class_1'] + df['recall_class_1'])
df['FP_class_1'] = df['support_class_1'] - df['TP_class_1']
df['FN_class_1'] = df['support_class_1'] - df['TP_class_1']

# Calculate micro-average precision and recall for class 1
precision_micro = df['TP_class_1'].sum() / (df['TP_class_1'].sum() + df['FP_class_1'].sum())
recall_micro = df['TP_class_1'].sum() / (df['TP_class_1'].sum() + df['FN_class_1'].sum())

# Calculate micro-average F1 score for class 1
f1_micro = 2 * precision_micro * recall_micro / (precision_micro + recall_micro)

print(f1_micro)

0.3035672108984692


In [23]:
df["f1_score_class_1"].mean()

0.3058254649872635

In [17]:
binary_war_df = pd.read_csv("../data/vicuna_4bit/generic_prompt_without_context_only_classification/generic_test_0.csv")
extraction_function = llm_utils.get_extraction_function("extract_using_class_token", 1)
binary_war_predictions_per_class, confusion_matrices, classification_reports = llm_utils.calculate_binary_metrics(binary_war_df, ["War/Terror"], extraction_function)
binary_war = {"confusion_matrices": confusion_matrices, "classification_reports": classification_reports}

In [22]:
import prompt_utils
def calculate_fbeta_score(beta, precision, recall):
    if precision == 0 and recall == 0:
        return 0.0
    return (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)

avg_fbeta_score_class_1_low_0_25 = 0
try:
    for class_ in prompt_utils.LOW_F1_LABELS:
        avg_precision_class_1_low = df[df.label == class_]["precision_class_1"]
        avg_recall_class_1_low = df[df.label == class_]["recall_class_1"]
        avg_precision_class_1_low = avg_precision_class_1_low.values[0]
        #print(avg_precision_class_1_low)
        avg_recall_class_1_low = avg_recall_class_1_low.values[0]
        #print(avg_recall_class_1_low)
        avg_fbeta_score_class_1_low_0_25 = avg_fbeta_score_class_1_low_0_25 + calculate_fbeta_score(0.25, avg_precision_class_1_low, avg_recall_class_1_low)
        print(avg_recall_class_1_low, avg_precision_class_1_low, calculate_fbeta_score(0.25, avg_precision_class_1_low, avg_recall_class_1_low))
    print(avg_fbeta_score_class_1_low_0_25)
except Exception as e:
    print("Error", e)
avg_fbeta_score_class_1_low_0_25 = avg_fbeta_score_class_1_low_0_25/len(prompt_utils.LOW_F1_LABELS)
avg_fbeta_score_class_1_low_0_25

0.8 0.15584415584415584 0.16359262229350438
0.6153846153846154 0.0321285140562249 0.03402551913935451
0.21428571428571427 0.008595988538681949 0.009110396570203644
0.8148148148148148 0.06451612903225806 0.06821083348531826
0.0 0.0 0.0
0.45454545454545453 0.016129032258064516 0.017099175216254273
0.292038546704635


0.04867309111743917