In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt

import re
import random
import json
from tqdm import tqdm
import os

import numpy as np
from collections import Counter
import os
import sys
sys.path.append("../src")
import prompt_utils
import llm_utils
from sklearn.metrics import fbeta_score
from sklearn.metrics import confusion_matrix, classification_report
from collections import Counter

# Vicuna
# With rules, classification only (beta 0.5: 0.76, beta 0.25: 0.86) (0.7-0.8s per prompt)
vicuna_base_path = "../data/vicuna_4bit/"
vicuna_with_rules_classification_only_name = "generic_prompt_with_rules_only_classification"
vicuna_with_rules_classification_only_func = prompt_utils.get_vicuna_prompt_with_rules_only_classification

# OA LLAMA
# Classification only V03 (beta 0.5: 0.81, beta 0.25: 0.83)
# With rules, classification only (beta 0.5: 0.8, beta 0.25: 0.89)
# With 3 random examples, classification only (beta 0.5: 0.78, beta 0.25: 0.88)
oa_base_path = "../data/openassistant_llama_30b_4bit/"
oa_classification_only_v03_name = "generic_prompt_without_context_only_classification_v03"
oa_classification_only_v03_func = prompt_utils.get_openassistant_llama_30b_4bit_without_context_only_classification_v03
oa_with_rules_classification_only_name = "generic_prompt_with_rules_only_classification"
oa_with_rules_classification_only_func = prompt_utils.get_openassistant_llama_30b_4bit_with_rules_only_classification
oa_with_3_random_examples_classification_only_name = "generic_prompt_few_shot_prompt_only_classification_3_random_example"
# The few-shot prompt builder gets its own name; it previously overwrote
# oa_with_rules_classification_only_func, losing the with-rules prompt builder.
oa_with_3_random_examples_classification_only_func = prompt_utils.get_openassistant_llama_30b_4bit_few_shot_prompt_only_classification_n_random_example

# Text Davinci
# Elaboration first V04 (beta 0.5: 0.87, beta 0.25: 0.93)
davinci_base_path = "../data/openai_text_davinci_003/"
davinci_elaboration_first_v04_name = "generic_prompt_without_context_elaboration_first_v04"
davinci_elaboration_first_v04_func = prompt_utils.get_openai_prompt_without_context_elaboration_first_v04

# Labeled dataset; its "train", "test" and "valid" splits are merged below
labeled_data_filename = "../data/labeled_data/generic_test_0.json"

with open(labeled_data_filename) as f:
    data = json.load(f)

# One frame per split, concatenated in the order train -> test -> valid
dfs = [pd.DataFrame(data[split_name]) for split_name in ("train", "test", "valid")]
df = dfs[-1]  # `df` stays bound to the "valid" frame, as before
df_all = pd.concat(dfs)

# Every entry of prompt_utils.ALL_LABELS except the last one
ALL_LABELS = prompt_utils.ALL_LABELS[:-1]
LOW_F1_LABELS = prompt_utils.LOW_F1_LABELS

In [2]:
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.preprocessing import LabelEncoder

def calculate_binary_metrics(df, classes, extraction_function):
    """Compute per-label binary predictions, confusion matrices and reports.

    For every label in ``classes`` the column ``"<label>_pred"`` of ``df`` is
    passed element-wise through ``extraction_function``. Rows whose extracted
    value is NaN/None are ignored for that label. The ground truth for a row
    is ``int(label in row['annotations'])``.

    Unlike the previous version, ``df`` is NOT modified: extracted values are
    written only to the returned prediction frame, so the function can be
    called repeatedly on the same frame.

    Args:
        df: DataFrame with an ``annotations`` column and, optionally, one
            ``"<label>_pred"`` column per label.
        classes: Iterable of label names.
        extraction_function: Callable applied to each raw prediction cell;
            its non-null results must be convertible with ``astype(int)``.

    Returns:
        Tuple ``(class_predictions, confusion_matrices,
        binary_classification_reports)``. ``class_predictions`` shares
        ``df``'s index and holds one column per label that had a prediction
        column; the two dicts are keyed by label.

    Raises:
        KeyError: If ``df`` has a prediction column but no ``annotations``
            column. Errors raised by ``extraction_function`` also propagate
            instead of being silently swallowed.
    """
    class_predictions = pd.DataFrame(index=df.index)
    confusion_matrices = {}
    binary_classification_reports = {}

    for label in classes:
        pred_column_name = f"{label}_pred"

        if pred_column_name in df.columns:
            # Store the extracted values in the result frame only, leaving the
            # caller's frame untouched.
            class_predictions[pred_column_name] = df[pred_column_name].apply(extraction_function)

            # Ignore rows with NaN or invalid values in the predictions
            valid_rows = class_predictions[pred_column_name].notna()
            y_true = df.loc[valid_rows, 'annotations'].apply(lambda x: int(label in x))
            y_pred = class_predictions.loc[valid_rows, pred_column_name].astype(int)
        else:
            # No prediction column for this label (for example Others_pred):
            # the metrics are computed on empty inputs, as before.
            y_true = []
            y_pred = []

        confusion_matrices[label] = confusion_matrix(y_true, y_pred)
        binary_classification_reports[label] = classification_report(y_true, y_pred, output_dict=True)

    return class_predictions, confusion_matrices, binary_classification_reports

def calculate_majority_vote_classification_report(class_predictions_per_model, true_labels):
    """Build one classification report per prediction column from a majority vote.

    The columns of the first frame in ``class_predictions_per_model`` decide
    which prediction columns are evaluated. For each column, the values of
    all models are placed side by side (aligned on index) and the row-wise
    mode is taken; when several modes exist the first one returned by pandas
    is used. Rows without any vote are dropped.

    Args:
        class_predictions_per_model: Sequence of DataFrames that each hold
            ``"<label>_pred"`` columns.
        true_labels: Series of annotations; a row counts as positive for a
            label when ``label in annotation`` is true.

    Returns:
        Dict mapping each prediction column name to the ``output_dict`` form
        of its classification report.
    """
    reports_by_column = {}
    for column in class_predictions_per_model[0].columns:
        label = column.replace("_pred", "")

        # One column of votes per model, keyed by the model's position
        votes = pd.DataFrame(
            {position: predictions[column] for position, predictions in enumerate(class_predictions_per_model)}
        )

        # Row-wise mode; NaN rows are removed before casting to int
        winners = votes.mode(axis=1)[0].dropna().astype(int)

        # Ground truth restricted to the rows that received a vote
        is_positive = true_labels.apply(lambda x: int(label in x))
        targets = is_positive.loc[winners.index]

        reports_by_column[column] = classification_report(targets, winners, output_dict=True)

    return reports_by_column

def load_model(url, extraction_func, metrics_calculation_func):
    """Read a predictions CSV and evaluate it for every label in ALL_LABELS.

    Args:
        url: Path, URL or buffer accepted by ``pd.read_csv``.
        extraction_func: Forwarded unchanged to ``metrics_calculation_func``.
        metrics_calculation_func: Callable invoked as
            ``f(frame, ALL_LABELS, extraction_func)``; it must return three values.

    Returns:
        Tuple ``(predictions_per_class, confusion_matrices,
        binary_classification_reports)`` as produced by the metrics function.
    """
    predictions_frame = pd.read_csv(url)
    per_class, matrices, reports = metrics_calculation_func(
        predictions_frame, ALL_LABELS, extraction_func
    )
    return per_class, matrices, reports

# Load the stored predictions of every model and score them with calculate_binary_metrics.
# Only the Vicuna reports keep their own name; the other calls all rebind
# `binary_classification_reports`, so it ends up holding the Davinci reports.
vicuna_with_rules_classification_only, _, vicuna_binary_classification_reports = load_model(
    f"{vicuna_base_path}{vicuna_with_rules_classification_only_name}/generic_test_0.csv",
    llm_utils.get_extraction_function("extract_nth_character", 2),
    calculate_binary_metrics,
)
oa_classification_only_v03, _, binary_classification_reports = load_model(
    f"{oa_base_path}{oa_classification_only_v03_name}/generic_test_0.csv",
    llm_utils.get_extraction_function("extract_nth_character", 1),
    calculate_binary_metrics,
)
oa_with_rules_classification_only, _, binary_classification_reports = load_model(
    f"{oa_base_path}{oa_with_rules_classification_only_name}/generic_test_0.csv",
    llm_utils.get_extraction_function("extract_nth_character", 1),
    calculate_binary_metrics,
)
oa_with_3_random_examples_classification_only, _, binary_classification_reports = load_model(
    f"{oa_base_path}{oa_with_3_random_examples_classification_only_name}/generic_test_0.csv",
    llm_utils.get_extraction_function("extract_nth_character", 1),
    calculate_binary_metrics,
)
text_davinci_003_turbo_without_context_elaboration_first_v04, _, binary_classification_reports = load_model(
    f"{davinci_base_path}{davinci_elaboration_first_v04_name}/generic_test_0.csv",
    llm_utils.get_extraction_function("extract_using_class_token", 1),
    calculate_binary_metrics,
)

# Replace the concatenated index of df_all with a fresh 0..n-1 index
df_all.reset_index(drop=True, inplace=True)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
# Majority vote over three models, scored against the annotations in df_all
majority_vote_classification_report_3_top_models = calculate_majority_vote_classification_report(
    [
        oa_with_rules_classification_only,
        oa_with_3_random_examples_classification_only,
        text_davinci_003_turbo_without_context_elaboration_first_v04,
    ],
    df_all["annotations"],
)

# Majority vote over all five loaded models
majority_vote_classification_report_5_top_models = calculate_majority_vote_classification_report(
    [
        vicuna_with_rules_classification_only,
        oa_classification_only_v03,
        oa_with_rules_classification_only,
        oa_with_3_random_examples_classification_only,
        text_davinci_003_turbo_without_context_elaboration_first_v04,
    ],
    df_all["annotations"],
)

In [4]:
# Convert each report dict to a frame and strip the "_pred" marker from the label column
df_majority_vote_classification_report_3_top_models = (
    llm_utils.classification_reports_to_df(majority_vote_classification_report_3_top_models)
    .assign(label=lambda report: report["label"].str.replace("_pred", ""))
)
df_majority_vote_classification_report_5_top_models = (
    llm_utils.classification_reports_to_df(majority_vote_classification_report_5_top_models)
    .assign(label=lambda report: report["label"].str.replace("_pred", ""))
)

  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({


## Top 3 Models Majority Vote results:

In [5]:
# Aggregate scores for the three-model majority vote (second argument 0.5 is passed through as-is)
(
    avg_f1_class_0,
    avg_f1_class_1,
    avg_f1_class_1_low,
    avg_f1_score,
    avg_accuracy,
    fbeta_score_class_0,
    fbeta_score_class_1,
    avg_fbeta_score,
    avg_fbeta_score_low,
    avg_fbeta_score_low_0_25,
) = llm_utils.calculate_metrics_streamlit(df_majority_vote_classification_report_3_top_models, 0.5)

# avg_accuracy is unpacked above but not included in the summary
metrics = {
    "Avg F1-Score Class 0": avg_f1_class_0,
    "Avg F1-Score Class 1": avg_f1_class_1,
    "Avg F1-Score": avg_f1_score,
    "Avg F1-Score Class 1 Low Class": avg_f1_class_1_low,
    "Avg F-Beta-Score Class 0": fbeta_score_class_0,
    "Avg F-Beta-Score Class 1": fbeta_score_class_1,
    "Avg F-Beta-Score": avg_fbeta_score,
    "Avg F-Beta-Score (0.5) Low Class 1": avg_fbeta_score_low,
    "Avg F-Beta-Score (0.25) Low Class 1": avg_fbeta_score_low_0_25,
}
metrics

{'Avg F1-Score Class 0': 0.7747086793915982,
 'Avg F1-Score Class 1': 0.6307666492862062,
 'Avg F1-Score': 0.7027376643389022,
 'Avg F1-Score Class 1 Low Class': 0.6822621354564614,
 'Avg F-Beta-Score Class 0': 0.7016397536106048,
 'Avg F-Beta-Score Class 1': 0.7927591490585494,
 'Avg F-Beta-Score': 0.7471994513345771,
 'Avg F-Beta-Score (0.5) Low Class 1': 0.8282669650089607,
 'Avg F-Beta-Score (0.25) Low Class 1': 0.9178371648161194}

## Top 5 Models Majority Vote results:

In [6]:
# Aggregate scores for the five-model majority vote (second argument 0.5 is passed through as-is)
(
    avg_f1_class_0,
    avg_f1_class_1,
    avg_f1_class_1_low,
    avg_f1_score,
    avg_accuracy,
    fbeta_score_class_0,
    fbeta_score_class_1,
    avg_fbeta_score,
    avg_fbeta_score_low,
    avg_fbeta_score_low_0_25,
) = llm_utils.calculate_metrics_streamlit(df_majority_vote_classification_report_5_top_models, 0.5)

# avg_accuracy is unpacked above but not included in the summary
metrics = {
    "Avg F1-Score Class 0": avg_f1_class_0,
    "Avg F1-Score Class 1": avg_f1_class_1,
    "Avg F1-Score": avg_f1_score,
    "Avg F1-Score Class 1 Low Class": avg_f1_class_1_low,
    "Avg F-Beta-Score Class 0": fbeta_score_class_0,
    "Avg F-Beta-Score Class 1": fbeta_score_class_1,
    "Avg F-Beta-Score": avg_fbeta_score,
    "Avg F-Beta-Score (0.5) Low Class 1": avg_fbeta_score_low,
    "Avg F-Beta-Score (0.25) Low Class 1": avg_fbeta_score_low_0_25,
}
metrics

{'Avg F1-Score Class 0': 0.7805940503231785,
 'Avg F1-Score Class 1': 0.6592229213054407,
 'Avg F1-Score': 0.7199084858143097,
 'Avg F1-Score Class 1 Low Class': 0.7028810451565485,
 'Avg F-Beta-Score Class 0': 0.7138977382307383,
 'Avg F-Beta-Score Class 1': 0.8004890211112579,
 'Avg F-Beta-Score': 0.7571933796709981,
 'Avg F-Beta-Score (0.5) Low Class 1': 0.8413157135922563,
 'Avg F-Beta-Score (0.25) Low Class 1': 0.9236461807293542}

In [45]:
# NOTE(review): `majority_vote_classification_report` is not assigned at module level in any
# visible cell (it only exists as a local inside calculate_majority_vote_classification_report),
# so this cell presumably relies on state from an earlier session. TODO: confirm which report
# (3-model or 5-model) is meant.
# Transposes the report frame, selects its "1" column and shows the entry at position 4.
pd.DataFrame(majority_vote_classification_report).transpose()["1"].iloc[4]

{'precision': 0.9473684210526315,
 'recall': 0.5538461538461539,
 'f1-score': 0.6990291262135921,
 'support': 65}

In [34]:
# reset_index(..., inplace=True) returns None, so assigning its result left
# df_all_reset as None. Reset in place first, then alias the re-indexed frame.
df_all.reset_index(drop=True, inplace=True)
df_all_reset = df_all
df_all["annotations"]

0                                    [Science/Technology]
1                                                [Others]
2       [Health, Justice/Crime, Macroeconomics/Economi...
3                                                [Others]
4                                     [Government/Public]
                              ...                        
4995                                             [Others]
4996                                         [War/Terror]
4997                                             [Others]
4998                                             [Others]
4999                   [Government/Public, Justice/Crime]
Name: annotations, Length: 5000, dtype: object

In [40]:
# NOTE(review): `with_rules_classification_only_predictions_per_class` is not defined in any
# visible cell, so this display presumably relies on state from an earlier session.
# TODO: confirm the intended variable.
with_rules_classification_only_predictions_per_class

[                       id campaign_name                                               text                                        annotations                                   normalized_tweet  War/Terror_pred Conspiracy Theory_pred  ... Immigration/Integration_pred Justice/Crime_pred                              Labor/Employment_pred Macroeconomics/Economic Regulation_pred Media/Journalism_pred Religion_pred Science/Technology_pred
 96    1155203013805559809    GRU_202012  In the city of Ras al-Ain #Hasaka and the surr...                                     ['War/Terror']  In the city of Ras al-Ain #Hasaka and the surr...                1                    NaN  ...                          NaN                NaN                                                NaN                                     NaN                   NaN           NaN                     NaN
 188    993671467157131265   UGANDA_0621  @DjShiru smashing it up with  Tomorrow @Brothe...                                 

In [36]:
def load_model(url, extraction_func, metrics_calculation_func):
    """Read a predictions CSV, evaluate it for ALL_LABELS and bundle the metrics.

    NOTE: this redefines the three-value ``load_model`` from an earlier cell.
    This version expects ``metrics_calculation_func`` to return four values
    and returns a ``(data, predictions_per_class)`` pair instead.

    Args:
        url: Path, URL or buffer accepted by ``pd.read_csv``.
        extraction_func: Forwarded unchanged to ``metrics_calculation_func``.
        metrics_calculation_func: Callable invoked as
            ``f(df, ALL_LABELS, extraction_func)``; must return
            ``(predictions_per_class, confusion_matrices,
            binary_classification_reports, multilabel_classification_reports)``.

    Returns:
        Tuple ``(data, predictions_per_class)`` where ``data`` is a dict with
        the keys ``"confusion_matrices"``, ``"binary_classification_reports"``
        and ``"multilabel_classification_reports"``.
    """
    df = pd.read_csv(url)

    # Debug preview of the fifth row; guarded so that CSVs with fewer than
    # five rows no longer raise IndexError.
    if len(df) > 4:
        print(df.iloc[4])

    (
        predictions_per_class,
        confusion_matrices,
        binary_classification_reports,
        multilabel_classification_reports,
    ) = metrics_calculation_func(df, ALL_LABELS, extraction_func)

    data = {
        "confusion_matrices": confusion_matrices,
        "binary_classification_reports": binary_classification_reports,
        "multilabel_classification_reports": multilabel_classification_reports,
    }
    return data, predictions_per_class
# Text Davinci, elaboration-first prompt variants (original, v02, v03, v04)
(
    text_davinci_003_turbo_without_context_elaboration_first,
    text_davinci_003_turbo_without_context_elaboration_first_predictions_per_class,
) = load_model(
    "../data/openai_text_davinci_003/generic_prompt_without_context_elaboration_first/generic_test_0.csv",
    llm_utils.get_extraction_function("extract_nth_character", 1),
    llm_utils.calculate_binary_metrics,
)
(
    text_davinci_003_turbo_without_context_elaboration_first_v02,
    text_davinci_003_turbo_without_context_elaboration_first_v02_predictions_per_class,
) = load_model(
    "../data/openai_text_davinci_003/generic_prompt_without_context_elaboration_first_v02/generic_test_0.csv",
    llm_utils.get_extraction_function("extract_nth_character", 0),
    llm_utils.calculate_binary_metrics,
)
(
    text_davinci_003_turbo_without_context_elaboration_first_v03,
    text_davinci_003_turbo_without_context_elaboration_first_v03_predictions_per_class,
) = load_model(
    "../data/openai_text_davinci_003/generic_prompt_without_context_elaboration_first_v03/generic_test_0.csv",
    llm_utils.get_extraction_function("extract_nth_character", 1, True),
    llm_utils.calculate_binary_metrics,
)
(
    text_davinci_003_turbo_without_context_elaboration_first_v04,
    text_davinci_003_turbo_without_context_elaboration_first_v04_predictions_per_class,
) = load_model(
    "../data/openai_text_davinci_003/generic_prompt_without_context_elaboration_first_v04/generic_test_0.csv",
    llm_utils.get_extraction_function("extract_using_class_token", 1),
    llm_utils.calculate_binary_metrics,
)

# OpenAssistant LLaMA, with-rules classification-only prompt
(
    oa_with_rules_only_classification,
    oa_with_rules_only_classification_predictions_per_class,
) = load_model(
    "../data/openassistant_llama_30b_4bit/generic_prompt_with_rules_only_classification/generic_test_0.csv",
    llm_utils.get_extraction_function("extract_nth_character", 1),
    llm_utils.calculate_binary_metrics,
)

id                                                                       1286189347973279746
campaign_name                                                                    UGANDA_0621
text                                       RT @ArthurMirama: You can only under estimate ...
annotations                                                            ['Government/Public']
normalized_tweet                                                                         NaN
War/Terror_pred                                                                          NaN
Conspiracy Theory_pred                                                                   NaN
Education_pred                                                                           NaN
Election Campaign_pred                                                                   NaN
Environment_pred                                                                         NaN
Government/Public_pred                                                

  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({


id                                                                       1286189347973279746
campaign_name                                                                    UGANDA_0621
text                                       RT @ArthurMirama: You can only under estimate ...
annotations                                                            ['Government/Public']
normalized_tweet                                                                         NaN
War/Terror_pred                                                                          NaN
Conspiracy Theory_pred                                                                   NaN
Education_pred                                                                           NaN
Election Campaign_pred                                                                   NaN
Environment_pred                                                                         NaN
Government/Public_pred                                                

  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  df = df.append({
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(r

id                                                                       1286189347973279746
campaign_name                                                                    UGANDA_0621
text                                       RT @ArthurMirama: You can only under estimate ...
annotations                                                            ['Government/Public']
normalized_tweet                                                                         NaN
War/Terror_pred                                                                          NaN
Conspiracy Theory_pred                                                                   NaN
Education_pred                                                                           NaN
Election Campaign_pred                                                                   NaN
Environment_pred                                                                         NaN
Government/Public_pred                                                