In [1]:
import pandas as pd
import json
import os
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import confusion_matrix, classification_report, multilabel_confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
)
from typing import List, Dict
import seaborn as sns
import matplotlib.pyplot as plt
import sys
sys.path.append('../src')
import prompt_utils
import llm_utils
from reproduce_model import normalizeTweet, reformat_json_binary_v01

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def model_summary(model):
    """Print a short header followed by the model's total parameter count.

    Args:
        model: Object exposing ``named_parameters()`` whose yielded
            parameters provide ``numel()`` (e.g. a ``torch.nn.Module``).
    """
    print("Model summary:")
    print("---------------------------")
    # Every parameter counts, whether or not it requires gradients.
    n_params = sum(tensor.numel() for _, tensor in model.named_parameters())
    print(f"Total parameters: {n_params}")
    
"""def print_report(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred, target_names=categories)
    print(report)
    sns.heatmap(cm, annot=True, xticklabels=categories, yticklabels=categories, fmt='g')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()"""

class TweetDataset(Dataset):
    """Map-style dataset of tokenized tweets paired with labels.

    All texts are normalized and tokenized once in the constructor; item
    access only slices the stored encodings.

    Args:
        x: Sequence of raw tweet strings.
        y: Labels, indexable by position. With ``binary=True`` each entry
            must provide ``.item()`` (e.g. elements of a 1-D tensor).
        mlb: Stored on the instance; not used by this class.
        tokenizer: Callable tokenizer applied to the normalized texts.
        fsl_strategy: Accepted and forwarded to ``preprocess_text``, which
            does not use it.
        binary: Selects the label format returned by ``__getitem__``.
    """

    def __init__(self, x, y, mlb, tokenizer, fsl_strategy = None, binary = False):
        self.x, self.y = x, y
        self.mlb = mlb
        self.tokenizer = tokenizer
        self.binary = binary
        self.max_length = 128  # truncation limit handed to the tokenizer
        self.encoded_tweets = self.preprocess_text(self.x, fsl_strategy)

    def preprocess_text(self, X, fsl_strategy):
        """Normalize every tweet in ``X`` and tokenize the whole list in one call."""
        normalized = list(map(normalizeTweet, X))
        return self.tokenizer(
            normalized,
            return_attention_mask=True,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=self.max_length,
        )

    def __len__(self):
        """Number of examples, taken from the label container."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``label`` for example ``idx``."""
        encodings = self.encoded_tweets
        if self.binary:
            # Scalar class index: unwrap to a Python number, then re-wrap.
            label = torch.tensor(self.y[idx].item())
        else:
            # Label vector, stored as float32.
            label = torch.tensor(self.y[idx], dtype=torch.float32)
        return {
            'input_ids': encodings['input_ids'][idx],
            'attention_mask': encodings['attention_mask'][idx],
            'label': label,
        }
        
class MultiLabelDataCollator(DataCollatorWithPadding):
    """Padding collator that also stacks each example's ``"label"`` tensor.

    The parent collator builds the batch; this subclass then sets
    ``batch["labels"]`` to the per-example ``"label"`` tensors stacked along
    a new leading dimension.
    """

    def __init__(self, tokenizer):
        super().__init__(tokenizer)

    def __call__(self, features: List[Dict[str, torch.Tensor]]):
        collated = super().__call__(features)
        per_example_labels = [example["label"] for example in features]
        collated["labels"] = torch.stack(per_example_labels)
        return collated
    
def get_binary_classification_report(data_loader, model, class_name):
    """Run ``model`` over ``data_loader`` and build a two-class classification report.

    Predictions are the argmax over the model's logits. Inputs are moved to the
    module-level ``device``. The model's train/eval mode is not changed here;
    the caller is responsible for calling ``model.eval()``.

    Note on row names: label value 0 is reported under ``class_name`` and
    label value 1 under ``"not_" + class_name``. ``get_report_df`` relies on
    exactly this naming to fill its ``*_class_0`` / ``*_class_1`` columns, so
    the naming is intentionally left unchanged.

    Args:
        data_loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels`` tensors.
        model: Callable returning an object with a ``logits`` attribute.
        class_name: Name used to build the two report row names.

    Returns:
        Tuple ``(dict_report, report)``: the report as a dict and as text.
    """
    y_true = []
    y_pred = []
    for batch in data_loader:
        batch_inputs = {'input_ids': batch['input_ids'].to(device),
                        'attention_mask': batch['attention_mask'].to(device)}
        with torch.no_grad():
            logits = model(**batch_inputs).logits
        # Convert logits to 0 or 1 based on argmax
        y_pred.append(np.argmax(logits.detach().cpu().numpy(), axis=1))
        y_true.append(batch['labels'].detach().cpu().numpy().astype(int))

    y_pred = np.concatenate(y_pred, axis=0)
    y_true = np.concatenate(y_true, axis=0)

    # Pinning labels=[0, 1] keeps both rows in the report even when one class
    # never occurs in y_true/y_pred; without it scikit-learn raises because
    # two target_names are supplied for a single observed class.
    report_kwargs = dict(
        labels=[0, 1],
        target_names=[class_name, "not_" + class_name],
        zero_division=0,
    )
    dict_report = classification_report(y_true, y_pred, output_dict=True, **report_kwargs)
    report = classification_report(y_true, y_pred, **report_kwargs)
    return dict_report, report
    
def get_report_df(reports):
    """Flatten classification-report dicts into one DataFrame row per primary label.

    A primary label is any report key other than ``"accuracy"`` that contains
    neither ``"not_"`` nor ``"avg"``. Its own metrics fill the ``*_class_0``
    columns; when the report also has a ``"not_" + label`` entry, that entry
    fills the ``*_class_1`` columns. The report's ``"macro avg"`` entry fills
    the ``*_macro`` columns.

    Args:
        reports: Iterable of dicts shaped like
            ``classification_report(..., output_dict=True)`` output.

    Returns:
        pandas.DataFrame with one row per primary label across all reports.
    """
    def class_columns(metrics, suffix):
        # Column order mirrors the order used for the macro columns' siblings.
        return {
            f'f1_score_class_{suffix}': metrics['f1-score'],
            f'support_class_{suffix}': metrics['support'],
            f'precision_class_{suffix}': metrics['precision'],
            f'recall_class_{suffix}': metrics['recall'],
        }

    records = []
    for report in reports:
        primary_labels = [
            key for key in report
            if "not_" not in key and "avg" not in key and key != "accuracy"
        ]
        for label in primary_labels:
            macro = report['macro avg']
            record = {
                'label': label,
                'f1_score_macro': macro['f1-score'],
                'precision_macro': macro['precision'],
                'recall_macro': macro['recall'],
                'support_macro': macro['support'],
            }
            record.update(class_columns(report[label], 0))
            complement = "not_" + label
            if complement in report:
                record.update(class_columns(report[complement], 1))
            records.append(record)
    return pd.DataFrame(records)
    

def calculate_binary_metrics(task, class_names):
    """Evaluate one saved binary classifier per label on validation and test data.

    For each ``(binary_task, class_name)`` pair the model stored under
    ``../models/{binary_task}_epochs_200_train_size_full_fold_0`` is loaded and
    scored on:

    * the validation frame returned by
      ``reformat_json_binary_v01(class_name, filename)`` (columns ``input`` and
      ``output`` are read), and
    * the ``"test"`` split of ``../data/labeled_data/generic_test_0.json``,
      where the target is 1 when ``class_name`` occurs in ``annotations``.

    ``task`` and ``class_names`` are paired position-wise with ``zip``; if
    their lengths differ, the surplus entries of the longer one are ignored.

    Args:
        task: Sequence of model-directory task names.
        class_names: Sequence of label names, in the same order as ``task``.

    Returns:
        Tuple ``(val_report_df, test_report_df)`` as produced by
        ``get_report_df``.
    """
    print(task)
    val_classification_reports = []
    test_classification_reports = []

    # Loop-invariant resources: the tokenizer, the collator and the raw JSON
    # are identical for every label, so they are created once.
    tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")
    collator = MultiLabelDataCollator(tokenizer)
    filename = "../data/labeled_data/generic_test_0.json"
    with open(filename) as f:
        data = json.load(f)

    # Loop over each binary task and load the corresponding model
    for binary_task, class_name in zip(task, class_names):
        model_path = f"../models/{binary_task}_epochs_200_train_size_full_fold_0"
        # use "" for models that were manually saved after training; a
        # checkpoint sub-directory name would go here for auto-saved checkpoints
        latest_checkpoint = os.path.join(model_path, "")
        print(latest_checkpoint)

        # Load the model
        model = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint)
        model.to(device)

        # map_location keeps this working on a CPU-only machine even if the
        # state dict was saved from a GPU.
        checkpoint = torch.load(os.path.join(model_path, "pytorch_model.bin"), map_location=device)
        model.load_state_dict(checkpoint)
        model.eval()

        # Fresh test frame per label: the 'output' column is label-specific.
        test_df = pd.DataFrame(data["test"])
        test_df['output'] = test_df['annotations'].apply(lambda x: 1 if class_name in x else 0)

        # Only the validation frame of the returned pair is needed here.
        _, val_df = reformat_json_binary_v01(class_name, filename)
        val_df['output'] = val_df['output'].astype(int)

        val_dataset = TweetDataset(val_df['input'].to_list(), torch.tensor(val_df['output']), None, tokenizer, binary = True)
        test_dataset = TweetDataset(test_df['text'].to_list(), torch.tensor(test_df['output']), None, tokenizer, binary = True)

        val_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=256, shuffle=False, collate_fn=collator
        )
        test_loader = torch.utils.data.DataLoader(
            test_dataset, batch_size=256, shuffle=False, collate_fn=collator
        )

        val_report_dict, _ = get_binary_classification_report(val_loader, model, class_name)
        test_report_dict, _ = get_binary_classification_report(test_loader, model, class_name)
        val_classification_reports.append(val_report_dict)
        test_classification_reports.append(test_report_dict)

    # Guarded so that an empty task list yields empty frames instead of IndexError.
    if val_classification_reports:
        print(val_classification_reports[0])
    val_report_df = get_report_df(val_classification_reports)
    test_report_df = get_report_df(test_classification_reports)
    print("\nAverage Validation Classification Report In DataFrame Format:")
    print(val_report_df)
    print("\nTest Classification Report In DataFrame Format:")
    print(test_report_df)
    return val_report_df, test_report_df

2023-08-17 23:47:24.380833: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# One slug per saved binary model; each expands to "binary_<slug>_generic".
topic_slugs = [
    "war_terror",
    "conspiracy_theory",
    "education",
    "election_campaign",
    "environment",
    "government_public",
    "health",
    "immigration_integration",
    "justice_crime",
    "labor_employment",
    "macroeconomics_economic_regulation",
    "media_journalism",
    "religion",
    "science_technology",
    "others",
]
binary_tasks = [f"binary_{slug}_generic" for slug in topic_slugs]

# Paired position-wise with binary_tasks inside calculate_binary_metrics,
# so the two sequences must be in the same order.
class_names = prompt_utils.ALL_LABELS

val_avg_report, test_avg_report = calculate_binary_metrics(binary_tasks, class_names)

['binary_war_terror_generic', 'binary_conspiracy_theory_generic', 'binary_education_generic', 'binary_election_campaign_generic', 'binary_environment_generic', 'binary_government_public_generic', 'binary_health_generic', 'binary_immigration_integration_generic', 'binary_justice_crime_generic', 'binary_labor_employment_generic', 'binary_macroeconomics_economic_regulation_generic', 'binary_media_journalism_generic', 'binary_religion_generic', 'binary_science_technology_generic', 'binary_others_generic']
../models/binary_war_terror_generic_epochs_200_train_size_full_fold_0/
---------------------
Label: War/Terror
Positive samples: 939
Negative samples: 939
Total samples: 1878
Train samples: 1502
Valid samples: 376
---------------------


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


../models/binary_conspiracy_theory_generic_epochs_200_train_size_full_fold_0/
---------------------
Label: Conspiracy Theory
Positive samples: 254
Negative samples: 254
Total samples: 508
Train samples: 406
Valid samples: 102
---------------------


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


../models/binary_education_generic_epochs_200_train_size_full_fold_0/


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


---------------------
Label: Education
Positive samples: 63
Negative samples: 63
Total samples: 126
Train samples: 100
Valid samples: 26
---------------------
../models/binary_election_campaign_generic_epochs_200_train_size_full_fold_0/


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


---------------------
Label: Election Campaign
Positive samples: 133
Negative samples: 133
Total samples: 266
Train samples: 212
Valid samples: 54
---------------------
../models/binary_environment_generic_epochs_200_train_size_full_fold_0/
---------------------
Label: Environment
Positive samples: 58
Negative samples: 58
Total samples: 116
Train samples: 92
Valid samples: 24
---------------------


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


../models/binary_government_public_generic_epochs_200_train_size_full_fold_0/
---------------------
Label: Government/Public
Positive samples: 1248
Negative samples: 1248
Total samples: 2496
Train samples: 1996
Valid samples: 500
---------------------


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


../models/binary_health_generic_epochs_200_train_size_full_fold_0/
---------------------
Label: Health
Positive samples: 214
Negative samples: 214
Total samples: 428
Train samples: 342
Valid samples: 86
---------------------


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


../models/binary_immigration_integration_generic_epochs_200_train_size_full_fold_0/


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


---------------------
Label: Immigration/Integration
Positive samples: 201
Negative samples: 201
Total samples: 402
Train samples: 321
Valid samples: 81
---------------------
../models/binary_justice_crime_generic_epochs_200_train_size_full_fold_0/


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


---------------------
Label: Justice/Crime
Positive samples: 572
Negative samples: 572
Total samples: 1144
Train samples: 915
Valid samples: 229
---------------------
../models/binary_labor_employment_generic_epochs_200_train_size_full_fold_0/


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


---------------------
Label: Labor/Employment
Positive samples: 97
Negative samples: 97
Total samples: 194
Train samples: 155
Valid samples: 39
---------------------
../models/binary_macroeconomics_economic_regulation_generic_epochs_200_train_size_full_fold_0/


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


---------------------
Label: Macroeconomics/Economic Regulation
Positive samples: 250
Negative samples: 250
Total samples: 500
Train samples: 400
Valid samples: 100
---------------------
../models/binary_media_journalism_generic_epochs_200_train_size_full_fold_0/


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


---------------------
Label: Media/Journalism
Positive samples: 184
Negative samples: 184
Total samples: 368
Train samples: 294
Valid samples: 74
---------------------
../models/binary_religion_generic_epochs_200_train_size_full_fold_0/


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


---------------------
Label: Religion
Positive samples: 71
Negative samples: 71
Total samples: 142
Train samples: 113
Valid samples: 29
---------------------
../models/binary_science_technology_generic_epochs_200_train_size_full_fold_0/


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


---------------------
Label: Science/Technology
Positive samples: 54
Negative samples: 54
Total samples: 108
Train samples: 86
Valid samples: 22
---------------------
../models/binary_others_generic_epochs_200_train_size_full_fold_0/
---------------------
Label: Others
Positive samples: 1057
Negative samples: 1057
Total samples: 2114
Train samples: 1691
Valid samples: 423
---------------------


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'War/Terror': {'precision': 0.9834254143646409, 'recall': 0.9270833333333334, 'f1-score': 0.9544235924932977, 'support': 192}, 'not_War/Terror': {'precision': 0.9282051282051282, 'recall': 0.9836956521739131, 'f1-score': 0.9551451187335093, 'support': 184}, 'accuracy': 0.9547872340425532, 'macro avg': {'precision': 0.9558152712848846, 'recall': 0.9553894927536233, 'f1-score': 0.9547843556134035, 'support': 376}, 'weighted avg': {'precision': 0.9564027211376455, 'recall': 0.9547872340425532, 'f1-score': 0.9547766798023373, 'support': 376}}

Average Validation Classification Report In DataFrame Format:
                                 label  f1_score_macro  precision_macro  recall_macro  support_macro  f1_score_class_0  support_class_0  precision_class_0  recall_class_0  f1_score_class_1  support_class_1  precision_class_1  recall_class_1
0                           War/Terror        0.954784         0.955815      0.955389            376          0.954424              192           0.98

In [48]:
# Display the per-label test report (assigned from calculate_binary_metrics above).
# NOTE(review): the saved output for this cell has extra columns ('Unnamed: 0',
# 'TP_class_1'), so it appears to come from a different kernel state — re-run
# from a fresh kernel to confirm.
test_avg_report

Unnamed: 0,label,f1_score_macro,precision_macro,recall_macro,support_macro,f1_score_class_0,support_class_0,precision_class_0,recall_class_0,f1_score_class_1,support_class_1,precision_class_1,recall_class_1,TP_class_1
0,War/Terror,0.930956,0.914402,0.95277,1000,0.962095,745,0.988669,0.936913,0.899818,255,0.840136,0.968627,291.597222
1,Conspiracy Theory,0.659844,0.621445,0.872077,1000,0.931629,955,0.992891,0.877487,0.38806,45,0.25,0.866667,-26.590909
2,Education,0.591992,0.565,0.955927,1000,0.953895,987,1.0,0.911854,0.230088,13,0.13,1.0,-2.283784
3,Election Campaign,0.763544,0.695117,0.944878,1000,0.973517,967,0.997828,0.950362,0.553571,33,0.392405,0.939394,-78.692308
4,Environment,0.602754,0.571429,0.957404,1000,0.955508,986,1.0,0.914807,0.25,14,0.142857,1.0,-2.8
5,Government/Public,0.802953,0.790996,0.827655,1000,0.870343,709,0.922591,0.823695,0.735562,291,0.659401,0.831615,327.544186
6,Health,0.750797,0.6906,0.911585,1000,0.962121,954,0.994407,0.931866,0.539474,46,0.386792,0.891304,-134.714286
7,Immigration/Integration,0.782461,0.715482,0.93603,1000,0.975636,964,0.996753,0.955394,0.589286,36,0.434211,0.916667,-297.0
8,Justice/Crime,0.809085,0.766076,0.916283,1000,0.92707,863,0.993377,0.869061,0.691099,137,0.538776,0.963504,623.586207
9,Labor/Employment,0.641043,0.601599,0.897046,1000,0.946515,972,0.996587,0.901235,0.33557,28,0.206612,0.892857,-10.769231


In [53]:
# Reconstruct TP, FP and FN for class 1 of every label from the stored
# recall, precision and support values.
df = test_avg_report.copy().assign(
    TP_class_1=lambda frame: frame['recall_class_1'] * frame['support_class_1'],
    FP_class_1=lambda frame: (frame['TP_class_1'] / frame['precision_class_1']) - frame['TP_class_1'],
    FN_class_1=lambda frame: frame['support_class_1'] - frame['TP_class_1'],
)

print(df)

# Pool the counts over all labels to get micro-averaged precision and recall.
tp_total = df['TP_class_1'].sum()
fp_total = df['FP_class_1'].sum()
fn_total = df['FN_class_1'].sum()
precision_micro = tp_total / (tp_total + fp_total)
recall_micro = tp_total / (tp_total + fn_total)

print(precision_micro)

# Micro-averaged F1 for class 1: harmonic mean of the pooled precision and recall.
f1_micro = 2 * precision_micro * recall_micro / (precision_micro + recall_micro)

print(f1_micro)

                                 label  f1_score_macro  precision_macro  recall_macro  support_macro  f1_score_class_0  support_class_0  ...  f1_score_class_1  support_class_1  precision_class_1  recall_class_1  TP_class_1  FP_class_1  FN_class_1
0                           War/Terror        0.930956         0.914402      0.952770           1000          0.962095              745  ...          0.899818              255           0.840136        0.968627       247.0        47.0         8.0
1                    Conspiracy Theory        0.659844         0.621445      0.872077           1000          0.931629              955  ...          0.388060               45           0.250000        0.866667        39.0       117.0         6.0
2                            Education        0.591992         0.565000      0.955927           1000          0.953895              987  ...          0.230088               13           0.130000        1.000000        13.0        87.0         0.0
3           

In [50]:
# Unweighted mean of the class-1 F1 score over all rows of df.
df["f1_score_class_1"].mean()

0.4907528119869895

In [4]:
# Every row of the test report except the last one is passed on; 0.25 is
# forwarded unchanged as the second positional argument.
(
    avg_f1_class_0,
    avg_f1_class_1,
    avg_f1_class_1_low,
    avg_f1_score,
    avg_accuracy,
    fbeta_score_class_0,
    fbeta_score_class_1,
    avg_fbeta_score,
    avg_fbeta_score_low,
    avg_fbeta_score_class_1_low_0_25,
) = llm_utils.calculate_metrics_streamlit(test_avg_report.iloc[:-1], 0.25)

Index(['f1_score_macro', 'precision_macro', 'recall_macro', 'support_macro', 'f1_score_class_0', 'support_class_0', 'precision_class_0', 'recall_class_0', 'f1_score_class_1', 'support_class_1', 'precision_class_1', 'recall_class_1'], dtype='object')
Index(['f1_score_macro', 'precision_macro', 'recall_macro', 'support_macro', 'f1_score_class_0', 'support_class_0', 'precision_class_0', 'recall_class_0', 'f1_score_class_1', 'support_class_1', 'precision_class_1', 'recall_class_1'], dtype='object')
Index(['f1_score_macro', 'precision_macro', 'recall_macro', 'support_macro', 'f1_score_class_0', 'support_class_0', 'precision_class_0', 'recall_class_0', 'f1_score_class_1', 'support_class_1', 'precision_class_1', 'recall_class_1'], dtype='object')
Index(['f1_score_macro', 'precision_macro', 'recall_macro', 'support_macro', 'f1_score_class_0', 'support_class_0', 'precision_class_0', 'recall_class_0', 'f1_score_class_1', 'support_class_1', 'precision_class_1', 'recall_class_1'], dtype='object')


In [45]:
import prompt_utils
def calculate_fbeta_score(beta, precision, recall):
    """Return the F-beta score for the given precision and recall.

    Computes ``(1 + beta**2) * precision * recall / (beta**2 * precision + recall)``.

    Args:
        beta: Weight of recall relative to precision.
        precision: Precision value (scalar, or array-like for element-wise use).
        recall: Recall value (scalar, or array-like for element-wise use).

    Returns:
        The F-beta score. For scalar inputs whose denominator is zero (for
        example precision and recall both 0) the score is defined as 0.0
        instead of raising ``ZeroDivisionError`` or yielding NaN.
    """
    beta_sq = beta ** 2
    numerator = (1 + beta_sq) * (precision * recall)
    denominator = (beta_sq * precision) + recall
    # Guard only the scalar case; array-like inputs keep element-wise division.
    if np.ndim(denominator) == 0 and denominator == 0:
        return 0.0
    return numerator / denominator

# Mean F-beta (beta = 0.25) of class 1 over the labels in prompt_utils.LOW_F1_LABELS.
# Each label is handled on its own, so one label missing from df no longer
# aborts the loop and leaves a partial sum divided by the full label count.
low_f1_scores = []
missing_labels = []
for class_ in prompt_utils.LOW_F1_LABELS:
    class_rows = df.loc[df["label"] == class_, ["precision_class_1", "recall_class_1"]]
    if class_rows.empty:
        missing_labels.append(class_)
        continue
    # First matching row, as before.
    precision_low, recall_low = class_rows.iloc[0]
    low_f1_scores.append(calculate_fbeta_score(0.25, precision_low, recall_low))

if missing_labels:
    print("Error: labels missing from df, excluded from the average:", missing_labels)

# Average over the labels that were actually scored; NaN if there were none.
avg_fbeta_score_class_1_low_0_25 = (
    sum(low_f1_scores) / len(low_f1_scores) if low_f1_scores else float("nan")
)
avg_fbeta_score_class_1_low_0_25

0.17935431218965814

In [47]:
# One of the values unpacked from llm_utils.calculate_metrics_streamlit.
avg_f1_score

0.7055038660630035

In [44]:
# NOTE(review): this name is assigned in two cells (the calculate_metrics_streamlit
# unpacking and the LOW_F1_LABELS loop); which value is shown here depends on
# execution order — confirm with a fresh Restart & Run All.
avg_fbeta_score_class_1_low_0_25

0.17935431218965814