In [2]:
import pandas as pd
import json
import os
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import confusion_matrix, classification_report, multilabel_confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
)
from typing import List, Dict
import seaborn as sns
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the dataset class

# Load data from json file
with open('../reports/VENEZUELA_201901_2_epochs_200_train_size_full.json') as f:
    data = json.load(f)

dfs = []
for k, v in data.items():
    valid_metrics = v['valid']
    valid_metrics['dataset'] = 'valid'
    valid_metrics['fold'] = int(k) + 1
    dfs.append(pd.DataFrame([valid_metrics]))
    
    test_metrics = v['test']
    test_metrics['dataset'] = 'test'
    test_metrics['fold'] = int(k) + 1
    dfs.append(pd.DataFrame([test_metrics]))

# Concatenate all dataframes together
df = pd.concat(dfs, ignore_index=True)

# Rename columns
df.columns = df.columns.str.replace('eval_', '')
df = df.rename(columns={'epoch': 'num_epochs'})

# Print the final dataframe
print(df)

       loss  accuracy  micro_precision  micro_recall  micro_f1  \
0  0.132741       0.0         0.823881      0.787072  0.805056   
1  0.184075       0.0         0.779221      0.703934  0.739666   
2  0.109380       0.0         0.830753      0.772553  0.800597   
3  0.153071       0.0         0.784553      0.665977  0.720418   
4  0.113060       0.0         0.816616      0.772031  0.793698   
5  0.157828       0.0         0.798791      0.638371  0.709628   
6  0.113176       0.0         0.824473      0.772556  0.797671   
7  0.152108       0.0         0.803723      0.655625  0.722159   
8  0.111680       0.0         0.827362      0.729187  0.775178   
9  0.154341       0.0         0.802740      0.606625  0.691038   

   macro_precision  macro_recall  macro_f1  runtime  samples_per_second  \
0         0.721881      0.660063  0.683039   3.3753             237.019   
1         0.707577      0.623297  0.637038   4.2092             237.577   
2         0.810919      0.593703  0.664724   3.3

In [3]:
df[df.dataset == "test"].mean()

  df[df.dataset == "test"].mean()


loss                    0.160285
accuracy                0.000000
micro_precision         0.793805
micro_recall            0.654106
micro_f1                0.716582
macro_precision         0.691064
macro_recall            0.525625
macro_f1                0.562369
runtime                 4.245200
samples_per_second    235.569600
steps_per_second       58.892200
num_epochs              5.800000
fold                    3.000000
dtype: float64

In [4]:
def model_summary(model):
    print("Model summary:")
    print("---------------------------")
    total_params = 0
    for name, param in model.named_parameters():
        param_count = param.numel()
        total_params += param_count
    print(f"Total parameters: {total_params}")
    
"""def print_report(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    report = classification_report(y_true, y_pred, target_names=categories)
    print(report)
    sns.heatmap(cm, annot=True, xticklabels=categories, yticklabels=categories, fmt='g')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()"""

class TweetDataset(Dataset):
    def __init__(self, x, y, mlb, tokenizer):
        self.x = x
        self.y = y
        self.mlb = mlb
        self.tokenizer = tokenizer
        self.encoded_tweets = self.preprocess_text(self.x)
    
    def preprocess_text(self, text):
        return self.tokenizer(text, return_attention_mask=True, return_tensors='pt', padding=True)
        
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        label = self.y[idx]
        return {'input_ids': self.encoded_tweets['input_ids'][idx],
                'attention_mask': self.encoded_tweets['attention_mask'][idx],
                'label': torch.tensor(label, dtype=torch.float32)}
        
class MultiLabelDataCollator(DataCollatorWithPadding):
    def __init__(self, tokenizer):
        super().__init__(tokenizer)

    def __call__(self, features: List[Dict[str, torch.Tensor]]):
        batch = super().__call__(features)
        batch["labels"] = torch.stack([feature["label"] for feature in features])
        return batch
    
def get_classification_report(data_loader, model, target_names, label_names):
    labels = []
    predictions = []
    for batch in data_loader:
        batch_inputs = {'input_ids': batch['input_ids'].to(device),
                        'attention_mask': batch['attention_mask'].to(device)}
        with torch.no_grad():
            logits = model(**batch_inputs).logits
        sigmoid = torch.nn.Sigmoid()
        probs = sigmoid(torch.Tensor(logits))
        batch_predictions = (probs >= 0.5).detach().cpu().numpy().astype(int)
        predictions.append(batch_predictions)
        labels.append(batch['labels'].detach().cpu().numpy().astype(int))

    predictions = np.concatenate(predictions, axis=0)
    labels = np.concatenate(labels, axis = 0)

    #cm = multilabel_confusion_matrix(labels, predictions)
    dict_report = classification_report(labels, predictions, target_names=target_names, labels=label_names, zero_division=0, output_dict=True)
    report = classification_report(labels, predictions, target_names=target_names, labels=label_names, zero_division=0)
    return dict_report, report
    
def calculate_average_report(reports):
    print(reports)
    print(len(reports))
    avg_report = {}
    for report in reports:
        for key, scores in report.items():
            if key not in avg_report:
                avg_report[key] = {}
                for score_key, score_value in scores.items():
                    avg_report[key][score_key] = score_value
            else:
                for score_key, score_value in scores.items():
                    avg_report[key][score_key] += score_value

    num_reports = len(reports)
    for key, scores in avg_report.items():
        for score_key in scores:
            avg_report[key][score_key] /= num_reports

    return avg_report

def average_report_to_dataframe(average_report):
    data = {
        "precision": [],
        "recall": [],
        "f1-score": [],
        "support": []
    }
    index = []

    for class_name, metrics in average_report.items():
        if class_name == 'accuracy':
            continue

        index.append(class_name)
        data["precision"].append(metrics["precision"])
        data["recall"].append(metrics["recall"])
        data["f1-score"].append(metrics["f1-score"])
        data["support"].append(metrics["support"])

    return pd.DataFrame(data, index=index)

def calculate_metrics(task):
    k = 5
    
    val_classification_reports = []
    test_classification_reports = []

    # Loop over each fold and load the corresponding model
    for fold in range(k):
        model_path = f"../models/weak_labeled/{task}_epochs_200_train_size_full_fold_{fold}"
        # find the latest checkpoint file
        #checkpoint_files = [f for f in os.listdir(model_path) if f.startswith("checkpoint")]
        latest_checkpoint = os.path.join(model_path, "")  # use "" for models that were manually saved after training. use sorted(checkpoint_files)[0] for the first automatically saved checkpoint 
        print(latest_checkpoint)
        
        # Load the model and tokenizer
        model = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint)
        model.to(device)
        tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-large")

        filename = f"../data/labeled_data/generic_test_{fold}.json"
        with open(filename) as f:
            data = json.load(f)
        train_df = pd.read_csv(f"{model_path}/train_df.csv")
        val_df = pd.DataFrame(data["valid"])
        test_df = pd.DataFrame(data["test"])
        
        import ast
        print(train_df["annotations"])
        train_annotations = train_df["annotations"].apply(ast.literal_eval).tolist()
        classes = set()
        for annotation in train_annotations:
            classes.update(annotation)
        classes = sorted(list(classes))

        print(classes)
        
        checkpoint = torch.load(os.path.join(model_path, "pytorch_model.bin"))
        model.load_state_dict(checkpoint)
        
        mlb = MultiLabelBinarizer(classes=classes)
        
        train_labels = mlb.fit_transform(train_df["annotations"].apply(ast.literal_eval))
        val_labels = mlb.transform(val_df["annotations"])
        test_labels = mlb.transform(test_df["annotations"])
        
        train_dataset = TweetDataset(train_df['text'].to_list(), torch.tensor(train_labels), mlb, tokenizer)
        val_dataset = TweetDataset(val_df['text'].to_list(), torch.tensor(val_labels), mlb, tokenizer)
        test_dataset = TweetDataset(test_df['text'].to_list(), torch.tensor(test_labels), mlb, tokenizer)
        
        val_loader = torch.utils.data.DataLoader(
            val_dataset, batch_size=256, shuffle=False, collate_fn=MultiLabelDataCollator(tokenizer)
        )
        test_loader = torch.utils.data.DataLoader(
            test_dataset, batch_size=256, shuffle=False, collate_fn=MultiLabelDataCollator(tokenizer)
        )
        
        model.eval()
        val_report_dict, val_report = get_classification_report(val_loader, model, classes, range(len(classes)))
        test_report_dict, test_report = get_classification_report(test_loader, model, classes, range(len(classes)))
        val_classification_reports.append(val_report_dict)
        test_classification_reports.append(test_report_dict)

    val_average_report = calculate_average_report(val_classification_reports)
    test_average_report = calculate_average_report(test_classification_reports)
    val_average_report_df = average_report_to_dataframe(val_average_report)
    test_average_report_df = average_report_to_dataframe(test_average_report)
    print("\nAverage Validation Classification Report In DataFrame Format:")
    print(val_average_report_df) 
    print("\nAverage Test Classification Report In DataFrame Format:")
    print(test_average_report_df) 
    return val_average_report_df, test_average_report_df

# Distinct

In [8]:
generic_val_average_report_df, generic_test_average_report_df = calculate_metrics("vicuna_with_rules_classification_only_distinct_generic")

../models/weak_labeled/vicuna_with_rules_classification_only_distinct_generic_epochs_200_train_size_full_fold_0/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                   ['Government/Public']
                              ...                        
3436                                 ['Labor/Employment']
3437                                        ['Education']
3438                                ['Conspiracy Theory']
3439                                ['Conspiracy Theory']
3440                                        ['Education']
Name: annotations, Length: 3441, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation'

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/vicuna_with_rules_classification_only_distinct_generic_epochs_200_train_size_full_fold_1/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                          ['War/Terror']
                              ...                        
3436                                 ['Labor/Employment']
3437                                        ['Education']
3438                                ['Conspiracy Theory']
3439                                ['Conspiracy Theory']
3440                                        ['Education']
Name: annotations, Length: 3441, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation'

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/vicuna_with_rules_classification_only_distinct_generic_epochs_200_train_size_full_fold_2/
0       ['Science/Technology']
1                   ['Others']
2                   ['Others']
3        ['Government/Public']
4               ['War/Terror']
                 ...          
3436      ['Labor/Employment']
3437             ['Education']
3438     ['Conspiracy Theory']
3439     ['Conspiracy Theory']
3440             ['Education']
Name: annotations, Length: 3441, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/vicuna_with_rules_classification_only_distinct_generic_epochs_200_train_size_full_fold_3/
0                                  ['Science/Technology']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                              ['Others']
3                                   ['Government/Public']
4                                          ['War/Terror']
                              ...                        
3436                                 ['Labor/Employment']
3437                                        ['Education']
3438                                ['Conspiracy Theory']
3439                                ['Conspiracy Theory']
3440                                        ['Education']
Name: annotations, Length: 3441, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation'

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/vicuna_with_rules_classification_only_distinct_generic_epochs_200_train_size_full_fold_4/
0                                              ['Others']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                   ['Government/Public']
3                                          ['War/Terror']
4                                          ['War/Terror']
                              ...                        
3436                                 ['Labor/Employment']
3437                                        ['Education']
3438                                ['Conspiracy Theory']
3439                                ['Conspiracy Theory']
3440                                        ['Education']
Name: annotations, Length: 3441, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation'

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


[{'Conspiracy Theory': {'precision': 0.7105263157894737, 'recall': 0.574468085106383, 'f1-score': 0.6352941176470589, 'support': 47}, 'Education': {'precision': 0.5555555555555556, 'recall': 0.8333333333333334, 'f1-score': 0.6666666666666667, 'support': 12}, 'Election Campaign': {'precision': 0.7619047619047619, 'recall': 0.6153846153846154, 'f1-score': 0.6808510638297872, 'support': 26}, 'Environment': {'precision': 0.9166666666666666, 'recall': 0.7857142857142857, 'f1-score': 0.8461538461538461, 'support': 14}, 'Government/Public': {'precision': 0.776173285198556, 'recall': 0.8531746031746031, 'f1-score': 0.8128544423440454, 'support': 252}, 'Health': {'precision': 0.7843137254901961, 'recall': 0.8888888888888888, 'f1-score': 0.8333333333333334, 'support': 45}, 'Immigration/Integration': {'precision': 0.7317073170731707, 'recall': 0.75, 'f1-score': 0.7407407407407408, 'support': 40}, 'Justice/Crime': {'precision': 0.8068181818181818, 'recall': 0.6635514018691588, 'f1-score': 0.728205

In [9]:
generic_val_average_report_df, generic_test_average_report_df = calculate_metrics("oa_with_rules_classification_only_distinct_generic")

../models/weak_labeled/oa_with_rules_classification_only_distinct_generic_epochs_200_train_size_full_fold_0/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                   ['Government/Public']
                              ...                        
3480                                 ['Labor/Employment']
3481                                         ['Religion']
3482                                        ['Education']
3483                                ['Conspiracy Theory']
3484                                ['Conspiracy Theory']
Name: annotations, Length: 3485, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'M

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_rules_classification_only_distinct_generic_epochs_200_train_size_full_fold_1/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                          ['War/Terror']
                              ...                        
3480                                 ['Labor/Employment']
3481                                         ['Religion']
3482                                        ['Education']
3483                                ['Conspiracy Theory']
3484                                ['Conspiracy Theory']
Name: annotations, Length: 3485, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'M

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_rules_classification_only_distinct_generic_epochs_200_train_size_full_fold_2/
0       ['Science/Technology']
1                   ['Others']
2                   ['Others']
3        ['Government/Public']
4               ['War/Terror']
                 ...          
3480      ['Labor/Employment']
3481              ['Religion']
3482             ['Education']
3483     ['Conspiracy Theory']
3484     ['Conspiracy Theory']
Name: annotations, Length: 3485, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_rules_classification_only_distinct_generic_epochs_200_train_size_full_fold_3/
0                                  ['Science/Technology']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                              ['Others']
3                                   ['Government/Public']
4                                          ['War/Terror']
                              ...                        
3480                                 ['Labor/Employment']
3481                                         ['Religion']
3482                                        ['Education']
3483                                ['Conspiracy Theory']
3484                                ['Conspiracy Theory']
Name: annotations, Length: 3485, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'M

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_rules_classification_only_distinct_generic_epochs_200_train_size_full_fold_4/
0                                              ['Others']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                   ['Government/Public']
3                                          ['War/Terror']
4                                          ['War/Terror']
                              ...                        
3480                                 ['Labor/Employment']
3481                                         ['Religion']
3482                                        ['Education']
3483                                ['Conspiracy Theory']
3484                                ['Conspiracy Theory']
Name: annotations, Length: 3485, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'M

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


[{'Conspiracy Theory': {'precision': 0.575, 'recall': 0.48936170212765956, 'f1-score': 0.5287356321839081, 'support': 47}, 'Education': {'precision': 0.75, 'recall': 0.5, 'f1-score': 0.6, 'support': 12}, 'Election Campaign': {'precision': 0.7692307692307693, 'recall': 0.7692307692307693, 'f1-score': 0.7692307692307693, 'support': 26}, 'Environment': {'precision': 0.8, 'recall': 0.8571428571428571, 'f1-score': 0.8275862068965518, 'support': 14}, 'Government/Public': {'precision': 0.7992277992277992, 'recall': 0.8214285714285714, 'f1-score': 0.8101761252446184, 'support': 252}, 'Health': {'precision': 0.9459459459459459, 'recall': 0.7777777777777778, 'f1-score': 0.8536585365853658, 'support': 45}, 'Immigration/Integration': {'precision': 0.775, 'recall': 0.775, 'f1-score': 0.775, 'support': 40}, 'Justice/Crime': {'precision': 0.8152173913043478, 'recall': 0.7009345794392523, 'f1-score': 0.7537688442211056, 'support': 107}, 'Labor/Employment': {'precision': 0.6818181818181818, 'recall': 0

In [10]:
generic_val_average_report_df, generic_test_average_report_df = calculate_metrics("oa_with_3_random_examples_classification_only_distinct_generic")

../models/weak_labeled/oa_with_3_random_examples_classification_only_distinct_generic_epochs_200_train_size_full_fold_0/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                   ['Government/Public']
                              ...                        
3759                                 ['Labor/Employment']
3760                                 ['Labor/Employment']
3761                                        ['Education']
3762                                         ['Religion']
3763                                         ['Religion']
Name: annotations, Length: 3764, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Reg

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_3_random_examples_classification_only_distinct_generic_epochs_200_train_size_full_fold_1/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                          ['War/Terror']
                              ...                        
3759                                 ['Labor/Employment']
3760                                 ['Labor/Employment']
3761                                        ['Education']
3762                                         ['Religion']
3763                                         ['Religion']
Name: annotations, Length: 3764, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Reg

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_3_random_examples_classification_only_distinct_generic_epochs_200_train_size_full_fold_2/
0       ['Science/Technology']
1                   ['Others']
2                   ['Others']
3        ['Government/Public']
4               ['War/Terror']
                 ...          
3759      ['Labor/Employment']
3760      ['Labor/Employment']
3761             ['Education']
3762              ['Religion']
3763              ['Religion']
Name: annotations, Length: 3764, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_3_random_examples_classification_only_distinct_generic_epochs_200_train_size_full_fold_3/
0                                  ['Science/Technology']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                              ['Others']
3                                   ['Government/Public']
4                                          ['War/Terror']
                              ...                        
3759                                 ['Labor/Employment']
3760                                 ['Labor/Employment']
3761                                        ['Education']
3762                                         ['Religion']
3763                                         ['Religion']
Name: annotations, Length: 3764, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Reg

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_3_random_examples_classification_only_distinct_generic_epochs_200_train_size_full_fold_4/
0                                              ['Others']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                   ['Government/Public']
3                                          ['War/Terror']
4                                          ['War/Terror']
                              ...                        
3759                                 ['Labor/Employment']
3760                                 ['Labor/Employment']
3761                                        ['Education']
3762                                         ['Religion']
3763                                         ['Religion']
Name: annotations, Length: 3764, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Reg

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


[{'Conspiracy Theory': {'precision': 0.6956521739130435, 'recall': 0.3404255319148936, 'f1-score': 0.4571428571428571, 'support': 47}, 'Education': {'precision': 0.7, 'recall': 0.5833333333333334, 'f1-score': 0.6363636363636365, 'support': 12}, 'Election Campaign': {'precision': 0.8421052631578947, 'recall': 0.6153846153846154, 'f1-score': 0.7111111111111111, 'support': 26}, 'Environment': {'precision': 0.8, 'recall': 0.5714285714285714, 'f1-score': 0.6666666666666666, 'support': 14}, 'Government/Public': {'precision': 0.8165938864628821, 'recall': 0.7420634920634921, 'f1-score': 0.7775467775467776, 'support': 252}, 'Health': {'precision': 0.9230769230769231, 'recall': 0.8, 'f1-score': 0.8571428571428571, 'support': 45}, 'Immigration/Integration': {'precision': 0.9285714285714286, 'recall': 0.65, 'f1-score': 0.7647058823529412, 'support': 40}, 'Justice/Crime': {'precision': 0.7906976744186046, 'recall': 0.6355140186915887, 'f1-score': 0.7046632124352331, 'support': 107}, 'Labor/Employm

In [12]:
generic_val_average_report_df, generic_test_average_report_df = calculate_metrics("oa_classification_only_v03_distinct_generic")

../models/weak_labeled/oa_classification_only_v03_distinct_generic_epochs_200_train_size_full_fold_0/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                   ['Government/Public']
                              ...                        
3822                                 ['Labor/Employment']
3823                                         ['Religion']
3824                                        ['Education']
3825                               ['Science/Technology']
3826                               ['Science/Technology']
Name: annotations, Length: 3827, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Jo

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_classification_only_v03_distinct_generic_epochs_200_train_size_full_fold_1/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                          ['War/Terror']
                              ...                        
3822                                 ['Labor/Employment']
3823                                         ['Religion']
3824                                        ['Education']
3825                               ['Science/Technology']
3826                               ['Science/Technology']
Name: annotations, Length: 3827, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Jo

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_classification_only_v03_distinct_generic_epochs_200_train_size_full_fold_2/
0       ['Science/Technology']
1                   ['Others']
2                   ['Others']
3        ['Government/Public']
4               ['War/Terror']
                 ...          
3822      ['Labor/Employment']
3823              ['Religion']
3824             ['Education']
3825    ['Science/Technology']
3826    ['Science/Technology']
Name: annotations, Length: 3827, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_classification_only_v03_distinct_generic_epochs_200_train_size_full_fold_3/
0                                  ['Science/Technology']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                              ['Others']
3                                   ['Government/Public']
4                                          ['War/Terror']
                              ...                        
3822                                 ['Labor/Employment']
3823                                         ['Religion']
3824                                        ['Education']
3825                               ['Science/Technology']
3826                               ['Science/Technology']
Name: annotations, Length: 3827, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Jo

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_classification_only_v03_distinct_generic_epochs_200_train_size_full_fold_4/
0                                              ['Others']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                   ['Government/Public']
3                                          ['War/Terror']
4                                          ['War/Terror']
                              ...                        
3822                                 ['Labor/Employment']
3823                                         ['Religion']
3824                                        ['Education']
3825                               ['Science/Technology']
3826                               ['Science/Technology']
Name: annotations, Length: 3827, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Jo

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


[{'Conspiracy Theory': {'precision': 0.8571428571428571, 'recall': 0.1276595744680851, 'f1-score': 0.2222222222222222, 'support': 47}, 'Education': {'precision': 0.8571428571428571, 'recall': 0.5, 'f1-score': 0.631578947368421, 'support': 12}, 'Election Campaign': {'precision': 1.0, 'recall': 0.3076923076923077, 'f1-score': 0.47058823529411764, 'support': 26}, 'Environment': {'precision': 0.8, 'recall': 0.5714285714285714, 'f1-score': 0.6666666666666666, 'support': 14}, 'Government/Public': {'precision': 0.8212765957446808, 'recall': 0.7658730158730159, 'f1-score': 0.7926078028747434, 'support': 252}, 'Health': {'precision': 0.926829268292683, 'recall': 0.8444444444444444, 'f1-score': 0.8837209302325582, 'support': 45}, 'Immigration/Integration': {'precision': 0.88, 'recall': 0.55, 'f1-score': 0.676923076923077, 'support': 40}, 'Justice/Crime': {'precision': 0.8378378378378378, 'recall': 0.5794392523364486, 'f1-score': 0.6850828729281768, 'support': 107}, 'Labor/Employment': {'precisio

In [11]:
generic_val_average_report_df, generic_test_average_report_df = calculate_metrics("davinci_elaboration_first_v04_distinct_generic")

../models/weak_labeled/davinci_elaboration_first_v04_distinct_generic_epochs_200_train_size_full_fold_0/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                   ['Government/Public']
                              ...                        
3630                                         ['Religion']
3631                                        ['Education']
3632                                ['Conspiracy Theory']
3633                                ['Conspiracy Theory']
3634                               ['Science/Technology']
Name: annotations, Length: 3635, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/davinci_elaboration_first_v04_distinct_generic_epochs_200_train_size_full_fold_1/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                          ['War/Terror']
                              ...                        
3630                                         ['Religion']
3631                                        ['Education']
3632                                ['Conspiracy Theory']
3633                                ['Conspiracy Theory']
3634                               ['Science/Technology']
Name: annotations, Length: 3635, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/davinci_elaboration_first_v04_distinct_generic_epochs_200_train_size_full_fold_2/
0       ['Science/Technology']
1                   ['Others']
2                   ['Others']
3        ['Government/Public']
4               ['War/Terror']
                 ...          
3630              ['Religion']
3631             ['Education']
3632     ['Conspiracy Theory']
3633     ['Conspiracy Theory']
3634    ['Science/Technology']
Name: annotations, Length: 3635, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/davinci_elaboration_first_v04_distinct_generic_epochs_200_train_size_full_fold_3/
0                                  ['Science/Technology']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                              ['Others']
3                                   ['Government/Public']
4                                          ['War/Terror']
                              ...                        
3630                                         ['Religion']
3631                                        ['Education']
3632                                ['Conspiracy Theory']
3633                                ['Conspiracy Theory']
3634                               ['Science/Technology']
Name: annotations, Length: 3635, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/davinci_elaboration_first_v04_distinct_generic_epochs_200_train_size_full_fold_4/
0                                              ['Others']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                   ['Government/Public']
3                                          ['War/Terror']
4                                          ['War/Terror']
                              ...                        
3630                                         ['Religion']
3631                                        ['Education']
3632                                ['Conspiracy Theory']
3633                                ['Conspiracy Theory']
3634                               ['Science/Technology']
Name: annotations, Length: 3635, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


[{'Conspiracy Theory': {'precision': 0.6451612903225806, 'recall': 0.425531914893617, 'f1-score': 0.5128205128205128, 'support': 47}, 'Education': {'precision': 1.0, 'recall': 0.25, 'f1-score': 0.4, 'support': 12}, 'Election Campaign': {'precision': 0.8571428571428571, 'recall': 0.6923076923076923, 'f1-score': 0.7659574468085107, 'support': 26}, 'Environment': {'precision': 0.7, 'recall': 0.5, 'f1-score': 0.5833333333333334, 'support': 14}, 'Government/Public': {'precision': 0.7473684210526316, 'recall': 0.8452380952380952, 'f1-score': 0.7932960893854749, 'support': 252}, 'Health': {'precision': 0.9722222222222222, 'recall': 0.7777777777777778, 'f1-score': 0.8641975308641976, 'support': 45}, 'Immigration/Integration': {'precision': 0.896551724137931, 'recall': 0.65, 'f1-score': 0.7536231884057972, 'support': 40}, 'Justice/Crime': {'precision': 0.8194444444444444, 'recall': 0.5514018691588785, 'f1-score': 0.659217877094972, 'support': 107}, 'Labor/Employment': {'precision': 0.6875, 'rec

# Co-Occurrence

In [4]:
generic_val_average_report_df, generic_test_average_report_df = calculate_metrics("vicuna_with_rules_classification_only_co_occurrence_generic")

../models/weak_labeled/vicuna_with_rules_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_0/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                   ['Government/Public']
                              ...                        
3436                                 ['Labor/Employment']
3437                                        ['Education']
3438                                ['Conspiracy Theory']
3439                                ['Conspiracy Theory']
3440                                        ['Education']
Name: annotations, Length: 3441, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regula

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/vicuna_with_rules_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_1/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                          ['War/Terror']
                              ...                        
3436                                 ['Labor/Employment']
3437                                        ['Education']
3438                                ['Conspiracy Theory']
3439                                ['Conspiracy Theory']
3440                                        ['Education']
Name: annotations, Length: 3441, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regula

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/vicuna_with_rules_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_2/
0       ['Science/Technology']
1                   ['Others']
2                   ['Others']
3        ['Government/Public']
4               ['War/Terror']
                 ...          
3436      ['Labor/Employment']
3437             ['Education']
3438     ['Conspiracy Theory']
3439     ['Conspiracy Theory']
3440             ['Education']
Name: annotations, Length: 3441, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/vicuna_with_rules_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_3/
0                                  ['Science/Technology']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                              ['Others']
3                                   ['Government/Public']
4                                          ['War/Terror']
                              ...                        
3436                                 ['Labor/Employment']
3437                                        ['Education']
3438                                ['Conspiracy Theory']
3439                                ['Conspiracy Theory']
3440                                        ['Education']
Name: annotations, Length: 3441, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regula

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/vicuna_with_rules_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_4/
0                                              ['Others']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                   ['Government/Public']
3                                          ['War/Terror']
4                                          ['War/Terror']
                              ...                        
3436                                 ['Labor/Employment']
3437                                        ['Education']
3438                                ['Conspiracy Theory']
3439                                ['Conspiracy Theory']
3440                                        ['Education']
Name: annotations, Length: 3441, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regula

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


[{'Conspiracy Theory': {'precision': 0.5384615384615384, 'recall': 0.5957446808510638, 'f1-score': 0.5656565656565657, 'support': 47}, 'Education': {'precision': 0.75, 'recall': 0.75, 'f1-score': 0.75, 'support': 12}, 'Election Campaign': {'precision': 0.7307692307692307, 'recall': 0.7307692307692307, 'f1-score': 0.7307692307692306, 'support': 26}, 'Environment': {'precision': 0.6666666666666666, 'recall': 0.8571428571428571, 'f1-score': 0.75, 'support': 14}, 'Government/Public': {'precision': 0.8, 'recall': 0.8095238095238095, 'f1-score': 0.8047337278106509, 'support': 252}, 'Health': {'precision': 0.7586206896551724, 'recall': 0.9777777777777777, 'f1-score': 0.854368932038835, 'support': 45}, 'Immigration/Integration': {'precision': 0.7619047619047619, 'recall': 0.8, 'f1-score': 0.7804878048780488, 'support': 40}, 'Justice/Crime': {'precision': 0.8041237113402062, 'recall': 0.7289719626168224, 'f1-score': 0.7647058823529411, 'support': 107}, 'Labor/Employment': {'precision': 0.714285

In [5]:
generic_val_average_report_df, generic_test_average_report_df = calculate_metrics("oa_with_rules_classification_only_co_occurrence_generic")

../models/weak_labeled/oa_with_rules_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_0/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                   ['Government/Public']
                              ...                        
3480                                 ['Labor/Employment']
3481                                         ['Religion']
3482                                        ['Education']
3483                                ['Conspiracy Theory']
3484                                ['Conspiracy Theory']
Name: annotations, Length: 3485, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_rules_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_1/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                          ['War/Terror']
                              ...                        
3480                                 ['Labor/Employment']
3481                                         ['Religion']
3482                                        ['Education']
3483                                ['Conspiracy Theory']
3484                                ['Conspiracy Theory']
Name: annotations, Length: 3485, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_rules_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_2/
0       ['Science/Technology']
1                   ['Others']
2                   ['Others']
3        ['Government/Public']
4               ['War/Terror']
                 ...          
3480      ['Labor/Employment']
3481              ['Religion']
3482             ['Education']
3483     ['Conspiracy Theory']
3484     ['Conspiracy Theory']
Name: annotations, Length: 3485, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_rules_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_3/
0                                  ['Science/Technology']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                              ['Others']
3                                   ['Government/Public']
4                                          ['War/Terror']
                              ...                        
3480                                 ['Labor/Employment']
3481                                         ['Religion']
3482                                        ['Education']
3483                                ['Conspiracy Theory']
3484                                ['Conspiracy Theory']
Name: annotations, Length: 3485, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_rules_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_4/
0                                              ['Others']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                   ['Government/Public']
3                                          ['War/Terror']
4                                          ['War/Terror']
                              ...                        
3480                                 ['Labor/Employment']
3481                                         ['Religion']
3482                                        ['Education']
3483                                ['Conspiracy Theory']
3484                                ['Conspiracy Theory']
Name: annotations, Length: 3485, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


[{'Conspiracy Theory': {'precision': 0.6111111111111112, 'recall': 0.46808510638297873, 'f1-score': 0.5301204819277109, 'support': 47}, 'Education': {'precision': 1.0, 'recall': 0.6666666666666666, 'f1-score': 0.8, 'support': 12}, 'Election Campaign': {'precision': 0.8235294117647058, 'recall': 0.5384615384615384, 'f1-score': 0.6511627906976744, 'support': 26}, 'Environment': {'precision': 0.8461538461538461, 'recall': 0.7857142857142857, 'f1-score': 0.8148148148148148, 'support': 14}, 'Government/Public': {'precision': 0.7870722433460076, 'recall': 0.8214285714285714, 'f1-score': 0.803883495145631, 'support': 252}, 'Health': {'precision': 0.9512195121951219, 'recall': 0.8666666666666667, 'f1-score': 0.9069767441860465, 'support': 45}, 'Immigration/Integration': {'precision': 0.6888888888888889, 'recall': 0.775, 'f1-score': 0.7294117647058822, 'support': 40}, 'Justice/Crime': {'precision': 0.7872340425531915, 'recall': 0.6915887850467289, 'f1-score': 0.7363184079601991, 'support': 107}

In [6]:
generic_val_average_report_df, generic_test_average_report_df = calculate_metrics("oa_with_3_random_examples_classification_only_co_occurrence_generic")

../models/weak_labeled/oa_with_3_random_examples_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_0/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                   ['Government/Public']
                              ...                        
3759                                 ['Labor/Employment']
3760                                 ['Labor/Employment']
3761                                        ['Education']
3762                                         ['Religion']
3763                                         ['Religion']
Name: annotations, Length: 3764, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economi

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_3_random_examples_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_1/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                          ['War/Terror']
                              ...                        
3759                                 ['Labor/Employment']
3760                                 ['Labor/Employment']
3761                                        ['Education']
3762                                         ['Religion']
3763                                         ['Religion']
Name: annotations, Length: 3764, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economi

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_3_random_examples_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_2/
0       ['Science/Technology']
1                   ['Others']
2                   ['Others']
3        ['Government/Public']
4               ['War/Terror']
                 ...          
3759      ['Labor/Employment']
3760      ['Labor/Employment']
3761             ['Education']
3762              ['Religion']
3763              ['Religion']
Name: annotations, Length: 3764, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_3_random_examples_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_3/
0                                  ['Science/Technology']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                              ['Others']
3                                   ['Government/Public']
4                                          ['War/Terror']
                              ...                        
3759                                 ['Labor/Employment']
3760                                 ['Labor/Employment']
3761                                        ['Education']
3762                                         ['Religion']
3763                                         ['Religion']
Name: annotations, Length: 3764, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economi

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_with_3_random_examples_classification_only_co_occurrence_generic_epochs_200_train_size_full_fold_4/
0                                              ['Others']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                   ['Government/Public']
3                                          ['War/Terror']
4                                          ['War/Terror']
                              ...                        
3759                                 ['Labor/Employment']
3760                                 ['Labor/Employment']
3761                                        ['Education']
3762                                         ['Religion']
3763                                         ['Religion']
Name: annotations, Length: 3764, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economi

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


[{'Conspiracy Theory': {'precision': 0.5757575757575758, 'recall': 0.40425531914893614, 'f1-score': 0.47500000000000003, 'support': 47}, 'Education': {'precision': 0.8888888888888888, 'recall': 0.6666666666666666, 'f1-score': 0.761904761904762, 'support': 12}, 'Election Campaign': {'precision': 0.8888888888888888, 'recall': 0.6153846153846154, 'f1-score': 0.7272727272727274, 'support': 26}, 'Environment': {'precision': 0.7692307692307693, 'recall': 0.7142857142857143, 'f1-score': 0.7407407407407408, 'support': 14}, 'Government/Public': {'precision': 0.8181818181818182, 'recall': 0.75, 'f1-score': 0.7826086956521738, 'support': 252}, 'Health': {'precision': 0.9487179487179487, 'recall': 0.8222222222222222, 'f1-score': 0.8809523809523809, 'support': 45}, 'Immigration/Integration': {'precision': 0.8181818181818182, 'recall': 0.675, 'f1-score': 0.7397260273972603, 'support': 40}, 'Justice/Crime': {'precision': 0.8048780487804879, 'recall': 0.616822429906542, 'f1-score': 0.6984126984126984,

In [5]:
generic_val_average_report_df, generic_test_average_report_df = calculate_metrics("oa_classification_only_v03_co_occurrence_generic")

../models/weak_labeled/oa_classification_only_v03_co_occurrence_generic_epochs_200_train_size_full_fold_0/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                   ['Government/Public']
                              ...                        
3822                                 ['Labor/Employment']
3823                                         ['Religion']
3824                                        ['Education']
3825                               ['Science/Technology']
3826                               ['Science/Technology']
Name: annotations, Length: 3827, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Med

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_classification_only_v03_co_occurrence_generic_epochs_200_train_size_full_fold_1/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                          ['War/Terror']
                              ...                        
3822                                 ['Labor/Employment']
3823                                         ['Religion']
3824                                        ['Education']
3825                               ['Science/Technology']
3826                               ['Science/Technology']
Name: annotations, Length: 3827, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Med

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_classification_only_v03_co_occurrence_generic_epochs_200_train_size_full_fold_2/
0       ['Science/Technology']
1                   ['Others']
2                   ['Others']
3        ['Government/Public']
4               ['War/Terror']
                 ...          
3822      ['Labor/Employment']
3823              ['Religion']
3824             ['Education']
3825    ['Science/Technology']
3826    ['Science/Technology']
Name: annotations, Length: 3827, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_classification_only_v03_co_occurrence_generic_epochs_200_train_size_full_fold_3/
0                                  ['Science/Technology']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                              ['Others']
3                                   ['Government/Public']
4                                          ['War/Terror']
                              ...                        
3822                                 ['Labor/Employment']
3823                                         ['Religion']
3824                                        ['Education']
3825                               ['Science/Technology']
3826                               ['Science/Technology']
Name: annotations, Length: 3827, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Med

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/oa_classification_only_v03_co_occurrence_generic_epochs_200_train_size_full_fold_4/
0                                              ['Others']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                   ['Government/Public']
3                                          ['War/Terror']
4                                          ['War/Terror']
                              ...                        
3822                                 ['Labor/Employment']
3823                                         ['Religion']
3824                                        ['Education']
3825                               ['Science/Technology']
3826                               ['Science/Technology']
Name: annotations, Length: 3827, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Med

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


[{'Conspiracy Theory': {'precision': 0.7777777777777778, 'recall': 0.2978723404255319, 'f1-score': 0.43076923076923074, 'support': 47}, 'Education': {'precision': 0.7777777777777778, 'recall': 0.5833333333333334, 'f1-score': 0.6666666666666666, 'support': 12}, 'Election Campaign': {'precision': 0.8823529411764706, 'recall': 0.5769230769230769, 'f1-score': 0.6976744186046512, 'support': 26}, 'Environment': {'precision': 0.8571428571428571, 'recall': 0.42857142857142855, 'f1-score': 0.5714285714285714, 'support': 14}, 'Government/Public': {'precision': 0.8073770491803278, 'recall': 0.7817460317460317, 'f1-score': 0.7943548387096774, 'support': 252}, 'Health': {'precision': 0.9459459459459459, 'recall': 0.7777777777777778, 'f1-score': 0.8536585365853658, 'support': 45}, 'Immigration/Integration': {'precision': 0.88, 'recall': 0.55, 'f1-score': 0.676923076923077, 'support': 40}, 'Justice/Crime': {'precision': 0.8414634146341463, 'recall': 0.6448598130841121, 'f1-score': 0.73015873015873, '

In [7]:
generic_val_average_report_df, generic_test_average_report_df = calculate_metrics("davinci_elaboration_first_v04_co_occurrence_generic")

../models/weak_labeled/davinci_elaboration_first_v04_co_occurrence_generic_epochs_200_train_size_full_fold_0/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                   ['Government/Public']
                              ...                        
3630                                         ['Religion']
3631                                        ['Education']
3632                                ['Conspiracy Theory']
3633                                ['Conspiracy Theory']
3634                               ['Science/Technology']
Name: annotations, Length: 3635, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', '

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/davinci_elaboration_first_v04_co_occurrence_generic_epochs_200_train_size_full_fold_1/
0                                  ['Science/Technology']
1                                              ['Others']
2       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
3                                              ['Others']
4                                          ['War/Terror']
                              ...                        
3630                                         ['Religion']
3631                                        ['Education']
3632                                ['Conspiracy Theory']
3633                                ['Conspiracy Theory']
3634                               ['Science/Technology']
Name: annotations, Length: 3635, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', '

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/davinci_elaboration_first_v04_co_occurrence_generic_epochs_200_train_size_full_fold_2/
0       ['Science/Technology']
1                   ['Others']
2                   ['Others']
3        ['Government/Public']
4               ['War/Terror']
                 ...          
3630              ['Religion']
3631             ['Education']
3632     ['Conspiracy Theory']
3633     ['Conspiracy Theory']
3634    ['Science/Technology']
Name: annotations, Length: 3635, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/davinci_elaboration_first_v04_co_occurrence_generic_epochs_200_train_size_full_fold_3/
0                                  ['Science/Technology']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                              ['Others']
3                                   ['Government/Public']
4                                          ['War/Terror']
                              ...                        
3630                                         ['Religion']
3631                                        ['Education']
3632                                ['Conspiracy Theory']
3633                                ['Conspiracy Theory']
3634                               ['Science/Technology']
Name: annotations, Length: 3635, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', '

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/davinci_elaboration_first_v04_co_occurrence_generic_epochs_200_train_size_full_fold_4/
0                                              ['Others']
1       ['Health', 'Justice/Crime', 'Macroeconomics/Ec...
2                                   ['Government/Public']
3                                          ['War/Terror']
4                                          ['War/Terror']
                              ...                        
3630                                         ['Religion']
3631                                        ['Education']
3632                                ['Conspiracy Theory']
3633                                ['Conspiracy Theory']
3634                               ['Science/Technology']
Name: annotations, Length: 3635, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', '

  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


[{'Conspiracy Theory': {'precision': 0.6052631578947368, 'recall': 0.48936170212765956, 'f1-score': 0.5411764705882353, 'support': 47}, 'Education': {'precision': 0.8571428571428571, 'recall': 0.5, 'f1-score': 0.631578947368421, 'support': 12}, 'Election Campaign': {'precision': 0.8181818181818182, 'recall': 0.6923076923076923, 'f1-score': 0.7500000000000001, 'support': 26}, 'Environment': {'precision': 0.7272727272727273, 'recall': 0.5714285714285714, 'f1-score': 0.64, 'support': 14}, 'Government/Public': {'precision': 0.7250859106529209, 'recall': 0.8373015873015873, 'f1-score': 0.7771639042357275, 'support': 252}, 'Health': {'precision': 0.8974358974358975, 'recall': 0.7777777777777778, 'f1-score': 0.8333333333333333, 'support': 45}, 'Immigration/Integration': {'precision': 0.9285714285714286, 'recall': 0.65, 'f1-score': 0.7647058823529412, 'support': 40}, 'Justice/Crime': {'precision': 0.8235294117647058, 'recall': 0.6542056074766355, 'f1-score': 0.7291666666666665, 'support': 107}

# EDA

In [6]:
generic_val_average_report_df, generic_test_average_report_df = calculate_metrics("eda_generic")

../models/weak_labeled/eda_generic_epochs_200_train_size_full_fold_0/
0                     ['Science/Technology']
1                     ['Science/Technology']
2                     ['Science/Technology']
3                     ['Science/Technology']
4                     ['Science/Technology']
                        ...                 
15995    ['Government/Public', 'War/Terror']
15996    ['Government/Public', 'War/Terror']
15997    ['Government/Public', 'War/Terror']
15998    ['Government/Public', 'War/Terror']
15999    ['Government/Public', 'War/Terror']
Name: annotations, Length: 16000, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/eda_generic_epochs_200_train_size_full_fold_1/
0                     ['Science/Technology']
1                     ['Science/Technology']
2                     ['Science/Technology']
3                     ['Science/Technology']
4                     ['Science/Technology']
                        ...                 
15995    ['Government/Public', 'War/Terror']
15996    ['Government/Public', 'War/Terror']
15997    ['Government/Public', 'War/Terror']
15998    ['Government/Public', 'War/Terror']
15999    ['Government/Public', 'War/Terror']
Name: annotations, Length: 16000, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/eda_generic_epochs_200_train_size_full_fold_2/
0                        ['Science/Technology']
1                        ['Science/Technology']
2                        ['Science/Technology']
3                        ['Science/Technology']
4                        ['Science/Technology']
                          ...                  
15995    ['Government/Public', 'Justice/Crime']
15996    ['Government/Public', 'Justice/Crime']
15997    ['Government/Public', 'Justice/Crime']
15998    ['Government/Public', 'Justice/Crime']
15999    ['Government/Public', 'Justice/Crime']
Name: annotations, Length: 16000, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/eda_generic_epochs_200_train_size_full_fold_3/
0                     ['Science/Technology']
1                     ['Science/Technology']
2                     ['Science/Technology']
3                     ['Science/Technology']
4                     ['Science/Technology']
                        ...                 
15995    ['Government/Public', 'War/Terror']
15996    ['Government/Public', 'War/Terror']
15997    ['Government/Public', 'War/Terror']
15998    ['Government/Public', 'War/Terror']
15999    ['Government/Public', 'War/Terror']
Name: annotations, Length: 16000, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/eda_generic_epochs_200_train_size_full_fold_4/
0                                 ['Others']
1                                 ['Others']
2                                 ['Others']
3                                 ['Others']
4                                 ['Others']
                        ...                 
15995    ['Government/Public', 'War/Terror']
15996    ['Government/Public', 'War/Terror']
15997    ['Government/Public', 'War/Terror']
15998    ['Government/Public', 'War/Terror']
15999    ['Government/Public', 'War/Terror']
Name: annotations, Length: 16000, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


[{'Conspiracy Theory': {'precision': 0.8095238095238095, 'recall': 0.3617021276595745, 'f1-score': 0.5, 'support': 47}, 'Education': {'precision': 0.8888888888888888, 'recall': 0.6666666666666666, 'f1-score': 0.761904761904762, 'support': 12}, 'Election Campaign': {'precision': 0.7391304347826086, 'recall': 0.6538461538461539, 'f1-score': 0.693877551020408, 'support': 26}, 'Environment': {'precision': 0.8888888888888888, 'recall': 0.5714285714285714, 'f1-score': 0.6956521739130435, 'support': 14}, 'Government/Public': {'precision': 0.7644927536231884, 'recall': 0.8373015873015873, 'f1-score': 0.7992424242424242, 'support': 252}, 'Health': {'precision': 0.8163265306122449, 'recall': 0.8888888888888888, 'f1-score': 0.851063829787234, 'support': 45}, 'Immigration/Integration': {'precision': 0.7272727272727273, 'recall': 0.8, 'f1-score': 0.761904761904762, 'support': 40}, 'Justice/Crime': {'precision': 0.7521367521367521, 'recall': 0.822429906542056, 'f1-score': 0.7857142857142856, 'suppor

In [7]:
generic_val_average_report_df, generic_test_average_report_df = calculate_metrics("eda_alpha_0_05_n_aug_8_generic")

../models/weak_labeled/eda_alpha_0_05_n_aug_8_generic_epochs_200_train_size_full_fold_0/
0                     ['Science/Technology']
1                     ['Science/Technology']
2                     ['Science/Technology']
3                     ['Science/Technology']
4                     ['Science/Technology']
                        ...                 
28795    ['Government/Public', 'War/Terror']
28796    ['Government/Public', 'War/Terror']
28797    ['Government/Public', 'War/Terror']
28798    ['Government/Public', 'War/Terror']
28799    ['Government/Public', 'War/Terror']
Name: annotations, Length: 28800, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/eda_alpha_0_05_n_aug_8_generic_epochs_200_train_size_full_fold_1/
0                     ['Science/Technology']
1                     ['Science/Technology']
2                     ['Science/Technology']
3                     ['Science/Technology']
4                     ['Science/Technology']
                        ...                 
28795    ['Government/Public', 'War/Terror']
28796    ['Government/Public', 'War/Terror']
28797    ['Government/Public', 'War/Terror']
28798    ['Government/Public', 'War/Terror']
28799    ['Government/Public', 'War/Terror']
Name: annotations, Length: 28800, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/eda_alpha_0_05_n_aug_8_generic_epochs_200_train_size_full_fold_2/
0                        ['Science/Technology']
1                        ['Science/Technology']
2                        ['Science/Technology']
3                        ['Science/Technology']
4                        ['Science/Technology']
                          ...                  
28795    ['Government/Public', 'Justice/Crime']
28796    ['Government/Public', 'Justice/Crime']
28797    ['Government/Public', 'Justice/Crime']
28798    ['Government/Public', 'Justice/Crime']
28799    ['Government/Public', 'Justice/Crime']
Name: annotations, Length: 28800, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/eda_alpha_0_05_n_aug_8_generic_epochs_200_train_size_full_fold_3/
0                     ['Science/Technology']
1                     ['Science/Technology']
2                     ['Science/Technology']
3                     ['Science/Technology']
4                     ['Science/Technology']
                        ...                 
28795    ['Government/Public', 'War/Terror']
28796    ['Government/Public', 'War/Terror']
28797    ['Government/Public', 'War/Terror']
28798    ['Government/Public', 'War/Terror']
28799    ['Government/Public', 'War/Terror']
Name: annotations, Length: 28800, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


../models/weak_labeled/eda_alpha_0_05_n_aug_8_generic_epochs_200_train_size_full_fold_4/
0                                 ['Others']
1                                 ['Others']
2                                 ['Others']
3                                 ['Others']
4                                 ['Others']
                        ...                 
28795    ['Government/Public', 'War/Terror']
28796    ['Government/Public', 'War/Terror']
28797    ['Government/Public', 'War/Terror']
28798    ['Government/Public', 'War/Terror']
28799    ['Government/Public', 'War/Terror']
Name: annotations, Length: 28800, dtype: object
['Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Others', 'Religion', 'Science/Technology', 'War/Terror']


  'label': torch.tensor(label, dtype=torch.float32)}
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  'label': torch.tensor(label, dtype=torch.float32)}


[{'Conspiracy Theory': {'precision': 0.6538461538461539, 'recall': 0.3617021276595745, 'f1-score': 0.4657534246575342, 'support': 47}, 'Education': {'precision': 0.875, 'recall': 0.5833333333333334, 'f1-score': 0.7000000000000001, 'support': 12}, 'Election Campaign': {'precision': 0.7916666666666666, 'recall': 0.7307692307692307, 'f1-score': 0.76, 'support': 26}, 'Environment': {'precision': 0.75, 'recall': 0.6428571428571429, 'f1-score': 0.6923076923076924, 'support': 14}, 'Government/Public': {'precision': 0.7712177121771218, 'recall': 0.8293650793650794, 'f1-score': 0.7992351816443594, 'support': 252}, 'Health': {'precision': 0.8478260869565217, 'recall': 0.8666666666666667, 'f1-score': 0.8571428571428571, 'support': 45}, 'Immigration/Integration': {'precision': 0.7045454545454546, 'recall': 0.775, 'f1-score': 0.7380952380952381, 'support': 40}, 'Justice/Crime': {'precision': 0.7938144329896907, 'recall': 0.719626168224299, 'f1-score': 0.7549019607843137, 'support': 107}, 'Labor/Emp

In [30]:
generic_valid_results = [{'Conspiracy Theory': {'precision': 0.6521739130434783, 'recall': 0.3191489361702128, 'f1-score': 0.4285714285714286, 'support': 47}, 'Education': {'precision': 0.875, 'recall': 0.5833333333333334, 'f1-score': 0.7000000000000001, 'support': 12}, 'Election Campaign': {'precision': 0.8076923076923077, 'recall': 0.8076923076923077, 'f1-score': 0.8076923076923077, 'support': 26}, 'Environment': {'precision': 1.0, 'recall': 0.5714285714285714, 'f1-score': 0.7272727272727273, 'support': 14}, 'Government/Public': {'precision': 0.7686832740213523, 'recall': 0.8571428571428571, 'f1-score': 0.8105065666041276, 'support': 252}, 'Health': {'precision': 0.8571428571428571, 'recall': 0.8, 'f1-score': 0.8275862068965518, 'support': 45}, 'Immigration/Integration': {'precision': 0.8275862068965517, 'recall': 0.6, 'f1-score': 0.6956521739130435, 'support': 40}, 'Justice/Crime': {'precision': 0.7669902912621359, 'recall': 0.7383177570093458, 'f1-score': 0.7523809523809523, 'support': 107}, 'Labor/Employment': {'precision': 0.7058823529411765, 'recall': 0.631578947368421, 'f1-score': 0.6666666666666667, 'support': 19}, 'Macroeconomics/Economic Regulation': {'precision': 0.8780487804878049, 'recall': 0.6206896551724138, 'f1-score': 0.7272727272727273, 'support': 58}, 'Media/Journalism': {'precision': 0.8235294117647058, 'recall': 0.7368421052631579, 'f1-score': 0.7777777777777778, 'support': 38}, 'Others': {'precision': 0.8357487922705314, 'recall': 0.8439024390243902, 'f1-score': 0.8398058252427185, 'support': 205}, 'Religion': {'precision': 0.6666666666666666, 'recall': 0.36363636363636365, 'f1-score': 0.4705882352941177, 'support': 22}, 'Science/Technology': {'precision': 0.6666666666666666, 'recall': 0.4, 'f1-score': 0.5, 'support': 10}, 'War/Terror': {'precision': 0.9710982658959537, 'recall': 0.9081081081081082, 'f1-score': 0.9385474860335196, 'support': 185}, 'micro avg': {'precision': 0.8267326732673267, 'recall': 0.7731481481481481, 'f1-score': 0.7990430622009568, 'support': 1080}, 'macro avg': {'precision': 0.8068606524501458, 'recall': 0.6521214254232989, 'f1-score': 0.7113547387745777, 'support': 1080}, 'weighted avg': {'precision': 0.8255101143884639, 'recall': 0.7731481481481481, 'f1-score': 0.7917527953259383, 'support': 1080}, 'samples avg': {'precision': 0.8335416666666666, 'recall': 0.8048958333333335, 'f1-score': 0.8039999999999999, 'support': 1080}}, {'Conspiracy Theory': {'precision': 0.6944444444444444, 'recall': 0.4166666666666667, 'f1-score': 0.5208333333333334, 'support': 60}, 'Education': {'precision': 0.8333333333333334, 'recall': 0.625, 'f1-score': 0.7142857142857143, 'support': 16}, 'Election Campaign': {'precision': 0.8611111111111112, 'recall': 0.8857142857142857, 'f1-score': 0.8732394366197184, 'support': 35}, 'Environment': {'precision': 0.875, 'recall': 0.5, 'f1-score': 0.6363636363636364, 'support': 14}, 'Government/Public': {'precision': 0.7565543071161048, 'recall': 0.8278688524590164, 'f1-score': 0.7906066536203522, 'support': 244}, 'Health': {'precision': 0.8048780487804879, 'recall': 0.8048780487804879, 'f1-score': 0.8048780487804877, 'support': 41}, 'Immigration/Integration': {'precision': 0.6938775510204082, 'recall': 0.7555555555555555, 'f1-score': 0.723404255319149, 'support': 45}, 'Justice/Crime': {'precision': 0.7563025210084033, 'recall': 0.8181818181818182, 'f1-score': 0.7860262008733625, 'support': 110}, 'Labor/Employment': {'precision': 0.8125, 'recall': 0.8125, 'f1-score': 0.8125, 'support': 16}, 'Macroeconomics/Economic Regulation': {'precision': 0.6666666666666666, 'recall': 0.72, 'f1-score': 0.6923076923076923, 'support': 50}, 'Media/Journalism': {'precision': 0.6410256410256411, 'recall': 0.78125, 'f1-score': 0.7042253521126761, 'support': 32}, 'Others': {'precision': 0.8599033816425121, 'recall': 0.7705627705627706, 'f1-score': 0.8127853881278538, 'support': 231}, 'Religion': {'precision': 0.7692307692307693, 'recall': 0.7142857142857143, 'f1-score': 0.7407407407407408, 'support': 14}, 'Science/Technology': {'precision': 1.0, 'recall': 0.14285714285714285, 'f1-score': 0.25, 'support': 14}, 'War/Terror': {'precision': 0.9060773480662984, 'recall': 0.9213483146067416, 'f1-score': 0.913649025069638, 'support': 178}, 'micro avg': {'precision': 0.7962962962962963, 'recall': 0.7818181818181819, 'f1-score': 0.7889908256880734, 'support': 1100}, 'macro avg': {'precision': 0.7953936748964121, 'recall': 0.6997779446446799, 'f1-score': 0.7183896985036237, 'support': 1100}, 'weighted avg': {'precision': 0.8008552187842429, 'recall': 0.7818181818181819, 'f1-score': 0.7831627791467639, 'support': 1100}, 'samples avg': {'precision': 0.8075, 'recall': 0.8034375, 'f1-score': 0.7893571428571429, 'support': 1100}}, {'Conspiracy Theory': {'precision': 0.5757575757575758, 'recall': 0.4318181818181818, 'f1-score': 0.49350649350649356, 'support': 44}, 'Education': {'precision': 0.7777777777777778, 'recall': 0.6363636363636364, 'f1-score': 0.7000000000000001, 'support': 11}, 'Election Campaign': {'precision': 0.6666666666666666, 'recall': 0.72, 'f1-score': 0.6923076923076923, 'support': 25}, 'Environment': {'precision': 1.0, 'recall': 0.7272727272727273, 'f1-score': 0.8421052631578948, 'support': 11}, 'Government/Public': {'precision': 0.8617511520737328, 'recall': 0.7540322580645161, 'f1-score': 0.8043010752688172, 'support': 248}, 'Health': {'precision': 0.8125, 'recall': 0.65, 'f1-score': 0.7222222222222223, 'support': 40}, 'Immigration/Integration': {'precision': 0.8, 'recall': 0.8275862068965517, 'f1-score': 0.8135593220338982, 'support': 29}, 'Justice/Crime': {'precision': 0.8303571428571429, 'recall': 0.808695652173913, 'f1-score': 0.8193832599118942, 'support': 115}, 'Labor/Employment': {'precision': 0.7692307692307693, 'recall': 0.5, 'f1-score': 0.6060606060606061, 'support': 20}, 'Macroeconomics/Economic Regulation': {'precision': 0.7073170731707317, 'recall': 0.6041666666666666, 'f1-score': 0.6516853932584269, 'support': 48}, 'Media/Journalism': {'precision': 0.72, 'recall': 0.8, 'f1-score': 0.7578947368421052, 'support': 45}, 'Others': {'precision': 0.8442211055276382, 'recall': 0.7962085308056872, 'f1-score': 0.8195121951219513, 'support': 211}, 'Religion': {'precision': 0.5, 'recall': 0.1, 'f1-score': 0.16666666666666669, 'support': 10}, 'Science/Technology': {'precision': 1.0, 'recall': 0.18181818181818182, 'f1-score': 0.3076923076923077, 'support': 11}, 'War/Terror': {'precision': 0.9004739336492891, 'recall': 0.95, 'f1-score': 0.924574209245742, 'support': 200}, 'micro avg': {'precision': 0.8296146044624746, 'recall': 0.7659176029962547, 'f1-score': 0.7964946445959105, 'support': 1068}, 'macro avg': {'precision': 0.7844035464474215, 'recall': 0.6325308027920041, 'f1-score': 0.674764762886448, 'support': 1068}, 'weighted avg': {'precision': 0.8262380033627459, 'recall': 0.7659176029962547, 'f1-score': 0.7889078257750887, 'support': 1068}, 'samples avg': {'precision': 0.8297916666666667, 'recall': 0.7947916666666666, 'f1-score': 0.7966428571428571, 'support': 1068}}, {'Conspiracy Theory': {'precision': 0.7435897435897436, 'recall': 0.6041666666666666, 'f1-score': 0.6666666666666667, 'support': 48}, 'Education': {'precision': 0.5, 'recall': 0.2, 'f1-score': 0.28571428571428575, 'support': 10}, 'Election Campaign': {'precision': 0.8333333333333334, 'recall': 0.8333333333333334, 'f1-score': 0.8333333333333334, 'support': 24}, 'Environment': {'precision': 1.0, 'recall': 0.375, 'f1-score': 0.5454545454545454, 'support': 8}, 'Government/Public': {'precision': 0.7906976744186046, 'recall': 0.816, 'f1-score': 0.8031496062992125, 'support': 250}, 'Health': {'precision': 0.803921568627451, 'recall': 0.8367346938775511, 'f1-score': 0.8200000000000001, 'support': 49}, 'Immigration/Integration': {'precision': 0.8648648648648649, 'recall': 0.7111111111111111, 'f1-score': 0.7804878048780488, 'support': 45}, 'Justice/Crime': {'precision': 0.7421875, 'recall': 0.8189655172413793, 'f1-score': 0.7786885245901638, 'support': 116}, 'Labor/Employment': {'precision': 0.9166666666666666, 'recall': 0.4583333333333333, 'f1-score': 0.611111111111111, 'support': 24}, 'Macroeconomics/Economic Regulation': {'precision': 0.8979591836734694, 'recall': 0.7857142857142857, 'f1-score': 0.838095238095238, 'support': 56}, 'Media/Journalism': {'precision': 0.875, 'recall': 0.5384615384615384, 'f1-score': 0.6666666666666667, 'support': 39}, 'Others': {'precision': 0.8102564102564103, 'recall': 0.8404255319148937, 'f1-score': 0.825065274151436, 'support': 188}, 'Religion': {'precision': 0.875, 'recall': 0.875, 'f1-score': 0.875, 'support': 16}, 'Science/Technology': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 7}, 'War/Terror': {'precision': 0.945054945054945, 'recall': 0.8686868686868687, 'f1-score': 0.9052631578947369, 'support': 198}, 'micro avg': {'precision': 0.8277886497064579, 'recall': 0.7847866419294991, 'f1-score': 0.8057142857142858, 'support': 1078}, 'macro avg': {'precision': 0.7732354593656993, 'recall': 0.6374621920227308, 'f1-score': 0.682313080990363, 'support': 1078}, 'weighted avg': {'precision': 0.8261886369326729, 'recall': 0.7847866419294991, 'f1-score': 0.7992508141662166, 'support': 1078}, 'samples avg': {'precision': 0.844375, 'recall': 0.8184583333333333, 'f1-score': 0.8152053571428571, 'support': 1078}}, {'Conspiracy Theory': {'precision': 0.7272727272727273, 'recall': 0.43636363636363634, 'f1-score': 0.5454545454545455, 'support': 55}, 'Education': {'precision': 0.6666666666666666, 'recall': 0.7142857142857143, 'f1-score': 0.689655172413793, 'support': 14}, 'Election Campaign': {'precision': 0.782608695652174, 'recall': 0.782608695652174, 'f1-score': 0.782608695652174, 'support': 23}, 'Environment': {'precision': 0.8571428571428571, 'recall': 0.5454545454545454, 'f1-score': 0.6666666666666665, 'support': 11}, 'Government/Public': {'precision': 0.7838827838827839, 'recall': 0.84251968503937, 'f1-score': 0.812144212523719, 'support': 254}, 'Health': {'precision': 0.8461538461538461, 'recall': 0.8461538461538461, 'f1-score': 0.8461538461538461, 'support': 39}, 'Immigration/Integration': {'precision': 1.0, 'recall': 0.5714285714285714, 'f1-score': 0.7272727272727273, 'support': 42}, 'Justice/Crime': {'precision': 0.8108108108108109, 'recall': 0.7258064516129032, 'f1-score': 0.7659574468085107, 'support': 124}, 'Labor/Employment': {'precision': 0.9, 'recall': 0.5, 'f1-score': 0.6428571428571429, 'support': 18}, 'Macroeconomics/Economic Regulation': {'precision': 0.6666666666666666, 'recall': 0.7368421052631579, 'f1-score': 0.7, 'support': 38}, 'Media/Journalism': {'precision': 0.7142857142857143, 'recall': 0.5, 'f1-score': 0.588235294117647, 'support': 30}, 'Others': {'precision': 0.8701923076923077, 'recall': 0.8153153153153153, 'f1-score': 0.841860465116279, 'support': 222}, 'Religion': {'precision': 0.35, 'recall': 0.7777777777777778, 'f1-score': 0.48275862068965514, 'support': 9}, 'Science/Technology': {'precision': 0.3333333333333333, 'recall': 0.16666666666666666, 'f1-score': 0.2222222222222222, 'support': 12}, 'War/Terror': {'precision': 0.9021739130434783, 'recall': 0.9325842696629213, 'f1-score': 0.9171270718232043, 'support': 178}, 'micro avg': {'precision': 0.8139763779527559, 'recall': 0.7736202057998129, 'f1-score': 0.7932853717026378, 'support': 1069}, 'macro avg': {'precision': 0.7474126881735578, 'recall': 0.6595871520451067, 'f1-score': 0.6820649419848087, 'support': 1069}, 'weighted avg': {'precision': 0.8187934761459387, 'recall': 0.7736202057998129, 'f1-score': 0.7883713165257594, 'support': 1069}, 'samples avg': {'precision': 0.8177083333333333, 'recall': 0.7994791666666667, 'f1-score': 0.794375, 'support': 1069}}]

generic_test_results = [{'Conspiracy Theory': {'precision': 0.7, 'recall': 0.3111111111111111, 'f1-score': 0.43076923076923074, 'support': 45}, 'Education': {'precision': 0.4666666666666667, 'recall': 0.5384615384615384, 'f1-score': 0.5, 'support': 13}, 'Election Campaign': {'precision': 0.8235294117647058, 'recall': 0.8484848484848485, 'f1-score': 0.8358208955223881, 'support': 33}, 'Environment': {'precision': 0.875, 'recall': 0.5, 'f1-score': 0.6363636363636364, 'support': 14}, 'Government/Public': {'precision': 0.7396825396825397, 'recall': 0.8006872852233677, 'f1-score': 0.768976897689769, 'support': 291}, 'Health': {'precision': 0.7352941176470589, 'recall': 0.5434782608695652, 'f1-score': 0.625, 'support': 46}, 'Immigration/Integration': {'precision': 0.7727272727272727, 'recall': 0.4722222222222222, 'f1-score': 0.5862068965517242, 'support': 36}, 'Justice/Crime': {'precision': 0.7947019867549668, 'recall': 0.8759124087591241, 'f1-score': 0.8333333333333334, 'support': 137}, 'Labor/Employment': {'precision': 0.7, 'recall': 0.5, 'f1-score': 0.5833333333333334, 'support': 28}, 'Macroeconomics/Economic Regulation': {'precision': 0.8333333333333334, 'recall': 0.6451612903225806, 'f1-score': 0.7272727272727272, 'support': 62}, 'Media/Journalism': {'precision': 0.8181818181818182, 'recall': 0.5625, 'f1-score': 0.6666666666666666, 'support': 48}, 'Others': {'precision': 0.8284671532846716, 'recall': 0.8376383763837638, 'f1-score': 0.83302752293578, 'support': 271}, 'Religion': {'precision': 0.6666666666666666, 'recall': 0.5454545454545454, 'f1-score': 0.6, 'support': 11}, 'Science/Technology': {'precision': 0.5, 'recall': 0.36363636363636365, 'f1-score': 0.4210526315789474, 'support': 11}, 'War/Terror': {'precision': 0.9517543859649122, 'recall': 0.8509803921568627, 'f1-score': 0.898550724637681, 'support': 255}, 'micro avg': {'precision': 0.8088597210828548, 'recall': 0.7578785549577248, 'f1-score': 0.7825396825396824, 'support': 1301}, 'macro avg': {'precision': 0.7470670235116408, 'recall': 0.6130485762057263, 'f1-score': 0.6630916331103479, 'support': 1301}, 'weighted avg': {'precision': 0.8096397647592806, 'recall': 0.7578785549577248, 'f1-score': 0.7760990798508935, 'support': 1301}, 'samples avg': {'precision': 0.8225, 'recall': 0.7977333333333333, 'f1-score': 0.7935380952380953, 'support': 1301}}, {'Conspiracy Theory': {'precision': 0.5945945945945946, 'recall': 0.4888888888888889, 'f1-score': 0.5365853658536586, 'support': 45}, 'Education': {'precision': 0.4375, 'recall': 0.5384615384615384, 'f1-score': 0.4827586206896552, 'support': 13}, 'Election Campaign': {'precision': 0.875, 'recall': 0.8484848484848485, 'f1-score': 0.8615384615384615, 'support': 33}, 'Environment': {'precision': 0.8181818181818182, 'recall': 0.6428571428571429, 'f1-score': 0.7200000000000001, 'support': 14}, 'Government/Public': {'precision': 0.7583892617449665, 'recall': 0.7766323024054983, 'f1-score': 0.767402376910017, 'support': 291}, 'Health': {'precision': 0.7073170731707317, 'recall': 0.6304347826086957, 'f1-score': 0.6666666666666667, 'support': 46}, 'Immigration/Integration': {'precision': 0.6363636363636364, 'recall': 0.7777777777777778, 'f1-score': 0.7000000000000001, 'support': 36}, 'Justice/Crime': {'precision': 0.7793103448275862, 'recall': 0.8248175182481752, 'f1-score': 0.8014184397163121, 'support': 137}, 'Labor/Employment': {'precision': 0.5625, 'recall': 0.6428571428571429, 'f1-score': 0.6000000000000001, 'support': 28}, 'Macroeconomics/Economic Regulation': {'precision': 0.7868852459016393, 'recall': 0.7741935483870968, 'f1-score': 0.7804878048780488, 'support': 62}, 'Media/Journalism': {'precision': 0.6862745098039216, 'recall': 0.7291666666666666, 'f1-score': 0.7070707070707071, 'support': 48}, 'Others': {'precision': 0.81640625, 'recall': 0.7712177121771218, 'f1-score': 0.7931688804554079, 'support': 271}, 'Religion': {'precision': 0.5, 'recall': 0.6363636363636364, 'f1-score': 0.56, 'support': 11}, 'Science/Technology': {'precision': 0.5, 'recall': 0.45454545454545453, 'f1-score': 0.47619047619047616, 'support': 11}, 'War/Terror': {'precision': 0.9473684210526315, 'recall': 0.9176470588235294, 'f1-score': 0.9322709163346613, 'support': 255}, 'micro avg': {'precision': 0.7861003861003861, 'recall': 0.7824750192159877, 'f1-score': 0.7842835130970724, 'support': 1301}, 'macro avg': {'precision': 0.6937394103761018, 'recall': 0.6969564013035476, 'f1-score': 0.6923705810869382, 'support': 1301}, 'weighted avg': {'precision': 0.7893769206228457, 'recall': 0.7824750192159877, 'f1-score': 0.784868286472289, 'support': 1301}, 'samples avg': {'precision': 0.8025833333333332, 'recall': 0.8094333333333332, 'f1-score': 0.7909380952380952, 'support': 1301}}, {'Conspiracy Theory': {'precision': 0.5, 'recall': 0.4444444444444444, 'f1-score': 0.47058823529411764, 'support': 45}, 'Education': {'precision': 0.5, 'recall': 0.38461538461538464, 'f1-score': 0.4347826086956522, 'support': 13}, 'Election Campaign': {'precision': 0.7894736842105263, 'recall': 0.9090909090909091, 'f1-score': 0.8450704225352113, 'support': 33}, 'Environment': {'precision': 0.7, 'recall': 0.5, 'f1-score': 0.5833333333333334, 'support': 14}, 'Government/Public': {'precision': 0.8146551724137931, 'recall': 0.6494845360824743, 'f1-score': 0.722753346080306, 'support': 291}, 'Health': {'precision': 0.8055555555555556, 'recall': 0.6304347826086957, 'f1-score': 0.7073170731707318, 'support': 46}, 'Immigration/Integration': {'precision': 0.8285714285714286, 'recall': 0.8055555555555556, 'f1-score': 0.8169014084507044, 'support': 36}, 'Justice/Crime': {'precision': 0.8409090909090909, 'recall': 0.8102189781021898, 'f1-score': 0.825278810408922, 'support': 137}, 'Labor/Employment': {'precision': 0.6842105263157895, 'recall': 0.4642857142857143, 'f1-score': 0.5531914893617021, 'support': 28}, 'Macroeconomics/Economic Regulation': {'precision': 0.8125, 'recall': 0.6290322580645161, 'f1-score': 0.7090909090909092, 'support': 62}, 'Media/Journalism': {'precision': 0.68, 'recall': 0.7083333333333334, 'f1-score': 0.6938775510204083, 'support': 48}, 'Others': {'precision': 0.8237410071942446, 'recall': 0.8450184501845018, 'f1-score': 0.8342440801457194, 'support': 271}, 'Religion': {'precision': 0.5, 'recall': 0.36363636363636365, 'f1-score': 0.4210526315789474, 'support': 11}, 'Science/Technology': {'precision': 0.6666666666666666, 'recall': 0.18181818181818182, 'f1-score': 0.28571428571428575, 'support': 11}, 'War/Terror': {'precision': 0.9428571428571428, 'recall': 0.9058823529411765, 'f1-score': 0.9239999999999999, 'support': 255}, 'micro avg': {'precision': 0.8209459459459459, 'recall': 0.7471176018447349, 'f1-score': 0.7822937625754527, 'support': 1301}, 'macro avg': {'precision': 0.7259426849796159, 'recall': 0.615456749650896, 'f1-score': 0.6551464123253967, 'support': 1301}, 'weighted avg': {'precision': 0.8168136599834133, 'recall': 0.7471176018447349, 'f1-score': 0.7766685582179377, 'support': 1301}, 'samples avg': {'precision': 0.8228333333333333, 'recall': 0.7862333333333333, 'f1-score': 0.7882714285714286, 'support': 1301}}, {'Conspiracy Theory': {'precision': 0.5365853658536586, 'recall': 0.4888888888888889, 'f1-score': 0.5116279069767442, 'support': 45}, 'Education': {'precision': 0.42857142857142855, 'recall': 0.46153846153846156, 'f1-score': 0.4444444444444445, 'support': 13}, 'Election Campaign': {'precision': 0.8, 'recall': 0.8484848484848485, 'f1-score': 0.823529411764706, 'support': 33}, 'Environment': {'precision': 0.8333333333333334, 'recall': 0.35714285714285715, 'f1-score': 0.5, 'support': 14}, 'Government/Public': {'precision': 0.7793103448275862, 'recall': 0.7766323024054983, 'f1-score': 0.7779690189328744, 'support': 291}, 'Health': {'precision': 0.7297297297297297, 'recall': 0.5869565217391305, 'f1-score': 0.6506024096385542, 'support': 46}, 'Immigration/Integration': {'precision': 0.8, 'recall': 0.6666666666666666, 'f1-score': 0.7272727272727272, 'support': 36}, 'Justice/Crime': {'precision': 0.782312925170068, 'recall': 0.8394160583941606, 'f1-score': 0.8098591549295774, 'support': 137}, 'Labor/Employment': {'precision': 0.5806451612903226, 'recall': 0.6428571428571429, 'f1-score': 0.6101694915254238, 'support': 28}, 'Macroeconomics/Economic Regulation': {'precision': 0.7213114754098361, 'recall': 0.7096774193548387, 'f1-score': 0.7154471544715446, 'support': 62}, 'Media/Journalism': {'precision': 0.84375, 'recall': 0.5625, 'f1-score': 0.675, 'support': 48}, 'Others': {'precision': 0.7881944444444444, 'recall': 0.8376383763837638, 'f1-score': 0.8121645796064401, 'support': 271}, 'Religion': {'precision': 0.6153846153846154, 'recall': 0.7272727272727273, 'f1-score': 0.6666666666666667, 'support': 11}, 'Science/Technology': {'precision': 1.0, 'recall': 0.36363636363636365, 'f1-score': 0.5333333333333333, 'support': 11}, 'War/Terror': {'precision': 0.9563318777292577, 'recall': 0.8588235294117647, 'f1-score': 0.9049586776859504, 'support': 255}, 'micro avg': {'precision': 0.794912559618442, 'recall': 0.7686395080707148, 'f1-score': 0.7815552950371238, 'support': 1301}, 'macro avg': {'precision': 0.746364046782952, 'recall': 0.6485421442784742, 'f1-score': 0.677536331816599, 'support': 1301}, 'weighted avg': {'precision': 0.8000169987612445, 'recall': 0.7686395080707148, 'f1-score': 0.7796543083498748, 'support': 1301}, 'samples avg': {'precision': 0.8118333333333333, 'recall': 0.7970499999999999, 'f1-score': 0.7901960317460317, 'support': 1301}}, {'Conspiracy Theory': {'precision': 0.6, 'recall': 0.3333333333333333, 'f1-score': 0.42857142857142855, 'support': 45}, 'Education': {'precision': 0.5, 'recall': 0.5384615384615384, 'f1-score': 0.5185185185185186, 'support': 13}, 'Election Campaign': {'precision': 0.8387096774193549, 'recall': 0.7878787878787878, 'f1-score': 0.8125, 'support': 33}, 'Environment': {'precision': 0.8888888888888888, 'recall': 0.5714285714285714, 'f1-score': 0.6956521739130435, 'support': 14}, 'Government/Public': {'precision': 0.7652733118971061, 'recall': 0.8178694158075601, 'f1-score': 0.7906976744186045, 'support': 291}, 'Health': {'precision': 0.7297297297297297, 'recall': 0.5869565217391305, 'f1-score': 0.6506024096385542, 'support': 46}, 'Immigration/Integration': {'precision': 0.8846153846153846, 'recall': 0.6388888888888888, 'f1-score': 0.7419354838709676, 'support': 36}, 'Justice/Crime': {'precision': 0.8384615384615385, 'recall': 0.7956204379562044, 'f1-score': 0.8164794007490638, 'support': 137}, 'Labor/Employment': {'precision': 0.64, 'recall': 0.5714285714285714, 'f1-score': 0.6037735849056605, 'support': 28}, 'Macroeconomics/Economic Regulation': {'precision': 0.8269230769230769, 'recall': 0.6935483870967742, 'f1-score': 0.7543859649122807, 'support': 62}, 'Media/Journalism': {'precision': 0.8611111111111112, 'recall': 0.6458333333333334, 'f1-score': 0.738095238095238, 'support': 48}, 'Others': {'precision': 0.8713692946058091, 'recall': 0.7749077490774908, 'f1-score': 0.8203125, 'support': 271}, 'Religion': {'precision': 0.6428571428571429, 'recall': 0.8181818181818182, 'f1-score': 0.7200000000000001, 'support': 11}, 'Science/Technology': {'precision': 0.4444444444444444, 'recall': 0.36363636363636365, 'f1-score': 0.39999999999999997, 'support': 11}, 'War/Terror': {'precision': 0.9105058365758755, 'recall': 0.9176470588235294, 'f1-score': 0.9140624999999999, 'support': 255}, 'micro avg': {'precision': 0.8216926869350862, 'recall': 0.7686395080707148, 'f1-score': 0.7942811755361397, 'support': 1301}, 'macro avg': {'precision': 0.7495259625019642, 'recall': 0.6570413851381264, 'f1-score': 0.6937057918395574, 'support': 1301}, 'weighted avg': {'precision': 0.8204475776112201, 'recall': 0.7686395080707148, 'f1-score': 0.7901483178423465, 'support': 1301}, 'samples avg': {'precision': 0.8249833333333334, 'recall': 0.8010166666666666, 'f1-score': 0.7992119047619047, 'support': 1301}}]
venezuela_valid_results = [{'Conspiracy Theory': {'precision': 0.6216216216216216, 'recall': 0.5111111111111111, 'f1-score': 0.5609756097560976, 'support': 45}, 'Education': {'precision': 0.8571428571428571, 'recall': 0.5, 'f1-score': 0.631578947368421, 'support': 12}, 'Election Campaign': {'precision': 0.7241379310344828, 'recall': 0.84, 'f1-score': 0.7777777777777777, 'support': 25}, 'Environment': {'precision': 0.6666666666666666, 'recall': 0.5714285714285714, 'f1-score': 0.6153846153846153, 'support': 14}, 'Government/Public': {'precision': 0.7215909090909091, 'recall': 0.671957671957672, 'f1-score': 0.695890410958904, 'support': 189}, 'Health': {'precision': 0.851063829787234, 'recall': 0.7017543859649122, 'f1-score': 0.7692307692307693, 'support': 57}, 'Immigration/Integration': {'precision': 0.8095238095238095, 'recall': 0.6296296296296297, 'f1-score': 0.7083333333333334, 'support': 27}, 'Justice/Crime': {'precision': 0.803030303030303, 'recall': 0.8983050847457628, 'f1-score': 0.8480000000000001, 'support': 118}, 'Labor/Employment': {'precision': 0.7272727272727273, 'recall': 0.6956521739130435, 'f1-score': 0.711111111111111, 'support': 23}, 'Macroeconomics/Economic Regulation': {'precision': 0.7843137254901961, 'recall': 0.7142857142857143, 'f1-score': 0.7476635514018691, 'support': 56}, 'Media/Journalism': {'precision': 0.7916666666666666, 'recall': 0.6333333333333333, 'f1-score': 0.7037037037037038, 'support': 30}, 'Others': {'precision': 0.8805970149253731, 'recall': 0.8009049773755657, 'f1-score': 0.8388625592417063, 'support': 221}, 'Religion': {'precision': 0.3333333333333333, 'recall': 0.5, 'f1-score': 0.4, 'support': 8}, 'Science/Technology': {'precision': 0.3333333333333333, 'recall': 0.16666666666666666, 'f1-score': 0.2222222222222222, 'support': 6}, 'War/Terror': {'precision': 0.9585253456221198, 'recall': 0.9411764705882353, 'f1-score': 0.9497716894977167, 'support': 221}, 'micro avg': {'precision': 0.8203834510595358, 'recall': 0.7728136882129277, 'f1-score': 0.7958883994126285, 'support': 1052}, 'macro avg': {'precision': 0.7242546716361089, 'recall': 0.651747052733348, 'f1-score': 0.6787004200658832, 'support': 1052}, 'weighted avg': {'precision': 0.8200694461036517, 'recall': 0.7728136882129277, 'f1-score': 0.7933445325423749, 'support': 1052}, 'samples avg': {'precision': 0.8270833333333333, 'recall': 0.8021458333333333, 'f1-score': 0.8006140873015872, 'support': 1052}}, {'Conspiracy Theory': {'precision': 0.8, 'recall': 0.2926829268292683, 'f1-score': 0.4285714285714285, 'support': 41}, 'Education': {'precision': 0.5714285714285714, 'recall': 0.4444444444444444, 'f1-score': 0.5, 'support': 9}, 'Election Campaign': {'precision': 0.7083333333333334, 'recall': 0.7727272727272727, 'f1-score': 0.7391304347826088, 'support': 22}, 'Environment': {'precision': 1.0, 'recall': 0.4, 'f1-score': 0.5714285714285715, 'support': 10}, 'Government/Public': {'precision': 0.6564102564102564, 'recall': 0.735632183908046, 'f1-score': 0.6937669376693767, 'support': 174}, 'Health': {'precision': 0.8529411764705882, 'recall': 0.6744186046511628, 'f1-score': 0.7532467532467532, 'support': 43}, 'Immigration/Integration': {'precision': 0.9523809523809523, 'recall': 0.7407407407407407, 'f1-score': 0.8333333333333334, 'support': 27}, 'Justice/Crime': {'precision': 0.8681318681318682, 'recall': 0.7821782178217822, 'f1-score': 0.8229166666666666, 'support': 101}, 'Labor/Employment': {'precision': 0.75, 'recall': 0.4838709677419355, 'f1-score': 0.5882352941176471, 'support': 31}, 'Macroeconomics/Economic Regulation': {'precision': 0.7755102040816326, 'recall': 0.6229508196721312, 'f1-score': 0.6909090909090909, 'support': 61}, 'Media/Journalism': {'precision': 0.7, 'recall': 0.4827586206896552, 'f1-score': 0.5714285714285714, 'support': 29}, 'Others': {'precision': 0.8512396694214877, 'recall': 0.8442622950819673, 'f1-score': 0.8477366255144033, 'support': 244}, 'Religion': {'precision': 0.6666666666666666, 'recall': 0.3333333333333333, 'f1-score': 0.4444444444444444, 'support': 6}, 'Science/Technology': {'precision': 1.0, 'recall': 0.25, 'f1-score': 0.4, 'support': 8}, 'War/Terror': {'precision': 0.9319148936170213, 'recall': 0.9279661016949152, 'f1-score': 0.9299363057324841, 'support': 236}, 'micro avg': {'precision': 0.8201663201663202, 'recall': 0.7571976967370442, 'f1-score': 0.787425149700599, 'support': 1042}, 'macro avg': {'precision': 0.8056638394628252, 'recall': 0.5858644352891103, 'f1-score': 0.6543389638563588, 'support': 1042}, 'weighted avg': {'precision': 0.823708413394294, 'recall': 0.7571976967370442, 'f1-score': 0.7791725272628834, 'support': 1042}, 'samples avg': {'precision': 0.8220833333333333, 'recall': 0.795625, 'f1-score': 0.7931011904761904, 'support': 1042}}, {'Conspiracy Theory': {'precision': 0.7941176470588235, 'recall': 0.5192307692307693, 'f1-score': 0.627906976744186, 'support': 52}, 'Education': {'precision': 0.6666666666666666, 'recall': 0.6666666666666666, 'f1-score': 0.6666666666666666, 'support': 12}, 'Election Campaign': {'precision': 1.0, 'recall': 0.6842105263157895, 'f1-score': 0.8125000000000001, 'support': 19}, 'Environment': {'precision': 1.0, 'recall': 0.4, 'f1-score': 0.5714285714285715, 'support': 10}, 'Government/Public': {'precision': 0.6771300448430493, 'recall': 0.7947368421052632, 'f1-score': 0.7312348668280871, 'support': 190}, 'Health': {'precision': 0.9032258064516129, 'recall': 0.9032258064516129, 'f1-score': 0.9032258064516129, 'support': 31}, 'Immigration/Integration': {'precision': 0.7368421052631579, 'recall': 0.6363636363636364, 'f1-score': 0.6829268292682926, 'support': 22}, 'Justice/Crime': {'precision': 0.8315789473684211, 'recall': 0.797979797979798, 'f1-score': 0.8144329896907218, 'support': 99}, 'Labor/Employment': {'precision': 0.7647058823529411, 'recall': 0.52, 'f1-score': 0.6190476190476191, 'support': 25}, 'Macroeconomics/Economic Regulation': {'precision': 0.7678571428571429, 'recall': 0.7543859649122807, 'f1-score': 0.7610619469026548, 'support': 57}, 'Media/Journalism': {'precision': 0.9090909090909091, 'recall': 0.5128205128205128, 'f1-score': 0.6557377049180326, 'support': 39}, 'Others': {'precision': 0.8689320388349514, 'recall': 0.8136363636363636, 'f1-score': 0.8403755868544601, 'support': 220}, 'Religion': {'precision': 0.75, 'recall': 0.2727272727272727, 'f1-score': 0.39999999999999997, 'support': 11}, 'Science/Technology': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13}, 'War/Terror': {'precision': 0.9606986899563319, 'recall': 0.9016393442622951, 'f1-score': 0.930232558139535, 'support': 244}, 'micro avg': {'precision': 0.8310880829015544, 'recall': 0.7681992337164751, 'f1-score': 0.798407167745147, 'support': 1044}, 'macro avg': {'precision': 0.7753897253829339, 'recall': 0.6118415668981508, 'f1-score': 0.6677852081960294, 'support': 1044}, 'weighted avg': {'precision': 0.8291678609403849, 'recall': 0.7681992337164751, 'f1-score': 0.7903071787941347, 'support': 1044}, 'samples avg': {'precision': 0.8336458333333334, 'recall': 0.8032291666666665, 'f1-score': 0.8043095238095238, 'support': 1044}}, {'Conspiracy Theory': {'precision': 0.5, 'recall': 0.5588235294117647, 'f1-score': 0.5277777777777778, 'support': 34}, 'Education': {'precision': 0.8, 'recall': 0.5714285714285714, 'f1-score': 0.6666666666666666, 'support': 14}, 'Election Campaign': {'precision': 0.8421052631578947, 'recall': 0.7619047619047619, 'f1-score': 0.8, 'support': 21}, 'Environment': {'precision': 0.75, 'recall': 0.6428571428571429, 'f1-score': 0.6923076923076924, 'support': 14}, 'Government/Public': {'precision': 0.7567567567567568, 'recall': 0.7446808510638298, 'f1-score': 0.7506702412868633, 'support': 188}, 'Health': {'precision': 0.8260869565217391, 'recall': 0.7307692307692307, 'f1-score': 0.7755102040816326, 'support': 52}, 'Immigration/Integration': {'precision': 0.84, 'recall': 0.5675675675675675, 'f1-score': 0.6774193548387097, 'support': 37}, 'Justice/Crime': {'precision': 0.8947368421052632, 'recall': 0.8252427184466019, 'f1-score': 0.8585858585858587, 'support': 103}, 'Labor/Employment': {'precision': 0.6428571428571429, 'recall': 0.5294117647058824, 'f1-score': 0.5806451612903226, 'support': 17}, 'Macroeconomics/Economic Regulation': {'precision': 0.8918918918918919, 'recall': 0.6226415094339622, 'f1-score': 0.7333333333333333, 'support': 53}, 'Media/Journalism': {'precision': 0.875, 'recall': 0.65625, 'f1-score': 0.75, 'support': 32}, 'Others': {'precision': 0.8851674641148325, 'recall': 0.7676348547717843, 'f1-score': 0.8222222222222222, 'support': 241}, 'Religion': {'precision': 0.4, 'recall': 0.2857142857142857, 'f1-score': 0.3333333333333333, 'support': 14}, 'Science/Technology': {'precision': 0.5, 'recall': 0.125, 'f1-score': 0.2, 'support': 8}, 'War/Terror': {'precision': 0.8725868725868726, 'recall': 0.9576271186440678, 'f1-score': 0.9131313131313131, 'support': 236}, 'micro avg': {'precision': 0.8274111675126904, 'recall': 0.7659774436090225, 'f1-score': 0.7955100048804296, 'support': 1064}, 'macro avg': {'precision': 0.7518126126661595, 'recall': 0.6231702604479635, 'f1-score': 0.6721068772570484, 'support': 1064}, 'weighted avg': {'precision': 0.8269770674375722, 'recall': 0.7659774436090225, 'f1-score': 0.790770984527679, 'support': 1064}, 'samples avg': {'precision': 0.826875, 'recall': 0.7987083333333334, 'f1-score': 0.7997063492063492, 'support': 1064}}, {'Conspiracy Theory': {'precision': 0.8260869565217391, 'recall': 0.4634146341463415, 'f1-score': 0.59375, 'support': 41}, 'Education': {'precision': 1.0, 'recall': 0.21428571428571427, 'f1-score': 0.35294117647058826, 'support': 14}, 'Election Campaign': {'precision': 0.8947368421052632, 'recall': 0.5483870967741935, 'f1-score': 0.6799999999999999, 'support': 31}, 'Environment': {'precision': 1.0, 'recall': 0.16666666666666666, 'f1-score': 0.2857142857142857, 'support': 12}, 'Government/Public': {'precision': 0.7333333333333333, 'recall': 0.7021276595744681, 'f1-score': 0.7173913043478262, 'support': 188}, 'Health': {'precision': 0.7954545454545454, 'recall': 0.7954545454545454, 'f1-score': 0.7954545454545455, 'support': 44}, 'Immigration/Integration': {'precision': 0.9333333333333333, 'recall': 0.5384615384615384, 'f1-score': 0.6829268292682926, 'support': 26}, 'Justice/Crime': {'precision': 0.91, 'recall': 0.7583333333333333, 'f1-score': 0.8272727272727273, 'support': 120}, 'Labor/Employment': {'precision': 0.875, 'recall': 0.4375, 'f1-score': 0.5833333333333334, 'support': 16}, 'Macroeconomics/Economic Regulation': {'precision': 0.7021276595744681, 'recall': 0.6875, 'f1-score': 0.6947368421052632, 'support': 48}, 'Media/Journalism': {'precision': 0.7368421052631579, 'recall': 0.5185185185185185, 'f1-score': 0.6086956521739131, 'support': 27}, 'Others': {'precision': 0.8975609756097561, 'recall': 0.7666666666666667, 'f1-score': 0.8269662921348315, 'support': 240}, 'Religion': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 10}, 'Science/Technology': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 13}, 'War/Terror': {'precision': 0.8826086956521739, 'recall': 0.9441860465116279, 'f1-score': 0.9123595505617977, 'support': 215}, 'micro avg': {'precision': 0.8424581005586592, 'recall': 0.7215311004784689, 'f1-score': 0.777319587628866, 'support': 1045}, 'macro avg': {'precision': 0.7458056297898514, 'recall': 0.502766828026241, 'f1-score': 0.5707695025891603, 'support': 1045}, 'weighted avg': {'precision': 0.8293895778652883, 'recall': 0.7215311004784689, 'f1-score': 0.7602263978913698, 'support': 1045}, 'samples avg': {'precision': 0.8133333333333332, 'recall': 0.7571875, 'f1-score': 0.7708214285714285, 'support': 1045}}]

In [32]:
pd.DataFrame(generic_valid_results[1]).transpose()

Unnamed: 0,precision,recall,f1-score,support
Conspiracy Theory,0.694444,0.416667,0.520833,60.0
Education,0.833333,0.625,0.714286,16.0
Election Campaign,0.861111,0.885714,0.873239,35.0
Environment,0.875,0.5,0.636364,14.0
Government/Public,0.756554,0.827869,0.790607,244.0
Health,0.804878,0.804878,0.804878,41.0
Immigration/Integration,0.693878,0.755556,0.723404,45.0
Justice/Crime,0.756303,0.818182,0.786026,110.0
Labor/Employment,0.8125,0.8125,0.8125,16.0
Macroeconomics/Economic Regulation,0.666667,0.72,0.692308,50.0


In [34]:
import pandas as pd

# Initialize dictionaries to store metrics
precision_dict = {label: [] for label in pd.DataFrame(generic_valid_results[0]).transpose().index[:-4]}
recall_dict = {label: [] for label in pd.DataFrame(generic_valid_results[0]).transpose().index[:-4]}
f1_dict = {label: [] for label in pd.DataFrame(generic_valid_results[0]).transpose().index[:-4]}
support_dict = {label: [] for label in pd.DataFrame(generic_valid_results[0]).transpose().index[:-4]}

# Loop through the results and add metrics to lists
for i in range(0, 5):
    report_df = pd.DataFrame(generic_valid_results[i]).transpose()
    for label in report_df.index[:-4]:  # Exclude averages
        precision_dict[label].append(report_df.loc[label, 'precision'])
        recall_dict[label].append(report_df.loc[label, 'recall'])
        f1_dict[label].append(report_df.loc[label, 'f1-score'])
        support_dict[label].append(report_df.loc[label, 'support'])
        print(label, report_df.loc[label, 'support'])

# Create DataFrames from the collected metrics
precision_df = pd.DataFrame(precision_dict)
recall_df = pd.DataFrame(recall_dict)
f1_df = pd.DataFrame(f1_dict)
support_df = pd.DataFrame(support_dict)

# Create correlation matrices
precision_corr = precision_df.corrwith(support_df)
recall_corr = recall_df.corrwith(support_df)
f1_corr = f1_df.corrwith(support_df)

# Print correlation matrices
print('Precision correlation with support:')
print(precision_corr)
print('\nRecall correlation with support:')
print(recall_corr)
print('\nF1-score correlation with support:')
print(f1_corr)

Conspiracy Theory 47.0
Education 12.0
Election Campaign 26.0
Environment 14.0
Government/Public 252.0
Health 45.0
Immigration/Integration 40.0
Justice/Crime 107.0
Labor/Employment 19.0
Macroeconomics/Economic Regulation 58.0
Media/Journalism 38.0
Others 205.0
Religion 22.0
Science/Technology 10.0
War/Terror 185.0
Conspiracy Theory 60.0
Education 16.0
Election Campaign 35.0
Environment 14.0
Government/Public 244.0
Health 41.0
Immigration/Integration 45.0
Justice/Crime 110.0
Labor/Employment 16.0
Macroeconomics/Economic Regulation 50.0
Media/Journalism 32.0
Others 231.0
Religion 14.0
Science/Technology 14.0
War/Terror 178.0
Conspiracy Theory 44.0
Education 11.0
Election Campaign 25.0
Environment 11.0
Government/Public 248.0
Health 40.0
Immigration/Integration 29.0
Justice/Crime 115.0
Labor/Employment 20.0
Macroeconomics/Economic Regulation 48.0
Media/Journalism 45.0
Others 211.0
Religion 10.0
Science/Technology 11.0
War/Terror 200.0
Conspiracy Theory 48.0
Education 10.0
Election Campaign

In [30]:
import pandas as pd

dataframes = {
    "generic_val_average_report": generic_val_average_report_df,
    "generic_test_average_report": generic_test_average_report_df,
    "GRU_202012_val_average_report": GRU_202012_val_average_report_df,
    "GRU_202012_test_average_report": GRU_202012_test_average_report_df,
    "IRA_202012_val_average_report": IRA_202012_val_average_report_df,
    "IRA_202012_test_average_report": IRA_202012_test_average_report_df,
    "REA_0621_val_average_report": REA_0621_val_average_report_df,
    "REA_0621_test_average_report": REA_0621_test_average_report_df,
    "UGANDA_0621_val_average_report": UGANDA_0621_val_average_report_df,
    "UGANDA_0621_test_average_report": UGANDA_0621_test_average_report_df,
    "VENEZUELA_201901_2_val_average_report": VENEZUELA_201901_2_val_average_report_df,
    "VENEZUELA_201901_2_test_average_report": VENEZUELA_201901_2_test_average_report_df,
}

for name, df in dataframes.items():
    csv_filename = f"../reports/{name}_weight_decay.csv"
    df.to_csv(csv_filename, index=False)

# Macro Averages:

In [37]:
def extract_macro_avg_value(df):
    return df[df.index == "macro avg"]["f1-score"].values[0]

summary_data = []

for name, df in dataframes.items():
    micro_avg_value = round(extract_macro_avg_value(df), 2)
    train_data = name.split("_")[0]

    if "val" in name:
        validation_value = micro_avg_value
        test_value = None
    elif "test" in name:
        validation_value = None
        test_value = micro_avg_value

    test_data = train_data
    if train_data != "generic":
        train_data = "All but " + train_data

    summary_data.append({
        "Train Data": train_data,
        "Test Data": test_data,
        "Validation": validation_value,
        "Test": test_value,
    })

# Combine rows with the same "Train Data" and "Test Data" into one
macro_summary_df = pd.DataFrame(summary_data)
macro_summary_df = macro_summary_df.groupby(["Train Data", "Test Data"], as_index=False).first()

# Reorder columns
macro_summary_df = macro_summary_df[["Train Data", "Test Data", "Validation", "Test"]]
macro_summary_df = macro_summary_df.reindex([macro_summary_df.index[-1]] + list(macro_summary_df.index[:-1]))
macro_summary_df = macro_summary_df.reset_index(drop=True)

print(macro_summary_df)

          Train Data  Test Data  Validation  Test
0            generic    generic        0.69  0.68
1        All but GRU        GRU        0.68  0.42
2        All but IRA        IRA        0.70  0.56
3        All but REA        REA        0.66  0.54
4     All but UGANDA     UGANDA        0.69  0.48
5  All but VENEZUELA  VENEZUELA        0.65  0.55


# Micro Averages:

In [32]:
def extract_micro_avg_value(df):
    return df[df.index == "micro avg"]["f1-score"].values[0]

summary_data = []

for name, df in dataframes.items():
    micro_avg_value = round(extract_micro_avg_value(df), 2)
    train_data = name.split("_")[0]

    if "val" in name:
        validation_value = micro_avg_value
        test_value = None
    elif "test" in name:
        validation_value = None
        test_value = micro_avg_value

    test_data = train_data
    if train_data != "generic":
        train_data = "All but " + train_data

    summary_data.append({
        "Train Data": train_data,
        "Test Data": test_data,
        "Validation": validation_value,
        "Test": test_value,
    })

# Combine rows with the same "Train Data" and "Test Data" into one
micro_summary_df = pd.DataFrame(summary_data)
micro_summary_df = micro_summary_df.groupby(["Train Data", "Test Data"], as_index=False).first()

# Reorder columns
micro_summary_df = micro_summary_df[["Train Data", "Test Data", "Validation", "Test"]]
micro_summary_df = micro_summary_df.reindex([micro_summary_df.index[-1]] + list(micro_summary_df.index[:-1]))
micro_summary_df = micro_summary_df.reset_index(drop=True)

print(micro_summary_df)

          Train Data  Test Data  Validation  Test
0            generic    generic        0.80  0.78
1        All but GRU        GRU        0.78  0.77
2        All but IRA        IRA        0.81  0.63
3        All but REA        REA        0.79  0.72
4     All but UGANDA     UGANDA        0.80  0.76
5  All but VENEZUELA  VENEZUELA        0.79  0.70


In [7]:
def create_latex_table(val_average_report_df, test_average_report_df):
    train_data = "Generic"
    test_data = "Generic"
    
    val_micro_avg = round(val_average_report_df.loc["micro avg", "f1-score"], 2)
    test_micro_avg = round(test_average_report_df.loc["micro avg", "f1-score"], 2)
    
    data = [[train_data, test_data, val_micro_avg, test_micro_avg]]
    
    columns = pd.MultiIndex.from_tuples([
        ("Dataset", "Train Data"),
        ("Dataset", "Test Data"),
        ("BERTweet Large", "Validation"),
        ("BERTweet Large", "Test")
    ])
    
    f1_scores_df = pd.DataFrame(data, columns=columns)
    
    print("Human-readable table:")
    print(f1_scores_df.to_string(index=False))
    print("\n")
    
    latex_table = f1_scores_df.to_latex(index=False, bold_rows=True, multicolumn=True, multicolumn_format='c', decimal=',', column_format='|l|l|l|l|', header=True, escape=False)

    latex_table = latex_table.replace('\\toprule', '')
    latex_table = latex_table.replace('\\midrule', '')
    latex_table = latex_table.replace('\\bottomrule', '')

    # Resize the header and center it
    latex_table = latex_table.replace('{tabular}', '{tabular}{@{}c@{}}')
    latex_table = latex_table.replace('Dataset & BERTweet Large', '\\large{Dataset} & \\large{BERTweet Large}')
    
    latex_table = latex_table.replace("\\begin{tabular}", "\\begin{tabular}{|l|l|l|l|}\n\\hline \\hline")
    latex_table = latex_table.replace("\\end{tabular}", "\\\\ \\hline \\hline\n\\end{tabular}")

    # Add borders between the rows
    latex_table = latex_table.replace('\\\\\n', '\\\\ \\hline\n')

    return latex_table

latex_table = create_latex_table(val_average_report_df, test_average_report_df)
print("LaTeX table:")
print(latex_table)

Human-readable table:
   Dataset           BERTweet Large     
Train Data Test Data     Validation Test
   Generic   Generic            0.8 0.79


LaTeX table:
\begin{tabular}{|l|l|l|l|}
\hline \hline{@{}c@{}}{|l|l|l|l|}

   Dataset & \multicolumn{2}{c}{BERTweet Large} \\ \hline
Train Data & Test Data &     Validation & Test \\ \hline

   Generic &   Generic &            0,8 & 0,79 \\ \hline

\\ \hline \hline
\end{tabular}{@{}c@{}}



  latex_table = f1_scores_df.to_latex(index=False, bold_rows=True, multicolumn=True, multicolumn_format='c', decimal=',', column_format='|l|l|l|l|', header=True, escape=False)
