# Fine-tuning BERT (and friends) for multi-label text classification
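This notebook is adapted from the original tutorial at https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb. Here it fine-tunes `bert-base-uncased` on tweet–claim pairs from the FACT-GPT dataset, with the labels ENTAILMENT, NEUTRAL and CONTRADICTION.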

## Set-up environment

In [40]:
# !pip install -U accelerate
# !pip install -U transformers
!pip install -q transformers datasets


[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [35]:
import pandas as pd
import numpy as np
# Load the FACT-GPT data frame from a pickle file located relative to the current working directory.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
df = pd.read_pickle('FACT-GPT dataset.pickle')

In [36]:
# GPT-4 generated-tweet columns of `df`, keyed to the stance label each one stands for
gpt4_column_to_stance = {
    'generated_entail_tweet_gpt-4': 'ENTAILMENT',
    'generated_contradict_tweet_gpt-4': 'CONTRADICTION',
    'generated_neutral_tweet_gpt-4': 'NEUTRAL',
}

column_to_evaluate = 'label'

# Wide -> long: one row per (claim, generated tweet). The source column name becomes the
# label, is translated to its stance name, and reset_index() exposes the row number as an
# explicit 'index' column.
syn_df = (
    df[['claim', *gpt4_column_to_stance]]
    .melt(id_vars='claim', var_name=column_to_evaluate, value_name='tweet')
    .assign(**{column_to_evaluate: lambda long_df: long_df[column_to_evaluate].replace(gpt4_column_to_stance)})
    .reset_index()
)

# One boolean indicator column per stance
for stance in ('ENTAILMENT', 'CONTRADICTION', 'NEUTRAL'):
    syn_df[stance] = syn_df[column_to_evaluate] == stance

# Keep only the columns used downstream, in the desired order
# (this selection also leaves out the string label column)
syn_df = syn_df[['index', 'tweet', 'claim', 'ENTAILMENT', 'NEUTRAL', 'CONTRADICTION']]
syn_df

Unnamed: 0,index,tweet,claim,ENTAILMENT,NEUTRAL,CONTRADICTION
0,0,Reports coming in that 108 FIFA-registered pla...,Suggests 108 FIFA registered players/coaches h...,True,False,False
1,1,"Event 201, a pandemic simulation, occurred in ...","The COVID-19 pandemic was planned, and a 2019 ...",True,False,False
2,2,"According to the WHO, the infection fatality r...",Even the WHO has conceded that the (SARS- CoV-...,True,False,False
3,3,Heads up! Walgreens appears to be checking our...,Walgreens refrigerators are scanning shoppers’...,True,False,False
4,4,Biden's CDC Director confesses surprising news...,Biden's CDC Director ADMITS Her Own Employees ...,True,False,False
...,...,...,...,...,...,...
3670,3670,Dr. Fauci's expertise in infectious diseases i...,Dr. Fauci disagreed with President Trump's dec...,False,True,False
3671,3671,"Diving into the depths of timeless literature,...","Poem about self isolation written in 1869, rep...",False,True,False
3672,3672,Pondering on the various perspectives about th...,“It has been transpiring that the current pand...,False,True,False
3673,3673,Exploring the fascinating world of technology ...,Eugenicist Bill Gates co-hosted a “high-level ...,False,True,False


In [38]:
# To evaluate the performance on the original data, the tweet_id values first have to be
# converted into actual tweets, as mentioned in https://github.com/echen102/COVID-19-TweetIDs
# 'FACT-GPT eval tiebreak.pkl' provides the test set labels, repeated 1,000 times with random tiebreaks

column_to_evaluate = 'Mturk_1'

# Take the text pair plus the chosen annotation column; reset_index() adds an explicit 'index' column
new_df = df[['tweet', 'claim', column_to_evaluate]].reset_index()

# One boolean indicator column per stance, derived from the annotation column
for stance in ('ENTAILMENT', 'CONTRADICTION', 'NEUTRAL'):
    new_df[stance] = new_df[column_to_evaluate].eq(stance)

# Final column order; the raw annotation column is left out by this selection
new_df = new_df[['index', 'tweet', 'claim', 'ENTAILMENT', 'NEUTRAL', 'CONTRADICTION']]
new_df

Unnamed: 0,index,tweet,claim,ENTAILMENT,NEUTRAL,CONTRADICTION
0,0,108 players registered with FIFA have died on ...,Suggests 108 FIFA registered players/coaches h...,False,True,False
1,1,Look up Event 201. A pandemic simulation that...,"The COVID-19 pandemic was planned, and a 2019 ...",False,True,False
2,2,"SARS-CoV-2 is a dangerous virus, but it has no...",Even the WHO has conceded that the (SARS- CoV-...,False,True,False
3,3,"In Walgreens line Cashier wearing No, we dont...",Walgreens refrigerators are scanning shoppers’...,True,False,False
4,4,"Rochelle Walensky, head of CDC was asked by re...",Biden's CDC Director ADMITS Her Own Employees ...,True,False,False
...,...,...,...,...,...,...
1220,1220,While decrying the fact that China muzzled hea...,Dr. Fauci disagreed with President Trump's dec...,False,True,False
1221,1221,This morning we would like to inspire you with...,"Poem about self isolation written in 1869, rep...",True,False,False
1222,1222,"@maestropetals On this Covid 19, there is an o...",“It has been transpiring that the current pand...,True,False,False
1223,1223,Coronavirus was funded by Bill Gates.,Eugenicist Bill Gates co-hosted a “high-level ...,True,False,False


## Load dataset



In [41]:
from datasets import Dataset
import datasets

In [42]:
from sklearn.model_selection import train_test_split

# Split the synthetic data into 80% training, 20% validation
# (random_state=42 pins the shuffle so the split is the same on every run)
train_df, valid_df = train_test_split(syn_df, test_size=0.2, random_state=42)
# The test split is the frame built from the 'Mturk_1' column. NOTE: this is an alias, not a
# copy, so any in-place change made through `test_df` also changes `new_df`.
test_df = new_df

In [43]:
# Give every split a fresh default index in place, discarding the previous one
for split_df in (train_df, valid_df, test_df):
    split_df.reset_index(drop=True, inplace=True)

In [44]:
# Wrap each pandas split in a `Dataset` and bundle them under their split names
split_frames = {'train': train_df, 'valid': valid_df, 'test': test_df}
dataset = datasets.DatasetDict({split_name: Dataset.from_pandas(frame)
                                for split_name, frame in split_frames.items()})

In [45]:
# Every column that is not an identifier or text column is treated as a label column
non_label_columns = ('index', 'tweet', 'claim')
labels = [name for name in dataset['train'].features.keys() if name not in non_label_columns]
# Two-way lookup between integer class ids and label names
id2label = dict(enumerate(labels))
label2id = {name: class_id for class_id, name in id2label.items()}
labels

['ENTAILMENT', 'NEUTRAL', 'CONTRADICTION']

## Preprocess data

In [46]:
from transformers import AutoTokenizer
import numpy as np

# Load the tokenizer belonging to the "bert-base-uncased" checkpoint (the same checkpoint the model is loaded from)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
    """Tokenize a batch of (tweet, claim) pairs and attach their multi-hot label vectors.

    Relies on the module-level `tokenizer` and `labels`.

    Args:
        examples: Batch mapping of column name -> list of values. Must contain
            "tweet", "claim" and one column per name in `labels`.

    Returns:
        The tokenizer's encoding for the batch with an extra "labels" entry: one
        list of floats per example, ordered like `labels`.
    """
    tweets = examples["tweet"]
    claims = examples["claim"]
    # Pass tweet and claim as a sentence pair instead of gluing them together with a literal
    # " [SEP] ". The tokenizer inserts the special tokens itself, marks the two segments in
    # token_type_ids, and truncates the pair jointly rather than only chopping off the tail
    # (which could remove the claim entirely for long tweets).
    encoding = tokenizer(tweets, claims, padding="max_length", truncation=True, max_length=512)
    # Build a (batch, num_labels) float matrix with one column per label, in `labels` order
    labels_matrix = np.zeros((len(tweets), len(labels)))
    for column, label in enumerate(labels):
        labels_matrix[:, column] = examples[label]
    encoding["labels"] = labels_matrix.tolist()
    return encoding


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [47]:
# Run `preprocess_data` over every split in batches; the original columns are removed so that
# only the fields returned by `preprocess_data` remain
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/2940 [00:00<?, ? examples/s]

Map:   0%|          | 0/735 [00:00<?, ? examples/s]

Map:   0%|          | 0/1225 [00:00<?, ? examples/s]

In [48]:
# Pick one encoded training example and list the fields it carries
example = encoded_dataset['train'][1]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [49]:
# Decode the token ids back to text to check how the tweet and the claim were joined and padded
tokenizer.decode(example['input_ids'])

'[CLS] unfolding events in austria paint a tense picture ; conversations surrounding public health, individual freedom, and the nuremberg code continue to amplify across the eu. # austrialockdown # eudiscussions [SEP] shocking : in the wake of austria ’ s drastic lockdown of unvaccinated people, eu chief calls for throwing out nuremberg code. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [50]:
# Label vector of this example: one float per entry of `labels`, in the same order
example['labels']

[0.0, 1.0, 0.0]

In [51]:
# Names of the labels that are switched on (value 1.0) for this example
[id2label[position] for position, flag in enumerate(example['labels']) if flag == 1.0]

['NEUTRAL']

In [52]:
# Switch the dataset's output format to "torch" so that indexing it returns PyTorch tensors
encoded_dataset.set_format("torch")

## Define model

In [53]:
from transformers import AutoModelForSequenceClassification

# Load "bert-base-uncased" with a sequence-classification head sized to `labels`.
# problem_type="multi_label_classification" selects a per-label binary cross-entropy loss
# (visible as BinaryCrossEntropyWithLogits in the forward-pass output further down).
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train the model

In [54]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, micro-averaged ROC AUC and accuracy from raw logits.

    Args:
        predictions: Array-like of raw logits of shape (batch_size, num_labels).
        labels: Binary indicator array of the true labels, same shape as `predictions`.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with the keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # Apply the sigmoid to every logit independently to obtain per-label probabilities
    probs = torch.sigmoid(torch.as_tensor(predictions, dtype=torch.float32)).numpy()
    # Threshold the probabilities into hard 0/1 predictions for F1 and accuracy
    y_pred = (probs >= threshold).astype(float)
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric and must be scored on the probabilities: feeding it the
    # thresholded 0/1 predictions collapses the ROC curve to a single operating point.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)
    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """Metric callback: score an `EvalPrediction` with `multi_label_metrics`.

    If `p.predictions` is a tuple, only its first element is scored.
    Returns the metrics dict produced by `multi_label_metrics`.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]
    return multi_label_metrics(predictions=raw_predictions, labels=p.label_ids)

Let's verify a batch as well as a forward pass:

In [55]:
# Confirm that the labels of the first training example come back as a float tensor
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [56]:
# Show the token ids of the first training example (padded with 0s up to the maximum length)
encoded_dataset['train']['input_ids'][0]

tensor([  101,  2522, 17258,  1011,  2539,  3793,  4471, 11727,  1999,  7387,
         2031,  2584,  1037, 26233,  1015,  4551, 22525,  2050,   999,  1001,
         2522, 17258, 16147, 25518, 11610,   102,  7387,  2985,  1015,  4551,
        22525,  2050,  2006,  2522, 17258,  1011,  2539,  3793,  7696,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [57]:
#forward pass
# Take the first training example and add a batch dimension to each field
sample = encoded_dataset['train'][0]
input_ids = sample['input_ids'].unsqueeze(0)
# Use the attention mask produced by the tokenizer: the inputs are padded to the maximum
# length, and an all-ones mask would make the model attend to the [PAD] positions as well.
attention_mask = sample['attention_mask'].unsqueeze(0)
token_type_ids = sample['token_type_ids'].unsqueeze(0)

outputs = model(input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=sample['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput(loss=tensor(0.6363, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.0709,  0.0464, -0.3488]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

Let's start training!

In [58]:
from transformers import TrainingArguments, Trainer

batch_size = 8
metric_name = "f1"

# Evaluate and save a checkpoint once per epoch, so that the checkpoint with the best
# `metric_name` can be reloaded when training finishes.
args = TrainingArguments(
    output_dir="bert-finetuned-sem_eval-english",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_steps=100,
)

# Train on the synthetic training split and validate on the synthetic validation split
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [59]:
# Run fine-tuning with the arguments configured above.
# NOTE(review): the saved output of this cell shows a KeyboardInterrupt, i.e. this run was stopped early.
trainer.train()

Epoch,Training Loss,Validation Loss



KeyboardInterrupt



## Evaluate

In [46]:
# Compute the metrics on the test split ...
test_results = trainer.evaluate(encoded_dataset["test"])
# ... and also fetch the raw model outputs (logits), the label ids and the metrics for it.
# NOTE(review): both calls go over the same test split.
predictions, label_ids, metrics = trainer.predict(encoded_dataset["test"])

In [47]:
import numpy as np

# Apply a numerically stable softmax to get probabilities: subtracting each row's largest
# logit before exponentiating leaves the softmax mathematically unchanged, but keeps np.exp
# from overflowing to inf (which would turn the row into NaN).
shifted_logits = predictions - np.max(predictions, axis=1, keepdims=True)
exp_logits = np.exp(shifted_logits)
probabilities = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

# Get the class that has the maximum probability
predicted_classes = np.argmax(probabilities, axis=1)

In [48]:
# Translate every predicted class id back to its label name; both names refer to the same list
y_pred = predicted_labels = [id2label[class_id] for class_id in predicted_classes]

In [None]:
# Imported here because this cell comes before the cell that imports pickle; without it,
# running the notebook top to bottom on a fresh kernel raises NameError.
import pickle

# Save the predicted labels to disk so they can be reloaded without re-running the model
with open("y_pred.pkl", 'wb') as f:
    pickle.dump(y_pred, f)

In [60]:
import pickle

# Load the predicted labels from 'y_pred.pkl' (this replaces any `y_pred` already in memory).
# NOTE(review): pickle.load can execute arbitrary code — only open pickle files you trust.
with open('y_pred.pkl', 'rb') as f:
    y_pred = pickle.load(f)
    
# Load the pickled random_aggregated_mturks list: the test set labels, repeated 1,000 times
# with random tiebreaks
with open('FACT-GPT eval tiebreak.pkl', 'rb') as f:
    random_aggregated_mturks = pickle.load(f)

In [61]:
from sklearn.metrics import confusion_matrix

# Flatten the first 1000 entries of `random_aggregated_mturks` into one long list of reference labels
ev = [reference_label for i in range(1000) for reference_label in random_aggregated_mturks[i]]

In [62]:
from sklearn.metrics import classification_report

# `ev` holds the reference labels for 1000 repetitions of the test set, so the predicted
# labels are tiled 1000 times to line up with it.
# `labels` is passed explicitly so each report row is tied to its class by name rather than
# relying on the implicit sorted order of the labels found in the data.
class_names = ['CONTRADICTION', 'ENTAILMENT', 'NEUTRAL']
report = classification_report(ev, y_pred * 1000, labels=class_names, target_names=class_names)

print(report)

               precision    recall  f1-score   support

CONTRADICTION       0.13      0.47      0.21    100502
   ENTAILMENT       0.67      0.66      0.66    668891
      NEUTRAL       0.51      0.24      0.33    455607

     accuracy                           0.49   1225000
    macro avg       0.44      0.46      0.40   1225000
 weighted avg       0.57      0.49      0.50   1225000

