# Fine-tuning BERT (and friends) for multi-label text classification
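This notebook is adapted from the original tutorial at https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb. Here the three labels (ENTAILMENT, NEUTRAL, CONTRADICTION) are encoded as one boolean column each and fed to the multi-label classification head.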

## Set-up environment

In [1]:
!pip install -U accelerate
!pip install -U transformers

In [6]:
!pip install -q transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Mount Google Drive at /content/drive; later cells read the dataset CSV and
# the evaluation pickle from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import pandas as pd
# Load the FACT-GPT dataset and give the unnamed leading CSV column the
# name 'index' so it can be selected explicitly later on.
raw_csv_path = '/content/drive/MyDrive/FACT-GPT dataset.csv'
df = pd.read_csv(raw_csv_path)
df = df.rename(columns={'Unnamed: 0': 'index'})

In [3]:
# Map each GPT-4 generated-tweet column to the class it was generated for.
# The insertion order also fixes the order of the melted rows.
gpt4_column_to_label = {
    'generated_entail_tweet_gpt-4': 'ENTAILMENT',
    'generated_contradict_tweet_gpt-4': 'CONTRADICTION',
    'generated_neutral_tweet_gpt-4': 'NEUTRAL',
}

syn_df = df[['claim', *gpt4_column_to_label]]

# Go from one row per claim (three tweet columns) to one row per
# (claim, tweet) pair, remembering which column each tweet came from.
syn_df = syn_df.melt(id_vars='claim', var_name='label', value_name='tweet')

# Turn the source column names into class names.
syn_df['label'] = syn_df['label'].replace(gpt4_column_to_label)

# Expose the melted row number as an explicit 'index' column.
syn_df.reset_index(inplace=True)

column_to_evaluate = 'label'

# One boolean indicator column per class.
for class_name in ('ENTAILMENT', 'CONTRADICTION', 'NEUTRAL'):
    syn_df[class_name] = syn_df[column_to_evaluate] == class_name

# The string label is now redundant with the indicator columns.
syn_df.drop(columns=[column_to_evaluate], inplace=True)

# Same column layout as the human-annotated frame built in the next cell.
syn_df = syn_df[['index', 'tweet', 'claim', 'ENTAILMENT', 'NEUTRAL', 'CONTRADICTION']]
syn_df

In [4]:
# Column of `df` holding the annotation used as ground truth for this frame.
column_to_evaluate = 'Mturk_1'

# Work on a copy so adding the indicator columns does not touch `df`.
new_df = df[['index', 'tweet', 'claim', column_to_evaluate]].copy()

# One boolean indicator column per class, derived from the annotation column.
for class_name in ('ENTAILMENT', 'CONTRADICTION', 'NEUTRAL'):
    new_df[class_name] = new_df[column_to_evaluate] == class_name

# The string annotation is now redundant with the indicator columns.
new_df.drop(columns=[column_to_evaluate], inplace=True)

# Reorder columns to the desired order
new_df = new_df[['index', 'tweet', 'claim', 'ENTAILMENT', 'NEUTRAL', 'CONTRADICTION']]
new_df

## Load dataset



In [7]:
from datasets import Dataset
import datasets

In [34]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the synthetic (GPT-4 generated) rows for validation; the
# fixed seed keeps the split reproducible.
VALIDATION_FRACTION = 0.2
SPLIT_SEED = 42
train_df, valid_df = train_test_split(
    syn_df, test_size=VALIDATION_FRACTION, random_state=SPLIT_SEED
)
# The frame labelled from the 'Mturk_1' column is used as the test set as-is.
test_df = new_df

In [35]:
# Give every split a fresh 0..n-1 row index, discarding the old one.
for split_frame in (train_df, valid_df, test_df):
    split_frame.reset_index(drop=True, inplace=True)

In [36]:
# Wrap the three pandas frames into one DatasetDict keyed by split name.
split_frames = {'train': train_df, 'valid': valid_df, 'test': test_df}
dataset = datasets.DatasetDict(
    {split: Dataset.from_pandas(frame) for split, frame in split_frames.items()}
)

In [5]:
# Every feature that is not an identifier or text column is a label column.
non_label_columns = ('index', 'tweet', 'claim')
labels = [name for name in dataset['train'].features.keys() if name not in non_label_columns]
# Two-way mapping between label names and their column positions.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}
labels

## Preprocess data

In [39]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def preprocess_data(examples):
    """Tokenize a batch of tweet/claim pairs and attach their label vectors.

    Args:
        examples: a batch mapping column names to lists; must contain
            "tweet", "claim" and one column per name in the global `labels`.

    Returns:
        The tokenizer encoding (padded/truncated to 512 tokens) with an extra
        "labels" entry: one list of floats per example, ordered like `labels`.
    """
    tweets = examples["tweet"]
    # Join each tweet with its claim through a literal " [SEP] " marker.
    paired_texts = [
        tweet + " [SEP] " + claim
        for tweet, claim in zip(tweets, examples["claim"])
    ]
    encoding = tokenizer(paired_texts, padding="max_length", truncation=True, max_length=512)

    # Build the (batch, num_labels) float matrix column by column; boolean
    # indicator values are stored as 0.0 / 1.0.
    labels_matrix = np.zeros((len(tweets), len(labels)))
    for column, name in enumerate(labels):
        labels_matrix[:, column] = examples[name]

    encoding["labels"] = labels_matrix.tolist()
    return encoding


In [6]:
# Tokenize every split in batches and drop the raw columns, so only the
# values returned by preprocess_data remain.
raw_columns = dataset['train'].column_names
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=raw_columns)

In [7]:
# Inspect one encoded training example: its keys are the tokenizer outputs
# plus the "labels" entry added by preprocess_data.
example = encoded_dataset['train'][1]
print(example.keys())

In [8]:
# Decode the token ids back to text to eyeball the "tweet [SEP] claim" input.
tokenizer.decode(example['input_ids'])

In [9]:
# Label vector of the inspected example (one float per entry of `labels`).
example['labels']

In [10]:
# Names of the labels that are switched on for the inspected example.
[id2label[position] for position, value in enumerate(example['labels']) if value == 1.0]

In [45]:
# From here on, have the dataset return PyTorch tensors.
encoded_dataset.set_format("torch")

## Define model

In [11]:
from transformers import AutoModelForSequenceClassification

# Classification-head configuration: one output per label, with the label
# name mappings stored in the model config. Per the Transformers docs,
# problem_type="multi_label_classification" selects a BCE-with-logits loss.
label_config = {
    "problem_type": "multi_label_classification",
    "num_labels": len(labels),
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", **label_config)

## Train the model

In [20]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, micro-averaged ROC AUC and accuracy.

    Args:
        predictions: raw logits of shape (batch_size, num_labels).
        labels: ground-truth 0/1 indicator matrix with the same shape.
        threshold: sigmoid probability at or above which a label counts as
            predicted.

    Returns:
        dict with the keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # Logits -> independent per-label probabilities.
    probs = torch.sigmoid(torch.Tensor(predictions)).numpy()
    # Hard 0/1 decisions, used by the threshold-dependent metrics.
    y_pred = (probs >= threshold).astype(float)
    y_true = labels

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric: it must be given the continuous scores.
    # Feeding it thresholded 0/1 predictions collapses the curve to a single
    # operating point and misreports the model's ranking quality.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """Adapt an EvalPrediction to multi_label_metrics.

    When `p.predictions` is a tuple only its first element is scored;
    `p.label_ids` is passed through as the ground truth.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

Let's verify a batch as well as a forward pass:

In [21]:
# Check the tensor type of the labels of the first training example.
encoded_dataset['train'][0]['labels'].type()

'torch.FloatTensor'

In [12]:
# Token ids of the first training example. Index the row first so only that
# one example is loaded instead of the whole 'input_ids' column.
encoded_dataset['train'][0]['input_ids']

In [13]:
# Forward pass on a single training example (batch of size 1).
sample = encoded_dataset['train'][0]
input_ids = sample['input_ids'].unsqueeze(0)
# Use the attention mask produced by the tokenizer: inputs are padded to
# max_length, and an all-ones mask would make the model attend to the
# padding positions.
attention_mask = sample['attention_mask'].unsqueeze(0)

outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=sample['labels'].unsqueeze(0))
outputs

Let's start training!

In [24]:
from transformers import TrainingArguments, Trainer

batch_size = 8
metric_name = "f1"

# Evaluate and checkpoint once per epoch; after training, reload the
# checkpoint that scored best on `metric_name`.
training_options = dict(
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_steps=100,
)
# The first positional argument is the output directory.
args = TrainingArguments("bert-finetuned-sem_eval-english", **training_options)

# Train on the synthetic training split and validate on the synthetic
# validation split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [14]:
# Run fine-tuning with the arguments configured above.
trainer.train()

## Evaluate

In [46]:
# Score the test split, then also fetch the raw logits for it.
test_split = encoded_dataset["test"]
test_results = trainer.evaluate(test_split)
prediction_output = trainer.predict(test_split)
predictions, label_ids, metrics = prediction_output

In [47]:
import numpy as np

# Apply a numerically stable softmax to get per-row probabilities:
# subtracting each row's maximum leaves the result unchanged but keeps
# np.exp from overflowing (inf / inf -> NaN) on large logits.
shifted_logits = predictions - np.max(predictions, axis=1, keepdims=True)
exp_logits = np.exp(shifted_logits)
probabilities = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

# Get the class that has the maximum probability
predicted_classes = np.argmax(probabilities, axis=1)

In [48]:
# Translate the predicted class positions back into label names.
y_pred = [id2label[class_id] for class_id in predicted_classes]
predicted_labels = y_pred

In [49]:
import pickle

# Load the pickled random_aggregated_mturks list
# Load the pickled random_aggregated_mturks list.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load this pickle if it comes from a trusted source.
pickle_file_path = '/content/drive/MyDrive/FACT-GPT eval tiebreak.pkl'
with open(pickle_file_path, 'rb') as f:
    random_aggregated_mturks = pickle.load(f)

In [50]:
from sklearn.metrics import confusion_matrix

# Flatten the first 1000 entries of random_aggregated_mturks into one flat
# list of reference labels (presumably one label list per random
# tie-breaking draw; verify against the code that wrote the pickle).
ev = [
    annotation
    for draw in range(1000)
    for annotation in random_aggregated_mturks[draw]
]

In [51]:
from sklearn.metrics import classification_report

# `ev` concatenates 1000 reference label lists, so the single list of model
# predictions is repeated 1000 times to line up with it element by element.
# The display names are given in alphabetical order.
class_names = ['CONTRADICTION', 'ENTAILMENT', 'NEUTRAL']
report = classification_report(ev, y_pred * 1000, target_names=class_names)

print(report)

               precision    recall  f1-score   support

CONTRADICTION       0.13      0.52      0.21    100502
   ENTAILMENT       0.65      0.65      0.65    668891
      NEUTRAL       0.59      0.20      0.30    455607

     accuracy                           0.47   1225000
    macro avg       0.46      0.46      0.39   1225000
 weighted avg       0.59      0.47      0.48   1225000

