<a href="https://colab.research.google.com/github/Doris-QZ/spooky_author_identification/blob/main/3_BERT_Spooky_Author_Identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Introduction

This is the third deep learning model for the 'Spooky Author Identification' project. In this notebook, I will directly load the data from my Google Drive to fine-tune the **BERT model**. For the EDA section, please check the notebook: [1_LSTM_Spooky_Author_Identification.ipynb](https://github.com/Doris-QZ/spooky_author_identification/blob/main/1_LSTM_Spooky_Author_Identification.ipynb).

In [19]:
from google.colab import drive

# Mount Google Drive; the data and output paths used in this notebook live under this mount point
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# Load Important packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import re

# Modeling
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, classification_report, accuracy_score, f1_score
from torch.optim import AdamW

In [21]:
# Load the data: read the training and test CSV files (in that order) from Google Drive
train_csv_path = '/content/drive/MyDrive/ColabNotebooks/Spooky_Author_Identification/train.csv'
test_csv_path = '/content/drive/MyDrive/ColabNotebooks/Spooky_Author_Identification/test.csv'
train, test = (pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))

### BERT

In [22]:
# Hold out 20% of the training data for validation.
# The split is stratified on the encoded author label and uses a fixed random_state.
author_labels = train['author_encoded']
training_set, validation_set = train_test_split(
    train,
    test_size = 0.2,
    stratify = author_labels,
    random_state = 1,
)

**BERT model with last encoder layer and pooler layer unfrozen**

I will first fine-tune a BertForSequenceClassification model with the last encoder layer and the pooler layer of BERT unfrozen.

In [23]:
# Load the tokenizer and the 3-class sequence-classification model from the same pretrained checkpoint
pretrained_name = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(pretrained_name)
bert_model = BertForSequenceClassification.from_pretrained(pretrained_name, num_labels = 3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Take a look at the architecture of bert_model
# (a bare expression as the last line of the cell displays the module's repr)
bert_model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [25]:
# Use GPU if available, otherwise fall back to CPU.
# Hard-coding 'cuda' would raise on a runtime without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model = bert_model.to(device)

In [26]:
# Freeze every parameter of the base model
bert_model.base_model.requires_grad_(False)

# Unfreeze the last encoder layer and the pooler layer
for trainable_module in (bert_model.base_model.encoder.layer[-1], bert_model.base_model.pooler):
  trainable_module.requires_grad_(True)

In [27]:
# Collect (element count, requires_grad) for every parameter once, then total them up
param_sizes = [(p.numel(), p.requires_grad) for p in bert_model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, is_trainable in param_sizes if is_trainable)

print(f'Total parameters: {total_params:,}')
print(f'Trainable parameters: {trainable_params:,}')

Total parameters: 109,484,547
Trainable parameters: 7,680,771


In [28]:
# Check the length of text data, measured as the number of whitespace-separated words per text
words_per_text = training_set['text'].str.split()
text_length = words_per_text.str.len()
print(text_length.describe())

count    15663.000000
mean        26.697951
std         18.102614
min          2.000000
25%         15.000000
50%         23.000000
75%         34.000000
max        594.000000
Name: text, dtype: float64


In [29]:
# Fraction of training texts with more than 64 words
text_length.gt(64).sum() / len(training_set)

np.float64(0.033263104130754007)

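About 3% of the training texts have more than 64 words, so I will set the `max_length` of the bert_tokenizer to 64. Note that `max_length` counts WordPiece tokens (including the special `[CLS]` and `[SEP]` tokens) rather than words, so the share of texts that actually get truncated will be somewhat higher than 3%.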

In [30]:
# Tokenize text data
# The training and validation texts are tokenized with exactly the same settings,
# so the options are declared once and reused for both calls.
tokenizer_options = dict(
    padding = True,
    truncation = True,
    add_special_tokens = True,
    max_length = 64,
    return_tensors = 'pt',
)

train_tokenized = bert_tokenizer(training_set['text'].tolist(), **tokenizer_options)
val_tokenized = bert_tokenizer(validation_set['text'].tolist(), **tokenizer_options)

In [37]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
  """Map-style dataset over tokenizer output, with optional labels.

  Args:
    tokenized: mapping from field name (e.g. 'input_ids') to an indexable
      collection holding one entry per example.
    labels: optional indexable collection of labels, one per example.
      Leave as None for unlabeled data such as the test set.
  """

  def __init__(self, tokenized, labels = None):
    self.tokenized = tokenized
    self.labels = labels

  def __getitem__(self, idx):
    """Return the fields of example `idx`, plus a 'labels' tensor when labels were given."""
    item = {key: value[idx] for key, value in self.tokenized.items()}
    # Compare against None explicitly: a truthiness test raises for NumPy arrays and
    # tensors with more than one element, and silently drops the labels of an empty list.
    if self.labels is not None:
      item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    """Number of examples, taken from the length of the 'input_ids' field."""
    return len(self.tokenized['input_ids'])


In [38]:
# Pair each tokenized split with its encoded author labels
train_labels = training_set['author_encoded'].tolist()
val_labels = validation_set['author_encoded'].tolist()

train_dataset = Dataset(train_tokenized, train_labels)
val_dataset = Dataset(val_tokenized, val_labels)

In [39]:
# Inspect the first training example to check the fields produced by Dataset.__getitem__
train_dataset[0]

{'input_ids': tensor([  101, 10930, 11563,  2024, 14195,  2368,  1010,  2980, 18884,  1010,
         26679,  6392, 24546,  1010, 10514,  2906,  8159,  1010,  1998, 10131,
         18884,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [40]:
# Define metrics
def compute_metrics(eval_pred):
  y_pred, y_true = eval_pred
  y_pred = np.argmax(y_pred, axis = 1)
  accuracy = accuracy_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred, average = 'macro')
  return {'accuracy': accuracy, 'f1_score': f1}

In [41]:
# Define trainer
args = TrainingArguments(
    output_dir = '/content/drive/MyDrive/ColabNotebooks/Spooky_Author_Identification/bert_model',
    num_train_epochs = 20,
    learning_rate = 3e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    eval_strategy = 'epoch',
    logging_strategy = 'epoch',
    save_strategy = 'epoch',
    save_total_limit = 1,
    load_best_model_at_end = True,
    metric_for_best_model = 'eval_loss',
    report_to = "none"
)

trainer = Trainer(
    model = bert_model,
    args = args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [42]:
# Fine-tune the model with the trainer defined above
# (per-epoch evaluation and checkpointing, early stopping with a patience of 3)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Score
1,0.4797,0.483513,0.800306,0.800427
2,0.4415,0.461761,0.814607,0.814128
3,0.3859,0.474409,0.821246,0.821865
4,0.3387,0.454556,0.829673,0.830116
5,0.2902,0.478576,0.829162,0.82946
6,0.2525,0.499073,0.829418,0.829714
7,0.2248,0.504932,0.830184,0.829807


TrainOutput(global_step=6853, training_loss=0.34477200265280716, metrics={'train_runtime': 555.8315, 'train_samples_per_second': 563.588, 'train_steps_per_second': 35.227, 'total_flos': 3606002279139456.0, 'train_loss': 0.34477200265280716, 'epoch': 7.0})

In [43]:
# Evaluate on val_dataset (the trainer's eval_dataset).
# NOTE(review): load_best_model_at_end = True was set in args, so this presumably scores the
# restored best checkpoint rather than the last epoch — confirm against the training log.
trainer.evaluate()

{'eval_loss': 0.4545561373233795,
 'eval_accuracy': 0.8296731358529111,
 'eval_f1_score': 0.8301157447653735,
 'eval_runtime': 13.4984,
 'eval_samples_per_second': 290.109,
 'eval_steps_per_second': 18.15,
 'epoch': 7.0}

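The best validation loss is 0.45, with a validation accuracy of 0.83 and an F1 score of 0.83, achieved at epoch 4. (The `'epoch': 7.0` in the evaluation output is the epoch at which early stopping ended training; because `load_best_model_at_end = True`, the evaluated model is the epoch-4 checkpoint, whose loss of 0.4546 matches the training log.) As we can see from the training log, the training loss decreases continually over the 7 epochs, while the validation loss fluctuates during the first few epochs, reaches its minimum at epoch 4, and then rises steadily through epoch 7, indicating that the model is overfitting.  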

I also tried fine-tuning the model with only the pooler layer unfrozen, but it didn’t reduce overfitting. Instead, it lowered the performance, with a validation accuracy of around 0.77. Therefore, I decided to stick with the first model for making predictions on the test set.

In [44]:
# Prepare test dataset
test_tokenized = bert_tokenizer(test['text'].tolist(),
                                 padding = True,
                                 truncation = True,
                                 add_special_tokens = True,
                                 max_length = 64,
                                 return_tensors = 'pt')

test_dataset = Dataset(test_tokenized)

In [46]:
# load the first model
# output_dir = '/content/drive/MyDrive/ColabNotebooks/Spooky_Author_Identification/bert_model/checkpoint-9790'
# bert_model = BertForSequenceClassification.from_pretrained(output_dir)
# bert_model = bert_model.to('cuda')

In [47]:
# Define dummy training arguments
args = TrainingArguments(
    output_dir = '/content/drive/MyDrive/ColabNotebooks/Spooky_Author_Identification/bert_model/results',
    per_device_eval_batch_size = 16,
    report_to = "none"
)

# Create the trainer
trainer = Trainer(
    model = bert_model,
    args = args
)


In [48]:
# Print the classification report on the validation set
predictions = trainer.predict(val_dataset)
y_pred = np.argmax(predictions.predictions, axis = 1)
y_true = np.array(validation_set['author_encoded'])
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.79      0.82      1580
           1       0.79      0.88      0.83      1209
           2       0.83      0.83      0.83      1127

    accuracy                           0.83      3916
   macro avg       0.83      0.83      0.83      3916
weighted avg       0.83      0.83      0.83      3916



In [49]:
# Make prediction on the test set
predictions = trainer.predict(test_dataset)

# Extract the logits from the prediction object
logits = predictions.predictions

# Convert logits to probability
probabilities = torch.softmax(torch.tensor(logits), dim = -1).numpy()

In [50]:
# Name the probability columns in class-index order.
# NOTE(review): assumes author_encoded maps 0 -> EAP, 1 -> MWS, 2 -> HPL — verify against the encoding step.
class_columns = ['EAP', 'MWS', 'HPL']
class_probabilities = pd.DataFrame(probabilities, columns = class_columns)

# Attach the test ids, then put the columns in the order id, EAP, HPL, MWS
bert_prediction = pd.concat([test['id'], class_probabilities], axis = 1)
bert_prediction = bert_prediction.loc[:, ['id', 'EAP', 'HPL', 'MWS']]
bert_prediction.head()

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.003493,0.000267,0.99624
1,id24541,0.998931,0.000734,0.000335
2,id00134,0.000129,0.999812,5.9e-05
3,id27757,0.825461,0.171772,0.002767
4,id04081,0.705251,0.218576,0.076173


In [51]:
# Write the submission file to Google Drive, without the DataFrame index
submission_path = '/content/drive/MyDrive/ColabNotebooks/Spooky_Author_Identification/bert_model/bert_prediction.csv'
bert_prediction.to_csv(submission_path, index = False)

After submitting to Kaggle, I got a public log-loss score of 0.61 and a private score of 0.57.