# Danial Ebrazeh - SCU Student

In [14]:
import pandas as pd
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tqdm.auto import tqdm

try:
    BASE_DIR = Path(__file__).parent
except NameError:
    BASE_DIR = Path().resolve()

print("Base directory:", BASE_DIR)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Base directory: C:\Users\13811\Desktop\Uni\Data mining and Machine learning\Data_Maining_Spooky_Author_40173126_Final
Using device: cuda


In [15]:
train_df = pd.read_csv(BASE_DIR / "train.csv")
val_df   = pd.read_csv(BASE_DIR / "val.csv")
test_df  = pd.read_csv(BASE_DIR / "test.csv")

le = LabelEncoder()
train_labels = le.fit_transform(train_df["author"])
val_labels = le.transform(val_df["author"])
test_labels = le.transform(test_df["author"])

print("Author classes:", le.classes_)


Author classes: ['EAP' 'HPL' 'MWS']


In [16]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def encode_texts(texts, max_len=256):

    return tokenizer(
        texts.tolist(),
        padding="max_length",
        truncation=True,
        max_length=max_len,
        return_tensors="pt"
    )


In [17]:
class AuthorDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        encodings = encode_texts(texts)
        self.input_ids = encodings["input_ids"]
        self.attention_mask = encodings["attention_mask"]
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx]
        }

train_ds = AuthorDataset(train_df["text"], train_labels, tokenizer)
val_ds = AuthorDataset(val_df["text"], val_labels, tokenizer)
test_ds = AuthorDataset(test_df["text"], test_labels, tokenizer)

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=16)
test_loader = DataLoader(test_ds, batch_size=16)


In [18]:
import torch.nn as nn

class BertClassifier(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.5)
        self.hidden1 = nn.Linear(self.bert.config.hidden_size, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.hidden2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.hidden3 = nn.Linear(256, 128)
        self.output = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        x = outputs.pooler_output
        x = self.dropout(self.relu(self.bn1(self.hidden1(x))))
        x = self.dropout(self.relu(self.bn2(self.hidden2(x))))
        x = self.output(self.hidden3(x))
        return x

model = BertClassifier(num_classes=len(le.classes_)).to(device)
model.bert.requires_grad_(True)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [19]:
def train_epoch(model, loader, optimizer, criterion, device):

    model.train()
    losses = []
    for batch in tqdm(loader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return sum(losses) / len(losses)

def eval_model(model, loader, device, le):

    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            logits = model(input_ids, attention_mask)
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            trues.extend(labels.cpu().numpy())
    acc = accuracy_score(trues, preds)
    report = classification_report(trues, preds, target_names=le.classes_)
    return acc, report


In [20]:
optimizer = AdamW([
    {"params": model.bert.parameters(), "lr": 2e-5},
    {"params": model.hidden1.parameters(), "lr": 1e-3},
    {"params": model.hidden2.parameters(), "lr": 1e-3},
    {"params": model.hidden3.parameters(), "lr": 1e-3},
    {"params": model.output.parameters(), "lr": 1e-3},
])


In [21]:
from torch.nn import CrossEntropyLoss
criterion = CrossEntropyLoss()

In [9]:
best_val_acc = 0
for epoch in range(1, 20): 
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    print(f"Epoch {epoch}")
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f"Train loss: {train_loss:.4f}")
    val_acc, val_report = eval_model(model, val_loader, device, le)
    print(f"Validation Accuracy: {val_acc:.4f}")
    print(val_report)
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pt")
        print("Model saved!")


Epoch 1


Training: 100%|██████████| 2399/2399 [11:26<00:00,  3.50it/s]


Train loss: 0.5698


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.68it/s]


Validation Accuracy: 0.8424
              precision    recall  f1-score   support

         EAP       0.87      0.80      0.83      1992
         HPL       0.86      0.89      0.87      1307
         MWS       0.80      0.86      0.83      1498

    accuracy                           0.84      4797
   macro avg       0.84      0.85      0.84      4797
weighted avg       0.84      0.84      0.84      4797

Model saved!
Epoch 2


Training: 100%|██████████| 2399/2399 [11:41<00:00,  3.42it/s]


Train loss: 0.2978


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.61it/s]


Validation Accuracy: 0.8730
              precision    recall  f1-score   support

         EAP       0.86      0.88      0.87      1992
         HPL       0.91      0.86      0.89      1307
         MWS       0.85      0.88      0.86      1498

    accuracy                           0.87      4797
   macro avg       0.88      0.87      0.87      4797
weighted avg       0.87      0.87      0.87      4797

Model saved!
Epoch 3


Training: 100%|██████████| 2399/2399 [11:44<00:00,  3.40it/s]


Train loss: 0.1598


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.57it/s]


Validation Accuracy: 0.8851
              precision    recall  f1-score   support

         EAP       0.88      0.89      0.88      1992
         HPL       0.91      0.90      0.90      1307
         MWS       0.88      0.87      0.87      1498

    accuracy                           0.89      4797
   macro avg       0.89      0.89      0.89      4797
weighted avg       0.89      0.89      0.89      4797

Model saved!
Epoch 4


Training: 100%|██████████| 2399/2399 [11:45<00:00,  3.40it/s]


Train loss: 0.0938


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.52it/s]


Validation Accuracy: 0.8516
              precision    recall  f1-score   support

         EAP       0.93      0.75      0.83      1992
         HPL       0.84      0.91      0.87      1307
         MWS       0.79      0.93      0.85      1498

    accuracy                           0.85      4797
   macro avg       0.85      0.86      0.85      4797
weighted avg       0.86      0.85      0.85      4797

Epoch 5


Training: 100%|██████████| 2399/2399 [11:44<00:00,  3.41it/s]


Train loss: 0.0689


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.64it/s]


Validation Accuracy: 0.8837
              precision    recall  f1-score   support

         EAP       0.90      0.86      0.88      1992
         HPL       0.90      0.90      0.90      1307
         MWS       0.86      0.90      0.88      1498

    accuracy                           0.88      4797
   macro avg       0.88      0.89      0.88      4797
weighted avg       0.88      0.88      0.88      4797

Epoch 6


Training: 100%|██████████| 2399/2399 [11:43<00:00,  3.41it/s]


Train loss: 0.0522


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.51it/s]


Validation Accuracy: 0.8853
              precision    recall  f1-score   support

         EAP       0.86      0.92      0.89      1992
         HPL       0.94      0.86      0.90      1307
         MWS       0.87      0.87      0.87      1498

    accuracy                           0.89      4797
   macro avg       0.89      0.88      0.89      4797
weighted avg       0.89      0.89      0.89      4797

Model saved!
Epoch 7


Training: 100%|██████████| 2399/2399 [11:44<00:00,  3.41it/s]


Train loss: 0.0471


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.58it/s]


Validation Accuracy: 0.8801
              precision    recall  f1-score   support

         EAP       0.86      0.90      0.88      1992
         HPL       0.88      0.91      0.89      1307
         MWS       0.90      0.83      0.86      1498

    accuracy                           0.88      4797
   macro avg       0.88      0.88      0.88      4797
weighted avg       0.88      0.88      0.88      4797

Epoch 8


Training: 100%|██████████| 2399/2399 [11:44<00:00,  3.41it/s]


Train loss: 0.0404


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.57it/s]


Validation Accuracy: 0.8774
              precision    recall  f1-score   support

         EAP       0.85      0.90      0.87      1992
         HPL       0.92      0.88      0.90      1307
         MWS       0.88      0.84      0.86      1498

    accuracy                           0.88      4797
   macro avg       0.88      0.87      0.88      4797
weighted avg       0.88      0.88      0.88      4797

Epoch 9


Training: 100%|██████████| 2399/2399 [11:44<00:00,  3.40it/s]


Train loss: 0.0357


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.58it/s]


Validation Accuracy: 0.8662
              precision    recall  f1-score   support

         EAP       0.92      0.80      0.86      1992
         HPL       0.90      0.90      0.90      1307
         MWS       0.79      0.93      0.85      1498

    accuracy                           0.87      4797
   macro avg       0.87      0.87      0.87      4797
weighted avg       0.87      0.87      0.87      4797

Epoch 10


Training: 100%|██████████| 2399/2399 [11:43<00:00,  3.41it/s]


Train loss: 0.0370


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.64it/s]


Validation Accuracy: 0.8509
              precision    recall  f1-score   support

         EAP       0.90      0.78      0.84      1992
         HPL       0.95      0.85      0.89      1307
         MWS       0.75      0.94      0.84      1498

    accuracy                           0.85      4797
   macro avg       0.86      0.86      0.86      4797
weighted avg       0.86      0.85      0.85      4797

Epoch 11


Training: 100%|██████████| 2399/2399 [11:42<00:00,  3.41it/s]


Train loss: 0.0289


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.60it/s]


Validation Accuracy: 0.8851
              precision    recall  f1-score   support

         EAP       0.86      0.91      0.88      1992
         HPL       0.93      0.87      0.90      1307
         MWS       0.88      0.87      0.87      1498

    accuracy                           0.89      4797
   macro avg       0.89      0.88      0.89      4797
weighted avg       0.89      0.89      0.89      4797

Epoch 12


Training: 100%|██████████| 2399/2399 [11:42<00:00,  3.41it/s]


Train loss: 0.0301


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.64it/s]


Validation Accuracy: 0.8783
              precision    recall  f1-score   support

         EAP       0.90      0.86      0.88      1992
         HPL       0.92      0.88      0.90      1307
         MWS       0.83      0.90      0.86      1498

    accuracy                           0.88      4797
   macro avg       0.88      0.88      0.88      4797
weighted avg       0.88      0.88      0.88      4797

Epoch 13


Training: 100%|██████████| 2399/2399 [11:43<00:00,  3.41it/s]


Train loss: 0.0245


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.59it/s]


Validation Accuracy: 0.8755
              precision    recall  f1-score   support

         EAP       0.83      0.91      0.87      1992
         HPL       0.93      0.86      0.89      1307
         MWS       0.90      0.83      0.86      1498

    accuracy                           0.88      4797
   macro avg       0.89      0.87      0.88      4797
weighted avg       0.88      0.88      0.88      4797

Epoch 14


Training: 100%|██████████| 2399/2399 [11:44<00:00,  3.41it/s]


Train loss: 0.0262


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.55it/s]


Validation Accuracy: 0.8824
              precision    recall  f1-score   support

         EAP       0.90      0.86      0.88      1992
         HPL       0.90      0.90      0.90      1307
         MWS       0.84      0.90      0.87      1498

    accuracy                           0.88      4797
   macro avg       0.88      0.89      0.88      4797
weighted avg       0.88      0.88      0.88      4797

Epoch 15


Training: 100%|██████████| 2399/2399 [11:43<00:00,  3.41it/s]


Train loss: 0.0245


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.55it/s]


Validation Accuracy: 0.8691
              precision    recall  f1-score   support

         EAP       0.92      0.82      0.87      1992
         HPL       0.92      0.88      0.90      1307
         MWS       0.79      0.93      0.85      1498

    accuracy                           0.87      4797
   macro avg       0.87      0.87      0.87      4797
weighted avg       0.88      0.87      0.87      4797

Epoch 16


Training: 100%|██████████| 2399/2399 [11:43<00:00,  3.41it/s]


Train loss: 0.0211


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.56it/s]


Validation Accuracy: 0.8856
              precision    recall  f1-score   support

         EAP       0.90      0.87      0.89      1992
         HPL       0.92      0.89      0.90      1307
         MWS       0.84      0.90      0.87      1498

    accuracy                           0.89      4797
   macro avg       0.89      0.89      0.89      4797
weighted avg       0.89      0.89      0.89      4797

Model saved!
Epoch 17


Training: 100%|██████████| 2399/2399 [11:43<00:00,  3.41it/s]


Train loss: 0.0222


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.55it/s]


Validation Accuracy: 0.8726
              precision    recall  f1-score   support

         EAP       0.87      0.90      0.88      1992
         HPL       0.96      0.80      0.87      1307
         MWS       0.82      0.90      0.86      1498

    accuracy                           0.87      4797
   macro avg       0.88      0.87      0.87      4797
weighted avg       0.88      0.87      0.87      4797

Epoch 18


Training: 100%|██████████| 2399/2399 [11:43<00:00,  3.41it/s]


Train loss: 0.0225


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.64it/s]


Validation Accuracy: 0.8826
              precision    recall  f1-score   support

         EAP       0.89      0.87      0.88      1992
         HPL       0.91      0.89      0.90      1307
         MWS       0.85      0.89      0.87      1498

    accuracy                           0.88      4797
   macro avg       0.88      0.88      0.88      4797
weighted avg       0.88      0.88      0.88      4797

Epoch 19


Training: 100%|██████████| 2399/2399 [11:41<00:00,  3.42it/s]


Train loss: 0.0211


Evaluating: 100%|██████████| 300/300 [00:28<00:00, 10.66it/s]


Validation Accuracy: 0.8858
              precision    recall  f1-score   support

         EAP       0.90      0.87      0.88      1992
         HPL       0.90      0.90      0.90      1307
         MWS       0.86      0.89      0.87      1498

    accuracy                           0.89      4797
   macro avg       0.89      0.89      0.89      4797
weighted avg       0.89      0.89      0.89      4797

Model saved!


In [None]:
model.load_state_dict(torch.load("best_model.pt"))
test_acc, test_report = eval_model(model, test_loader, device, le)
print(f"Test Accuracy: {test_acc:.4f}")
print(test_report)

Evaluating: 100%|██████████| 300/300 [00:29<00:00, 10.23it/s]

Test Accuracy: 0.8816
              precision    recall  f1-score   support

         EAP       0.88      0.87      0.88      1992
         HPL       0.90      0.90      0.90      1308
         MWS       0.86      0.88      0.87      1497

    accuracy                           0.88      4797
   macro avg       0.88      0.88      0.88      4797
weighted avg       0.88      0.88      0.88      4797




