In [1]:
import torch
from torch.utils.data import Dataset
import torch.nn as nn
from transformers import DistilBertModel
from torch.utils.data import DataLoader
import pandas as pd
from transformers import DistilBertTokenizer
from sklearn.preprocessing import LabelEncoder
import joblib
import os

In [2]:
class F1Dataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of raw strings. Anything exposing ``.tolist()``
            (pandas Series, NumPy array) or any plain iterable is accepted.
        labels: Sequence of class ids aligned with ``texts``; each value must
            be convertible to a ``torch.long`` scalar tensor.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = self._as_list(texts)
        self.labels = self._as_list(labels)
        if len(self.texts) != len(self.labels):
            # Fail at construction instead of with an IndexError mid-epoch.
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _as_list(values):
        """Return ``values`` as a Python list, using ``.tolist()`` when available."""
        return values.tolist() if hasattr(values, "tolist") else list(values)

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` flattened to
            1-D tensors, and ``"labels"`` as a ``torch.long`` scalar tensor.
        """
        text = self.texts[idx]

        # Call the tokenizer directly: ``encode_plus`` is the legacy entry
        # point and ``__call__`` accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            # return_tensors="pt" adds a leading batch axis; flatten drops it
            # so the DataLoader can stack items into a batch itself.
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }


In [3]:

class F1BertModel(nn.Module):
    """DistilBERT encoder followed by dropout and a linear classification head.

    Args:
        num_classes: Number of output classes (width of the final layer).
        model_name: Checkpoint name or path passed to
            ``DistilBertModel.from_pretrained``.
        dropout: Dropout probability applied to the pooled representation
            before the classification layer.
    """

    def __init__(self, num_classes, model_name="distilbert-base-uncased", dropout=0.3):
        super().__init__()
        # Attribute names are kept as bert/dropout/fc so that previously saved
        # state_dict keys continue to load.
        self.bert = DistilBertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores (logits), one row per input sequence.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.
        """
        bert_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # Use the hidden state at sequence position 0 (the CLS token) as the
        # representation of the whole sequence.
        cls_output = bert_output.last_hidden_state[:, 0, :]

        return self.fc(self.dropout(cls_output))


In [4]:
# --- Configuration -----------------------------------------------------------
SEED = 42             # fixes shuffle order, dropout masks and classifier-head init
BATCH_SIZE = 4
LEARNING_RATE = 2e-5
epochs = 10

# Seed before the model and loaders are built so a fresh "Restart & Run All"
# repeats the same run (up to any nondeterministic GPU kernels).
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- Data --------------------------------------------------------------------
train_df = pd.read_csv("data/processed/train.csv")
test_df = pd.read_csv("data/processed/test.csv")

# NOTE(security): joblib.load unpickles the file and can execute arbitrary code;
# only load an encoder file that this project produced.
le = joblib.load("models/label_encoder.pkl")

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

train_dataset = F1Dataset(train_df["text"], train_df["label"], tokenizer)
test_dataset = F1Dataset(test_df["text"], test_df["label"], tokenizer)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# --- Model, loss and optimizer -----------------------------------------------
# The label encoder is used here only to size the classification head.
num_classes = len(le.classes_)
model = F1BertModel(num_classes=num_classes).to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# --- Training loop -----------------------------------------------------------
model.train()  # enable dropout for the whole training phase

for epoch in range(epochs):
    # Sum of per-batch losses for this epoch (not a per-batch average).
    total_loss = 0.0

    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f}")

# --- Persist the trained weights ---------------------------------------------
os.makedirs("models", exist_ok=True)
torch.save(model.state_dict(), "models/bert_model.pt")

print("Training complete. Model saved in models/bert_model.pt")


Epoch 1/10 - Loss: 20.3566
Epoch 2/10 - Loss: 19.0166
Epoch 3/10 - Loss: 16.9195
Epoch 4/10 - Loss: 13.1263
Epoch 5/10 - Loss: 10.0243
Epoch 6/10 - Loss: 8.6454
Epoch 7/10 - Loss: 6.6165
Epoch 8/10 - Loss: 5.0795
Epoch 9/10 - Loss: 4.5026
Epoch 10/10 - Loss: 3.2061
Training complete. Model saved in models/bert_model.pt
