In [15]:
import os, re, math, nltk, torch
import numpy as np
import pandas as pd
from collections import Counter
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tqdm import tqdm

nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))

special_tokens = ["<PAD>", "<UNK>", "<SOS>", "<EOS>", "<NUM>"]
batch_size=32
path = "/kaggle/input/dbpedia-classes"

In [16]:
train_path = os.path.join(path, "DBPEDIA_train.csv")
test_path = os.path.join(path, "DBPEDIA_test.csv")
val_path = os.path.join(path, "DBPEDIA_val.csv")

In [17]:
def preprocess(path, config):
    df = pd.read_csv(path)
    df['label'] = df['l3']
    df.drop(columns=["l1", "l2", "l3"], inplace=True)
    df["len"] = df["text"].apply(len)
    df = df[df["len"] < config.max_words]
    
    def tokenize(text, stop_words):
        tokens = re.split(f"\W+", text.lower())
        words_tokens = ["<SOS>"]
        words_tokens += [token if token.isalpha() else "<NUM>" for token in tokens if token and token not in stop_words]
        words_tokens += ["<EOS>"]
        return words_tokens


    def build_vocab(tokens, min_freq=3):
        vocab_counter = Counter(word for seq in tokens for word in seq)
        vocab = {'<PAD>': 0, '<UNK>': 1, "<SOS>": 2, "<EOS>": 3, '<NUM>': 4}
        for token, freq in vocab_counter.items():
            if token not in special_tokens and freq >= min_freq:
                vocab[token] = len(vocab)
        return vocab

    def encode_tokens(tokens, vocab, seq_len):
        encoded = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
        if len(encoded) < seq_len:
            encoded += [vocab["<PAD>"]] * (seq_len - len(encoded))
        else:
            encoded = encoded[:seq_len]
        return encoded

    df["tokens"] = df["text"].apply(lambda x: tokenize(x, config.stop_words))
    if config.vocab is None:
        vocab = build_vocab(df["tokens"])
        config.vocab = vocab
        
    print(f"vocab size: {len(config.vocab)}")

    seq_len = df["tokens"].apply(len)
    max_len = seq_len.max()
    
    print(f"seq len max: {max_len}, mean: {seq_len.mean()}")
    
    df["encoded_text"] = df["tokens"].apply(lambda x: encode_tokens(x, config.vocab, max_len))
    
    if config.label_vocab is None:
        config.label_vocab = {label: idx for idx, label in enumerate(df["label"].unique())}

    print(f"num_labels: {len(config.label_vocab)}")
    
    df["encoded_label"] = df["label"].apply(lambda x: config.label_vocab[x])
    df.reset_index(drop=True, inplace=True)
    
    return df[["encoded_text", "encoded_label"]]

In [18]:
class Config:
    def __init__(self, stop_words=stop_words):
        self.vocab = None
        self.label_vocab = None
        self.max_words = 360
        self.stop_words = stop_words

In [19]:
class TextDataset(Dataset):
    def __init__(self, text, label):
        self.text = text
        self.label = label

    def __len__(self):
        return len(self.label)


    def __getitem__(self, idx):
        return torch.tensor(self.text[idx], dtype=torch.long), torch.tensor(self.label[idx], dtype=torch.long)



In [20]:
config = Config()
df_train = preprocess(train_path, config)
df_test = preprocess(test_path, config)
df_val = preprocess(val_path, config)
len(config.vocab), len(config.label_vocab)

vocab size: 43928
seq len max: 68, mean: 25.67092819075198
num_labels: 219
vocab size: 43928
seq len max: 56, mean: 25.629540347293155
num_labels: 219
vocab size: 43928
seq len max: 61, mean: 25.67331687919923
num_labels: 219


(43928, 219)

In [21]:
df_train.head()

Unnamed: 0,encoded_text,encoded_label
0,"[2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, ...",0
1,"[2, 1, 1, 32, 33, 34, 35, 36, 37, 38, 39, 40, ...",1
2,"[2, 4, 48, 4, 49, 50, 4, 4, 49, 48, 4, 49, 50,...",2
3,"[2, 52, 53, 54, 55, 56, 57, 53, 54, 58, 4, 4, ...",3
4,"[2, 1, 1, 65, 4, 4, 66, 67, 68, 4, 4, 66, 67, ...",4


In [22]:
x_train = np.array(df_train["encoded_text"].tolist())
x_test = np.array(df_test["encoded_text"].tolist())
x_val = np.array(df_val["encoded_text"].tolist())

y_train = np.array(df_train["encoded_label"].tolist())
y_test = np.array(df_test["encoded_label"].tolist())
y_val = np.array(df_val["encoded_label"].tolist())

In [23]:
train_dataset = TextDataset(x_train, y_train)
test_dataset = TextDataset(x_test, y_test)
val_dataset = TextDataset(x_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

In [24]:
x_train.shape, x_test.shape, x_val.shape, y_train.shape, y_test.shape, y_val.shape

((97383, 68), (24475, 56), (14586, 61), (97383,), (24475,), (14586,))

In [27]:
class PosEmb(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[:, None] * emb[None, :]
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb


class Enc_Transformer(nn.Module):
    def __init__(self, vocab_size, num_classes, emb_dim=256, num_heads=8):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.emb.weight.data *= 1e-3
        self.pos_emb = PosEmb(emb_dim)

        self.multihead_attn = nn.MultiheadAttention(emb_dim, num_heads=num_heads, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim, 4*emb_dim),
            nn.ELU(inplace=True),
            nn.Linear(4*emb_dim, emb_dim),
            nn.LayerNorm(emb_dim),
        )
        self.fc_out = nn.Linear(emb_dim, num_classes)


    def forward(self, input_seq):
        bs, l = input_seq.shape
        emb_seq = self.emb(input_seq)
        seq_idx = torch.arange(l, device=input_seq.device)
        pos_emb = self.pos_emb(seq_idx).reshape(1, l, -1).expand(bs, l, -1)
        emb_seq += pos_emb

        output, attn_map = self.multihead_attn(emb_seq, emb_seq, emb_seq)
        output = self.mlp(output).mean(dim=1)
        return self.fc_out(output), attn_map
        

In [30]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
vocab_size = len(config.vocab)
num_classes = len(config.label_vocab)
epochs = 2
learning_rate=1e-4
model = Enc_Transformer(vocab_size, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=0)


In [31]:

train_acc_hist = []
train_loss_hist = []

val_acc_hist = []
val_loss_hist = []

train_acc, test_acc = 0, 0

for epoch in range(epochs):
    # Training
    model.train()
    correct = total = train_loss = 0
    
    for text, label in tqdm(train_loader):
        text, label = text.to(device), label.to(device)
        pred, _ = model(text)
        
        loss = criterion(pred, label)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        correct += (pred.argmax(1) == label).sum().item()
        total += label.size(0)
        train_loss += loss.item()
    
    # Calculate epoch metrics
    train_acc = correct / total
    train_loss = train_loss / len(train_loader)
    train_acc_hist.append(train_acc)
    train_loss_hist.append(train_loss)
    
    lr_scheduler.step()
    
    # Validation
    model.eval()
    val_correct = val_total = val_loss = 0
    
    with torch.no_grad():
        for text, label in test_loader:
            text, label = text.to(device), label.to(device)
            pred, _ = model(text)
            loss = criterion(pred, label)
            
            val_correct += (pred.argmax(1) == label).sum().item()
            val_total += label.size(0)
            val_loss += loss.item()

    val_acc = val_correct / val_total
    val_loss = val_loss / len(test_loader)
    val_acc_hist.append(val_acc)
    val_loss_hist.append(val_loss)
    
    print(f"epoch {epoch+1}/{epochs} acc: {train_acc:.4f}, val_acc: {val_acc:.4f}, loss: {train_loss:.4f}, val_loss: {val_loss:.4f}")

100%|██████████| 3043/3043 [00:25<00:00, 118.94it/s]


epoch 1/2 acc: 0.4752, val_acc: 0.8935, loss: 2.7544, val_loss: 0.5236


100%|██████████| 3043/3043 [00:25<00:00, 117.61it/s]


epoch 2/2 acc: 0.9246, val_acc: 0.9236, loss: 0.3442, val_loss: 0.3449
