In [2]:
import numpy as np 
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer

from nltk import pos_tag
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from torch.utils.flop_counter import FlopCounterMode
from tqdm import tqdm

In [3]:
import re
stem = PorterStemmer()

def clean(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z0-9]',' ',text)
    text = text.split()
    text = [ i for i in text if i not in stopwords.words('english')]
    text = [stem.stem(i) for i in text]
    return ' '.join(text)

In [4]:
# df_train = pd.read_csv(r'data\TextClassification\AGNews\train.csv', header=None)
# df_train.columns=['Class Index', 'Title', 'Description']
# df_test = pd.read_csv(r'data\TextClassification\AGNews\test.csv', header=None)
# df_test.columns=['Class Index', 'Title', 'Description']
# df_train['News'] = df_train['Title']+" "+df_train['Description']
# df_test['News'] = df_test['Title']+" "+df_test['Description']

# df_train['clean'] = df_train['News'].apply(clean)
# df_train.to_csv(r"data\TextClassification\AGNews\train-clean.csv")

# df_test['clean'] = df_test['News'].apply(clean)
# df_test.to_csv(r"data\TextClassification\AGNews\test-clean.csv")

df_train = pd.read_csv(r"data\TextClassification\AGNews\train-clean.csv")
df_test = pd.read_csv(r"data\TextClassification\AGNews\test-clean.csv")

In [5]:
df_train.shape

(120000, 6)

In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Unnamed: 0   120000 non-null  int64 
 1   Class Index  120000 non-null  int64 
 2   Title        120000 non-null  object
 3   Description  120000 non-null  object
 4   News         120000 non-null  object
 5   clean        120000 non-null  object
dtypes: int64(2), object(4)
memory usage: 5.5+ MB


In [7]:
df_train['Class Index'].value_counts()

Class Index
3    30000
4    30000
2    30000
1    30000
Name: count, dtype: int64

In [8]:
sent_len1 = df_train['clean'].map(lambda x: len(x.split())).max()
sent_len2 = df_test['clean'].map(lambda x: len(x.split())).max()
sent_len = max(sent_len1, sent_len2)
sent_len

142

In [9]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")



In [10]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

In [11]:
train_data = torch.tensor([tokenizer(doc, padding='max_length', truncation=True, max_length=sent_len).input_ids for doc in df_train['clean']])
train_label = torch.tensor([label - 1 for label in df_train['Class Index']])
train_dataset = TensorDataset(train_data, train_label)

In [12]:
test_data = torch.tensor([tokenizer(doc, padding='max_length', truncation=True, max_length=sent_len).input_ids for doc in df_test['clean']])
test_label = torch.tensor([label - 1 for label in df_test['Class Index']])
test_dataset = TensorDataset(test_data, test_label)

In [13]:
batch_size = 256
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, drop_last=True, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, drop_last=True)

In [None]:
# Model
class GRUTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(GRUTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.bigru1 = nn.GRU(embed_dim, 128, batch_first=True, bidirectional=True)
        self.bigru2 = nn.GRU(128*2, 64, batch_first=True, bidirectional=True)
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(64*2, 256)
        self.dropout1 = nn.Dropout(0.25)
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.25)
        self.fc3 = nn.Linear(128, 64)
        self.dropout3 = nn.Dropout(0.25)
        self.out = nn.Linear(64, num_classes)

    def forward(self, x):
        x = self.embedding(x)                         # [B, L] -> [B, L, E]
        x, _ = self.bigru1(x)                          # [B, L, 2*128]
        x, _ = self.bigru2(x)                          # [B, L, 2*64]
        x = x.permute(0, 2, 1)                         # [B, 2*64, L]
        x = self.global_max_pool(x).squeeze(-1)        # [B, 2*64]
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = F.relu(self.fc3(x))
        x = self.dropout3(x)
        return self.out(x)


In [28]:
# Example values (adjust as needed)
vocabulary_size = tokenizer.vocab_size
embed_size = 256
max_len = sent_len
num_classes = 4
num_epochs = 20
lr = 0.0012

In [38]:
# Instantiate model
model = GRUTextClassifier(vocabulary_size, embed_size, num_classes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)

num_total_params = sum(p.numel() for p in model.parameters())
print(f'number of parameters: {num_total_params}')

number of parameters: 33262532


In [39]:
from ptflops import get_model_complexity_info

In [44]:
def input_constructor(input_res):
    # input_res is (batch_size, sent_len)
    return {'x': torch.ones(input_res, dtype=torch.long)}

macs, params = get_model_complexity_info(
    model, 
    (1, sent_len), 
    as_strings=False, 
    backend='pytorch', 
    print_per_layer_stat=True, 
    verbose=True,
    input_constructor=input_constructor
)
print(f"MACs: {macs}, Params: {params}")


GRUTextClassifier(
  494.53 k, 1.487% Params, 60.13 MMac, 99.969% MACs, 
  (embedding): Embedding(0, 0.000% Params, 0.0 Mac, 0.000% MACs, 128000, 256)
  (bigru1): GRU(296.45 k, 0.891% Params, 42.35 MMac, 70.411% MACs, 256, 128, batch_first=True, bidirectional=True)
  (bigru2): GRU(123.65 k, 0.372% Params, 17.69 MMac, 29.404% MACs, 256, 64, batch_first=True, bidirectional=True)
  (global_max_pool): AdaptiveMaxPool1d(0, 0.000% Params, 18.18 KMac, 0.030% MACs, output_size=1)
  (fc1): Linear(33.02 k, 0.099% Params, 33.02 KMac, 0.055% MACs, in_features=128, out_features=256, bias=True)
  (dropout1): Dropout(0, 0.000% Params, 0.0 Mac, 0.000% MACs, p=0.25, inplace=False)
  (fc2): Linear(32.9 k, 0.099% Params, 32.9 KMac, 0.055% MACs, in_features=256, out_features=128, bias=True)
  (dropout2): Dropout(0, 0.000% Params, 0.0 Mac, 0.000% MACs, p=0.25, inplace=False)
  (fc3): Linear(8.26 k, 0.025% Params, 8.26 KMac, 0.014% MACs, in_features=128, out_features=64, bias=True)
  (dropout3): Dropout(0, 

In [63]:
# Loss and optimizer

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.00001)
# Training Loop
model = model.to(device)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for inputs, labels in tqdm(train_dataloader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)                     # [B, num_classes]
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for val_inputs, val_labels in tqdm(test_dataloader):
            val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)
            val_outputs = model(val_inputs)
            v_loss = loss_fn(val_outputs, val_labels)
            val_loss += v_loss.item()
            preds = val_outputs.argmax(dim=1)
            correct += (preds == val_labels).sum().item()
            total += val_labels.size(0)
    val_acc = correct / total if total > 0 else 0

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

100%|██████████| 468/468 [00:14<00:00, 32.37it/s]
100%|██████████| 29/29 [00:00<00:00, 65.23it/s]


Epoch 1/20, Loss: 231.4989, Val Loss: 8.8333, Val Acc: 0.8982


100%|██████████| 468/468 [00:13<00:00, 35.85it/s]
100%|██████████| 29/29 [00:00<00:00, 72.53it/s]


Epoch 2/20, Loss: 125.6758, Val Loss: 8.3877, Val Acc: 0.8984


100%|██████████| 468/468 [00:12<00:00, 36.17it/s]
100%|██████████| 29/29 [00:00<00:00, 74.33it/s]


Epoch 3/20, Loss: 101.5442, Val Loss: 7.7483, Val Acc: 0.9080


100%|██████████| 468/468 [00:12<00:00, 36.15it/s]
100%|██████████| 29/29 [00:00<00:00, 71.09it/s]


Epoch 4/20, Loss: 83.6554, Val Loss: 7.9473, Val Acc: 0.9118


100%|██████████| 468/468 [00:12<00:00, 36.52it/s]
100%|██████████| 29/29 [00:00<00:00, 70.22it/s]


Epoch 5/20, Loss: 69.4139, Val Loss: 8.2513, Val Acc: 0.9102


100%|██████████| 468/468 [00:13<00:00, 34.21it/s]
100%|██████████| 29/29 [00:00<00:00, 48.22it/s]


Epoch 6/20, Loss: 56.0850, Val Loss: 9.1144, Val Acc: 0.9099


100%|██████████| 468/468 [00:14<00:00, 33.17it/s]
100%|██████████| 29/29 [00:00<00:00, 67.89it/s]


Epoch 7/20, Loss: 44.5056, Val Loss: 9.6988, Val Acc: 0.9133


100%|██████████| 468/468 [00:13<00:00, 34.21it/s]
100%|██████████| 29/29 [00:00<00:00, 65.18it/s]


Epoch 8/20, Loss: 36.0154, Val Loss: 10.5812, Val Acc: 0.9103


100%|██████████| 468/468 [00:13<00:00, 34.63it/s]
100%|██████████| 29/29 [00:00<00:00, 65.91it/s]


Epoch 9/20, Loss: 29.7368, Val Loss: 10.9907, Val Acc: 0.9098


100%|██████████| 468/468 [00:14<00:00, 31.86it/s]
100%|██████████| 29/29 [00:00<00:00, 55.00it/s]


Epoch 10/20, Loss: 25.5902, Val Loss: 12.5474, Val Acc: 0.9084


100%|██████████| 468/468 [00:14<00:00, 32.07it/s]
100%|██████████| 29/29 [00:00<00:00, 66.06it/s]


Epoch 11/20, Loss: 21.6882, Val Loss: 12.7420, Val Acc: 0.9080


100%|██████████| 468/468 [00:14<00:00, 32.17it/s]
100%|██████████| 29/29 [00:00<00:00, 38.54it/s]


Epoch 12/20, Loss: 19.3233, Val Loss: 12.7352, Val Acc: 0.9077


100%|██████████| 468/468 [00:14<00:00, 32.73it/s]
100%|██████████| 29/29 [00:00<00:00, 63.45it/s]


Epoch 13/20, Loss: 18.2203, Val Loss: 13.0892, Val Acc: 0.9098


100%|██████████| 468/468 [00:13<00:00, 35.77it/s]
100%|██████████| 29/29 [00:00<00:00, 74.74it/s]


Epoch 14/20, Loss: 17.4841, Val Loss: 13.3803, Val Acc: 0.9124


100%|██████████| 468/468 [00:14<00:00, 33.15it/s]
100%|██████████| 29/29 [00:00<00:00, 49.09it/s]


Epoch 15/20, Loss: 16.1766, Val Loss: 13.0035, Val Acc: 0.9129


100%|██████████| 468/468 [00:23<00:00, 19.66it/s]
100%|██████████| 29/29 [00:01<00:00, 24.05it/s]


Epoch 16/20, Loss: 14.4031, Val Loss: 14.6887, Val Acc: 0.9054


100%|██████████| 468/468 [00:24<00:00, 18.76it/s]
100%|██████████| 29/29 [00:00<00:00, 45.87it/s]


Epoch 17/20, Loss: 15.1299, Val Loss: 14.8473, Val Acc: 0.9102


100%|██████████| 468/468 [00:25<00:00, 18.60it/s]
100%|██████████| 29/29 [00:00<00:00, 48.27it/s]


Epoch 18/20, Loss: 14.8119, Val Loss: 13.1010, Val Acc: 0.9134


100%|██████████| 468/468 [00:23<00:00, 19.57it/s]
100%|██████████| 29/29 [00:00<00:00, 41.30it/s]


Epoch 19/20, Loss: 13.5512, Val Loss: 14.9606, Val Acc: 0.9119


100%|██████████| 468/468 [00:23<00:00, 19.67it/s]
100%|██████████| 29/29 [00:00<00:00, 46.57it/s]

Epoch 20/20, Loss: 13.7057, Val Loss: 13.4410, Val Acc: 0.9153





In [68]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from torchmetrics import ConfusionMatrix

def calculate_metrics(cl_model, dataloader, num_classes):
    cm = ConfusionMatrix(task="multiclass", num_classes=num_classes)

    y_pred = []
    y_true = []

    cl_model = cl_model.eval()
    cl_model.to(device)
    for X, y in tqdm(dataloader):
        X = X.to(device)
        with torch.no_grad():
            y_p = cl_model(X)
            y_p = y_p.cpu()
        y_pred.append(y_p)
        y_true.append(y)
    y_pred = torch.cat(y_pred, dim=0)
    y_true = torch.cat(y_true, dim=0)
    y_pred2 = torch.argmax(y_pred, dim=1)
    y_true2 = torch.argmax(y_true, dim=1) if len(y_true.shape)>1 else y_true
    print(f'classification report: \n {classification_report(y_true2, y_pred2, digits=4)}')
    print(f'confusion matrix:\n {cm(y_pred2, y_true2)}')
    print('================================')


In [69]:
calculate_metrics(model, test_dataloader, num_classes=4)

100%|██████████| 29/29 [00:00<00:00, 42.58it/s]


classification report: 
               precision    recall  f1-score   support

           0     0.9223    0.9133    0.9178      1858
           1     0.9645    0.9661    0.9653      1858
           2     0.8908    0.8778    0.8843      1850
           3     0.8837    0.9037    0.8936      1858

    accuracy                         0.9153      7424
   macro avg     0.9153    0.9152    0.9152      7424
weighted avg     0.9154    0.9153    0.9153      7424

confusion matrix:
 tensor([[1697,   36,   74,   51],
        [  30, 1795,   14,   19],
        [  60,   15, 1624,  151],
        [  53,   15,  111, 1679]])
