In [1]:
import numpy as np 
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer

from nltk import pos_tag
import matplotlib.pyplot as plt
from transformers import AutoTokenizer
from torch.utils.flop_counter import FlopCounterMode
from tqdm import tqdm

In [2]:
import re
stem = PorterStemmer()

def clean(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z0-9]',' ',text)
    text = text.split()
    text = [ i for i in text if i not in stopwords.words('english')]
    text = [stem.stem(i) for i in text]
    return ' '.join(text)

In [3]:
# keep_ratio = 1.0
# df_test = pd.read_csv(r'data\TextClassification\IMDB\test.csv')
# df_test['Topic'] = df_test['label']
# df_test['Content'] = df_test['text']
# df_test.drop(['label', 'text'], axis=1, inplace=True)
# df_test.dropna(inplace=True)
# df_test = df_test.sample(frac=1).reset_index(drop=True)
# df_test = df_test.iloc[:int(keep_ratio*df_test.shape[0])]
# df_train = pd.read_csv(r'data\TextClassification\IMDB\train.csv')
# df_train['Topic'] = df_train['label']
# df_train['Content'] = df_train['text']
# df_train.drop(['label', 'text'], axis=1, inplace=True)
# df_train.dropna(inplace=True)
# df_train = df_train.sample(frac=1).reset_index(drop=True)
# df_train = df_train.iloc[:int(keep_ratio*df_train.shape[0])]

# df_train['clean'] = df_train['Content'].apply(clean)
# df_train.to_csv(r"data\TextClassification\IMDB\train-clean.csv")

# df_test['clean'] = df_test['Content'].apply(clean)
# df_test.to_csv(r"data\TextClassification\IMDB\test-clean.csv")

df_train = pd.read_csv(r"data\TextClassification\IMDB\train-clean.csv")
df_test = pd.read_csv(r"data\TextClassification\IMDB\test-clean.csv")

In [4]:
df_classes = ["Negative", "Positive"]
df = pd.concat([df_train, df_test])
df.shape

(50000, 4)

In [5]:
df_train.shape

(25000, 4)

In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  25000 non-null  int64 
 1   Topic       25000 non-null  int64 
 2   Content     25000 non-null  object
 3   clean       25000 non-null  object
dtypes: int64(2), object(2)
memory usage: 781.4+ KB


In [7]:
df_train['Topic'].value_counts()

Topic
1    12500
0    12500
Name: count, dtype: int64

In [8]:
sent_len1 = df_train['clean'].map(lambda x: len(x.split())).max()
sent_len2 = df_test['clean'].map(lambda x: len(x.split())).max()
sent_len = max(sent_len1, sent_len2)
sent_len

1455

In [9]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large", cache_dir=r'cache_dir/tokenizers/')



In [10]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

In [11]:
train_data = torch.tensor([tokenizer(doc, padding='max_length', truncation=True, max_length=min(sent_len, 1024)).input_ids for doc in df_train['clean']])
train_label = torch.tensor([label for label in df_train['Topic']])
train_dataset = TensorDataset(train_data, train_label)

In [21]:
test_data = torch.tensor([tokenizer(doc, padding='max_length', truncation=True, max_length=min(sent_len, 1024)).input_ids for doc in df_test['clean']])
test_label = torch.tensor([label for label in df_test['Topic']])
test_dataset = TensorDataset(test_data, test_label)

In [22]:
batch_size = 128
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, drop_last=True, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, drop_last=True)

In [23]:
# Model
class GRUTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(GRUTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.bigru1 = nn.GRU(embed_dim, 128, batch_first=True, bidirectional=True)
        self.bigru2 = nn.GRU(128*2, 64, batch_first=True, bidirectional=True)
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(64*2, 256)
        self.dropout1 = nn.Dropout(0.25)
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.25)
        self.fc3 = nn.Linear(128, 64)
        self.dropout3 = nn.Dropout(0.25)
        self.out = nn.Linear(64, num_classes)

    def forward(self, x):
        x = self.embedding(x)                         # [B, L] -> [B, L, E]
        x, _ = self.bigru1(x)                          # [B, L, 2*128]
        x, _ = self.bigru2(x)                          # [B, L, 2*64]
        x = x.permute(0, 2, 1)                         # [B, 2*64, L]
        x = self.global_max_pool(x).squeeze(-1)        # [B, 2*64]
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        x = F.relu(self.fc3(x))
        x = self.dropout3(x)
        return self.out(x)


In [32]:
# Example values (adjust as needed)
vocabulary_size = tokenizer.vocab_size
embed_size = 256
max_len = sent_len
num_classes = 4
num_epochs = 3
lr = 0.0012

In [33]:
# Instantiate model
model = GRUTextClassifier(vocabulary_size, embed_size, num_classes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)

num_total_params = sum(p.numel() for p in model.parameters())
print(f'number of parameters: {num_total_params}')

number of parameters: 33262532


In [34]:
from ptflops import get_model_complexity_info

In [35]:
def input_constructor(input_res):
    # input_res is (batch_size, sent_len)
    return {'x': torch.ones(input_res, dtype=torch.long)}

macs, params = get_model_complexity_info(
    model, 
    (1, sent_len), 
    as_strings=False, 
    backend='pytorch', 
    print_per_layer_stat=True, 
    verbose=True,
    input_constructor=input_constructor
)
print(f"MACs: {macs}, Params: {params}")


GRUTextClassifier(
  494.53 k, 1.487% Params, 615.41 MMac, 99.970% MACs, 
  (embedding): Embedding(0, 0.000% Params, 0.0 Mac, 0.000% MACs, 128000, 256)
  (bigru1): GRU(296.45 k, 0.891% Params, 433.94 MMac, 70.491% MACs, 256, 128, batch_first=True, bidirectional=True)
  (bigru2): GRU(123.65 k, 0.372% Params, 181.21 MMac, 29.437% MACs, 256, 64, batch_first=True, bidirectional=True)
  (global_max_pool): AdaptiveMaxPool1d(0, 0.000% Params, 186.24 KMac, 0.030% MACs, output_size=1)
  (fc1): Linear(33.02 k, 0.099% Params, 33.02 KMac, 0.005% MACs, in_features=128, out_features=256, bias=True)
  (dropout1): Dropout(0, 0.000% Params, 0.0 Mac, 0.000% MACs, p=0.25, inplace=False)
  (fc2): Linear(32.9 k, 0.099% Params, 32.9 KMac, 0.005% MACs, in_features=256, out_features=128, bias=True)
  (dropout2): Dropout(0, 0.000% Params, 0.0 Mac, 0.000% MACs, p=0.25, inplace=False)
  (fc3): Linear(8.26 k, 0.025% Params, 8.26 KMac, 0.001% MACs, in_features=128, out_features=64, bias=True)
  (dropout3): Dropout

In [36]:
# Loss and optimizer

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.00001)
# Training Loop
model = model.to(device)
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for inputs, labels in tqdm(train_dataloader):
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)                     # [B, num_classes]
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for val_inputs, val_labels in tqdm(test_dataloader):
            val_inputs, val_labels = val_inputs.to(device), val_labels.to(device)
            val_outputs = model(val_inputs)
            v_loss = loss_fn(val_outputs, val_labels)
            val_loss += v_loss.item()
            preds = val_outputs.argmax(dim=1)
            correct += (preds == val_labels).sum().item()
            total += val_labels.size(0)
    val_acc = correct / total if total > 0 else 0

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

  0%|          | 0/195 [00:00<?, ?it/s]

100%|██████████| 195/195 [00:11<00:00, 16.34it/s]
100%|██████████| 195/195 [00:05<00:00, 37.60it/s]


Epoch 1/3, Loss: 125.1410, Val Loss: 76.6097, Val Acc: 0.8258


100%|██████████| 195/195 [00:11<00:00, 17.05it/s]
100%|██████████| 195/195 [00:04<00:00, 45.34it/s]


Epoch 2/3, Loss: 65.4229, Val Loss: 64.0929, Val Acc: 0.8589


100%|██████████| 195/195 [00:11<00:00, 17.69it/s]
100%|██████████| 195/195 [00:04<00:00, 42.95it/s]

Epoch 3/3, Loss: 46.2216, Val Loss: 61.9259, Val Acc: 0.8750





In [37]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from torchmetrics import ConfusionMatrix

def calculate_metrics(cl_model, dataloader, num_classes):
    cm = ConfusionMatrix(task="multiclass", num_classes=num_classes)

    y_pred = []
    y_true = []

    cl_model = cl_model.eval()
    cl_model.to(device)
    for X, y in tqdm(dataloader):
        X = X.to(device)
        with torch.no_grad():
            y_p = cl_model(X)
            y_p = y_p.cpu()
        y_pred.append(y_p)
        y_true.append(y)
    y_pred = torch.cat(y_pred, dim=0)
    y_true = torch.cat(y_true, dim=0)
    y_pred2 = torch.argmax(y_pred, dim=1)
    y_true2 = torch.argmax(y_true, dim=1) if len(y_true.shape)>1 else y_true
    print(f'classification report: \n {classification_report(y_true2, y_pred2, digits=4)}')
    print(f'confusion matrix:\n {cm(y_pred2, y_true2)}')
    print('================================')


In [38]:
calculate_metrics(model, test_dataloader, num_classes=4)

100%|██████████| 195/195 [00:04<00:00, 46.02it/s]

classification report: 
               precision    recall  f1-score   support

           0     0.8634    0.8910    0.8770     12481
           1     0.8873    0.8590    0.8730     12479

    accuracy                         0.8750     24960
   macro avg     0.8754    0.8750    0.8750     24960
weighted avg     0.8754    0.8750    0.8750     24960

confusion matrix:
 tensor([[11120,  1361,     0,     0],
        [ 1759, 10720,     0,     0],
        [    0,     0,     0,     0],
        [    0,     0,     0,     0]])



