# **1. Set up**

In [1]:
!pip install underthesea

Collecting underthesea
  Downloading underthesea-8.3.0-py3-none-any.whl.metadata (14 kB)
Collecting python-crfsuite>=0.9.6 (from underthesea)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting scikit-learn>=1.6.1 (from underthesea)
  Downloading scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting underthesea_core==1.0.5 (from underthesea)
  Downloading underthesea_core-1.0.5-cp311-cp311-manylinux2010_x86_64.whl.metadata (1.4 kB)
Downloading underthesea-8.3.0-py3-none-any.whl (8.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading underthesea_core-1.0.5-cp311-cp311-manylinux2010_x86_64.whl (978 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m978.6/978.6 kB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_crfsuite-0.9.11

In [3]:
import os
import re
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from underthesea import word_tokenize

In [4]:
!git clone https://github.com/duyvuleo/VNTC.git

Cloning into 'VNTC'...
remote: Enumerating objects: 39, done.[K
remote: Total 39 (delta 0), reused 0 (delta 0), pack-reused 39 (from 1)[K
Receiving objects: 100% (39/39), 160.90 MiB | 38.92 MiB/s, done.
Resolving deltas: 100% (4/4), done.
Updating files: 100% (15/15), done.
Filtering content: 100% (2/2), 168.95 MiB | 42.70 MiB/s, done.


In [5]:
!mkdir -p /kaggle/working/VNTC/Data/10Topics/Ver1.1/train
!mkdir -p /kaggle/working/VNTC/Data/10Topics/Ver1.1/test

!unrar x /kaggle/working/VNTC/Data/10Topics/Ver1.1/Train_Full.rar /kaggle/working/VNTC/Data/10Topics/Ver1.1/train/ > /dev/null 2>&1
!unrar x /kaggle/working/VNTC/Data/10Topics/Ver1.1/Test_Full.rar /kaggle/working/VNTC/Data/10Topics/Ver1.1/test/ > /dev/null 2>&1
print("Extraction completed!")

Extraction completed!


In [6]:
def load_vntc_data(base_dir):
    texts, labels = [], []
    for root, dirs, files in os.walk(base_dir):
        for fname in files:
            if fname.endswith('.txt'):
                fpath = os.path.join(root, fname)
                label = os.path.basename(os.path.dirname(fpath))
    
                try:
                    with open(fpath, encoding='utf-8') as f:
                        text = f.read()
                except UnicodeDecodeError:
                    try:
                        with open(fpath, encoding='utf-16') as f:
                            text = f.read()
                    except UnicodeDecodeError:
                        with open(fpath, encoding='latin-1') as f:
                            text = f.read()
                
                texts.append(text)
                labels.append(label)
    return texts, labels

In [7]:
train_texts, train_labels = load_vntc_data("/kaggle/working/VNTC/Data/10Topics/Ver1.1/train/Train_Full")
test_texts, test_labels = load_vntc_data("/kaggle/working/VNTC/Data/10Topics/Ver1.1/test/Test_Full")
print(f"Loaded {len(train_texts)} samples, {len(set(train_labels))} topics.")
print(f"Loaded {len(test_texts)} samples, {len(set(test_labels))} topics.")

Loaded 33759 samples, 10 topics.
Loaded 50373 samples, 10 topics.


In [8]:
#Stratify train set and test set for faster preprocessing and training 
train_texts_small, _, train_labels_small, _ = train_test_split(
    train_texts,
    train_labels,
    train_size=2000,
    stratify=train_labels,   
    random_state=42
)

test_texts_small, _, test_labels_small, _ = train_test_split(
    test_texts,
    test_labels,
    train_size=2000,
    stratify=test_labels,
    random_state=42
)

In [10]:
#Inspect stratified data
print("Train class distribution:")
print(pd.Series(train_labels_small).value_counts(normalize=True))
print("\nTest class distribution:")
print(pd.Series(test_labels_small).value_counts(normalize=True))

Train class distribution:
The thao            0.1570
Chinh tri Xa hoi    0.1545
Phap luat           0.1145
Suc khoe            0.1005
Doi song            0.0935
Van hoa             0.0910
The gioi            0.0860
Kinh doanh          0.0755
Vi tinh             0.0735
Khoa hoc            0.0540
Name: proportion, dtype: float64

Test class distribution:
Chinh tri Xa hoi    0.1500
The gioi            0.1335
The thao            0.1325
Van hoa             0.1240
Suc khoe            0.1075
Kinh doanh          0.1050
Vi tinh             0.0905
Phap luat           0.0750
Khoa hoc            0.0415
Doi song            0.0405
Name: proportion, dtype: float64


In [11]:
le = LabelEncoder()
y_train = le.fit_transform(train_labels_small)
y_test = le.transform(test_labels_small)
num_classes = len(le.classes_)
print("Classes:", le.classes_)

Classes: ['Chinh tri Xa hoi' 'Doi song' 'Khoa hoc' 'Kinh doanh' 'Phap luat'
 'Suc khoe' 'The gioi' 'The thao' 'Van hoa' 'Vi tinh']


# **2. Modeling**

**a. Text Preprocessing**

In [12]:
def preprocess (text):
    #Lowercase 
    text = text.lower()
    #Remove url 
    text = re.sub(r"https?://\S+", '',text)
    #Remove punctuation
    text = re.sub(r"[^a-zA-ZÀ-ỹ\s]", " ", text)
    #Strip extra space 
    text = re.sub(r"\s+", " ", text).strip()
    #Tokenize 
    text = word_tokenize(text, format="text")
    return text

In [13]:
train_texts_clean = [preprocess(t) for t in train_texts_small]
test_texts_clean = [preprocess(t) for t in test_texts_small]

In [14]:
X_train, X_val, y_train, y_val = train_test_split(
    train_texts_clean, y_train, test_size=0.2, random_state=42, stratify=y_train
)
print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(test_texts_clean)}")

Train: 1600, Val: 400, Test: 2000


In [15]:
#TF-IDF 
vectorizer = TfidfVectorizer(max_features=3000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(test_texts_clean)

**2. Building model**

In [16]:
#Baseline model check (Logistic Regression)
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)
print("Validation Accuracy:", clf.score(X_val_tfidf, y_val))

Validation Accuracy: 0.845


In [17]:
class TfidfDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X.toarray(), dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_dataset = TfidfDataset(X_train_tfidf, y_train)
val_dataset = TfidfDataset(X_val_tfidf, y_val)
test_dataset = TfidfDataset(X_test_tfidf, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [18]:
#Model architecture
class SimpleNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.batchnorm = nn.BatchNorm1d(hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.batchnorm(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

#Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = SimpleNN(input_dim=3000, hidden_dim=512, num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

Using device: cpu


In [19]:
def evaluate(model, loader):
    model.eval()
    correct, total = 0, 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch)
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(y_batch.cpu().numpy())
    acc = correct / total
    return acc, all_preds, all_labels

# Training configuration
num_epochs = 15  
best_val_acc = 0

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    val_acc, _, _ = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {total_loss:.3f} | Val Acc: {val_acc:.4f}")
    
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pt")

Epoch 1/15 | Loss: 23.850 | Val Acc: 0.8450
Epoch 2/15 | Loss: 3.202 | Val Acc: 0.8500
Epoch 3/15 | Loss: 0.988 | Val Acc: 0.8425
Epoch 4/15 | Loss: 0.447 | Val Acc: 0.8350
Epoch 5/15 | Loss: 0.281 | Val Acc: 0.8425
Epoch 6/15 | Loss: 0.188 | Val Acc: 0.8350
Epoch 7/15 | Loss: 0.149 | Val Acc: 0.8500
Epoch 8/15 | Loss: 0.117 | Val Acc: 0.8375
Epoch 9/15 | Loss: 0.096 | Val Acc: 0.8375
Epoch 10/15 | Loss: 0.081 | Val Acc: 0.8475
Epoch 11/15 | Loss: 0.070 | Val Acc: 0.8400
Epoch 12/15 | Loss: 0.064 | Val Acc: 0.8425
Epoch 13/15 | Loss: 0.052 | Val Acc: 0.8425
Epoch 14/15 | Loss: 0.046 | Val Acc: 0.8450
Epoch 15/15 | Loss: 0.040 | Val Acc: 0.8475


In [20]:
model.load_state_dict(torch.load("best_model.pt"))
test_acc, preds, labels = evaluate(model, test_loader)
print(f"Test Accuracy: {test_acc:.4f}")
print("\nClassification Report:\n", classification_report(labels, preds))

Test Accuracy: 0.8830

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.88      0.84       300
           1       0.73      0.54      0.62        81
           2       0.82      0.70      0.75        83
           3       0.90      0.80      0.84       210
           4       0.83      0.93      0.88       150
           5       0.89      0.96      0.92       215
           6       0.91      0.91      0.91       267
           7       0.98      0.97      0.97       265
           8       0.90      0.92      0.91       248
           9       0.91      0.90      0.90       181

    accuracy                           0.88      2000
   macro avg       0.87      0.85      0.86      2000
weighted avg       0.88      0.88      0.88      2000



The model achieved a test accuracy of 88.3%, indicating fairly strong overall performance across the 10 topics. Most classes show balanced precision, recall, and F1-scores around 0.85–0.95, suggesting consistent prediction quality. However, classes 1 and 2 have noticeably lower recall (0.54 and 0.70), meaning the model sometimes fails to correctly identify samples from these categories. Despite that, the macro average F1-score of 0.86 confirms good generalization, and the weighted average F1-score of 0.88 shows that performance is reliable even considering class imbalance. Overall, the classifier performs well, with room for improvement in the minority or harder-to-distinguish classes.