In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input mount.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
import pandas as pd

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

In [None]:
# Read the train and test splits (comma-separated, UTF-8) from the Kaggle input mount.
train_data = pd.read_csv(
    '/kaggle/input/wiki-data2/train_wiki_data.csv',
    delimiter=',',
    encoding='utf-8',
)
test_data = pd.read_csv(
    '/kaggle/input/wiki-data2/test_wiki_data.csv',
    delimiter=',',
    encoding='utf-8',
)

In [None]:
# Number of training rows per label value, as loaded from the CSV.
train_data['label'].value_counts()

In [None]:
import re
import nltk
from stop_words import get_stop_words

# Fetch NLTK's "stopwords" corpus.
# NOTE(review): no active code in the visible notebook reads this corpus; the
# list below comes from the separate `stop_words` package. Confirm whether the
# download is still needed.
nltk.download('stopwords')
# Stop-word list for language code 'vi' from the `stop_words` package.
# NOTE(review): not referenced by any active code in the visible notebook.
stop_words = get_stop_words('vi')

def preprocess_text(text, stop_words=None):
    """Strip citation markers and parenthetical asides from one document.

    Steps, in order:
      1. remove bracketed numeric references such as ``[12]``;
      2. remove parenthesised spans that contain no nested parentheses
         (for nested groups only the innermost pair is removed);
      3. replace non-breaking spaces (U+00A0) with ordinary spaces;
      4. optionally drop stop words.

    Args:
        text: Raw document text. Non-string values (``None``, or the ``NaN``
            pandas uses for missing cells) are returned unchanged so a later
            null check still sees them, instead of ``re.sub`` raising
            ``TypeError``.
        stop_words: Optional collection of lowercase words to drop (a ``set``
            gives the fastest lookups). When given and non-empty, the text is
            split on whitespace, words whose lowercase form is in the
            collection are removed, and the remainder is re-joined with single
            spaces. ``None`` (the default) skips this step entirely.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    # Missing cells arrive as float NaN / None; pass them through untouched.
    if not isinstance(text, str):
        return text

    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\([^()]*\)', '', text)
    text = re.sub(r'\xa0', ' ', text)

    if stop_words:
        kept_words = [word for word in text.split() if word.lower() not in stop_words]
        text = ' '.join(kept_words)
    return text

In [None]:
# Clean the text column of both splits with the same preprocessing function.
for frame in (train_data, test_data):
    frame['content'] = frame['content'].apply(preprocess_text)

In [None]:
# Randomly discard a fixed number of label-0 rows (seeded, so the same rows are
# picked for the same input frame).
# Note: the cell rebinds `train_data` from itself, so running it again removes
# a further batch of rows.
num_samples_to_remove = 600
label_zero_rows = train_data.loc[train_data['label'] == 0]
indices_to_remove = label_zero_rows.sample(
    n=num_samples_to_remove,
    random_state=42,
).index
train_data = train_data.drop(index=indices_to_remove)

In [None]:
# Rows per label value again, to compare with the counts shown before the
# label-0 rows were dropped.
train_data['label'].value_counts()

In [None]:
# How many labels are missing (True) versus present (False).
train_data['label'].isnull().value_counts()

In [None]:
# Renumber the rows 0..n-1 so the index has no gaps left by the dropped rows;
# drop=True discards the old index instead of keeping it as a column.
train_data = train_data.reset_index(drop=True)

In [None]:
# Display the training frame in its current state.
train_data

In [None]:
# Display the test frame in its current state.
test_data

In [None]:
# Display only the 'content' column of the test frame.
test_data['content']

In [None]:
# Row count of each column the dataset class reads, printed one per line.
for column in ("content", "label"):
    print(len(train_data[column]))

In [None]:
# Number of missing values in each column the dataset class reads.
for column in ("content", "label"):
    missing = train_data[column].isnull().sum()
    print(missing)

In [None]:
# Define training parameters
# Token length every sequence is padded / truncated to by the tokenizer call.
MAX_LEN = 256
# Rows per batch for both the train and the test DataLoader.
BATCH_SIZE = 8
# Number of full passes over the training loader.
EPOCHS = 20
# Step size handed to the Adam optimizer.
LEARNING_RATE = 0.001

In [None]:
# Define dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        dataframe: Frame with a ``content`` column (converted to ``str``
            before tokenizing) and a ``label`` column holding values that can
            be stored in a ``torch.long`` tensor.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method
            that returns ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at *position* ``index``.

        Returns:
            dict with the raw ``content`` string, flattened ``input_ids`` and
            ``attention_mask`` tensors, and the ``label`` as a ``torch.long``
            scalar tensor.
        """
        # A DataLoader asks for positions 0..len-1, so look rows up by
        # position. Label-based `.loc[index]` only works when the frame has an
        # untouched 0..n-1 index and otherwise raises KeyError or returns the
        # wrong row (e.g. after filtering or dropping rows without a reset).
        content = str(self.data['content'].iloc[index])
        label = self.data['label'].iloc[index]

        encoding = self.tokenizer.encode_plus(
            content,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'content': content,
            # flatten() turns the returned tensors into 1-D tensors so the
            # DataLoader can stack them into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [None]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
PRETRAINED_NAME = "ThuyNT03/xlm-roberta-base-VietNam-train"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)
model = AutoModel.from_pretrained(PRETRAINED_NAME)

# NOTE(review): this only overwrites an attribute on the config of the model
# that was already loaded; the classifier head still expects 768-wide encoder
# output. The value is read back later as the head's `input_dim`, so changing
# or removing this line requires changing that call too.
model.config.hidden_size = 80

# Place the encoder on the selected device (GPU or CPU), then echo the override.
model.to(device)
print(model.config.hidden_size)

In [None]:
# Define additional layers for classification
# NOTE(review): this module-level layer is not referenced anywhere in the
# visible notebook; Classifier builds its own `self.reduce_dim_layer`.
# Confirm nothing else uses it before removing.
reduce_dim_layer = nn.Linear(768, 80)
class Classifier(nn.Module):
    """Small feed-forward head applied to a fixed-size embedding.

    Layer chain: ``encoder_dim -> input_dim`` (linear projection), then
    ``input_dim -> hidden_dim`` (linear + ReLU + dropout 0.2), then
    ``hidden_dim -> output_dim`` (logits). There is no activation between the
    projection and ``fc1``.

    Args:
        input_dim: Width of the projected embedding fed into ``fc1``.
        hidden_dim: Width of the hidden layer.
        output_dim: Size of the returned logit vector (number of classes).
        encoder_dim: Width of the incoming embedding. Defaults to 768, the
            value that used to be hard-coded.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, encoder_dim=768):
        super().__init__()
        # The projection must end at `input_dim` because `fc1` starts there.
        # It used to be fixed at Linear(768, 80), which only lined up when the
        # caller happened to pass input_dim=80 and crashed in forward()
        # otherwise.
        self.reduce_dim_layer = nn.Linear(encoder_dim, input_dim)
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        """Map a ``(..., encoder_dim)`` tensor to ``(..., output_dim)`` logits."""
        x = self.reduce_dim_layer(x)
        x = self.dropout(torch.relu(self.fc1(x)))
        x = self.fc2(x)
        return x


In [None]:
# Wrap both frames in tokenizing datasets and batch them.
# Training batches are reshuffled; the test order stays fixed.
train_dataset = CustomDataset(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

test_dataset = CustomDataset(dataframe=test_data, tokenizer=tokenizer, max_len=MAX_LEN)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)


In [None]:
# Build the classification head; its input width is whatever the encoder
# config currently reports.
num_labels = 4
classifier = Classifier(
    model.config.hidden_size,  # input_dim
    64,                        # hidden_dim
    num_labels,                # output_dim
)

# Put the head on the selected device (the notebook echoes the module).
classifier.to(device)

In [None]:
# Cross-entropy over the class logits. Only the classifier head's parameters
# are handed to Adam, so the optimizer never updates the encoder.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    params=classifier.parameters(),
    lr=LEARNING_RATE,
    weight_decay=0.01,
)

In [None]:
import torch
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

# Score every class id in a fixed order. Without an explicit `labels=` list,
# scikit-learn only reports the classes that occur in the data, so the
# per-class arrays could be shorter than `num_labels` and get paired with the
# wrong label number when printed.
label_ids = list(range(num_labels))

# The encoder is only ever called under torch.no_grad() and is not part of the
# optimizer, i.e. it acts as a frozen feature extractor. Make eval mode
# explicit so any dropout inside it stays disabled.
model.eval()

# Training loop
for epoch in range(EPOCHS):
    # ---- Train the classifier head on frozen encoder features ----
    classifier.train()
    train_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        # No gradients through the encoder: only the head is being trained.
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            last_hidden_states = outputs.last_hidden_state[:, 0, :]  # CLS token

        logits = classifier(last_hidden_states)
        loss = loss_fn(logits, labels)

        loss.backward()
        optimizer.step()

        # loss is a batch mean; weight by batch size so the epoch figure is a
        # true per-sample average even when the last batch is smaller.
        train_loss += loss.item() * input_ids.size(0)

    # ---- Evaluate (the "Val" numbers below are computed on test_loader) ----
    classifier.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            last_hidden_states = outputs.last_hidden_state[:, 0, :]

            logits = classifier(last_hidden_states)
            loss = loss_fn(logits, labels)
            val_loss += loss.item() * input_ids.size(0)

            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # ---- Metrics ----
    train_loss /= len(train_data)
    val_loss /= len(test_data)

    # Not printed here; kept as a variable so it can be inspected after the loop.
    report = classification_report(all_labels, all_preds, zero_division=0)

    accuracy = accuracy_score(all_labels, all_preds)

    # zero_division=0 scores a class with no predictions / no samples as 0
    # without emitting a warning every epoch.
    macro_f1 = f1_score(all_labels, all_preds, labels=label_ids, average='macro', zero_division=0)
    precision_per_label = precision_score(all_labels, all_preds, labels=label_ids, average=None, zero_division=0)
    recall_per_label = recall_score(all_labels, all_preds, labels=label_ids, average=None, zero_division=0)
    f1_per_label = f1_score(all_labels, all_preds, labels=label_ids, average=None, zero_division=0)

    print(f'Epoch {epoch + 1}/{EPOCHS}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
    print(f"Accuracy for BERT: {accuracy:.4f}")
    print("Precision, Recall, F1-score for each label for BERT:")
    for label, precision, recall, f1 in zip(label_ids, precision_per_label, recall_per_label, f1_per_label):
        print(f"Label {label}:")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1-score: {f1:.4f}")
    print(f"Macro F1-score for BERT: {macro_f1:.4f}")
    print('------------------------------------------------------------------------')