In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree lazily and echo the full path of every file found.
input_paths = (
    os.path.join(folder, entry)
    for folder, _, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# IPython line magic: make /kaggle/input the working directory for the cells that follow.
# NOTE(review): the Kaggle header comment describes the input directory as read-only,
# so relative-path writes after this point would presumably fail — confirm.
%cd /kaggle/input


In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer

In [None]:
# Hub identifier shared by the encoder and its tokenizer.
PHOBERT_CHECKPOINT = "vinai/phobert-base"

# Pretrained PhoBERT encoder; used further down to embed each text.
phobert = AutoModel.from_pretrained(PHOBERT_CHECKPOINT)

# For transformers v4.x+: explicitly request the slow (non-fast) tokenizer.
tokenizer = AutoTokenizer.from_pretrained(PHOBERT_CHECKPOINT, use_fast=False)

In [None]:
# The three cleaned splits sit side by side in one dataset folder.
CSV_DIR = '/kaggle/input/vietnamese-text-classification/vn_text_classification_data/csv'

df_train = pd.read_csv(f'{CSV_DIR}/train_clean.csv')
df_dev = pd.read_csv(f'{CSV_DIR}/dev_clean.csv')
df_test = pd.read_csv(f'{CSV_DIR}/test_clean.csv')

In [None]:
# Pull the 'clean_text' column out of each split as a raw array, in train/dev/test order.
text_train, text_dev, text_test = (
    frame['clean_text'].values for frame in (df_train, df_dev, df_test)
)

In [None]:
# Zero-filled placeholders, one 768-wide row per text in each split.
# They are replaced wholesale once encode_text() has been run on the splits.
features_train, features_dev, features_test = (
    np.zeros((len(texts), 768)) for texts in (text_train, text_dev, text_test)
)

In [None]:
def encode_text(text_l):
    """Embed every text in ``text_l`` with the module-level PhoBERT model.

    Each text is tokenized with the module-level ``tokenizer``, truncated to at
    most 256 token ids, and passed through ``phobert`` without gradient
    tracking. The second element of the model output is stored as that text's
    feature vector.

    Args:
        text_l: Sized, iterable collection of strings to encode.

    Returns:
        A float numpy array of shape ``(len(text_l), hidden_size)`` where
        ``hidden_size`` is read from ``phobert.config`` (768 for the base
        checkpoint loaded in this notebook).
    """
    max_length = 256
    total = len(text_l)
    features = np.zeros((total, phobert.config.hidden_size))

    for i, text in enumerate(text_l):
        # Truncate inside the tokenizer rather than slicing the ids afterwards:
        # slicing `[:, :256]` chops the closing special token off long inputs,
        # whereas tokenizer-side truncation keeps the sequence well-formed.
        token_ids = tokenizer.encode(text, truncation=True, max_length=max_length)
        input_ids = torch.tensor([token_ids])
        print(f'Step {i + 1}/{total}')
        with torch.no_grad():
            vec = phobert(input_ids)[1]              # Models outputs are now tuples
        # vec has a leading batch axis of size 1; drop it explicitly before
        # storing the row instead of relying on implicit tensor->array coercion.
        features[i] = vec.squeeze(0).cpu().numpy()
    return features

In [None]:
# Embed each split in turn. The calls are kept as separate statements so a
# failure on a later split still leaves the earlier results assigned.
features_train = encode_text(text_l=text_train)
features_dev = encode_text(text_l=text_dev)
features_test = encode_text(text_l=text_test)

In [None]:
## encode y
# Raw label column of every split.
y_train, y_dev, y_test = (
    frame["label"].values for frame in (df_train, df_dev, df_test)
)

# index -> label, following the order np.unique returns for the training labels,
# and the reverse label -> index lookup.
dic_y_mapping = dict(enumerate(np.unique(y_train)))
inverse_dic = {label: index for index, label in dic_y_mapping.items()}

# Swap every label for its integer index; a label that never occurs in the
# training split raises KeyError here.
y_train, y_dev, y_test = (
    np.array([inverse_dic[label] for label in labels])
    for labels in (y_train, y_dev, y_test)
)

In [None]:
# Display the index -> label lookup built from the unique training labels.
dic_y_mapping

In [None]:
# SVM Classifier
from sklearn import svm
from sklearn.metrics import classification_report
# Both baseline SVMs use the same regularisation constant.
SVM_C = 4
linear_svc = svm.LinearSVC(C=SVM_C, penalty='l2')
svc = svm.SVC(C=SVM_C)  # kernel left at the library default

In [None]:
# Fit the linear SVM on the training features, then predict the dev split.
y_true_linear = y_dev
y_pred_linear = linear_svc.fit(features_train, y_train).predict(features_dev)

In [None]:
# Dev-split classification report for the linear SVM.
report_linear = classification_report(y_true=y_true_linear, y_pred=y_pred_linear)
print(report_linear)

In [None]:
# Fit the default-kernel SVC on the training features, then predict the dev split.
y_true_svc = y_dev
y_pred_svc = svc.fit(features_train, y_train).predict(features_dev)

In [None]:
# Dev-split classification report for the default-kernel SVC.
report_svc = classification_report(y_true=y_true_svc, y_pred=y_pred_svc)
print(report_svc)

In [None]:
# Polynomial-kernel SVC: build and fit in one step (fit returns the estimator),
# then predict the dev split.
svc_poly = svm.SVC(kernel='poly', C=4).fit(features_train, y_train)
y_true_svc_poly = y_dev
y_pred_svc_poly = svc_poly.predict(features_dev)

In [None]:
# Dev-split classification report for the polynomial-kernel SVC.
report_svc_poly = classification_report(y_true=y_true_svc_poly, y_pred=y_pred_svc_poly)
print(report_svc_poly)

In [None]:
# Sigmoid-kernel SVC: build and fit in one step (fit returns the estimator),
# then predict the dev split.
svc_sigmoid = svm.SVC(kernel='sigmoid', C=4).fit(features_train, y_train)
y_true_svc_sigmoid = y_dev
y_pred_svc_sigmoid = svc_sigmoid.predict(features_dev)

In [None]:
# Dev-split classification report for the sigmoid-kernel SVC.
report_svc_sigmoid = classification_report(y_true=y_true_svc_sigmoid, y_pred=y_pred_svc_sigmoid)
print(report_svc_sigmoid)

In [None]:
from sklearn.metrics import f1_score
# (display name, dev targets, dev predictions) for every SVM variant, in print order.
kernel_results = (
    ('Linear', y_true_linear, y_pred_linear),
    ('RBF', y_true_svc, y_pred_svc),
    ('Poly', y_true_svc_poly, y_pred_svc_poly),
    ('Sigmoid', y_true_svc_sigmoid, y_pred_svc_sigmoid),
)

# One section per averaging mode; each section lists all four kernels and is
# followed by the same blank-line separator.
for section_title, averaging in (('Micro F1', 'micro'),
                                 ('Macro F1', 'macro'),
                                 ('Weighted F1', 'weighted')):
    print(section_title)
    for kernel_name, dev_true, dev_pred in kernel_results:
        print(f"{kernel_name} Kernel: {f1_score(dev_true, dev_pred, average=averaging)}")
    print('\n')

In [None]:
class TextDataset(Dataset):
    """Map-style dataset pairing one feature row with one integer label."""

    def __init__(self, X, Y):
        """Keep references to the indexable features ``X`` and labels ``Y``."""
        self.X, self.y = X, Y

    def __len__(self):
        """Number of samples, taken from the label container."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for ``idx`` as float and long tensors."""
        sample = torch.tensor(self.X[idx], dtype=torch.float)
        target = torch.tensor(self.y[idx], dtype=torch.long)
        return sample, target

In [None]:
# Wrap each split's (features, labels) pair in a dataset.
train_ds, dev_ds, test_ds = (
    TextDataset(split_features, split_labels)
    for split_features, split_labels in (
        (features_train, y_train),
        (features_dev, y_dev),
        (features_test, y_test),
    )
)

# Same batch size everywhere; only the training loader shuffles.
BATCH_SIZE = 128
train_loader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
dev_loader = DataLoader(dev_ds, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
class LinearClassifier(nn.Module):
    """Two stacked linear layers that map a feature vector to class scores.

    ``forward`` returns raw, unnormalised scores (logits). That is the input
    ``nn.CrossEntropyLoss`` expects: the loss applies log-softmax itself, so
    applying softmax inside ``forward`` as well would normalise twice and
    flatten the gradients. Arg-max predictions are unaffected by the change
    because softmax preserves ordering. Use ``predict_proba`` when normalised
    probabilities are needed.

    Note: there is no non-linearity between ``fc1`` and ``fc2``, so the model
    is a linear map overall; this is kept as in the original architecture.
    """

    def __init__(self, in_features, out_features):
        """Create the layers.

        Args:
            in_features: Size of each input feature vector.
            out_features: Number of output classes.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features, 128)
        self.fc2 = nn.Linear(128, out_features)
        # Explicit dim: the implicit-dimension form of nn.Softmax is deprecated.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return class logits with the same leading dimensions as ``x``."""
        hidden = self.fc1(x)
        return self.fc2(hidden)

    def predict_proba(self, x):
        """Return class probabilities (softmax over the last dimension)."""
        return self.softmax(self.forward(x))

In [None]:
def save_checkpoint(save_path, model, optimizer, valid_loss):
    """Write the model and optimizer state plus a validation loss to disk.

    Args:
        save_path: Destination passed to ``torch.save``. When ``None`` the
            function returns immediately without saving.
        model: Object exposing ``state_dict()`` (e.g. an ``nn.Module``).
        optimizer: Object exposing ``state_dict()`` (e.g. a torch optimizer).
        valid_loss: Validation loss value stored alongside the weights.

    Returns:
        None. The saved object is a dict with the keys ``'model_state_dict'``,
        ``'optimizer_state_dict'`` and ``'valid_loss'``.
    """
    # Identity check is the idiomatic None test and cannot be fooled by a
    # custom __eq__ on path-like objects.
    if save_path is None:
        return

    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')

In [None]:
# Training Function
# Default to the CPU and switch to the first CUDA device when one is available.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"
print(f'Device {device}')

def train(model,
          optimizer,
          criterion = nn.CrossEntropyLoss(),
          train_loader = train_loader,
          dev_loader = dev_loader,
          num_epochs = 100,
          eval_every = len(train_ds) // 2,
          file_path = "",
          best_dev_loss = float("Inf")):
    """Train ``model`` and checkpoint it whenever the dev loss improves.

    Every epoch runs one full pass over ``train_loader`` followed by one full
    evaluation pass over ``dev_loader``. Losses are reported as the mean over
    the batches of the pass, so train and dev values are comparable even
    though the two loaders yield different numbers of batches.

    Args:
        model: Module to optimise; called as ``model(features)``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss called as ``criterion(output, labels)``.
        train_loader: Iterable of ``(features, labels)`` training batches.
            The default is the module-level loader as it existed when this
            function was defined.
        dev_loader: Iterable of ``(features, labels)`` dev batches; same
            default-binding caveat as ``train_loader``.
        num_epochs: Number of epochs to run.
        eval_every: Accepted for backward compatibility; not used, evaluation
            always happens once per epoch.
        file_path: Prefix of the checkpoint location. The checkpoint is
            written to ``file_path + '/model_bert.pt'``, so the default ``""``
            resolves to ``/model_bert.pt`` at the filesystem root.
        best_dev_loss: Dev loss an epoch must beat to trigger a checkpoint.

    Returns:
        None. Progress is printed and checkpoints are written through
        ``save_checkpoint``.
    """
    for epoch in range(num_epochs):
        print(f'Training epochs {epoch + 1}/{num_epochs}')

        # ---- training pass ----
        model.train()
        running_loss = 0.0
        train_batches = 0
        for features, labels in train_loader:
            features = features.to(device)
            labels = labels.to(device)

            output = model(features)
            loss = criterion(output, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            train_batches += 1

        # ---- dev pass ----
        model.eval()
        dev_running_loss = 0.0
        dev_batches = 0
        correct, total = 0, 0
        with torch.no_grad():
            # dev evaluation loop
            for features, labels in dev_loader:
                features = features.to(device)
                labels = labels.to(device)
                output = model(features)

                dev_running_loss += criterion(output, labels).item()
                dev_batches += 1

                predicted = output.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        # An empty dev loader would otherwise divide by zero here.
        if total:
            print('Accuracy: %d %%' % (100 * correct / total))
        else:
            print('Accuracy: n/a (empty dev set)')

        # Mean loss per batch. Batches are counted by hand so loaders without
        # __len__ also work. With no dev batches the loss is NaN, which never
        # compares as an improvement and therefore never triggers a checkpoint.
        average_train_loss = running_loss / train_batches if train_batches else float("nan")
        average_dev_loss = dev_running_loss / dev_batches if dev_batches else float("nan")

        # print progress
        print(f'Epoch [{epoch + 1}/{num_epochs}] Train Loss: {average_train_loss}, dev Loss: {average_dev_loss}')

        # checkpoint
        if best_dev_loss > average_dev_loss:
            best_dev_loss = average_dev_loss
            save_checkpoint(file_path + '/model_bert.pt', model, optimizer, best_dev_loss)

    print('Finished Training!')

In [None]:
# 768-wide input features, 27 output classes; the model lives on the selected device.
model = LinearClassifier(in_features=768, out_features=27)
model = model.to(device)

# Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

train(model=model, optimizer=optimizer, num_epochs=1000)