In [None]:
!unzip student_feedback*.zip

Archive:  student_feedback.zip
  inflating: synthetic_train.csv     
  inflating: synthetic_val.csv       


In [None]:
import pandas as pd
import torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader
from nltk.tokenize import sent_tokenize, word_tokenize
import csv
import numpy as np
from gensim.utils import simple_preprocess


In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [None]:
from transformers import get_linear_schedule_with_warmup, AutoTokenizer, AutoModel, logging
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.13M [00:00<?, ?B/s]

In [None]:
class CustomDataset(Dataset):

    def __init__(self, data_dir, max_length = 256, transform=None, target_transform=None):
        super(CustomDataset, self).__init__()

        self.data_dir = data_dir
        self.transform = transform
        self.target_transform = target_transform
        self.max_length = max_length
        texts, labels = [], []
        with open(data_dir) as csv_file:
            reader = csv.reader(csv_file, quotechar='"')
            for idx, line in enumerate(reader):
                if idx == 0:
                    continue
                text = line[0]
                text = ' '.join(simple_preprocess(text))
                label = line[1]
                if (label == 'negative'):
                    labels.append(0)
                if (label == 'neutral'):
                    labels.append(1)
                if (label == 'positive'):
                    labels.append(2)
                texts.append(text)

        self.label_list = labels
        self.text_list = texts

    def __len__(self):
        return len(self.label_list)

    def __getitem__(self, index):
        label = self.label_list[index]
        if self.target_transform:
            label = self.target_transform(label)
        text = self.text_list[index]
        encoding = tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_masks': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(label, dtype=torch.long)
        }

In [17]:
from torchtext.functional import to_tensor
from torchvision.transforms import ToTensor, Lambda

n_classes = 3
training_data = CustomDataset(data_dir="/content/synthetic_train.csv")
test_data = CustomDataset(data_dir="/content/synthetic_val.csv")
print(training_data.__getitem__(4))

train_dataloader = DataLoader(training_data, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=16, shuffle=True)

print (len(train_dataloader.dataset))

{'text': 'tôi nghĩ rằng chương trình đào tạo có thể có thêm các môn học về lịch sử và văn hóa để sinh viên hiểu rõ hơn về đất nước và con người việt nam', 'input_ids': tensor([    0,    70,   487,    87,  5887,  1893,  1717,   199,    10,  4623,
           10,   143,     9,  1002,   222,    28,  2546,  5717,     6,  2958,
         1340, 11095,  1517,    24,   418,  1430,   563,   297,    48,    28,
          179,    58,     6,    73,    18, 14053,   542,     2,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,   

In [None]:
class SentimentClassifier(nn.Module):
    def __init__(self):
        super(SentimentClassifier, self).__init__()
        self.bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.fc = nn.Linear(self.bert.config.hidden_size, n_classes)
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask):
        last_hidden_state, output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )

        x = self.fc(output)
        return x

In [18]:
import time
from tqdm import tqdm

def train(model, loss_fn, optimizer, train_loader, epoch):
    model.zero_grad()
    model.train()
    losses = []
    correct = 0

    for data in tqdm(train_loader, f'Epoch: {epoch}'):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_masks'].to(device)
        targets = data['targets'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        loss = loss_fn(outputs, targets)
        _, pred = torch.max(outputs, dim=1)

        correct += torch.sum(pred == targets)
        losses.append(loss.item())
        loss.backward()

        optimizer.step()

    print(f'Train Accuracy: {correct.double()/len(train_loader.dataset)} Loss: {np.mean(losses)}')

def test(model, loss_fn, test_loader):
    model.eval()
    losses = []
    correct = 0

    with torch.no_grad():
        for data in test_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_masks'].to(device)
            targets = data['targets'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            _, pred = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)
            correct += torch.sum(pred == targets)
            losses.append(loss.item())
            time.sleep(0.001)

    print(f'Test Accuracy: {correct.double()/len(test_loader.dataset)} Loss: {np.mean(losses)}')
    return correct.double()/len(test_loader.dataset)

In [19]:
model = SentimentClassifier()
lr = 2e-5
optimizer = torch.optim.Adam(model.parameters(),lr=lr)
loss_fn = torch.nn.CrossEntropyLoss()
model.to(device)

n_epoches = 4

for epoch in range(n_epoches):
    torch.compile(train(model, loss_fn, optimizer, train_dataloader, epoch+1))
    test(model, loss_fn, test_dataloader)
    print()

torch.save(model.state_dict(), 'model.pth')

Epoch: 1: 100%|██████████| 509/509 [05:35<00:00,  1.52it/s]


Train Accuracy: 0.7678045186640471 Loss: 0.5124970462268369
Test Accuracy: 0.8384086444007858 Loss: 0.3751712731900625



Epoch: 2: 100%|██████████| 509/509 [05:34<00:00,  1.52it/s]


Train Accuracy: 0.8325147347740668 Loss: 0.37365397292762
Test Accuracy: 0.8403732809430254 Loss: 0.3687159458058886



Epoch: 3: 100%|██████████| 509/509 [05:34<00:00,  1.52it/s]


Train Accuracy: 0.8665275049115913 Loss: 0.2945516376677454
Test Accuracy: 0.8639489194499017 Loss: 0.3152506168698892



Epoch: 4: 100%|██████████| 509/509 [05:34<00:00,  1.52it/s]


Train Accuracy: 0.8967337917485264 Loss: 0.24339074854444426
Test Accuracy: 0.8811394891944989 Loss: 0.29176831418590154



In [23]:
model = SentimentClassifier()
model.load_state_dict(torch.load('model.pth'))
model.eval()


SentimentClassifier(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La