In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
base_path = "/content/drive/MyDrive/Colab Notebooks/Deep Learning/Sentiment_classification/"
%cd /content/drive/MyDrive/Colab Notebooks/Deep Learning/Sentiment_classification/

/content/drive/MyDrive/Colab Notebooks/Deep Learning/Sentiment_classification


# Install dependencies

In [6]:
!pip install transformers
!pip install vncorenlp
!pip install fairseq
!pip install fastBPE
!pip install sentencepiece




In [None]:
!wget https://public.vinai.io/PhoBERT_base_transformers.tar.gz
!tar -xzvf PhoBERT_base_transformers.tar.gz


--2022-03-18 02:09:00--  https://public.vinai.io/PhoBERT_base_transformers.tar.gz
Resolving public.vinai.io (public.vinai.io)... 108.159.227.65, 108.159.227.34, 108.159.227.61, ...
Connecting to public.vinai.io (public.vinai.io)|108.159.227.65|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 322405979 (307M) [application/x-tar]
Saving to: ‘PhoBERT_base_transformers.tar.gz’


2022-03-18 02:09:03 (179 MB/s) - ‘PhoBERT_base_transformers.tar.gz’ saved [322405979/322405979]

PhoBERT_base_transformers/
PhoBERT_base_transformers/config.json
PhoBERT_base_transformers/bpe.codes
PhoBERT_base_transformers/model.bin
PhoBERT_base_transformers/dict.txt


In [None]:
!mkdir -p vncorenlp/models/wordsegmenter
!wget -q --show-progress https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget -q --show-progress https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget -q --show-progress https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
!mv VnCoreNLP-1.1.1.jar vncorenlp/ 
!mv vi-vocab vncorenlp/models/wordsegmenter/
!mv wordsegmenter.rdr vncorenlp/models/wordsegmenter/




# Import library

In [59]:
import os  
import argparse
import warnings

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F 
from torch.utils.data import TensorDataset, DataLoader
from fairseq.data.encoders.fastbpe import fastBPE
from fairseq.data import Dictionary
from vncorenlp import VnCoreNLP
from transformers import RobertaConfig, BertPreTrainedModel, RobertaModel, AdamW
from transformers import get_linear_schedule_with_warmup, get_constant_schedule
from transformers.modeling_utils import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from tqdm import tqdm

warnings.filterwarnings("ignore")


# Utils

In [8]:
def convert_lines(df, vocab, bpe, max_sequence_length):
    # Initial output matrix
    outputs = np.zeros((len(df), max_sequence_length))
    
    cls_id = 0
    pad_id = 1
    eos_id = 2

    for (idx, text, label, _) in tqdm(df.itertuples(name=None), total=len(df)):
        subwords = bpe.encode('<s> '+ text +' </s>')
        input_ids = vocab.encode_line(subwords, append_eos=False, add_if_not_exist=False).long().tolist()
        if len(input_ids) > max_sequence_length: 
            input_ids = input_ids[:max_sequence_length] 
            input_ids[-1] = eos_id
        else:
            input_ids = input_ids + [pad_id, ] * (max_sequence_length - len(input_ids))

        outputs[idx, :] = np.array(input_ids)
    return outputs

def seed_everything(SEED):
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

def sigmoid(x):
    return 1 / (1 + np.exp(-x))
    

In [9]:
import pickle

def _save_pkl(path, obj):
  with open(path, 'wb') as f:
    pickle.dump(obj, f)

def _load_pkl(path):
  with open(path, 'rb') as f:
    obj = pickle.load(f)
  return obj


# Preprocessing

In [31]:
train_path = os.path.join(base_path, "data/train.csv")
dict_path = os.path.join(base_path, "PhoBERT_base_transformers/dict.txt")
config_path = os.path.join(base_path, "PhoBERT_base_transformers/config.json")
pretrained_path = os.path.join(base_path, "PhoBERT_base_transformers/model.bin")
bpe_codes = os.path.join(base_path, "PhoBERT_base_transformers/bpe.codes")
checkpoint_path = os.path.join(base_path, "models/")
rdrsegmenter_path = os.path.join(base_path, "vncorenlp/VnCoreNLP-1.1.1.jar")


In [32]:
# Khởi tạo Byte Pair Encoding cho PhoBERT
class BPE():
  bpe_codes = bpe_codes

bpe = fastBPE(BPE())
rdrsegmenter = VnCoreNLP(rdrsegmenter_path, annotators="wseg", max_heap_size='-Xmx500m')


In [33]:
# Load the Dictionary
vocab = Dictionary()
vocab.add_from_file(dict_path)


In [34]:
# Input 
text = "Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội. Bà Lan, vợ ông Chúc, cũng làm việc tại đây."

# To perform word (and sentence) segmentation
' '.join([' '.join(sent) for sent in rdrsegmenter.tokenize(text)])
 

'Ông Nguyễn_Khắc_Chúc đang làm_việc tại Đại_học Quốc_gia Hà_Nội . Bà Lan , vợ ông Chúc , cũng làm_việc tại đây .'

In [35]:
tqdm.pandas()
# Load data

train_df = pd.read_csv(train_path)
print("Num samples: {}".format(len(train_df)))



Num samples: 377861


In [36]:
# Count the number of words
train_df['number_of_words'] = train_df['text'].progress_apply(lambda x: len(str(x).strip().split()))
train_df.head()


100%|██████████| 377861/377861 [00:06<00:00, 61818.63it/s]


Unnamed: 0,text,label,number_of_words
0,Xuất hiện trên thị trường khi vừa bước sang gi...,Positive,51
1,Khép lại năm 2014. Nhìn lại chặn đường đã đi q...,Positive,64
2,Bên cạnh Phú Quốc (Kiên Giang) và Bắc Vân Phon...,Positive,43
3,"Vi vu Nha Trang, nghỉ dưỡng tại Biệt thự Vinpe...",Positive,54
4,"Sở hữu nhiều giá trị vượt trội về diện tích, k...",Positive,49


In [37]:
# number of rows with text length < 2
no_text = train_df[train_df['number_of_words'] < 2]
print(len(no_text))
train_df.drop(no_text.index, inplace=True)

# number of rows that text is NaN
null_text = train_df[train_df['text'].isnull()]
print(len(null_text))

# drop these rows
train_df.drop(null_text.index, inplace=True)


46
0
0


In [38]:
train_df

Unnamed: 0,text,label,number_of_words
0,Xuất hiện trên thị trường khi vừa bước sang gi...,Positive,51
1,Khép lại năm 2014. Nhìn lại chặn đường đã đi q...,Positive,64
2,Bên cạnh Phú Quốc (Kiên Giang) và Bắc Vân Phon...,Positive,43
3,"Vi vu Nha Trang, nghỉ dưỡng tại Biệt thự Vinpe...",Positive,54
4,"Sở hữu nhiều giá trị vượt trội về diện tích, k...",Positive,49
...,...,...,...
377856,"Mới đây, Novaland đã ký kết hợp tác với tập đo...",Negative,341
377857,Hà Nội sẽ có thêm 10 cầu vượt sông HồngTrả lời...,Negative,919
377858,Toàn cảnh giá bán 21 dự án chung cư nằm dọc đạ...,Negative,2565
377859,Bốn dự án giao thông ở TP HCM hoàn thành năm 2...,Negative,712


In [39]:
train_df.reset_index(drop=True, inplace=True)

In [40]:
train_df

Unnamed: 0,text,label,number_of_words
0,Xuất hiện trên thị trường khi vừa bước sang gi...,Positive,51
1,Khép lại năm 2014. Nhìn lại chặn đường đã đi q...,Positive,64
2,Bên cạnh Phú Quốc (Kiên Giang) và Bắc Vân Phon...,Positive,43
3,"Vi vu Nha Trang, nghỉ dưỡng tại Biệt thự Vinpe...",Positive,54
4,"Sở hữu nhiều giá trị vượt trội về diện tích, k...",Positive,49
...,...,...,...
377810,"Mới đây, Novaland đã ký kết hợp tác với tập đo...",Negative,341
377811,Hà Nội sẽ có thêm 10 cầu vượt sông HồngTrả lời...,Negative,919
377812,Toàn cảnh giá bán 21 dự án chung cư nằm dọc đạ...,Negative,2565
377813,Bốn dự án giao thông ở TP HCM hoàn thành năm 2...,Negative,712


In [41]:
# Tokenize
train_df["text"] = train_df["text"].progress_apply(lambda x: ' '.join([' '.join(sent) for sent in rdrsegmenter.tokenize(x)]))


100%|██████████| 377815/377815 [45:48<00:00, 137.45it/s]


In [42]:

train_df.to_csv(os.path.join(base_path, "data/train_segment.csv"), index=False, encoding="utf-8")

In [43]:
train_df = pd.read_csv(os.path.join(base_path, "data/train_segment.csv"))

In [44]:
max_sequence_length = 256

X_train = convert_lines(train_df, vocab, bpe, max_sequence_length)
y = train_df["label"].values
lb = LabelEncoder()
y = lb.fit_transform(y)


100%|██████████| 377815/377815 [07:53<00:00, 798.05it/s]


In [45]:
_save_pkl(os.path.join(base_path, "data/X_train_raw.pkl"), X_train)
_save_pkl(os.path.join(base_path, "data/y.pkl"), y)


# Training

In [61]:
X_train = _load_pkl(os.path.join(base_path, "data/X_train_raw.pkl"))
y = _load_pkl(os.path.join(base_path, "data/y.pkl"))


In [62]:
seed = 69
seed_everything(seed)

if torch.cuda.is_available():
    print('Using GPU')
    device = torch.device('cuda')
else:
    print('Using CPU')
    device = torch.device('cpu')


Using GPU


In [63]:
# Define the model
class RobertaVN(BertPreTrainedModel):
   config_class = RobertaConfig
   base_model_prefix = "roberta"
   def __init__(self, config):
       super(RobertaVN, self).__init__(config)
       self.num_labels = config.num_labels
       self.roberta = RobertaModel(config)
       self.qa_outputs = nn.Linear(4*config.hidden_size, self.num_labels)

       self.init_weights()

   def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
                start_positions=None, end_positions=None):

       outputs = self.roberta(input_ids,
                            attention_mask=attention_mask,
#                            token_type_ids=token_type_ids,
                            position_ids=position_ids,
                            head_mask=head_mask)
       cls_output = torch.cat((outputs[2][-1][:,0, ...],outputs[2][-2][:,0, ...], outputs[2][-3][:,0, ...], outputs[2][-4][:,0, ...]),-1)
       logits = self.qa_outputs(cls_output)
       return logits


In [64]:
# Load model
config = RobertaConfig.from_pretrained(
    config_path,
    output_hidden_states=True,
    num_labels=1
)

model = RobertaVN.from_pretrained(pretrained_path, config=config)
# model.eval()
model.to(device)


You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at /content/drive/MyDrive/Colab Notebooks/Deep Learning/Sentiment_classification/PhoBERT_base_transformers/model.bin were not used when initializing RobertaVN: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaVN from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaVN from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaVN wer

RobertaVN(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=0)
      (position_embeddings): Embedding(258, 768, padding_idx=0)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,),

In [65]:
if torch.cuda.device_count():
    print(f"Training using {torch.cuda.device_count()} gpus")
    model = nn.DataParallel(model)
    tsfm = model.module.roberta
else:
    tsfm = model.roberta


Training using 1 gpus


In [66]:
batch_size = 24
epochs = 10
lr = 3e-5
accumulation_steps = 5

# Optimizer and lr schedulers
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
num_train_optimization_steps = int(epochs * len(train_df) / batch_size / accumulation_steps)
optimizer = AdamW(optimizer_grouped_parameters, lr=lr, correct_bias=False)
# Warmup 100 step before start learning
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_train_optimization_steps)
scheduler0 = get_constant_schedule(optimizer)


In [67]:
# Checkpoint path 
if not os.path.exists(checkpoint_path):
    os.mkdir(checkpoint_path)
    

In [None]:
fold = 0

splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=32).split(X_train, y))
for fold, (train_idx, val_idx) in enumerate(splits):
    print("Training for fold {}".format(fold))
    best_score = 0
    if fold != fold:
        continue
    
    train_dataset = TensorDataset(torch.tensor(X_train[train_idx], dtype=torch.long), torch.tensor(y[train_idx], dtype=torch.long))
    valid_dataset = TensorDataset(torch.tensor(X_train[val_idx], dtype=torch.long), torch.tensor(y[val_idx], dtype=torch.long))
    for child in tsfm.children():
        for param in child.parameters():
            if not param.requires_grad:
                pass
            param.requires_grad = False
    
    frozen = True
    for epoch in tqdm(range(epochs + 1)):
        if epoch > 0 and frozen:
            for child in tsfm.children():
                for param in child.parameters():
                    param.requires_grad = True
            frozen = False
            del scheduler0
            torch.cuda.empty_cache()
        
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
        avg_loss = 0.
        train_preds = None
        optimizer.zero_grad()
        with tqdm(enumerate(train_loader), total=len(train_loader), unit="batch") as process_bar:
            for step, (x_batch, y_batch) in process_bar:
                process_bar.set_description(f"Epoch {epoch + 1}")
                model.train()
                
                y_pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device))
                predictions = y_pred.squeeze().detach().cpu().numpy()
                predictions = np.atleast_1d(predictions)
                train_preds = predictions if train_preds is None else np.concatenate([train_preds, predictions])
                predictions = sigmoid(predictions)
                accuracy = accuracy_score(y_batch, predictions > 0.5)
                loss = F.binary_cross_entropy_with_logits(y_pred.view(-1).to(device), y_batch.float().to(device))
                loss = loss.mean()
                loss.backward()

                if step % accumulation_steps == 0 or step == len(train_loader) - 1:
                    optimizer.step()
                    optimizer.zero_grad()
                    if not frozen:
                        scheduler.step()
                    else:
                        scheduler0.step()
                process_bar.set_postfix(loss=loss.item(), accuracy=round(accuracy * 100, 2))
                avg_loss += loss.item() / len(train_loader)
        train_score = f1_score(y[train_idx], train_preds > 0.5)
        train_accuracy = accuracy_score(y[train_idx], train_preds > 0.5)
        print(f"\nAverage loss {avg_loss},  F1 score @0.5 = {train_score:.4f}, Accuracy = {round(train_accuracy * 100, 2)}")

        model.eval()
        val_preds = None
        process_bar = tqdm(enumerate(valid_loader), total=len(valid_loader))
        for step, (x_batch, y_batch) in process_bar:
            with torch.no_grad():
                y_pred = model(x_batch.to(device), attention_mask=(x_batch > 0).to(device))
                y_pred = y_pred.squeeze().detach().cpu().numpy()
                val_preds = np.atleast_1d(y_pred) if val_preds is None else np.concatenate([val_preds, np.atleast_1d(y_pred)])
        val_preds = sigmoid(val_preds)
        best_th = 0

        score = f1_score(y[val_idx], val_preds > 0.5)
        val_accuracy = accuracy_score(y[val_idx], val_preds > 0.5)
        print(f"\nAUC = {roc_auc_score(y[val_idx], val_preds):.4f}, F1 score @0.5 = {score:.4f}, Accuracy = {round(val_accuracy * 100, 2)}")
        if score >= best_score:
            torch.save(model.state_dict(),os.path.join(checkpoint_path, f"model_{fold}.bin"))
            best_score = score


Training for fold 0


  0%|          | 0/11 [00:00<?, ?it/s]
  0%|          | 0/12594 [00:00<?, ?batch/s][A
Epoch 0:   0%|          | 0/12594 [00:00<?, ?batch/s][A
Epoch 0:   0%|          | 0/12594 [00:00<?, ?batch/s, accuracy=45.8, loss=0.772][A
Epoch 0:   0%|          | 1/12594 [00:00<3:05:10,  1.13batch/s, accuracy=45.8, loss=0.772][A
Epoch 0:   0%|          | 1/12594 [00:00<3:05:10,  1.13batch/s, accuracy=45.8, loss=0.772][A
Epoch 0:   0%|          | 1/12594 [00:01<3:05:10,  1.13batch/s, accuracy=33.3, loss=0.817][A
Epoch 0:   0%|          | 2/12594 [00:01<2:55:49,  1.19batch/s, accuracy=33.3, loss=0.817][A
Epoch 0:   0%|          | 2/12594 [00:01<2:55:49,  1.19batch/s, accuracy=33.3, loss=0.817][A
Epoch 0:   0%|          | 2/12594 [00:02<2:55:49,  1.19batch/s, accuracy=37.5, loss=0.756][A
Epoch 0:   0%|          | 3/12594 [00:02<2:53:12,  1.21batch/s, accuracy=37.5, loss=0.756][A
Epoch 0:   0%|          | 3/12594 [00:02<2:53:12,  1.21batch/s, accuracy=37.5, loss=0.756][A
Epoch 0:   0%|      