In [1]:
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

from gensim.utils import simple_preprocess
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

from transformers import get_linear_schedule_with_warmup, AutoTokenizer, AutoModel, logging

import warnings
warnings.filterwarnings("ignore")

logging.set_verbosity_error()
# read data ignore last col
import py_vncorenlp
import os
def seed_everything(seed_value):
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True



if 'pwd' not in locals():
    from transformers import PhobertTokenizer, AutoModel
    from transformers import DataCollatorForTokenClassification
    pwd = Path(os.getcwd())
    save_dir = pwd / 'models'
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=str(save_dir))
    os.chdir(pwd)
    seed_everything(25)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
EPOCHS = 6
N_SPLITS = 5

  from .autonotebook import tqdm as notebook_tqdm


2023-08-15 14:44:41 INFO  WordSegmenter:24 - Loading Word Segmentation model


In [2]:
data = pd.read_csv('data - data.csv', usecols=range(0, 3))
data['wseg'] = data['comment'].apply(lambda x : " ".join(simple_preprocess( " ".join(rdrsegmenter.word_segment(x)))))
display(data.sample(10))


Unnamed: 0,comment,label,rate,wseg
6452,Shop phục vụ tốt.,POS,4,shop phục_vụ tốt
26367,"Bé nhỏ cũng có thể dắt xe, nhấc xe lên để chơi.",NEU,3,bé_nhỏ cũng có_thể dắt xe nhấc xe lên để chơi
29602,Áo không giống hình sản phẩm lỗi lừa đảo.,NEG,1,áo không giống hình sản_phẩm lỗi lừa_đảo
19747,"ba lô phù hợp vs giá tiên, như hình.",NEU,3,ba_lô phù_hợp vs giá tiên như hình
1432,Rất ok mua hoài vẫn ok.,POS,5,rất ok mua hoài vẫn ok
445,5 sao nha!,POS,5,sao nha
19210,Đóng gói sản phẩm rất đẹp và chắc chắn.,POS,5,đóng_gói sản_phẩm rất đẹp và chắc_chắn
12179,Ưng quá.,POS,5,ưng quá
11598,Thời gian giao hàng rất nhanh.Hàng ok.,POS,4,thời_gian giao hàng rất nhanh hàng ok
8113,K biết cái này tác dụng thơm ở đâu.,NEG,1,biết cái này tác_dụng thơm đâu


In [31]:
from transformers import AutoModel
from CustomDataset import SentimentDataset
bert = AutoModel.from_pretrained("vinai/phobert-base-v2")
tokenizer = PhobertTokenizer.from_pretrained("vinai/phobert-base-v2")
batch_size = 128
dataset = SentimentDataset(data, tokenizer, max_len=256)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=2)
bert.to(device)
all_feats = []
with torch.no_grad():
    for batch in tqdm(dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_masks'].to(device)
        output = bert(input_ids, attention_mask)
        all_feats.append(output[-1])
        


initialize


100%|██████████| 246/246 [14:20<00:00,  3.50s/it]


In [3]:
all_feats = torch.load('all_feats.pt')
X = np.concatenate([_.cpu().numpy() for _ in all_feats], axis=0)
X.shape

(31460, 768)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


Y = data['label'].tolist()
le = preprocessing.LabelEncoder()
le.fit(Y)
Y = le.transform(Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=25)
X_train.shape

(25168, 768)

In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

  
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100,], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['sigmoid', 'rbf', 'linear', 'poly']} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, cv = N_SPLITS, scoring = 'f1_macro')
  
# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits


[CV 1/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.260 total time= 3.4min
[CV 2/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.260 total time= 3.3min
[CV 3/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.260 total time= 3.3min
[CV 4/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.260 total time= 3.3min
[CV 5/5] END ....C=0.1, gamma=1, kernel=sigmoid;, score=0.260 total time= 3.3min
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.272 total time=10.9min
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.276 total time=11.0min
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.277 total time=11.1min
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.272 total time=11.1min
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.271 total time=11.1min
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.583 total time= 2.4min
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.597 total time= 2.4min
