In [34]:
# import torch
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path

from gensim.utils import simple_preprocess
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

from transformers import get_linear_schedule_with_warmup, AutoTokenizer, AutoModel, logging

import warnings
warnings.filterwarnings("ignore")

logging.set_verbosity_error()
# read data ignore last col
import py_vncorenlp
import os
def seed_everything(seed_value):
    np.random.seed(seed_value)
    # torch.manual_seed(seed_value)
    
    # if torch.cuda.is_available(): 
    #     torch.cuda.manual_seed(seed_value)
    #     torch.cuda.manual_seed_all(seed_value)
    #     torch.backends.cudnn.deterministic = True
    #     torch.backends.cudnn.benchmark = True



if 'pwd' not in locals():
    from transformers import PhobertTokenizer, AutoModel
    from transformers import DataCollatorForTokenClassification
    pwd = Path(os.getcwd())
    save_dir = pwd / 'models'
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=str(save_dir))
    os.chdir(pwd)
    seed_everything(25)

# device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
EPOCHS = 6
N_SPLITS = 5

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

data = pd.read_csv('data - data.csv', usecols=range(0, 3))
data['wseg'] = data['comment'].apply(lambda x : " ".join(simple_preprocess( " ".join(rdrsegmenter.word_segment(x)))))
display(data.sample(10))


Unnamed: 0,comment,label,rate,wseg
11319,"Thanks shop, nếu cần nữa sẽ ủng hộ shop tiếp.",POS,4,thanks shop nếu cần nữa sẽ ủng_hộ shop tiếp
3317,Không đẹp giống như hình ảnh.,NEG,1,không đẹp giống như hình_ảnh
11342,Chúc shop buôn may bán đắt <3.,POS,5,chúc shop buôn may bán đắt
849,Như củ.,NEG,2,như củ
18922,Giày xinh cực luôn 😘😘😘.,POS,4,giày xinh cực luôn
17666,Áo đẹp xuất sắc luôn.,POS,5,áo đẹp xuất_sắc luôn
12515,Túi quần bị rách.,NEG,2,túi quần bị rách
3021,hàng đẹp giống y hình.,POS,4,hàng đẹp giống hình
2549,Giá hơp lý!,POS,4,giá hơp lý
20868,Đóng gói không đẹp hộp bị rách may không thủng...,NEG,2,đóng_gói không đẹp hộp bị rách may không thủng...


In [7]:
vectorizer = TfidfVectorizer(tokenizer = lambda x : x.split(' '))
corpus  = data['wseg'].tolist()
X = vectorizer.fit_transform(corpus)

display(X.shape)

(31460, 6921)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing



Y = data['label'].tolist()
le = preprocessing.LabelEncoder()
le.fit(Y)
Y = le.transform(Y)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=25)

In [12]:
# using SVM for classification
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.7817863954227591

In [32]:
y_pred = svm.predict(X_test)
print(classification_report(y_test, y_pred,  target_names=['POS','NEU','NEG']))

              precision    recall  f1-score   support

         POS       0.68      0.77      0.72      1328
         NEU       0.48      0.16      0.24       979
         NEG       0.84      0.94      0.89      3985

    accuracy                           0.78      6292
   macro avg       0.67      0.62      0.61      6292
weighted avg       0.75      0.78      0.75      6292



In [33]:

from sklearn.model_selection import GridSearchCV
  
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']} 
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
  
# fitting the model for grid search
grid.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.713 total time=  37.7s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.730 total time=  37.7s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.724 total time=  37.6s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.721 total time=  37.8s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.719 total time=  38.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.670 total time=  38.1s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.676 total time=  37.8s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.674 total time=  38.1s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.672 total time=  38.3s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.668 total time=  38.3s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.640 total time=  40.7s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

In [36]:
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred,  target_names=['POS','NEU','NEG']))
print(grid.best_params_)

              precision    recall  f1-score   support

         POS       0.68      0.77      0.72      1328
         NEU       0.48      0.16      0.24       979
         NEG       0.84      0.94      0.89      3985

    accuracy                           0.78      6292
   macro avg       0.67      0.62      0.61      6292
weighted avg       0.75      0.78      0.75      6292



{'C': 1, 'gamma': 1, 'kernel': 'rbf'}