In [None]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [41]:
col_list = ["content", "label"]
df = pd.read_csv("data/data.csv", usecols=col_list)

df.head()

Unnamed: 0,content,label
0,Áo bao đẹp ạ!,POS
1,Tuyệt vời,POS
2,2day ao khong giong trong,NEG
3,"Mùi thơm,bôi lên da mềm da",POS
4,"Vải đẹp, dày dặn",POS


In [42]:
df['label'] = df['label'].replace(['POS'], '2')
df['label'] = df['label'].replace(['NEU'], '1')
df['label'] = df['label'].replace(['NEG'], '0')

df.head()

Unnamed: 0,content,label
0,Áo bao đẹp ạ!,2
1,Tuyệt vời,2
2,2day ao khong giong trong,0
3,"Mùi thơm,bôi lên da mềm da",2
4,"Vải đẹp, dày dặn",2


In [43]:
df = df.dropna()
df = df.drop_duplicates()

In [44]:
from data.preprocess import pipeline_preprocess

df["content"] = df['content'].fillna('').apply(pipeline_preprocess)

df.head()


Unnamed: 0,content,label
0,áo_bao đẹp,2
1,tuyệt_vời,2
2,2 day ao khong giong,0
3,mùi_thơm bôi lên_da mềm da,2
4,vải đẹp dày_dặn,2


In [45]:
len(df)

26810

In [46]:
train, test = train_test_split(df, test_size=0.2, shuffle=True)

In [47]:
train.to_csv("data/preprocess/sentiment_train.csv", index=False)
test.to_csv("data/preprocess/sentiment_test.csv", index=False)

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("data/preprocess/sentiment_train.csv")
df = df.dropna()

df.head()

Unnamed: 0,content,label
0,mong ad điều_chỉnh,0
1,chất_lượng sản_phẩm tuyệt_vời,2
2,nhưq mìh áo đen logo trắq giao áo trắq logo đen,1
3,chất_lượng giấy mềm_mại,2
4,sp giá tiền,2


In [3]:
target_count = df.label.value_counts()
target_count

2    12778
0     5071
1     3466
Name: label, dtype: int64

In [4]:
X_train = df['content'].values
y_train = df['label'].values

X_train = X_train.astype("str")
y_train = y_train.astype('int')

In [5]:
count_vector = CountVectorizer()
model_rf_preprocess = Pipeline([('vect', count_vector),
                    ('tfidf', TfidfTransformer()),
                    ])
X_train_CV = model_rf_preprocess.fit_transform(X_train)

In [6]:
X_train_CV.shape

(21315, 6331)

In [7]:
df_test = pd.read_csv("data/preprocess/sentiment_test.csv")
df_test = df_test.dropna()
X_test = df_test['content'].values
y_test = df_test['label'].values

X_test = X_test.astype("str")
y_test = y_test.astype('int')

X_test_CV = model_rf_preprocess.transform(X_test)
X_test_CV.shape

(5325, 6331)

In [9]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from collections import Counter

# smote = SMOTE()
smote = SMOTETomek()
X_train_CV, y_train = smote.fit_resample(X_train_CV, y_train)

print('Resampled dataset shape %s' % Counter(y_train))

Resampled dataset shape Counter({2: 12703, 1: 12686, 0: 12671})


In [8]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

rus = RandomUnderSampler()
X_test_CV, y_test = rus.fit_resample(X_test_CV, y_test)

print('Resampled dataset shape %s' % Counter(y_test))

Resampled dataset shape Counter({0: 875, 1: 875, 2: 875})


## Decision Tree

In [None]:
from ensemble.decisiontree_cart import DecisionTreeClassifier

dt = DecisionTreeClassifier(max_depth=5)
dt.fit(X_train_CV.toarray(), y_train)

print("Training complete")

predict = dt.predict(X_test_CV.toarray())

print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))


In [20]:
from sklearn.tree import DecisionTreeClassifier
# param_grid = {"max_depth": [7, None]}
# dt = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring="accuracy", refit = True, verbose = 10, n_jobs=2, cv=4)

dt = DecisionTreeClassifier()
dt.fit(X_train_CV.toarray(), y_train)

print("Training complete")

predict = dt.predict(X_test_CV.toarray())

print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

Training complete
0.5436190476190477
[[511 167 197]
 [267 242 366]
 [ 78 123 674]]
              precision    recall  f1-score   support

           0       0.60      0.58      0.59       875
           1       0.45      0.28      0.34       875
           2       0.54      0.77      0.64       875

    accuracy                           0.54      2625
   macro avg       0.53      0.54      0.52      2625
weighted avg       0.53      0.54      0.52      2625



In [None]:
from ensemble.randomforest import RandomForest


rf = RandomForest(n_trees=100, max_depth=6, min_samples_split=2, n_feats=150)
rf.fit(X_train_CV.toarray(), y_train)

predict = rf.predict(X_test_CV.toarray())

print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))

## Random Forest

In [17]:
from sklearn.ensemble import RandomForestClassifier

# param_grid = {"max_depth": [None]}
# rf = GridSearchCV(RandomForestClassifier(), param_grid, scoring="accuracy", refit = True, verbose = 10, n_jobs=2, cv=4)
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_CV.toarray(), y_train)

predict = rf.predict(X_test_CV.toarray())

print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))

0.5942857142857143
[[571 160 144]
 [280 269 326]
 [ 59  96 720]]


## SVM

In [9]:
from ml_lib import MulticlassSVM

svm = MulticlassSVM(reg=0.1, lr=0.1, beta1=0.9, beta2=0.999, eps=1e-6, max_epochs=300)
svm.fit(X_train_CV.toarray(), y_train, print_every=100)

predict = svm.predict(X_test_CV.toarray())

print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

it 1/300, loss = 814.3140998872196
it 101/300, loss = 1.6878607905526035
it 201/300, loss = 1.6758842429964043


## KNN

In [16]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=7, weights='uniform')
knn.fit(X_train_CV.toarray(), y_train)

predict = knn.predict(X_test_CV.toarray())

print(classification_report(y_test, predict))

0.5272380952380953
[[500 308  67]
 [327 411 137]
 [109 293 473]]
              precision    recall  f1-score   support

           0       0.53      0.57      0.55       875
           1       0.41      0.47      0.44       875
           2       0.70      0.54      0.61       875

    accuracy                           0.53      2625
   macro avg       0.55      0.53      0.53      2625
weighted avg       0.55      0.53      0.53      2625



## Softmax Regression

In [9]:
from ml_lib import SoftmaxRegression
# param_grid = {"lr": [0.2, 0.3, 0.1], "max_epochs": [10000]}
# sr = GridSearchCV(SoftmaxRegression(), param_grid, scoring="accuracy", refit = True, verbose = 10, n_jobs=2, cv=5)

sr = SoftmaxRegression(lr=0.3, max_epochs=1000)
sr.fit(X_train_CV.toarray(), y_train)

predict = sr.predict(X_test_CV.toarray())

print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

(3, 21315)
0.5653333333333334
[[539 122 214]
 [262 189 424]
 [ 68  51 756]]
              precision    recall  f1-score   support

           0       0.62      0.62      0.62       875
           1       0.52      0.22      0.31       875
           2       0.54      0.86      0.67       875

    accuracy                           0.57      2625
   macro avg       0.56      0.57      0.53      2625
weighted avg       0.56      0.57      0.53      2625



## Naive Bayes

In [37]:
from sklearn.naive_bayes import MultinomialNB

param_grid = {"alpha": [0.8, 0.1, 0.2]}
NB = GridSearchCV(MultinomialNB(), param_grid, scoring="accuracy", refit = True, verbose = 10, n_jobs=2, cv=5)
# NB = MultinomialNB(alpha=0.2)
NB.fit(X_train_CV.toarray(), y_train)

y_pred = NB.predict(X_test_CV.toarray())
accuracy_score(y_test, y_pred)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


0.5447619047619048

In [38]:
NB.best_params_

{'alpha': 0.2}

In [16]:
confusion_matrix(y_test, y_pred)

array([[ 745,   59,  452],
       [ 234,   71,  570],
       [ 168,   44, 2982]], dtype=int64)

In [27]:
from ml_lib import MultinomialNaiveBayes

# param_grid = {"alpha": [0.6, 0.8, 1.2, 1.5]}

NB = MultinomialNaiveBayes(alpha=2.0)
# NB = GridSearchCV(MultinomialNaiveBayes(), param_grid, scoring="accuracy", refit = True, verbose = 10, n_jobs=2, cv=5)
NB.fit(X_train_CV.toarray(), y_train)

predict = NB.predict(X_test_CV.toarray())
print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

0.6266666666666667
[[569 217  89]
 [258 388 229]
 [ 68 119 688]]
              precision    recall  f1-score   support

           0       0.64      0.65      0.64       875
           1       0.54      0.44      0.49       875
           2       0.68      0.79      0.73       875

    accuracy                           0.63      2625
   macro avg       0.62      0.63      0.62      2625
weighted avg       0.62      0.63      0.62      2625

