In [13]:
import sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [14]:
col_list = ["comments", "spam"]
df = pd.read_excel("Binh_comments.xlsx", usecols=col_list)

df.head()

Unnamed: 0,comments,spam
0,,1.0
1,,1.0
2,-,1.0
3,-------,1.0
4,----------,1.0


In [15]:
df.shape

(7836, 2)

In [16]:
df2 = pd.read_excel("Khoa_comments.xlsx", usecols=col_list)[:7000]

df2.head()

Unnamed: 0,comments,spam
0,-,1.0
1,-------,1.0
2,----------,1.0
3,-----------------,1.0
4,-------------------,1.0


In [17]:
df2.shape

(7000, 2)

In [18]:
df = pd.concat([df, df2])

In [19]:
df.shape

(14836, 2)

In [21]:
df.tail()

Unnamed: 0,comments,spam
6995,Hình ảnh mang tính chất nhận xu From Mọt Shop...,1.0
6996,"Hinh anh mang tinh chat nhan xu , hang ok chat...",0.0
6997,"hình ảnh mang tính chất nhận xu , sản phẩm có ...",0.0
6998,,
6999,Hình ảnh mang tính chất nhận xu . Bọc rất cẩn ...,0.0


In [25]:
df = df.dropna()
df = df.drop_duplicates()

In [26]:
df.shape

(14821, 2)

In [27]:
from data.preprocess import pipeline_preprocess

df["comments"] = df['comments'].apply(pipeline_preprocess)

df.head()

Unnamed: 0,comments,spam
0,,1.0
1,,1.0
2,,1.0
3,,1.0
4,,1.0


In [28]:
df = df.dropna()
df = df.drop_duplicates()
df.shape

(13851, 2)

In [29]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, stratify=df["spam"], test_size=0.25)

In [30]:
train.to_csv("data/preprocess/spam_train.csv", index=False)
test.to_csv("data/preprocess/spam_test.csv", index=False)

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("data/preprocess/spam_train.csv")
df = df.dropna()

df.head()

Unnamed: 0,comments,spam
0,dây mua 3 rẻ sạc nóng,0.0
1,dép đẹp gói hàng,0.0
2,đóng hàng giao hàng shiper nhiệt_tình,0.0
3,hàng đóng_gói cẩn_thận hợp_tác vận_chuyển nhan...,0.0
4,giao hàng nhma_shop gửi tiền,0.0


In [3]:
target_count = df.spam.value_counts()
target_count

0.0    8255
1.0    2132
Name: spam, dtype: int64

In [4]:
X_train = df['comments'].values
y_train = df['spam'].values

X_train = X_train.astype("str")
y_train = y_train.astype('int')

In [5]:
count_vector = CountVectorizer()
model_rf_preprocess = Pipeline([('vect', count_vector),
                    ('tfidf', TfidfTransformer()),
                    ])
X_train_CV = model_rf_preprocess.fit_transform(X_train)

In [6]:
X_train_CV.shape

(10387, 10132)

In [8]:
df_test = pd.read_csv("data/preprocess/spam_test.csv")
df_test = df_test.dropna()
X_test = df_test['comments'].values
y_test = df_test['spam'].values

X_test = X_test.astype("str")
y_test = y_test.astype('int')

X_test_CV = model_rf_preprocess.transform(X_test)
X_test_CV.shape

(3462, 10132)

In [8]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from collections import Counter

# smote = SMOTE()
smote = SMOTETomek()
X_train_CV, y_train = smote.fit_resample(X_train_CV, y_train)

print('Resampled dataset shape %s' % Counter(y_train))

Resampled dataset shape Counter({0: 8248, 1: 8248})


In [9]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

rus = RandomUnderSampler()
X_test_CV, y_test = rus.fit_resample(X_test_CV, y_test)

print('Resampled dataset shape %s' % Counter(y_test))

Resampled dataset shape Counter({0: 710, 1: 710})


## Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier
param_grid = {"max_depth": [7, None]}
dt = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring="accuracy", refit = True, verbose = 10, n_jobs=2, cv=5)

# dt = DecisionTreeClassifier()
dt.fit(X_train_CV.toarray(), y_train)

print("Training complete")

predict = dt.predict(X_test_CV.toarray())

print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

Fitting 5 folds for each of 2 candidates, totalling 10 fits
Training complete
0.8154929577464789
[[669  41]
 [221 489]]
              precision    recall  f1-score   support

           0       0.75      0.94      0.84       710
           1       0.92      0.69      0.79       710

    accuracy                           0.82      1420
   macro avg       0.84      0.82      0.81      1420
weighted avg       0.84      0.82      0.81      1420



## Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {"n_estimators": [50]}
rf = GridSearchCV(RandomForestClassifier(), param_grid, scoring="accuracy", refit = True, verbose = 10, n_jobs=2, cv=5)
# rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_train_CV.toarray(), y_train)

predict = rf.predict(X_test_CV.toarray())

print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

Fitting 5 folds for each of 1 candidates, totalling 5 fits
0.8387323943661972
[[687  23]
 [206 504]]
              precision    recall  f1-score   support

           0       0.77      0.97      0.86       710
           1       0.96      0.71      0.81       710

    accuracy                           0.84      1420
   macro avg       0.86      0.84      0.84      1420
weighted avg       0.86      0.84      0.84      1420

{'n_estimators': 50}


## SVM

In [10]:
from ml_lib import MulticlassSVM

param_grid = {"lr": [0.3, 0.5], "max_epochs": [300]}
svm = GridSearchCV(MulticlassSVM(), param_grid, scoring="accuracy", refit = True, verbose = 10, n_jobs=2, cv=5)
# svm = MulticlassSVM(reg=0.1, lr=0.3, beta1=0.9, beta2=0.999, eps=1e-6, max_epochs=300)
svm.fit(X_train_CV.toarray(), y_train, print_every=100)

predict = svm.predict(X_test_CV.toarray())

print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

Fitting 5 folds for each of 2 candidates, totalling 10 fits
it 1/300, loss = 621.2592930958281
it 101/300, loss = 0.8122699680383345
it 201/300, loss = 0.8026093087632495
0.6049295774647887
[[709   1]
 [560 150]]
              precision    recall  f1-score   support

           0       0.56      1.00      0.72       710
           1       0.99      0.21      0.35       710

    accuracy                           0.60      1420
   macro avg       0.78      0.60      0.53      1420
weighted avg       0.78      0.60      0.53      1420



## KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1, weights='uniform')
knn.fit(X_train_CV.toarray(), y_train)

predict = knn.predict(X_test_CV.toarray())

print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

0.7112676056338029
[[403 307]
 [103 607]]
              precision    recall  f1-score   support

           0       0.80      0.57      0.66       710
           1       0.66      0.85      0.75       710

    accuracy                           0.71      1420
   macro avg       0.73      0.71      0.71      1420
weighted avg       0.73      0.71      0.71      1420



## Softmax Regression

In [13]:
from ml_lib import SoftmaxRegression
param_grid = {"lr": [0.2], "max_epochs": [300, 400, 500]}
sr = GridSearchCV(SoftmaxRegression(), param_grid, scoring="accuracy", refit = True, verbose = 10, n_jobs=2, cv=5)

# sr = SoftmaxRegression(lr=0.2, max_epochs=300)
sr.fit(X_train_CV.toarray(), y_train)

predict = sr.predict(X_test_CV.toarray())

print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
0.6809859154929577
[[680  30]
 [423 287]]
              precision    recall  f1-score   support

           0       0.62      0.96      0.75       710
           1       0.91      0.40      0.56       710

    accuracy                           0.68      1420
   macro avg       0.76      0.68      0.65      1420
weighted avg       0.76      0.68      0.65      1420



## Naive Bayes

In [20]:
from sklearn.naive_bayes import MultinomialNB

# param_grid = {"alpha": [0.05, 0.01, 0.02, 0.04, 0.1]}
# NB = GridSearchCV(MultinomialNB(), param_grid, scoring="accuracy", refit = True, verbose = 10, n_jobs=2, cv=5)
NB = MultinomialNB(alpha=0.4)
NB.fit(X_train_CV.toarray(), y_train)

y_pred = NB.predict(X_test_CV.toarray())
accuracy_score(y_test, y_pred)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


0.721830985915493

In [55]:
from ml_lib import MultinomialNaiveBayes

# param_grid = {"alpha": [0.6, 0.8, 1.2, 1.5]}

# NB = GridSearchCV(MultinomialNaiveBayes(), param_grid, scoring="accuracy", refit = True, verbose = 10, n_jobs=2, cv=5)
NB = MultinomialNaiveBayes(alpha=0.02)
NB.fit(X_train_CV.toarray(), y_train)

predict = NB.predict(X_test_CV.toarray())
print(accuracy_score(y_test, predict))
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

0.7232394366197183
[[692  18]
 [375 335]]
              precision    recall  f1-score   support

           0       0.65      0.97      0.78       710
           1       0.95      0.47      0.63       710

    accuracy                           0.72      1420
   macro avg       0.80      0.72      0.70      1420
weighted avg       0.80      0.72      0.70      1420

