In [1]:
import numpy as np
import pickle

with open("txt_data", "rb") as fp:   #Pickling
    data = pickle.load(fp)
with open("txt_labels", "rb") as fp:   #Pickling
    labels = pickle.load(fp)

print(len(data), len(labels))

31702 31702


In [2]:
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(data,labels, test_size = 0.30, shuffle=True, random_state = 42)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.10, shuffle=True, random_state = 33)

print('split done.', len(x_valid))

split done. 2220


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=500, use_idf=True, stop_words='english')
vectorizer.fit(data)
x_train = vectorizer.transform(x_train).toarray()
x_test = vectorizer.transform(x_test).toarray()
x_valid = vectorizer.transform(x_valid).toarray()
print('> tfidVectorization done.')

> tfidVectorization done.


In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd
from IPython.display import display
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB, CategoricalNB
from sklearn.neighbors import KNeighborsClassifier, KernelDensity

### k Nearest Neighbors

In [5]:
params = {
    'n_neighbors':[1, 2, 3, 4, 5, 8]
}

clf = GridSearchCV(KNeighborsClassifier(), param_grid=params, n_jobs=-1, cv=2, verbose=1)
clf.fit(x_train, y_train)

clf = clf.best_estimator_

print()
print('Best params: ',clf)

y_pred = clf.predict(x_test)

print()
dt = pd.DataFrame(confusion_matrix(y_test, y_pred))
display(dt)
print(classification_report(y_test, y_pred, zero_division=0))

Fitting 2 folds for each of 6 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  9.5min finished



Best params:  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')



Unnamed: 0,0,1,2,3,4,5,6,7,8
0,827,35,26,37,36,14,10,51,24
1,37,922,50,45,62,16,11,46,46
2,47,34,244,18,32,7,10,24,14
3,45,22,18,229,37,16,6,26,9
4,65,80,48,43,787,27,11,52,40
5,19,15,6,8,31,335,6,15,18
6,1,5,1,4,9,3,1118,117,143
7,10,8,3,16,15,7,118,1070,480
8,12,5,1,11,8,13,157,378,1059


              precision    recall  f1-score   support

           0       0.78      0.78      0.78      1060
           1       0.82      0.75      0.78      1235
           2       0.61      0.57      0.59       430
           3       0.56      0.56      0.56       408
           4       0.77      0.68      0.73      1153
           5       0.76      0.74      0.75       453
           6       0.77      0.80      0.79      1401
           7       0.60      0.62      0.61      1727
           8       0.58      0.64      0.61      1644

    accuracy                           0.69      9511
   macro avg       0.70      0.68      0.69      9511
weighted avg       0.70      0.69      0.69      9511



In [6]:
import pickle

with open('CLF_kNN','wb') as f:
    pickle.dump(clf, f)

### Naive Bayes

- BernoulliNB

In [6]:
clf = BernoulliNB()
clf.fit(x_train, y_train)

#clf = clf.best_estimator_

print()
print('Best params: ',clf)

y_pred = clf.predict(x_test)

print()
dt = pd.DataFrame(confusion_matrix(y_test, y_pred))
display(dt)
print(classification_report(y_test, y_pred, zero_division=0))


Best params:  BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)



Unnamed: 0,0,1,2,3,4,5,6,7,8
0,843,15,44,28,47,12,13,56,2
1,111,867,39,37,63,19,16,80,3
2,111,24,203,14,29,4,3,39,3
3,116,2,20,162,67,5,4,31,1
4,340,22,78,63,528,24,4,90,4
5,65,14,12,23,41,281,3,13,1
6,19,1,0,1,46,21,594,633,86
7,31,5,0,3,54,11,161,1326,136
8,32,3,0,3,59,4,141,1126,276


              precision    recall  f1-score   support

           0       0.51      0.80      0.62      1060
           1       0.91      0.70      0.79      1235
           2       0.51      0.47      0.49       430
           3       0.49      0.40      0.44       408
           4       0.57      0.46      0.51      1153
           5       0.74      0.62      0.67       453
           6       0.63      0.42      0.51      1401
           7       0.39      0.77      0.52      1727
           8       0.54      0.17      0.26      1644

    accuracy                           0.53      9511
   macro avg       0.59      0.53      0.53      9511
weighted avg       0.58      0.53      0.52      9511



- Multinomial

In [5]:
clf = MultinomialNB()
clf.fit(x_train, y_train)

#clf = clf.best_estimator_

print()
print('Best params: ',clf)

y_pred = clf.predict(x_test)

print()
dt = pd.DataFrame(confusion_matrix(y_test, y_pred))
display(dt)
print(classification_report(y_test, y_pred, zero_division=0))


Best params:  MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)



Unnamed: 0,0,1,2,3,4,5,6,7,8
0,804,54,13,12,90,4,23,44,16
1,16,1041,5,15,86,2,32,23,15
2,33,124,140,2,102,2,10,11,6
3,39,48,4,144,119,1,11,24,18
4,36,86,14,23,931,1,15,26,21
5,20,42,3,7,88,266,3,16,8
6,0,3,0,0,4,0,834,308,252
7,9,21,0,2,19,4,226,1054,392
8,2,25,0,1,12,2,202,662,738


              precision    recall  f1-score   support

           0       0.84      0.76      0.80      1060
           1       0.72      0.84      0.78      1235
           2       0.78      0.33      0.46       430
           3       0.70      0.35      0.47       408
           4       0.64      0.81      0.72      1153
           5       0.94      0.59      0.72       453
           6       0.62      0.60      0.61      1401
           7       0.49      0.61      0.54      1727
           8       0.50      0.45      0.47      1644

    accuracy                           0.63      9511
   macro avg       0.69      0.59      0.62      9511
weighted avg       0.64      0.63      0.62      9511



In [6]:
import pickle

with open('CLF_best_naiveBayes','wb') as f:
    pickle.dump(clf, f)

- Gaussian

In [8]:
clf = GaussianNB()
clf.fit(x_train, y_train)

#clf = clf.best_estimator_

print()
print('Best params: ',clf)

y_pred = clf.predict(x_test)

print()
dt = pd.DataFrame(confusion_matrix(y_test, y_pred))
display(dt)
print(classification_report(y_test, y_pred, zero_division=0))


Best params:  GaussianNB(priors=None, var_smoothing=1e-09)



Unnamed: 0,0,1,2,3,4,5,6,7,8
0,812,24,71,42,26,57,12,10,6
1,14,904,148,55,54,34,12,11,3
2,27,35,294,17,30,16,5,5,1
3,42,16,50,224,33,29,4,7,3
4,58,47,271,86,577,86,17,7,4
5,12,17,23,26,21,342,5,5,2
6,5,9,2,15,7,44,1064,136,119
7,17,31,17,32,23,102,658,584,263
8,8,20,28,16,7,72,578,397,518


              precision    recall  f1-score   support

           0       0.82      0.77      0.79      1060
           1       0.82      0.73      0.77      1235
           2       0.33      0.68      0.44       430
           3       0.44      0.55      0.49       408
           4       0.74      0.50      0.60      1153
           5       0.44      0.75      0.55       453
           6       0.45      0.76      0.57      1401
           7       0.50      0.34      0.40      1727
           8       0.56      0.32      0.40      1644

    accuracy                           0.56      9511
   macro avg       0.57      0.60      0.56      9511
weighted avg       0.60      0.56      0.55      9511



- Complement

In [9]:
clf = ComplementNB()
clf.fit(x_train, y_train)

#clf = clf.best_estimator_

print()
print('Best params: ',clf)

y_pred = clf.predict(x_test)

print()
dt = pd.DataFrame(confusion_matrix(y_test, y_pred))
display(dt)
print(classification_report(y_test, y_pred, zero_division=0))


Best params:  ComplementNB(alpha=1.0, class_prior=None, fit_prior=True, norm=False)



Unnamed: 0,0,1,2,3,4,5,6,7,8
0,832,44,5,5,89,14,31,30,10
1,16,1067,4,8,78,10,31,12,9
2,49,142,112,1,91,13,11,8,3
3,88,36,3,90,137,4,27,13,10
4,69,82,11,5,923,20,20,13,10
5,29,43,0,6,73,284,3,8,7
6,8,8,0,4,9,25,918,183,246
7,26,42,2,8,37,29,355,818,410
8,14,35,6,6,24,26,301,479,753


              precision    recall  f1-score   support

           0       0.74      0.78      0.76      1060
           1       0.71      0.86      0.78      1235
           2       0.78      0.26      0.39       430
           3       0.68      0.22      0.33       408
           4       0.63      0.80      0.71      1153
           5       0.67      0.63      0.65       453
           6       0.54      0.66      0.59      1401
           7       0.52      0.47      0.50      1727
           8       0.52      0.46      0.49      1644

    accuracy                           0.61      9511
   macro avg       0.64      0.57      0.58      9511
weighted avg       0.61      0.61      0.60      9511



- Categorical

In [10]:
clf = CategoricalNB()
clf.fit(x_train, y_train)

#clf = clf.best_estimator_

print()
print('Best params: ',clf)

y_pred = clf.predict(x_test)

print()
dt = pd.DataFrame(confusion_matrix(y_test, y_pred))
display(dt)
print(classification_report(y_test, y_pred, zero_division=0))


Best params:  CategoricalNB(alpha=1.0, class_prior=None, fit_prior=True)



Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0,0,0,0,0,0,0,6,1054
1,0,0,0,0,0,0,0,4,1231
2,0,0,0,0,0,0,0,4,426
3,0,0,0,0,0,0,0,1,407
4,0,0,0,0,0,0,0,5,1148
5,0,0,0,0,0,0,0,0,453
6,0,0,0,0,0,0,0,6,1395
7,0,0,0,0,0,0,0,40,1687
8,0,0,0,0,0,0,0,19,1625


              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1060
           1       0.00      0.00      0.00      1235
           2       0.00      0.00      0.00       430
           3       0.00      0.00      0.00       408
           4       0.00      0.00      0.00      1153
           5       0.00      0.00      0.00       453
           6       0.00      0.00      0.00      1401
           7       0.47      0.02      0.04      1727
           8       0.17      0.99      0.29      1644

    accuracy                           0.18      9511
   macro avg       0.07      0.11      0.04      9511
weighted avg       0.12      0.18      0.06      9511

