In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from konlpy.tag import Twitter

import pandas as pd
import dill as pickle

In [3]:
# Load the pickled training set. Later cells read its `content` (inputs) and
# `area` (labels) columns.
data = pd.read_pickle(
    'dataset/training_2018-04-19 19-13-30.pkl'
)

In [4]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['content'],
    data['area'],
    test_size=0.3,
    random_state=1,
)

In [5]:
class _PosTokenizer:
    """Callable tokenizer that turns text into '/'-joined POS tokens.

    Each item returned by the KoNLPy ``Twitter`` tagger's ``pos()`` is joined
    with '/' into a single string, exactly as the previous inline lambda did.

    The tagger is created lazily on the first call and then reused, instead of
    being constructed again for every document that is vectorized.
    """

    def __init__(self):
        # Created on first use so that building the pipeline stays cheap.
        self._tagger = None

    def __call__(self, text):
        """Return the list of '/'-joined tokens for ``text``."""
        if self._tagger is None:
            self._tagger = Twitter()
        return ['/'.join(token) for token in self._tagger.pos(text)]

    def __getstate__(self):
        # Never serialize the tagger itself: it presumably wraps a JVM object
        # that cannot be pickled (TODO confirm). It is rebuilt lazily on the
        # first call after the pipeline is copied or loaded.
        return {'_tagger': None}


# TF-IDF features over POS-tagged tokens, fed into a multinomial naive Bayes
# classifier.
clf = Pipeline([
    ('vect', TfidfVectorizer(tokenizer=_PosTokenizer())),
    ('clf', MultinomialNB(alpha=0.01)),
])

In [6]:
# Peek at the first few training labels.
y_train.head(n=5)

2131    0
9354    3
9740    3
8076    2
179     0
Name: area, dtype: int64

In [7]:
%%time
model = clf.fit(X_train, y_train)

In [8]:
from sklearn.metrics import confusion_matrix, classification_report

In [9]:
# Predict the held-out split and show the confusion matrix
# (rows: true label, columns: predicted label).
y_pred = model.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[816,   3,  33,  25,   4,   4],
       [ 15, 733,  19,   9,  11,  30],
       [ 19,   6, 749,   7,  15,   4],
       [ 18,  12,   0, 815,  10,   8],
       [  1,   9,   6,   1, 847,   8],
       [  5,   6,   2,   6,  14, 778]], dtype=int64)

In [10]:
# Per-class precision / recall / F1 for the held-out split.
print(
    classification_report(y_true=y_test, y_pred=y_pred)
)

             precision    recall  f1-score   support

          0       0.93      0.92      0.93       885
          1       0.95      0.90      0.92       817
          2       0.93      0.94      0.93       800
          3       0.94      0.94      0.94       863
          4       0.94      0.97      0.96       872
          5       0.94      0.96      0.95       811

avg / total       0.94      0.94      0.94      5048



-----------

In [27]:
from datetime import datetime

now = datetime.now()
cur_time = now.strftime('%Y-%m-%d %H-%M-%S')
# NOTE(review): `cur_time` is computed but not used in the file name below;
# presumably a timestamped file name was intended — confirm before changing,
# because the load cell reads this exact path.

# Persist the fitted pipeline. The context manager guarantees the file is
# flushed and closed even if serialization fails part-way through.
with open("dataset/1_twitter-tfidf-mulnb.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [24]:
# Reload the persisted pipeline. The context manager closes the file handle
# as soon as loading finishes.
# SECURITY: unpickling executes arbitrary code — only load files you trust.
with open('dataset/1_twitter-tfidf-mulnb.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [22]:
# Evaluate `model` on the held-out split again (in file order this follows
# the reload from disk).
# NOTE(review): the recorded execution counts are out of order and the saved
# output reports 1684 test rows versus 5048 in the earlier evaluation, so it
# presumably comes from a different split or dataset — re-run from a fresh
# kernel to confirm.
y_pred = model.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[257,   1,   8,   5,   1,   1],
       [  5, 251,   4,   1,   8,   8],
       [ 10,   5, 255,   1,  11,   0],
       [  5,   3,   0, 276,   6,   2],
       [  3,   4,   3,   0, 276,   4],
       [  3,   7,   0,   1,   9, 250]], dtype=int64)

In [23]:
# Per-class precision / recall / F1 for the predictions computed just above.
print(
    classification_report(y_true=y_test, y_pred=y_pred)
)

             precision    recall  f1-score   support

          0       0.91      0.94      0.92       273
          1       0.93      0.91      0.92       277
          2       0.94      0.90      0.92       282
          3       0.97      0.95      0.96       292
          4       0.89      0.95      0.92       290
          5       0.94      0.93      0.93       270

avg / total       0.93      0.93      0.93      1684

