In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from konlpy.tag import Twitter

import pandas as pd
import pickle

In [2]:
# Load the training data from a pickled file (path is relative to the notebook's
# working directory).
# NOTE(review): pd.read_pickle unpickles the file, which can execute arbitrary
# code -- only load pickles that come from a trusted source.
data = pd.read_pickle('dataset/training_set.pkl')

In [3]:
# Store the class labels as int64, then hold out 25% of the rows for evaluation.
# random_state is fixed so the same split is produced on every run.
data['area'] = data['area'].astype('int64')
X_train, X_test, y_train, y_test = train_test_split(
    data['content'],
    data['area'],
    test_size=0.25,
    random_state=1,
)

In [4]:
# Shared tagger instance, created once and reused by every tokenize_pos call.
# NOTE(review): newer KoNLPy releases appear to rename Twitter to Okt -- confirm
# against the installed version before upgrading.
pos_tagger = Twitter()


def tokenize_pos(doc):
    """Split ``doc`` into tokens annotated with their part-of-speech tag.

    Each item produced by ``pos_tagger.pos(doc)`` (presumably a
    ``(surface, tag)`` pair -- verify against KoNLPy) is joined with "/"
    into a single string.

    Returns:
        list of str, one "/"-joined string per tagged item.
    """
    tagged_items = pos_tagger.pos(doc)
    return list(map("/".join, tagged_items))

In [5]:
# Two-step text classifier: TF-IDF features built from the POS-tagged tokens
# produced by tokenize_pos, followed by multinomial naive Bayes (alpha=0.01).
clf = Pipeline(
    steps=[
        ('vect', TfidfVectorizer(tokenizer=tokenize_pos)),
        ('clf', MultinomialNB(alpha=0.01)),
    ]
)

In [6]:
# Sanity check: show the first few training labels (the int64 `area` codes).
y_train.head()

1833    1
3601    3
5594    5
1010    1
1488    1
Name: area, dtype: int64

In [7]:
# Fit the vectorizer and classifier on the training split.
# Pipeline.fit returns the pipeline itself, so `model` refers to the same
# (now fitted) object as `clf`.
model = clf.fit(X_train, y_train)

In [8]:
from sklearn.metrics import confusion_matrix, classification_report

In [9]:
# Predict `area` labels for the held-out documents.
y_pred = model.predict(X_test)

In [10]:
# Confusion matrix on the held-out split: rows are the true labels, columns the
# predicted labels, so the diagonal counts correct predictions.
confusion_matrix(y_test, y_pred)

array([[217,   1,   8,   5,   1,   1],
       [  3, 212,   3,   1,   8,   3],
       [  7,   5, 212,   1,  10,   0],
       [  5,   6,   0, 223,   6,   1],
       [  0,   4,   2,   0, 239,   0],
       [  3,   7,   0,   1,   7, 201]], dtype=int64)

In [11]:
# Per-class precision / recall / F1 on the held-out split. The report is a
# pre-formatted multi-line string, so print() is used to render its line breaks.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.92      0.93      0.93       233
          1       0.90      0.92      0.91       230
          2       0.94      0.90      0.92       235
          3       0.97      0.93      0.94       241
          4       0.88      0.98      0.93       245
          5       0.98      0.92      0.95       219

avg / total       0.93      0.93      0.93      1403

