In [13]:
import MySQLdb as db
import pandas as pd
import numpy as np
import dill as pickle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from konlpy.tag import Twitter
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import cross_val_score
from datetime import datetime



In [3]:
# import json

# with open('/home/ej/github/news_config.json') as f:
#     config = json.load(f)

# db = db.connect(
#     config['ADDR'],
#     config['ID'],
#     config['PW'],
#     config['DB'],
#     charset='utf8'
# )

db = db.connect(
    '127.0.0.1',
    'root',
    '5555',
    'news_rec',
    charset='utf8'
)

In [4]:
data = pd.read_sql('SELECT * FROM ARTICLE;', db)

In [5]:
X = data.content
y = data.area

In [16]:
from sklearn.model_selection import KFold

cv1 = KFold(n_splits=4, shuffle=True, random_state=0)

clf = Pipeline([
    ('vect', TfidfVectorizer(tokenizer=lambda x: ['/'.join(t) for t in Twitter().pos(x)]) ),
    ('clf',MultinomialNB(alpha=0.01)),
])

# for Test set
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

scores = cross_val_score(clf, X_train, y_train, cv=4)

In [19]:
model = clf.fit(X_train, y_train)

In [20]:
y_pred = model.predict(X_test)

In [24]:
tmp = classification_report(y_test, y_pred)

In [25]:
tmp

'             precision    recall  f1-score   support\n\n          0       0.91      0.92      0.91       646\n          1       0.87      0.86      0.87       566\n          2       0.91      0.89      0.90       608\n          3       0.93      0.94      0.93       640\n          4       0.92      0.93      0.92       675\n          5       0.94      0.94      0.94       679\n\navg / total       0.91      0.91      0.91      3814\n'

In [17]:
scores

array([0.91352201, 0.92003146, 0.9166011 , 0.91653543])

In [158]:
X_train, X_test, y_train, y_test = train_test_split(data.content, data.area, test_size=0.3, random_state=2)

In [159]:
y_train.value_counts()

4    2292
0    2279
2    2245
3    2234
5    2175
1    2121
Name: area, dtype: int64

In [160]:
y_test.value_counts()

0    1017
4     988
3     985
5     959
1     900
2     872
Name: area, dtype: int64

In [161]:
X_train = X_train[:100]
X_test = X_test[:100]
y_train = y_train[:100]
y_test = y_test[:100]

In [184]:
clf = Pipeline([
    ('vect', TfidfVectorizer(tokenizer=lambda x: ['/'.join(t) for t in Twitter().pos(x)]) ),
    ('clf',MultinomialNB(alpha=0.01)),
])

In [190]:
how_many_folds = 3

cnf_mat = []
clf_rep = []


total_here = datetime.now()

print('Cross validaiton started !')
for idx in range(0,how_many_folds):
    here = datetime.now()
    X_train, X_test, y_train, y_test = train_test_split(data.content, data.area, test_size=0.3, random_state=idx)
    clf = Pipeline([
    ('vect', TfidfVectorizer(tokenizer=lambda x: ['/'.join(t) for t in Twitter().pos(x)])),
    ('clf',MultinomialNB(alpha=0.01)),
    ])
    
    # for test
#     X_train, X_test, y_train, y_test = X_train[:100], X_test[:100], y_train[:100], y_test[:100]
    
    model = clf.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    cnf_mat.append(str(confusion_matrix(y_test, y_pred)))
    clf_rep.append(classification_report(y_test, y_pred))
    
    td = datetime.now() - here
    minutes = td.seconds // 60 % 60
    print('iteration : ',idx, ', elapsed time :','{} min'.format(minutes),'{} sec'.format(td.seconds - minutes * 60))
    
    
    
total_td = datetime.now() - total_here
minutes = total_td.seconds // 60 % 60
print('Completed !', '{} min'.format(minutes),'{} sec'.format(total_td.seconds - minutes * 60))

Cross validaiton started !
iteration :  0 , elapsed time :  0 min 3 sec
iteration :  1 , elapsed time :  0 min 3 sec
iteration :  2 , elapsed time :  0 min 3 sec
Completed ! 0 min 11 sec


In [193]:
y_pred = model.predict(X_test)

In [195]:
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.52      0.94      0.67        16
          1       0.80      0.63      0.71        19
          2       0.40      0.38      0.39        16
          3       0.83      0.33      0.48        15
          4       0.69      0.65      0.67        17
          5       0.74      0.82      0.78        17

avg / total       0.67      0.63      0.62       100



In [200]:
print('\n\n'.join(cnf_mat))

[[18  1  0  0  0  1]
 [ 0 12  3  1  2  1]
 [ 2  1  5  1  0  0]
 [ 6  2  3  7  1  1]
 [ 1  2  8  1  7  1]
 [ 1  2  0  1  1  7]]

[[11  0  3  0  0  0]
 [ 6  7  0  0  1  5]
 [ 5  1 13  0  3  0]
 [ 2  1  1  8  0  0]
 [ 0  0  3  2 11  2]
 [ 0  0  0  0  0 15]]

[[15  0  1  0  0  0]
 [ 2 12  3  0  0  2]
 [ 7  1  6  0  2  0]
 [ 3  1  4  5  1  1]
 [ 1  1  1  1 11  2]
 [ 1  0  0  0  2 14]]


In [201]:
print('\n\n'.join(clf_rep))

             precision    recall  f1-score   support

          0       0.64      0.90      0.75        20
          1       0.60      0.63      0.62        19
          2       0.26      0.56      0.36         9
          3       0.64      0.35      0.45        20
          4       0.64      0.35      0.45        20
          5       0.64      0.58      0.61        12

avg / total       0.60      0.56      0.55       100


             precision    recall  f1-score   support

          0       0.46      0.79      0.58        14
          1       0.78      0.37      0.50        19
          2       0.65      0.59      0.62        22
          3       0.80      0.67      0.73        12
          4       0.73      0.61      0.67        18
          5       0.68      1.00      0.81        15

avg / total       0.69      0.65      0.64       100


             precision    recall  f1-score   support

          0       0.52      0.94      0.67        16
          1       0.80      0.63     

In [181]:
for each in cnf_mat:
    print(each)

[[18  1  0  0  0  1]
 [ 0 12  3  1  2  1]
 [ 2  1  5  1  0  0]
 [ 6  2  3  7  1  1]
 [ 1  2  8  1  7  1]
 [ 1  2  0  1  1  7]]
[[11  0  3  0  0  0]
 [ 6  7  0  0  1  5]
 [ 5  1 13  0  3  0]
 [ 2  1  1  8  0  0]
 [ 0  0  3  2 11  2]
 [ 0  0  0  0  0 15]]
[[15  0  1  0  0  0]
 [ 2 12  3  0  0  2]
 [ 7  1  6  0  2  0]
 [ 3  1  4  5  1  1]
 [ 1  1  1  1 11  2]
 [ 1  0  0  0  2 14]]


In [180]:
for each in clf_rep:
    print(each)

             precision    recall  f1-score   support

          0       0.64      0.90      0.75        20
          1       0.60      0.63      0.62        19
          2       0.26      0.56      0.36         9
          3       0.64      0.35      0.45        20
          4       0.64      0.35      0.45        20
          5       0.64      0.58      0.61        12

avg / total       0.60      0.56      0.55       100

             precision    recall  f1-score   support

          0       0.46      0.79      0.58        14
          1       0.78      0.37      0.50        19
          2       0.65      0.59      0.62        22
          3       0.80      0.67      0.73        12
          4       0.73      0.61      0.67        18
          5       0.68      1.00      0.81        15

avg / total       0.69      0.65      0.64       100

             precision    recall  f1-score   support

          0       0.52      0.94      0.67        16
          1       0.80      0.63      0

In [163]:
%%time
model = clf.fit(X_train, y_train)

Wall time: 1.69 s


In [164]:
%%time
y_pred = model.predict(X_test)

Wall time: 1.94 s


In [165]:
cnf_mat = confusion_matrix(y_test, y_pred)

In [174]:
print(str(confusion_matrix(y_test, y_pred)))

[[15  0  1  0  0  0]
 [ 2 12  3  0  0  2]
 [ 7  1  6  0  2  0]
 [ 3  1  4  5  1  1]
 [ 1  1  1  1 11  2]
 [ 1  0  0  0  2 14]]


In [167]:
clf_rep = classification_report(y_test, y_pred)

In [169]:
print(clf_rep)

             precision    recall  f1-score   support

          0       0.52      0.94      0.67        16
          1       0.80      0.63      0.71        19
          2       0.40      0.38      0.39        16
          3       0.83      0.33      0.48        15
          4       0.69      0.65      0.67        17
          5       0.74      0.82      0.78        17

avg / total       0.67      0.63      0.62       100



------------

In [80]:
from datetime import datetime

cur_time = datetime.now().strftime('%Y-%m-%d %H-%M-%S')

pickle.dump(model, open("model/twitter_tfidf_mulnb_{}.pkl".format(cur_time), "wb"))
with open('model/twitter_tfidf_mulnb_cnf_mat_{}.txt'.format(cur_time),'w',encoding='utf-8') as f:
    f.write(str(con_mat))

with open('model/twitter_tfidf_mulnb_clf_rep_{}.txt'.format(cur_time),'w',encoding='utf-8') as f:
    f.write(clf_report)