<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-statisticsmodel/blob/master/HyeonminNam/200727_NBC_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Sklearn을 활용한 Naive Bayesian Classifier 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Training split of the 20 Newsgroups corpus; `.data` holds the raw documents
# and `.target` the class labels used by the cells below.
twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
)

In [None]:
# Token counts -> TF-IDF weighting -> multinomial naive Bayes, built and fitted
# in one expression (fit() hands back the pipeline itself).
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(twenty_train.data, twenty_train.target)

In [None]:
import numpy as np

# Held-out accuracy of the baseline pipeline: the fraction of test documents
# whose predicted label equals the true label.
twenty_test = fetch_20newsgroups(
    subset='test',
    shuffle=True,
)
predicted = text_clf.predict(twenty_test.data)
(predicted == twenty_test.target).mean()

0.7738980350504514

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: three n-gram ranges for the vectorizer, each tried with and
# without IDF weighting.
parameters_clf = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__use_idf': (True, False),
}
gs_clf = GridSearchCV(
    text_clf,
    parameters_clf,
    n_jobs=-1,
    verbose=2,
).fit(twenty_train.data, twenty_train.target)

print('Best score: {}'.format(gs_clf.best_score_))
print('Best parameters set:')
# get_params() reports every parameter of the winning pipeline, not only the
# two that were searched.
best_parameters = gs_clf.best_estimator_.get_params()
for param_name in sorted(best_parameters):
    print('\t{0}: {1}'.format(param_name, best_parameters[param_name]))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.5min finished


Best score: 0.8581401572914213
Best parameters set:
	clf: MultinomialNB()
	clf__alpha: 1.0
	clf__class_prior: None
	clf__fit_prior: True
	memory: None
	steps: [('vect', CountVectorizer(ngram_range=(1, 3))), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())]
	tfidf: TfidfTransformer()
	tfidf__norm: l2
	tfidf__smooth_idf: True
	tfidf__sublinear_tf: False
	tfidf__use_idf: True
	vect: CountVectorizer(ngram_range=(1, 3))
	vect__analyzer: word
	vect__binary: False
	vect__decode_error: strict
	vect__dtype: <class 'numpy.int64'>
	vect__encoding: utf-8
	vect__input: content
	vect__lowercase: True
	vect__max_df: 1.0
	vect__max_features: None
	vect__min_df: 1
	vect__ngram_range: (1, 3)
	vect__preprocessor: None
	vect__stop_words: None
	vect__strip_accents: None
	vect__token_pattern: (?u)\b\w\w+\b
	vect__tokenizer: None
	vect__vocabulary: None
	verbose: False


In [None]:
import numpy as np

# Held-out accuracy of the pipeline selected by the grid search.
predicted = gs_clf.best_estimator_.predict(twenty_test.data)
(predicted == twenty_test.target).mean()

0.766064790228359

## Scikit-learn을 활용한 NBC 클래스화

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
class NBC_sklearn():
    """Naive Bayes text classifier built on a scikit-learn pipeline.

    The pipeline chains ``CountVectorizer`` -> ``TfidfTransformer`` ->
    ``MultinomialNB``. It can be fitted directly (``train_model``) or tuned
    with a cross-validated grid search (``Grid_search``); ``model_predict``
    reports accuracy for either variant.
    """

    def __init__(self):
        # Unfitted pipeline; train_model() fits it in place.
        self.text_clf = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', MultinomialNB()),
        ])
        # Parameter grid passed to the most recent Grid_search() call.
        self.parameters_clf = {}
        # Fitted GridSearchCV object; None until Grid_search() has been run.
        self.gs_clf = None

    def train_model(self, train_data, train_target):
        """Fit the default pipeline on the given documents and labels."""
        self.text_clf = self.text_clf.fit(train_data, train_target)

    def Grid_search(self, train_data, train_target, parameters_clf):
        """Run a grid search over the pipeline and keep the fitted search.

        Prints the best cross-validation score followed by every parameter of
        the best estimator, then stores the fitted search on ``self.gs_clf``.

        Args:
            train_data: Training documents.
            train_target: Labels aligned with ``train_data``.
            parameters_clf: Grid in ``GridSearchCV`` format, keyed by
                ``<step>__<param>``, for example::

                    {'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
                     'tfidf__use_idf': (True, False)}
        """
        self.parameters_clf = parameters_clf
        gs_clf = GridSearchCV(self.text_clf, self.parameters_clf, n_jobs=-1, verbose=2)
        gs_clf = gs_clf.fit(train_data, train_target)
        print('Best score: {}'.format(gs_clf.best_score_))
        print('Best parameters set:')
        best_parameters = gs_clf.best_estimator_.get_params()
        for param_name in sorted(best_parameters):
            print('\t{0}: {1}'.format(param_name, best_parameters[param_name]))
        self.gs_clf = gs_clf

    def model_predict(self, test_data, test_target, grid_search=False):
        """Print the accuracy of the chosen model on ``test_data``.

        Args:
            test_data: Documents to classify.
            test_target: True labels aligned with ``test_data``.
            grid_search: If truthy, use the best estimator found by
                ``Grid_search``; otherwise use the pipeline fitted by
                ``train_model``.

        Raises:
            RuntimeError: If ``grid_search`` is truthy but ``Grid_search`` has
                not been run yet.
        """
        if grid_search:
            if self.gs_clf is None:
                raise RuntimeError(
                    'Grid_search() must be run before model_predict(grid_search=True)'
                )
            model = self.gs_clf.best_estimator_
        else:
            model = self.text_clf
        # Always score the data passed by the caller, never a notebook global.
        predicted = model.predict(test_data)
        print('정확도: {}'.format(np.mean(predicted == test_target)))

## 클래스 테스트

In [None]:
# Build the wrapper; the constructor only assembles the unfitted pipeline.
nbc_sk = NBC_sklearn()

In [None]:
# Fit the wrapper's default pipeline on the training split.
nbc_sk.train_model(twenty_train.data, twenty_train.target)

In [None]:
# Search space: three n-gram ranges for the vectorizer, each tried with and
# without IDF weighting.
parameters_clf = {
    'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__use_idf': (True, False),
}
nbc_sk.Grid_search(
    twenty_train.data,
    twenty_train.target,
    parameters_clf,
)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  1.6min finished


Best score: 0.8581401572914213
Best parameters set:
	clf: MultinomialNB()
	clf__alpha: 1.0
	clf__class_prior: None
	clf__fit_prior: True
	memory: None
	steps: [('vect', CountVectorizer(ngram_range=(1, 3))), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())]
	tfidf: TfidfTransformer()
	tfidf__norm: l2
	tfidf__smooth_idf: True
	tfidf__sublinear_tf: False
	tfidf__use_idf: True
	vect: CountVectorizer(ngram_range=(1, 3))
	vect__analyzer: word
	vect__binary: False
	vect__decode_error: strict
	vect__dtype: <class 'numpy.int64'>
	vect__encoding: utf-8
	vect__input: content
	vect__lowercase: True
	vect__max_df: 1.0
	vect__max_features: None
	vect__min_df: 1
	vect__ngram_range: (1, 3)
	vect__preprocessor: None
	vect__stop_words: None
	vect__strip_accents: None
	vect__token_pattern: (?u)\b\w\w+\b
	vect__tokenizer: None
	vect__vocabulary: None
	verbose: False


In [None]:
# Accuracy of the plain fitted pipeline (grid_search defaults to False).
nbc_sk.model_predict(twenty_test.data, twenty_test.target)

정확도: 0.7738980350504514


In [None]:
# Accuracy of the best estimator selected by the grid search.
nbc_sk.model_predict(twenty_test.data, twenty_test.target, grid_search=True)

정확도: 0.766064790228359
