# Scikit-learn을 이용한 분류

In [1]:
train_path = "/content/drive/MyDrive/05_강의자료/210719_SDS/nsmc_train_preprocessed.csv"
test_path = "/content/drive/MyDrive/05_강의자료/210719_SDS/nsmc_test_preprocessed.csv"

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [4]:
train.head()

Unnamed: 0.1,Unnamed: 0,id,document,label
0,0,9976970,더빙 진짜 짜증 나 목소리,0
1,1,3819312,흠 포스터 보 초딩 영화 줄 오버 연기 가볍 않,1
2,2,10265843,너무 그래서 보 것 추천 하,0
3,3,9045019,교도소 이야기 솔직히 재미 없 평점 조정,0
4,4,6483659,사이몬 익살 스럽 연기 돋보이 영화 스파이더맨 늙 보이 하 커스틴 던스트 너무나,1


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  1000 non-null   int64 
 1   id          1000 non-null   int64 
 2   document    988 non-null    object
 3   label       1000 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 31.4+ KB


In [7]:
train = train.dropna()

In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  500 non-null    int64 
 1   id          500 non-null    int64 
 2   document    490 non-null    object
 3   label       500 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 15.8+ KB


In [9]:
test = test.dropna()

In [10]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 988 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  988 non-null    int64 
 1   id          988 non-null    int64 
 2   document    988 non-null    object
 3   label       988 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 38.6+ KB


In [12]:
# Embedding
tfidfv = TfidfVectorizer()

In [13]:
# tf-idf 계산하려면 전체 문서가 있어야 하는데, 
# train, test의 document 컬럼을 합쳐주겠습니다.

corpus = train['document'].tolist()
corpus.extend(test['document'].tolist())

In [14]:
len(corpus)

1478

In [15]:
tfidfv.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [16]:
train_x = tfidfv.transform(train['document']).toarray()

In [17]:
train_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
test_x = tfidfv.transform(test['document']).toarray()

In [20]:
# 머신러닝 모델 학습
from sklearn.naive_bayes import BernoulliNB

In [21]:
# 나이브베이즈 분류기 객체 선언
nb = BernoulliNB()

In [22]:
nb.fit(train_x, train['label'])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [23]:
pred_y = nb.predict(test_x)

In [24]:
from sklearn.metrics import classification_report

In [26]:
print(classification_report(test['label'], pred_y))

              precision    recall  f1-score   support

           0       0.68      0.78      0.73       234
           1       0.77      0.66      0.71       256

    accuracy                           0.72       490
   macro avg       0.72      0.72      0.72       490
weighted avg       0.73      0.72      0.72       490

