### BOW와 TF IDF 모형을 사용한 자연어 Classification 예측

In [1]:
import numpy as np
import pickle
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
warnings.filterwarnings('ignore')               # Turn the warnings off.
# nltk.download('stopwords')

#### 1. 데이터를 읽어온다.
영화 리뷰 데이터. <br>
- 데이터의 출처는 [여기](https://perun.pmf.uns.ac.rs/radovanovic/dmsem/cd/datasets/text/MovieReviews/Movie%20Review%20Data.htm).<br>
- 2000개의 파일로 이루어진 데이터를 Pickle로 한 개의 파일로 저장해 놓고 사용한다.

In [2]:
# Pickle 되어 있는 데이터를 읽어온다.
with open('../data/data_reviews.pkl','rb') as f:
    reviews = pickle.load(f)
my_docs, y = reviews.data, reviews.target

In [3]:
np.unique(y, return_counts=True)

(array([0, 1]), array([1000, 1000], dtype=int64))

#### 2. 데이터 전처리.

In [5]:
corpus = []
for i in range(0, len(my_docs)):
    review = re.sub(r'\W', ' ', str(my_docs[i]))
    review = review.lower()
    review = re.sub(r'^br$', ' ', review)
    review = re.sub(r'\s+br\s+',' ',review)      
    review = re.sub(r'\s+[a-z]\s+', ' ',review)  
    review = re.sub(r'^b\s+', '', review)             
    review = re.sub(r'\s+', ' ', review)               # 잉여 space 제거.
    review = re.sub(",|\n|@|:", "", review) # 쉼표, \n, @ 제거
    review = re.sub(r'\([^)]*\)', '', review) # 소괄호 제거
    corpus.append(review)  

#### 3. BOW Classification 예측.

#### 3.1. BOW 행렬을 만든다.

In [7]:
BOW = CountVectorizer(max_features = 1000, stop_words = stopwords.words('english'))
BOW.fit(corpus)                                          # BOW 모델 학습. 
X = BOW.transform(corpus).toarray()                      # "transform" 하므로 행렬이 생성됨.
#X = BOW.fit_transform(corpus).toarray()                 # 한번의 스텝으로 처리.
X.shape

(2000, 1000)

#### 3.2. 로지스틱 회귀 예측.

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1234)

In [9]:
# Training.
LR = LogisticRegression()
LR.fit(X_train, y_train)

In [10]:
# Testing.
y_pred = LR.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

In [12]:
# 정확도.
print(np.round(acc,3))

0.785


#### 4. TF IDF Classification 예측.

#### 4.1. TF IDF 행렬을 만든다.

In [13]:
TFIDF = TfidfVectorizer(max_features = 1000, min_df = 2, max_df = 0.6, stop_words = stopwords.words('english'))
TFIDF.fit(corpus)                                          # TF-IDF 모델 학습. 
X = TFIDF.transform(corpus).toarray()                      # "transform" 하므로 행렬이 생성됨.
#X = TFIDF.fit_transform(corpus).toarray()                 # 한번의 스텝으로 처리.
X.shape

(2000, 1000)

#### 4.2. 로지스틱 회귀 예측.

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1234)

In [15]:
# Training.
LR = LogisticRegression()
LR.fit(X_train, y_train)

In [16]:
# Testing.
y_pred = LR.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

In [18]:
# 정확도.
print(np.round(acc,3))

0.808


**NOTE:** TF IDF가 BOW 보다 다소 향상된 성능을 보인다.

#### 5. Doc2Vec 예측.

In [19]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import gensim

In [20]:
my_corpus = pd.DataFrame({'rt':corpus})
my_corpus['num'] = pd.DataFrame(range(len(my_corpus)))
my_corpus

Unnamed: 0,rt,num
0,arnold schwarzenegger has been an icon for act...,0
1,good films are hard to find these days ngreat ...,1
2,quaid stars as man who has taken up the proffe...,2
3,we could paraphrase michelle pfieffer characte...,3
4,kolya is one of the richest films ve seen in s...,4
...,...,...
1995,under any other circumstances would not be dis...,1995
1996,bruce barth mellow piano plays in the backgrou...,1996
1997,man is not man without eight taels of gold nst...,1997
1998,this is film that was inclined to like at the ...,1998


In [21]:
#문장 분절
def nltk_tokenizer(_wd):
    return RegexpTokenizer(r'\w+').tokenize(_wd.lower())

my_corpus['wd'] = my_corpus['rt'].apply(nltk_tokenizer)
my_corpus

Unnamed: 0,rt,num,wd
0,arnold schwarzenegger has been an icon for act...,0,"[arnold, schwarzenegger, has, been, an, icon, ..."
1,good films are hard to find these days ngreat ...,1,"[good, films, are, hard, to, find, these, days..."
2,quaid stars as man who has taken up the proffe...,2,"[quaid, stars, as, man, who, has, taken, up, t..."
3,we could paraphrase michelle pfieffer characte...,3,"[we, could, paraphrase, michelle, pfieffer, ch..."
4,kolya is one of the richest films ve seen in s...,4,"[kolya, is, one, of, the, richest, films, ve, ..."
...,...,...,...
1995,under any other circumstances would not be dis...,1995,"[under, any, other, circumstances, would, not,..."
1996,bruce barth mellow piano plays in the backgrou...,1996,"[bruce, barth, mellow, piano, plays, in, the, ..."
1997,man is not man without eight taels of gold nst...,1997,"[man, is, not, man, without, eight, taels, of,..."
1998,this is film that was inclined to like at the ...,1998,"[this, is, film, that, was, inclined, to, like..."


In [22]:
tokens = [ t for d in my_corpus['wd'] for t in d]
text = nltk.Text(tokens, name='AI_assay')
print(len(text.tokens))
print(len(set(text.tokens)))

1267473
44556


In [23]:
# 단어 토큰화
lower_cnt = int(len(set(text.tokens)) * 0.01) * -1
print(text.vocab().most_common()[:lower_cnt:-1])

[('sutra', 1), ('kama', 1), ('choudhury', 1), ('sarita', 1), ('gekko', 1), ('mover', 1), ('ironing', 1), ('minimalistic', 1), ('motorists', 1), ('nmorse', 1), ('inveigles', 1), ('impatience', 1), ('resignedly', 1), ('poled', 1), ('rafts', 1), ('soong', 1), ('sundry', 1), ('lobsters', 1), ('lighters', 1), ('beekeeping', 1), ('beekeeper', 1), ('npressure', 1), ('nnu', 1), ('tempest', 1), ('barth', 1), ('dateline', 1), ('munchausen', 1), ('negates', 1), ('bamboozled', 1), ('bludgeoned', 1), ('intruded', 1), ('sharpening', 1), ('indigestion', 1), ('repetitiously', 1), ('bloodstains', 1), ('sprinkler', 1), ('exclusivity', 1), ('harridans', 1), ('karnstein', 1), ('conducive', 1), ('nhes', 1), ('somersets', 1), ('nsomersets', 1), ('finchers', 1), ('relentlessness', 1), ('classifying', 1), ('losin', 1), ('toiled', 1), ('fringes', 1), ('hanksian', 1), ('ndowny', 1), ('downy', 1), ('chivalrous', 1), ('gaskell', 1), ('chivalrously', 1), ('toneless', 1), ('writerly', 1), ('bathrobes', 1), ('craggy

In [24]:
doc_test = my_corpus[['num','wd']].values.tolist()

In [25]:
tagged_data = [TaggedDocument(words=_d, tags=[uid]) for uid, _d in doc_test]

In [26]:
# doc2vec 하이퍼 파라미터 튜닝
max_epochs = 11

model = Doc2Vec(
    window=5,                # 문맥을 고려하는 단어의 최대 거리
    vector_size=130,         # 벡터 크기
    alpha=0.025,             # 모델의 학습속도
    min_alpha=0.025,
    workers = 8,             # 학습을 병렬화하기 위한 스레드 수
    min_count=3,             # 최소 단어 빈도
    dm =0,                   # dm = '0': DBOW // dm = '1': DM
    negative = 6,            # 음수 에제의 수를 결정
    seed = 9999)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    model.alpha -= 0.002     # 학습 속도 점차 감소시킴
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")

iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
Model Saved


In [27]:
# 문장에 대한 임베딩 추정
vectors = [model.infer_vector(my_cor.split()) for my_cor in my_corpus['rt']]

In [28]:
X = pd.DataFrame(vectors)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1234)

In [30]:
# Training.
LR = LogisticRegression()
LR.fit(X_train, y_train)

In [31]:
# Testing.
y_pred = LR.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

In [32]:
# 정확도.
print(np.round(acc,3))

0.887


**NOTE:** TF IDF와 BOW 보다 doc2vec이 10% 가까이 향상된 성능을 보인다.