In [3]:
""" 영화 리뷰 불러오기 """
import pandas as pd

# Read the movie-review CSV (decoded as UTF-8) into a DataFrame.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

# Preview the first five rows (head() defaults to n=5) as the cell's output.
df.head()

Unnamed: 0,review,sentiment
0,'Tycus' is almost as bad as a science fiction ...,0
1,The Beguiled is a pretty satisfying film for t...,1
2,"The 3-D featured in ""The Man Who Wasn't There""...",0
3,"I haven't seen a lot of episodes of ""Family Gu...",0
4,Although the beginning suggests All Quiet on t...,0


In [4]:
""" 함수 정의 및 데이터 분할 """
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfTransformer
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus if needed, then load the English list.
nltk.download('stopwords')
stop = stopwords.words('english')

# Single stemmer instance reused by tokenizer_porter for every token.
porter = PorterStemmer()

# TF-IDF transformer with IDF weighting, IDF smoothing and L2 normalization.
tfidf = TfidfTransformer(norm='l2',
                         smooth_idf=True,
                         use_idf=True)

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the whitespace-separated substrings of ``text``.
    """
    tokens = text.split()
    return tokens

def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem every token.

    Each token is passed through the module-level ``porter`` stemmer.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one stemmed token per whitespace-separated word, in order.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

# Number of leading rows used for training; every later row is held out for testing.
N_TRAIN = 25000

# Slice by position with .iloc, whose end bound is exclusive. Label-based .loc
# slices include BOTH endpoints, so df.loc[:25000] and df.loc[25000:] would each
# contain row 25000 and leak one test sample into the training set.
X_train = df['review'].iloc[:N_TRAIN].values
y_train = df['sentiment'].iloc[:N_TRAIN].values
X_test = df['review'].iloc[N_TRAIN:].values
y_test = df['sentiment'].iloc[N_TRAIN:].values

# Sanity check: the two partitions are disjoint and together cover every row.
assert len(X_train) + len(X_test) == len(df)
assert len(y_train) + len(y_test) == len(df)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Artyrie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
""" 5겹 교차 검증으로 로지스틱 회귀 매개변수 조합 찾기 """
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

# Vectorizer for the review text: no accent stripping, no lowercasing and no
# custom preprocessor, so the text reaches the tokenizer as given.
tfidf = TfidfVectorizer(preprocessor=None,
                        lowercase=False,
                        strip_accents=None)

# Options searched in both sub-grids: unigrams only, with/without stop-word
# removal, plain vs. Porter-stemming tokenizer, L1/L2 penalty, three C values.
_common_grid = {
    'vect__ngram_range': [(1, 1)],
    'vect__stop_words': [stop, None],
    'vect__tokenizer': [tokenizer, tokenizer_porter],
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [1.0, 10.0, 100.0],
}

param_grid = [
    # Sub-grid 1: use_idf / norm stay as configured on the vectorizer itself.
    dict(_common_grid),
    # Sub-grid 2: same options with IDF weighting and normalization turned off.
    dict(_common_grid, vect__use_idf=[False], vect__norm=[None]),
]

# Two-step pipeline: vectorize the text ('vect'), then classify it ('clf')
# with a liblinear logistic regression seeded for reproducibility.
log_reg = LogisticRegression(random_state=0, solver='liblinear')
lr_tfidf = Pipeline(steps=[('vect', tfidf), ('clf', log_reg)])

# 5-fold cross-validated grid search over param_grid, scored by accuracy,
# with n_jobs=-1 for parallel fitting and verbose=1 for progress messages.
gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
gs_lr_tfidf.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score.
best_params = gs_lr_tfidf.best_params_
print('최적의 매개변수 조합 : %s' % best_params)

cv_accuracy = gs_lr_tfidf.best_score_
print('CV 정확도 : %.3f' % cv_accuracy)

# Evaluate the best estimator found by the search on the held-out test split.
clf = gs_lr_tfidf.best_estimator_
test_accuracy = clf.score(X_test, y_test)
print('테스트 정확도 : %.3f ' % test_accuracy)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed: 35.3min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 46.1min finished


최적의 매개변수 조합 : {'clf__C': 100.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x000002486CF23730>}
CV 정확도 : 0.888
테스트 정확도 : 0.890 


In [14]:
# Hard-coded record of the grid-search results (values copied from the search
# cell's printed output), presumably kept so they remain visible without
# repeating the roughly 46-minute search.
summary_parts = (
    'result : \nclf__C : 100.0\nclf__penalty : l2\nvect__ngram_range : (1, 1)',
    'vect__stop_words : None\nvect__tokenizer : <function tokenizer at 0x000002486CF23730',
    'CV 정확도 : 0.888',
    '테스트 정확도 : 0.890',
    '소모시간 : 6.8m, 35.3m, 46.1m',
)

# One print call; sep='\n' puts each part on its own line, as separate prints would.
print(*summary_parts, sep='\n')

result : 
clf__C : 100.0
clf__penalty : l2
vect__ngram_range : (1, 1)
vect__stop_words : None
vect__tokenizer : <function tokenizer at 0x000002486CF23730
CV 정확도 : 0.888
테스트 정확도 : 0.890
소모시간 : 6.8m, 35.3m, 46.1m
