In [1]:
import spacy 
import nltk
print("spacy 버전", spacy.__version__)
print("nltk 버전", nltk.__version__)

spacy 버전 3.3.1
nltk 버전 3.7


In [1]:
import spacy 
import nltk

en_nlp = spacy.load('en_core_web_sm')
stemmer = nltk.stem.PorterStemmer()

In [2]:
def compare_normalization(doc):
    doc_spacy = en_nlp(doc)
    print("표제어:")
    print([token.lemma_ for token in doc_spacy])
    print("어간:")
    print([stemmer.stem(token.norm_.lower()) for token in doc_spacy])

In [4]:
compare_normalization(u"Our meeting today was worse than yesterday,"
                        "I'm scared of meeting the clients tomorrow.")

표제어:
['our', 'meeting', 'today', 'be', 'bad', 'than', 'yesterday', ',', "i'm", 'scared', 'of', 'meet', 'the', 'client', 'tomorrow', '.']
어간:
['our', 'meet', 'today', 'wa', 'wors', 'than', 'yesterday', ',', "i'm", 'scare', 'of', 'meet', 'the', 'client', 'tomorrow', '.']


In [6]:
en_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
from sklearn.feature_extraction.text import CountVectorizer
def custom_tokenizer(document):
    doc_spacy = en_nlp(document)
    return [token.lemma_ for token in doc_spacy]

lemma_vect = CountVectorizer(tokenizer=custom_tokenizer, min_df=5)

In [8]:
from sklearn.datasets import load_files

reviews_train = load_files("data/aclImdb/train/")
text_train, y_train = reviews_train.data, reviews_train.target
print("text_train의 타입:", type(text_train))
print("text_train의 길이:", len(text_train))
print("text_train[6]:\n", text_train[6])
]
text_train = [doc.replace(b"<br />", b" ") for doc in text_train]

X_train_lemma = lemma_vect.fit_transform(text_train)
print("X_train_lemma.shape:", X_train_lemma.shape)

vect = CountVectorizer(min_df=5).fit(text_train)
X_train = vect.transform(text_train)
print("X_train.shape:", X_train.shape)


text_train의 타입: <class 'list'>
text_train의 길이: 25000
text_train[6]:
 b"This movie has a special way of telling the story, at first i found it rather odd as it jumped through time and I had no idea whats happening.<br /><br />Anyway the story line was although simple, but still very real and touching. You met someone the first time, you fell in love completely, but broke up at last and promoted a deadly agony. Who hasn't go through this? but we will never forget this kind of pain in our life. <br /><br />I would say i am rather touched as two actor has shown great performance in showing the love between the characters. I just wish that the story could be a happy ending."
X_train_lemma.shape: (25000, 21887)
X_train.shape: (25000, 27271)


In [9]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_grid = {'C':[0.001, 0.01, 0.1, 1, 10]}
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.99, train_size=0.01, random_state=0)
grid = GridSearchCV(LogisticRegression(max_iter=5000), param_grid, cv=cv)

grid.fit(X_train, y_train)
print("최상의 교차 검증 점수"
    "(기본 CountVectorizer): {:.3f}".format(grid.best_score_))

grid.fit(X_train_lemma, y_train)
print("최상의 교차 검증 점수"
        "(표제어): {:.3f}".format(grid.best_score_))

최상의 교차 검증 점수(기본 CountVectorizer): 0.719
최상의 교차 검증 점수(표제어): 0.717


# KONLPY 를 사용한 영화리뷰 분석

In [11]:
import pandas as pd
df_train = pd.read_csv('data/ratings_train.txt', delimiter='\t', keep_default_na=False)
df_train.head(n=3)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [13]:
text_train, y_train = df_train['document'].values, df_train['label'].values
df_test = pd.read_csv('data/ratings_test.txt', delimiter='\t', keep_default_na=False)
text_test = df_test['document'].values
y_test = df_test['label'].values

In [15]:
import numpy as np
len(text_train), np.bincount(y_train)

(150000, array([75173, 74827], dtype=int64))

In [16]:
len(text_test), np.bincount(y_test)

(50000, array([24827, 25173], dtype=int64))

In [18]:
from konlpy.tag import Okt
okt_tag = Okt()

In [19]:
def okt_tokenizer(text):
    return okt_tag.morphs(text)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

okt_param_grid = {'tfidfvetorizer__min_df':[3,5,7], 'tfidfvectorizer__ngram_range':[(1,1),(1,2),(1,3)], 'logisticregression__C':[0.1,1,10]}
okt_pipe = make_pipeline(TfidfVectorizer(tokenizer=okt_tokenizer), LogisticRegression(solver='liblinear'))

okt_grid = GridSearchCV(okt_pipe, okt_param_grid, cv=3)

okt_grid.fit(text_train[0:1000], y_train[0:1000])
print("최상의 교차 검증 점수:{:.3f}".format(okt_grid.best_score_))
print("최적의 교차 검증 매개변수:", okt_grid.best_params_)

ValueError: Invalid parameter 'tfidfvetorizer' for estimator Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(tokenizer=<function okt_tokenizer at 0x000001A18144E940>)),
                ('logisticregression', LogisticRegression(solver='liblinear'))]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [26]:
X_test_okt = okt_grid.best_estimator_.named_steps['tfidfvectorizer'].transform(text_test)
score = okt_grid.best_estimator_.named_steps['LogisticRegression'].score(X_test_okt, y_test)
print('테스트 세트 점수:{:.3f}'.format(score))

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'