In [1]:
""" 데이터 정제 및 토큰화 """
import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')

def tokenizer(text) : 
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [3]:
""" 문서를 하나씩 읽어서 반환 """

def stream_docs(path) :
    with open(path, 'r', encoding = 'utf-8') as csv : 
        next(csv)
        # 헤더 넘기기
        for line in csv : 
            text, label = line[:-3], int(line[-2])
            yield text, label

next(stream_docs(path = 'movie_data.csv'))

('"\'Tycus\' is almost as bad as a science fiction film can go.<br /><br />I can hardly find something good to say about this film. The premises are completely wrong. A comet is supposed to hit the Moon and cause catastrophic damage to Earth, but nobody believes the scientist who predicts this.A whole underground city plus a launching pad for nuclear armed rockets is build in the California mountains without anybody noticing. When the comet nears Earth the news make it to the TV and newspapers hardly a day before the event. And so on, and so on ...<br /><br />Neither does any kind of emotion make it to the screen. Is the genius who discovers the comet and builds the underground city a savior of humanity or a beast? The director or Dennis Hooper who is playing the role did not seem to decide until the film was done, and actually it does not make any difference because acting and directing is so confusing that you end by wondering what does this film try to say. The special effects are s

In [4]:
""" 지정한 만큼 문서를 반환 """

def get_minibatch(doc_stream, size) : 
    docs, y = [], []
    try : 
        for _ in range(size) : 
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration : 
        pass
    return docs, y

In [5]:
""" 데이터 종류에 상관없는 HashingVectorizer"""
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error = 'ignore',
                        n_features = 2**21,
                         # 해시 충돌 가능성을 줄임 => 로지스틱 회귀 모델의 가중치 개수도 늘어남
                        preprocessor = None,
                        tokenizer = tokenizer)
clf = SGDClassifier(loss = 'log', random_state = 1, max_iter = 1)
doc_stream = stream_docs(path = 'movie_data.csv')

In [6]:
""" 외부 메모리 학습 """
import pyprind

pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])

for _ in range(45) : 
    X_train, y_train = get_minibatch(doc_stream, size = 1000)
    if not X_train : 
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes = classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:22


In [16]:
""" 모델 테스트 하기 """

X_test, y_test = get_minibatch(doc_stream, size = 5000)
X_test = vect.transform(X_test)
print('정확도: %.3f' % clf.score(X_test, y_test))

clf.particial_fit(X_test, y_test)

ValueError: Cannot vectorize empty sequence.