# 영화 리뷰 쿼리 예제

In [1]:
# import files
import os
import numpy as np
# get titles 영화 정보에 대한 사전 만듬
from bs4 import BeautifulSoup
moviehtmldir='./machine_learning_for_the_web-master/chapter_4/movie/' # 제목을 파싱
moviedict = {} # 사전을 만듬
for filename in [f for f in os.listdir(moviehtmldir) if f[0]!='.']:
    id = filename.split('.')[0]
    
    #f = open(moviehtmldir+'/'+filename, encoding='ISO-8859-1')
    f = open(moviehtmldir+'/'+filename, encoding='ISO-8859-1')
    #parsed_html = BeautifulSoup(f.read(), "lxml")
    parsed_html = BeautifulSoup(f.read(), 'html.parser')
    try:
        title = parsed_html.body.h1.text
    except:
        title = 'none'
    moviedict[id] = title

In [2]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
tknzr = WordPunctTokenizer()
nltk.download('stopwards')

[nltk_data] Error loading stopwards: Package 'stopwards' not found in
[nltk_data]     index


False

In [3]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
tknzr = WordPunctTokenizer()
nltk.download('stopwords')
stoplist = stopwords.words('english')
print(stoplist[:20],'...')

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer() # 어간 추출 
def ListDocs(dirname):
    docs = []
    titles = []
    for filename in [f for f in os.listdir(dirname) if str(f)[0]!='.']:
        f = open(dirname+'/'+filename, 'r')
        id = filename.split('.')[0].split('_')[1]
        titles.append(moviedict[id])
        docs.append(f.read())
    return docs, titles

dir='./machine_learning_for_the_web-master/chapter_4/review_polarity/txt_sentoken/'
pos_textreviews, pos_tiltes = ListDocs(dir+'pos/') # 긍정리뷰
neg_textreviews, neg_tiltes = ListDocs(dir+'neg/') # 부정리뷰
tot_textreviews = pos_textreviews+neg_textreviews # 2000개의 리뷰 저장 리스트
tot_titles = pos_tiltes+neg_tiltes # 제목 저장

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jhee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers'] ...


## TF-IDF

In [4]:
#test tf-idf 모델
from sklearn.feature_extraction.text import TfidfVectorizer

# 문서 사전처리, 불용어제거, 토큰화, 스테밍 실행 
def PreprocessTfidf(texts, stoplist=[],stem=False):
    newtexts = []
    for text in texts:
        if stem:
            tmp = [w for w in tknzr.tokenize(text) if w not in stoplist]
        else:
            tmp = [stemmer.tem(w) for w in [w for w in tknzr.tokenize(text) if w not in soplist]]
        newtexts.append(' '.join(tmp))
    return newtexts

vectorizer = TfidfVectorizer(min_df=1)
processed_reviews = PreprocessTfidf(tot_textreviews, stoplist, True)
mod_tfidf = vectorizer.fit(processed_reviews)
vec_tfidf = mod_tfidf.transform(processed_reviews)
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)) # 사전을 만듬
print(vectorizer.idf_)
print(vectorizer.get_feature_names()[:10], "...")

[ 6.40417776  4.06880284  7.90825515 ...,  7.50279005  7.90825515
  7.90825515]
['00', '000', '0009f', '007', '00s', '03', '04', '05', '05425', '10'] ...


In [5]:
# dump tf-idf into file
import _pickle as pickle

print(len(processed_reviews),'--',len(mod_tfidf.get_feature_names()))
v = mod_tfidf.transform(processed_reviews)

# vectorizer.pk 파일로 덤프 뜨기
with open('vectorizer.pk', 'wb') as fin:
    pickle.dump(mod_tfidf, fin)
    
# 확인 
file = open("vectorizer.pk",'rb')
load_tfidf =  pickle.load(file)
        
print(load_tfidf.transform(PreprocessTfidf([' '.join(['drama'])],stoplist,True)))

2000 -- 39516
  (0, 10607)	1.0


## LSA

In [6]:
import gensim
from gensim import models

# 문서를 사전 처리
class GenSimCorpus(object):
    def __init__(self, texts, stoplist=[],stem=False):
        self.texts = texts
        self.stoplist = stoplist
        self.stem = stem
        self.dictionary = gensim.corpora.Dictionary(self.iter_docs(texts, stoplist))
               
    def __len__(self):
        return len(self.texts)
    def __iter__(self):
        for tokens in self.iter_docs(self.texts, self.stoplist):
            yield self.dictionary.doc2bow(tokens)
    def iter_docs(self,texts, stoplist):
        for text in texts:
            if self.stem:
                yield (stemmer.stem(w) for w in [x for x in tknzr.tokenize(text) if x not in stoplist])
            else:
                yield (x for x in tknzr.tokenize(text) if x not in stoplist)
print(len(tot_textreviews),len(stoplist))
corpus = GenSimCorpus(tot_textreviews, stoplist) # stem 사용 안함 원랜 True 
dict_corpus = corpus.dictionary
ntopics = 10 # 잠재 차원 10
# 모델이 읽을수 있는 형식으로 변환
lsi = models.LsiModel(corpus, num_topics=ntopics, id2word=dict_corpus)



2000 153


In [7]:
# yield 키워드....generator를 만들 수 있다.

In [8]:
# lsi 객체로 쿼리 잠재 공간으로 변환에 쓰일 U,V,S 행렬 만듬
U = lsi.projection.u
# np.eye 대각행렬
Sigma = np.eye(ntopics)*lsi.projection.s

# calculate V
# dict_corpus 단어 색인 dict_words
V = gensim.matutils.corpus2dense(lsi[corpus], len(lsi.projection.s)).T / lsi.projection.s
dict_words = {}
for i in range(len(dict_corpus)):
    dict_words[dict_corpus[i]] = i

## Doc2Vec

In [9]:
# genism 의 Doc2Vec 보델이 다룰수 있도록 데이터 형식을 맞춰줌
from collections import namedtuple

# 사전처리
def PreprocessDoc2Vec(text,stop=[],stem=False):
    words = tknzr.tokenize(text)
    if stem:
       words_clean = [stemmer.stem(w) for w in [i.lower() for i in words if i not in stop]]
    else:
       words_clean = [i.lower() for i in words if i not in stop]
    return words_clean

# namedtuple 에 각 리뷰를 담음 
Review = namedtuple('Review','words tags')
dir = './machine_learning_for_the_web-master/chapter_4/review_polarity/txt_sentoken/'
do2vecstem = False

reviews_pos = []
cnt = 0
# 사전 처리된 단어와 파일명으로 된 태그로 Review 객체를 구성
for filename in [f for f in os.listdir(dir+'pos/') if str(f)[0]!='.']:
    f = open(dir+'pos/'+filename,'r')
    reviews_pos.append(Review(PreprocessDoc2Vec(f.read(),stoplist,do2vecstem),['pos_'+str(cnt)]))
    cnt+=1
    
reviews_neg = []
cnt= 0
for filename in [f for f in os.listdir(dir+'neg/') if str(f)[0]!='.']:
    f = open(dir+'neg/'+filename,'r')
    reviews_neg.append(Review(PreprocessDoc2Vec(f.read(),stoplist,do2vecstem),['neg_'+str(cnt)]))
    cnt+=1

tot_reviews = reviews_pos + reviews_neg

# 일반적으로 stemmer 를 적용하지 않은 모델이 더좋게 나온다.

In [12]:
# Doc2Vec 모델 생성
from gensim.models import Doc2Vec
import multiprocessing

# DM 아키텍처 구성(dm=1), 은닉계층(size=vec_size), 윈도우크기(window=10 단어), 
# 최소 한번이상 나타난 단어는 모델에 고려 min_count=1
# negative 부정 샘플링, 
# hs 계층적 소프트맥스
cores = multiprocessing.cpu_count()
vec_size = 500
model_d2v = Doc2Vec(dm=1, dm_concat=0, size=vec_size, window=10, 
                    negative=0, hs=0, min_count=1, workers=cores)

#build vocab
model_d2v.build_vocab(tot_reviews)

#train
# 훈련은 20 epoch 동안 진행
# 학습률 0.00
numepochs = 20
for epoch in range(numepochs):
    try:
        print('epoch %d' % (epoch))
        model_d2v.train(tot_reviews,total_examples=vec_size, epochs=numepochs )
        model_d2v.alpha *= 0.99
        model_d2v.min_alpha = model_d2v.alpha
    except (KeyboardInterrupt, SystemExit):
        break

epoch 0
epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9
epoch 10
epoch 11
epoch 12
epoch 13
epoch 14
epoch 15
epoch 16
epoch 17
epoch 18
epoch 19


## 검증

In [13]:
# 쿼리 작성 단어목록
query = ['science','future','action']

### TF-IDF

In [14]:
# 가장 비슷한 웹 페이지 다섯 개를 반환하는 스크립트 
#sparse matrix so the metrics transform into regular vectors before computing cosine
from sklearn.metrics.pairwise import cosine_similarity

# sparse matrix 포맷을 사용해 데이터 저장
query_vec = mod_tfidf.transform(PreprocessTfidf([' '.join(query)], stoplist, True))

# cosine_similarity 벡터를 정규 벡터로 변환 -> 코사인 유사도계산
sims= cosine_similarity(query_vec,vec_tfidf)[0]
indxs_sims = sims.argsort()[::-1]
for d in list(indxs_sims)[:5]:
    print('sim: ', sims[d], ' title: ', tot_titles[d])

sim:  0.177948650457  title:  No Telling (1991)
sim:  0.177821146567  title:  Total Recall (1990)
sim:  0.173783798661  title:  Time Machine, The (1960)
sim:  0.163031796224  title:  Bicentennial Man (1999)
sim:  0.160582512878  title:  Andromeda Strain, The (1971)


### LSA

In [15]:
# 쿼리를 LSA 의 qk로 변환하고 다섯개의 유사한 웹 페이지 출력
#LSA query
def TransformWordsListtoQueryVec(wordslist,dict_words,stem=False):
    q = np.zeros(len(dict_words.keys()))
    for w in wordslist:
        if stem:
            q[dict_words[stemmer.stem(w)]]=1.
        else:
            q[dict_words[w]] = 1.
    return q

q = TransformWordsListtoQueryVec(query,dict_words) # stemm true

qk =   np.dot(np.dot(q,U),Sigma)

sims = np.zeros(len(tot_textreviews))
for d in range(len(V)):
    sims[d]=np.dot(qk,V[d])
indxs_sims = np.argsort(sims)[::-1]  
for d in list(indxs_sims)[:5]:
    print('sim:',sims[d],' doc:',tot_titles[d])

sim: 3.32004078148  doc: Star Wars: Episode I - The Phantom Menace (1999)
sim: 3.20984088836  doc: Rocky Horror Picture Show, The (1975)
sim: 3.09025340245  doc: Alien³ (1992)
sim: 2.78900587052  doc: Starship Troopers (1997)
sim: 2.66131573666  doc: Wild Things (1998)


### doc2vec

In [37]:
# doc2vec 모델에서 infer_vector 함수로 쿼리 목록을 벡터로 변환
# most_similar 함수로 가장 유사한 리뷰를 찾음
# random 파라미터는 최적화 방식에서 정해진 결과를 얻기위해 고정 값 사용

#force inference to get the same result
model_d2v.random = np.random.RandomState(1)
query_docvec = model_d2v.infer_vector(PreprocessDoc2Vec(' '.join(query),stoplist,do2vecstem))

reviews_related = model_d2v.docvecs.most_similar([query_docvec], topn=5) #model_d2v.docvecs.most_similar([query_docvec], topn=3)
for review in reviews_related:
    print('relevance:',review[1],'  title:',tot_titles[review[0]])


TypeError: list indices must be integers or slices, not str

In [39]:
# TF-IDF 가 고급 알고리즘 LSA와 Doc2Vec 보다 좋은 결과가 나옴
# http://www.cs.cornell.edu/people/pabo/movie-review-data 

# 사후 처리 정보 예제

In [40]:
# 사전처리된 tot_textreviews 리스트 이용
# LDA 가 다른 주제에 대한 리뷰를 수집할 수 있는지 테스트 
# 사전 처리 작업 1, 2, 되야 함

## LDA (Latent Dirichlet Allocation)

In [41]:
import gensim.models
from gensim import models

from nltk.tokenize import RegexpTokenizer
tknzr = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)

from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# 문서를 토큰으로 변환 하고 불용어 제거(다른 토크나이저 사용해봄)
class GenSimCorpus(object):
    def __init__(self, texts, stoplist=[], bestwords=[], stem=False):
        self.texts = texts
        self.stoplist = stoplist
        self.stem = stem
        self.bestwords = bestwords
        self.dictionary = gensim.corpora.Dictionary(self.iter_docs(texts, stoplist))
    
    def __len__(self):
        return len(self.texts)
    def __iter__(self):
        for tokens in self.iter_docs(self.texts, self.stoplist):
            yield self.dictionary.doc2bow(tokens)
    def iter_docs(self, texts, stoplist):
        for text in texts:
            if self.stem:
                yield (stemmer.stem(w) for w in [x for x in tknzr.tokenize(text) if x not in stoplist])
            else:
                if len(self.bestwords) > 0:
                       yield (x for x in tknzr.tokenize(text) if x in self.bestwords)
                else:
                       yield (x for x in tknzr.tokenize(text) if x not in stoplist)            
        
num_topics = 10
corpus = GenSimCorpus(tot_textreviews, stoplist,[],False)
dict_lda = corpus.dictionary
lda = models.LdaModel(corpus, num_topics=num_topics, id2word=dict_lda,passes=10, iterations=50)
print(lda.show_topics(num_topics=num_topics))                       
                       

[(0, '0.000*" " + 0.000*"\n" + 0.000*"\'" + 0.000*"-" + 0.000*"movie" + 0.000*"film" + 0.000*"one" + 0.000*"much" + 0.000*"like" + 0.000*"even"'), (1, '0.000*" " + 0.000*"\n" + 0.000*"\'" + 0.000*"-" + 0.000*"movie" + 0.000*"film" + 0.000*"one" + 0.000*"much" + 0.000*"like" + 0.000*"even"'), (2, '0.334*" " + 0.024*"\n" + 0.016*"&" + 0.009*"=" + 0.006*"nbsp" + 0.005*"files" + 0.005*"-" + 0.005*"x" + 0.005*"\'" + 0.004*"series"'), (3, '0.000*" " + 0.000*"\n" + 0.000*"\'" + 0.000*"-" + 0.000*"film" + 0.000*"movie" + 0.000*"one" + 0.000*"much" + 0.000*"like" + 0.000*"get"'), (4, '0.000*" " + 0.000*"\n" + 0.000*"\'" + 0.000*"-" + 0.000*"film" + 0.000*"movie" + 0.000*"one" + 0.000*"much" + 0.000*"even" + 0.000*"like"'), (5, '0.000*" " + 0.000*"\n" + 0.000*"\'" + 0.000*"-" + 0.000*"film" + 0.000*"movie" + 0.000*"one" + 0.000*"like" + 0.000*"much" + 0.000*"get"'), (6, '0.726*" " + 0.037*"\n" + 0.017*"\'" + 0.009*"-" + 0.003*"film" + 0.002*"one" + 0.002*"movie" + 0.001*"like" + 0.001*"even" + 0

In [42]:
# 출현 빈도가 높은 단어를 걸러냄 1000번보다 크고 3번보다 작게 나타나는 단어 제외
import copy
from six import iteritems # 추가함 iteritems 는 dictionary 에 없음 
out_ids = [tokenid for tokenid, docfreq in iteritems(dict_lda.dfs) if docfreq > 1000 or docfreq < 3 ]
dict_lfq = copy.deepcopy(dict_lda)
dict_lfq.filter_tokens(out_ids)
dict_lfq.compactify()
print(len(dict_lfq))
print(out_ids[:20],'...')
print(dict_lfq)

18480
[1, 8, 24, 26, 41, 45, 48, 51, 52, 70, 73, 75, 96, 98, 114, 115, 120, 123, 129, 131] ...
Dictionary(18480 unique tokens: ['films', 'adapted', 'comic', 'books', 'plenty']...)


In [43]:
# LDA 모델을 10개의 주제에 대해 훈련시킴
# passes : 코퍼스에 대한 훈련 횟수
# 주제별로 나타날 확률이 가장 높은 단어 10개씩 반환
lda_lfq = models.LdaModel(corpus, num_topics=num_topics, id2word=dict_lfq, 
                          passes=10, iterations=50, alpha=0.01, eta=0.01)

# topic2 의 disney, mulan, love, life 는 애니
# topic6 의 action, alien, bad, planet 은 판타니 sci-fi 
for t in range(num_topics):
    print('topic ',t,' words: ', la_lfq.print_topic(t,topn=10))
    print()

IndexError: index 18480 is out of bounds for axis 1 with size 18480

In [44]:
#topics for each doc
# 실제 가장 확률이 높은 주제가 6인 영화를 쿼리 하는 코드
def GenerateDistrArrays(corpus):
    for i, idst in enumerate(corpus[:10]):
        dist_array = np.zeros(num_topics)
        for d in dist:
            dist_array[d[0]] = d[1]
        if dist_array.argmax() == 6:
            print(tot_titles[i])

corpus_lda = lda_lfq[corpus]
GenerateDistrArrays(corpus_lda)

NameError: name 'lda_lfq' is not defined

In [45]:
# 제목이 대부분 sci-fi 와 판다지 영화 
# 주제 공간에서 문서 표현 lda_lfq[corpus] 를 군집화 알고리즘에 적용 (숙제)

## Opinion Mining 

In [46]:
# 데이터 사전처리 
import nltk
import os
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
tknzr = WordPunctTokenizer()

from nltk.tokenize import RegexpTokenizer
tknzr = RegexpTokenizer(r'((?<=[^\w\s])\w(?=[^\w\s])|(\W))+', gaps=True)

nltk.download('stopwords')
stoplist = stopwords.words('english')
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

from collections import namedtuple

def PreprocessReviews(text, stop=[], stem=False):
    #print profile
    words =tknzr.tokenize(text)
    if stem:
        words_clean = [stemmer.stem(w) for w in [i.lower() for i in words if i not in stopwords]]
    else:
        words_clean = [i.lower() for i in words if i not in stop]
    return words_clean

Review = namedtuple('Review', 'woruuds title tags')
dir = './machine_learning_for_the_web-master/chapter_4/review_polarity/txt_sentoken/'
do2vecstem = True
reviews_pos = []
cnt = 0
for filename in [f for f in os.listdir(dir+'pos/') if str(f)[0]!='.']:
    f = open(dir+'pos/'+filename, 'r')
    id = filename.split('.')[0].split('_')[1]
    reviews_pos.append(Review(PreprocessReviews(f.read(), stoplist, do2vecstem),moviedict[id],['pos_'+str(cnt)]))
    cnt+=1
tot_reviews = reviews_pos + reviews_neg

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jhee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


TypeError: argument of type 'WordListCorpusReader' is not iterable

In [48]:
#split in test training sets
# 데이터를 nltk 라이브러리가 처리할 수 있는 방식으로 데이터 분리


def word_features(words):
    return dict([(word, True) for word in words])
negfeatures = [(word_features(r.words), 'neg') for r in reviews_neg]
posfeatures = [(word_features(r.words), 'pos') for r in reviews_pos]
portionpos = int(len(posfeatures)*0.8)
portionneg = int(len(negfeatures)*0.8)
print(portionpos,'-',portionneg)
# training 80 test 20
# 훈련, 테스트 집합별로 튜플 목록을 구성 하거나 또는 
# 문서의 단어가 들어있는 사전과 레이블과 함께 튜플 목록을 구성
trainfeatures = negfeatures[:portionneg] + posfeatures[:portionpos]
print(len(trainfeatures))
testfeatures = negfeatures[portionneg:] + posfeatures[portionpos:]
print(len(testfeatures))
#shuffle(testfeatures)

0 - 800
800
200


In [49]:
# nltk 라이브러리를 NaiveBayesClassifier 이용 다항분포 훈련
from nltk.classify import NaiveBayesClassifier
#training naive bayes 
classifier = NaiveBayesClassifier.train(trainfeatures)

# 오류 체크 
err = 0
print('test on: ',len(testfeatures))
for r in testfeatures:
    sent = classifier.classify(r[0])
    if sent != r[1]:
       err +=1.
print('error rate: ',err/float(len(testfeatures)))

test on:  200
error rate:  0.0


In [50]:
# 바이그램 (연속된 단어의 쌍 : 연어) 계산해 결과를 개선 
# 높은 빈도로 발생하는 바이그램을 찾을 수 있음 

In [51]:
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from random import shuffle

# x²테스트 : 파이계수의 제곱과 바이그램 총 발생건수 N 의 곱
# x²테스트는 전체 코퍼스에서 가장 유익한 단어를 추출하는데 사용
# x²척도로 문서별 500 개의 가장 좋은 바이그램 선택 
#train bigram:
def bigrams_words_features(words, nbigrams=200, measure=BigramAssocMeasures.chi_sq):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(measure, nbigrams)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])


negfeatures = [(bigrams_words_features(r.words,500), 'neg') for r in reviews_neg]
posfeatures = [(bigrams_words_features(r.words,500), 'pos') for r in reviews_pos]
portionpos = int(len(posfeatures)*0.8)
portionneg = int(len(negfeatures)*0.8)
print(portionpos, ' - ', portionneg)
trainfeatures = negfeatures[:portionpos] + posfeatures[:portionneg]
print(len(trainfeatures))
classifier = NaiveBayesClassifier.train(trainfeatures)

##test bigram
testfeatures = negfeatures[portionneg:] + posfeatures[portionpos:]
shuffle(testfeatures)
err = 0
print('test on: ',len(testfeatures))
for r in testfeatures:
    sent = classifier.classify(r[0])
    #print r[1],'-pred: ',sent
    if sent != r[1]:
       err +=1.
print('error rate: ',err/float(len(testfeatures)))

# 단어의 중요도 점수화 : 긍정 혹은 부정 문서의 빈도와 비교
# 예) great : 긍정 리뷰에서 x²이 높고 부정리뷰 낮다면 이단어는 긍정

0  -  800
0


ValueError: A ELE probability distribution must have at least one bin.

In [55]:
# 코퍼스의 가장 중요한 단어 만개는 전체 코퍼스에서 총 빈도와 
# 긍정 및 부정 부분집합에서의 빈도를 계산

import nltk.classify.util, nltk.metrics
tot_poswords = [val for l in [r.words for r in reviews_pos] for val in l]
tot_negwords = [val for l in [r.words for r in reviews_neg] for val in l]
from nltk.probability import FreqDist, ConditionalFreqDist
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
 
for word in tot_poswords:
    word_fd[word.lower()] +=1
    label_word_fd['pos'][word.lower()] +=1
 
for word in tot_negwords:
    word_fd[word.lower()] +=1
    label_word_fd['neg'][word.lower()] +=1
pos_words = len(tot_poswords)
neg_words = len(tot_negwords)

tot_words = pos_words + neg_words
#select the best words in terms of information contained in the two classes pos and neg
word_scores = {}
 
for word, freq in word_fd.iteritems():
    pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                (freq, pos_words), tot_words)
    neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                (freq, neg_words), tot_words)
    word_scores[word] = pos_score + neg_score
print('total: ',len(word_scores))
best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
bestwords = set([w for w, s in best])

SyntaxError: invalid syntax (<ipython-input-55-e09e7da86e8f>, line 32)

In [56]:
# bestwords 의 단어만 니용해 나이브 베이즈에 훈련
#training naive bayes with chi square feature selection of best words
def best_words_features(words):
    return dict([(word, True) for word in words if word in bestwords])

negfeatures = [(best_words_features(r.words), 'neg') for r in reviews_neg]
posfeatures = [(best_words_features(r.words), 'pos') for r in reviews_pos]
portionpos = int(len(posfeatures)*0.8)
portionneg = int(len(negfeatures)*0.8)
print portionpos,'-',portionneg
trainfeatures = negfeatures[:portionpos] + posfeatures[:portionneg]
print len(trainfeatures)
classifier = NaiveBayesClassifier.train(trainfeatures)
##test with feature chi square selection
testfeatures = negfeatures[portionneg:] + posfeatures[portionpos:]
shuffle(testfeatures)
err = 0
print 'test on: ',len(testfeatures)
for r in testfeatures:
    sent = classifier.classify(r[0])
    #print r[1],'-pred: ',sent
    if sent != r[1]:
       err +=1.
print 'error rate: ',err/float(len(testfeatures))

SyntaxError: Missing parentheses in call to 'print' (<ipython-input-56-7ec3359b4041>, line 10)

In [57]:
from gensim.models import Doc2Vec

import multiprocessing

shuffle(tot_reviews)
cores = multiprocessing.cpu_count()
vec_size = 500
model_d2v = Doc2Vec(dm=1, dm_concat=0, size=vec_size, window=5, negative=0, hs=0, min_count=1, workers=cores)

#build vocab
model_d2v.build_vocab(tot_reviews)
#train
numepochs= 20
for epoch in range(numepochs):
    try:
        print 'epoch %d' % (epoch)
        model_d2v.train(tot_reviews)
        model_d2v.alpha *= 0.99
        model_d2v.min_alpha = model_d2v.alpha
    except (KeyboardInterrupt, SystemExit):
        break

SyntaxError: invalid syntax (<ipython-input-57-d141b1738c03>, line 16)

In [58]:
# 데이터가 작아 교차 검증을 해야 한다. (3장..) 숙제

In [59]:
# Doc2Vec 벡터는 분류기를 훈련하기 위해 사용
# Doc2vec 벡터는 이미 훈련해 model_d2v.docvecs 객체 저장 (가정)
# training 80 test 20 

#split train,test sets
trainingsize = 2*int(len(reviews_pos)*0.8)

train_d2v = np.zeros((trainingsize, vec_size))
train_labels = np.zeros(trainingsize)
test_size = len(tot_reviews)-trainingsize
test_d2v = np.zeros((test_size, vec_size))
test_labels = np.zeros(test_size)

cnt_train = 0
cnt_test = 0
for r in reviews_pos:
    name_pos = r.tags[0]
    if int(name_pos.split('_')[1])>= int(trainingsize/2.):
        test_d2v[cnt_test] = model_d2v.docvecs[name_pos]
        test_labels[cnt_test] = 1
        cnt_test +=1
    else:
        train_d2v[cnt_train] = model_d2v.docvecs[name_pos]
        train_labels[cnt_train] = 1
        cnt_train +=1

for r in reviews_neg:
    name_neg = r.tags[0]
    if int(name_neg.split('_')[1])>= int(trainingsize/2.):
        test_d2v[cnt_test] = model_d2v.docvecs[name_neg]
        test_labels[cnt_test] = 0
        cnt_test +=1
    else:
        train_d2v[cnt_train] = model_d2v.docvecs[name_neg]       
        train_labels[cnt_train] = 0
        cnt_train +=1


In [60]:
# SVM 분류기 혹은 로지스틱 회귀 분석모델로도 훈련 가능함
# 훈련 데이터 집합이 작아 정확도가 낮음 
# 데이터 집합이 작으면 신경망 처럼 대규모 파라미터 훈련은 힘듬

#train log regre
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(train_d2v, train_labels)
print 'accuracy:',classifier.score(test_d2v,test_labels)

from sklearn.svm import SVC
clf = SVC()
clf.fit(train_d2v, train_labels)
print 'accuracy:',clf.score(test_d2v,test_labels)


SyntaxError: invalid syntax (<ipython-input-60-6c8d41c98139>, line 9)

In [61]:
#svm linear
clf = SVC(kernel='linear')
clf.fit(train_d2v, train_labels)
print clf.score(test_d2v,test_labels)

SyntaxError: invalid syntax (<ipython-input-61-52a9a951074d>, line 4)