NLP(자연어 처리)의 하위분야인 감성분석

### 텍스트 처리용 IMDb 영화리뷰 데이터 준비

#### 영화 리뷰 데이터 셋을 더 간편한 형태로 전처리

In [2]:
import pyprind
import pandas as pd
import os

In [3]:
basepath = 'data/aclImdb'

In [5]:
labels = {'pos' : 1, 'neg' : 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test','train'):
    for l in ('pos','neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path,file),
                      'r',encoding='utf-8') as infile:
                txt= infile.read()
            df= df.append([[txt, labels[l]]],
                         ignore_index = True)
            pbar.update()
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:08


In [6]:
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

In [7]:
df

Unnamed: 0,review,sentiment
11841,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
19602,OK... so... I really like Kris Kristofferson a...,0
45519,"***SPOILER*** Do not read this, if you think a...",0
25747,hi for all the people who have seen this wonde...,1
42642,"I recently bought the DVD, forgetting just how...",0
...,...,...
21243,"OK, lets start with the best. the building. al...",0
45891,The British 'heritage film' industry is out of...,0
42613,I don't even know where to begin on this one. ...,0
43567,Richard Tyler is a little boy who is scared of...,0


In [8]:
df = pd.read_csv('movie_data.csv', encoding='utf-8')
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [9]:
df.shape

(50000, 2)

### BoW

1. 전체 문서에 대해 고유한 토큰, 예를 들어 단어로 이루어진 어휘 사전을 만든다.

2. 특정 문서에 각 단어가 얼마나 자주 등장하는지 헤아려 문서의 특성 벡터를 만든다.

#### 단어를 특성 벡터로 변환

In [22]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining, the weather is sweet, and one and one is two'])
bag = count.fit_transform(docs)

In [23]:
#어휘사전
print(count.vocabulary_)

{'the': 6, 'sun': 4, 'is': 1, 'shining': 3, 'weather': 8, 'sweet': 5, 'and': 0, 'one': 2, 'two': 7}


In [24]:
#x특성백터
print(bag.toarray())

[[0 1 0 1 1 0 1 0 0]
 [0 1 0 0 0 1 1 0 1]
 [2 3 2 1 1 1 2 1 1]]


#### tf=idf
특성 벡터에서 자주 등장하는 단어의 가중치를 낮추는 기법

단어 빈도와 역문서 빈도의 곱으로 정의됨

In [27]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True,
                       norm='l2',
                       smooth_idf=True)
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


#### 텍스트 데이터 정제

In [28]:
df.loc[0, 'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [33]:
#파이썬 정규 표현식re
import re
def preprocessor(text):
    text = re.sub('<[^>]*>', '',text)
    emoticons =re.findall('(?::|;|=)(?:-)?(?:\):\(|D|P)',text)
    text = (re.sub('[\W] +', ' ', text.lower()) +
           ' '.join(emoticons).replace('-',''))
    return text

In [34]:
preprocessor(df.loc[0, 'review'][-50:])

'is seven.title (brazil) not available'

In [36]:
preprocessor("</a>This :) is :( a test : :-)!")

'this : is : a test  :-)!'

#### 문서를 토큰으로 나누기

In [38]:
#공백을 기준으로
def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [39]:
#어간추출
#nltk
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [40]:
#불용어 제거
from nltk.corpus import stopwords

stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

### 문서 분류를 위한 로지스틱 회귀모델 훈련

In [41]:
#훈련 set을 25000 -> 2500으로 축소 하였음
X_train = df.loc[:2500, 'review'].values
y_train = df.loc[:2500, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values


In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(solver='liblinear', random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=1)
gs_lr_tfidf.fit(X_train, y_train)


Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  'stop_words.' % sorted(inconsistent))


KeyboardInterrupt: 

In [None]:
print('촤적의 매개변수 조합: %s' %gs_lr_tfidf.best_params_)

In [None]:
print('CV 정확도 : %.3f'%gs_lr_tfidf.best_score_)
clf =gs_lr_tfidf.best_estimator_
print('테스트 정확도 : %.3f'%clf.score(X_test, y_test))


### 대용량 데이터 처리: 온라인 알고리즘과 외부 메모리 학습

In [None]:
import numpy as np
import re
from nltk.corpus import stopwords
stop = stopwords.words('english')
def tokenizer(text):
    text =re.sub('<[^>]*>', '',text)
    emoticons =re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                          text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) \ + ' '.join(emoticons).replace('-','')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized


In [None]:
#문서를 하나씩 읽어서 반환하는 제너레이터
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)#헤더 넘기기
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text,label
            
    

In [None]:
next(stream_docs([path ='movie_data.csv']))

In [None]:
def get_minibatch(doc_stream, size):
    docs, y = [],[]
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        pass
    return docs, y

In [None]:
from sklearn.featurn_extraction.text import HashingVectorizer
from sklearn.liner_model import SGDClassifier
vect = hashingVectorizer(decode_error = 'ignore',
                        n_features = 2**21,
                        preprocessor=None,
                        tokenizer=tokenizer)
clf = SGDCLassifier(loss='log', random_state=1, max_iter=1)
doc_stream = stream_docs(path ='movie_data.csv')


In [None]:
#외부메모리 학습
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0,1])
for _ in range(45):
    X_train, y_train = get_minbatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train =vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

In [None]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('정확도 : %.3f'%clf.score(X_test, y_test))


In [None]:
clf = clf.partial_fit(X_test, y_test)

# 09. 웹 애플리케이션에 머신러닝 모델 내장
### 학습된 사이킷런 추정기 저장

pickle모듈을 사용하여, 파이썬 객체의 구조를 압축된 바이트코드로 직렬화하고 복원할 수 있다.

In [None]:
import pickle
import os
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
    
pickle.dump(stop,
           open(os.path.join(dest, 'stopwords.pkl'), 'wb'),
           protocol=4)
pickle.dump(clf,
           open(os.path.join(dest,'classifier.pkl'),'wb'),
           protocol=4)


In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
import re
import os
import pickle

cur_dir =os.path.dirname(__file__)
stop = pickle.load(open(os.path.join(cur_dir,
                                     'pkl_objects','stopwords.pkl'), 'rb'))
def tokenizer(text):
    text =re.sub('<[^>]*>', '',text)
    emoticons =re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                          text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) \ + ' '.join(emoticons).replace('-','')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

vect = hashingVectorizer(decode_error = 'ignore',
                        n_features = 2**21,
                        preprocessor=None,
                        tokenizer=tokenizer)