## Data Examination

In [2]:
import pandas as pd

In [3]:
# Options shared by both TSV files:
#   header=0       -> the first line of the file holds the column names
#   delimiter='\t' -> fields are separated by tabs
#   quoting=3      -> csv.QUOTE_NONE: double quotes are kept as ordinary characters
TSV_READ_OPTIONS = dict(header=0, delimiter='\t', quoting=3)

# Training data, including the `sentiment` label.
train = pd.read_csv('./data/word2vec-nlp-tutorial/labeledTrainData.tsv', **TSV_READ_OPTIONS)
# Test data, without the label.
test = pd.read_csv('./data/word2vec-nlp-tutorial/testData.tsv', **TSV_READ_OPTIONS)

# Expect 25000 rows and 3 columns.
train.shape

(25000, 3)

In [4]:
train.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [5]:
#25000 rows and 2 columns(sentiment excluded)
test.shape

(25000, 2)

In [6]:
test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [7]:
train.columns.values

array(['id', 'sentiment', 'review'], dtype=object)

In [8]:
test.columns.values

array(['id', 'review'], dtype=object)

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
id           25000 non-null object
sentiment    25000 non-null int64
review       25000 non-null object
dtypes: int64(1), object(2)
memory usage: 586.0+ KB


In [10]:
train['sentiment'].value_counts()

1    12500
0    12500
Name: sentiment, dtype: int64

In [11]:
# Show only the first 700 characters of the first review.
# The text still contains HTML tags (e.g. <br />), so it needs cleaning.
train['review'][0][:700]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely lik'

## Data Cleaning and Text Preprocessing

To prevent overfitting caused by outliers (abnormal data), **data cleaning** is required.

1. HTML tag removal with BeautifulSoup
2. Replacing non-alphabetic characters with spaces using a regular expression (정규표현식 사용)
3. Stopword(불용어) removal with NLTK
> e.g.) I, my, me와 같이 자주 등장하지만 의미가 많지 않은 데이터
4. 어간추출 with SnowballStemmer

*만약 한국어로 전처리를 하고싶다면?*
[KoNLPy](https://github.com/twitter/twitter-korean-text) 사용!



### 1. HTML Tag Removal

In [12]:
from bs4 import BeautifulSoup

# Before cleaning: the raw text still contains <br /> tags.
print(train['review'][0][:700])
# After cleaning: parse with the same built-in 'html.parser' backend that
# review_to_words() uses, so this demo matches the real pipeline and does not
# need the optional html5lib package.
example1 = BeautifulSoup(train['review'][0], 'html.parser')
example1.get_text()[:700]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely lik


'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyw'

### 2. Replacing non-alphabetical characters with blank

In [13]:
#for regular expression
import re

# Inside [...] a leading ^ negates the set, so this pattern matches every
# character that is not an ASCII letter; each match is replaced by a space.
letters_only = re.sub('[^a-zA-Z]',' ', example1.get_text())
letters_only[:700]

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyw'

In [14]:
# Convert everything to lowercase.
lower_case = letters_only.lower()
# Split the text on whitespace into individual words (tokenization).
words = lower_case.split()
print(len(words))
words[:10]

437


['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with']

### 3. Stopword Removal

Stopwords (e.g. i, me, it, this, that, is, are, to, the) do not contribute much to the actual meaning of a sentence.

Therefore, let's use [NLTK](https://www.nltk.org) since it contains 153 stopwords in 17 languages including English.

*단, 한국어는 아직 없다.*

In [30]:
# The stopword corpus must be available locally before nltk.corpus.stopwords
# is used in the cells below, so the download is an executable step rather
# than a commented-out line.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/brit/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
import nltk
from nltk.corpus import stopwords
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [16]:
# Remove stopwords from the tokens.
# The stopword collection is built once, as a set: evaluating
# stopwords.words('english') inside the comprehension would produce the whole
# list again for every token and test membership with a linear scan.
english_stops = set(stopwords.words('english'))
words = [w for w in words if w not in english_stops]
print(len(words))
words[:10]

219


['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary']

### 4. Stemming(어간추출, 형태소 분석)
[어간추출](https://ko.wikipedia.org/wiki/어간_추출)
> e.g. message, messages, messaging can all be reduced to simply 'message'

**NLTK** also provides this function.

In [17]:
# Stemmer example: print the Porter stem of a few related word forms.
stemmer = nltk.stem.PorterStemmer()
print(stemmer.stem('maximum'))
# Each label names the word that is actually passed to stem().
print("The stemmed form of running is: {}".format(stemmer.stem("running")))
print("The stemmed form of runs is: {}".format(stemmer.stem("runs")))
print("The stemmed form of run is: {}".format(stemmer.stem("run")))

maximum
The stemmed form of runs is: run
The stemmed form of runs is: run
The stemmed form of run is: run


In [18]:
words[:10]

['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary']

In [42]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')
# Keep the stemmed tokens under their own name: `words` stays the unstemmed,
# stopword-free token list, so re-running this cell never stems twice and the
# lemmatization cell further down still receives the original tokens.
stemmed_words = [stemmer.stem(w) for w in words]
# result
stemmed_words[:10]

['stuff',
 'go',
 'moment',
 'mj',
 'start',
 'listen',
 'music',
 'watch',
 'odd',
 'documentari']

### Lemmatization (표제어 추출)
한국어로 예를 들자면, 
1) 배가 맛있다
2) 배를 타는 것은 재밌다.
3) 음식을 잘못먹어서 배가 아프다

 여기서 '배'는 3개의 다른 의미를 가진 단어로 **앞뒤 문맥을 보고 의미를 식별**할 필요가 있다.
 
 Examples in English:
 1) John is right in front of me because I can see him.
 2) John and I used to go out, but he is seeing someone else now. 
 
 The word 'see' can have different meanings in different contexts. Hence, **examining words in context** is essential.

In [45]:
# WordNetLemmatizer needs the WordNet corpus to be available locally, so the
# download is an executable step rather than a commented-out line.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/brit/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [19]:
# Lemmatizer example: lemmatize the singular and the plural form of the same
# word and print both results.
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

print(wordnet_lemmatizer.lemmatize('fly'))
print(wordnet_lemmatizer.lemmatize('flies'))

fly
fly


In [20]:
# Lemmatize into a new list instead of overwriting `words`, so this cell does
# not change its own input and can be re-run safely.
lemmatized_words = [wordnet_lemmatizer.lemmatize(w) for w in words]
# result
lemmatized_words[:10]

['stuff',
 'going',
 'moment',
 'mj',
 'started',
 'listening',
 'music',
 'watching',
 'odd',
 'documentary']

## Let's put it into action now!

In [21]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, built once per process."""
    return frozenset(stopwords.words('english'))


def review_to_words(raw_review):
    """Turn one raw review into a space-separated string of stemmed tokens.

    Steps: strip HTML markup, replace every non-ASCII-letter character with a
    space, lowercase, split on whitespace, drop English stopwords, stem.

    Parameters
    ----------
    raw_review : str
        Review text, possibly containing HTML tags.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces.

    Notes
    -----
    Stemming uses the notebook-level ``stemmer`` object. The notebook assigns
    that name twice (a PorterStemmer and a SnowballStemmer), so the result
    depends on which of those cells ran last.
    """
    # 1. HTML removal
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    # 2. non-alphabetical characters -> blank
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    # 3. every letter to lowercase, then split into tokens
    words = letters_only.lower().split()
    # 4. Stopwords removal; the set is cached so it is not rebuilt per review
    stops = _english_stopwords()
    meaningful_words = [w for w in words if w not in stops]
    # 5. stemming
    stemming_words = [stemmer.stem(w) for w in meaningful_words]
    return ' '.join(stemming_words)

In [22]:
clean_review = review_to_words(train['review'][0])
clean_review

'stuff go moment mj start listen music watch odd documentari watch wiz watch moonwalk mayb want get certain insight guy thought realli cool eighti mayb make mind whether guilti innoc moonwalk part biographi part featur film rememb go see cinema origin releas subtl messag mj feel toward press also obviou messag drug bad kay visual impress cours michael jackson unless remot like mj anyway go hate find bore may call mj egotist consent make movi mj fan would say made fan true realli nice actual featur film bit final start minut exclud smooth crimin sequenc joe pesci convinc psychopath power drug lord want mj dead bad beyond mj overheard plan nah joe pesci charact rant want peopl know suppli drug etc dunno mayb hate mj music lot cool thing like mj turn car robot whole speed demon sequenc also director must patienc saint came film kiddi bad sequenc usual director hate work one kid let alon whole bunch perform complex danc scene bottom line movi peopl like mj one level anoth think peopl stay 

In [23]:
#total number of review data
num_review = train['review'].size
num_review

25000

In [24]:
# 참고 : https://gist.github.com/yong27/7869662
# http://www.racketracer.com/2016/07/06/pandas-in-parallel/

#since it takes way too long to preprocess 25000 data, we use multiprocessing
from multiprocessing import Pool
import numpy as np

def _apply_df(args):
    df, func, kwargs = args
    return df.apply(func, **kwargs)

def apply_by_multiprocessing(df, func, **kwargs):
    """Apply ``func`` to ``df`` in parallel by splitting it into row chunks.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Object whose ``.apply`` is called on each chunk.
    func : callable
        Function handed to ``.apply``. It is sent to the worker processes, so
        it has to be picklable (e.g. a module-level function).
    **kwargs
        Must contain ``workers``: the number of processes and also the number
        of chunks. Every other keyword is forwarded to ``.apply``.

    Returns
    -------
    The per-chunk results combined with ``pd.concat``, in chunk order.

    Raises
    ------
    KeyError
        If ``workers`` is not supplied.
    """
    # Take the `workers` setting out of the keywords; the rest go to `.apply`.
    workers = kwargs.pop('workers')
    # Split by row position and slice with .iloc. This yields the same chunks
    # as np.array_split(df, workers) but does not depend on numpy calling the
    # pandas `swapaxes` method, which pandas has deprecated.
    positions = np.array_split(np.arange(len(df)), workers)
    chunks = [df.iloc[idx] for idx in positions]
    # The context manager shuts the pool down even when a worker raises, so
    # no child processes are left behind.
    with Pool(processes=workers) as pool:
        result = pool.map(_apply_df, [(chunk, func, kwargs) for chunk in chunks])
    # Combine the partial results and return them.
    return pd.concat(list(result))

In [25]:
%time clean_train_reviews = apply_by_multiprocessing(train['review'], review_to_words, workers=4)

CPU times: user 103 ms, sys: 106 ms, total: 209 ms
Wall time: 32.1 s


In [26]:
%time clean_test_reviews = apply_by_multiprocessing(test['review'], review_to_words, workers=4)

CPU times: user 99.3 ms, sys: 111 ms, total: 210 ms
Wall time: 31.3 s


## Feature Extraction using CountVectorizer

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Bag-of-words vectorizer over unigrams to trigrams, capped at 20000 features.
vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
                            stop_words=None, min_df=2, # minimum number of documents a token must appear in
                            ngram_range=(1,3), max_features=20000)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=2,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [34]:
#in order to improve speed
pipeline = Pipeline([('vect', vectorizer)])

%time train_data_features = pipeline.fit_transform(clean_train_reviews)
train_data_features

CPU times: user 25.6 s, sys: 802 ms, total: 26.4 s
Wall time: 26.5 s


<25000x20000 sparse matrix of type '<class 'numpy.int64'>'
	with 2757913 stored elements in Compressed Sparse Row format>

In [35]:
train_data_features.shape
#number of features = 20000

(25000, 20000)

In [36]:
# Newer scikit-learn releases provide get_feature_names_out() in place of
# get_feature_names(); use whichever this installation offers so the cell
# runs on both old and new versions. .tolist() keeps `vocab` a plain list.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
# number of vocab = number of features
print(len(vocab))
vocab[:10]

20000


['aag',
 'aaron',
 'ab',
 'abandon',
 'abbey',
 'abbi',
 'abbot',
 'abbott',
 'abc',
 'abduct']

In [38]:
import numpy as np
dist = np.sum(train_data_features, axis=0)

pd.DataFrame(dist,columns=vocab)

Unnamed: 0,aag,aaron,ab,abandon,abbey,abbi,abbot,abbott,abc,abduct,...,zombi bloodbath,zombi film,zombi flick,zombi movi,zone,zoo,zoom,zorro,zu,zucker
0,26,48,22,288,24,30,29,30,125,55,...,23,52,37,89,161,31,71,59,40,23


## Training & Prediction using Random Forest
  여러개의 decision tree(T/F로 계속 파고드는 트리형태)를 만들어 평균값/투표 실시

In [39]:
from sklearn.ensemble import RandomForestClassifier

# n_estimators = number of trees in the forest
# n_jobs = number of CPU cores to use (-1 means use all of them)
forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=2018)
forest

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=2018, verbose=0,
            warm_start=False)

In [40]:
# X_train : 행렬 / Y_train : 벡터
%time forest = forest.fit(train_data_features, train['sentiment'])

CPU times: user 1min 3s, sys: 246 ms, total: 1min 3s
Wall time: 17.2 s


In [41]:
from sklearn.model_selection import cross_val_score
%time score = np.mean(cross_val_score(forest, train_data_features, \
                                     train['sentiment'], cv=10, scoring='roc_auc'))

CPU times: user 7.49 s, sys: 2.11 s, total: 9.6 s
Wall time: 2min 39s


In [42]:
# vectorizing test data
%time test_data_features = pipeline.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()

CPU times: user 6.67 s, sys: 347 ms, total: 7.02 s
Wall time: 7.34 s


In [43]:
#prediction
result = forest.predict(test_data_features)
result[:10]

array([1, 0, 1, 1, 1, 0, 0, 0, 0, 1])

In [45]:
#save result to dataframe
output = pd.DataFrame(data={"id":test["id"],"sentiment":result})
output.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",1
4,"""12128_7""",1


In [46]:
output_sentiment=output['sentiment'].value_counts()
# Difference in predicted class counts: rows labelled 0 minus rows labelled 1
# (a negative number means more reviews were predicted as 1).
print(output_sentiment[0] - output_sentiment[1])
output_sentiment

-102


1    12551
0    12449
Name: sentiment, dtype: int64

In [48]:
# Put the mean cross-validated ROC AUC in the file name with five decimals.
# The previous spec '{0:5f}' only set a minimum field width of 5, which never
# takes effect on a float that is rendered with six decimals.
output.to_csv('tutorial_1_BOW_{0:.5f}.csv'.format(score), index=False, quoting=3)