## 本日課程-文字預處理，部分內容前面章節可能提過，這裡會將前處理所需技巧串起

In [1]:
import numpy as np 
import pandas as pd
import nltk

In [2]:
# Load the movie-review CSV. header=None tells pandas the file has no header
# row, so the column names are supplied explicitly through `names`.
dataset = pd.read_csv(
    'movie_feedback.csv',
    header=None,
    names=['feedback', 'label'],
    encoding='Big5',
)
dataset

Unnamed: 0,feedback,label
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
10657,a terrible movie that some people will neverth...,0
10658,there are many definitions of 'time waster' bu...,0
10659,"as it stands , crocodile hunter has the hurrie...",0
10660,the thing looks like a made-for-home-video qui...,0


In [3]:
# Pull the review texts out of the 'feedback' column as an array of values.
X = dataset['feedback'].values
# Show the first raw review as a reference point for the cleaning steps below.
print('review before preprocessing : {}'.format(X[0]))

review before preprocessing : the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 


## 運用re.sub去除部分字元

In [4]:
import re
# Every character that is not an ASCII letter (digits, punctuation, quotes...)
# is replaced by a single space.
non_letters = re.compile('[^a-zA-Z]')
review = non_letters.sub(' ', X[0])
print('review after re.sub : {}'.format(review))

review after re.sub : the rock is destined to be the   st century s new   conan   and that he s going to make a splash even greater than arnold schwarzenegger   jean claud van damme or steven segal   


## 將所有字母轉為小寫：因為在大部分情境下，區分大小寫並不能提供額外訊息。如同 CV 中顏色無法提供額外訊息時，我們會將圖像轉為灰階，藉此降低複雜度

In [5]:
# Convert the whole review to lower case so that words differing only by
# capitalisation end up as the same token.
review = review.lower()
print('review after lower : {}'.format(review))

review after lower : the rock is destined to be the   st century s new   conan   and that he s going to make a splash even greater than arnold schwarzenegger   jean claud van damme or steven segal   


## 斷詞

In [6]:
import nltk
# Split the review on whitespace to get its individual words.
# The result is only printed here; `review` itself is not rebound in this cell.
print('review after split : {}'.format(review.split()))

review after split : ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', 'st', 'century', 's', 'new', 'conan', 'and', 'that', 'he', 's', 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'or', 'steven', 'segal']


* tokenize 相較於 split 會是更好的選擇，例如 split 無法把「word.」這類字尾緊接標點的情況切開

In [7]:
# Tokenize with NLTK; from here on `review` is a list of tokens instead of a
# single string, so this cell cannot simply be re-run on its own output.
# NOTE(review): word_tokenize needs NLTK's tokenizer data (e.g. 'punkt') to be
# installed; no download call for it is visible in this notebook - confirm.
review = nltk.word_tokenize(review)
print('review after tokenize : {}'.format(review))

review after tokenize : ['the', 'rock', 'is', 'destined', 'to', 'be', 'the', 'st', 'century', 's', 'new', 'conan', 'and', 'that', 'he', 's', 'going', 'to', 'make', 'a', 'splash', 'even', 'greater', 'than', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'or', 'steven', 'segal']


## stopwords: 移除贅字，此步驟為前處理的重要步驟之一，過多的贅字不僅無法提供更多訊息，還會干擾到模型的訓練

In [8]:
# Download NLTK's 'stopwords' package: a ready-made word list that is used
# below to filter unwanted words out of the review.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\angel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
from nltk.corpus import stopwords
# Build the English stop-word set once. The previous version rebuilt the list
# and the set inside the comprehension condition, i.e. once per token.
english_stopwords = set(stopwords.words('english'))
# Keep only the tokens that are not stop words.
review = [word for word in review if word not in english_stopwords]
print('review after removeing stopwords : {}'.format(review))

review after removeing stopwords : ['rock', 'destined', 'st', 'century', 'new', 'conan', 'going', 'make', 'splash', 'even', 'greater', 'arnold', 'schwarzenegger', 'jean', 'claud', 'van', 'damme', 'steven', 'segal']


## Stemming: 詞幹提取
 * ex. loves,loved都變成love
 * 中文沒有詞幹提取的需求

In [10]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
# Reduce every token to its Porter stem, keeping the original token order.
review = list(map(ps.stem, review))
print('review after stemming : {}'.format(review))

review after stemming : ['rock', 'destin', 'st', 'centuri', 'new', 'conan', 'go', 'make', 'splash', 'even', 'greater', 'arnold', 'schwarzenegg', 'jean', 'claud', 'van', 'damm', 'steven', 'segal']


## 練習清理所有的句子

In [17]:
# Take the raw review texts from the 'feedback' column again; the cells below
# clean every review, not just the first one.
X = dataset['feedback'].values
# Number of reviews to process.
len(X)

10662

In [18]:
# Clean every review and collect the results in `corpus` (one string per review).
corpus = []
# A single stemmer is reused for all reviews instead of creating a new one on
# every loop iteration.
ps = PorterStemmer()
for raw_review in X:
    # Replace everything that is not an ASCII letter with a space.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    review = review.split()
    # Stop words are deliberately NOT removed here: many negations in the
    # reviews would be dropped, e.g. "isn't good" would become "good".
    review = [ps.stem(word) for word in review]
    # Re-join the stems into one space-separated string for the vectorizer.
    review = ' '.join(review)
    corpus.append(review)

corpus[:10]

['the rock is destin to be the st centuri s new conan and that he s go to make a splash even greater than arnold schwarzenegg jean claud van damm or steven segal',
 'the gorgeous elabor continu of the lord of the ring trilog is so huge that a column of word cannot adequ describ co writer director peter jackson s expand vision of j r r tolkien s middl earth',
 'effect but too tepid biopic',
 'if you sometim like to go to the movi to have fun wasabi is a good place to start',
 'emerg as someth rare an issu movi that s so honest and keenli observ that it doesn t feel like one',
 'the film provid some great insight into the neurot mindset of all comic even those who have reach the absolut top of the game',
 'offer that rare combin of entertain and educ',
 'perhap no pictur ever made ha more liter show that the road to hell is pave with good intent',
 'steer turn in a snappi screenplay that curl at the edg it s so clever you want to hate it but he somehow pull it off',
 'take care of my cat

## 轉bag-of-words vector

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
# Build the bag-of-words model (the vectorizer tokenizes each cleaned review).

# max_features caps the number of columns: only the most frequent terms in the
# corpus are kept as features.
cv = CountVectorizer(max_features=1500)
# fit_transform produces a sparse count matrix; toarray() turns it into a
# dense array, which consists mostly of zeros.
bow_counts = cv.fit_transform(corpus)
X_ = bow_counts.toarray()
Y_ = dataset['label'].values


## 選擇練習: 將處理好數據放入 naive_bayes模型，並預測評論為正向或負面，詳細原理之後章節會解釋。

## Training

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for testing. random_state pins the shuffle so a
# Restart & Run All reproduces the same split and therefore the same model.
X_train, X_test, y_train, y_test = train_test_split(
    X_, Y_, test_size=0.1, random_state=42
)

# No feature scaling is applied; the raw word counts are fed to the model.

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)


GaussianNB()

## Inference

In [21]:
message = 'I really like this movie !!'
# The input must go through the same preprocessing as the training corpus:
# letters only -> lower case -> whitespace split -> Porter stemming -> re-join.
ps = PorterStemmer()
tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
review = ' '.join(ps.stem(token) for token in tokens)
# Vectorize with the already-fitted CountVectorizer (transform only, no refit).
input_ = cv.transform([review]).toarray()
prediction = classifier.predict(input_)
prediction  # 1 means a positive review

array([1], dtype=int64)

In [22]:
message = 'A terrible movie  !!'
# Same cleaning steps as for the training corpus: keep letters only,
# lower-case, split into words, stem each word, then join back into a string.
letters_only = re.sub('[^a-zA-Z]',' ',message)
ps = PorterStemmer()
stems = [ps.stem(word) for word in letters_only.lower().split()]
review = ' '.join(stems)
# Reuse the fitted vectorizer so the features line up with the training columns.
input_ = cv.transform([review]).toarray()

prediction = classifier.predict(input_)
prediction  # 0 means a negative review

array([0], dtype=int64)