## 本日課程-文字預處理，部分內容前面章節可能提過，這裡會將前處理所需技巧串起

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
# A .tsv file separates its fields with tab characters.
# quoting=3 is csv.QUOTE_NONE, so quote characters inside a review are read as plain text.
reviews_path = './datasets/Restaurant_Reviews.tsv'
dataset = pd.read_csv(reviews_path, sep='\t', quoting=3)

---

In [2]:
# Peek at the first review exactly as it is stored in the dataset.
first_raw_review = dataset['Review'][0]
print('review before preprocessing : {}'.format(first_raw_review))

review before preprocessing : Wow... Loved this place.


## 運用re.sub去除部分字元

In [3]:
import re 
# re.sub(pattern, repl, string) replaces every match of `pattern` in `string` with `repl`.
#   pattern '[^a-zA-Z]' : the leading ^ negates the class, so it matches every
#                         character that is NOT an English letter.
#   repl ' '            : each matched character is turned into a single space.
#   string              : the text to clean, here the first review.

first_review_text = dataset['Review'][0]
review = re.sub('[^a-zA-Z]', ' ', first_review_text)

In [4]:
# Show the review once every non-letter character has been replaced by a space.
after_sub_text = 'review after re.sub : {}'.format(review)
print(after_sub_text)

review after re.sub : Wow    Loved this place 


## 將所有字母轉為小寫：因為在大部分情境下，區分大小寫並不能提供額外訊息；就像在 CV 中，當顏色無法提供額外訊息時，我們會將圖像轉為灰階，藉此降低複雜度

In [5]:
# Convert every letter to lower case.
review = str.lower(review)
lowered_text = 'review after lower : {}'.format(review)
print(lowered_text)

review after lower : wow    loved this place 


## 斷詞

In [6]:
import nltk
# Cut the review into individual words on whitespace.
split_words = review.split()
print('review after split : {}'.format(split_words))

review after split : ['wow', 'loved', 'this', 'place']


* tokenize 相較於 split 會是更好的選擇，例如 split 無法把單字與緊接在後的標點分開（如 `place.`），而 word_tokenize 可以

In [7]:
# Tokenize the raw sentence: punctuation comes back as separate tokens
# instead of staying attached to the neighbouring word.
nltk.word_tokenize('Wow... Loved this place.')

['Wow', '...', 'Loved', 'this', 'place', '.']

In [8]:
# Tokenize the cleaned, lower-cased review with NLTK and keep the token list as `review`.
tokens = nltk.word_tokenize(review)
review = tokens
print('review after tokenize : {}'.format(review))

review after tokenize : ['wow', 'loved', 'this', 'place']


* 中文使用 jieba

In [10]:
import jieba
# Use the dictionary file under ./datasets as jieba's main dictionary.
jieba.set_dictionary('./datasets/dict.txt')

In [11]:
# Segment a Chinese sentence with jieba: cut_all=False and HMM=False are passed explicitly.
review_ = '哇！我好喜歡這個地方'
cut_result = jieba.cut(review_, cut_all=False, HMM=False)
# Join the segments with '|' so the word boundaries are visible in the printed line.
segmented_text = '|'.join(cut_result)
print("output: {}".format(segmented_text))

Building prefix dict from C:\Users\aband\OneDrive\桌面\NLP_marathon\NLP_practice\1-st_NLP\hw\datasets\dict.txt ...
Loading model from cache C:\Users\aband\AppData\Local\Temp\jieba.u0b2b3772ac1635337d56f7845538cf9d.cache
Loading model cost 0.597 seconds.
Prefix dict has been built successfully.


output: 哇|！|我|好|喜歡|這|個|地方


## stopwords: 移除贅字，此步驟為前處理的重要步驟之一，過多的贅字不僅無法提供更多訊息，還會干擾到模型的訓練

In [12]:
# NLTK provides a ready-made stop-word corpus that helps remove unwanted words;
# download it before it is used below.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aband\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# The English stop-word list shipped with NLTK.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [18]:
# Build the stop-word set once. The original expression rebuilt
# set(stopwords.words('english')) for every single token of the review.
english_stopwords = set(stopwords.words('english'))
# Keep only the tokens that are not stop words.
review = [word for word in review if word not in english_stopwords]
print('review after removeing stopwords : {}'.format(review))

review after removeing stopwords : ['wow', 'loved', 'place']


* stopwords.words('english') 是一個建立好的list，包含一些常見的英文贅字

In [19]:
# Preview only the first five entries of the English stop-word list.
stopwords.words('english')[:5]

['i', 'me', 'my', 'myself', 'we']

* 我們也可以自己建立 stopwords list

In [21]:
# Stop-word file source: https://github.com/tomlinNTUB/Machine-Learning
# Read the file line by line (one stop word per line) and drop the trailing newline of each line.
with open('./datasets/停用詞-繁體中文.txt','r', encoding='utf-8') as file:
    stop_words = [line.strip('\n') for line in file]

In [22]:
practice_sentence = ['哈哈','!','現在','好想','睡覺','啊']
# Build the lookup set once. The original rebuilt set(stop_words) for every token.
stop_word_set = set(stop_words)
# Keep only the tokens that do not appear in the custom stop-word list.
practice_sentence = [word for word in practice_sentence if word not in stop_word_set]
print('practice_sentence after removeing stopwords : {}'.format(practice_sentence))

practice_sentence after removeing stopwords : ['現在', '好想', '睡覺']


## Stemming: 詞幹提取
 * ex. loves,loved都變成love
 * 中文沒有詞幹提取的需求

In [23]:
from nltk.stem.porter import PorterStemmer
# Reduce every token of the review to its stem with the Porter stemmer.
ps = PorterStemmer()
review = list(map(ps.stem, review))

In [24]:
# Show the token list after stemming.
stemmed_text = 'review after stemming : {}'.format(review)
print(stemmed_text)

review after stemming : ['wow', 'love', 'place']


## 練習清理所有的句子

In [25]:
# Alternative dataset, kept for reference only (disabled):
#dataset=pd.read_csv('movie_feedback.csv',encoding = 'Big5',names=['feedback', 'label'] )
# Load the tab-separated restaurant reviews and preview the first rows.
dataset = pd.read_csv('./datasets/Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [26]:
# Clean every review and collect the cleaned sentences in `corpus`.
corpus = []
row = len(dataset)
# One stemmer serves the whole corpus; the original created a new, identical
# PorterStemmer on every loop iteration.
ps = PorterStemmer()
# Iterate over the column values directly instead of looking each row up by index.
for raw_review in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)  # keep English letters only
    review = review.lower()  # lower-case everything
    review = review.split()  # whitespace tokenisation (nltk.word_tokenize would be better)
    # Stop words are deliberately NOT removed here: many negations would be dropped,
    # e.g. "isn't good" would turn into "good".
    review = [ps.stem(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

# Sanity check: exactly one cleaned sentence per review.
assert len(corpus) == row

## 手動選出現頻率較高的單字：一般來說我們不需要自己處理這個步驟，通常文字轉向量或index的api都有參數可以設定，這裡是讓大家自己練習

In [29]:
from collections import Counter

In [30]:
# Flatten the corpus into one list that holds every token of every sentence.
whole_words = [
    token
    for sentence in corpus
    for token in nltk.word_tokenize(sentence)
]


In [32]:
# Collect the top_k most frequent tokens of the whole corpus.
top_k = 1000
most_common_pairs = Counter(whole_words).most_common(top_k)
# Each pair is (token, count); print them in descending frequency order.
for pair in most_common_pairs:
    print(pair)
# Keep only the tokens themselves, in the same order.
top_k_words = [token for token, _count in most_common_pairs]

('the', 585)
('and', 393)
('i', 357)
('wa', 295)
('a', 237)
('to', 220)
('is', 171)
('it', 155)
('thi', 143)
('of', 127)
('food', 127)
('not', 118)
('place', 112)
('for', 110)
('in', 109)
('t', 97)
('good', 95)
('we', 88)
('servic', 87)
('be', 81)
('veri', 76)
('my', 73)
('with', 72)
('great', 70)
('that', 70)
('had', 70)
('so', 66)
('you', 66)
('were', 63)
('are', 63)
('have', 63)
('go', 62)
('but', 62)
('back', 61)
('they', 58)
('here', 57)
('on', 55)
('time', 55)
('at', 55)
('like', 51)
('all', 44)
('s', 42)
('our', 41)
('will', 37)
('there', 36)
('as', 36)
('realli', 36)
('just', 35)
('love', 33)
('an', 32)
('their', 31)
('if', 30)
('disappoint', 30)
('best', 30)
('would', 29)
('wait', 29)
('get', 28)
('ever', 28)
('restaur', 28)
('order', 28)
('also', 27)
('friendli', 27)
('eat', 27)
('one', 27)
('up', 26)
('onli', 26)
('never', 26)
('don', 26)
('can', 26)
('no', 25)
('your', 25)
('out', 25)
('nice', 25)
('been', 24)
('what', 24)
('amaz', 24)
('again', 24)
('delici', 24)
('from', 

### 以 corpus中第一個句子為範例

In [33]:
# Build the lookup set once. The original rebuilt set(top_k_words) for every token.
top_k_vocab = set(top_k_words)
# Keep only the tokens of the first sentence that belong to the top_k vocabulary.
remove_low_frequency_word = ' '.join(
    word for word in nltk.word_tokenize(corpus[0]) if word in top_k_vocab
)

In [34]:
# Compare the first sentence before and after dropping low-frequency words.
before_text = 'Before removing low frequency words:\n {}'.format(corpus[0])
after_text = 'After removing low frequency words:\n {}'.format(remove_low_frequency_word)
print(before_text)
print('\n')
print(after_text)

Before removing low frequency words:
 wow love thi place


After removing low frequency words:
 wow love thi place


## 轉bag-of-words vector

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words model: one row per review, one column per vocabulary term.
# max_features caps the number of columns; the most frequent terms across the corpus are kept.
cv = CountVectorizer(max_features=1000)
# fit_transform returns a sparse matrix (mostly zeros); toarray() converts it
# into a dense NumPy array.
X = cv.fit_transform(corpus).toarray()
# Select the label column by name rather than by position, so the code keeps
# working if the column order of the dataset ever changes.
y = dataset['Liked'].values

## 選擇練習: 將處理好數據放入 naive_bayes模型，並預測評論為正向或負面，詳細原理之後章節會解釋。

## Training

In [36]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the reviews for testing. random_state is fixed so that
# Restart & Run All always produces the same split and the same trained model.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# No feature scaling is applied: the bag-of-words counts are used as they are.

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Inference

In [37]:
def predict_sentiment(message, vectorizer, model):
    """Predict the label of one raw review with an already fitted model.

    The text goes through the same preprocessing as the training corpus:
    non-letters become spaces, the text is lower-cased, split on whitespace,
    Porter-stemmed and joined back into a single string.

    Parameters
    ----------
    message : str
        Raw review text.
    vectorizer : fitted vectorizer exposing ``transform`` (here the CountVectorizer).
    model : fitted classifier exposing ``predict`` (here the GaussianNB model).

    Returns
    -------
    The array returned by ``model.predict`` for this single review.
    """
    stemmer = PorterStemmer()
    words = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    cleaned = ' '.join(stemmer.stem(word) for word in words)
    # The vectorizer expects an iterable of documents, hence the one-element list.
    features = vectorizer.transform([cleaned]).toarray()
    return model.predict(features)


message = 'I really like this!!'
# Inference must use exactly the same preprocessing as training.
prediction = predict_sentiment(message, cv, classifier)



In [38]:
prediction ## 1 means a positive review

array([1], dtype=int64)

In [39]:
message = 'All dishes are disgusting !!'
# Apply the same preprocessing that was used to build the training corpus.
ps = PorterStemmer()
letters_only = re.sub('[^a-zA-Z]', ' ', message)
stemmed_tokens = [ps.stem(token) for token in letters_only.lower().split()]
review = ' '.join(stemmed_tokens)
# Vectorise the single cleaned sentence and let the trained classifier label it.
input_ = cv.transform([review]).toarray()
prediction = classifier.predict(input_)



In [40]:
prediction ## 0 means a negative review

array([0], dtype=int64)