In [1]:
# 題目:電商產品評分文件以機器學習方式分辨是否為正向或負向
#
# 說明：輸入文件 positive.review 和 negative.review，兩者都是XML檔。我們用BeautifulSoup讀進來，
# 擷取review_text，然後用NLTK自建Tokenizer。 先產生 word-to-index map 再產生 word-frequency vectors。
# 之後 shuffle data 創造 train/test splits，留100個給 test 用。接著用Logistic Regression 分類器
# 找出訓練組和測試組的準確度(Accuracy)。接著我們可以看看每個單字的正負權重，可以訂一個閥值，
# 比方絕對值大於正負0.5，以確認情緒是顯著的。最後我們找出根據現有演算法歸類錯誤最嚴重的正向情緒和負向
# 情緒的例子。
#
# 延伸:可用不同的tokenizer，不同的tokens_to_vector，不同的ML分類器做改進準確率的比較。最後可用您的
# model去預測unlabeled.review檔的內容。
#
# 範例程式檔名: sentiment_情緒分析.py，以LogisticRegression 方式完成情緒分析。
# 模組: sklearn, bs4, numpy, nltk
# 輸入檔：stopwords.txt, /electronics 下 positive.review, negative.review
# 成績：辨識百分率
#
#注意事項：nltk 需要有 punkt corpus 和 wordnet  資源
#import nltk
#nltk.download('punkt')
#nltk.download('wordnet') 
#資料檔需在適當位置 jupyter 或 colab 才能看到，用colab時要上傳 data 到 ./sample_data 或 mount
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range

import nltk
import numpy as np
import jieba 
from sklearn.utils import shuffle

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [2]:
# Single WordNet lemmatizer instance, created once and reused by the tokenizer for every token.
wordnet_lemmatizer = WordNetLemmatizer()

In [3]:
# Stopword list from http://www.lextek.com/manuals/onix/stopwords1.html
# Read inside a `with` block so the file handle is closed as soon as the set is built,
# and decode explicitly as UTF-8 (as the review files are) rather than relying on
# the platform default encoding.
with open('stopwords.txt', encoding='utf-8') as stopwords_file:
    stopwords = set(line.rstrip() for line in stopwords_file)

In [4]:
# Alternative source of stopwords:
# from nltk.corpus import stopwords
# stopwords.words('english')

# Load the positive, negative and unlabeled reviews.
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
def _load_review_nodes(path):
    """Parse the review file at `path` and return its <review_text> elements.

    The file is read as UTF-8 inside a `with` block so the handle is closed
    once the markup has been handed to BeautifulSoup (lxml parser).

    Returns the bs4 result set of matching tags; callers read `.text` on each.
    """
    with open(path, encoding='utf-8') as review_file:
        soup = BeautifulSoup(review_file.read(), features='lxml')
    # find_all is the current bs4 method name; findAll is its deprecated alias.
    return soup.find_all('review_text')


positive_reviews = _load_review_nodes('positive.review')
negative_reviews = _load_review_nodes('negative.review')
unlabeled_reviews = _load_review_nodes('unlabeled.review')

In [5]:
'''
# 基於nltk自建 tokenizer
def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # 將字串改為tokens
    tokens = [t for t in tokens if len(t) > 2] # 去除短字
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # 去除大小寫
    tokens = [t for t in tokens if t not in stopwords] # 去除 stopwords
    return tokens
'''

# Tokenizer built on jieba
def my_tokenizer(s):
    """Turn a review string into a list of cleaned tokens.

    The text is lower-cased and segmented with jieba. Tokens of two
    characters or fewer are discarded, the remaining ones are lemmatized
    with the shared WordNet lemmatizer, and any lemma present in
    `stopwords` is removed. Token order is preserved.
    """
    cleaned = []
    for piece in jieba.lcut(s.lower()):
        if len(piece) <= 2:  # drop short tokens
            continue
        lemma = wordnet_lemmatizer.lemmatize(piece)  # reduce to base form
        if lemma in stopwords:  # drop stopwords (checked on the lemma)
            continue
        cleaned.append(lemma)
    return cleaned

In [6]:
# Build the word-to-index map first, then the word-frequency vectors.
# Tokenized reviews are kept so tokenization never has to be repeated.
word_index_map = dict()   # token -> column index
current_index = 0         # next free column index
positive_tokenized, negative_tokenized, unlabeled_tokenized = [], [], []
orig_reviews, unlab_reviews = [], []   # raw text of labeled / unlabeled reviews

In [7]:
# Tokenize every positive review, remember its raw text, and give each
# previously unseen token the next free column index.
for review in positive_reviews:
    review_text = review.text
    orig_reviews.append(review_text)
    review_tokens = my_tokenizer(review_text)
    positive_tokenized.append(review_tokens)
    for tok in review_tokens:
        if tok in word_index_map:
            continue
        word_index_map[tok] = current_index
        current_index += 1

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\User\AppData\Local\Temp\jieba.cache
Loading model cost 0.930 seconds.
Prefix dict has been built successfully.


In [8]:
# Tokenize every negative review, remember its raw text, and give each
# previously unseen token the next free column index.
for review in negative_reviews:
    review_text = review.text
    orig_reviews.append(review_text)
    review_tokens = my_tokenizer(review_text)
    negative_tokenized.append(review_tokens)
    for tok in review_tokens:
        if tok in word_index_map:
            continue
        word_index_map[tok] = current_index
        current_index += 1

In [9]:
# Tokenize every unlabeled review, remember its raw text, and give each
# previously unseen token the next free column index.
for review in unlabeled_reviews:
    review_text = review.text
    unlab_reviews.append(review_text)
    review_tokens = my_tokenizer(review_text)
    unlabeled_tokenized.append(review_tokens)
    for tok in review_tokens:
        if tok in word_index_map:
            continue
        word_index_map[tok] = current_index
        current_index += 1

In [10]:
# Report the vocabulary size: the number of distinct tokens in the word-to-index map.
print("len(word_index_map):", len(word_index_map))

len(word_index_map): 27093


In [11]:
# now let's create our input matrices
def tokens_to_vector(tokens, label):
    """Convert a token list into a normalized word-frequency vector.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one review; every token must be a key of `word_index_map`.
    label : int, float or None
        Class label to store in the final element, or None for unlabeled data.

    Returns
    -------
    numpy.ndarray
        If `label` is None, a vector of length len(word_index_map) holding the
        relative token frequencies. Otherwise a vector one element longer whose
        last element is `label`.

    Raises
    ------
    KeyError
        If a token is not present in `word_index_map`.
    """
    vocab_size = len(word_index_map)
    frequencies = np.zeros(vocab_size)
    for t in tokens:
        frequencies[word_index_map[t]] += 1

    # Normalize counts to proportions. An empty token list has a zero total;
    # keep the all-zero vector instead of dividing 0/0 and producing NaNs,
    # which the downstream classifiers cannot consume.
    total = frequencies.sum()
    if total > 0:
        frequencies = frequencies / total

    if label is None:
        return frequencies

    row = np.zeros(vocab_size + 1)  # the last element carries the label
    row[:-1] = frequencies
    row[-1] = label
    return row

In [12]:
N = len(positive_tokenized) + len(negative_tokenized)
# (N x D+1) matrix: features and label kept side by side so they shuffle together
data = np.zeros((N, len(word_index_map) + 1))

# Fill the labeled rows: positives (label 1) first, then negatives (label 0).
row = 0
for batch, batch_label in ((positive_tokenized, 1), (negative_tokenized, 0)):
    for tokens in batch:
        data[row, :] = tokens_to_vector(tokens, batch_label)
        row += 1

# Unlabeled reviews become plain feature vectors (no label element).
test_data = [tokens_to_vector(tokens, None) for tokens in unlabeled_tokenized]

In [13]:
# Shuffle the labeled rows together with their original texts so both stay aligned.
# A fixed random_state makes the row order reproducible across kernel restarts.
orig_reviews, data = shuffle(orig_reviews, data, random_state=0)

# All labeled rows are used for training; no held-out split is made here,
# so scores computed on Xtrain/Ytrain are training accuracy only.
Xtrain = data[:, :-1]   # feature columns
Ytrain = data[:, -1]    # label column (last element of every row)

# Feature vectors of the unlabeled reviews; there are no labels to score against.
Xtest = np.array(test_data)

In [14]:
# Method 1: GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier().fit(Xtrain, Ytrain)  # fit() returns the estimator
gb_train_score = model.score(Xtrain, Ytrain)  # accuracy on the training rows themselves
print("train score:", gb_train_score)
gb_unlabeled_preds = model.predict(Xtest)
print("unlabeled review preds:", gb_unlabeled_preds)

train score: 0.9065
unlabeled review preds: [0. 1. 1. ... 0. 1. 1.]


In [15]:
# Column 1 of predict_proba: probability of the positive class (label 1) for each unlabeled review.
print("unlabeled review prob:", model.predict_proba(Xtest)[:,1])

unlabeled review prob: [0.47144319 0.58174125 0.61032186 ... 0.32116678 0.65921019 0.59519415]


In [16]:
# Method 2: LogisticRegression
model = LogisticRegression().fit(Xtrain, Ytrain)  # fit() returns the estimator
lr_train_accuracy = model.score(Xtrain, Ytrain)  # accuracy on the training rows themselves
print("Train accuracy:", lr_train_accuracy)
lr_unlabeled_preds = model.predict(Xtest)
print("unlabeled review preds:", lr_unlabeled_preds)

Train accuracy: 0.791
unlabeled review preds: [1. 0. 0. ... 0. 1. 1.]


In [17]:
# Column 1 of predict_proba: probability of the positive class (label 1) for each unlabeled review.
print("unlabeled review prob:", model.predict_proba(Xtest)[:,1])

unlabeled review prob: [0.51052999 0.48280122 0.45929778 ... 0.4768884  0.50207023 0.53503969]


In [18]:
# Print every word whose learned weight is large in magnitude
# (positive weight pushes toward label 1, negative toward label 0).
# Try different threshold values!
threshold = 0.5
coefficients = model.coef_[0]
for word, index in iteritems(word_index_map):
    weight = coefficients[index]
    if abs(weight) > threshold:
        print(word, weight)

unit -0.6839767337739494
bad -0.8961762281389208
cable 0.7397115156320087
time -0.7018078684449433
month -0.7922365671726258
pro 0.5326834439832305
sound 1.1080203119353158
lot 0.8222495672164125
you 0.9859696446758778
easy 1.8718482044932874
quality 1.5367618653589483
company -0.5736435895717941
card -0.5079121381894793
item -1.0322462709034472
wa -1.6091936531777662
perfect 1.0765576791365705
fast 1.0122104997265828
ha 0.7782993871405728
price 2.865441975283546
value 0.572901434453409
money -1.1270451691206727
memory 1.0000927829276396
picture 0.6078309662608883
buy -0.9437946601016287
don -1.0511873118020099
bit 0.6334003566308123
happy 0.6661380126109168
pretty 0.7710884509254797
doe -0.7921279822228178
highly 1.1002339295548502
recommend 0.6789927286956916
fit 0.539592200513864
customer -0.7031876740645026
support -0.9162831181586766
little 0.9899799214631945
sent -0.5022993129207496
returned -0.8421302926291562
excellent 1.4338215546015884
love 1.2199514652285073
video 0.53502877

In [19]:
# Find misclassified examples.
# preds holds the predicted class for every training row; P holds the
# predicted probability of class 1 for the same rows.
preds = model.predict(Xtrain)
P = model.predict_proba(Xtrain)[:,1] # p(y = 1 | x)

In [20]:
preds

array([0., 1., 0., ..., 0., 1., 0.])

In [21]:
P

array([0.47173793, 0.51429151, 0.47705149, ..., 0.4901071 , 0.50796664,
       0.49108515])

In [22]:
# Keep only the single worst mistake of each kind.
minP_whenYis1 = 1  # lowest p(y=1|x) among positive reviews predicted negative
maxP_whenYis0 = 0  # highest p(y=1|x) among negative reviews predicted positive
wrong_positive_review = wrong_positive_prediction = None
wrong_negative_review = wrong_negative_prediction = None
for i in range(N):
    p, y = P[i], Ytrain[i]
    missed_positive = y == 1 and p < 0.5
    missed_negative = y == 0 and p > 0.5
    if missed_positive and p < minP_whenYis1:
        # most wrong positive review so far
        minP_whenYis1 = p
        wrong_positive_review = orig_reviews[i]
        wrong_positive_prediction = preds[i]
    elif missed_negative and p > maxP_whenYis0:
        # most wrong negative review so far
        maxP_whenYis0 = p
        wrong_negative_review = orig_reviews[i]
        wrong_negative_prediction = preds[i]

In [23]:
# Show the worst misclassification of each kind: the positive review with the
# lowest p(y=1|x) and the negative review with the highest p(y=1|x).
# If no such mistake was found, the review and prediction print as None.
print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)

Most wrong positive review (prob = 0.3736172804629852, pred = 0.0):

This was a defective unit. Got new unit and it works as expected

Most wrong negative review (prob = 0.6027663738328604, pred = 1.0):

The Voice recorder meets all my expectations and more
Easy to use, easy to transfer great results

