In [3]:
# 題目:電商產品評分文件以機器學習方式分辨是否為正向或負向
#
# 說明：輸入文件 positive.review 和 negative.review，兩者都是XML檔。我們用BeautifulSoup讀進來，
# 擷取review_text，然後用NLTK自建Tokenizer。 先產生 word-to-index map 再產生 word-frequency vectors。
# 之後 shuffle data 創造 train/test splits，留100個給 test 用。接著用Logistic Regression 分類器
# 找出訓練組和測試組的準確度(Accuracy)。接著我們可以看看每個單字的正負權重，可以訂一個閥值，
# 比方絕對值大於正負0.5，以確認情緒是顯著的。最後我們找出根據現有演算法歸類錯誤最嚴重的正向情緒和負向
# 情緒的例子。
#
# 延伸:可用不同的tokenizer，不同的tokens_to_vector，不同的ML分類器做改進準確率的比較。最後可用您的
# model去預測unlabeled.review檔的內容。
#
# 範例程式檔名: sentiment_情緒分析.py，以LogisticRegression 方式完成情緒分析。
# 模組: sklearn, bs4, numpy, nltk
# 輸入檔：stopwords.txt, /electronics 下 positive.review, negative.review
# 成績：辨識百分率
#
#注意事項：nltk 需要有 punkt corpus 和 wordnet  資源
#import nltk
#nltk.download('punkt')
#nltk.download('wordnet') 
#資料檔需在適當位置 jupyter 或 colab 才能看到，用colab時要上傳 data 到 ./sample_data 或 mount
#
#
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range



import nltk
import numpy as np
from sklearn.utils import shuffle

from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup


wordnet_lemmatizer = WordNetLemmatizer()

# from http://www.lextek.com/manuals/onix/stopwords1.html
stopwords = set(w.rstrip() for w in open('./datasets/stopwords.txt'))

# 另一個 stopwords 的來源
# from nltk.corpus import stopwords
# stopwords.words('english')

# 讀正向與負向 reviews
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
positive_reviews = BeautifulSoup(open('./datasets/electronics/positive.review', encoding='utf-8').read(), features="html5lib")
positive_reviews = positive_reviews.findAll('review_text')

negative_reviews = BeautifulSoup(open('./datasets/electronics/negative.review', encoding='utf-8').read(), features="html5lib")
negative_reviews = negative_reviews.findAll('review_text')



# 基於nltk自建 tokenizer

def my_tokenizer(s):
    s = s.lower() # downcase
    tokens = nltk.tokenize.word_tokenize(s) # 將字串改為tokens
    tokens = [t for t in tokens if len(t) > 2] # 去除短字
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens] # 去除大小寫
    tokens = [t for t in tokens if t not in stopwords] # 去除 stopwords
    return tokens


# 先產生 word-to-index map 再產生 word-frequency vectors
# 同時儲存 tokenized 版本未來不需再做 tokenization
word_index_map = {}
current_index = 0
positive_tokenized = []
negative_tokenized = []
orig_reviews = []

for review in positive_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

for review in negative_reviews:
    orig_reviews.append(review.text)
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

print("len(word_index_map):", len(word_index_map))

# now let's create our input matrices
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1) # 最後一個元素是標記
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum() # 正規化數據提升未來準確度
    x[-1] = label
    return x

N = len(positive_tokenized) + len(negative_tokenized)
# (N x D+1) 矩陣 - 擺在一塊將來便於shuffle
data = np.zeros((N, len(word_index_map) + 1))
i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens, 1)
    data[i,:] = xy
    i += 1

for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens, 0)
    data[i,:] = xy
    i += 1

# shuffle data 創造 train/test splits
# 多次嘗試!
orig_reviews, data = shuffle(orig_reviews, data)

X = data[:,:-1]
Y = data[:,-1]

# 最後 100 列是測試用
Xtrain = X[:-100,]
Ytrain = Y[:-100,]
Xtest = X[-100:,]
Ytest = Y[-100:,]

model = LogisticRegression()
model.fit(Xtrain, Ytrain)
print("Train accuracy:", model.score(Xtrain, Ytrain))
print("Test accuracy:", model.score(Xtest, Ytest))


# 列出每個字的正負 weight
# 用不同的 threshold values!
threshold = 0.5
for word, index in iteritems(word_index_map):
    weight = model.coef_[0][index]
    if weight > threshold or weight < -threshold:
        print(word, weight)


# 找出歸類錯誤的例子
preds = model.predict(X)
P = model.predict_proba(X)[:,1] # p(y = 1 | x)

# 只列出最糟的
minP_whenYis1 = 1
maxP_whenYis0 = 0
wrong_positive_review = None
wrong_negative_review = None
wrong_positive_prediction = None
wrong_negative_prediction = None
for i in range(N):
    p = P[i]
    y = Y[i]
    if y == 1 and p < 0.5:
        if p < minP_whenYis1:
            wrong_positive_review = orig_reviews[i]
            wrong_positive_prediction = preds[i]
            minP_whenYis1 = p
    elif y == 0 and p > 0.5:
        if p > maxP_whenYis0:
            wrong_negative_review = orig_reviews[i]
            wrong_negative_prediction = preds[i]
            maxP_whenYis0 = p

print("Most wrong positive review (prob = %s, pred = %s):" % (minP_whenYis1, wrong_positive_prediction))
print(wrong_positive_review)
print("Most wrong negative review (prob = %s, pred = %s):" % (maxP_whenYis0, wrong_negative_prediction))
print(wrong_negative_review)



len(word_index_map): 10950
Train accuracy: 0.783157894736842
Test accuracy: 0.72
unit -0.7325409911126661
bad -0.7079166875260443
cable 0.6821719552990964
time -0.6998176501234197
've 0.7742410035472721
month -0.6687013020845857
sound 1.0822704376449217
lot 0.8264244970303497
you 0.9307459602491117
n't -2.111080608425142
easy 1.729649358645354
quality 1.329125891447414
company -0.5602488518608839
card -0.6088667937906855
item -0.979212113036486
wa -1.4880481682341744
perfect 0.969446259726007
fast 0.9926448950000607
ha 0.6571586938848277
price 2.802432765473389
value 0.5522359577351388
money -1.0105369092266132
memory 0.9649587794947415
buy -0.8724835584810097
bit 0.6085916175146011
happy 0.5548016332264535
pretty 0.5566216504044649
doe -1.238831332580061
highly 1.0751342531053283
recommend 0.6514289892205909
customer -0.6625763582079012
support -0.8643565295751044
little 0.9296661361946765
returned -0.8218108248759785
excellent 1.3176802253957047
love 1.1957539252117
feature 0.5026506

In [8]:
len(positive_reviews), len(negative_reviews), len(X), len(Y)

(1000, 1000, 2000, 2000)

## 專案流程
- 資料取得與清洗
- 文字轉向量
- 包裝成train/test
- 模型套用
- 觀察錯誤樣本


In [13]:
import pandas as pd

In [17]:
df = positive_reviews + negative_reviews
df = pd.DataFrame(df, columns=['text'])
df.head()

Unnamed: 0,text
0,\nI purchased this unit due to frequent blacko...
1,\nI ordered 3 APC Back-UPS ES 500s on the reco...
2,\nWish the unit had a separate online/offline ...
3,\nCheaper than thick CD cases and less prone t...
4,\nHi\n\nI brought 256 MB Kingston SD card from...


In [18]:
df['label'] = 1
df.head()

Unnamed: 0,text,label
0,\nI purchased this unit due to frequent blacko...,1
1,\nI ordered 3 APC Back-UPS ES 500s on the reco...,1
2,\nWish the unit had a separate online/offline ...,1
3,\nCheaper than thick CD cases and less prone t...,1
4,\nHi\n\nI brought 256 MB Kingston SD card from...,1


In [22]:
df.iloc[1000:, 1] = 0

In [28]:
# shuffle
df = df.sample(frac=1)
df

Unnamed: 0,text,label
1047,\nTo back up my personal DVD collection I burn...,0
276,\nWorks well....Customer support from Garmin.....,1
1284,\nMy husband bought me this for Christmas beca...,0
1225,\nFor some years I tried this and other Monste...,0
1038,\nThe is perhaps the lease durable item I have...,0
...,...,...
1074,\nThis is a complete waste of money for a numb...,0
790,\nI've read the other reviews of this product ...,1
855,\nI would highly recommend reading the Busines...,1
1107,\nThe product not only did not work but blew o...,0


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# 拆分資料
df_train, df_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.1)


# 向量化
tfidf = TfidfVectorizer(max_features=3000)
x_train = tfidf.fit_transform(df_train)
x_test = tfidf.transform(df_test)

In [40]:
x_train.shape, x_test.shape

((1800, 3000), (200, 3000))

In [45]:
## 模型套用 + 超參數調整
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

def get_result_grid(estimator, params, model_name, Xtrain, Ytrain, Xtest, Ytest):
    print(model_name, '結果如下:')
    grid = GridSearchCV(estimator, param_grid=params)
    grid.fit(Xtrain, Ytrain)
    print(grid.best_estimator_)
    print(grid.best_params_)
    print(grid.best_score_)

    print(grid.score(Xtest, Ytest))
    return grid

In [42]:
# KNN

knn_param = {
    'n_neighbors': [5, 10, 15],
    'weights': ['uniform', 'distance'],
}
grid_knn = get_result_grid(KNeighborsClassifier(), knn_param, 'KNN', x_train, y_train, x_test, y_test)

KNN 結果如下:
KNeighborsClassifier(n_neighbors=10)
{'n_neighbors': 10, 'weights': 'uniform'}
0.6888888888888889
0.735


In [43]:
## 隨機森林

rf_param = {
    'n_estimators': [100, 150],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 4]
}

grid_rf = get_result_grid(RandomForestClassifier(), rf_param, '隨機森林', x_train, y_train, x_test, y_test)

隨機森林 結果如下:
RandomForestClassifier(min_samples_split=4, n_estimators=150)
{'max_depth': None, 'min_samples_split': 4, 'n_estimators': 150}
0.7977777777777778
0.865


In [46]:
## GDBT

gdbt_param = {
    'n_estimators': [100],
    'max_depth': [None],
}

grid_gdbt = get_result_grid(GradientBoostingClassifier(), gdbt_param, '梯度提升樹', x_train, y_train, x_test, y_test)

梯度提升樹 結果如下:
GradientBoostingClassifier(max_depth=None)
{'max_depth': None, 'n_estimators': 100}
0.6699999999999999
0.655


In [47]:
## Naive bayes

bayes_param = {
    'alpha': [0, 0.5, 1],
}

grid_bayes = get_result_grid(MultinomialNB(), bayes_param, '樸素貝式分類', x_train, y_train, x_test, y_test)

樸素貝式分類 結果如下:


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


MultinomialNB(alpha=1)
{'alpha': 1}
0.8
0.85


In [49]:
## Logistic

log_param = {
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 150]
}

grid_log = get_result_grid(LogisticRegression(), log_param, 'LogisticRegression', x_train, y_train, x_test, y_test)

LogisticRegression 結果如下:


Traceback (most recent call last):
  File "C:\Users\aband\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\aband\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 1306, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\aband\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line 444, in _check_solver
    "got %s penalty." % (solver, penalty))
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

Traceback (most recent call last):
  File "C:\Users\aband\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\aband\AppData\Roaming\Python\Python37\site-packages\sklearn\linear_model\_logistic.py", line

LogisticRegression()
{'max_iter': 100, 'penalty': 'l2'}
0.8116666666666668
0.87


