In [16]:
#基本套件和模組
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['font.sans-serif'] = ['DFKai-sb']
plt.rcParams['axes.unicode_minus'] = False
import warnings
warnings.filterwarnings('ignore')

In [17]:
#載入資料並將日期格式轉換為日期
df = pd.read_csv('kindle_rating.csv',parse_dates=['date'])
df.head()

Unnamed: 0,id,rating,title,date,content
0,Professor Nishanth,5,An outstanding refresh of the base Kindle at a...,2019-04-15,"Original review: April 15, 2019, and two updat..."
1,Beverly K,3,Base Kindle gets an upgrade\n,2019-04-15,The pros: I like that you have a choice of col...
2,Gwaredd Thomas,1,Lower ppi - Not good.\n,2019-04-15,I wouldn't purchase this product for the follo...
3,Lynn,5,Greatly Improved Basic Kindle\n,2019-04-15,Don't buy into the petty negative reviews. The...
4,A.B.,4,"Pleasant updates to the ""base"" Kindle\n",2019-04-15,I had a Kindle touch years ago and had stopped...


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2780 entries, 0 to 2779
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   id       2780 non-null   object        
 1   rating   2780 non-null   int64         
 2   title    2780 non-null   object        
 3   date     2780 non-null   datetime64[ns]
 4   content  2780 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 108.7+ KB


無遺漏值

In [19]:
#觀察評分比例
size=df['rating'].value_counts().sort_index()
pct =df['rating'].value_counts(normalize=True).round(3).sort_index()
pd.DataFrame(zip(size,pct),columns=['次數','百分比'],index=range(1,6))

Unnamed: 0,次數,百分比
1,219,0.079
2,134,0.048
3,235,0.085
4,414,0.149
5,1778,0.64


可以看出滿意的占多數(4~5分)，大部分使用者給予正面的評價。

In [20]:
#資料切割為滿意和不滿意
df['rating'] = (df['rating'] > 3).map({True:1,False:0})
df['rating'].value_counts()

1    2192
0     588
Name: rating, dtype: int64

In [21]:
#定義與切割
x = df['content']
y = df['rating']
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)
x_train.head()

2652                             i am still learning it\n
2448                                Like the backlight.\n
350     i don't like that when you hold it your finger...
631     I love this kindle! It’s so lightweight & easy...
2355           Excellent device! Longer battery Charge.\n
Name: content, dtype: object

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.model_selection import GridSearchCV

In [23]:
#使用模型和組合器來進行預測
model_pl=Pipeline([('preprocess',CountVectorizer(stop_words='english')),('model',LogisticRegression())])
param_grid = {'model':[LogisticRegression(),SVC(),KNeighborsClassifier(),DecisionTreeClassifier(max_depth=10),MultinomialNB(),XGBClassifier(),BaggingClassifier(),AdaBoostClassifier(),RandomForestClassifier()]}
gs=GridSearchCV(model_pl,param_grid=param_grid,cv=5,return_train_score=True)
gs.fit(x_train,y_train)
score = gs.best_estimator_.score(x_test,y_test)
print('最佳預測模型和參數',gs.best_params_['model'])
print('訓練集的最佳結果',gs.best_score_.round(3))
print('測試集訓練結果',score.round(3))
y_pred = gs.best_estimator_.predict(x_test)
print('混亂矩陣')
print(pd.DataFrame(confusion_matrix(y_test,y_pred),index=['實際1','實際2'],columns=['預測1','預測2']))
print('綜合報告')
print(classification_report(y_test,y_pred))

最佳預測模型和參數 MultinomialNB()
訓練集的最佳結果 0.857
測試集訓練結果 0.85
混亂矩陣
     預測1  預測2
實際1   83  101
實際2   24  626
綜合報告
              precision    recall  f1-score   support

           0       0.78      0.45      0.57       184
           1       0.86      0.96      0.91       650

    accuracy                           0.85       834
   macro avg       0.82      0.71      0.74       834
weighted avg       0.84      0.85      0.83       834



最佳預測模型為MultinomialNB，準確率為0.85，不過如果要了解不滿意的評論，其樣本不滿意的召回率僅0.46，所以要找出不滿意評論，要進行資料不均衡的處理。

In [28]:
#向下取樣
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline
model_pl=make_pipeline(CountVectorizer(stop_words='english'),RandomUnderSampler(),MultinomialNB())
model_pl.fit(x_train,y_train)
y_pred = model_pl.predict(x_test)
score = model_pl.score(x_test,y_test)
print('測試集訓練結果',score.round(3))
y_pred = model_pl.predict(x_test)
print('混亂矩陣')
print(pd.DataFrame(confusion_matrix(y_test,y_pred),index=['實際1','實際2'],columns=['預測1','預測2']))
print('綜合報告')
print(classification_report(y_test,y_pred))

測試集訓練結果 0.776
混亂矩陣
     預測1  預測2
實際1  148   36
實際2  151  499
綜合報告
              precision    recall  f1-score   support

           0       0.49      0.80      0.61       184
           1       0.93      0.77      0.84       650

    accuracy                           0.78       834
   macro avg       0.71      0.79      0.73       834
weighted avg       0.84      0.78      0.79       834



正確率下降至0.776，但召回率上升至0.8。

In [29]:
#向上取樣
from imblearn.over_sampling import SMOTE
model_pl=make_pipeline(CountVectorizer(stop_words='english'),SMOTE(),MultinomialNB())
model_pl.fit(x_train,y_train)
y_pred = model_pl.predict(x_test)
score = model_pl.score(x_test,y_test)
print('測試集訓練結果',score.round(3))
y_pred = model_pl.predict(x_test)
print('混亂矩陣')
print(pd.DataFrame(confusion_matrix(y_test,y_pred),index=['實際1','實際2'],columns=['預測1','預測2']))
print('綜合報告')
print(classification_report(y_test,y_pred))

測試集訓練結果 0.817
混亂矩陣
     預測1  預測2
實際1  126   58
實際2   95  555
綜合報告
              precision    recall  f1-score   support

           0       0.57      0.68      0.62       184
           1       0.91      0.85      0.88       650

    accuracy                           0.82       834
   macro avg       0.74      0.77      0.75       834
weighted avg       0.83      0.82      0.82       834



正確率上升至0.817，但召回率下降至0.68。

In [32]:
#將標題也納入考量
#水平整合器
from sklearn.compose import ColumnTransformer
x = df[['title','content']]
data_pl = ColumnTransformer([('title',CountVectorizer(stop_words='english'),'title'),('content',CountVectorizer(stop_words='english'),'content')])
data_pl.fit_transform(x).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [33]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3)
model_pl=make_pipeline(data_pl,MultinomialNB())
model_pl.fit(x_train,y_train)
y_pred = model_pl.predict(x_test)
score = model_pl.score(x_test,y_test)
print('測試集訓練結果',score.round(3))
y_pred = model_pl.predict(x_test)
print('混亂矩陣')
print(pd.DataFrame(confusion_matrix(y_test,y_pred),index=['實際1','實際2'],columns=['預測1','預測2']))
print('綜合報告')
print(classification_report(y_test,y_pred))

測試集訓練結果 0.887
混亂矩陣
     預測1  預測2
實際1  102   67
實際2   27  638
綜合報告
              precision    recall  f1-score   support

           0       0.79      0.60      0.68       169
           1       0.90      0.96      0.93       665

    accuracy                           0.89       834
   macro avg       0.85      0.78      0.81       834
weighted avg       0.88      0.89      0.88       834



再把標題也納入考量後，準確率為0.887，不過其樣本不滿意的召回率僅0.6，所以要找出不滿意評論，還是要進行資料不均衡的處理。

In [34]:
#向下取樣
model_pl=make_pipeline(data_pl,RandomUnderSampler(),MultinomialNB())
model_pl.fit(x_train,y_train)
y_pred = model_pl.predict(x_test)
score = model_pl.score(x_test,y_test)
print('測試集訓練結果',score.round(3))
y_pred = model_pl.predict(x_test)
print('混亂矩陣')
print(pd.DataFrame(confusion_matrix(y_test,y_pred),index=['實際1','實際2'],columns=['預測1','預測2']))
print('綜合報告')
print(classification_report(y_test,y_pred))

測試集訓練結果 0.832
混亂矩陣
     預測1  預測2
實際1  148   21
實際2  119  546
綜合報告
              precision    recall  f1-score   support

           0       0.55      0.88      0.68       169
           1       0.96      0.82      0.89       665

    accuracy                           0.83       834
   macro avg       0.76      0.85      0.78       834
weighted avg       0.88      0.83      0.84       834



正確率下降至0.832，但召回率上升至0.88。

In [35]:
#向上取樣
model_pl=make_pipeline(data_pl,SMOTE(),MultinomialNB())
model_pl.fit(x_train,y_train)
y_pred = model_pl.predict(x_test)
score = model_pl.score(x_test,y_test)
print('測試集訓練結果',score.round(3))
y_pred = model_pl.predict(x_test)
print('混亂矩陣')
print(pd.DataFrame(confusion_matrix(y_test,y_pred),index=['實際1','實際2'],columns=['預測1','預測2']))
print('綜合報告')
print(classification_report(y_test,y_pred))

測試集訓練結果 0.878
混亂矩陣
     預測1  預測2
實際1  125   44
實際2   58  607
綜合報告
              precision    recall  f1-score   support

           0       0.68      0.74      0.71       169
           1       0.93      0.91      0.92       665

    accuracy                           0.88       834
   macro avg       0.81      0.83      0.82       834
weighted avg       0.88      0.88      0.88       834



正確率上升至0.878，但召回率下降至0.74。

結論:在商品評論的預測上，可以採用MultinomialNB模型，並將標題一起考慮進去，其正確率可以到0.887，如果要解決目標類別不均衡，可以使用向下均衡法，在對不滿意目標上，有較好的召回率。