# Product Reviews Analysis

In [1]:
#import library
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

## Data Extraction

In [2]:
# Read the reviews CSV into a DataFrame and preview its first five rows
review = pd.read_csv("Reviews.csv")
review.head(n=5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
# Wrap the loaded data in the DataFrame (`df`) that the following cells work on,
# then display it
df = pd.DataFrame(data=review)
df

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
...,...,...,...,...,...,...,...,...,...,...
568449,568450,B001EO7N10,A28KG5XORO54AY,Lettie D. Carter,0,0,5,1299628800,Will not do without,Great for sesame chicken..this is a good if no...
568450,568451,B003S1WTCU,A3I8AFVPEE8KI5,R. Sawyer,0,0,2,1331251200,disappointed,I'm disappointed with the flavor. The chocolat...
568451,568452,B004I613EE,A121AA1GQV751Z,"pksd ""pk_007""",2,2,5,1329782400,Perfect for our maltipoo,"These stars are small, so you can give 10-15 o..."
568452,568453,B004I613EE,A3IBEVCTXKNOH,"Kathy A. Welch ""katwel""",1,1,5,1331596800,Favorite Training and reward treat,These are the BEST treats for training and rew...


In [20]:
# Dimensions of the data set as a (rows, columns) tuple
df.shape

(568454, 10)

In [4]:
# Model inputs: the lower-cased review text is the feature, the Score column is the label.
# The vectorised .str accessor lower-cases every string and passes missing values
# through as NaN, whereas apply(lambda x: x.lower()) raises AttributeError on them.
komentar = df['Text'].str.lower()
skor = df['Score']

In [5]:
# Stem each whitespace-separated token of every review with the English Snowball
# stemmer and join the stems back together with single spaces.
# NOTE: this cell overwrites `komentar`, so re-running it stems already-stemmed text.
stemmer = nltk.stem.SnowballStemmer("english")
komentar = komentar.apply(
    lambda text: " ".join(stemmer.stem(token) for token in text.split())
)

## Naive Bayes Model

In [6]:
# Split into training and test sets: 30% of the reviews are held out for testing,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    komentar, skor, test_size=0.3, random_state=1
)
# Print the shape of each part, one per line
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(397917,)
(170537,)
(397917,)
(170537,)


### CountVectorizer Method

In [7]:
# Vectorise with CountVectorizer: the vocabulary is learned from the training
# reviews only, then the same fitted vectorizer encodes the test reviews.
cv = CountVectorizer()
vector_x_train = cv.fit_transform(raw_documents=X_train)
vector_x_test = cv.transform(raw_documents=X_test)

In [8]:
#tes prediksi dengan metrics
nb = MultinomialNB()
nb.fit(vector_x_train, y_train)
%time nb.fit(vector_x_train, y_train)

Wall time: 200 ms


MultinomialNB()

In [12]:
# Apply the count-feature model to the test data and print the predicted scores.
# NOTE: `nb` is rebound to the TF-IDF model further down, so this cell must run
# before that one.
y_pred = nb.predict(X=vector_x_test)
print(y_pred)

[5 4 5 ... 3 5 5]


In [13]:
# Actual vs. predicted scores side by side; the row labels are y_test's index
df_2 = pd.DataFrame(dict(Actual=y_test, Prediction=y_pred))
df_2.head(n=5)

Unnamed: 0,Actual,Prediction
288312,5,5
431726,5,4
110311,5,5
91855,5,5
338855,5,1


In [14]:
# Evaluate the count-feature model: fraction of test reviews whose score was predicted exactly
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)

0.7011968077308737

### TfidfVectorizer Method

In [16]:
# Vectorise with TfidfVectorizer: vocabulary and IDF weights are learned from the
# training reviews only, then the same fitted vectorizer encodes the test reviews.
tfid = TfidfVectorizer()
vector_x_train_2 = tfid.fit_transform(raw_documents=X_train)
vector_x_test_2 = tfid.transform(raw_documents=X_test)

In [17]:
#tes prediksi dengan metrics
nb = MultinomialNB()
nb.fit(vector_x_train_2, y_train)
%time nb.fit(vector_x_train_2, y_train)

Wall time: 233 ms


MultinomialNB()

In [18]:
# Apply the TF-IDF model to the test data and print the predicted scores
y_pred_2 = nb.predict(X=vector_x_test_2)
print(y_pred_2)

[5 5 5 ... 5 5 5]


In [19]:
# Evaluate the TF-IDF model: fraction of test reviews whose score was predicted exactly
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_2)

0.6460416214663094