In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer,LancasterStemmer,WordNetLemmatizer


In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
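# One-time setup: uncomment and run these once on a fresh environment to
# download the NLTK resources this notebook relies on (tokenizer models,
# the stop-word lists and the WordNet data used by the lemmatizer).
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('punkt_tab')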

In [288]:
# Load the tab-separated Alexa review dataset into a DataFrame.
data = pd.read_csv("amazon_alexa.tsv", sep="\t")

In [289]:
data

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1
...,...,...,...,...,...
3145,5,30-Jul-18,Black Dot,"Perfect for kids, adults and everyone in betwe...",1
3146,5,30-Jul-18,Black Dot,"Listening to music, searching locations, check...",1
3147,5,30-Jul-18,Black Dot,"I do love these things, i have them running my...",1
3148,5,30-Jul-18,White Dot,Only complaint I have is that the sound qualit...,1


In [290]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3149 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


In [291]:
data["variation"].unique()

array(['Charcoal Fabric ', 'Walnut Finish ', 'Heather Gray Fabric ',
       'Sandstone Fabric ', 'Oak Finish ', 'Black', 'White',
       'Black  Spot', 'White  Spot', 'Black  Show', 'White  Show',
       'Black  Plus', 'White  Plus', 'Configuration: Fire TV Stick',
       'Black  Dot', 'White  Dot'], dtype=object)

In [292]:
data['feedback'].unique()

array([1, 0])

In [293]:
data['rating'].unique()

array([5, 4, 3, 2, 1])

In [294]:
data.shape

(3150, 5)

In [295]:
data.duplicated().sum()

np.int64(715)

In [296]:
# Remove rows that are exact duplicates across every column; this mutates
# `data` in place, so the row count shrinks from here on.
data.drop_duplicates(inplace=True)

In [297]:
data.shape

(2435, 5)

In [298]:
data.isna().sum()

rating              0
date                0
variation           0
verified_reviews    1
feedback            0
dtype: int64

In [299]:
# Drop every row that contains a missing value; this mutates `data` in place.
data.dropna(inplace=True)

In [300]:
data.shape

(2434, 5)

In [301]:
data.describe()

Unnamed: 0,rating,feedback
count,2434.0,2434.0
mean,4.436319,0.909614
std,1.10869,0.286793
min,1.0,0.0
25%,4.0,1.0
50%,5.0,1.0
75%,5.0,1.0
max,5.0,1.0


In [302]:
# Share of the cleaned reviews whose feedback flag is 1.
positive_rows = data[data['feedback'] == 1]
no_of_pos = positive_rows['feedback'].count()
total_rows = data.shape[0]
print("Positive Reviews percentage: {0}".format(no_of_pos / total_rows * 100))

Positive Reviews percentage: 90.96138044371405


In [303]:
# Share of the cleaned reviews whose feedback flag is 0.
# The message previously said "Positive" although it reports the negative
# share, which made the two summary lines indistinguishable.
no_of_neg = data[data['feedback']==0]['feedback'].count()
print("Negative Reviews percentage: {0}".format(no_of_neg/(data.shape[0])*100))

Positive Reviews percentage: 9.038619556285948


In [304]:
# English stop-word list used to filter tokens during preprocessing.
# The corpus file id is lowercase ('english'); the capitalised spelling only
# resolves on case-insensitive filesystems and fails elsewhere.
stop_words = stopwords.words('english')
print(stop_words)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [305]:
lm = WordNetLemmatizer()

In [306]:
def clean_review(text, stop_word_set, lemmatizer):
    """Normalise one raw review into a space-joined string of lemmas.

    Steps, in order: lowercase the text, replace every character that is not
    an ASCII letter with a space, tokenize, drop tokens found in
    ``stop_word_set`` and lemmatize what remains.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_word_set : collection of str
        Tokens to discard; a ``set`` keeps the membership test constant-time.
    lemmatizer : object
        Anything exposing ``lemmatize(token) -> str``.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (may be empty).
    """
    letters_only = re.sub('[^A-Za-z]', ' ', text.lower())
    kept_tokens = [tok for tok in word_tokenize(letters_only) if tok not in stop_word_set]
    return " ".join(lemmatizer.lemmatize(tok) for tok in kept_tokens)


# Build the lookup set once: testing membership against the stop-word *list*
# rescans the whole list for every token of every review.
stop_word_set = set(stop_words)
reviews = [clean_review(raw_text, stop_word_set, lm) for raw_text in data['verified_reviews']]


In [307]:
reviews[:10]

['love echo',
 'loved',
 'sometimes playing game answer question correctly alexa say got wrong answer like able turn light away home',
 'lot fun thing yr old learns dinosaur control light play game like category nice sound playing music well',
 'music',
 'received echo gift needed another bluetooth something play music easily accessible found smart speaker wait see else',
 'without cellphone use many feature ipad see use great alarm u r almost deaf hear alarm bedroom living room reason enough keep fun ask random question hear response seem smartbon politics yet',
 'think th one purchased working getting one every room house really like feature offer specifily playing music echo controlling light throughout house',
 'look great',
 'love listened song heard since childhood get news weather information great']

In [308]:
# Features are the cleaned review strings; targets are the feedback flags
# taken from the same rows as a NumPy array.
X, Y = reviews, data['feedback'].values

In [309]:
X

['love echo',
 'loved',
 'sometimes playing game answer question correctly alexa say got wrong answer like able turn light away home',
 'lot fun thing yr old learns dinosaur control light play game like category nice sound playing music well',
 'music',
 'received echo gift needed another bluetooth something play music easily accessible found smart speaker wait see else',
 'without cellphone use many feature ipad see use great alarm u r almost deaf hear alarm bedroom living room reason enough keep fun ask random question hear response seem smartbon politics yet',
 'think th one purchased working getting one every room house really like feature offer specifily playing music echo controlling light throughout house',
 'look great',
 'love listened song heard since childhood get news weather information great',
 'sent year old dad talk constantly',
 'love learning knew thing eveyday still figuring everything work far easy use understand make laugh time',
 'purchased mother knee problem giv

In [310]:
type(X)

list

In [311]:
X[0]

'love echo'

In [312]:
Y

array([1, 1, 1, ..., 1, 1, 1], shape=(2434,))

In [313]:
# Bag-of-words encoding with the vocabulary capped at 2000 terms.
cv = CountVectorizer(max_features=2000)
sparse_counts = cv.fit_transform(X)
vectors = sparse_counts.toarray()
vocabulary = cv.get_feature_names_out()
print(len(vocabulary))
len(vectors)

2000


2434

In [314]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(2434, 2000))

In [315]:
Y

array([1, 1, 1, ..., 1, 1, 1], shape=(2434,))

In [None]:
# Balancing the classes
# SMOTE synthesises additional minority-class rows so that both feedback
# classes end up with the same number of samples.
# NOTE(review): the resampling runs on the full feature matrix *before* the
# train/test split further down, so synthetic rows derived from samples that
# later land in the test set can end up in the training set (and vice versa).
# The reported test accuracies are therefore likely optimistic; consider
# splitting first and resampling only the training portion.
# NOTE(review): `X` and `Y` are rebound here (X was the list of cleaned review
# strings, Y the original labels), so this cell is not safe to re-run on its
# own and earlier cells that expect the text list must be re-run first.
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X,Y = smote.fit_resample(vectors, Y)


In [319]:
len(Y)

4428

In [321]:
# Tally the labels: entries equal to 0 count as negative, everything else as
# positive, then report each class's share of the total.
neg_count = sum(1 for label in Y if label == 0)
pos_count = len(Y) - neg_count
print(pos_count,neg_count)
print("Percentage of postive classes: {0}".format(pos_count/len(Y)*100))
print("Percentage of negitive classes: {0}".format(neg_count/len(Y)*100))

2214 2214
Percentage of postive classes: 50.0
Percentage of negitive classes: 50.0


In [322]:
from sklearn.model_selection import train_test_split

In [323]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
# Print the shape of each piece in the order train X, test X, train y, test y.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

(3542, 2000)
(886, 2000)
(3542,)
(886,)


In [324]:
x_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(886, 2000))

In [325]:
type(y_train)

numpy.ndarray

### MultinomialNB Classification Model

In [326]:
# Naive Bayes Classification
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Fit a multinomial Naive Bayes classifier with default hyperparameters on the
# training count vectors; the fitted estimator is the cell's displayed output.
MNB=MultinomialNB()
MNB.fit(x_train,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [327]:
# Predict on the training rows and show the predictions followed by their count.
mnb_pred_train_y = MNB.predict(x_train)
print(mnb_pred_train_y, len(mnb_pred_train_y), sep="\n")

[0 0 0 ... 0 0 1]
3542


In [328]:
# print('Training score',MNB.score(y_test,mnb_pred_train_y))
# print('Testing score',model1.score(x_test,y_test))


In [329]:
# Training-set accuracy of the Naive Bayes model, expressed as a percentage.
training_acc_mnb = 100 * accuracy_score(y_train, mnb_pred_train_y)
print("Training accquarcy of MNB Classification model {0}".format(training_acc_mnb))

Training accquarcy of MNB Classification model 93.53472614342179


In [330]:
# Predict on the held-out rows and show the predictions followed by their count.
mnb_pred_test_y = MNB.predict(x_test)
print(mnb_pred_test_y, len(mnb_pred_test_y), sep="\n")

[1 1 0 0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 1 1 0 0
 0 0 1 1 0 1 0 1 1 1 1 1 0 1 0 0 0 1 1 0 1 0 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0
 1 1 0 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 0 1 0 1 0 1 0 1 0 0 0 0 0 0 0 1 1
 0 1 1 0 0 1 0 0 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 0 0 0 0 1 1 1 1 0 1 1 0 0 0
 1 0 1 1 1 1 1 1 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1 0 0 1 1 1 1 0 0 1 0 0 0 1
 0 1 0 0 1 1 1 0 1 0 0 1 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1
 0 1 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 1 1 0 1 1 1 0 1 0 0 0 1
 1 0 0 0 1 0 0 0 1 0 1 1 0 1 0 1 0 0 1 0 0 1 1 0 0 1 1 0 1 0 0 1 0 1 1 0 1
 0 1 0 0 0 0 1 1 1 0 0 0 1 1 1 1 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 1 0 1 0 0 0
 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 0 0 1 1 1
 0 1 1 1 1 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1 1 0 1 0 1 0 1 0
 1 1 0 0 1 1 1 1 1 1 0 0 1 0 0 1 1 0 0 1 0 1 0 1 0 1 0 1 0 1 1 1 0 1 1 0 0
 0 0 0 1 0 1 1 0 0 1 0 1 0 1 1 1 0 1 1 1 1 1 0 1 0 0 0 0 1 1 0 1 0 0 1 0 1
 0 1 1 1 1 0 1 0 1 1 0 0 

In [331]:
# Test-set accuracy of the Naive Bayes model, expressed as a percentage.
# The message used to be labelled "Training", which mislabelled the held-out
# score as the training score.
testing_acc_mnb = accuracy_score(y_test,mnb_pred_test_y)*100
print("Testing accuracy of MNB Classification model {0}".format(testing_acc_mnb))

Training accquarcy of MNB Classification model 90.85778781038375


In [338]:
# New review preprocessing.
# Apply the same steps as the training reviews received (lowercase, replace
# non-letters with spaces, tokenize, drop stop words, lemmatize). The
# non-letter replacement was missing here, so punctuation or digits in a new
# review would have produced tokens the training text never contained.
new_review = 'product is very bad'.lower()
new_review = re.sub('[^A-Za-z]',' ',new_review)
new_review_tokens = word_tokenize(new_review)
new_review_tokens = [token for token in new_review_tokens if token not in stop_words]
new_reivew_lemmatized_tokens =[lm.lemmatize(i) for i in new_review_tokens ]
final_review =" ".join(new_reivew_lemmatized_tokens)

In [339]:
type(final_review)

str

In [340]:
final_review

'product bad'

In [341]:
# convert review into vector
final_review_vector = cv.transform([final_review]).toarray()
print(final_review_vector.shape)


(1, 2000)


In [342]:
pred_feedback_new_reivew = MNB.predict(final_review_vector)

In [343]:
pred_feedback_new_reivew

array([0])

### Random Forest Classifier Model

In [344]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are stochastic (bootstrap sampling and random feature
# subsets); fixing the seed makes the fitted model and the reported
# accuracies reproducible across runs, matching the seed used for the
# resampling and the train/test split.
RFC = RandomForestClassifier(random_state=42)

In [345]:
RFC.fit(x_train,y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [346]:
rfc_preds_y_train = RFC.predict(x_train)

In [347]:
# Training-set accuracy of the random forest, expressed as a percentage.
rfc_training_acc = 100 * accuracy_score(y_train, rfc_preds_y_train)
print("Training accuracy of RFC model {0}".format(rfc_training_acc))

Training accuracy of RFC model 98.6730660643704


In [348]:
rfc_preds_y_test = RFC.predict(x_test)

In [349]:
# Test-set accuracy of the random forest, expressed as a percentage.
rfc_testing_acc = 100 * accuracy_score(y_test, rfc_preds_y_test)
print("Testing accuracy of RFC model {0}".format(rfc_testing_acc))

Testing accuracy of RFC model 91.53498871331828


In [350]:
RFC.predict(final_review_vector)

array([0])

In [351]:
# New review preprocessing.
# Apply the same steps as the training reviews received (lowercase, replace
# non-letters with spaces, tokenize, drop stop words, lemmatize). The
# non-letter replacement was missing here, so punctuation or digits in a new
# review would have produced tokens the training text never contained.
new_review = 'product is very good'.lower()
new_review = re.sub('[^A-Za-z]',' ',new_review)
new_review_tokens = word_tokenize(new_review)
new_review_tokens = [token for token in new_review_tokens if token not in stop_words]
new_reivew_lemmatized_tokens =[lm.lemmatize(i) for i in new_review_tokens ]
final_review =" ".join(new_reivew_lemmatized_tokens)

In [352]:
# convert review into vector
final_review_vector = cv.transform([final_review]).toarray()
print(final_review_vector.shape)

(1, 2000)


In [353]:
RFC.predict(final_review_vector)

array([1])

### Logistic Regression

In [355]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier created with default hyperparameters.
LR_model = LogisticRegression()

In [356]:
LR_model.fit(x_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [357]:
LR_y_train_preds = LR_model.predict(x_train)

In [367]:
# Logistic regression accuracy on the training rows, as a percentage;
# the bare name on the last line displays the value.
lr_training_acc = 100 * accuracy_score(y_train, LR_y_train_preds)
lr_training_acc

94.97459062676454

In [359]:
LR_y_test_preds = LR_model.predict(x_test)

In [366]:
# Logistic regression accuracy on the held-out rows, as a percentage;
# the bare name on the last line displays the value.
lr_testing_acc = 100 * accuracy_score(y_test, LR_y_test_preds)
lr_testing_acc

88.93905191873588

In [368]:
# Side-by-side comparison table: one column per model, one row per data split.
acc_dict = {}
acc_dict['Logistic Regression'] = [lr_training_acc, lr_testing_acc]
acc_dict['MultinomialNB'] = [training_acc_mnb, testing_acc_mnb]
acc_dict['Random Forest'] = [rfc_training_acc, rfc_testing_acc]
row_labels = ['Training acc', 'Testing acc']
pd.DataFrame(acc_dict, index=row_labels)


Unnamed: 0,Logistic Regression,MultinomialNB,Random Forest
Training acc,94.974591,93.534726,98.673066
Testing acc,88.939052,90.857788,91.534989


In [369]:
# Random Forest has the best testing accuracy of the three models, so it is
# the one persisted to disk.
# NOTE(review): only the classifier is saved. Turning new text into the same
# feature vectors also needs the fitted CountVectorizer (`cv`), so it should
# be persisted alongside the model before this file is used for inference.
import pickle

# The context manager closes the file even if pickling raises.
with open('model.pkl','wb') as model_file:
    pickle.dump(RFC,model_file)