In [1]:
# Standard library
import re
import string
import warnings
from collections import Counter

# Data handling
import numpy as np
import pandas as pd

# Text processing
import nltk
from wordcloud import WordCloud,STOPWORDS

# Machine learning
import sklearn
from sklearn import neighbors
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.metrics import f1_score, roc_auc_score,roc_curve
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Ignore warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the review dataset from CSV; the 'reviews' and 'Sentiments' columns
# are the ones used for modelling further down.
toys_reviews7 = pd.read_csv('baby_nlp_final.csv')
# Preview the first two rows.
toys_reviews7.head(2)

Unnamed: 0.1,Unnamed: 0,reviewerID,asin,overall,reviewTime,helpful_score,reviews,Sentiment_score,Sentiments
0,0,A1HK2FQW6KXQB2,097293751X,5,2013-07-16,0.0,perfect new parent able keep track baby feed s...,0.7579,Extreme Positive
1,1,A19K65VY14D13R,097293751X,5,2013-06-29,0.0,book life saver helpful able go back track tre...,0.3818,Positive


In [3]:
# Discard every row that has a missing value in any column.
# NOTE(review): only 'reviews' and 'Sentiments' are used later; confirm that
# dropping rows for NaNs in the other columns is intended.
toys_reviews7 = toys_reviews7.dropna(axis=0)

In [4]:
# Features are the review texts, targets are the sentiment labels.
x = toys_reviews7['reviews']
y = toys_reviews7['Sentiments']

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
trainx, testx, trainy, testy = train_test_split(x, y, test_size=0.4, random_state=20)
print("trainx={}, trainy={},testx={}, testy={}".format(*map(len, (trainx, trainy, testx, testy))))

# TF-IDF features capped at 2500 terms; terms appearing in more than half of
# the training documents are ignored (max_df=0.5).
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
    min_df=1,
    use_idf=True,
    smooth_idf=True,
    lowercase=True,
    max_features=2500,
)

# Fit the vocabulary on the training split only, then reuse it for the test split.
trainx_v = vectorizer.fit_transform(trainx)
testx_v = vectorizer.transform(testx)
print(trainx_v.shape, testx_v.shape, sep="\n")

trainx=95619, trainy=95619,testx=63746, testy=63746
(95619, 2500)
(63746, 2500)


In [5]:
# Convert the TF-IDF matrices to dense arrays for the estimators below.
# The hasattr guard makes this cell safe to re-run: once the names hold dense
# arrays there is no .toarray() to call, and an unguarded call would raise
# AttributeError on the second execution.
# NOTE(review): the dense copies are large (rows x 2500 floats); presumably only
# GaussianNB needs dense input - consider densifying just for that model.
if hasattr(trainx_v, "toarray"):
    trainx_v = trainx_v.toarray()
if hasattr(testx_v, "toarray"):
    testx_v = testx_v.toarray()

### Naive Bayes

In [6]:
# Fit Gaussian Naive Bayes on the training features.
nb = GaussianNB()
nb.fit(trainx_v, trainy)
# Training-set accuracy as a percentage, rounded to two decimals.
scorenb = round(100 * nb.score(trainx_v, trainy), 2)
# Predicted labels for the held-out test split.
y_prednb = nb.predict(testx_v)


In [7]:
def evaluate(actual,predicted):
    """Print accuracy, a confusion matrix and a classification report.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    predicted : array-like
        Labels predicted by a model, in the same order as ``actual``.

    Returns
    -------
    None
        All results are printed to stdout.
    """
    print('Accuracy:',accuracy_score(actual,predicted))
    print('------------------------------------------------------------')
    # Rows are the actual labels, columns the predicted ones; margins=True adds the "All" totals.
    print('\n Confusion Matrix: \n',pd.crosstab(actual,predicted,margins=True))
    print('------------------------------------------------------------')
    # Score against the `actual` argument rather than a notebook-level global,
    # so the report always matches the labels that were passed in.
    print("\nclassification report: \n",classification_report(actual, predicted))

In [8]:
# Print test-set metrics for the Gaussian Naive Bayes predictions.
evaluate(testy,y_prednb)

Accuracy: 0.22997521413108274
------------------------------------------------------------

 Confusion Matrix: 
 col_0             Extreme Negative  Extreme Positive  Negative  Neutral  \
Sentiments                                                                
Extreme Negative               611                18       103       55   
Extreme Positive             24672             13288      5335     6843   
Negative                      1263                55       215      171   
Neutral                       1388               101       237      269   
Positive                      4551               557       795      937   
All                          32485             14019      6685     8275   

col_0             Positive    All  
Sentiments                         
Extreme Negative        19    806  
Extreme Positive      1879  52017  
Negative                39   1743  
Neutral                 68   2063  
Positive               277   7117  
All                   2282  63746 

### Decision tree classifier

In [9]:
# Small decision tree (depth <= 6, at most 10 leaves).
# random_state is pinned (same seed as the train/test split) so that
# re-running the notebook yields the same tree.
dtm = DecisionTreeClassifier(max_depth=6,min_samples_split=5,max_leaf_nodes=10,random_state=20)
# fit() returns the fitted estimator, so dtm1 refers to the same object as dtm.
dtm1=dtm.fit(trainx_v,trainy)
y_pred_dt=dtm1.predict(testx_v)


In [10]:
# Print test-set metrics for the decision tree predictions.
evaluate(testy,y_pred_dt)

Accuracy: 0.8160041414363254
------------------------------------------------------------

 Confusion Matrix: 
 col_0             Extreme Positive    All
Sentiments                               
Extreme Negative               806    806
Extreme Positive             52017  52017
Negative                      1743   1743
Neutral                       2063   2063
Positive                      7117   7117
All                          63746  63746
------------------------------------------------------------

classification report: 
                   precision    recall  f1-score   support

Extreme Negative       0.00      0.00      0.00       806
Extreme Positive       0.82      1.00      0.90     52017
        Negative       0.00      0.00      0.00      1743
         Neutral       0.00      0.00      0.00      2063
        Positive       0.00      0.00      0.00      7117

        accuracy                           0.82     63746
       macro avg       0.16      0.20      0.18     63746

### Logistic regression (cross-validated)

In [11]:
# Multinomial logistic regression tuned with 5-fold cross-validation.
# LogisticRegressionCV is imported with the other dependencies in the first cell.
clf = LogisticRegressionCV(cv=5, random_state=0, multi_class='multinomial')
clf.fit(trainx_v, trainy)
y_pred_lrcv = clf.predict(testx_v)

In [12]:
# Print test-set metrics for the cross-validated logistic regression predictions.
evaluate(testy,y_pred_lrcv)

Accuracy: 0.8820318137608635
------------------------------------------------------------

 Confusion Matrix: 
 col_0             Extreme Negative  Extreme Positive  Negative  Neutral  \
Sentiments                                                                
Extreme Negative               326                45       295       91   
Extreme Positive                17             50769        50       75   
Negative                       230               139       563      370   
Neutral                         61               216       294      452   
Positive                        75              2348       228      350   
All                            709             53517      1430     1338   

col_0             Positive    All  
Sentiments                         
Extreme Negative        49    806  
Extreme Positive      1106  52017  
Negative               441   1743  
Neutral               1040   2063  
Positive              4116   7117  
All                   6752  63746  

### AdaBoost

In [13]:
# AdaBoost with 40 boosting rounds.
# random_state is pinned (same seed as the train/test split) so that
# re-running the notebook reproduces the same ensemble.
ada = AdaBoostClassifier(n_estimators=40, learning_rate=1, random_state=20).fit(trainx_v,trainy)
y_pred_ada=ada.predict(testx_v)

In [14]:
# Print test-set metrics for the AdaBoost predictions.
evaluate(testy,y_pred_ada)

Accuracy: 0.8260753615913156
------------------------------------------------------------

 Confusion Matrix: 
 col_0             Extreme Negative  Extreme Positive  Negative  Neutral  \
Sentiments                                                                
Extreme Negative               140               251       133       14   
Extreme Positive                38             51037        46        2   
Negative                        95               766        96       13   
Neutral                         26              1020        43        1   
Positive                        45              5635        47        5   
All                            344             58709       365       35   

col_0             Positive    All  
Sentiments                         
Extreme Negative       268    806  
Extreme Positive       894  52017  
Negative               773   1743  
Neutral                973   2063  
Positive              1385   7117  
All                   4293  63746  

### Logistic regression (one-vs-rest)

In [15]:
# Base binary classifier that the one-vs-rest wrapper fits once per class.
m1 = LogisticRegression(max_iter=100)
m1_ova = OneVsRestClassifier(estimator=m1).fit(X=trainx_v, y=trainy)

In [16]:
# Predict with the one-vs-rest logistic model, then show five rows
# (positions 1045-1049) of actual vs. predicted labels as a spot check.
p1 = m1_ova.predict(testx_v)
pd.DataFrame({'Actual': testy, "predicted": p1}).iloc[1045:1050]

Unnamed: 0,Actual,predicted
126104,Extreme Positive,Extreme Positive
25660,Extreme Positive,Extreme Positive
92825,Extreme Positive,Extreme Positive
49136,Extreme Positive,Extreme Positive
42273,Extreme Positive,Extreme Positive


In [17]:
# Print test-set metrics for the one-vs-rest logistic regression predictions.
evaluate(testy,p1)

Accuracy: 0.8502337401562451
------------------------------------------------------------

 Confusion Matrix: 
 col_0             Extreme Negative  Extreme Positive  Negative  Neutral  \
Sentiments                                                                
Extreme Negative               119               121       138       16   
Extreme Positive                 0             51720        12        2   
Negative                        51               408        93       26   
Neutral                          8               560        44       30   
Positive                         8              4836        20       16   
All                            186             57645       307       90   

col_0             Positive    All  
Sentiments                         
Extreme Negative       412    806  
Extreme Positive       283  52017  
Negative              1165   1743  
Neutral               1421   2063  
Positive              2237   7117  
All                   5518  63746  

### Multinomial Naive Bayes (one-vs-rest)

In [18]:
# Multinomial Naive Bayes as the base estimator of a one-vs-rest wrapper.
m2 = MultinomialNB()
ovr2 = OneVsRestClassifier(estimator=m2).fit(X=trainx_v, y=trainy)

In [19]:
# Predict with the one-vs-rest MultinomialNB model and show the first ten
# rows of actual vs. predicted labels.
p2 = ovr2.predict(testx_v)
pd.DataFrame({'Actual': testy, "predicted": p2}).iloc[:10]

Unnamed: 0,Actual,predicted
71210,Extreme Positive,Extreme Positive
54941,Extreme Positive,Extreme Positive
49085,Extreme Positive,Extreme Positive
122131,Extreme Positive,Extreme Positive
143013,Extreme Positive,Extreme Positive
9908,Extreme Positive,Extreme Positive
154144,Extreme Positive,Extreme Positive
129334,Extreme Positive,Extreme Positive
53108,Extreme Positive,Extreme Positive
1422,Extreme Positive,Extreme Positive


In [20]:
# Print test-set metrics for the one-vs-rest MultinomialNB predictions.
evaluate(testy,p2)

Accuracy: 0.8160041414363254
------------------------------------------------------------

 Confusion Matrix: 
 col_0             Extreme Negative  Extreme Positive  Negative  Positive  \
Sentiments                                                                 
Extreme Negative                 1               769         6        30   
Extreme Positive                 0             52012         0         5   
Negative                         0              1736         1         6   
Neutral                          0              2062         0         1   
Positive                         0              7114         0         3   
All                              1             63693         7        45   

col_0               All  
Sentiments               
Extreme Negative    806  
Extreme Positive  52017  
Negative           1743  
Neutral            2063  
Positive           7117  
All               63746  
------------------------------------------------------------

classific