# Load the Kindle Review Dataset

In [51]:
# Load the data
import pandas as pd
kindle_data=pd.read_csv('all_kindle_review.csv')
kindle_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [3]:
kindle_data=kindle_data[['reviewText','rating']]
kindle_data.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",3
1,Great short read. I didn't want to put it dow...,5
2,I'll start by saying this is the first of four...,3
3,Aggie is Angela Lansbury who carries pocketboo...,3
4,I did not expect this type of book to be in li...,4


In [4]:
kindle_data.shape

(12000, 2)

In [5]:
kindle_data.isnull().sum()

Unnamed: 0,0
reviewText,0
rating,0


In [6]:
kindle_data['rating'].unique()

array([3, 5, 4, 2, 1])

In [7]:
kindle_data['rating'].value_counts()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
5,3000
4,3000
3,2000
2,2000
1,2000


# Text Preprocessing

In [8]:
# Positive review is 1 and negative review is o
kindle_data['rating']=kindle_data['rating'].apply(lambda x:1 if x>3 else 0)
kindle_data['rating'].unique()

array([0, 1])

In [9]:
kindle_data.head()

Unnamed: 0,reviewText,rating
0,"Jace Rankin may be short, but he's nothing to ...",0
1,Great short read. I didn't want to put it dow...,1
2,I'll start by saying this is the first of four...,0
3,Aggie is Angela Lansbury who carries pocketboo...,0
4,I did not expect this type of book to be in li...,1


In [10]:
# Convert to lower case
kindle_data['reviewText']=kindle_data['reviewText'].str.lower() # str convert pandas series into string

In [11]:
kindle_data['reviewText'].head()

Unnamed: 0,reviewText
0,"jace rankin may be short, but he's nothing to ..."
1,great short read. i didn't want to put it dow...
2,i'll start by saying this is the first of four...
3,aggie is angela lansbury who carries pocketboo...
4,i did not expect this type of book to be in li...


In [12]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords') #downloading all stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
from bs4 import BeautifulSoup

In [14]:
## Removing special characters
kindle_data['reviewText']=kindle_data['reviewText'].apply(lambda x:re.sub('[^a-z A-z 0-9-]+', '',x))
## Remove the stopswords
kindle_data['reviewText']=kindle_data['reviewText'].apply(lambda x:" ".join([y for y in x.split() if y not in stopwords.words('english')]))
## Remove url
kindle_data['reviewText']=kindle_data['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x)))
## Remove html tags
kindle_data['reviewText']=kindle_data['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())
## Remove any additional spaces
kindle_data['reviewText']=kindle_data['reviewText'].apply(lambda x: " ".join(x.split()))


In [15]:
kindle_data.head()

Unnamed: 0,reviewText,rating
0,jace rankin may short hes nothing mess man hau...,0
1,great short read didnt want put read one sitti...,1
2,ill start saying first four books wasnt expect...,0
3,aggie angela lansbury carries pocketbooks inst...,0
4,expect type book library pleased find price right,1


In [16]:
# Lemmatizer
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [17]:
def lemmatize_words(text):
  return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [18]:
kindle_data['reviewText']=kindle_data['reviewText'].apply(lambda x:lemmatize_words(x))

# Train-Test Split

In [19]:
kindle_data.head()

Unnamed: 0,reviewText,rating
0,jace rankin may short he nothing mess man haul...,0
1,great short read didnt want put read one sitti...,1
2,ill start saying first four book wasnt expecti...,0
3,aggie angela lansbury carry pocketbook instead...,0
4,expect type book library pleased find price right,1


In [20]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(kindle_data['reviewText'],kindle_data['rating'],test_size=0.2,random_state=42)
#

# Text Representation and Model Implementation


### BOW with Naïve Bayes and Random Forest

In [56]:
# Covert words to vector
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer()
X_train_bow=bow.fit_transform(X_train).toarray()
X_test_bow=bow.transform(X_test).toarray()
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

##### Naïve Bayes

In [58]:
# Naive Bayes
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

nb_model_bow=GaussianNB().fit(X_train_bow,y_train)
y_pred_bow_nb=nb_model_bow.predict(X_test_bow)
print("Accuracy of BOW with Nb: ", accuracy_score(y_test,y_pred_bow_nb))
confusion_matrix(y_test,y_pred_bow_nb)

Accuracy of BOW with Nb:  0.5895833333333333


array([[913, 277],
       [708, 502]])

##### Random Forest

In [59]:
# Random Forest
model = RandomForestClassifier()
model.fit(X_train_bow, y_train)
y_pred_bow_rf = model.predict(X_test_bow)
print("Accuracy of BOW using RF: ", accuracy_score(y_test,y_pred_bow_rf))
confusion_matrix(y_test,y_pred_bow_rf)

Accuracy of Word2vec using RF:  0.8008333333333333


array([[995, 195],
       [283, 927]])

### TF-IDF with Naïve Bayes and Random Forest

In [22]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf=TfidfVectorizer()
X_train_tfidf=tf_idf.fit_transform(X_train).toarray()
X_test_tfidf=tf_idf.transform(X_test).toarray()

In [23]:
X_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

##### Naive Bayes

In [60]:
# Naive Bayes
nb_model_tfidf=GaussianNB().fit(X_train_tfidf,y_train)
y_pred_tfidf_nb=nb_model_tfidf.predict(X_test_tfidf)
print("Accuracy of TFIDF with Nb: ", accuracy_score(y_test,y_pred_tfidf_nb))
confusion_matrix(y_test,y_pred_tfidf_nb)

Accuracy of TFIDF with Nb:  0.6266666666666667


array([[808, 382],
       [514, 696]])

##### Random Forest

In [61]:
# Random Forest
model = RandomForestClassifier()
model.fit(X_train_tfidf, y_train)
y_pred_tfidf_rf = model.predict(X_test_tfidf)
print("Accuracy of Word2vec using RF: ", accuracy_score(y_test,y_pred_tfidf_rf))
confusion_matrix(y_test,y_pred_tfidf_rf)

Accuracy of Word2vec using RF:  0.7954166666666667


array([[976, 214],
       [277, 933]])

### Word2Vec with Naïve Bayes and Random Forest

In [37]:
# Word2vec
!pip install gensim # library for Word2vec



In [36]:
from gensim.models import Word2Vec, KeyedVectors


In [38]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300') # Google’s Pre-trained Word2Vec (300-dimensional vectors)



In [42]:
import numpy as np
import gensim.downloader as api
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')
# Function to Convert Sentences into Vectors
def get_avg_word2vec(sentence, model, vector_size=300):
    words = word_tokenize(sentence)  # Tokenize the sentence
    word_vectors = [model[word] for word in words if word in model]  # Get word vectors if present in model

    if len(word_vectors) == 0:
        return np.zeros(vector_size)  # Return a zero vector if no words in the sentence are in the model

    return np.mean(word_vectors, axis=0)  # Take the average of word vectors

# Convert X_train & X_test into Word2Vec Vectors
X_train_w2v = np.array([get_avg_word2vec(sent, wv, vector_size=300) for sent in X_train])
X_test_w2v = np.array([get_avg_word2vec(sent, wv, vector_size=300) for sent in X_test])


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


##### Naive Bayes

In [65]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

nb_model_bow=GaussianNB().fit(X_train_w2v,y_train)
y_pred_w2v_nb=nb_model_bow.predict(X_test_w2v)
print("Accuracy of Word2vec using Nb: ", accuracy_score(y_test,y_pred_w2v_nb))
confusion_matrix(y_test,y_pred_w2v_nb)

Accuracy of Word2vec using Nb:  0.745


array([[983, 207],
       [405, 805]])

##### Random Forest

In [66]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train_w2v, y_train)
y_pred_w2v_rf = model.predict(X_test_w2v)
print("Accuracy of Word2vec using RF: ", accuracy_score(y_test,y_pred_w2v_rf))
confusion_matrix(y_test,y_pred_w2v_rf)

Accuracy of Word2vec using RF:  0.7791666666666667


array([[938, 252],
       [278, 932]])