In [2]:
import pandas as pd
data=pd.read_csv('/kindle_reviews.csv', engine='python',
    on_bad_lines='skip')
data.head()

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200


In [3]:
data=data[['reviewText','overall']]

In [4]:
data.shape


(121338, 2)

In [5]:

data['overall'].unique()

array([5, 4, 3, 2, 1])

In [6]:
data['overall'].value_counts()

Unnamed: 0_level_0,count
overall,Unnamed: 1_level_1
5,57757
4,35895
3,16657
2,6543
1,4486


In [7]:
data = data.copy()
data['overall'] = pd.to_numeric(data['overall'], errors='coerce')


In [8]:
#preprocessing and cleaning
#positive review is 1 and negative review is 0
data.loc[:, 'overall'] = (data['overall'] >= 3).astype(int)
data.head()

Unnamed: 0,reviewText,overall
0,I enjoy vintage books and movies so I enjoyed ...,1
1,This book is a reissue of an old one; the auth...,1
2,This was a fairly interesting read. It had ol...,1
3,I'd never read any of the Amy Brewster mysteri...,1
4,"If you like period pieces - clothing, lingo, y...",1


In [9]:
data['overall'].value_counts()

Unnamed: 0_level_0,count
overall,Unnamed: 1_level_1
1,110309
0,11029


In [10]:
data['overall'].unique()

array([1, 0])

In [11]:
#lower all case
data['reviewText']=data['reviewText'].str.lower()
data.head()

Unnamed: 0,reviewText,overall
0,i enjoy vintage books and movies so i enjoyed ...,1
1,this book is a reissue of an old one; the auth...,1
2,this was a fairly interesting read. it had ol...,1
3,i'd never read any of the amy brewster mysteri...,1
4,"if you like period pieces - clothing, lingo, y...",1


In [12]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    text = " ".join(word for word in text.split() if word not in stop_words)
    text = " ".join(text.split())
    return text

data['reviewText'] = data['reviewText'].apply(clean_text)


In [14]:
data.head()

Unnamed: 0,reviewText,overall
0,enjoy vintage books movies enjoyed reading boo...,1
1,book reissue old one author born era say nero ...,1
2,fairly interesting read old style terminologyi...,1
3,id never read amy brewster mysteries one reall...,1
4,like period pieces clothing lingo enjoy myster...,1


In [10]:
data_small = data.sample(10000, random_state=42)
data_small.shape


(10000, 2)

In [11]:
#apply lemmatizer
from nltk.stem import WordNetLemmatizer


In [12]:
lemmatizer=WordNetLemmatizer()

In [13]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
def lemmatize_words(text):
  return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [15]:
data_small['reviewText']=data_small['reviewText'].apply(lambda x:lemmatize_words(x))

In [18]:
data_small.head()
data_small.shape

(10000, 2)

In [19]:
from sklearn.model_selection import train_test_split

X = data_small['reviewText']
y = data_small['overall']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)


In [20]:
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer()
X_train_bow=bow.fit_transform(X_train).toarray()
X_test_bow=bow.transform(X_test).toarray()


In [50]:
from sklearn.feature_extraction.text import TfidfTransformer

# Get count vectors using the 'bow' CountVectorizer fitted in DPfW1_gFLm_m
# This ensures that the vocabulary learned by the CountVectorizer is used.
X_train_counts = bow.transform(X_train)
X_test_counts = bow.transform(X_test)

# Instantiate TfidfTransformer
tfidf_transformer = TfidfTransformer()

# Apply TfidfTransformer to the count vectors to get TF-IDF features
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts).toarray()
X_test_tfidf = tfidf_transformer.transform(X_test_counts).toarray()

In [22]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [42]:
from sklearn.naive_bayes import GaussianNB
nb_model_bow=GaussianNB().fit(X_train_bow,y_train)
nb_model_tfidf=GaussianNB().fit(X_train_tfidf,y_train)

In [43]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [44]:
y_pred_bow=nb_model_bow.predict(X_test_bow)


In [51]:
y_pred_tfid=nb_model_tfidf.predict(X_test_tfidf)


In [46]:
confusion_matrix(y_test, y_pred_bow)

array([[  62,  131],
       [ 355, 1452]])

In [54]:
print("BOW accuracy: ", accuracy_score(y_test,y_pred_bow))

BOW accuracy:  0.757


In [52]:
confusion_matrix(y_test, y_pred_tfid)

array([[  63,  130],
       [ 355, 1452]])

In [53]:
print("TFIDF accuracy: ", accuracy_score(y_test,y_pred_tfid))

TFIDF accuracy:  0.7575
