In [1]:
# KNN Amazon Baby Review.

import pandas as pd
import numpy as np
import nltk
import string
import numpy as np
import scipy.sparse as sparse

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from nltk.corpus import stopwords
from sklearn.neighbors import KNeighborsClassifier


In [2]:
#reading reviews using pandas library from amazon_baby_train.csv file
reviews = pd.read_csv('amazon_baby_train.csv')
reviews.shape

# dropping observations which are incomplete
reviews = reviews.dropna()
reviews.shape

# changing the reviews into positive and negative reviews
scores = reviews['rating']
reviews['rating'] = reviews['rating'].apply(lambda x: 1 if x > 3 else 0)

# printing the mean and standard deviation of ratings
print("The Mean of the Review Attribute is : ")
print(scores.mean())
print("The Standard Deviation of the Review Attribute is : ")
print(scores.std())

The Mean of the Review Attribute is : 
0.7642793999739596
The Standard Deviation of the Review Attribute is : 
0.4244498007103935


In [3]:
def splitPosNeg(Summaries):
    neg = reviews.loc[Summaries['rating'] == 0]
    pos = reviews.loc[Summaries['rating'] == 1]
    return [pos,neg]    


In [4]:
# splitting the positive and negative review and storing them in separate arrays
[pos,neg] = splitPosNeg(reviews)

In [5]:
# Preprocessing steps

# Using lemmatizer to lemmatizze words
lemmatizer = nltk.WordNetLemmatizer()

# using stop words to remove the words which do not contribute to the sentiment
stop = stopwords.words('english')
translation = str.maketrans(string.punctuation,' '*len(string.punctuation))

def preprocessing(line):
    tokens=[]
    line = line.translate(translation)
    line = nltk.word_tokenize(line.lower())
    #print(line)
    stops = stopwords.words('english')
    stops.remove('not')
    stops.remove('no')
    line = [word for word in line if word not in stops]
    for t in line:
        stemmed = lemmatizer.lemmatize(t)
        tokens.append(stemmed)
    return ' '.join(tokens)

In [6]:
# Storing the positive and negative reviews in separate arrays
pos_data = []
neg_data = []
for p in pos['review']:
    pos_data.append(preprocessing(p))

for n in neg['review']:
    neg_data.append(preprocessing(n))
print("Done")

Done


In [7]:
data = pos_data + neg_data
labels = np.concatenate((pos['rating'].values,neg['rating'].values))

In [8]:
#tokenizing each sentence from the file into words
t = []
for line in data:
    l = nltk.word_tokenize(line)
    for w in l:
        t.append(w)


In [9]:
# Calculating the frequency dstribution of each word
word_features = nltk.FreqDist(t)
print(len(word_features))

55558


In [12]:
# The most common 200 words
topwords = [fpair[0] for fpair in list(word_features.most_common(200))]
print(word_features.most_common(25))

[('not', 80912), ('baby', 70749), ('one', 66194), ('love', 52997), ('great', 47666), ('like', 45664), ('would', 45661), ('use', 42480), ('seat', 39416), ('get', 38306), ('month', 34560), ('time', 33391), ('little', 33166), ('easy', 32862), ('old', 31945), ('well', 30745), ('product', 30585), ('really', 28026), ('also', 27756), ('son', 26691), ('bought', 25451), ('work', 25281), ('no', 24775), ('good', 23749), ('much', 23651)]


In [13]:
#printing the top 20 most common words
word_his = pd.DataFrame(word_features.most_common(20), columns = ['words','count'])
print(word_his)

      words  count
0       not  80912
1      baby  70749
2       one  66194
3      love  52997
4     great  47666
5      like  45664
6     would  45661
7       use  42480
8      seat  39416
9       get  38306
10    month  34560
11     time  33391
12   little  33166
13     easy  32862
14      old  31945
15     well  30745
16  product  30585
17   really  28026
18     also  27756
19      son  26691


In [14]:
# Vectorizing the top words
vec = CountVectorizer()
c_fit = vec.fit_transform([' '.join(topwords)])

In [15]:
# Using Tfidf Transformer on the data
tf_vec = TfidfTransformer()
tf_fit = tf_vec.fit_transform(c_fit)

In [16]:
ctr_features = vec.transform(data)
tr_features = tf_vec.transform(ctr_features)

In [17]:
tr_features.shape

(145927, 193)

In [18]:
tr_features = tr_features.astype('int32')
print(tr_features.dtype)

int32


In [19]:
# Using KNN classifier to classify the data
clf =  KNeighborsClassifier()
clf = clf.fit(tr_features, labels)

In [21]:
lencheck= tr_features.shape
print(lencheck)

(145927, 193)


In [24]:
num_correct = 0;
newlen = lencheck[0]-1
for ch in range(0,newlen):
    checkPrediction = clf.predict(tr_features[ch])
    if(checkPrediction == [labels[ch]]):
        num_correct = num_correct+1;
print("Number of Correct")
print(num_correct)

accuracy = (num_correct/newlen)*100;
print("Training Accuracy");
print(accuracy);

Number of Correct
500
Training Accuracy
100.0


In [31]:
#reading reviews using pandas library from amazon_baby_test.csv file
reviews = pd.read_csv('amazon_baby_test.csv')
reviews.shape

# dropping observations which are incomplete
reviews = reviews.dropna()
reviews.shape

# changing the reviews into positive and negative reviews
scores = reviews['rating']
reviews['rating'] = reviews['rating'].apply(lambda x: 1 if x > 3 else 0)

# calculating the mean of reviews
scores.mean()

0.7622404476506569

In [32]:
# splitting the positive and negative review and storing them in separate arrays
[pos,neg] = splitPosNeg(reviews)

In [33]:
# Storing the positive and negative reviews in separate arrays
pos_data = []
neg_data = []
for p in pos['review']:
    pos_data.append(preprocessing(p))

for n in neg['review']:
    neg_data.append(preprocessing(n))
print("Done")

Done


In [34]:
# combining the positive and negative reviews
data = pos_data + neg_data
labels = np.concatenate((pos['rating'].values,neg['rating'].values))

In [35]:

#tokenizing each sentence from the file into words
t = []
for line in data:
    l = nltk.word_tokenize(line)
    for w in l:
        t.append(w)


In [36]:
# Calculating the frequency dstribution of each word
word_features = nltk.FreqDist(t)
print(len(word_features))

27828


In [37]:
# The most common 200 words
topwords = [fpair[0] for fpair in list(word_features.most_common(200))]
print(word_features.most_common(25))

[('not', 20502), ('baby', 17687), ('one', 16201), ('love', 13132), ('great', 11756), ('would', 11417), ('like', 11267), ('seat', 10442), ('use', 10437), ('get', 9549), ('month', 8510), ('little', 8383), ('time', 8267), ('easy', 8255), ('old', 7899), ('well', 7800), ('product', 7426), ('really', 6923), ('also', 6870), ('son', 6468), ('work', 6259), ('bought', 6186), ('no', 6051), ('good', 5950), ('much', 5944)]


In [38]:
# Vectorizing the top words
vec = CountVectorizer()
c_fit = vec.fit_transform([' '.join(topwords)])

In [39]:
# Using Tfidf Transformer on the data
tf_vec = TfidfTransformer()
tf_fit = tf_vec.fit_transform(c_fit)

In [40]:
# Transforming the features using Tfidf transformer
cte_features = vec.transform(data)
te_features = tf_vec.transform(cte_features)

In [41]:
te_features.shape

(36457, 193)

In [43]:
num_correct = 0;
newlen = lencheck[0]-1
for ch in range(0,newlen):
    checkPrediction = clf.predict(te_features[ch])
    if(checkPrediction == [labels[ch]]):
        num_correct = num_correct+1;
print("Number of Correct")
print(num_correct)

accuracy = (num_correct/newlen)*100;
print("Testing Accuracy");
print(accuracy);

Number of Correct
8
Testing Accuracy
80.0
