In [1]:
from textblob.classifiers import NaiveBayesClassifier
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd
# import the data and take a look
df = pd.read_csv("Reviews.csv")
list(df)
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [2]:
df.groupby('Recommended IND').size()

Recommended IND
0     4172
1    19314
dtype: int64

In [3]:
def num_missing(df):
     return sum(df.isnull())
df.apply(num_missing, axis=0)

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [4]:
df = df.dropna()

In [5]:
df['Complete Review'] = df['Title'] + ' ' + df['Review Text']

In [6]:
np.random.seed(0)
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]

In [7]:
subset_train = train[['Complete Review', 'Recommended IND']]
tuples_train = [tuple(x) for x in subset_train.values]
subset_test = test[['Complete Review', 'Recommended IND']]
tuples_test = [tuple(x) for x in subset_test.values]

In [8]:
tuples_train[0:3]

[('Some major design flaws I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i could not zip it up! i reordered it in petite medium, which was just ok. overall, the top half was comfortable and fit nicely, but the bottom half had a very tight under layer and several somewhat cheap (net) over layers. imo, a major design flaw was the net over layer sewn directly into the zipper - it c',
  0),
 ("My favorite buy! I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing but great compliments!",
  1),
 ('Flattering shirt This shirt is very flattering to all due to the adjustable front tie. it is the perfect length to wear with leggings and it is sleeveless so it pairs well with any cardigan. love this shirt!!!',
  1)]

In [9]:
cl = NaiveBayesClassifier(tuples_train)

In [10]:
predicted_classifications = []
def make_predictions():
    for i in range(len(tuples_test)):
             classification = cl.classify(tuples_test[i][0])
             predicted_classifications.append(classification)
make_predictions()

In [11]:
predicted_classifications[0:9]

[1, 0, 1, 1, 0, 1, 1, 1, 1]

In [12]:
def accuracy():
    countCorrect = 0
    for i in range(len(tuples_test)):
        if predicted_classifications[i] == tuples_test[i][1]:
            countCorrect += 1
    accuracy = (countCorrect / len(predicted_classifications)) * 100
    print('Classifier is correct', accuracy, 'of the time')
accuracy()

Classifier is correct 89.98978549540347 of the time


In [13]:
percent_positive = len(df[df['Recommended IND'] == 1]) / len(df) * 100
print(percent_positive, 'of reviews are positive')

81.81771945885464 of reviews are positive


In [14]:
actual = pd.Series([x[1] for x in tuples_test])
predicted = pd.Series(predicted_classifications)
pd.crosstab(actual, predicted, rownames = ['Actual'], colnames=['Predicted'])

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,557,179
1,213,2967


In [15]:
def return_incorrect():
        incorrectIndexes = []
        for i in range(len(tuples_test)):
            if predicted_classifications[i] != tuples_test[i][1]:
                incorrectIndexes.append(i)
        return incorrectIndexes
return_incorrect()

[11,
 16,
 42,
 50,
 54,
 75,
 81,
 95,
 96,
 97,
 105,
 107,
 128,
 138,
 146,
 157,
 161,
 176,
 180,
 182,
 183,
 187,
 188,
 206,
 222,
 229,
 242,
 249,
 257,
 270,
 277,
 282,
 283,
 288,
 307,
 312,
 322,
 331,
 338,
 344,
 353,
 359,
 390,
 394,
 402,
 406,
 410,
 412,
 415,
 420,
 422,
 437,
 448,
 451,
 460,
 467,
 495,
 496,
 527,
 528,
 533,
 534,
 537,
 584,
 586,
 595,
 610,
 615,
 622,
 623,
 646,
 653,
 665,
 683,
 684,
 698,
 703,
 712,
 719,
 720,
 722,
 731,
 734,
 736,
 738,
 768,
 781,
 799,
 819,
 820,
 823,
 825,
 842,
 843,
 851,
 855,
 857,
 860,
 864,
 869,
 879,
 895,
 912,
 917,
 931,
 932,
 936,
 945,
 956,
 959,
 971,
 980,
 982,
 984,
 1021,
 1023,
 1039,
 1043,
 1050,
 1076,
 1080,
 1091,
 1102,
 1109,
 1117,
 1119,
 1122,
 1134,
 1170,
 1173,
 1188,
 1189,
 1200,
 1211,
 1222,
 1225,
 1233,
 1239,
 1240,
 1242,
 1252,
 1257,
 1259,
 1273,
 1290,
 1328,
 1336,
 1344,
 1346,
 1354,
 1355,
 1362,
 1386,
 1389,
 1395,
 1402,
 1440,
 1452,
 1472,
 1489,
 154

In [16]:
tuples_test[4]
tuples_test[19]

("Beautiful Love this skirt. the detail is amazing. runs small i ordered a 12 i'm usually a 10, but still a little snug.",
 1)