In [1]:
from math import sqrt
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# English stop-word list held as a set; remove_stopwords tests membership against it.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be available locally — confirm.
stop_words = set(stopwords.words('english'))

In [2]:
# Read the review table from the working directory and preview its first rows.
data = pd.read_csv(filepath_or_buffer="sub.csv")
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,unique_id,id,product_name,product_type,helpful,rating,title,date,reviewer,location,text,MaxTrait
0,1,B0007QCQA4:good_sneakers:christopher_w._damico...,B0007QCQA4,adidas Originals Men's Superstar II Basketball...,apparel,0 of 1,4,GOOD SNEAKERS,"July 15, 2006","Christopher W. Damico ""MACMAN""",NYC,GOOD LOOKING KICKS IF YOUR KICKIN IT OLD SCHOO...,Cat
1,2,"B0002Y2JYY:pretty_good:sharon_civile_""jackbaue...",B0002Y2JYY,Elite Metal Aviator Sunglasses with Mirror Len...,apparel,3 of 5,4,Pretty Good,"August 13, 2006","Sharon Civile ""Jackbauerfreak""","Philadelphia, PA",These sunglasses are all right. They were a li...,Cat
2,3,B0002X9A5G:can't_go_wrong_at_this_price:j._gou...,B0002X9A5G,5-Pack Bodysuits: Apparel,apparel,1 of 1,5,Can't go wrong at this price,"May 18, 2006","J. Gould ""south_paw712""",KY,I don't see the difference between these bodys...,Cat
3,4,B0002X9A5G:green!:s._feldman,B0002X9A5G,5-Pack Bodysuits: Apparel,apparel,0 of 1,5,Green!,"February 28, 2006",S. Feldman,"Virginia, United States",Very nice basic clothing. I think the size is ...,Cat
4,5,B0006UHRJQ:perfect!:amanda_kathleen,B0006UHRJQ,3-Pack Straight Edge (non-skid) Socks: Apparel,apparel,8 of 8,5,perfect!,"December 15, 2005",Amanda Kathleen,"Delaware, USA",I love these socks. They fit great (my 15 mont...,Cat


In [3]:
# Binary label per review: 1 when the rating is strictly above 3, otherwise 0.
sentiment = data["rating"].gt(3).astype(int)

In [4]:
# Modelling frame: the grouping column (MaxTrait), the raw review text and the
# binary sentiment label. Models are later fitted on all rows and per MaxTrait group.
model_data = data[["MaxTrait", "text"]].assign(sentiment=sentiment)
model_data.head()

Unnamed: 0,MaxTrait,text,sentiment
0,Cat,GOOD LOOKING KICKS IF YOUR KICKIN IT OLD SCHOO...,1
1,Cat,These sunglasses are all right. They were a li...,1
2,Cat,I don't see the difference between these bodys...,1
3,Cat,Very nice basic clothing. I think the size is ...,1
4,Cat,I love these socks. They fit great (my 15 mont...,1


In [5]:
def remove_pun(element):
    """Return `element` with every character of `string.punctuation` deleted.

    All other characters, including whitespace and digits, are kept as they are.
    """
    from string import punctuation
    # A translation table that maps a code point to None deletes that character.
    drop_table = dict.fromkeys(map(ord, punctuation))
    return element.translate(drop_table)

In [6]:
def remove_stopwords(element):
    """Lower-case `element` and drop every whitespace-separated word found in `stop_words`.

    The surviving words are joined back together with single spaces.
    """
    kept = []
    for token in element.split():
        lowered = token.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return " ".join(kept)

In [7]:
# Clean the review text in two passes: strip punctuation, then lower-case and
# drop stop words.
# NOTE(review): punctuation is removed first, so contractions lose their apostrophe
# before the stop-word lookup (the preview below shows "dont" surviving) — confirm
# whether that is intended.
model_data["text"] = (
    model_data["text"]
    .apply(remove_pun)
    .apply(remove_stopwords)
)

In [8]:
# Preview the frame after the text column has been cleaned.
model_data.head()

Unnamed: 0,MaxTrait,text,sentiment
0,Cat,good looking kicks kickin old school like comf...,1
1,Cat,sunglasses right little crooked still cool,1
2,Cat,dont see difference bodysuits expensive ones f...,1
3,Cat,nice basic clothing think size fine really lik...,1
4,Cat,love socks fit great 15 month old daughter thi...,1


In [9]:
#Generate training test split
#0.8 training 0.2 test
#all data
np.random.seed(13)
# One uniform draw per row: draws <= 0.2 go to the test set, the rest to training,
# so the 80/20 ratio holds in expectation rather than exactly.
array = np.random.rand(model_data.shape[0])
train = array > 0.2
test = array <= 0.2
train = model_data[train]
test  = model_data[test]
#Building TF-IDF Naive Bayes Classifier
x_train = train["text"].values
y_train = train["sentiment"].values
x_test = test["text"].values
y_test = test["sentiment"].values
#TF-IDF
# The vocabulary and IDF weights are fitted on the training text only and then
# reused unchanged for the test text.
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(x_train)
test_vectors = vectorizer.transform(x_test)
print(train_vectors.shape, test_vectors.shape)
#Train basic Naive Bayes Classifier
nb = MultinomialNB().fit(train_vectors, y_train)
#predict values
predicted = nb.predict(test_vectors)
#Use array to store all data
# `accuracy` and `g_mean` start as the "All" entry; the per-trait cell extends them.
accuracy = np.array(accuracy_score(y_test, predicted))
# Pin the label order so the matrix is always 2x2 (rows = true class,
# columns = predicted class), even when a class is missing from this split.
cm = confusion_matrix(y_test, predicted, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
# G-mean = sqrt(specificity * sensitivity). Each recall is normalised by the
# number of TRUE members of the class, i.e. a row sum of the matrix.
if (tn + fp) == 0 or (fn + tp) == 0:
    # A class is absent from the test labels, so its recall is undefined;
    # keep the 0 placeholder this notebook uses for that situation.
    measure = 0
else:
    measure = sqrt((tn / (tn + fp)) * (tp / (fn + tp)))
g_mean = np.array(measure)


(63, 2297) (17, 2297)


In [10]:
# Per-trait scores are collected locally and attached to the existing "All" entry
# in a single step after the loop, so re-running this cell does not keep growing
# `accuracy` / `g_mean`.
trait_accuracy = []
trait_g_mean = []
for element in np.unique(model_data["MaxTrait"]):
    model_subset = model_data[model_data["MaxTrait"] == element]
    #Generate training test split
    #0.8 training 0.2 test
    #all data
    # Re-seeding on every iteration means each group draws from the same random stream.
    np.random.seed(19)
    array = np.random.rand(model_subset.shape[0])
    train = array > 0.2
    test = array <= 0.2
    train = model_subset[train]
    test  = model_subset[test]
    #Building TF-IDF Naive Bayes Classifier
    x_train = train["text"].values
    y_train = train["sentiment"].values
    x_test = test["text"].values
    y_test = test["sentiment"].values
    #TF-IDF
    vectorizer = TfidfVectorizer()
    train_vectors = vectorizer.fit_transform(x_train)
    test_vectors = vectorizer.transform(x_test)
    print(train_vectors.shape, test_vectors.shape)
    #Train basic Naive Bayes Classifier
    nb = MultinomialNB().fit(train_vectors, y_train)
    #predict values
    predicted = nb.predict(test_vectors)
    #Use array to store all data
    trait_accuracy.append(accuracy_score(y_test, predicted))
    # Pin the label order so the matrix is always 2x2 (rows = true class,
    # columns = predicted class), even when a class is missing from this group.
    cm = confusion_matrix(y_test, predicted, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    # G-mean = sqrt(specificity * sensitivity). Each recall is normalised by the
    # number of TRUE members of the class, i.e. a row sum of the matrix.
    if (tn + fp) == 0 or (fn + tp) == 0:
        # A class is absent from the test labels, so its recall is undefined;
        # keep the 0 placeholder this notebook uses for that situation.
        measure = 0
    else:
        measure = sqrt((tn / (tn + fp)) * (tp / (fn + tp)))
    trait_g_mean.append(measure)
# Keep only the first ("All") entry of the running arrays before attaching the
# per-trait scores; atleast_1d handles the 0-d array produced by the all-data cell.
accuracy = np.append(np.atleast_1d(accuracy)[:1], trait_accuracy)
g_mean = np.append(np.atleast_1d(g_mean)[:1], trait_g_mean)
    

(11, 204) (4, 204)
(21, 467) (5, 467)
(19, 1493) (5, 1493)
(11, 571) (4, 571)


In [11]:
# Assemble the results table: one row for the all-data model ("All") followed by
# one row per MaxTrait value, in the order np.unique returns them.
trait_labels = np.unique(model_data["MaxTrait"])
category = np.concatenate((["All"], trait_labels))
output = pd.DataFrame(
    {
        "Category": category,
        "accuracy": accuracy,
        "g_mean": g_mean,
    }
)

In [12]:
# Display the results table (Category, accuracy, g_mean).
output

Unnamed: 0,Category,accuracy,g_mean
0,All,0.882353,0.881917
1,Cat,1.0,0.0
2,Dog,1.0,0.0
3,Hamster,1.0,0.0
4,Lizard,1.0,0.0


In [13]:
# Write the results table to disk without the row index.
output.to_csv(path_or_buf="nboutput.csv", index=False)