In [14]:
from math import sqrt
from math import isnan
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

stop_words = set(stopwords.words('english'))

In [15]:
data = pd.read_csv("newcleanset.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,unique_id,id,product_name,product_type,helpful,rating,title,date,reviewer,location,text,MaxTrait
0,1,B0007QCQA4:good_sneakers:christopher_w._damico...,B0007QCQA4,adidas Originals Men's Superstar II Basketball...,apparel,0 of 1,4,GOOD SNEAKERS,"July 15, 2006","Christopher W. Damico ""MACMAN""",NYC,GOOD LOOKING KICKS IF YOUR KICKIN IT OLD SCHOO...,conscientiousness
1,2,"B0002Y2JYY:pretty_good:sharon_civile_""jackbaue...",B0002Y2JYY,Elite Metal Aviator Sunglasses with Mirror Len...,apparel,3 of 5,4,Pretty Good,"August 13, 2006","Sharon Civile ""Jackbauerfreak""","Philadelphia, PA",These sunglasses are all right. They were a li...,openness
2,3,B0002X9A5G:can't_go_wrong_at_this_price:j._gou...,B0002X9A5G,5-Pack Bodysuits: Apparel,apparel,1 of 1,5,Can't go wrong at this price,"May 18, 2006","J. Gould ""south_paw712""",KY,I don't see the difference between these bodys...,extraversion
3,4,B0002X9A5G:green!:s._feldman,B0002X9A5G,5-Pack Bodysuits: Apparel,apparel,0 of 1,5,Green!,"February 28, 2006",S. Feldman,"Virginia, United States",Very nice basic clothing. I think the size is ...,extraversion
4,5,B0006UHRJQ:perfect!:amanda_kathleen,B0006UHRJQ,3-Pack Straight Edge (non-skid) Socks: Apparel,apparel,8 of 8,5,perfect!,"December 15, 2005",Amanda Kathleen,"Delaware, USA",I love these socks. They fit great (my 15 mont...,extraversion


In [16]:
sentiment = data["rating"] >3
sentiment = sentiment.astype(int)

In [17]:
#Prepare model data
#Run model by group
model_data = pd.DataFrame({"MaxTrait":data["MaxTrait"],"text": data["text"], "sentiment":sentiment})
model_data.shape

(11984, 3)

In [18]:
def remove_pun(element):
    import string
    translator = str.maketrans('', '', string.punctuation)
    return element.translate(translator)

In [19]:
def remove_stopwords(element):
    element = [word.lower() for word in element.split() if word.lower() not in stop_words]
    return " ".join(element)

In [20]:
model_data["text"] = model_data["text"].apply(remove_pun)
model_data["text"] = model_data["text"].apply(remove_stopwords)

In [21]:
#Generate training test split
#0.8 training 0.2 test
#all data
x = model_data["text"]
y = model_data["sentiment"]
x_train, x_test, y_train, y_test = train_test_split(x, y,train_size = 0.8, random_state = 13)
x_train = x_train.values
y_train = y_train.values
x_test = x_test.values
y_test = y_test.values
#TF-IDF
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(x_train)
test_vectors = vectorizer.transform(x_test)
print(train_vectors.shape, test_vectors.shape)
#Train basic Naive Bayes Classifier
nb = MultinomialNB().fit(train_vectors, y_train)
#predict values
predicted = nb.predict(test_vectors)
#Use array to store all data
accuracy = np.array(accuracy_score(y_test, predicted))
cm = confusion_matrix(y_test, predicted)
if cm.shape[0] == 2:
    neg_acc = cm[0,0]/(cm[0,0]+cm[1,0])
    pos_acc = cm[1,1]/(cm[0,1]+cm[1,1])
    if isnan(neg_acc):
        neg_acc = 0
    if isnan(pos_acc):
        pos_acc = 0
    measure = sqrt(neg_acc*pos_acc)
else:
    measure = 0
g_mean = np.array(measure)
print(cm)

(9587, 50482) (2397, 50482)
[[1020  135]
 [ 303  939]]


In [22]:
for element in np.unique(model_data["MaxTrait"]):
    model_subset = model_data[model_data["MaxTrait"] == element]
    #Generate training test split
    #0.8 training 0.2 test
    #all data
    np.random.seed(19)
    array = np.random.rand(model_subset.shape[0])
    train = array > 0.2
    test = array <= 0.2
    train = model_subset[train]
    test  = model_subset[test]
    #Building TF-IDF Naive Bayes Classifier
    x_train = train["text"].values
    y_train = train["sentiment"].values
    x_test = test["text"].values
    y_test = test["sentiment"].values
    #TF-IDF
    vectorizer = TfidfVectorizer()
    train_vectors = vectorizer.fit_transform(x_train)
    test_vectors = vectorizer.transform(x_test)
    print(train_vectors.shape, test_vectors.shape)
    #Train basic Naive Bayes Classifier
    nb = MultinomialNB().fit(train_vectors, y_train)
    #predict values
    predicted = nb.predict(test_vectors)
    #Use array to store all data
    accuracy = np.append(accuracy, np.array(accuracy_score(y_test, predicted)))
    cm = confusion_matrix(y_test, predicted)
    if cm.shape != (2,2):
        measure = 0
    else:
        measure = sqrt((cm[0,0]/(cm[0,0]+cm[1,0]))*(cm[1,1]/(cm[0,1]+cm[1,1])))
    g_mean = np.append(g_mean, np.array(measure))
    print(cm)

(961, 12250) (255, 12250)
[[131   6]
 [ 63  55]]
(2661, 24227) (645, 24227)
[[264  55]
 [ 77 249]]
(3137, 18928) (770, 18928)
[[ 73 242]
 [  0 455]]
(203, 2616) (62, 2616)
[[54  0]
 [ 8  0]]




(2649, 27923) (641, 27923)
[[371   1]
 [245  24]]


In [23]:
#outputing nb result
category = np.append("All", np.unique(model_data["MaxTrait"]))
output = pd.DataFrame({"Category":category, "accuracy":accuracy, "g_mean":g_mean})

In [24]:
output

Unnamed: 0,Category,accuracy,g_mean
0,All,0.817272,0.821014
1,agreeableness,0.729412,0.780281
2,conscientiousness,0.795349,0.79632
3,extraversion,0.685714,0.807959
4,neuroticism,0.870968,
5,openness,0.616225,0.760383


In [25]:
output.to_csv("nboutput.csv",index = False)