In [79]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize 
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from hmmlearn import hmm
import random
#nltk.download()

In [2]:
#Read data from dataset
data = pd.read_csv("sub.csv")

In [3]:
data.describe()
data.head()

Unnamed: 0.1,Unnamed: 0,unique_id,id,product_name,product_type,helpful,rating,title,date,reviewer,location,text,MaxTrait
0,1,B0007QCQA4:good_sneakers:christopher_w._damico...,B0007QCQA4,adidas Originals Men's Superstar II Basketball...,apparel,0 of 1,4,GOOD SNEAKERS,"July 15, 2006","Christopher W. Damico ""MACMAN""",NYC,GOOD LOOKING KICKS IF YOUR KICKIN IT OLD SCHOO...,Cat
1,2,"B0002Y2JYY:pretty_good:sharon_civile_""jackbaue...",B0002Y2JYY,Elite Metal Aviator Sunglasses with Mirror Len...,apparel,3 of 5,4,Pretty Good,"August 13, 2006","Sharon Civile ""Jackbauerfreak""","Philadelphia, PA",These sunglasses are all right. They were a li...,Cat
2,3,B0002X9A5G:can't_go_wrong_at_this_price:j._gou...,B0002X9A5G,5-Pack Bodysuits: Apparel,apparel,1 of 1,5,Can't go wrong at this price,"May 18, 2006","J. Gould ""south_paw712""",KY,I don't see the difference between these bodys...,Cat
3,4,B0002X9A5G:green!:s._feldman,B0002X9A5G,5-Pack Bodysuits: Apparel,apparel,0 of 1,5,Green!,"February 28, 2006",S. Feldman,"Virginia, United States",Very nice basic clothing. I think the size is ...,Cat
4,5,B0006UHRJQ:perfect!:amanda_kathleen,B0006UHRJQ,3-Pack Straight Edge (non-skid) Socks: Apparel,apparel,8 of 8,5,perfect!,"December 15, 2005",Amanda Kathleen,"Delaware, USA",I love these socks. They fit great (my 15 mont...,Cat


In [4]:
stop_words = set(stopwords.words('english')) 

In [5]:
np.array(stop_words)

array({'about', 'into', 'with', "mightn't", 'mustn', "doesn't", 'don', 'won', 'having', "that'll", 'be', 'any', 'only', 'ma', 'yours', 'how', "shouldn't", 'what', 'not', 'further', 'doing', 'weren', "you'd", "weren't", 'the', 'these', 'why', 'all', "mustn't", 'once', 'other', 'up', 'such', "isn't", "she's", 'those', 'will', 'himself', 'hasn', 'to', 'very', 'are', 'who', 'his', "you've", 'themselves', 'a', 'from', 'here', 's', 'ain', 'am', 'or', 'own', 'your', "don't", 'do', 'ours', 'now', 'against', 'doesn', 'each', 'shan', 'wouldn', 'above', 'her', 'shouldn', 'yourself', 'we', 'off', 'of', 'for', 'theirs', 'been', 'did', 'whom', 'out', 'and', 'haven', 'does', 'hers', 'by', 'being', 'had', "hasn't", 'she', 'below', 'if', 'after', 'then', 'as', 'needn', 'in', "needn't", 'more', 'y', 'couldn', 'isn', 'this', 'before', 'few', 'can', "hadn't", 'you', 'them', 'too', 't', 'our', 'i', "you're", "it's", 'again', 'during', 'should', "haven't", 'me', "you'll", 'at', 'd', "wouldn't", 'him', 'is',

In [6]:
text = pd.Series(data["text"])
text.head(20)

0     GOOD LOOKING KICKS IF YOUR KICKIN IT OLD SCHOO...
1     These sunglasses are all right. They were a li...
2     I don't see the difference between these bodys...
3     Very nice basic clothing. I think the size is ...
4     I love these socks. They fit great (my 15 mont...
5     Finally I have found a quality brand of swimsu...
6     Your company was a pleasure to work with- than...
7     very portable. great picture. easy to operate....
8     I have been looking for a pair of Docs for a w...
9     The quality is much better than expected. I bo...
10    Nice shirt. Well made. Good price. What more c...
11    The shirts are what I had asked for. They are ...
12    I love wearing those tank tops with shelf bras...
13    I was having a hard time finding a cheaper swi...
14     With a good selection of colors to go with th...
15    A difficult item to find in department stores....
16    The quality is much better than expected. I bo...
17    The collar stays are great! They came in a

In [24]:
def remove_pun(element):
    import string
    translator = str.maketrans('', '', string.punctuation)
    return element.translate(translator)

In [25]:
def remove_stopwords(element):
    element = [word.lower() for word in element.split() if word.lower() not in stop_words]
    return " ".join(element)

In [26]:
text = text.apply(remove_pun)
text = text.apply(remove_stopwords)

In [27]:
text.size
text.head

<bound method NDFrame.head of 0     good looking kicks kickin old school like comf...
1            sunglasses right little crooked still cool
2     see difference bodysuits expensive ones fits b...
3     nice basic clothing think size fine really lik...
4     love socks fit great 15 month old daughter thi...
                            ...                        
75    therese lux mary bonnie cecilia make five lisb...
76    exact copy ang lees wonderful eat drink man wo...
77    part ongoing quest catch comic book movies mis...
78    ashley judds normal life locusts husband playe...
79    someone wanted make indiana jones part 4 starr...
Name: text, Length: 80, dtype: object>

In [28]:
#Postive 1 negative 0
sent_processed = data["rating"] >3
bool_dict = {True:"pos", False:"neg"}
sent_processed = sent_processed.map(bool_dict)

In [29]:
#Prepare model data
#Run model by group
model_data = pd.DataFrame({"MaxTrait":data["MaxTrait"], "sentiment":sent_processed, "text": text})

In [30]:
model_data.head()

Unnamed: 0,MaxTrait,sentiment,text
0,Cat,pos,good looking kicks kickin old school like comf...
1,Cat,pos,sunglasses right little crooked still cool
2,Cat,pos,see difference bodysuits expensive ones fits b...
3,Cat,pos,nice basic clothing think size fine really lik...
4,Cat,pos,love socks fit great 15 month old daughter thi...


In [31]:
#Create POS tags
def tokenized_tag(element):
    tokenized = sent_tokenize(element)
    for i in tokenized:
        word_list = nltk.word_tokenize(i)
        tagged = nltk.pos_tag(word_list)
    return tagged

In [32]:
#Extract pos sequence
def extract_pos(element):
    seq = [i[1] for i in element]
    seq = tuple(seq)
    return seq

In [33]:
#Process the data for POS-HMM
model_data["text_pos"] = model_data["text"].apply(tokenized_tag)
model_data["seq"] = model_data["text_pos"].apply(extract_pos)

In [34]:
model_data.head()

Unnamed: 0,MaxTrait,sentiment,text,text_pos,seq
0,Cat,pos,good looking kicks kickin old school like comf...,"[(good, JJ), (looking, VBG), (kicks, NNS), (ki...","(JJ, VBG, NNS, VBP, JJ, NN, IN, JJ, RB, JJ, NN..."
1,Cat,pos,sunglasses right little crooked still cool,"[(sunglasses, NNS), (right, RB), (little, JJ),...","(NNS, RB, JJ, VBD, RB, JJ)"
2,Cat,pos,see difference bodysuits expensive ones fits b...,"[(see, VB), (difference, NN), (bodysuits, NNS)...","(VB, NN, NNS, JJ, NNS, NNS, RB, RB)"
3,Cat,pos,nice basic clothing think size fine really lik...,"[(nice, JJ), (basic, JJ), (clothing, NN), (thi...","(JJ, JJ, NN, VBP, NN, VBP, RB, IN, JJ, VBP, JJ..."
4,Cat,pos,love socks fit great 15 month old daughter thi...,"[(love, NN), (socks, NNS), (fit, VBP), (great,...","(NN, NNS, VBP, JJ, CD, NN, JJ, NN, NN, NNS, VB..."


In [84]:
#States
states = ("pos", "neg")
#Possible observations
possible_observations = np.unique(possible_observations).tolist()
#Number of observation sequence
quantities_observations = [1] *model_data.shape[0]
observation_tuple = []
observation_tuple.extend([element for element in model_data["seq"]])

# Input initual parameters as Numpy matrices
start_probability = np.matrix('0.5 0.5')
#Aritifitial transistion probabilities
#Need work
transition_probability = np.matrix('0.6 0.4;  0.3 0.7')
#Aritifitial emission probabilities
#Need work
emission_probability = np.matrix('0 0.04 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.02 0.58; 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.03 0.06 0 0.03 0.03 0.03 0.03 0.03 0.03 0.34')