In [1]:
import numpy as np
import pandas as pd

from collections import Counter

import re

In [2]:
df = pd.read_csv('dataset/reviews.csv')

In [3]:
df.head()

Unnamed: 0,rating,review
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...


In [4]:
df.tail()

Unnamed: 0,rating,review
55995,positive,"great food . wonderful , friendly service . i ..."
55996,positive,charlotte should be the new standard for moder...
55997,positive,get the encore sandwich ! ! make sure to get i...
55998,positive,i m a pretty big ice cream gelato fan . pretty...
55999,positive,where else can you find all the parts and piec...


In [5]:
vocab={}

In [6]:
def initVocab():
    unkToken = '<UNK>'
    vocab['t_2_i']={} #vocab for index to token
    vocab['i_2_t']={} #vocab for token to index
    vocab['unkToken']=unkToken
    idx = addToken(unkToken) # helps to add token to the dict/vocab as we go through the data, should only add unique token
    vocab['unkTokenIdx'] = idx
    

In [7]:
def addToken(token):
    if token in vocab['t_2_i']:
        idx = vocab['t_2_i'][token]
    else:
        idx = len(vocab['t_2_i'])
        vocab['t_2_i'][token]=idx
        vocab['i_2_t'][idx]=token
        
    return idx


In [8]:
def addManyTokens(tokens):
    ''' tokens: list of words
    returns indices of the list of words
    '''
    idxs = [addToken(token) for token in tokens]
    return idxs

In [9]:
def lookUpToken(token):
    '''given the token, it returns the index for that token'''
    if vocab['unkTokenIdx']>=0:
        return vocab['t_2_i'].get(token,vocab['unkTokenIdx'])
    else:
        return vocab['t_2_i'][token]
    
        

In [10]:
def lookUpIndex(idx):
    '''given the idx, it returns the token for that index'''
    if idx not in vocab['i_2_t']:
        raise KeyError('the index {%d} is not present' %idx)
    return vocab['i_2_t'][idx]
        
   
    

In [11]:
def vocabFromDf(df,cutoff=25):
    '''
    takes a df as an input and builds a vocabulary.
    if the number of occurences of the word is greater than the cutoff value it is qualified as token else ignore it
    '''
    initVocab()
    wordCounts = Counter()
    for r in df.review:
        for w in re.split('\W+',r):
            wordCounts[w]+=1
   
    for word,count in wordCounts.items():
        if count>=cutoff:
            addToken(word)
            
    
    
    

In [12]:
# build a more general vocabulary from corpus rather than a dataframe
# corpus is more general than a dataframe

def vocabFromCorpus(Corpus,cutoff=25):
    '''
    takes a df as an input and builds a vocabulary.
    if the number of occurences of the word is greater than the cutoff value it is qualified as token else ignore it
    '''
    initVocab()
    wordCounts = Counter()
    for doc in Corpus:
        for w in re.split('\W+',doc):
            wordCounts[w]+=1
   
    for word,count in wordCounts.items():
        if count>=cutoff:
            addToken(word)
    

In [13]:
# loading the data

df = pd.read_csv('dataset/reviews.csv')

In [14]:
vocabFromDf(df)

In [15]:
# first build a corpus as numpy array

X=np.asarray(df['review'])
vocabFromCorpus(X)


In [16]:
X

array(['terrible place to work for i just heard a story of them find a girl over her biological father coming in there who she hadn t seen in years she said hi to him which upset his wife and they left she finished the rest of her day working fine the next day when she went into work they fired over that situation . i for one and boycotting texas roadhouse because any place that could be that cruel to their staff does not deserve my business . . . yelp wants me to give them a star but i don t believe they deserve it',
       ' hours , minutes total time for an extremely simple physical . stay away unless you have hours to waste ! ! ! ',
       'my less than stellar review is for service . we waited minutes for our meals to be delivered . when we questioned the waiter , he was not helpful , so we asked to speak to the manager . the manager did not even come to speak with us ! we were loyal neighborhood customers , even walking to the restaurant frequently ! my husband then wrote an emai

In [17]:
vocab

{'t_2_i': {'<UNK>': 0,
  'terrible': 1,
  'place': 2,
  'to': 3,
  'work': 4,
  'for': 5,
  'i': 6,
  'just': 7,
  'heard': 8,
  'a': 9,
  'story': 10,
  'of': 11,
  'them': 12,
  'find': 13,
  'girl': 14,
  'over': 15,
  'her': 16,
  'father': 17,
  'coming': 18,
  'in': 19,
  'there': 20,
  'who': 21,
  'she': 22,
  'hadn': 23,
  't': 24,
  'seen': 25,
  'years': 26,
  'said': 27,
  'hi': 28,
  'him': 29,
  'which': 30,
  'upset': 31,
  'his': 32,
  'wife': 33,
  'and': 34,
  'they': 35,
  'left': 36,
  'finished': 37,
  'the': 38,
  'rest': 39,
  'day': 40,
  'working': 41,
  'fine': 42,
  'next': 43,
  'when': 44,
  'went': 45,
  'into': 46,
  'fired': 47,
  'that': 48,
  'situation': 49,
  'one': 50,
  'texas': 51,
  'roadhouse': 52,
  'because': 53,
  'any': 54,
  'could': 55,
  'be': 56,
  'their': 57,
  'staff': 58,
  'does': 59,
  'not': 60,
  'deserve': 61,
  'my': 62,
  'business': 63,
  'yelp': 64,
  'wants': 65,
  'me': 66,
  'give': 67,
  'star': 68,
  'but': 69,
  'don':

In [18]:
lookUpToken('the')

38

In [19]:
lookUpIndex(38)

'the'

In [20]:
len(vocab['t_2_i'])

9123

### One Hot Encoding

In [21]:
def oneHotVector(token,N):
    '''
    token: for which to generate one-hot vector
    N: length of the vocabulary
    '''
    oneHot = np.zeros((N,1))
    oneHot[lookUpToken(token)]=1
    return oneHot
    

In [22]:
N = len(vocab['t_2_i'])

oneHot = oneHotVector('the',N)
oneHot

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [23]:
oneHot[38]

array([1.])

In [24]:
## compute the feature vector of the whole document/review by taking an average
# in ML, most of the algo, every feature vector is represented as a row vector

In [25]:
def computeFeatureVec(doc,N):
    isFirst = True
    for token in doc:
        oneHot = oneHotVector(token,N)
        if isFirst:
            xF = oneHot
            isFirst=False
        else:
            xF = np.hstack((xF,oneHot)) # takes a lot of time 
    
    return np.mean(xF, axis=1)[:,np.newaxis]
    
        

In [26]:
def computeFeatureVecOpt(doc,N):
    ''' optimized version of above function '''
    fv = np.zeros(N)
    numTokens = 0
    for token in doc:
        fv[lookUpToken(token)]+=1
        numTokens+=1
    return fv/numTokens
    

In [27]:
def corpusToFeatureMatrix(Corpus,N):
    isFirstdoc = True
    for doc in Corpus:
        fV = computeFeatureVec(doc,N)
        if isFirstdoc:
            fM = fV
            isFirstdoc = False
        fM = np.hstack((fM,fV))
    return fM.T
            
        

In [28]:
def corpusToFeatureMatrixOpt(Corpus,N):
    fM = np.zeros((N,len(Corpus))) # predeclaring the size os faster then stacking 
    i=0
    for doc in Corpus:
        fM[:,i] = computeFeatureVecOpt(doc,N)
        i+=1
    return fM.T
    

In [29]:
X[0]

'terrible place to work for i just heard a story of them find a girl over her biological father coming in there who she hadn t seen in years she said hi to him which upset his wife and they left she finished the rest of her day working fine the next day when she went into work they fired over that situation . i for one and boycotting texas roadhouse because any place that could be that cruel to their staff does not deserve my business . . . yelp wants me to give them a star but i don t believe they deserve it'

In [30]:
fv = computeFeatureVec(X[0], len(vocab['t_2_i']))

In [31]:
len(fv)

9123

In [32]:
fv.shape

(9123, 1)

### Train Test Split

In [33]:
df_new = df.copy()

In [34]:
df_new.head()

Unnamed: 0,rating,review
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...


In [35]:
X = np.asarray(df_new['review'])
y = np.asarray(df_new['rating'])

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
# 70-30 split

In [38]:
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size=0.3, shuffle=True, random_state=42)

In [39]:
Xtrain

array(['been here before and it was way better than this last thursday ! had veggie hummus appetizer , burger that was ok , had one porter and one hefe and both were sub par , but the root beer float was the best thing i had even if i only got one scoop of ice cream . . . last time was the last time . ',
       'why can t the attendant understand a simple direct question ? n ncan someone please tell me just where the hell i m supposed to put my freakin drink ? n nand worst of all . . . my balls don t fit in this place . n n one extra star for the bartender offering me her personal lighter n minus the extra star for not having any matches',
       'egg drop soup is awesome ! best in phoenix . now the crab puffs had nothing but cream cheese . . . i literally broke it open to search for the meat and i didn t find any so that was very disappointing . i had the house plate for my entr u e e . the taste was ok . . . but what really killed it for me was the texture . . . it felt as though the

In [40]:
# we build the vocabulary from the train data as we assume we have never seen the test data

In [41]:
vocabFromCorpus(Xtrain)

In [42]:
# function to return entire feature matrix for the classificaiton algorithm

In [43]:
N = len(vocab['t_2_i'])

Xtrain_fM = corpusToFeatureMatrixOpt(Xtrain,N)
Xtest_fM = corpusToFeatureMatrixOpt(Xtest,N)

In [44]:
Xtrain_fM.shape

(39200, 7507)

In [45]:
Xtest_fM.shape

(16800, 7507)

### Training the MOdel

In [46]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import confusion_matrix

import seaborn as sns
import matplotlib.pyplot as plt
sns.set()

In [None]:
M = LR().fit(Xtrain_fM,ytrain)

In [1]:
y_pred = M.predict(Xtest_fM)

NameError: name 'M' is not defined

In [None]:
mat = confusion_matrix(ytest,y_pred)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, xticklabels = np.unique(y), yticklabels = np.unique(y))
plt.xlabel('True Label')
plt.ylabel('Predicted Label')