                                Natural Language Processing

# Task : A simple Sentiment Analysis task for classifying Yelp reviews 

In [2]:
#Python package
import spacy
import re

In [3]:
#Vocabulary are Unique words
#Tokens are every single word that appears in the text

In [4]:
#Loading the english model
nlp = spacy.load('en_core_web_sm')

In [5]:
doc = 'I always uh do the main um processing, I mean, the uh um data-processing.'

In [6]:
stats = nlp(doc)

In [7]:
for token in stats:
    print(token.text)

I
always
uh
do
the
main
um
processing
,
I
mean
,
the
uh
um
data
-
processing
.


In [8]:
doc2 = 'U.K. has a reasonable population'
stats = nlp(doc2)
for token in stats:
    print(token.text)

U.K.
has
a
reasonable
population


In [9]:
for token in re.split('\W+',doc):
    print(token)

I
always
uh
do
the
main
um
processing
I
mean
the
uh
um
data
processing



In [10]:
for token in re.split('\W+',doc2):
    print(token)

U
K
has
a
reasonable
population


In [11]:
L = list(nlp.vocab.strings)

In [12]:
len(L)

83814

In [13]:
L[50000]

'glands'

In [14]:
L[60000]

'mfy'

In [15]:
L

['\t',
 '\n',
 ' ',
 '  ',
 '!',
 '!!',
 '!!!',
 '!!!!',
 '!!!!!!!!!!!!!!!!',
 '!!!!.',
 '!!.',
 '!!?',
 '!!??',
 '!*',
 '!.',
 '!?',
 '!??',
 '"',
 '""',
 '#',
 "##'s",
 "##'x",
 "#'s",
 '#15',
 '#^%',
 '#dd',
 '$',
 '$19',
 '$Whose',
 '$Xxxxx',
 '$whose',
 '$xxxx',
 '%',
 '%-3',
 '%ach',
 '%ah',
 '%eh',
 '%er',
 '%ha',
 '%hm',
 '%huh',
 '%mm',
 '%oof',
 '%pw',
 '%uh',
 '%um',
 '%xx',
 '%xxx',
 '&',
 '&#',
 '&G.',
 '&L.',
 '&Ls',
 '&M.',
 '&P.',
 '&SA',
 '&T.',
 '&ex',
 '&in',
 '&ls',
 '&of',
 '&on',
 '&sa',
 '&the',
 '&to',
 '&uh',
 '&von',
 '&xx',
 '&xxx',
 "'",
 "''",
 "''It",
 "''Xx",
 "''it",
 "''xx",
 "'-(",
 "'-)",
 "'03",
 "'07",
 "'20s",
 "'30s",
 "'40s",
 "'45",
 "'46",
 "'50s",
 "'60s",
 "'67",
 "'68",
 "'69",
 "'70's",
 "'70s",
 "'71",
 "'73",
 "'74",
 "'76",
 "'78",
 "'80",
 "'80's",
 "'80s",
 "'82",
 "'86",
 "'89",
 "'90's",
 "'90s",
 "'91",
 "'94",
 "'96",
 "'97",
 "'98",
 "'99",
 "'Arabi",
 "'Cause",
 "'Connery",
 "'Cos",
 "'Coz",
 "'Cuz",
 "'Id",
 "'Il",
 "'It",
 "'N"

In [16]:
import numpy as np
import pandas as pd
from collections import Counter
import re

In [17]:
df = pd.read_csv('reviews.csv')

In [18]:
df.head()

Unnamed: 0,rating,review
0,negative,terrible place to work for i just heard a stor...
1,negative,"hours , minutes total time for an extremely s..."
2,negative,my less than stellar review is for service . w...
3,negative,i m granting one star because there s no way t...
4,negative,the food here is mediocre at best . i went aft...


In [19]:
df.tail()

Unnamed: 0,rating,review
55995,positive,"great food . wonderful , friendly service . i ..."
55996,positive,charlotte should be the new standard for moder...
55997,positive,get the encore sandwich ! ! make sure to get i...
55998,positive,i m a pretty big ice cream gelato fan . pretty...
55999,positive,where else can you find all the parts and piec...


In [20]:
vocab = {}

In [21]:
def initializeVocabulary():
    unkToken = '<UNK>'
    vocab['t_2_i'] = {}
    vocab['i_2_t'] = {}
    vocab['unkToken'] = unkToken
    idx = addToken(unkToken)
    vocab['unkTokenIdx'] = idx

In [22]:
def addToken(token):
    if token in vocab['t_2_i']:
        idx = vocab['t_2_i'][token]
    else:
        idx = len(vocab['t_2_i'])
        vocab['t_2_i'][token] = idx
        vocab['i_2_t'][idx] = token
    return idx

In [23]:
def addManyTokens(tokens):
    idxes = [addToken(token) for token in tokens]
    return idxes

In [24]:
#Helper function to check for the Token

def lookUpToken(token):
    if vocab['unkTokenIdx']>=0:
        return vocab['t_2_i'].get(token,vocab['unkTokenIdx'])
    else:
        return vocab['t_2_i'][token]

In [25]:
#Helper function for checking the index of a certain token
def lookUpIndex(idx):
    if idx not in vocab['i_2_t']:
        raise KeyError("the index (%d) is not there" % idx)
    return vocab['i_2_t'][idx]

In [26]:
#Focusing on more frequent words.i.e Only words occuring more than 25 are the ones which will be added as tokens.
def vocabularyFromDataFrame(df,cutoff=25):
    initializeVocabulary()
    wordCounts = Counter()
    for r in df.review:
        for word in re.split('\W+',r):
            wordCounts[word] += 1
    for word,count in wordCounts.items():
        if count > cutoff:
            addToken(word)

In [27]:
def vocabularyFromCorpus(Corpus,cutoff=25):
    initializeVocabulary()
    wordCounts = Counter()
    for doc in Corpus:
        for word in re.split('\W+',doc):
            wordCounts[word] += 1
    for word,count in wordCounts.items():
        if count > cutoff:
            addToken(word)

In [28]:
df = pd.read_csv('reviews.csv')

In [29]:
#vocabularyFromDataFrame(df)
Corpus = np.asarray(df['review'])
vocabularyFromCorpus(Corpus)

In [30]:
Corpus[:5]

array(['terrible place to work for i just heard a story of them find a girl over her biological father coming in there who she hadn t seen in years she said hi to him which upset his wife and they left she finished the rest of her day working fine the next day when she went into work they fired over that situation . i for one and boycotting texas roadhouse because any place that could be that cruel to their staff does not deserve my business . . . yelp wants me to give them a star but i don t believe they deserve it',
       ' hours , minutes total time for an extremely simple physical . stay away unless you have hours to waste ! ! ! ',
       'my less than stellar review is for service . we waited minutes for our meals to be delivered . when we questioned the waiter , he was not helpful , so we asked to speak to the manager . the manager did not even come to speak with us ! we were loyal neighborhood customers , even walking to the restaurant frequently ! my husband then wrote an emai

In [31]:
lookUpToken('the')

38

In [32]:
lookUpIndex(38)

'the'

In [33]:
len(vocab['t_2_i'])

8946

In [53]:
#Representing tokens in a numerical format since ML models don't work with 
#Categorical values

#The helper function is operating in a way that it assigns the token number index 
#in the generated matrix of zeros , in order to make sure that different tokens possess different 
#arrays (Main purpose for the oneHotVector encoding)

def oneHotVector(token,N):
    oneHot = np.zeros((N,1))
    oneHot[lookUpToken(token)] = 1
    return oneHot

In [34]:
N = len(vocab['t_2_i'])
token = 'the'
oneHot = oneHotVector(token,N)

In [58]:
for token in vocab['t_2_i']:
    print(token)
    break

<UNK>


In [90]:
#A helper function that computes the feature vector of the whole document for the 
# whole review
def computeFeatures(doc,N):
    isFirst = True
    for token in doc:
        oneHot = oneHotVector(token,N)
        if isFirst:
            xF = oneHot
            isFirst = False
        else:
            xF = np.hstack((xF,oneHot))
    return np.mean(xF,axis=1)[:,np.newaxis]

In [91]:
a = np.zeros((3,1))
a[2]=1
b = np.zeros((3,1))
b[1]=1
c = np.zeros((3,1))
c[1]=1
X = np.hstack((a,b,c))
np.mean(X,axis=1)[:,np.newaxis]

array([[0.        ],
       [0.66666667],
       [0.33333333]])

In [115]:
# b[:,0]
b

array([[0.],
       [1.],
       [0.]])

In [99]:
# # computeFeatures()
# L = len(doc)
computeFeatures(doc,N)

array([[0.26027397],
       [0.        ],
       [0.        ],
       ...,
       [0.        ],
       [0.        ],
       [0.        ]])

In [100]:
def computeFeatures_fast(doc,N):
    fv = np.zeros(N)
    numTokens = 0
    for token in doc:
        fv[lookUpToken(token)] += 1
        numTokens += 1
    return fv/numTokens

In [101]:
def corpusToFeatureMatrix(Corpus,N):
    isFirst = True
    for doc in Corpus:
        fv = computeFeatures(doc,N)
        if isFirst:
            fM = fv
            isFirst = False
        else:
            fM = np.hstack((fM,fv))
    return fM.T

In [102]:
def corpusToFeatureMatrix_fast(Corpus,N):
    fM = np.zeros((N,len(Corpus)))
    i = 0
    for doc in Corpus:
        fM[:,i] = computeFeatures_fast(doc,N)
        i+=1
    return fM.T

In [103]:
%timeit fv = computeFeatures_fast(Corpus[0],len(vocab['t_2_i']))

550 µs ± 17.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [104]:
%timeit fv = computeFeatures(Corpus[0],len(vocab['t_2_i']))

3.37 s ± 289 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [105]:
df = pd.read_csv('reviews.csv')
X = np.asarray(df['review'])
y = np.asarray(df['rating'])

In [106]:
from sklearn.model_selection import train_test_split

In [107]:
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size=0.3,shuffle=True)

In [108]:
vocabularyFromCorpus(Xtrain)

In [109]:
N = len(vocab['t_2_i'])
Xtrain_fM = corpusToFeatureMatrix_fast(Xtrain,N)
Xtest_fM = corpusToFeatureMatrix_fast(Xtest,N)

In [110]:
Xtrain_fM.shape

(39200, 7341)

In [111]:
Xtest_fM.shape

(16800, 7341)

In [116]:
#from sklearn.linear_model import LogisticRegression as clf
#from sklearn.naive_bayes import GaussianNB as clf
#from sklearn.ensemble import RandomForestClassifier as clf
from sklearn.svm import SVC as clf
from sklearn.metrics import confusion_matrix
import seaborn as sns
from matplotlib import pyplot as plt
sns.set()

In [None]:
M = clf().fit(Xtrain_fM,ytrain)

In [None]:
y_pred = M.predict(Xtest_fM)

In [None]:
mat = confusion_matrix(ytest,y_pred)
sns.heatmap(mat.T,square=True,annot=True,fmt='d',cbar=False,
           xticklabels=np.unique(y),yticklabels=np.unique(y))
plt.xlabel("True Label")
plt.ylabel("Predicted Label")

In [None]:
Xtrain.shape