# Sentiment Analysis on Restaurant Review Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('./Restaurant_Reviews.tsv', delimiter='\t')
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
data.dtypes

Review    object
Liked      int64
dtype: object

In [6]:
data['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [7]:
data.isna().sum()

Review    0
Liked     0
dtype: int64

## Data Cleaning

### Remove punctuations and Lowering Case

In [8]:
from string import punctuation

In [9]:
def removePunctuations(text):
    newText = "".join([i for i in text if i not in punctuation])
    return newText

In [10]:
data['noPunctuations'] = data['Review'].apply(lambda x:removePunctuations(x).lower())

In [11]:
data.head()

Unnamed: 0,Review,Liked,noPunctuations
0,Wow... Loved this place.,1,wow loved this place
1,Crust is not good.,0,crust is not good
2,Not tasty and the texture was just nasty.,0,not tasty and the texture was just nasty
3,Stopped by during the late May bank holiday of...,1,stopped by during the late may bank holiday of...
4,The selection on the menu was great and so wer...,1,the selection on the menu was great and so wer...


### Remove stopwords and Tokenization

In [12]:
from nltk.corpus import stopwords

In [13]:
def removeStopwords(text):
    newtext = [i for i in text.split() if i not in stopwords.words("english")]
    return newtext

In [14]:
data['noStopwordsTokenized'] = data['noPunctuations'].apply(lambda x:removeStopwords(x))
data.head()

Unnamed: 0,Review,Liked,noPunctuations,noStopwordsTokenized
0,Wow... Loved this place.,1,wow loved this place,"[wow, loved, place]"
1,Crust is not good.,0,crust is not good,"[crust, good]"
2,Not tasty and the texture was just nasty.,0,not tasty and the texture was just nasty,"[tasty, texture, nasty]"
3,Stopped by during the late May bank holiday of...,1,stopped by during the late may bank holiday of...,"[stopped, late, may, bank, holiday, rick, stev..."
4,The selection on the menu was great and so wer...,1,the selection on the menu was great and so wer...,"[selection, menu, great, prices]"


### Lemmatization

In [15]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

In [16]:
def lemmatize(text):
    newText = [wnl.lemmatize(ele) for ele in text]
    return newText

In [17]:
data['lemmatized'] = data['noStopwordsTokenized'].apply(lambda x:lemmatize(x))
data.head()

Unnamed: 0,Review,Liked,noPunctuations,noStopwordsTokenized,lemmatized
0,Wow... Loved this place.,1,wow loved this place,"[wow, loved, place]","[wow, loved, place]"
1,Crust is not good.,0,crust is not good,"[crust, good]","[crust, good]"
2,Not tasty and the texture was just nasty.,0,not tasty and the texture was just nasty,"[tasty, texture, nasty]","[tasty, texture, nasty]"
3,Stopped by during the late May bank holiday of...,1,stopped by during the late may bank holiday of...,"[stopped, late, may, bank, holiday, rick, stev...","[stopped, late, may, bank, holiday, rick, stev..."
4,The selection on the menu was great and so wer...,1,the selection on the menu was great and so wer...,"[selection, menu, great, prices]","[selection, menu, great, price]"


### Finalizing the cleaned string

In [18]:
# Join the each string in the array with a space in between
def finalize(textList):
    text = " ".join(textList)
    return text

In [19]:
data['Review_cleaned'] = data['noStopwordsTokenized'].apply(lambda x:finalize(x))
data.head()

Unnamed: 0,Review,Liked,noPunctuations,noStopwordsTokenized,lemmatized,Review_cleaned
0,Wow... Loved this place.,1,wow loved this place,"[wow, loved, place]","[wow, loved, place]",wow loved place
1,Crust is not good.,0,crust is not good,"[crust, good]","[crust, good]",crust good
2,Not tasty and the texture was just nasty.,0,not tasty and the texture was just nasty,"[tasty, texture, nasty]","[tasty, texture, nasty]",tasty texture nasty
3,Stopped by during the late May bank holiday of...,1,stopped by during the late may bank holiday of...,"[stopped, late, may, bank, holiday, rick, stev...","[stopped, late, may, bank, holiday, rick, stev...",stopped late may bank holiday rick steve recom...
4,The selection on the menu was great and so wer...,1,the selection on the menu was great and so wer...,"[selection, menu, great, prices]","[selection, menu, great, price]",selection menu great prices


----------------------------------------------------------------------------------

In [20]:
data_cleaned = pd.DataFrame(data[['Review_cleaned', 'Liked']])

In [21]:
data_cleaned.head()

Unnamed: 0,Review_cleaned,Liked
0,wow loved place,1
1,crust good,0
2,tasty texture nasty,0
3,stopped late may bank holiday rick steve recom...,1
4,selection menu great prices,1


------------------

In [22]:
corpus = data_cleaned['Review_cleaned']
corpus

0                                        wow loved place
1                                             crust good
2                                    tasty texture nasty
3      stopped late may bank holiday rick steve recom...
4                            selection menu great prices
                             ...                        
995                    think food flavor texture lacking
996                              appetite instantly gone
997                      overall impressed would go back
998    whole experience underwhelming think well go n...
999    hadnt wasted enough life poured salt wound dra...
Name: Review_cleaned, Length: 1000, dtype: object

### Create a Bag of Words Model

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)

In [28]:
X = cv.fit_transform(corpus).toarray()

### Creating the output Y

In [31]:
Y = data.iloc[:, 1]

---

## Applying Train Test Split

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

In [35]:
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((800, 1500), (200, 1500), (800,), (200,))

## Model Creation and Prediction (GaussianNB)

In [36]:
from sklearn.naive_bayes import GaussianNB

In [42]:
classifier = GaussianNB()
classifier.fit(X_train, Y_train)

GaussianNB()

In [43]:
Y_pred = classifier.predict(X_test)

### Evaluation Metrics

In [44]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [45]:
confusion_matrix(Y_test, Y_pred)

array([[51, 46],
       [16, 87]], dtype=int64)

In [46]:
accuracy_score(Y_test, Y_pred)

0.69

### Check for absolutely new Review

In [48]:
Review = "nice service"

input_data = cv.transform([Review]).toarray()

prediction = classifier.predict(input_data)
if prediction[0]:
    print("Positive Review")
else:
    print("Negative Review")

Positive Review


In [49]:
Review = "long wait time"

input_data = cv.transform([Review]).toarray()

prediction = classifier.predict(input_data)
if prediction[0]:
    print("Positive Review")
else:
    print("Negative Review")

Negative Review


---

## Model Creation and Prediction (Logistic Regression)

In [51]:
from sklearn.linear_model import LogisticRegression