# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# The reviews file is tab-separated. quoting=3 is csv.QUOTE_NONE, so double
# quotes inside a review are read as ordinary characters.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


## Data Cleaning
Cleaning the texts

In [3]:
import re
import nltk
nltk.download('stopwords')  # fetch the stop-word lists: low-information words such as a, an, the, ...
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer  # used to apply stemming
# Stemming reduces each word to its root form:
# words -----> root word
# loved, loving, lovely, love -----> love

[nltk_data] Downloading package stopwords to /home/its-k/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string
# per review.
#
# The stemmer and the stop-word collection do not depend on the review being
# processed, so they are created once here rather than once per row (and the
# lookup set once, rather than once per word).
# `ps` and `all_stopwords` stay at notebook scope because the single-review
# prediction cells further down reuse them.
ps = PorterStemmer()

# Negations carry sentiment ("not good"), so they are kept in the text.
negations = {"no", "not", "don't", "aren't", "couldn't", "won't", "shouldn't", "wouldn't"}
# Filtering instead of list.remove() means a negation that is absent from the
# installed stop-word list is simply skipped instead of raising ValueError.
all_stopwords = [w for w in stopwords.words('english') if w not in negations]
stopword_set = set(all_stopwords)

# NOTE(review): the regex below turns apostrophes into spaces, so a contraction
# such as "don't" reaches the filter as the two tokens "don" and "t" and never
# matches the contracted entries kept above. TODO confirm whether those
# fragments should be preserved as well; left as-is so results are unchanged.

corpus = []     # cleaned reviews
# Iterating over the column directly avoids label-based lookups by position,
# which only work when the frame has the default 0..n-1 index.
for raw_review in dataset['Review']:
    # Replace every non-letter (punctuation, digits) with a space, then tokenise.
    words = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    review = ' '.join(ps.stem(word) for word in words if word not in stopword_set)
    corpus.append(review)

In [5]:
# Peek at the first ten cleaned reviews.
corpus[0:10]

['wow love place',
 'crust not good',
 'not tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

## Creating the Bag of Words model (classification)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit an unrestricted vectorizer first; the number of columns in the result
# is the number of distinct tokens found in the corpus.
cv = CountVectorizer()
full_vocab_matrix = cv.fit_transform(corpus)
full_vocab_matrix.shape

(1000, 1567)

In [7]:
# Cap the vocabulary at 1540 terms (the unrestricted fit above found 1567).
cv = CountVectorizer(max_features=1540)
bag_of_words = cv.fit_transform(corpus)
X = bag_of_words.toarray()              # dense array of per-review term counts
# Labels: the column at position 1 ("Liked" in the table displayed above).
y = dataset.iloc[:, 1].to_numpy()

## Splitting the dataset into the Training set and Test set

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for evaluation; the fixed random_state makes
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

## Fitting Naive Bayes to the Training set

In [9]:
from sklearn.naive_bayes import GaussianNB
# fit() returns the estimator itself, so construction and training are chained;
# the trailing expression displays the fitted model as before.
classifier = GaussianNB().fit(X_train, y_train)
classifier

GaussianNB(priors=None, var_smoothing=1e-09)

## Predicting the Test set results

In [10]:
y_pred = classifier.predict(X_test)
# Print actual labels (left column) next to predicted labels (right column).
actual_vs_predicted = np.column_stack((y_test, y_pred))
print(actual_vs_predicted)

[[0 1]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 1]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 0]
 [0 1]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [0 1]
 [1 1]
 [0 1]
 [1 1]
 [1 1]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [0 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 1]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 1]
 [1 0]
 [1 1]
 [1 1]

## Making the Confusion Matrix

In [11]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
# Rows are the actual classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[55, 42],
       [12, 91]])

In [12]:
# Share of test reviews whose predicted label equals the actual label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.73

In [13]:
# Of the reviews predicted positive, the share that really are positive.
precision_score(y_true=y_test, y_pred=y_pred)

0.6842105263157895

In [14]:
# Of the reviews that really are positive, the share the model found.
recall_score(y_true=y_test, y_pred=y_pred)

0.883495145631068

In [15]:
# Harmonic mean of precision and recall.
f1_score(y_true=y_test, y_pred=y_pred)

0.7711864406779663

## Predicting if a single review is positive or negative

### Positive Review

Use our model to predict if the following review is positive or negative:

" I love this restaurant so much"


**Solution**: We repeat the same text preprocessing steps we used before, this time on a single review.

In [16]:
# Clean the new review with the same steps used to build the training corpus,
# then vectorise it with the already-fitted CountVectorizer and classify it.
# Relies on `ps`, `all_stopwords`, `cv` and `classifier` from earlier cells.
new_review = "I love this restaurant so much"
stopword_lookup = set(all_stopwords)    # built once, not once per word
# Non-letters become spaces; the text is then lower-cased and tokenised.
new_words = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = " ".join(ps.stem(word) for word in new_words if word not in stopword_lookup)
# transform() (not fit_transform) reuses the vocabulary learned from the corpus.
new_X = cv.transform([new_review]).toarray()
classifier.predict(new_X)

array([1])

This review was correctly predicted as positive by our model.

### Negative Review

Use our model to predict if the following review is positive or negative:

" I hate this resturant so much."


In [17]:
# Clean the new review with the same steps used to build the training corpus,
# then vectorise it with the already-fitted CountVectorizer and classify it.
# Relies on `ps`, `all_stopwords`, `cv` and `classifier` from earlier cells.
new_review = " I hate this resturant so much."
stopword_lookup = set(all_stopwords)    # built once, not once per word
# Non-letters become spaces; the text is then lower-cased and tokenised.
new_words = re.sub('[^a-zA-Z]', ' ', new_review).lower().split()
new_review = " ".join(ps.stem(word) for word in new_words if word not in stopword_lookup)
# transform() (not fit_transform) reuses the vocabulary learned from the corpus.
new_X = cv.transform([new_review]).toarray()
classifier.predict(new_X)

array([0])

This review was correctly predicted as negative by our model.