# Natural Language Processing

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Cleaning the texts

In [3]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to C:\Users\Ahmed Abdel-
[nltk_data]     Monem\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
corpus
from nltk import FreqDist
frequency_distribution = FreqDist(corpus)
print(frequency_distribution)

<FreqDist with 980 samples and 1000 outcomes>


In [6]:
frequency_distribution.most_common(100)

[('back', 5),
 ('return', 3),
 ('food good', 3),
 ('disappoint', 3),
 ('love place', 3),
 ('like', 2),
 ('delici', 2),
 ('go back', 2),
 ('food terribl', 2),
 ('mistak', 2),
 ('awesom', 2),
 ('would recommend place', 2),
 ('eat', 2),
 ('wow love place', 1),
 ('crust good', 1),
 ('tasti textur nasti', 1),
 ('stop late may bank holiday rick steve recommend love', 1),
 ('select menu great price', 1),
 ('get angri want damn pho', 1),
 ('honeslti tast fresh', 1),
 ('potato like rubber could tell made ahead time kept warmer', 1),
 ('fri great', 1),
 ('great touch', 1),
 ('servic prompt', 1),
 ('would go back', 1),
 ('cashier care ever say still end wayyy overpr', 1),
 ('tri cape cod ravoli chicken cranberri mmmm', 1),
 ('disgust pretti sure human hair', 1),
 ('shock sign indic cash', 1),
 ('highli recommend', 1),
 ('waitress littl slow servic', 1),
 ('place worth time let alon vega', 1),
 ('burritto blah', 1),
 ('food amaz', 1),
 ('servic also cute', 1),
 ('could care less interior beauti', 

In [None]:
ahmed mohamed omar x y z s 
1       0       1  0 0 0 0
0       1       0  1 0 0 0 


## Creating the Bag of Words model

In [69]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
X.shape

(1000, 1500)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [70]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Splitting the dataset into the Training set and Test set

In [73]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

## Training the Naive Bayes model on the Training set

In [72]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [74]:
from sklearn.ensemble import RandomForestClassifier
cls = RandomForestClassifier()
cls.fit(X_train,y_train)
y_pred_2 = cls.predict(X_test)


from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred_2)

0.71

## Predicting the Test set results

In [63]:
y_pred = classifier.predict(X_test)

In [64]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.715

## Making the Confusion Matrix

In [8]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[55 42]
 [12 91]]
