# Natural Language Processing

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

In [None]:
# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside the review text are kept as ordinary characters.
df = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [None]:
# Quick inspection of the loaded data: first rows, summary statistics,
# schema/memory report, and column labels, separated by rule lines.
print(df.head())
print('---------------------------------------------------')
print(df.describe())
print('---------------------------------------------------')
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print('---------------------------------------------------')
print(df.columns)

### Cleaning the texts

In [None]:
# Fetch the NLTK data this notebook relies on:
#   - 'stopwords' for stopwords.words('english')
#   - the Punkt tokenizer models used by word_tokenize; without them
#     word_tokenize raises LookupError on a fresh environment.
# NOTE(review): newer NLTK releases look for 'punkt_tab' rather than 'punkt';
# both are requested so the notebook runs on either. A download of an
# already-present resource is a no-op.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Build the cleaned corpus: one space-joined string of stemmed words per review.
# The stop-word set, punctuation-stripping table and stemmer do not change
# between reviews, so they are created once, outside the loop.
stop_words = set(stopwords.words('english'))
table = str.maketrans('', '', string.punctuation)
ps = PorterStemmer()

corpus = []
# Iterate over the column values directly instead of df['Review'][i], which is
# a label lookup and only works when the index is the default 0..n-1 range.
for text in df['Review']:
    # Tokenise, then delete punctuation characters from inside each token.
    tokens = [token.translate(table) for token in word_tokenize(text)]
    # Keep purely alphabetic tokens (drops numbers and tokens that became
    # empty after punctuation removal) and lower-case them.
    words = [token.lower() for token in tokens if token.isalpha()]
    # Drop stop words, then reduce the remaining words to their Porter stems.
    stems = [ps.stem(word) for word in words if word not in stop_words]
    corpus.append(' '.join(stems))

In [None]:
# Notebook display of the cleaned corpus (one processed string per review).
corpus

### Creating the Bag of Words model

In [None]:
# Bag-of-words features: the vectorizer's vocabulary is capped at 1000 terms
# (max_features keeps the most frequent ones across the corpus).
cv = CountVectorizer(max_features=1000)
# fit_transform returns a sparse document-term matrix; densify it for the
# estimator used later in the notebook.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()
# Target vector: the second column of the data frame
# (presumably the sentiment label - verify against the TSV header).
y = df.iloc[:, 1].values

### Splitting the dataset into the Training set and Test set

In [None]:
# Hold out 20% of the samples as the test set; random_state=0 fixes the
# shuffle so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=0)

### Fitting Naive Bayes to the Training set

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split.
# NOTE(review): GaussianNB models each feature as normally distributed; for
# word-count features MultinomialNB is the more common choice - worth comparing.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
# Predict a label for every row of the held-out test features.
y_pred = classifier.predict(X_test)

In [None]:
# Notebook display of the predicted test-set labels.
y_pred

### Making the Confusion Matrix

In [None]:
# Confusion matrix of true vs. predicted test labels
# (rows index the true class, columns the predicted class).
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Notebook display of the confusion matrix.
cm

In [None]:
# Accuracy = correctly classified samples (the confusion-matrix diagonal)
# divided by all samples. Summing the diagonal with trace() keeps this correct
# for any number of classes, not just the 2x2 case.
accuracy = cm.trace() / cm.sum()
# Format as a fixed-precision percentage so binary floating-point noise
# (e.g. 56.49999999999999) never leaks into the printed result.
print(f'Accuracy = {accuracy:.2%}')