# Natural Language Processing

## Importing the libraries

In [None]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 

## Importing the dataset

In [None]:
# The reviews file is tab-separated, so the field separator is "\t".
# quoting = 3 makes the parser treat quote characters as ordinary text,
# so quotes appearing inside a review do not affect how fields are split.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep="\t",
    quoting=3,
)


## Cleaning the texts

In [None]:
# Preprocessing and cleaning texts
import re                       # regular expression library
import nltk                     # text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, wrappers

nltk.download('stopwords')      # Download all stopping words such as 'a', 'and', 'the', etc that are articles which doesnt give hint about review

from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer      # Remove tenses from words such as 'loved' and 'love'

# Cleaning the text
# Build the stemmer and the stop-word lookup once, outside the loop: they do not
# change between reviews.
ps = PorterStemmer()

allStopWords = stopwords.words('english')
allStopWords.remove('not')              # keep 'not': dropping it would hide negation in a review
stopWordSet = set(allStopWords)         # set membership tests are O(1)

corpus = []                 # all the cleaned texts, one string per review
# Iterate over every row of the 'Review' column instead of a hard-coded count,
# so the corpus always has exactly one entry per dataset row.
for rawReview in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', rawReview)    # replace every non-letter character with a space
    review = review.lower()                         # lower all characters
    review = review.split()                         # split the review into a list of words

    # Stem each word, skipping stop words. The filter must test against the
    # stop-word set; testing against an empty set() keeps every word.
    review = [ps.stem(word) for word in review if word not in stopWordSet]
    review = ' '.join(review)                       # join the stemmed words back into one string

    corpus.append(review)


In [None]:
# print(corpus)

## Creating the Bag of Words model

In [None]:
# tokenization using sklearn
from sklearn.feature_extraction.text import CountVectorizer
# Limit the vocabulary to at most 1500 features so that uncommon words
# (names, words that appear only once, etc.) are left out.
cv = CountVectorizer(max_features=1500)

# Learn the vocabulary from the cleaned corpus and count the words of each review,
# then convert the result to a dense array for the classifier.
word_counts = cv.fit_transform(corpus)
X = word_counts.toarray()

# The target values are taken from the last column of the dataset.
y = dataset.iloc[:, -1].values

In [None]:
# len(X[0])

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed random_state keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Training the Naive Bayes model on the Training set

In [None]:
# any classification model can be used
from sklearn.naive_bayes import GaussianNB

# Create a Gaussian Naive Bayes classifier and fit it on the training split.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

## Predicting the Test set results

In [None]:
# Predict a label for every review in the test set.
y_pred = classifier.predict(X_test)

# Print predictions and true labels side by side: column 0 is the prediction,
# column 1 is the actual label.
print(np.column_stack((y_pred, y_test)))

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Compare the true test labels with the predicted ones and print the confusion matrix.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Accuracy on the test set (shown as the cell's result).
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# accuracy can be improved with more data and by not excluding words that do affect the review