<a href="https://colab.research.google.com/github/Dhruvjain484/NLP-project-for-beginners/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Importing Libraries

In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


#Importing dataset

In [112]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting = 3) #quoting is used to remove the double quotes in the dataset


#Cleaning the data

In [113]:
import re #re is used to simplify the data by keeping words only removing all the punctuation, etc.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
all_words = stopwords.words('english')
all_words = [word for word in all_words if word not in ('not', 'no', "hadn't","didn't")]
from nltk.stem.porter import PorterStemmer
corpus = [] #List of all the clean reviews
for i in range(0, len(dataset)):
  review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i]) #removing that which not in the range a-z or A_Z
  review = review.lower()
  review = review.split() #Stop and print(review) to get a jist of what is going on
  ps = PorterStemmer()
  review = [ps.stem(word) for word in review if not word in set(all_words)]
  review = ' '.join(review)
  corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#Comparing original vs cleaned

In [126]:
dataset['cleaned_review'] = corpus
dataset

Unnamed: 0,Review,Liked,new,cleaned_review
0,Wow... Loved this place.,1,wow love place,wow love place
1,Crust is not good.,0,crust not good,crust not good
2,Not tasty and the texture was just nasty.,0,not tasti textur nasti,not tasti textur nasti
3,Stopped by during the late May bank holiday of...,1,stop late may bank holiday rick steve recommen...,stop late may bank holiday rick steve recommen...
4,The selection on the menu was great and so wer...,1,select menu great price,select menu great price
...,...,...,...,...
995,I think food should have flavor and texture an...,0,think food flavor textur lack,think food flavor textur lack
996,Appetite instantly gone.,0,appetit instantli gone,appetit instantli gone
997,Overall I was not impressed and would not go b...,0,overal not impress would not go back,overal not impress would not go back
998,"The whole experience was underwhelming, and I ...",0,whole experi underwhelm think go ninja sushi n...,whole experi underwhelm think go ninja sushi n...


#Bag of words model

In [115]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

In [116]:
len(x[0])

1500

#Splitting the dataset into train-test

In [117]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

#Training the classification model

In [118]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()

# from sklearn.linear_model import LogisticRegression
# classifier = LogisticRegression()
classifier.fit(x_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

#Predicting the result

In [119]:
y_pred = classifier.predict(x_test)


In [120]:
np.concatenate((y_test.reshape(len(y_test), 1), y_pred.reshape(len(y_pred), 1)), 1)

array([[0, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [1, 1],
       [0, 0],
       [1, 1],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 1],
       [1, 1],
       [0, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [1, 0],
       [0, 0],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 1],
       [0, 0],
       [0, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [0, 0],
       [1, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [0, 1],
       [0, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 0],
       [0, 1],
       [0,

#Confusion matrix




In [121]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cm

array([[49, 49],
       [13, 89]])

In [122]:
accuracy_score(y_test,y_pred)

0.69

#K-Fold cross validation

In [123]:
from sklearn.model_selection import cross_val_score
cv = cross_val_score(classifier, x_train, y_train, cv = 10)
cv.mean()

0.6925000000000001