# NLP

## Importing libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing dataset

In [5]:
dataset = pd.read_csv(r"C:\Users\arunk\Downloads\Zomato Project\Zomato.csv", delimiter = '\t', quoting = 3, encoding='latin-1')

## Cleaning the dataset

In [7]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
if 'not' in all_stopwords:
    all_stopwords.remove('not')

n_rows = min(1000, len(dataset))
for i in range(0, n_rows):
  # dataset was read with a single combined column "Review,Liked".
  # split the row at the last comma to extract the review text safely.
  row = str(dataset.iloc[i, 0])
  if ',' in row:
    review = row.rsplit(',', 1)[0]
  else:
    review = row

  zomato_ratings = re.sub('[^a-zA-Z]', ' ', review)
  zomato_ratings = zomato_ratings.lower()
  zomato_ratings = zomato_ratings.split()
  zomato_ratings = [ps.stem(word) for word in zomato_ratings if word not in set(all_stopwords)]
  zomato_ratings = ' '.join(zomato_ratings)
  corpus.append(zomato_ratings)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arunk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
print(corpus)

['wow love place', 'crust not good', 'not tasti textur nasti', 'stop late may bank holiday rick steve recommend love', 'select menu great price', 'get angri want damn pho', 'honeslti tast fresh', 'potato like rubber could tell made ahead time kept warmer', 'fri great', 'great touch', 'servic prompt', 'would not go back', 'cashier care ever say still end wayyy overpr', 'tri cape cod ravoli chicken cranberri mmmm', 'disgust pretti sure human hair', 'shock sign indic cash', 'highli recommend', 'waitress littl slow servic', 'place not worth time let alon vega', 'not like', 'burritto blah', 'food amaz', 'servic also cute', 'could care less interior beauti', 'perform', 'right red velvet cake ohhh stuff good', 'never brought salad ask', 'hole wall great mexican street taco friendli staff', 'took hour get food tabl restaur food luke warm sever run around like total overwhelm', 'worst salmon sashimi', 'also combo like burger fri beer decent deal', 'like final blow', 'found place accid could not

## Bag of words creation

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, -1].values

## Splitting data into test and training sets

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 21)

## Training Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

  ys_types = set(type_of_target(x) for x in ys)


0,1,2
,priors,
,var_smoothing,1e-09


## Prediction on the test set

In [12]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[['"I\'m so happy to be here!!!""",1'
  '"Worse of all, he humiliated his worker right in front of me..Bunch of horrible name callings.",0']
 ['"The food, amazing.",1' 'The food was terrible.,0']
 ['"I\'m so happy to be here!!!""",1'
  '"Went in for happy hour, great list of wines.",1']
 ['"I\'m so happy to be here!!!""",1' 'They were excellent.,1']
 ['Great service and food.,1'
  '"Good value, great food, great service.",1']
 ['"I\'m so happy to be here!!!""",1' '2 Thumbs Up!!,1']
 ["Don't waste your time here.,0"
  "Bland... Not a liking this place for a number of reasons and I don't want to waste time on bad reviewing.. I'll leave it at that...,0"]
 ['"After 20 minutes wait, I got a table.",0'
  'Waited 2 hours & never got either of our pizzas as many other around us who came in later did!,0']
 ['Awesome service and food.,1'
  'My boyfriend and I came here for the first time on a recent trip to Vegas and could not have been more pleased with the quality of food and service.,1']
 ['"

## Making Confusion Matrix

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")


0.005