# Natural Language Processing
# Sentiment analysis

## Importing the libraries

In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the dataset

In [114]:
# Load the restaurant reviews. The file is tab-separated, and quoting=3
# (csv.QUOTE_NONE) makes the parser treat double quotes inside a review
# as ordinary characters instead of field delimiters.
dataset = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep='\t',
    quoting=3,
)

## Cleaning the texts

In [115]:
import re
import nltk  # provides the stop-word list (common words that carry little meaning, e.g. "the", "and")
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer  # stemming reduces a word to its root (e.g. "loved" -> "love")

# The stemmer and the stop-word list do not change between reviews, so build
# them once instead of once per review (and once per word for the set).
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
# "not" and "no" are in the English stop-word list, but they carry negative
# sentiment, so they are taken out of the list and therefore kept in the reviews.
all_stopwords.remove('not')
all_stopwords.remove('no')
stopword_set = set(all_stopwords)  # constant-time membership test inside the loop

corpus = []  # one cleaned, stemmed string per review
# Iterate over every review in the column rather than a hard-coded 1000 rows,
# so corpus always has exactly as many entries as the dataset has rows.
for raw_review in dataset['Review']:
    # Replace every character other than a-z / A-Z (punctuation, digits, ...) with a space.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()  # normalise case
    review = review.split()  # break the review into individual words
    # Stem every word that is not a stop word.
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)  # back to a single space-separated string
    corpus.append(review)

[nltk_data] Downloading package stopwords to /Users/anju/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Creating the Bag of Words model
To create the sparse matrix: each row is a different review from the corpus,
and each column holds the number of occurrences of one word (this process is called tokenization).

In [116]:
# Tokenization with scikit-learn's CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# max_features caps the number of columns at the 1500 most frequent words.
cv = CountVectorizer(max_features=1500)

# fit: learn the vocabulary from the corpus; transform: count each word per review.
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray()  # dense feature array, one row per review

# Dependent variable vector: the last column of the dataset.
labels_column = dataset.iloc[:, -1]
y = labels_column.values

In [117]:
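# x is a matrix with one row per review; each column holds the count of one word in that review (mostly 0 or 1)
# number of columns = number of distinct words kept (capped by max_features)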

In [118]:
# Number of feature columns; check this value when choosing max_features.
x.shape[1]

1500

## Splitting the dataset into the Training set and Test set

In [119]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; random_state fixes the shuffle
# so the split is the same on every run.
(
    x_train,
    x_test,
    y_train,
    y_test,
) = train_test_split(x, y, random_state=0, test_size=0.2)

## Training the Naive Bayes model on the Training set

In [120]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with its default settings, fitted on the training split.
classifier = GaussianNB()
classifier.fit(X=x_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

## Predicting the Test set results

In [121]:
# Predict the test-set labels, then print them next to the true labels:
# left column = prediction, right column = actual value.
y_pred = classifier.predict(x_test)
predicted_column = y_pred.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
print(np.concatenate((predicted_column, actual_column), axis=1))

[[1 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [0 0]
 [0 1]
 [1 1]
 [1 0]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [0 0]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [1 0]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 1]
 [0 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [0 0]
 [0 0]
 [1 0]
 [1 1]
 [1 0]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 0]
 [1 1]
 [1 1]
 [0 0]
 [0 1]
 [0 1]
 [1 1]
 [0 0]
 [1 0]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [0 0]
 [1 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]
 [0 0]
 [0 0]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 1]
 [1 0]
 [0 1]
 [1 1]
 [1 1]

## Making the Confusion Matrix

In [122]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Rows of the confusion matrix are the true classes, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test reviews classified correctly (shown as the cell's output).
accuracy_score(y_true=y_test, y_pred=y_pred)

[[55 42]
 [12 91]]


0.73