Import Libraries

In [None]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt

Importing DataSet

In [None]:
dataset=pd.read_csv('dataset_project.csv')

In [None]:
dataset.shape

(10000, 2)

In [None]:
dataset.head()

Unnamed: 0,reviews,positive/negative
0,Some glitches are there but it's good,0
1,Phlox when Lexi is well,1
2,I like it,1
3,Good wrks great,0
4,Simple to use,0


In [None]:
dataset.columns

Index(['reviews', 'positive/negative'], dtype='object')

In [None]:
dataset['positive/negative'].value_counts()

1         7427
0         2572
#NAME?       1
Name: positive/negative, dtype: int64

In [None]:
dataset.isnull().any()

reviews              False
positive/negative    False
dtype: bool

In [None]:
# dataset.isnull().sum()

Cleaning the Texts

In [None]:
# simplifying all the texts, removing punctuations, special characters, uniformize the case of letters (lowercase)

# re: llibrary to simplify the reviews 
# nltk: library that enable us to dwnld the essemble of stopwords

In [None]:
import re  
import nltk

nltk.download('stopwords') 
# nltk.download('wordnet')
# nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
# from nltk.stem import WordNetLemmatizer

cleaned_data=[]

for i in range(0,len(dataset)):
  review=re.sub('[^a-zA-Z]',' ',dataset['reviews'][i])  
  review=review.lower() 
  review=review.split()
  ps=PorterStemmer()
  # wordnet=WordNetLemmatizer()

  all_stopwords=stopwords.words('english')
  all_stopwords.remove('not')

  review= [ps.stem(word) for word in review if not word in set(all_stopwords)] 
  # instead of taking set of original stopwords to exclude from review, we are now taking all stopwords except not
  review=' '.join(review) 
  cleaned_data.append(review)

# cleaned_data

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Bag of Words Model in NLP

In [None]:
from sklearn.feature_extraction.text import CountVectorizer 

cv = CountVectorizer(max_features=2000)  

x=cv.fit_transform(cleaned_data).toarray() 
# features matrix shoud be 2D array not vector, because naive bayes expect array as input
y=dataset.iloc[:,-1].values # dependent variable vector

# y
# len(y)
# x
# x.shape
# len(x)
# len(x[0])  # 5722 tokens from tokenization,before max_features=2000,

# considering 2000 most frequent words for cv, so that we can get rid of waste words that dont help for predicting review 

# **********   hence we created matrix of features and dependent variable vector y ********

Splitting DataSet into TestSet and TrainingSet

In [None]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

Training Naive Bayes Model On Training Set

In [None]:
# naive bayes method does give good results with nlp problems, thus started with naive bayes classifier 
# Also tried other classifiers to see which gives best accuracy 

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)

GaussianNB()

Predicting TestSet Result

In [None]:
y_predicted = classifier.predict(x_test)
np.concatenate((y_predicted.reshape(len(y_predicted),1), y_test.reshape(len(y_test),1)),1)

array([['1', '1'],
       ['1', '1'],
       ['1', '1'],
       ...,
       ['1', '1'],
       ['0', '0'],
       ['0', '0']], dtype=object)

Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix,accuracy_score
cm=confusion_matrix(y_test,y_predicted)
cm

array([[   0,    0,    1],
       [   0,  258,  238],
       [   0,  181, 1322]])

In [None]:
accuracy_score(y_test,y_predicted)

0.79

Predicting if single review is Positive/Negative

In [None]:
# positive review 

new_review = 'I love this app so much'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_x_test = cv.transform(new_corpus).toarray()
new_y_predicted = classifier.predict(new_x_test)
print(new_y_predicted)

['1']


In [None]:
# negative review

new_review = 'I hate this app so much'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_x_test = cv.transform(new_corpus).toarray()
new_y_predicted = classifier.predict(new_x_test)
print(new_y_predicted)

['0']
