# Predict reviews rating with Multinomial Naive Bayes

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import Word, TextBlob

In [2]:
df = pd.read_csv('reviews.csv')

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('english')
custom_stop_words = ['Disney', 'Disneyland', 'Disneyworld']
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /Users/donor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/donor/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/donor/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
def preprocess_reviews(review, custom_stopwords):
    preprocessed_review = review
    preprocessed_review.replace('[^\w\s]', '') # remove whitespace at the beginning of the review
    preprocessed_review = ' '.join(word for word in preprocessed_review.split() if word not in stop_words)
    preprocessed_review = ' '.join(Word(word).lemmatize() for word in preprocessed_review.split())
    return preprocessed_review

In [5]:
df['processed_review'] = df['Review_Text'].apply(lambda x: preprocess_reviews(x, custom_stop_words))
df

Unnamed: 0,Rating,Year_Month,Reviewer_Location,Review_Text,processed_review
0,5,2019-3,United Arab Emirates,"We've been to Disneyland Hongkong and Tokyo, s...","We've Disneyland Hongkong Tokyo, far one best...."
1,4,2018-6,United Kingdom,I went to Disneyland Paris in April 2018 on Ea...,I went Disneyland Paris April 2018 Easter week...
2,5,2019-4,United Kingdom,"What a fantastic place, the queues were decent...","What fantastic place, queue decent best time y..."
3,4,2019-4,Australia,We didn't realise it was school holidays when ...,"We realise school holiday went, consequently e..."
4,5,missing,France,A Trip to Disney makes you all warm and fuzzy ...,A Trip Disney make warm fuzzy actual kid again...
...,...,...,...,...,...
13625,5,missing,United Kingdom,i went to disneyland paris in july 03 and thou...,went disneyland paris july 03 thought brillian...
13626,5,missing,Canada,2 adults and 1 child of 11 visited Disneyland ...,2 adult 1 child 11 visited Disneyland Paris be...
13627,5,missing,South Africa,My eleven year old daughter and myself went to...,My eleven year old daughter went visit son Lon...
13628,4,missing,United States,"This hotel, part of the Disneyland Paris compl...","This hotel, part Disneyland Paris complex, won..."


In [6]:
X = df['processed_review']
y = df['Rating']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
vect = CountVectorizer(stop_words = 'english')
X_train_matrix = vect.fit_transform(X_train)

In [10]:
from sklearn.naive_bayes import MultinomialNB

clf=MultinomialNB()
clf.fit(X_train_matrix, y_train)
print(clf.score(X_train_matrix, y_train))

X_test_matrix = vect.transform(X_test)
print (clf.score(X_test_matrix, y_test))

0.7585158788386962
0.5385179750550256


In [11]:
predicted_result=clf.predict(X_test_matrix)
from sklearn.metrics import classification_report
print(classification_report(y_test,predicted_result))

              precision    recall  f1-score   support

           1       0.66      0.13      0.21       247
           2       0.32      0.06      0.10       311
           3       0.36      0.37      0.37       627
           4       0.35      0.31      0.33      1065
           5       0.66      0.86      0.75      1839

    accuracy                           0.54      4089
   macro avg       0.47      0.35      0.35      4089
weighted avg       0.51      0.54      0.50      4089



In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words = 'english')

X_train_tfidf = vectorizer.fit_transform(X_train)
X_train_tfidf.shape

(9541, 22894)

In [13]:
from sklearn.naive_bayes import MultinomialNB
clf2=MultinomialNB()
clf2.fit(X_train_tfidf, y_train)
print(clf2.score(X_train_tfidf, y_train))
X_test_tfidf = vectorizer.transform(X_test)
print (clf2.score(X_test_tfidf, y_test))

0.4588617545330678
0.44925409635607727


In [14]:
predicted_result_2=clf2.predict(X_test_tfidf)
from sklearn.metrics import classification_report
print(classification_report(y_test,predicted_result_2))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       247
           2       0.00      0.00      0.00       311
           3       0.00      0.00      0.00       627
           4       0.03      0.00      0.00      1065
           5       0.46      1.00      0.63      1839

    accuracy                           0.45      4089
   macro avg       0.10      0.20      0.13      4089
weighted avg       0.21      0.45      0.28      4089



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
