In [1]:
#Library files importation and tsv dataset loading
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB

In [2]:
data = pd.read_csv("Restaurant_Reviews.tsv", delimiter="\t")

In [3]:
#Dataset displaying all data
data

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [20]:
#EDA analysis and data classification of the dataset
print("Number of reviews:", data.shape[0])
print("Average length of reviews:", data["Review"].str.len().mean())

Number of reviews: 1003
Average length of reviews: 58.315


In [6]:
# Addition of fake reviews data
data["review_type"] = data["Review"].apply(lambda x: "good" if "great" in x or "excellent" in x else "bad" if "spam" in x or "rude" in x else "fake")

In [7]:
# New List
fake_reviews = [
 "This is a fake review.",
 "This is another fake review.",
 "This is a third fake review."
]

In [8]:
fake_data = pd.DataFrame({"review": fake_reviews, "review_type": "fake"})

In [9]:
data = data.append(fake_data, ignore_index=True)

  data = data.append(fake_data, ignore_index=True)


In [21]:
data

Unnamed: 0,Review,Liked,review_type,review,cleaned_review
0,Wow... Loved this place.,1.0,fake,,
1,Crust is not good.,0.0,fake,,
2,Not tasty and the texture was just nasty.,0.0,fake,,
3,Stopped by during the late May bank holiday of...,1.0,fake,,
4,The selection on the menu was great and so wer...,1.0,good,,
...,...,...,...,...,...
998,"The whole experience was underwhelming, and I ...",0.0,fake,,
999,"Then, as if I hadn't wasted enough of my life ...",0.0,fake,,
1000,,,fake,This is a fake review.,fake review.
1001,,,fake,This is another fake review.,anoth fake review.


In [10]:
#Addition of stopwords and dataset cleaning and preprocessing
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
def clean_text(text):
 stop_words = set(stopwords.words("english"))
 stemmer = PorterStemmer()
 text = str(text)
 text = text.lower()
 text = [stemmer.stem(word) for word in text.split() if word not in stop_words]
 text = " ".join(text)
 return text

In [12]:
data["cleaned_review"] = data["review"].apply(clean_text)

In [13]:
# TF-IDF Analysis
#TF-IDF analysis and Naïve Bayes model addition for the reviews accuracy,
# confusion and classification report
vectorizer = TfidfVectorizer()

In [14]:
X = vectorizer.fit_transform(data["cleaned_review"])
y = data["review_type"]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [17]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9203980099502488


In [18]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Confusion Matrix:
 [[  0   2   0]
 [  0 185   0]
 [  0  14   0]]


In [19]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

         bad       0.00      0.00      0.00         2
        fake       0.92      1.00      0.96       185
        good       0.00      0.00      0.00        14

    accuracy                           0.92       201
   macro avg       0.31      0.33      0.32       201
weighted avg       0.85      0.92      0.88       201



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
