In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score, recall_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
import re

In [4]:
# Download NLTK stopwords if not already
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aadit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
# Load the dataset
data = pd.read_csv("C:/Users/aadit/OneDrive/文档/Engineering TY/Sem-6th/Data_Mining/Restaurant_Reviews.tsv",sep="\t",quoting=3)

In [7]:
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [9]:
# Preprocessing : remove stopwords + apply stemming
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

In [10]:
def preprocess(text):
    text = re.sub(r'\W+', ' ', text)  # Remove non-word characters
    words = text.lower().split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [11]:
# Apply Preprocessing
data['Review_cleaned'] = data['Review'].apply(preprocess)

In [12]:
# Feature Extraction
vectorizer = TfidfVectorizer(max_features=1000)  # 1000 features for simplicity
X = vectorizer.fit_transform(data['Review_cleaned'])


In [13]:
# Labels
y = data['Liked']  # Assuming the label is in column 'Liked'

In [14]:
#  Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
#  Classify with Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


In [16]:
#  Evaluate
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')

Precision: 0.7732
Recall: 0.7212
