In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [2]:
df = pd .read_csv("reviews.csv")

In [3]:
df.head()

Unnamed: 0,Country,Restaurant Name,Sentiment,Review Title,Review Date,Review
0,France,The Frog at Bercy Village,Negative,Rude manager,May 2024 •,The manager became agressive when I said the c...
1,France,The Frog at Bercy Village,Negative,A big disappointment,Feb 2024 •,"I ordered a beef fillet ask to be done medium,..."
2,France,The Frog at Bercy Village,Negative,Pretty Place with Bland Food,Nov 2023 •,"This is an attractive venue with welcoming, al..."
3,France,The Frog at Bercy Village,Negative,Great service and wine but inedible food,Mar 2023 •,Sadly I used the high TripAdvisor rating too ...
4,France,The Frog at Bercy Village,Negative,Avoid- Worst meal in Rome - possibly ever,Nov 2022 •,From the start this meal was bad- especially g...


In [4]:
# Initialize lemmatizer and stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ANUP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ANUP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ANUP\AppData\Roaming\nltk_data...


In [5]:
# Text preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters, punctuation, and numbers
    tokens = word_tokenize(text)  # Tokenize text
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Lemmatization & stopword removal
    return ' '.join(tokens)


In [7]:
# Apply preprocessing
df['Cleaned_Review'] = df['Review'].astype(str).apply(preprocess_text)


In [9]:
# Text Vectorization using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Cleaned_Review'])
y = df['Sentiment'].map({'Positive': 1, 'Negative': 0})  # Convert sentiment to binary labels

In [10]:
# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Train Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

In [12]:
# Predictions
y_pred = model.predict(X_test)

In [13]:
# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)


In [14]:
# Print Evaluation Metrics
print("Model Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model Evaluation:
Accuracy: 0.91
Precision: 0.90
Recall: 1.00
F1-Score: 0.95

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.43      0.60        49
           1       0.90      1.00      0.95       252

    accuracy                           0.91       301
   macro avg       0.95      0.71      0.77       301
weighted avg       0.92      0.91      0.89       301



In [16]:
# Example of correctly and incorrectly classified reviews
df['Predicted'] = model.predict(vectorizer.transform(df['Cleaned_Review']))
correct_predictions = df[df['Predicted'] == df['Sentiment'].map({'Positive': 1, 'Negative': 0})].head(3)
incorrect_predictions = df[df['Predicted'] != df['Sentiment'].map({'Positive': 1, 'Negative': 0})].head(3)


In [17]:
print("\nCorrectly Classified Reviews:")
print(correct_predictions[['Review', 'Sentiment']])



Correctly Classified Reviews:
                                              Review Sentiment
1  I ordered a beef fillet ask to be done medium,...  Negative
3  Sadly I  used the high TripAdvisor rating too ...  Negative
4  From the start this meal was bad- especially g...  Negative


In [18]:
print("\nIncorrectly Classified Reviews:")
print(incorrect_predictions[['Review', 'Sentiment']])


Incorrectly Classified Reviews:
                                              Review Sentiment
0  The manager became agressive when I said the c...  Negative
2  This is an attractive venue with welcoming, al...  Negative
6  We tired the tasting menu with wine pairing. T...  Negative
