# Yelp Review Rating Prediction

This notebook builds a model that predicts a customer's **rating (stars)**
from their **written feedback (review)**, using TF-IDF text features and a logistic regression classifier.

**Dataset**: `yelp.csv`

**Features**: `text` (review text)  
**Label**: `label` (star rating)

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and NLP
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# Load the Yelp dataset
df = pd.read_csv("yelp.csv")

# Drop unnecessary index column if present
# (errors="ignore" makes this a no-op when the column is absent)
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Fail fast with a readable message if the columns used below are absent,
# instead of a bare KeyError several cells later.
missing_cols = {"text", "label"} - set(df.columns)
if missing_cols:
    raise KeyError(f"yelp.csv is missing required column(s): {sorted(missing_cols)}")

# pandas reads empty CSV fields as NaN. A NaN review cannot be vectorized and
# a NaN label cannot be used as a target, so drop such rows here rather than
# letting the vectorizer/classifier fail on them later.
df = df.dropna(subset=["text", "label"])

# Display first few rows
df.head()

In [None]:
# Summarize columns, dtypes and non-null counts
df.info()

# Bar chart of how many reviews fall under each label value
fig, ax = plt.subplots()
sns.countplot(data=df, x="label", ax=ax)
ax.set_title("Distribution of Star Ratings")
plt.show()

In [None]:
# Review text is the model input (X); the label column is the target (y)
X, y = df["text"], df["label"]

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

(X_train.shape, X_test.shape)

In [None]:
# Turn raw review text into TF-IDF vectors: English stop words removed,
# vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words="english",
)

# Fit on the training reviews only, then apply the fitted vectorizer to the
# test reviews so nothing is learned from the held-out data.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

(X_train_tfidf.shape, X_test_tfidf.shape)

In [None]:
# Fit a logistic regression classifier on the TF-IDF training matrix.
# max_iter is raised to 1000 to give the solver more iterations to converge.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predicted labels for the held-out reviews
y_pred = model.predict(X_test_tfidf)

In [None]:
# Print accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))

# Print the report in its own call: passing it as a second argument to
# print() inserts a separator space before its first line, which shifts the
# header row one column out of alignment with the rows beneath it.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Plot confusion matrix
# Sorted set of every label seen in the true or predicted values. Passing it
# explicitly fixes the row/column order and lets the heatmap ticks show the
# actual label values instead of positional indices 0..n-1.
class_labels = np.union1d(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")  # columns are predicted labels
ax.set_ylabel("True")       # rows are true labels
plt.show()