In [6]:
# ==============================
# 01_rating_prediction.ipynb
# DineSense AI â€“ Rating Prediction (Tuned)
# ==============================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV

# ------------------------------
# 1. Load Cleaned Dataset
# ------------------------------
DATA_PATH = r"C:\DineSense AI\data\processed\clean_restaurants.csv"
df = pd.read_csv(DATA_PATH)
print("âœ… Cleaned dataset loaded. Shape:", df.shape)

# ------------------------------
# 2. Features & Target
# ------------------------------
features = [
    'average_cost_for_two',
    'votes',
    'price_range',
    'cuisines',
    'city'
]
target = 'aggregate_rating'

X = df[features].copy()
y = df[target]

# ------------------------------
# 3. Train/Test Split
# ------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

# ------------------------------
# 4. Preprocessing
# ------------------------------
cat_features = ['cuisines', 'city']
num_features = ['average_cost_for_two', 'votes', 'price_range']

# Convert cuisines list â†’ string
def join_cuisines(x):
    if isinstance(x, list):
        return " ".join(x)
    return str(x)

X_train.loc[:, 'cuisines'] = X_train['cuisines'].apply(join_cuisines)
X_test.loc[:, 'cuisines'] = X_test['cuisines'].apply(join_cuisines)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_features),
        ('num', 'passthrough', num_features)
    ]
)

# ------------------------------
# 5. Model Pipeline
# ------------------------------
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(
        random_state=42,
        n_jobs=-1
    ))
])

# ------------------------------
# 6. Hyperparameter Search Space
# ------------------------------
param_distributions = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [10, 15, 20, None],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': ['sqrt', 'log2']
}

# ------------------------------
# 7. Halving Random Search (FAST)
# ------------------------------
search = HalvingRandomSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    scoring='neg_root_mean_squared_error',
    factor=3,
    cv=3,
    n_jobs=-1,
    verbose=2
)

# ------------------------------
# 8. Train Tuned Model
# ------------------------------
search.fit(X_train, y_train)
print("\nâœ… Best Hyperparameters:")
print(search.best_params_)

# ------------------------------
# 9. Evaluate on Test Set
# ------------------------------
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred) ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nðŸŽ¯ Tuned Model Performance:")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"RÂ² Score: {r2:.3f}")


âœ… Cleaned dataset loaded. Shape: (7299, 17)
Train shape: (5839, 5) Test shape: (1460, 5)
n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 7
min_resources_: 6
max_resources_: 5839
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 216
n_resources: 6
Fitting 3 folds for each of 216 candidates, totalling 648 fits




----------
iter: 1
n_candidates: 72
n_resources: 18
Fitting 3 folds for each of 72 candidates, totalling 216 fits
----------
iter: 2
n_candidates: 24
n_resources: 54
Fitting 3 folds for each of 24 candidates, totalling 72 fits
----------
iter: 3
n_candidates: 8
n_resources: 162
Fitting 3 folds for each of 8 candidates, totalling 24 fits
----------
iter: 4
n_candidates: 3
n_resources: 486
Fitting 3 folds for each of 3 candidates, totalling 9 fits

âœ… Best Hyperparameters:
{'regressor__n_estimators': 200, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'sqrt', 'regressor__max_depth': None}

ðŸŽ¯ Tuned Model Performance:
RMSE: 0.355
MAE: 0.263
RÂ² Score: 0.590


In [7]:
import joblib
import os

MODEL_DIR = r"C:\DineSense AI\models"
os.makedirs(MODEL_DIR, exist_ok=True)

MODEL_PATH = os.path.join(MODEL_DIR, "rating_prediction_model.joblib")

joblib.dump(best_model, MODEL_PATH)

print(f"âœ… Model saved successfully at: {MODEL_PATH}")


âœ… Model saved successfully at: C:\DineSense AI\models\rating_prediction_model.joblib
