In [4]:
import pandas as pd
import numpy as np
import joblib
import os

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error

# =========================
# PATHS
# =========================
DATA_PATH = r"C:\DineSense AI\data\processed\clean_restaurants.csv"
MODEL_DIR = r"C:\DineSense AI\models"
OUTPUT_DIR = r"C:\DineSense AI\data\outputs"

# Ensure directories exist
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# =========================
# LOAD DATA
# =========================
df = pd.read_csv(DATA_PATH)
print("‚úÖ Dataset loaded:", df.shape)

# =========================
# BASIC CLEANING
# =========================
df = df.dropna(subset=[
    "city",
    "aggregate_rating",
    "votes",
    "average_cost_for_two"
])

# =========================
# LOCATION AGGREGATION
# =========================
location_df = df.groupby("city").agg(
    restaurant_count=("restaurant_name", "count"),
    avg_rating=("aggregate_rating", "mean"),
    avg_votes=("votes", "mean"),
    avg_cost=("average_cost_for_two", "mean"),
    rating_std=("aggregate_rating", "std")
).reset_index()

location_df.fillna(0, inplace=True)
print("üìç Location dataset created:", location_df.shape)

# =========================
# LOCATION SCORE (STABLE)
# =========================
EPS = 1e-6  # numerical stability

location_df["location_score"] = (
    (location_df["avg_rating"] * 0.4) +
    (np.log1p(location_df["avg_votes"]) * 0.3) +
    ((1 / (location_df["avg_cost"] + EPS)) * 100 * 0.2) +
    (np.log1p(location_df["restaurant_count"]) * 0.1)
)

# =========================
# FEATURES & TARGET
# =========================
FEATURES = [
    "restaurant_count",
    "avg_rating",
    "avg_votes",
    "avg_cost",
    "rating_std"
]

X = location_df[FEATURES]
y = location_df["location_score"]

# =========================
# SCALING
# =========================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save scaler and features
joblib.dump(
    scaler,
    os.path.join(MODEL_DIR, "location_scaler_v1.joblib")
)

joblib.dump(
    FEATURES,
    os.path.join(MODEL_DIR, "location_features_v1.joblib")
)

# =========================
# TRAIN / TEST SPLIT
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42
)

# =========================
# RANDOM FOREST + TUNING
# =========================
rf = RandomForestRegressor(random_state=42)

param_grid = {
    "n_estimators": [150, 250, 350],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

search = RandomizedSearchCV(
    rf,
    param_grid,
    n_iter=20,
    cv=3,
    scoring="r2",
    n_jobs=-1,
    verbose=1
)

search.fit(X_train, y_train)

best_model = search.best_estimator_
print("‚úÖ Best RF Params:", search.best_params_)

# =========================
# EVALUATION
# =========================
y_pred = best_model.predict(X_test)

print("\nüìä Location Score Model Performance")
print("R¬≤ Score:", round(r2_score(y_test, y_pred), 4))
print("RMSE:", round(np.sqrt(mean_squared_error(y_test, y_pred)), 4))

# =========================
# SAVE REGRESSION MODEL
# =========================
joblib.dump(
    best_model,
    os.path.join(MODEL_DIR, "location_score_model_v1.joblib")
)

# =========================
# CLUSTERING (FULL DATA)
# =========================
kmeans = KMeans(n_clusters=4, random_state=42)
location_df["location_cluster"] = kmeans.fit_predict(X_scaled)

joblib.dump(
    kmeans,
    os.path.join(MODEL_DIR, "location_cluster_model_v1.joblib")
)

# =========================
# CLUSTER SUMMARY
# =========================
cluster_summary = (
    location_df
    .groupby("location_cluster")[["avg_rating", "avg_votes", "avg_cost", "restaurant_count"]]
    .mean()
)

print("\nüìç Location Cluster Summary:")
print(cluster_summary)

# =========================
# EXPORT CITY RANKINGS
# =========================
location_df.sort_values(
    "location_score",
    ascending=False
).to_csv(
    os.path.join(OUTPUT_DIR, "city_rankings.csv"),
    index=False
)

print("\nüèôÔ∏è Top Cities:")
print(location_df[["city", "location_score", "location_cluster"]].head(10))


‚úÖ Dataset loaded: (7299, 17)
üìç Location dataset created: (137, 6)
Fitting 3 folds for each of 20 candidates, totalling 60 fits
‚úÖ Best RF Params: {'n_estimators': 150, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': None}

üìä Location Score Model Performance
R¬≤ Score: 0.9104
RMSE: 0.174

üìç Location Cluster Summary:
                  avg_rating    avg_votes     avg_cost  restaurant_count
location_cluster                                                        
0                   3.652181   127.381883    73.957237          4.288462
1                   4.334484  1222.289286  1149.603175         13.571429
2                   4.046744   284.195756   511.095641         41.471429
3                   3.289430   152.515943   630.356515       3983.000000

üèôÔ∏è Top Cities:
        city  location_score  location_cluster
0  Abu Dhabi        3.848710                 2
1       Agra        3.312659                 2
2  Ahmedabad        3.908732                 2
3     Alban