In [2]:
import pandas as pd
import numpy as np
import joblib
import os
import ast

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# =====================================================
# 1. CONFIG
# =====================================================
# Smallest number of rows a primary-cuisine label must have to be kept
# as a class (rarer labels are filtered out before training).
MIN_SAMPLES_PER_CLASS = 50

# Input: the pre-cleaned restaurant table.
DATA_PATH = r"C:\DineSense AI\data\processed\clean_restaurants.csv"

# Output: directory and file the trained pipeline is written to.
MODEL_DIR = r"C:\DineSense AI\models"
MODEL_PATH = os.path.join(MODEL_DIR, "cuisine_classifier_v2.joblib")

# =====================================================
# 2. HELPERS
# =====================================================
def clean_list(value):
    """Flatten a list-like cell into one space-separated string.

    Three input shapes are handled:

    * a real ``list`` -> its items, stringified and joined with spaces;
    * a string holding a Python list literal (e.g. ``"['a', 'b']"``) ->
      parsed with ``ast.literal_eval`` and joined the same way;
    * anything else -> ``str(value)``.

    Missing scalars (``None``, ``NaN``, ...) become ``""``.

    NOTE(review): items are joined with spaces, not commas, so a later
    ``split(",")`` on the result cannot recover the individual items.

    Args:
        value: Raw cell value (list, string, scalar, or missing).

    Returns:
        The flattened text, or ``""`` for a missing value.
    """
    # Lists must be handled before the missing-value check: ``pd.isna`` is
    # element-wise on a list and returns an array, whose truth value is
    # ambiguous (ValueError) as soon as the list has more than one item.
    if isinstance(value, list):
        return " ".join(map(str, value))
    if pd.isna(value):
        return ""
    if isinstance(value, str):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal: best effort, keep the raw text.
            return value
        if isinstance(parsed, list):
            return " ".join(map(str, parsed))
        # A literal that is not a list (number, tuple, ...): keep raw text.
        return value
    return str(value)

def extract_primary_cuisine(cuisine_text):
    """Return the first comma-separated cuisine, lower-cased and trimmed.

    Falls back to ``"unknown"`` when the text is empty/``None`` or when the
    first entry is blank (e.g. ``"   "`` or ``", italian"``), so an empty
    string is never produced as a class label.

    Args:
        cuisine_text: Cuisine description, possibly comma-separated.

    Returns:
        The normalised first entry, or ``"unknown"``.
    """
    if not cuisine_text:
        return "unknown"
    first_entry, _, _ = cuisine_text.lower().partition(",")
    # ``or`` catches whitespace-only / blank first entries.
    return first_entry.strip() or "unknown"

# =====================================================
# 3. LOAD DATA
# =====================================================
# read_csv already returns a fresh frame, so no defensive copy is needed here.
df = pd.read_csv(DATA_PATH)
print(f"‚úÖ Dataset loaded: {df.shape}")

# Flatten the raw cuisines column to plain text.
df["cuisines"] = df["cuisines"].apply(clean_list)

# Create target
df["primary_cuisine"] = df["cuisines"].apply(extract_primary_cuisine)

# Remove rare classes
counts = df["primary_cuisine"].value_counts()
valid = counts[counts >= MIN_SAMPLES_PER_CLASS].index
# .copy() detaches the filtered rows from the original frame so that columns
# added afterwards do not trigger pandas' SettingWithCopyWarning.
df = df[df["primary_cuisine"].isin(valid)].copy()

print(f"‚úÖ Classes retained: {df['primary_cuisine'].nunique()}")

# =====================================================
# 4. INPUT TEXT (NO LEAKAGE)
# =====================================================
# Model input is built from name + locality + city only; the cuisines column
# (the source of the target) is deliberately left out.
# fillna("") runs before astype(str) because astype(str) alone would turn a
# missing value into the literal token "nan" in the text.
df["text"] = (
    df["restaurant_name"].fillna("").astype(str) + " " +
    df["locality"].fillna("").astype(str) + " " +
    df["city"].fillna("").astype(str)
)

X = df["text"]
y = df["primary_cuisine"]

# 80/20 split, stratified on the label so each class appears in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

# =====================================================
# 5. MODEL + TUNING
# =====================================================
# TF-IDF over word uni- and bi-grams feeding a linear SVM.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1, 2),
        max_df=0.9,
        min_df=5,
        stop_words="english"
    )),
    # Seeded like the train/test split and the search below, so that reruns
    # produce the same model.
    ("clf", LinearSVC(random_state=42))
])

# Candidate values for the search: 3 x 5 = 15 combinations in total.
param_grid = {
    "tfidf__max_features": [5000, 10000, 20000],
    "clf__C": [0.1, 0.5, 1, 2, 5]
}

print("üîç Training REAL cuisine classifier...")

# Samples 10 of the 15 combinations, each scored with 3-fold CV on
# weighted F1.
search = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=10,
    cv=3,
    scoring="f1_weighted",
    verbose=2,
    n_jobs=-1,
    random_state=42
)

search.fit(X_train, y_train)

# Best pipeline found by the search; used for evaluation, saving and inference.
model = search.best_estimator_
print("‚úÖ Best Params:", search.best_params_)

# =====================================================
# 6. EVALUATION
# =====================================================
# Score the tuned pipeline on the held-out split and print per-class metrics.
y_pred = model.predict(X_test)
print("\nüìä REAL Classification Report:\n")
_report_text = classification_report(y_test, y_pred)
print(_report_text)

# =====================================================
# 7. SAVE MODEL
# =====================================================
# Create the output directory first (no error if it already exists) so the
# dump below has somewhere to write.
os.makedirs(MODEL_DIR, exist_ok=True)
# Persist the selected pipeline to MODEL_PATH with joblib.
joblib.dump(model, MODEL_PATH)
print(f"‚úÖ Cuisine classifier saved at: {MODEL_PATH}")

# =====================================================
# 8. INFERENCE (API READY)
# =====================================================
def predict_cuisine(restaurant_name, locality, city):
    """Predict the cuisine label for a single restaurant.

    The three fields are joined with single spaces into one text string,
    which is passed to the module-level ``model`` as a one-item batch.

    Args:
        restaurant_name: Name of the restaurant.
        locality: Locality the restaurant is in.
        city: City the restaurant is in.

    Returns:
        The first (and only) prediction returned by ``model.predict``.
    """
    query = " ".join(str(part) for part in (restaurant_name, locality, city))
    predictions = model.predict([query])
    return predictions[0]

# Smoke-test the inference helper on one hand-written example.
print("\nüçΩÔ∏è Sample Prediction:")
_sample_cuisine = predict_cuisine("Domino's Pizza", "Indiranagar", "Bangalore")
print(_sample_cuisine)


‚úÖ Dataset loaded: (7299, 17)
‚úÖ Classes retained: 24
üîç Training REAL cuisine classifier...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
‚úÖ Best Params: {'tfidf__max_features': 20000, 'clf__C': 0.5}

üìä REAL Classification Report:

                                                 precision    recall  f1-score   support

['american' 'fast food' 'salad' 'healthy food']       1.00      1.00      1.00        12
              ['bakery' 'desserts' 'fast food']       0.33      0.17      0.22        12
                          ['bakery' 'desserts']       0.25      0.22      0.23        23
                         ['bakery' 'fast food']       0.50      0.31      0.38        16
                                     ['bakery']       0.46      0.50      0.48        26
                                       ['cafe']       0.71      0.90      0.80        50
                        ['chinese' 'fast food']       0.29      0.12      0.17        16
                     ['chinese'