<a href="https://colab.research.google.com/github/ChandraMadhumanchi/Cassandra_Spark_Integration/blob/master/Restaurant_Predictiion_problem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# rf_xgb_pipeline.py
import os
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, OneHotEncoder, StandardScaler
import xgboost as xgb
import joblib

# Suppress every warning category for the remainder of the run.
# NOTE(review): this blanket filter also hides deprecation / convergence warnings
# raised by the libraries used below — consider narrowing it.
warnings.filterwarnings("ignore")
# Single seed: used for NumPy's global RNG here and passed as random_state to the
# CV splitter and both estimators further down.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# ---------- Utility functions ----------
def rmse(a, b):
    """Return the root-mean-squared error between ``a`` and ``b``.

    Both arguments are forwarded unchanged to ``mean_squared_error``; the
    square root of that value is returned.
    """
    mse = mean_squared_error(a, b)
    return np.sqrt(mse)

def log1p_and_back(y):
    """Log-transform ``y`` and return it together with the inverse transform.

    Returns
    -------
    tuple
        ``(np.log1p(y), inverse)`` where ``inverse(z)`` evaluates
        ``np.expm1(z)``.
    """
    def _inverse(z):
        return np.expm1(z)

    transformed = np.log1p(y)
    return transformed, _inverse

# ---------- Load data ----------
# Input CSVs; the relative paths are resolved against the current working directory.
TRAIN_PATH = "Train_dataset_(1)_(1)_(2)_(1).csv"
TEST_PATH = "Test_dataset_(1)_(1)_(2)_(1).csv"
# Destination file for the final predictions.
OUTPUT_SUB = "submission_rf_xgb.csv"

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Keep the test-set registration ids: they become the id column of the submission.
test_ids = test['Registration Number'].values

# ---------- Basic preprocessing & feature engineering ----------
def preprocess_base(df, is_train=True, reference_date=None):
    """Return a copy of ``df`` with the engineered feature columns added.

    Each feature group is produced only when its source columns are present,
    so the function also works on partial frames. The input frame itself is
    never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw restaurant records.
    is_train : bool, default True
        Accepted for backward compatibility; it does not influence the result.
    reference_date : datetime-like, optional
        Date against which ``Days_Since_Opening`` (and the age features built
        on it) is measured. When omitted the current time is used, which makes
        those features depend on when the code runs; pass a fixed date to get
        reproducible output.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the new columns. ``'Opening Day of Restaurant'``
        is dropped once its derived features exist.
    """
    df = df.copy()

    # Opening Day -> datetime features
    opening_col = 'Opening Day of Restaurant'
    opening_missing = None  # set only when the raw opening date is available
    if opening_col in df.columns:
        opened = pd.to_datetime(df[opening_col], errors='coerce')
        opening_missing = opened.isna()
        ref = pd.Timestamp.now() if reference_date is None else pd.Timestamp(reference_date)

        # Unparseable / missing dates become 0 in every derived integer column.
        df['Opening_Year'] = opened.dt.year.fillna(0).astype(int)
        df['Opening_Month'] = opened.dt.month.fillna(0).astype(int)
        df['Opening_DayOfYear'] = opened.dt.dayofyear.fillna(0).astype(int)
        df['Days_Since_Opening'] = (ref - opened).dt.days.fillna(0).astype(int)
        # cyclical month
        month_angle = 2 * np.pi * df['Opening_Month'] / 12
        df['Opening_Month_Sin'] = np.sin(month_angle)
        df['Opening_Month_Cos'] = np.cos(month_angle)
        df = df.drop(columns=[opening_col])

    # Missing indicators for sparse event columns
    for c in ['Live Music Rating', 'Comedy Gigs Rating', 'Value Deals Rating', 'Live Sports Rating', 'Overall Restaurant Rating']:
        if c in df.columns:
            df[c + '_NA'] = df[c].isna().astype(int)

    # Social media / composite features
    if 'Facebook Popularity Quotient' in df.columns and 'Instagram Popularity Quotient' in df.columns:
        facebook = df['Facebook Popularity Quotient'].fillna(0)
        instagram = df['Instagram Popularity Quotient'].fillna(0)
        df['Social_Media_Score'] = (facebook + instagram) / 2
        # +1 on both sides keeps the ratio finite when Instagram popularity is 0.
        df['Social_Media_Ratio'] = (facebook + 1) / (instagram + 1)

    # Service quality composite: weighted mean of whichever service columns exist,
    # renormalised by the weights actually used.
    service_cols = ['Staff Responsivness', 'Hygiene Rating', 'Food Rating', 'Service']
    present = [c for c in service_cols if c in df.columns]
    if present:
        weights = {'Food Rating': 0.4, 'Service': 0.3, 'Hygiene Rating': 0.2, 'Staff Responsivness': 0.1}
        df['Service_Quality_Score'] = 0.0
        total_w = 0.0
        for c in present:
            w = weights.get(c, 0.25)
            df['Service_Quality_Score'] += df[c].fillna(df[c].median()) * w
            total_w += w
        df['Service_Quality_Score'] /= total_w

    # Entertainment / infrastructure / value features
    ent_cols = [c for c in ['Live Music Rating', 'Comedy Gigs Rating', 'Ambience', 'Lively'] if c in df.columns]
    if ent_cols:
        df['Entertainment_Score'] = df[ent_cols].fillna(0).mean(axis=1)
    infra_cols = [c for c in ['Fire Audit', 'Liquor License Obtained', 'Situated in a Multi Complex', 'Dedicated Parking', 'Open Sitting Available'] if c in df.columns]
    if infra_cols:
        df['Infrastructure_Score'] = df[infra_cols].fillna(0).sum(axis=1)

    # Restaurant age info
    if 'Days_Since_Opening' in df.columns:
        df['Restaurant_Age_Years'] = df['Days_Since_Opening'] / 365.25
        is_new = df['Restaurant_Age_Years'] < 1
        if opening_missing is not None:
            # A missing opening date yields age 0; that must not be read as
            # "opened less than a year ago".
            is_new = is_new & ~opening_missing.to_numpy()
        df['Is_New_Restaurant'] = is_new.astype(int)
        df['Is_Established'] = (df['Restaurant_Age_Years'] > 5).astype(int)

    # Value quality ratio (both inputs median-filled; +1 avoids division by zero)
    if 'Value for Money' in df.columns and 'Food Rating' in df.columns:
        value = df['Value for Money'].fillna(df['Value for Money'].median())
        food = df['Food Rating'].fillna(df['Food Rating'].median())
        df['Value_Quality_Ratio'] = value / (food + 1)

    # City is only normalised to string here; no frequency column is added by
    # this function.
    if 'City' in df.columns:
        df['City'] = df['City'].astype(str)
    return df

# Stack train (without the target) on top of test so the engineered features are
# computed once over both sets.
# NOTE(review): the median fills inside preprocess_base are consequently computed
# over train and test rows together.
combined = pd.concat([train.drop(columns=['Annual Turnover']), test], ignore_index=True, sort=False)
combined = preprocess_base(combined, is_train=False)

# compute City frequency on combined
# NOTE(review): preprocess_base already casts City with astype(str), so the
# fillna('Unknown') calls below presumably never replace anything — confirm.
if 'City' in combined.columns:
    city_counts = combined['City'].fillna('Unknown').value_counts()
    combined['City_Frequency'] = combined['City'].fillna('Unknown').map(city_counts)

# split back: concat used ignore_index=True with train first, so the first
# len(train) positions are the training rows and the remainder is the test set.
proc_train = combined.iloc[:len(train)].copy()
proc_test  = combined.iloc[len(train):].copy()

# Bring target back (.values assigns positionally, ignoring index labels)
proc_train['Annual Turnover'] = train['Annual Turnover'].values

# ---------- Prepare feature lists ----------
target_col = 'Annual Turnover'
drop_cols = ['Registration Number']  # the id is not a feature; it is kept separately for the submission

# Everything except the target and the id column is a candidate feature.
_excluded = {target_col, *drop_cols}
all_cols = [col for col in proc_train.columns if col not in _excluded]

# Object-dtype columns are treated as categorical, the rest as numerical.
cat_cols = [col for col in all_cols if proc_train[col].dtype == 'object']
num_cols = [col for col in all_cols if col not in cat_cols]

print("Numerical cols:", len(num_cols), "Categorical cols:", len(cat_cols))

# Bucket the categoricals by the number of distinct values in the training rows:
#   small: <= 10, large: 11..30, high: > 30.
# df_transform label-encodes the first two buckets and frequency-encodes the last.
_n_unique = {col: proc_train[col].nunique() for col in cat_cols}
high_card_cols = [col for col in cat_cols if _n_unique[col] > 30]
small_card_cat = [col for col in cat_cols if _n_unique[col] <= 10]
large_card_cat = [col for col in cat_cols if 10 < _n_unique[col] <= 30]

print("small_card_cat:", small_card_cat)
print("large_card_cat:", large_card_cat)
print("high_card_cols:", high_card_cols)

# ---------- Simple column transformer ----------
# Median imputation followed by standardisation for numeric features.
# NOTE(review): this pipeline is not referenced anywhere else in the visible
# code — confirm whether it is still needed.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),  # scale for XGBoost sometimes helpful
])

# Categorical handling: small- and large-cardinality columns are label-encoded and
# high-cardinality columns are frequency-encoded, using plain pandas operations.

def df_transform(X_df, reference_df=None):
    """Encode categorical columns and fill remaining numeric gaps.

    Reads the module-level lists ``small_card_cat``, ``large_card_cat`` and
    ``high_card_cols`` to decide how each column is treated.

    Parameters
    ----------
    X_df : pandas.DataFrame
        Frame to transform; it is copied, never modified.
    reference_df : pandas.DataFrame, optional
        Frame the encodings are learned from (label vocabularies, category
        counts, numeric medians). Defaults to ``X_df`` itself. Pass the
        training frame when transforming test data so both sets share one
        encoding; fitting on each set separately can give the same category
        different codes or counts in train and test.

    Returns
    -------
    pandas.DataFrame
        Transformed copy. Label-encoded columns hold integer codes, with -1
        for categories absent from the reference. Each high-cardinality
        column ``c`` is replaced by ``c + "_freq_enc"`` holding the
        reference count (0 when unseen). ``City`` is dropped.
    """
    X = X_df.copy()
    ref = X_df if reference_df is None else reference_df

    def _labels(series, missing_token):
        # Fill first, then stringify: the other order turns NaN into the
        # string 'nan' before fillna can see it.
        return series.fillna(missing_token).astype(str)

    # label-encode small- and large-cardinality categoricals
    for cols, token in ((small_card_cat, 'NA_small'), (large_card_cat, 'NA_large')):
        for c in cols:
            # Sorted vocabulary -> consecutive integer codes.
            known = sorted(set(_labels(ref[c], token)))
            codes = {label: i for i, label in enumerate(known)}
            X[c] = _labels(X[c], token).map(codes).fillna(-1).astype(np.int64)

    # high-cardinality: frequency encoding with counts taken from the reference
    for c in high_card_cols:
        freq = ref[c].fillna('NA_high').value_counts()
        X[c + "_freq_enc"] = X[c].fillna('NA_high').map(freq).fillna(0)
        X = X.drop(columns=[c], errors='ignore')

    # City already has City_Frequency - keep it and drop original
    if 'City' in X.columns:
        X = X.drop(columns=['City'], errors='ignore')

    # fill any remaining missing numeric values with the reference median
    for c in X.columns:
        if X[c].dtype in [np.float64, np.int64] and X[c].isna().any():
            has_ref = c in ref.columns and pd.api.types.is_numeric_dtype(ref[c])
            source = ref[c] if has_ref else X[c]
            X[c] = X[c].fillna(source.median())
    return X

# Apply df_transform: learn every encoding on the training rows and reuse it for
# the test rows so both matrices are encoded consistently.
_train_features = proc_train[all_cols]
X_train_raw = df_transform(_train_features)
X_test_raw  = df_transform(proc_test[all_cols], reference_df=_train_features)

# Keep alignment of columns (some columns may be created/dropped)
X_train_raw, X_test_raw = X_train_raw.align(X_test_raw, join='left', axis=1, fill_value=0)

# separate features and target; from here on the models work on plain NumPy arrays
y = proc_train[target_col].values
X = X_train_raw.values
X_test = X_test_raw.values
# Column names in the same order as the columns of X / X_test.
feature_names = X_train_raw.columns.tolist()

print("Feature matrix shape:", X.shape, "Test matrix shape:", X_test.shape)

# ---------- Log transform target ----------
# Models are fit on log1p(y); predictions are mapped back with expm1 later on.
y_log = np.log1p(y)

# ---------- Shared CV splitter ----------
# One shuffled 5-fold splitter reused by both grid searches and the out-of-fold predictions.
cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# ---------- Grid search for RandomForest ----------
# Base estimator handed to the grid search.
rf = RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=-1)

# 2 * 3 * 2 * 2 = 24 parameter combinations.
rf_param_grid = {
    'n_estimators': [200, 400],
    'max_depth': [10, 15, None],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 5]
}

# NOTE(review): n_jobs=-1 is set on both the forest and the search — confirm the
# nested parallelism does not oversubscribe the machine.
rf_gs = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=2
)

print("Starting GridSearchCV for RandomForest...")
rf_gs.fit(X, y_log)  # training on log target
print("Best RF params:", rf_gs.best_params_)
# The search scores negated MSE on the log1p target, so the figure printed here
# is in log units, not turnover units.
print("Best RF CV RMSE:", np.sqrt(-rf_gs.best_score_))

best_rf = rf_gs.best_estimator_

# ---------- Grid search for XGBoost ----------
# Base estimator handed to the grid search.
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=RANDOM_STATE, n_jobs=-1, verbosity=0)

# 2 * 3 * 2 * 2 * 2 = 48 parameter combinations.
xgb_param_grid = {
    'n_estimators': [200, 400],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.7, 0.9],
    'colsample_bytree': [0.6, 0.8]
}

# Same scoring and CV splitter as the RandomForest search, so the two CV scores
# are computed on identical folds.
# NOTE(review): n_jobs=-1 is set on both the booster and the search — confirm the
# nested parallelism does not oversubscribe the machine.
xgb_gs = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    scoring='neg_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=2
)

print("Starting GridSearchCV for XGBoost...")
xgb_gs.fit(X, y_log)
print("Best XGB params:", xgb_gs.best_params_)
# As with the RandomForest search, this value is in log1p units of the target.
print("Best XGB CV RMSE:", np.sqrt(-xgb_gs.best_score_))

best_xgb = xgb_gs.best_estimator_

# ---------- Out-of-fold predictions to evaluate ensemble ----------
print("Generating out-of-fold predictions for both models to estimate OOF RMSE...")

# Out-of-fold predictions on the log1p target, using the same splitter as the grid searches.
oof_rf = cross_val_predict(best_rf, X, y_log, cv=cv, n_jobs=-1)
oof_xgb = cross_val_predict(best_xgb, X, y_log, cv=cv, n_jobs=-1)

# convert back to original scale
inv = lambda z: np.expm1(z)
oof_rf_orig = inv(oof_rf)
oof_xgb_orig = inv(oof_xgb)
y_orig = y  # original

# simple weighted ensemble search (grid search over weights)
# Tries XGB weights 0.0, 0.1, ..., 1.0 and keeps the first weight that reaches the
# lowest RMSE; unlike the grid-search scores, this RMSE is on the original scale.
# NOTE(review): hyperparameters and the blend weight are selected on these same
# folds, so the reported OOF RMSE is presumably optimistic.
best_rmse = 1e18
best_w = None
for w in np.linspace(0, 1, 11):
    pred = w * oof_xgb_orig + (1 - w) * oof_rf_orig
    score = rmse(y_orig, pred)
    if score < best_rmse:
        best_rmse = score
        best_w = w

print(f"Best ensemble weight (XGB): {best_w:.2f}, RF weight: {1-best_w:.2f}, OOF RMSE: {best_rmse:,.2f}")

# ---------- Fit final models on full training data ----------
print("Fitting final models on full training data...")
# NOTE(review): GridSearchCV refits best_estimator_ on the full data by default
# (refit=True), so these two fits look redundant — confirm before removing.
best_rf.fit(X, y_log)
best_xgb.fit(X, y_log)

# ---------- Predict on test set ----------
# Predictions come out on the log1p scale the models were trained on.
pred_rf_test_log = best_rf.predict(X_test)
pred_xgb_test_log = best_xgb.predict(X_test)

# convert back
pred_rf_test = np.expm1(pred_rf_test_log)
pred_xgb_test = np.expm1(pred_xgb_test_log)

# Blend on the original scale with the weight picked from the out-of-fold search.
final_pred = best_w * pred_xgb_test + (1 - best_w) * pred_rf_test

# clip negatives
final_pred = np.clip(final_pred, a_min=0, a_max=None)

# ---------- Submission ----------
# Two-column submission: test registration ids and the blended prediction
# rounded to the nearest integer.
submission_df = pd.DataFrame({
    "Registration Number": test_ids,
    "Annual Turnover": np.round(final_pred).astype(int)
})

# index=False keeps the DataFrame index out of the CSV.
submission_df.to_csv(OUTPUT_SUB, index=False)
print("Saved submission to:", OUTPUT_SUB)
print("Submission head:\n", submission_df.head())

# ---------- Save models for later ----------
# Persist both fitted estimators into the current working directory.
# NOTE(review): only the estimators are saved; the blend weight and the
# preprocessing state are not written anywhere in the visible code.
joblib.dump(best_rf, "best_rf.joblib")
joblib.dump(best_xgb, "best_xgb.joblib")
print("Models dumped: best_rf.joblib, best_xgb.joblib")

# ---------- Final messages ----------
# Recap of the selected hyperparameters and the blended out-of-fold RMSE
# (original turnover scale).
print("Done. Summary:")
print("RF best params:", rf_gs.best_params_)
print("XGB best params:", xgb_gs.best_params_)
print(f"Estimated OOF ensemble RMSE: {best_rmse:,.2f}")


Train shape: (3493, 34)
Test shape: (500, 33)
Numerical cols: 46 Categorical cols: 6
small_card_cat: ['Restaurant Location', 'EndorsedBy', 'Restaurant Type']
large_card_cat: ['Cuisine']
high_card_cols: ['City', 'Restaurant Theme']
Feature matrix shape: (3493, 52) Test matrix shape: (500, 52)
Starting GridSearchCV for RandomForest...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best RF params: {'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 400}
Best RF CV RMSE: 0.48609021944375846
Starting GridSearchCV for XGBoost...
Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best XGB params: {'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 200, 'subsample': 0.9}
Best XGB CV RMSE: 0.477815471791973
Generating out-of-fold predictions for both models to estimate OOF RMSE...
Best ensemble weight (XGB): 1.00, RF weight: 0.00, OOF RMSE: 20,540,684.90
Fitting final models on full training data...
Saved subm