In [1]:
# CELL 1 — Data inspection and target normalization
import pandas as pd, numpy as np
df = pd.read_csv("Real Estate Data.csv")   # update path if required
print("Columns:", df.columns.tolist())

# detect price-like column
price_cols = [c for c in df.columns if "price" in c.lower()]
if not price_cols:
    raise ValueError("No price column found. Rename your price column to include 'price'.")
price_col = price_cols[0]
print("Detected target column:", price_col)

# clean to numeric
df[price_col + "_clean"] = pd.to_numeric(
    df[price_col].astype(str).str.replace(r"[^\d.\-]", "", regex=True).replace("", np.nan),
    errors="coerce"
)
print("Target stats (clean):")
print(df[price_col + "_clean"].describe().apply(lambda x: f"{x:.2f}"))

# Quick unit guess: if median is < 1e6 likely it's in lakhs or thousands
median_val = df[price_col + "_clean"].median()
print("Median price =", median_val)

# Option: automatic conversion suggestion (you can override below)
PRICE_UNIT = "auto"   # set to 'rupees' or 'lakhs' to override
if PRICE_UNIT == "auto":
    if median_val < 1_000_000:
        print("Suggestion: prices look small vs ₹1,000,000. If values are in lakhs set PRICE_UNIT='lakhs' to convert.")
    else:
        print("Suggestion: prices look like rupees.")

# If your CSV uses lakhs (e.g., 25.5 meaning 25.5 lakhs), convert to rupees:
# Uncomment if needed:
# df[price_col + "_clean"] = df[price_col + "_clean"] * 100000


Columns: ['Name', 'Property Title', 'Price', 'Location', 'Total_Area', 'Price_per_SQFT', 'Description', 'Baths', 'Balcony']
Detected target column: Price
Target stats (clean):
count    14528.00
mean        36.03
std         29.12
min          1.00
25%          3.50
50%         34.00
75%         60.00
max         99.99
Name: Price_clean, dtype: object
Median price = 34.0
Suggestion: prices look small vs ₹1,000,000. If values are in lakhs set PRICE_UNIT='lakhs' to convert.


In [2]:
# CELL 2 — Light feature engineering
from sklearn.feature_extraction.text import HashingVectorizer

price_col_clean = price_col + "_clean"
df = df.copy()

# Example: create text-derived simple features (lengths)
for text_col in ["Property Title", "Description", "Name"]:
    if text_col in df.columns:
        df[text_col + "_len"] = df[text_col].astype(str).fillna("").apply(len)
        df[text_col + "_num_digits"] = df[text_col].astype(str).str.count(r"\d")

# If Total_Area or Price_per_SQFT exist, clean them to numeric
for col in ["Total_Area", "Price_per_SQFT"]:
    if col in df.columns:
        df[col + "_clean"] = pd.to_numeric(df[col].astype(str).str.replace(r"[^\d.\-]", "", regex=True).replace("", np.nan), errors="coerce")

# If we can compute price_per_sqft, compute it
if "Total_Area_clean" in df.columns and df["Total_Area_clean"].notna().sum() > 0:
    df["price_per_sqft_calc"] = df[price_col + "_clean"] / df["Total_Area_clean"]


In [3]:
# CELL 3 — reduce cardinality and split
from sklearn.model_selection import train_test_split

# Choose top_n per categorical column
TOP_N = 20   # reduce to e.g., 10 for even faster Lasso
df_work = df.copy()

cat_cols = df_work.select_dtypes(include=['object', 'category']).columns.tolist()
cat_cols = [c for c in cat_cols if c not in [price_col, price_col + "_clean"]]

for c in cat_cols:
    top_cats = df_work[c].value_counts(dropna=True).nlargest(TOP_N).index.tolist()
    df_work[c] = df_work[c].where(df_work[c].isin(top_cats), other="OTHER")

# Prepare X,y and split
y = df_work[price_col + "_clean"].copy()
X = df_work.drop(columns=[price_col, price_col + "_clean"])

# drop columns with single unique value and obvious identifiers
X = X.loc[:, X.nunique(dropna=True) > 1]
X = X.drop(columns=[c for c in X.columns if c.lower().endswith("id")], errors='ignore')

# drop rows with missing target
mask = y.notna()
X = X.loc[mask].copy(); y = y.loc[mask].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
print("Train size:", X_train.shape, "Test size:", X_test.shape)


Train size: (11622, 17) Test size: (2906, 17)


In [4]:
import pandas as pd, numpy as np

df = pd.read_csv("Real Estate Data.csv")   # update path if needed

# detect price column
price_cols = [c for c in df.columns if "price" in c.lower()]
price_col = price_cols[0]
df[price_col + "_clean"] = pd.to_numeric(
    df[price_col].astype(str).str.replace(r"[^\d.\-]", "", regex=True).replace("", np.nan),
    errors="coerce"
)

print("Target column:", price_col, "→ Cleaned:", price_col + "_clean")


Target column: Price → Cleaned: Price_clean


In [5]:
from sklearn.model_selection import train_test_split

TOP_N = 20
df_work = df.copy()

# Reduce categories
cat_cols = df_work.select_dtypes(include=['object','category']).columns.tolist()
for c in cat_cols:
    top_cats = df_work[c].value_counts(dropna=True).nlargest(TOP_N).index.tolist()
    df_work[c] = df_work[c].where(df_work[c].isin(top_cats), other="OTHER")

y = df_work[price_col + "_clean"].copy()
X = df_work.drop(columns=[price_col, price_col + "_clean"])

# Drop constant columns
X = X.loc[:, X.nunique(dropna=True) > 1]

# Drop rows with missing target
mask = y.notna()
X = X.loc[mask].copy(); y = y.loc[mask].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

print("Train:", X_train.shape, "Test:", X_test.shape)


Train: (11622, 8) Test: (2906, 8)


In [6]:
# STEP 4 & 5 Combined — Preprocessors + Models (with auto-install for XGBoost)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import sklearn, subprocess, sys

# Detect numeric & categorical columns
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()

# Fix for OneHotEncoder (sparse vs sparse_output depending on sklearn version)
if sklearn.__version__ >= "1.2":
    ohe_sparse = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    ohe_dense = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
else:
    ohe_sparse = OneHotEncoder(handle_unknown="ignore", sparse=True)
    ohe_dense = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Linear preprocessor (for Linear, Ridge, Lasso)
linear_num = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
linear_cat = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", ohe_sparse)])
preproc_linear = ColumnTransformer([("num", linear_num, numeric_cols), ("cat", linear_cat, cat_cols)], sparse_threshold=0.0)

# Tree preprocessor (for RandomForest, GradientBoosting, XGBoost)
tree_num = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
tree_cat = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", ohe_dense)])
preproc_tree = ColumnTransformer([("num", tree_num, numeric_cols), ("cat", tree_cat, cat_cols)], remainder="drop")

# Try to import XGBoost, install if missing
try:
    from xgboost import XGBRegressor
    xgb_available = True
except ImportError:
    print("⚠️ XGBoost not found. Installing...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "xgboost"])
    from xgboost import XGBRegressor
    xgb_available = True

# Define models
models = {}
models["LinearRegression"] = Pipeline([("pre", preproc_linear), ("reg", LinearRegression())])
models["Ridge"] = Pipeline([("pre", preproc_linear), ("reg", Ridge(alpha=1.0, random_state=42))])
models["Lasso"] = Pipeline([("pre", preproc_linear), ("reg", Lasso(alpha=1e-3, max_iter=5000, random_state=42))])

models["RandomForest"] = Pipeline([("pre", preproc_tree), ("reg", RandomForestRegressor(
    n_estimators=200, max_depth=10, min_samples_leaf=5,
    random_state=42, n_jobs=-1
))])

models["GradientBoosting"] = Pipeline([("pre", preproc_tree), ("reg", GradientBoostingRegressor(
    n_estimators=300, learning_rate=0.05, max_depth=4, random_state=42
))])

if xgb_available:
    models["XGBoost"] = Pipeline([("pre", preproc_tree), ("reg", XGBRegressor(
        n_estimators=300, learning_rate=0.05, max_depth=4,
        subsample=0.8, colsample_bytree=0.8,
        random_state=42, n_jobs=-1, verbosity=0
    ))])


In [14]:
import math
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

results = []

def evaluate_pipeline(name, pipe, Xtr, Xte, ytr, yte):
    try:
        pipe.fit(Xtr, ytr)
        ytr_pred = pipe.predict(Xtr)
        yte_pred = pipe.predict(Xte)

        r2_tr = r2_score(ytr, ytr_pred)
        r2_te = r2_score(yte, yte_pred)
        rmse_tr = math.sqrt(mean_squared_error(ytr, ytr_pred))
        rmse_te = math.sqrt(mean_squared_error(yte, yte_pred))
        mae_tr = mean_absolute_error(ytr, ytr_pred)
        mae_te = mean_absolute_error(yte, yte_pred)
        overfit_gap = max(0.0, (r2_tr - r2_te) / (abs(r2_tr) + 1e-12) * 100)

        return {
            "model": name,
            "r2_train": r2_tr, "r2_test": r2_te,
            "rmse_train": rmse_tr, "rmse_test": rmse_te,
            "mae_train": mae_tr, "mae_test": mae_te,
            "overfit_gap_pct": overfit_gap
        }
    except Exception as e:
        print(f"❌ {name} failed: {e}")
        return {"model": name, "error": str(e)}

# Run evaluation for all models
for name, pipe in models.items():
    print(f"Training {name}...")
    res = evaluate_pipeline(name, pipe, X_train, X_test, y_train, y_test)
    results.append(res)

# Convert results to DataFrame
results_df = pd.DataFrame(results)
print("✅ Evaluation complete. Results shape:", results_df.shape)
display(results_df)


Training LinearRegression...
Training Ridge...
Training Lasso...
Training RandomForest...
Training GradientBoosting...
Training XGBoost...
✅ Evaluation complete. Results shape: (6, 8)


Unnamed: 0,model,r2_train,r2_test,rmse_train,rmse_test,mae_train,mae_test,overfit_gap_pct
0,LinearRegression,0.081167,0.095737,27.94981,27.548209,23.527644,23.40076,0.0
1,Ridge,0.080806,0.095544,27.95529,27.551149,23.550018,23.420536,0.0
2,Lasso,0.080978,0.095147,27.952678,27.557185,23.541901,23.420278,0.0
3,RandomForest,0.803892,0.8022,12.912436,12.884254,7.332037,7.304959,0.210494
4,GradientBoosting,0.855892,0.83843,11.068886,11.644632,5.549038,5.791717,2.04021
5,XGBoost,0.856809,0.845596,11.033632,11.383476,5.634382,5.828377,1.308656


In [21]:
print("Length of results list:", len(results))
print("First few results:", results[:2])  # peek inside


Length of results list: 6
First few results: [{'r2_train': 0.0811666611106936, 'r2_test': 0.0957367566594175, 'rmse_train': 27.94980975179187, 'rmse_test': 27.548208511459105, 'mae_train': 23.527644491378982, 'mae_test': 23.400760480615055, 'overfit_gap_pct': 0.0, 'model': 'LinearRegression'}, {'r2_train': 0.08080628555594749, 'r2_test': 0.09554368891220766, 'rmse_train': 27.95529031085255, 'rmse_test': 27.551149240683316, 'mae_train': 23.550017560929206, 'mae_test': 23.42053576175415, 'overfit_gap_pct': 0.0, 'model': 'Ridge'}]


In [8]:
# CELL 7 — quick randomized search for best candidate(s)
from sklearn.model_selection import RandomizedSearchCV

# Example hyperparameter grid for RandomForest (small search)
rf_pipe = models["RandomForest"]
rf_param_dist = {
    "reg__n_estimators": [80, 120, 200],
    "reg__max_depth": [6, 8, 10, 12],
    "reg__min_samples_leaf": [2, 4, 6]
}

rs_rf = RandomizedSearchCV(rf_pipe, rf_param_dist, n_iter=6, scoring="r2", cv=3, verbose=1, n_jobs=-1, random_state=42)
rs_rf.fit(X_train, y_train)
print("RandomForest best params:", rs_rf.best_params_)
best_rf = rs_rf.best_estimator_
print("RF CV best R2:", rs_rf.best_score_)

# Ridge/Lasso alpha tuning (fast)
from sklearn.model_selection import GridSearchCV
ridge_pipe = models["Ridge"]
grid_ridge = {"reg__alpha": [0.01, 0.1, 1.0, 10.0]}
gs_ridge = GridSearchCV(ridge_pipe, grid_ridge, scoring="r2", cv=5, n_jobs=-1)
gs_ridge.fit(X_train, y_train)
print("Ridge best alpha:", gs_ridge.best_params_)

# XGBoost tuning (if available)
if "XGBoost" in models:
    xgb_pipe = models["XGBoost"]
    xgb_param = {"reg__n_estimators": [100, 200, 300], "reg__max_depth": [3, 4, 6], "reg__learning_rate": [0.01, 0.05, 0.1]}
    rs_xgb = RandomizedSearchCV(xgb_pipe, xgb_param, n_iter=6, scoring="r2", cv=3, n_jobs=-1, random_state=42)
    rs_xgb.fit(X_train, y_train)
    print("XGBoost best params:", rs_xgb.best_params_)


Fitting 3 folds for each of 6 candidates, totalling 18 fits
RandomForest best params: {'reg__n_estimators': 120, 'reg__min_samples_leaf': 4, 'reg__max_depth': 12}
RF CV best R2: 0.920077824416321
Ridge best alpha: {'reg__alpha': 1.0}
XGBoost best params: {'reg__n_estimators': 300, 'reg__max_depth': 6, 'reg__learning_rate': 0.01}


In [9]:
# PSEUDO-CELL — modify hyperparams and re-evaluate
# Example: lower RF complexity if R2 > 0.90
from copy import deepcopy
if results_df.iloc[0]["r2_test"] > 0.90 and results_df.iloc[0]["model"] == "RandomForest":
    # rebuild with lower complexity
    models["RandomForest"] = Pipeline([("pre", preproc_tree),
                                      ("reg", RandomForestRegressor(n_estimators=80, max_depth=6, min_samples_leaf=8, random_state=42, n_jobs=-1))])
    # re-run evaluate for this model only
    rf_res = evaluate_pipeline(models["RandomForest"], X_train, X_test, y_train, y_test)
    print(rf_res)


{'r2_train': 0.8038919546246956, 'r2_test': 0.8021998083640356, 'rmse_train': 12.912436216606029, 'rmse_test': 12.884253959375435, 'mae_train': 7.332036735363589, 'mae_test': 7.304958827670909, 'overfit_gap_pct': 0.21049424004348244}


In [10]:
# CELL 9 — optional: log-target training helper
import numpy as np

def evaluate_with_log(pipe, Xtr, Xte, ytr, yte):
    # fit on log target
    pipe.fit(Xtr, np.log1p(ytr))
    ytr_pred_log = pipe.predict(Xtr); yte_pred_log = pipe.predict(Xte)
    # back transform
    ytr_pred = np.expm1(ytr_pred_log); yte_pred = np.expm1(yte_pred_log)
    r2_tr = r2_score(ytr, ytr_pred); r2_te = r2_score(yte, yte_pred)
    rmse_te = math.sqrt(mean_squared_error(yte, yte_pred)); mae_te = mean_absolute_error(yte, yte_pred)
    return dict(r2_train=r2_tr, r2_test=r2_te, rmse_test=rmse_te, mae_test=mae_te)

# Example: try log with Ridge
log_ridge_res = evaluate_with_log(models["Ridge"], X_train, X_test, y_train, y_test)
print("Ridge (log-target) metrics:", log_ridge_res)


Ridge (log-target) metrics: {'r2_train': -0.2156200627694762, 'r2_test': -0.19328758468875118, 'rmse_test': 31.645975213470305, 'mae_test': 24.308373317000164}


In [19]:
print("Columns in results DataFrame:", fr.columns.tolist())
print(fr.head())


Columns in results DataFrame: []
Empty DataFrame
Columns: []
Index: []


In [22]:
# Convert results list to DataFrame
fr = pd.DataFrame(results)   # <-- changed here

# Make sure the expected columns exist
print("Columns in results:", fr.columns.tolist())

# Define criteria
crit = {"r2_min": 0.80, "r2_max": 0.90,
        "rmse_max": 1_000_000, "mae_max": 500_000,
        "gap_max": 10.0}

# Filter by criteria (only if columns exist)
if set(["r2_test","rmse_test","mae_test","overfit_gap_pct"]).issubset(fr.columns):
    cands = fr[
        (fr["r2_test"] >= crit["r2_min"]) &
        (fr["r2_test"] <= crit["r2_max"]) &
        (fr["rmse_test"] <= crit["rmse_max"]) &
        (fr["mae_test"] <= crit["mae_max"]) &
        (fr["overfit_gap_pct"] <= crit["gap_max"])
    ]

    if not cands.empty:
        sel = cands.sort_values(by=["r2_test", "rmse_test"], ascending=[False, True]).iloc[0]
        print("✅ Selected model:", sel["model"])
        print("Metrics:", sel.to_dict())
    else:
        print("⚠️ No model met all criteria. Closest candidates:")
        print(fr.sort_values(by="r2_test", ascending=False).head())
else:
    print("⚠️ Expected metric columns not found. Here’s what exists instead:")
    print(fr.head())



Columns in results: ['r2_train', 'r2_test', 'rmse_train', 'rmse_test', 'mae_train', 'mae_test', 'overfit_gap_pct', 'model']
✅ Selected model: XGBoost
Metrics: {'r2_train': 0.8568089034285329, 'r2_test': 0.8455962235544647, 'rmse_train': 11.033631903033545, 'rmse_test': 11.383475616811932, 'mae_train': 5.63438192121198, 'mae_test': 5.828377062811523, 'overfit_gap_pct': 1.3086558542035887, 'model': 'XGBoost'}


In [23]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor

# Define base pipeline for XGBoost
xgb_pipe = Pipeline([
    ("pre", preproc_tree),  # use the dense preprocessor
    ("reg", XGBRegressor(
        random_state=42, n_jobs=-1, verbosity=0,
        subsample=0.8, colsample_bytree=0.8
    ))
])

# Parameter grid to search
param_dist = {
    "reg__n_estimators": [200, 300, 400],          # number of trees
    "reg__max_depth": [3, 4, 5, 6],                # tree depth
    "reg__learning_rate": [0.01, 0.03, 0.05, 0.1], # step size shrinkage
    "reg__min_child_weight": [1, 3, 5]             # minimum sum of instance weight
}

# Randomized search (fast, tries 15 random combos)
rs_xgb = RandomizedSearchCV(
    xgb_pipe,
    param_distributions=param_dist,
    n_iter=15,
    scoring="r2",  # optimize for R²
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

# Fit tuning
rs_xgb.fit(X_train, y_train)

print("Best params:", rs_xgb.best_params_)
print("Best CV R²:", rs_xgb.best_score_)

# Evaluate tuned model on test set
best_xgb = rs_xgb.best_estimator_
ytr_pred = best_xgb.predict(X_train)
yte_pred = best_xgb.predict(X_test)

r2_tr = r2_score(y_train, ytr_pred)
r2_te = r2_score(y_test, yte_pred)
rmse_te = math.sqrt(mean_squared_error(y_test, yte_pred))
mae_te = mean_absolute_error(y_test, yte_pred)
overfit_gap = max(0.0, (r2_tr - r2_te) / (abs(r2_tr) + 1e-12) * 100)

print("\n📊 Tuned XGBoost Performance:")
print(f"R² train: {r2_tr:.4f}, R² test: {r2_te:.4f}")
print(f"RMSE test: {rmse_te:.2f}")
print(f"MAE test: {mae_te:.2f}")
print(f"Overfit gap: {overfit_gap:.2f}%")


Fitting 3 folds for each of 15 candidates, totalling 45 fits
Best params: {'reg__n_estimators': 300, 'reg__min_child_weight': 1, 'reg__max_depth': 6, 'reg__learning_rate': 0.05}
Best CV R²: 0.9158150967745339

📊 Tuned XGBoost Performance:
R² train: 0.9580, R² test: 0.9431
RMSE test: 6.91
MAE test: 2.59
Overfit gap: 1.56%


In [24]:
from xgboost import XGBRegressor
import math
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Build XGBoost with reduced complexity
xgb_simple = Pipeline([
    ("pre", preproc_tree),
    ("reg", XGBRegressor(
        n_estimators=200,       # fewer trees
        max_depth=4,            # shallower trees
        learning_rate=0.05,     # balanced step size
        min_child_weight=3,     # avoid overfitting on small splits
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbosity=0
    ))
])

# Train model
xgb_simple.fit(X_train, y_train)

# Predictions
ytr_pred = xgb_simple.predict(X_train)
yte_pred = xgb_simple.predict(X_test)

# Metrics
r2_tr = r2_score(y_train, ytr_pred)
r2_te = r2_score(y_test, yte_pred)
rmse_te = math.sqrt(mean_squared_error(y_test, yte_pred))
mae_te = mean_absolute_error(y_test, yte_pred)
overfit_gap = max(0.0, (r2_tr - r2_te) / (abs(r2_tr) + 1e-12) * 100)

print("\n📊 Simplified XGBoost Performance:")
print(f"R² train: {r2_tr:.4f}, R² test: {r2_te:.4f}")
print(f"RMSE test: {rmse_te:.2f}")
print(f"MAE test: {mae_te:.2f}")
print(f"Overfit gap: {overfit_gap:.2f}%")



📊 Simplified XGBoost Performance:
R² train: 0.8244, R² test: 0.8137
RMSE test: 12.50
MAE test: 6.71
Overfit gap: 1.29%


In [25]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import math

# Criteria thresholds
criteria = {
    "r2_min": 0.80, "r2_max": 0.90,
    "rmse_max": 10.0,   # < 10 lakh
    "mae_max": 5.0,     # < 5 lakh
    "gap_max": 10.0     # < 10%
}

# Candidate configs
configs = [
    {"n_estimators": 250, "max_depth": 5, "min_child_weight": 3},
    {"n_estimators": 300, "max_depth": 5, "min_child_weight": 5},
    {"n_estimators": 200, "max_depth": 6, "min_child_weight": 5},
    {"n_estimators": 180, "max_depth": 4, "min_child_weight": 6}
]

results = []

for cfg in configs:
    model = Pipeline([
        ("pre", preproc_tree),
        ("reg", XGBRegressor(
            n_estimators=cfg["n_estimators"],
            max_depth=cfg["max_depth"],
            min_child_weight=cfg["min_child_weight"],
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            n_jobs=-1,
            verbosity=0
        ))
    ])
    
    model.fit(X_train, y_train)
    ytr_pred = model.predict(X_train)
    yte_pred = model.predict(X_test)
    
    r2_tr = r2_score(y_train, ytr_pred)
    r2_te = r2_score(y_test, yte_pred)
    rmse_te = math.sqrt(mean_squared_error(y_test, yte_pred))
    mae_te = mean_absolute_error(y_test, yte_pred)
    overfit_gap = max(0.0, (r2_tr - r2_te) / (abs(r2_tr) + 1e-12) * 100)
    
    # Check criteria
    meets_r2   = "✅" if criteria["r2_min"] <= r2_te <= criteria["r2_max"] else "❌"
    meets_rmse = "✅" if rmse_te <= criteria["rmse_max"] else "❌"
    meets_mae  = "✅" if mae_te <= criteria["mae_max"] else "❌"
    meets_gap  = "✅" if overfit_gap <= criteria["gap_max"] else "❌"
    
    results.append({
        "config": cfg,
        "r2_train": round(r2_tr, 4),
        "r2_test": round(r2_te, 4),
        "rmse_test": round(rmse_te, 2),
        "mae_test": round(mae_te, 2),
        "overfit_gap_pct": round(overfit_gap, 2),
        "R² OK?": meets_r2,
        "RMSE OK?": meets_rmse,
        "MAE OK?": meets_mae,
        "Gap OK?": meets_gap
    })

# Convert to DataFrame
results_df = pd.DataFrame(results)

print("📊 Comparison of candidate XGBoost configs (✅ = meets criterion, ❌ = fails):")
display(results_df)


📊 Comparison of candidate XGBoost configs (✅ = meets criterion, ❌ = fails):


Unnamed: 0,config,r2_train,r2_test,rmse_test,mae_test,overfit_gap_pct,R² OK?,RMSE OK?,MAE OK?,Gap OK?
0,"{'n_estimators': 250, 'max_depth': 5, 'min_chi...",0.9034,0.8914,9.55,4.35,1.33,✅,✅,✅,✅
1,"{'n_estimators': 300, 'max_depth': 5, 'min_chi...",0.9087,0.8982,9.24,4.14,1.16,✅,✅,✅,✅
2,"{'n_estimators': 200, 'max_depth': 6, 'min_chi...",0.934,0.9226,8.06,3.28,1.22,❌,✅,✅,✅
3,"{'n_estimators': 180, 'max_depth': 4, 'min_chi...",0.8206,0.8121,12.56,6.79,1.03,✅,❌,❌,✅


In [29]:
import joblib

best_xgb = Pipeline([
    ("pre", preproc_tree),
    ("reg", XGBRegressor(
        n_estimators=250,
        max_depth=5,
        min_child_weight=3,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbosity=0
    ))
])

best_xgb.fit(X_train, y_train)

# Save to file
joblib.dump(best_xgb, "xgb_model.joblib")
print("✅ Best model saved as xgb_model.joblib")


✅ Best model saved as xgb_model.joblib


In [28]:
# === finalize_model.py ===
import joblib
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline

# Use the preproc_tree we already defined in notebook
best_xgb = Pipeline([
    ("pre", preproc_tree),   # preprocessing pipeline
    ("reg", XGBRegressor(
        n_estimators=250,
        max_depth=5,
        min_child_weight=3,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbosity=0
    ))
])

# Fit the pipeline on the full dataset (better than just train split)
best_xgb.fit(X, y)

# Save the full pipeline
joblib.dump(best_xgb, "xgb_pipeline.joblib")
print("✅ Saved final pipeline as xgb_pipeline.joblib")


✅ Saved final pipeline as xgb_pipeline.joblib
