# In [8]:
# ...existing code...
import numpy as np
import os
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor

import warnings
# Install a blanket "ignore" filter: no category or module is given, so every
# warning raised after this line is suppressed.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    """Print R2, RMSE and MAE for ``model`` on the train and the test split.

    Args:
        name: Label placed at the start of each printed line.
        model: Object exposing ``predict(X)``; it is only used for prediction
            here, never fitted.
        X_train, y_train: Features and targets reported under "train".
        X_test, y_test: Features and targets reported under "test".

    Returns:
        None. One line per split is written to stdout.
    """
    splits = {"train": (X_train, y_train), "test": (X_test, y_test)}
    for split_label, (features, target) in splits.items():
        predicted = model.predict(features)
        r2 = r2_score(target, predicted)
        # RMSE is the square root of the mean squared error.
        rmse = np.sqrt(mean_squared_error(target, predicted))
        mae = mean_absolute_error(target, predicted)
        print(f"{name} {split_label}: R2={r2:.3f}, RMSE={rmse:.3f}, MAE={mae:.3f}")

# Evaluate current models
# The decision tree is constructed in this cell, so it must be fitted here:
# evaluate_model() only calls predict(), and predict() on an unfitted
# scikit-learn estimator raises NotFittedError.
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)
evaluate_model("DecisionTree", dt, X_train, y_train, X_test, y_test)
# NOTE(review): gbr, model and model1 are not created in this cell; they are
# presumably fitted in earlier cells — confirm before running.
evaluate_model("GradientBoosting", gbr, X_train, y_train, X_test, y_test)
evaluate_model("AdaBoost", model, X_train, y_train, X_test, y_test)
evaluate_model("XGBoost(initial)", model1, X_train, y_train, X_test, y_test)

# XGBoost with early stopping
# Early stopping is monitored on a validation split carved out of the training
# data, so the test set stays untouched until the final evaluation (using
# X_test as the eval_set would leak it into the choice of tree count).
from sklearn.model_selection import train_test_split

X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# `early_stopping_rounds` is an estimator parameter (XGBoost >= 1.6); passing
# it to fit() is deprecated and rejected by XGBoost 2.0.
xgb_es = XGBRegressor(
    n_estimators=1000,  # upper bound; early stopping picks the effective count
    learning_rate=0.05,
    max_depth=4,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1,
    verbosity=0,
    early_stopping_rounds=50,
)
xgb_es.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)
# The "train" line below covers the full X_train, i.e. it includes the rows
# that were held out for validation during fitting.
evaluate_model("XGBoost(earlystop)", xgb_es, X_train, y_train, X_test, y_test)

# Quick RandomizedSearchCV
# Candidate values sampled by the search; n_estimators is tuned directly, so
# no early stopping is needed inside the search.
param_dist = {
    "n_estimators": [100, 200, 500, 1000],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [3, 4, 5],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "reg_alpha": [0, 0.1, 1],
    "reg_lambda": [1, 2, 5],
}
rs = RandomizedSearchCV(
    XGBRegressor(objective="reg:squarederror", random_state=42, n_jobs=-1, verbosity=0),
    param_distributions=param_dist,
    n_iter=12,  # number of sampled parameter combinations
    scoring="neg_mean_squared_error",
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
# Fit on the training data only. Forwarding eval_set=[(X_test, y_test)] and
# early_stopping_rounds as fit params would (a) let the test set influence
# every CV fit and the refit model, and (b) fail on XGBoost 2.0, where fit()
# no longer accepts early_stopping_rounds.
rs.fit(X_train, y_train)
print("Best params:", rs.best_params_)
best_xgb = rs.best_estimator_
evaluate_model("XGBoost(best_rs)", best_xgb, X_train, y_train, X_test, y_test)

# Cross-validation
# 5-fold R2 of the tuned estimator, computed on X / y.
# NOTE(review): X and y are not defined in this cell; presumably the full
# (unsplit) feature matrix and target — confirm.
cv_r2 = cross_val_score(
    estimator=best_xgb,
    X=X,
    y=y,
    scoring="r2",
    cv=5,
    n_jobs=-1,
)
cv_r2_mean, cv_r2_std = cv_r2.mean(), cv_r2.std()
print(f"CV R2 mean={cv_r2_mean:.3f} std={cv_r2_std:.3f}")

# Feature importance plot
# Pair each importance with its column name and rank from most to least
# important; X.columns is used as the index, so X must expose `.columns`.
fi = pd.Series(data=best_xgb.feature_importances_, index=X.columns)
fi = fi.sort_values(ascending=False)

# Only the ten highest-ranked features are drawn.
top_fi = fi.head(10)
plt.figure(figsize=(8, 6))
sns.barplot(x=top_fi.values, y=top_fi.index)
plt.title("Top 10 feature importances (XGBoost)")
plt.tight_layout()
plt.show()

# Save model and feature importance
# Both artifacts are written into ./models, which is created on demand.
model_path = "models/xgb_best.pkl"
importance_path = "models/xgb_feature_importance.csv"

os.makedirs("models", exist_ok=True)
joblib.dump(best_xgb, model_path)
fi.to_csv(importance_path)
print("Saved best model and feature importance to models/")

# Pred vs actual and residuals
preds = best_xgb.predict(X_test)

# Scatter of predictions against the true targets, with a dashed red diagonal
# spanning the observed target range as the "perfect prediction" reference.
actual_lo, actual_hi = y_test.min(), y_test.max()
plt.figure(figsize=(6, 6))
plt.scatter(y_test, preds, alpha=0.6)
plt.plot([actual_lo, actual_hi], [actual_lo, actual_hi], 'r--')
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Predicted vs Actual")
plt.show()

# Histogram (with KDE overlay) of the residuals, actual minus predicted.
residuals = y_test - preds
plt.figure(figsize=(6, 4))
sns.histplot(residuals, kde=True)
plt.title("Residuals distribution")
plt.show()

# Optional: SHAP summary (installs if missing)
try:
    import shap
except ImportError:
    # Install with the interpreter that is running this code. A subprocess
    # call is used instead of the IPython-only `!cmd` shell escape so the
    # code is valid Python both inside and outside a notebook.
    import subprocess
    import sys

    # check=False: a failed install surfaces as ImportError on the retry below.
    subprocess.run([sys.executable, "-m", "pip", "install", "shap", "-q"], check=False)
    import shap

# The SHAP plot is best-effort: any failure is reported and the cell continues.
try:
    explainer = shap.Explainer(best_xgb)
    shap_values = explainer(X_train)
    # show=False leaves the figure open so tight_layout() can be applied
    # before it is displayed.
    shap.summary_plot(shap_values, X_train, show=False)
    plt.tight_layout()
    plt.show()
except Exception as e:
    print("SHAP plotting failed:", e)
# ...existing code...

# Cell output: NameError: name 'X_train' is not defined
# (this cell never assigns X_train, so it must already exist when the cell runs)