# Gradient Boosting Regression

This notebook explores Boosting algorithms (Gradient Boosting, AdaBoost, XGBoost) using Scikit-Learn Pipelines on the Housing Dataset.

In [None]:
!pip install xgboost -q

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

### 1️⃣ Load & Split Data

In [None]:
# Read the housing table and separate the regression target from the predictors.
df = pd.read_csv("../Linear Regression/data/dataset.csv")
target_column = 'median_house_value'
y = df[target_column]
X = df.drop(target_column, axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Training set:", X_train.shape, "Testing set:", X_test.shape)

### 2️⃣ Preprocessing Pipeline

In [None]:
# Partition the feature columns by dtype so each group gets its own transformer.
# 'number' matches every numeric dtype (int32, float32, ... in addition to
# int64/float64). Columns that land in neither list are not passed to any
# transformer and are dropped by ColumnTransformer's default remainder handling.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Numeric branch: fill gaps with the column median, then standardize.
# Scaling is not strictly required for boosting models but is harmless.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: replace gaps with the literal 'missing' category, then
# one-hot encode. handle_unknown='ignore' keeps transform() from failing on
# categories that were not present when the encoder was fitted.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols),
    ])

### 3️⃣ Gradient Boosting Regressor (GBR)
GBR builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions.

In [None]:
# Chain the shared preprocessing with a gradient-boosting regressor so that
# raw feature frames can be passed straight to fit() and predict().
gbr_model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
gbr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', gbr_model),
])

gbr_pipeline.fit(X_train, y_train)
y_pred_gbr = gbr_pipeline.predict(X_test)

# Report the held-out metrics in a fixed order: MSE, MAE, R2.
print("--- Gradient Boosting ---")
for label, metric in (
    ("MSE:", mean_squared_error),
    ("MAE:", mean_absolute_error),
    ("R2:", r2_score),
):
    print(label, metric(y_test, y_pred_gbr))

### 4️⃣ XGBoost Regressor
eXtreme Gradient Boosting is an efficient and scalable implementation of gradient boosting.

In [None]:
# Same preprocessing as the gradient-boosting pipeline, with XGBoost as the
# final estimator; n_jobs=-1 asks XGBoost to use all available cores.
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=100, learning_rate=0.1, n_jobs=-1, random_state=42)),
])

xgb_pipeline.fit(X_train, y_train)
y_pred_xgb = xgb_pipeline.predict(X_test)

# Report MSE, MAE and R2 on the held-out set -- the same three metrics printed
# for the gradient-boosting model -- so the two can be compared like for like.
print("--- XGBoost ---")
print("MSE:", mean_squared_error(y_test, y_pred_xgb))
print("MAE:", mean_absolute_error(y_test, y_pred_xgb))
print("R2:", r2_score(y_test, y_pred_xgb))

### 5️⃣ Hyperparameter Tuning (GridSearchCV on GBR)
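We can tune parameters like `n_estimators`, `learning_rate`, and `max_depth` inside the pipeline. Each grid key is prefixed with `regressor__` so that the value is routed to the pipeline step named `regressor`.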

In [None]:
# Candidate settings for the boosting step; the 'regressor__' prefix addresses
# the pipeline step named 'regressor'.
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__learning_rate': [0.05, 0.1],
    'regressor__max_depth': [3, 5],
}

# Deliberately small grid to keep the demonstration fast; widen it for better results.
search = GridSearchCV(
    estimator=gbr_pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
)
search.fit(X_train, y_train)

print("Best Parameters:", search.best_params_)
# The scorer returns a negated MSE, so flip the sign before taking the root.
best_cv_rmse = np.sqrt(-search.best_score_)
print("Best CV Score (RMSE):", best_cv_rmse)

# Evaluate the refitted best pipeline on the held-out test set.
best_model = search.best_estimator_
y_pred_best = best_model.predict(X_test)
print("Test R2 (Tuned Model):", r2_score(y_test, y_pred_best))