<a href="https://colab.research.google.com/github/ABBAS-37405/PYTHON-AND-DATA-SCIENCE/blob/main/HPTXGB_Reg_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **XGBoost Regressor**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv("healthcare_data_10000.csv")

In [2]:
# Step 2: Select numeric columns
numeric_cols = [
    'age', 'bmi', 'systolic_bp', 'diastolic_bp',
    'cholesterol_level', 'glucose_level',
    'exercise_mins_per_week', 'alcohol_units_per_week', 'medications_count'
]
target = 'heart_rate'

# Step 3: Feature matrix (X) and target vector (y)
X = df[numeric_cols]
y = df[target]

# Step 4: Train-test split
X_train, X_test, y_train_reg, y_test_reg = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 5: Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [3]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

model = XGBRegressor(n_estimators = 100, learning_rate = 0.1, random_state = 42)
model.fit(X_train_scaled, y_train_reg)

y_pred_reg = model.predict(X_test_scaled)

mse = mean_squared_error(y_pred_reg, y_test_reg)
RMSE = np.sqrt(mse)
print(RMSE)
print(mse)

10.361777831016063
107.36643981933594


In [4]:
pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.7.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.7.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.7.0 scikit-optimize-0.10.2


In [5]:
from skopt import BayesSearchCV
from sklearn.metrics import mean_squared_error

In [6]:
search_spaces = {"n_estimators": (100, 1000, 10),
                 "max_depth": (3, 10),
                 "learning_rate": (0.01, 0.3),
                 "subsample": (0.5, 1.0),
                 "colsample_bytree": (0.5, 1.0),
                 "gamma": (0, 5),
                 "min_child_weight": (1, 10, 2)}

In [8]:
xgb = XGBRegressor(random_state=42)

opt = BayesSearchCV(estimator=xgb,
                    search_spaces=search_spaces,
                    n_iter=30,
                    scoring = 'neg_mean_squared_error',
                    cv = 3,
                    random_state=42,
                    n_jobs= -1,
                    verbose=0)

In [9]:
opt.fit(X_train_scaled, y_train_reg)
print("Best parameters",opt.best_params_)

Best parameters OrderedDict({'colsample_bytree': 1.0, 'gamma': 5, 'learning_rate': 0.015447726298075719, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 10, 'subsample': 0.9838146230207931})


In [10]:
best_model = opt.best_estimator_
y_pred_reg = best_model.predict(X_test_scaled)

mse = mean_squared_error(y_test_reg, y_pred_reg)
rmse = np.sqrt(mse)
print("MSE:", mse)
print("RMSE:", rmse)

MSE: 103.70429992675781
RMSE: 10.183530818275056
