# 1.1 What is XGBoost?

XGBoost stands for eXtreme Gradient Boosting. It is a powerful and efficient implementation of gradient boosting, which is a machine learning technique for regression, classification, and ranking problems. XGBoost is widely used in competitions like Kaggle due to its speed and performance.

#### 1.2 Key Terms

1. Decision Trees: A tree-like model of decisions used for classification or regression.

2. Gradient Boosting: A technique where new models are added to correct the errors made by existing models. Models are added sequentially until no further improvements can be made.

3. Loss Function: A function that measures the difference between the predicted value and the actual value.

4. Regularization: A technique to prevent overfitting by adding a penalty for complexity to the model.

#### 1.3 Basic Idea of XGBoost

XGBoost builds an ensemble of decision trees sequentially. Each new tree tries to correct the mistakes of the previous trees. The key innovation in XGBoost is the use of gradient descent to minimize the loss function, and it uses first and second-order gradients to do this efficiently.

#### 1.4 Boosting vs. Other Ensemble Methods

![Difference](images/boost.png)

#### 1.5 Types of Boosting Algorithms

    ✅ AdaBoost (Adaptive Boosting) – Adjusts sample weights.
    ✅ Gradient Boosting (GBM) – Uses gradient descent.
    ✅ XGBoost (eXtreme Gradient Boosting) – Optimized gradient boosting with regularization.

## 2.0 Code Implementation

In [3]:
import numpy as np 

class TreeNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index  # Index of the feature to split on
        self.threshold = threshold          # Threshold value for the split
        self.left = left                  # Left subtree
        self.right = right                # Right subtree
        self.value = value                # Value if the node is a leaf


class DecisionTree:
    def __init__(self, max_depth=3, min_samples_split=2, gamma=0, lambda_=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.gamma = gamma
        self.lambda_=lambda_
        self.root = None

    def _calculate_gain(self, g_L, h_L, g_R, h_R, g, h):
        gain = 0.5 * ((g_L**2 / (h_L + self.lambda_)) + (g_R**2 / (h_R + self.lambda_)) - (g**2 / (h + self.lambda_))) - self.gamma
        return gain

    def _split(self, X, g, h):
        best_gain = -np.inf 
        best_feature_index = None 
        best_threshold = None

        for feature_index in range(X.shape[1]):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                left_indices = X[:, feature_index] <= threshold
                right_indices = X[:, feature_index] > threshold 

                g_L, h_L = g[left_indices].sum(), h[left_indices].sum()
                g_R, h_R = g[right_indices].sum(), h[right_indices].sum()

                gain = self._calculate_gain(g_L, h_L, g_R, g_L, g.sum(), h.sum())
                if gain > best_gain:
                    best_gain = gain 
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold
    
    def _build_tree(self, X, g, h, depth=0):
        num_samples, num_features = X.shape 
        if depth >= self.max_depth or num_samples < self.min_samples_split:
            leaf_value = -g.sum() / (h.sum() + self.lambda_) 
            return TreeNode(value=leaf_value)
        
        feature_index, threshold = self._split(X, g, h)

        if feature_index is None:
            leaf_value = -g.sum() / (h.sum() + self.lambda_)
            return TreeNode(value=leaf_value)
        
        left_indices = X[:, feature_index] <= threshold
        right_indices = X[:, feature_index] > threshold

        left = self._build_tree(X[left_indices], g[left_indices], h[left_indices], depth + 1)
        right = self._build_tree(X[right_indices], g[right_indices], h[right_indices], depth + 1)

        return TreeNode(feature_index, threshold, left, right)
    

    def fit(self, X, g, h):
        self.root = self._build_tree(X, g, h)

    def predict(self, X):
        return np.array([self._predict_tree(x, self.root) for x in X])\
        
    def _predict_tree(self, x, node):
        if node.value is not None:
            return node.value 
        if x[node.feature_index] <= node.threshold:
            return self._predict_tree(x, node.left)
        else:
            return self._predict_tree(x, node.right)

#### XGB Class:

In [6]:
class XGBoost:
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, min_samples_split=2, gamma=0, lambda_=1):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.gamma = gamma
        self.lambda_ = lambda_
        self.trees = []

    def _gradient(self, y, y_pred):
        return y_pred - y

    def _hessian(self, y, y_pred):
        return np.ones_like(y)

    def fit(self, X, y):
        y_pred = np.full_like(y, np.mean(y), dtype=np.float64)  # Initialize as float array
        for _ in range(self.n_estimators):
            g = self._gradient(y, y_pred)
            h = self._hessian(y, y_pred)

            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split, gamma=self.gamma, lambda_=self.lambda_)
            tree.fit(X, g, h)

            self.trees.append(tree)
            y_pred += self.learning_rate * tree.predict(X)

    def predict(self, X):
        y_pred = np.zeros(X.shape[0], dtype=np.float64)  # Initialize as float array
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred

In [8]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the Diabetes dataset
data = load_diabetes()
X = data.data  # Features
y = data.target  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model
model = XGBoost(n_estimators=10, learning_rate=0.1, max_depth=3)
model.fit(X, y)

# Make predictions
predictions = model.predict(X)
print(predictions)

[ 3.52848098e+01 -3.58280882e+01  3.13122564e+01  1.19586570e+01
 -2.65633422e+01 -3.31923004e+01 -3.58280882e+01 -1.79788958e+01
 -2.73454926e+00  3.11767501e+01 -3.40359032e+01 -2.94007354e+01
 -2.53944248e+01  2.19191434e+01 -2.84268139e+01  1.64361648e+01
  3.82321580e+01  3.04582263e+01 -1.50023391e+01 -1.27356992e+01
 -2.53944248e+01 -3.58280882e+01 -2.29378359e+01  3.82321580e+01
  1.79977429e+00  1.21426407e+01 -1.07338266e+01 -2.73454926e+00
 -3.88073219e+00  2.48433948e+01  1.26840581e+00 -3.58280882e+01
  4.04418986e+01 -3.10001337e+01 -3.58280882e+01 -4.17265612e+00
  3.34055746e+01  1.93998633e+01  3.82321580e+01  1.93998633e+01
  1.34888165e+01 -3.40359032e+01 -1.15215217e+01 -3.58280882e+01
  3.82321580e+01 -2.53944248e+01 -7.44044044e+00 -2.72578964e+01
 -3.58280882e+01  2.94330213e+01 -3.23636039e+00  2.17775676e+01
 -1.17055054e+01 -2.34618744e+01 -6.49246457e+00 -3.58280882e+01
  3.11767501e+01 -3.58280882e+01 -9.23251519e+00  4.06995092e+00
 -3.75516258e+01  2.03889