# Assignment 7.2 - XGBoost

Please submit your solution of this notebook in the Whiteboard at the corresponding Assignment entry as .ipynb-file and as .pdf. <br><br>
Please do **NOT** rename the file!

#### State both names of your group members here:
Farah Ahmed Atef Abdelhameed Hafez - Mariz Essam Sobhy Ghaly

## Task 7.2.1: XGBoost - Regression

* Build an XGBoost classifier using `numpy` only. Train your XGBoost model on the `California Housing` regression task. Report on the performance predicting unseen test samples. **(RESULTS)**

# Build an XGBoost

In [1]:
import numpy as np

# Class structure that might help. Feel free to modify as needed.
class TreeNode:
    """Represents a node in the decision tree"""
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None, isleaf=False):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value  # Leaf value
        self.is_leaf = isleaf


class DecisionTree:
    """Decision tree for XGBoost"""
    def __init__(self, depth=10, minsamples=3, lambdaterm=1, gamma=0.01):
        self.depth = depth
        self.minsamples = minsamples
        self.root = None
        self.lambdaterm=lambdaterm
        self.gamma=gamma
        # self.task=task

    def fit(self, X, gradients, hessians):
        """Build the tree"""
        self.contordiscrete={}
        for i in range(X.shape[1]):
            uniqpersample=np.unique(X[:,i])
            ratiocontordiscrete=len(uniqpersample)/len(X[:,i])
            if ratiocontordiscrete>=0.9:
              self.contordiscrete[i]="Continous"
            else:
              self.contordiscrete[i]="Discrete"
        # print("mynode")
        self.root = self._build_tree(X,gradients, hessians, 0)


    def _build_tree(self, X, gradients, hessians, depth):
          """Recursively build the tree."""
          if depth < self.depth and len(X)>self.minsamples:
              best_feature, best_threshold, best_gain = self._find_best_split(X, gradients, hessians)


              if best_feature is None or best_threshold is None:
                  return self.leaf_value(gradients, hessians)
              if best_gain<0:
                  return self.leaf_value(gradients, hessians)
              Xi = X[:, best_feature]
              left_mask = Xi <= best_threshold
              right_mask = Xi > best_threshold
              gradients_left=gradients[left_mask]
              hessians_left=hessians[left_mask]
              gradients_right=gradients[right_mask]
              hessians_right=hessians[right_mask]
              X_left=X[left_mask]
              X_right=X[right_mask]
              depth+=1
              left = self._build_tree(X_left,gradients_left, hessians_left, depth)
              right = self._build_tree(X_right,gradients_right, hessians_right, depth)
              return TreeNode(best_feature,best_threshold,left,right)
          return self.leaf_value(gradients, hessians)



    def predict(self, X):
        """Make predictions for X."""
        y_pred=[]
        for x in X:
          res=self.root

          while res.is_leaf==False:
            if x[res.feature_index]<=res.threshold:
              res=res.left
              # print("left")
            else:
              res=res.right
              # print("right")
          y_pred.append(res.value)
        return y_pred



    def _find_best_split(self, X, gradients, hessians):

      parentgain= (np.sum(gradients)**2)/(np.sum(hessians)+self.lambdaterm)

      featuremsepair={}

      for i in range(X.shape[1]):
        sorted_idx = np.argsort(X[:, i])
        arr_sorted = X[sorted_idx, i]
        gradients_sorted=gradients[sorted_idx]
        hessians_sorted=hessians[sorted_idx]
        sumofgradients=np.cumsum(gradients_sorted) # changed our masking approach done in previous tree impelementation because when it did masking per feature per threshold when there is 20000 plus samples, the computation time is huge
        sumofhessians=np.cumsum(hessians_sorted)
        scorelist=[]
        thresholdlist=[]
        if self.contordiscrete[i]=="Continous":
          lim=X.shape[0]-1
        else:
          uniquearr_sorted, counts=np.unique(arr_sorted, return_counts=True)
          cum_counts = np.cumsum(counts)
          lim=len(uniquearr_sorted)
        for j in range(lim):

          if self.contordiscrete[i]=="Continous":
            threshold=(arr_sorted[j]+arr_sorted[j+1])/2
            k=j
          else:
            threshold=uniquearr_sorted[j]
            k=cum_counts[j]-1

          gradients_left=sumofgradients[k]
          hessians_left=sumofhessians[k]
          gradients_right=sumofgradients[-1]-sumofgradients[k]
          hessians_right=sumofhessians[-1]-sumofhessians[k]
          if k+1<self.minsamples or len(X)-(k+1)<self.minsamples:
              continue

          leftgain= (gradients_left**2)/(hessians_left+self.lambdaterm)
          rightgain= (gradients_right**2)/(hessians_right+self.lambdaterm)
          splitgain=0.5*(leftgain+rightgain-parentgain)-self.gamma
          scorelist.append(splitgain)
          thresholdlist.append(threshold)
        if len(scorelist)==0:
          continue
        k=np.argmax(np.array(scorelist))
        featuremsepair[i]={"score":scorelist[k], "threshold":thresholdlist[k]}
      if len(featuremsepair)==0:
        return None, None, None
      best_feature = max(featuremsepair, key=lambda i: featuremsepair[i]["score"])

      bestvalue = featuremsepair[best_feature]

      return best_feature, bestvalue["threshold"], bestvalue["score"]


    def leaf_value(self, gradients, hessians):
      value= -(np.sum(gradients)/(np.sum(hessians)+self.lambdaterm))
      return TreeNode(value=value, isleaf=True)






In [2]:
class XGBoost:
    """XGBoost implementation"""

    def __init__(self, boosting_iterations=10, learning_rate=0.1, depth=10, minsamples=3, lambdaterm=1, gamma=0.01):
        self.boosting_iterations = boosting_iterations
        self.learning_rate = learning_rate
        self.trees = []
        self.depth=depth
        self.minsamples=minsamples
        self.lambdaterm=lambdaterm
        self.gamma=gamma


    def fit(self, X, y):
        """Train the XGBoost model"""

        uniqpersample=np.unique(y)

        if len(uniqpersample)==2:
          self.task="Classification"
        else:
          self.task="Regression"

        if self.task=="Regression":
          prevpreds=np.full_like(y, np.mean(y), dtype=float)
          self.firstpred=np.mean(y)
          hessians=np.ones_like(y)
        else:
          prevpreds=np.full_like(y, 0.5, dtype=float)
          prevpreds=np.log(prevpreds/(1-prevpreds))
          self.firstpred=np.log(0.5/(1-0.5))
          hessians=np.full_like(y, 0.5*(1-0.5), dtype=float)


        gradients=prevpreds-y

        for i in range(self.boosting_iterations):
          # print(i)
          mytree=DecisionTree(self.depth, self.minsamples, self.lambdaterm, self.gamma)
          mytree.fit(X,gradients,hessians)
          y_pred=mytree.predict(X)
          newpreds=prevpreds+(self.learning_rate*np.array(y_pred))
          prevpreds=newpreds
          if self.task=="Classification":
            newpreds=1/(1+np.exp(-newpreds))
            hessians=newpreds*(1-newpreds)
          gradients=newpreds-y
          self.trees.append(mytree)

    def predict(self, X):
        """Make predictions"""
        if self.task=="Regression":
          return self.predict_helper(X)
        else:
          return self.predict_proba(X)

    def predict_helper(self, X):
      y_pred=[]
      y_pred.append(np.full(shape=(X.shape[0],), fill_value=self.firstpred, dtype=float))
      for tree in self.trees:
        treepred=tree.predict(X)
        mypreds=np.array(treepred)*self.learning_rate
        y_pred.append(mypreds)
      y_pred=np.array(y_pred)
      return np.sum(y_pred, axis=0)
    # Probabilities for classification :) - The Bonus task
    def predict_proba(self, X):
        """Predict probabilities for binary classification"""
        sum=self.predict_helper(X)
        probof1= 1/(1+np.exp(-sum))
        return np.where(probof1 >= 0.5, 1, 0)





In [3]:
def mean_squared_error(y_true, y_pred):
    return sum((y_true - y_pred)**2)/len(y_true)

#Test on California housing dataset

In [4]:

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split


# Load California Housing data
data = fetch_california_housing()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
myXGboost= XGBoost(boosting_iterations=20)
myXGboost.fit(X_train,y_train)
y_pred=myXGboost.predict(X_test)




In [5]:
print("Mean Squared Error ", mean_squared_error(y_test, y_pred))


Mean Squared Error  0.2871924889390944


## Task 7.2.2: XGBoost - Classification (BONUS)

* Train an XGBoost model on the `Breast Cancer` binary classification task. Report on the performance predicting unseen test samples. **(RESULTS)**

# Test on the breast cancer dataset

In [6]:
from sklearn.datasets import load_breast_cancer

# Load the dataset
data = load_breast_cancer()

# Access the features and labels
X = data.data  # Shape: (569, 30)
y = data.target  # Shape: (569,) - 0 for malignant, 1 for benign

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
myXGboost= XGBoost(boosting_iterations=20)
myXGboost.fit(X_train,y_train)
y_pred=myXGboost.predict(X_test)


In [7]:
print("Accuracy ", np.mean(y_test==y_pred))

Accuracy  0.9649122807017544


## Congratz, you made it! :)