## Task 6.1.1 - Regression Trees

* Implement the Regression Tree Class from scratch using only `NumPy`. **(RESULT)**
* Run your implementation on the synthetic regression dataset provided. **(RESULT)**

In [38]:
import numpy as np
from sklearn.model_selection import train_test_split

def generate_regression_data(n_samples=1000, n_features=8, noise=0.1, random_state=42):
    """Generate a synthetic, non-linear regression dataset.

    Features are drawn from a standard normal distribution. The target is a
    fixed non-linear combination of the first eight feature columns plus
    Gaussian noise, min-max rescaled to the interval [1, 5]. (The original
    author describes the result as "similar to California housing".)

    Parameters
    ----------
    n_samples : int
        Number of rows to generate; must be at least 1.
    n_features : int
        Number of feature columns; must be at least 8 because the target
        formula reads columns 0 through 7.
    noise : float
        Standard deviation of the Gaussian noise added to the target before
        rescaling.
    random_state : int
        Seed passed to ``np.random.seed``.

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, n_features)
    y : numpy.ndarray of shape (n_samples,)

    Raises
    ------
    ValueError
        If ``n_samples < 1`` or ``n_features < 8``.
    """
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1")
    if n_features < 8:
        raise ValueError(
            "n_features must be at least 8: the target formula uses columns 0-7"
        )

    # NOTE: this seeds NumPy's *global* RNG (a deliberate side effect kept from
    # the original). Any later unseeded np.random call in the same session is
    # therefore reproducible as well.
    np.random.seed(random_state)

    X = np.random.randn(n_samples, n_features)

    # Create target with non-linear relationships
    y = (2.5 * X[:, 0] +
         1.8 * X[:, 1] ** 2 +
         -1.2 * X[:, 2] * X[:, 3] +
         0.5 * np.sin(5 * X[:, 4]) +
         0.8 * X[:, 5] +
         -0.3 * X[:, 6] ** 3 +
         1.5 * X[:, 7])

    # Add noise
    y += noise * np.random.randn(n_samples)

    # Scale to the range [1, 5]. A constant target (e.g. a single sample) has
    # zero span; dividing by it would yield NaN, so map it to the midpoint.
    y_min = y.min()
    span = y.max() - y_min
    if span == 0:
        y = np.full_like(y, 3.0)
    else:
        y = (y - y_min) / span * 4 + 1

    return X, y


# Build Regression Tree

In [39]:
class TreeNode:
    """Internal (decision) node of a regression tree.

    Attributes
    ----------
    feature : int
        Column index of the feature this node splits on.
    threshold : float
        Samples with ``x[feature] <= threshold`` go left, all others go right.
    left, right : TreeNode or float
        Child subtrees. A leaf is stored directly as its predicted value
        (the mean target of the training samples that reached it).
    """

    def __init__(self, feature, threshold, left, right):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right


class RegressionTree:
    """A decision tree for regression using numpy.

    The tree is grown greedily: at every node the split that minimises the
    summed squared error of the two children is chosen.

    Parameters
    ----------
    depth : int
        Maximum depth of the tree (the root is at depth 0).
    minsamplesatnode : int
        A node is split only if it holds *more* than this many samples.

    Attributes (set by ``fit``)
    ---------------------------
    contordiscrete : dict[int, str]
        Per-feature label, ``"Continous"`` or ``"Discrete"``, decided on the
        full training set. The spelling is kept as-is because the attribute is
        publicly readable.
    rootnode : TreeNode or float
        Root of the fitted tree; a bare float when no split was made.
    """

    def __init__(self,depth=5, minsamplesatnode=1):
        self.depth = depth
        self.minsamplesatnode = minsamplesatnode

    def fit(self, X, y):
        """Build the regression tree.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like with ``n_samples`` entries

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or ``y`` has a different length.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("cannot fit a tree on an empty dataset")
        if y.ndim == 0 or y.shape[0] != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        # A feature is treated as continuous when at least 90% of its values
        # are distinct; otherwise its unique values serve as split candidates.
        self.contordiscrete = {}
        for i in range(X.shape[1]):
            unique_ratio = len(np.unique(X[:, i])) / n_samples
            self.contordiscrete[i] = "Continous" if unique_ratio >= 0.9 else "Discrete"

        self.rootnode = self._build_tree(X, y, depth=0)
        return self

    def _build_tree(self, X, y, depth):
        """Recursively build the tree.

        Returns a ``TreeNode`` for an internal node, or the mean of ``y``
        (a float) when the node becomes a leaf.
        """
        # Stop at the depth limit or when the node is too small to split.
        if depth >= self.depth or len(X) <= self.minsamplesatnode:
            return y.mean()

        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None or best_threshold is None:
            # No threshold separates the samples into two non-empty groups.
            return y.mean()

        column = X[:, best_feature]
        left_mask = column <= best_threshold
        right_mask = column > best_threshold
        left = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right = self._build_tree(X[right_mask], y[right_mask], depth + 1)
        return TreeNode(best_feature, best_threshold, left, right)

    def predict(self, X):
        """Make predictions for X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            Leaf value reached by each row.
        """
        X = np.asarray(X)
        predictions = np.empty(len(X), dtype=float)
        for row_idx, x in enumerate(X):
            node = self.rootnode
            # Descend until a leaf (a bare float) is reached.
            while isinstance(node, TreeNode):
                node = node.left if x[node.feature] <= node.threshold else node.right
            predictions[row_idx] = node
        return predictions

    def _find_best_split(self, X, y):
        """Find the (feature, threshold) pair with the lowest split error.

        For a continuous feature the candidate thresholds are the midpoints of
        consecutive sorted values; for a discrete feature they are its unique
        values. The error of a split is the summed squared deviation of each
        side from its own mean. Ties are resolved in favour of the first
        candidate encountered (lowest feature index, then lowest threshold).

        Returns
        -------
        (int, float) or (None, None)
            ``(None, None)`` when no candidate yields two non-empty sides.
        """
        candidates = {}  # feature index -> (split error, threshold)

        for i in range(X.shape[1]):
            # Only this feature's column and the targets need sorting.
            order = np.argsort(X[:, i])
            feature_sorted = X[order, i]
            y_sorted = y[order]

            if self.contordiscrete[i] == "Continous":
                thresholds = (feature_sorted[:-1] + feature_sorted[1:]) / 2
            else:
                thresholds = np.unique(feature_sorted)

            errors = []
            kept_thresholds = []
            for threshold in thresholds:
                y_left = y_sorted[feature_sorted <= threshold]
                y_right = y_sorted[feature_sorted > threshold]
                if y_left.size == 0 or y_right.size == 0:
                    continue
                sse_left = np.sum((y_left - np.mean(y_left)) ** 2)
                sse_right = np.sum((y_right - np.mean(y_right)) ** 2)
                errors.append(sse_left + sse_right)
                kept_thresholds.append(threshold)

            if not errors:
                continue
            k = int(np.argmin(np.array(errors)))
            candidates[i] = (errors[k], kept_thresholds[k])

        if not candidates:
            return None, None

        best_feature = min(candidates, key=lambda i: candidates[i][0])
        return best_feature, candidates[best_feature][1]









In [40]:
def mean_squared_error(y_true, y_pred):
    """Return the mean squared error between targets and predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. Lists and tuples are accepted as well as arrays.
    y_pred : array-like
        Predicted values; must be broadcastable against ``y_true``.

    Returns
    -------
    numpy.float64
        Squared differences summed over the first axis and divided by
        ``len(y_true)`` (for 2-D inputs this gives one value per column).

    Raises
    ------
    ValueError
        If ``y_true`` is empty.
    """
    # Convert first: subtracting two plain lists raises TypeError.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        raise ValueError("mean_squared_error requires at least one sample")
    squared_errors = (y_true - y_pred) ** 2
    return squared_errors.sum(axis=0) / len(y_true)

# Test on synthetic dataset

In [41]:
# Generate the synthetic dataset and hold out 20% of it for evaluation.
X, y = generate_regression_data()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a single regression tree with its default hyperparameters and report
# the mean squared error on the held-out split.
tree = RegressionTree()
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
print("MSE", mean_squared_error(y_test, y_pred))

MSE 0.07233008392593654


## Task 6.1.2 - Bagging

* Implement Bagging using only `NumPy`. **(RESULT)**
* Compare the results of the bagged run of your `RegressionTree` class with those of the single-tree run on the synthetic dataset. **(RESULT)**

# Build Bagging Regressor

In [42]:
class BaggingRegressor:
    """Bagging ensemble for regression trees.

    Each of the ``bags`` trees is trained on a bootstrap sample (drawn with
    replacement) of ``n_samples`` rows; predictions are the average over all
    trees.

    Parameters
    ----------
    bags : int
        Number of trees in the ensemble; must be at least 1.
    n_samples : int
        Number of rows drawn (with replacement) for each tree; at least 1.
    depth : int, default 5
        Maximum depth handed to every ``RegressionTree``.
    minsamplesatnode : int, default 1
        ``minsamplesatnode`` handed to every ``RegressionTree``.
    random_state : None, int or numpy.random.RandomState, default None
        ``None`` draws from NumPy's global RNG (the original behaviour).
        An int seeds a private ``RandomState`` so that ``fit`` is reproducible
        regardless of global RNG state.

    Attributes (set by ``fit``)
    ---------------------------
    dict_bagged_trees : dict[int, RegressionTree]
        Fitted trees keyed by their bag index.
    """

    def __init__(self, bags, n_samples, depth=5, minsamplesatnode=1, random_state=None):
        if bags < 1:
            raise ValueError("bags must be at least 1")
        if n_samples < 1:
            raise ValueError("n_samples must be at least 1")
        self.bags = bags
        self.samples = n_samples
        self.depth = depth
        self.minsamplesatnode = minsamplesatnode
        self.random_state = random_state

    def fit(self, X, y):
        """Train one regression tree per bootstrap sample.

        Parameters
        ----------
        X : array-like of shape (n_rows, n_features)
        y : array-like with ``n_rows`` entries

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length.
        """
        # Fancy indexing below needs arrays; lists would fail.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")

        if self.random_state is None:
            rng = np.random  # global RNG, as in the original implementation
        elif isinstance(self.random_state, np.random.RandomState):
            rng = self.random_state
        else:
            rng = np.random.RandomState(self.random_state)

        self.dict_bagged_trees = {}
        for i in range(self.bags):
            bag_indices = rng.choice(len(X), size=self.samples, replace=True)
            tree = RegressionTree(
                depth=self.depth, minsamplesatnode=self.minsamplesatnode
            )
            tree.fit(X[bag_indices], y[bag_indices])
            self.dict_bagged_trees[i] = tree
        return self

    def predict(self, X):
        """Make predictions by averaging all trees.

        Returns
        -------
        numpy.ndarray
            Element-wise mean of the individual trees' predictions.
        """
        per_tree = [tree.predict(X) for tree in self.dict_bagged_trees.values()]
        return np.mean(per_tree, axis=0)


# Test on different hyperparameters

In [43]:
# Bagged ensemble: 5 trees, each fitted on a bootstrap sample of 300 rows.
# (The name `tree` is reused here for the ensemble.)
tree = BaggingRegressor(5, 300)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
print("MSE", mean_squared_error(y_test, y_pred))

MSE 0.055688133490828226


In [44]:
# Smaller ensemble: 3 trees, each fitted on a bootstrap sample of 250 rows.
# (The name `tree` is reused here for the ensemble.)
tree = BaggingRegressor(3, 250)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)
print("MSE", mean_squared_error(y_test, y_pred))

MSE 0.06489462319478362


As you can see, the bagged run has a lower MSE than the single-tree run, which is expected. We tried two hyperparameter combinations (5 bags of 300 samples and 3 bags of 250 samples) and achieved better results than the single tree with both.

## Congratz, you made it! :)