In [8]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load California housing dataset
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

class SimpleTreeRegressor:
    """A simple decision tree regressor with basic optimizations."""
    def __init__(self, max_depth=3, min_samples_split=10, n_bins=20):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_bins = n_bins
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        if depth >= self.max_depth or len(y) < self.min_samples_split or np.std(y) == 0:
            return np.mean(y)

        # Bin feature values to reduce candidate splits
        bins = [
            np.linspace(np.min(X[:, feature]), np.max(X[:, feature]), self.n_bins + 1)
            for feature in range(X.shape[1])
        ]
        X_binned = np.array([
            np.digitize(X[:, feature], bins[feature]) - 1 for feature in range(X.shape[1])
        ]).T

        # Find the best split
        best_feature, best_split, best_loss = None, None, float("inf")
        for feature in range(X.shape[1]):
            thresholds = np.unique(X_binned[:, feature])
            for threshold in thresholds:
                left_mask = X_binned[:, feature] <= threshold
                right_mask = ~left_mask

                if np.sum(left_mask) == 0 or np.sum(right_mask) == 0:
                    continue

                left_loss = np.var(y[left_mask]) * np.sum(left_mask)
                right_loss = np.var(y[right_mask]) * np.sum(right_mask)
                loss = left_loss + right_loss

                if loss < best_loss:
                    best_loss = loss
                    best_feature = feature
                    best_split = bins[feature][threshold]

        if best_feature is None:
            return np.mean(y)

        # Create subtrees
        left_mask = X[:, best_feature] <= best_split
        right_mask = ~left_mask
        return {
            "feature": best_feature,
            "split": best_split,
            "left": self._build_tree(X[left_mask], y[left_mask], depth + 1),
            "right": self._build_tree(X[right_mask], y[right_mask], depth + 1),
        }

    def predict(self, X):
        return np.array([self._predict_row(x, self.tree) for x in X])

    def _predict_row(self, x, tree):
        if not isinstance(tree, dict):
            return tree
        if x[tree["feature"]] <= tree["split"]:
            return self._predict_row(x, tree["left"])
        else:
            return self._predict_row(x, tree["right"])

class GradientBoostingRegressor:
    """Gradient Boosting Regressor with optimized tree fitting."""
    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, min_samples_split=10, n_bins=20):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_bins = n_bins
        self.trees = []
        self.initial_prediction = None

    def fit(self, X, y):
        X = X.values if isinstance(X, pd.DataFrame) else X
        self.initial_prediction = np.mean(y)
        y_pred = np.full(y.shape, self.initial_prediction)

        for _ in range(self.n_estimators):
            residuals = y - y_pred
            tree = SimpleTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_samples_split, n_bins=self.n_bins)
            tree.fit(X, residuals)
            self.trees.append(tree)

            y_pred += self.learning_rate * tree.predict(X)

    def predict(self, X):
        X = X.values if isinstance(X, pd.DataFrame) else X
        y_pred = np.full(X.shape[0], self.initial_prediction)
        for tree in self.trees:
            y_pred += self.learning_rate * tree.predict(X)
        return y_pred

# Train and evaluate the model with reduced dataset size for faster execution
X_sample = X_train.values
y_sample = y_train

model = GradientBoostingRegressor(n_estimators=50, learning_rate=0.1, max_depth=3, n_bins=20)
model.fit(X_sample, y_sample)
y_pred = model.predict(X_test.values)

print("Mean Squared Error:", mean_squared_error(y_test, y_pred))


Mean Squared Error: 0.5032870091200756


In [12]:
def r2_score(y_true, y_pred):
    total_variance = np.sum((y_true - np.mean(y_true)) ** 2)
    residual_variance = np.sum((y_true - y_pred) ** 2)
    return 1 - (residual_variance / total_variance)
# Compute R-squared
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")

R-squared: 0.6159313485506709
