In [34]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load dataset
california = fetch_california_housing()

# Convert to a DataFrame
data = pd.DataFrame(california.data, columns=california.feature_names)
data['MEDIAN_VALUE'] = california.target

# Display the dataset
print(data.head())


   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MEDIAN_VALUE  
0    -122.23         4.526  
1    -122.22         3.585  
2    -122.24         3.521  
3    -122.25         3.413  
4    -122.25         3.422  


In [36]:
import numpy as np

# Function to compute the initial prediction (base prediction, for regression)
def initialize_predictions(y):
    return np.mean(y)

# Function to compute residuals (errors) - for regression
def compute_residuals(y, predictions):
    return y - predictions

# Function to compute the gradient for regression
def compute_gradient(y, predictions):
    return compute_residuals(y, predictions)

# Simple decision tree (regression) as weak learner
def fit_tree(X, residuals, max_depth=3):
    from sklearn.tree import DecisionTreeRegressor
    tree = DecisionTreeRegressor(max_depth=max_depth)
    tree.fit(X, residuals)
    return tree

# Update the model with the new learner's predictions
def update_predictions(predictions, tree, X, learning_rate):
    tree_preds = tree.predict(X)
    return predictions + learning_rate * tree_preds

# XGBoost-like model (simplified version)
class SimplifiedXGBoost:
    def __init__(self, learning_rate=0.1, n_estimators=100, max_depth=3):
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.trees = []
        self.initial_prediction = None
    
    def fit(self, X, y):
        self.initial_prediction = initialize_predictions(y)
        predictions = np.full_like(y, self.initial_prediction, dtype=np.float32)
        
        for _ in range(self.n_estimators):
            residuals = compute_gradient(y, predictions)
            tree = fit_tree(X, residuals, self.max_depth)
            self.trees.append(tree)
            predictions = update_predictions(predictions, tree, X, self.learning_rate)
    
    def predict(self, X):
        predictions = np.full_like(X[:, 0], self.initial_prediction, dtype=np.float32)
        for tree in self.trees:
            predictions = update_predictions(predictions, tree, X, self.learning_rate)
        return predictions


In [38]:
import numpy as np

class LinearRegressionScratch:
    def __init__(self):
        self.weights = None
    
    def fit(self, X, y):
        """
        Fit the linear regression model using the Normal Equation.
        X: Feature matrix (n_samples, n_features)
        y: Target vector (n_samples,)
        """
        # Add bias term (intercept) to X
        X = np.c_[np.ones(X.shape[0]), X]  # Add a column of ones to X
        
        # Compute weights using the Normal Equation
        self.weights = np.linalg.inv(X.T @ X) @ X.T @ y
    
    def predict(self, X):
        """
        Predict target values using the fitted model.
        X: Feature matrix (n_samples, n_features)
        """
        # Add bias term (intercept) to X
        X = np.c_[np.ones(X.shape[0]), X]
        
        # Compute predictions
        return X @ self.weights


In [40]:
# Generating a simple regression dataset
from sklearn.datasets import make_regression

# Generate data
X, y = make_regression(n_samples=100, n_features=5, noise=0.1)

# Train the simplified XGBoost model
model = SimplifiedXGBoost(learning_rate=0.1, n_estimators=50, max_depth=3)
model.fit(X, y)

# Make predictions
predictions = model.predict(X)

# Evaluate the model
from sklearn.metrics import mean_squared_error
print(f"Mean Squared Error: {mean_squared_error(y, predictions)}")


Mean Squared Error: 220.3890651617063


In [42]:
def r2_score(y_true, y_pred):
    total_variance = np.sum((y_true - np.mean(y_true)) ** 2)
    residual_variance = np.sum((y_true - y_pred) ** 2)
    return 1 - (residual_variance / total_variance)
# Compute R-squared
r2 = r2_score(y, predictions)
print(f"R-squared: {r2}")


R-squared: 0.9902311212685999
