In [8]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

class DecisionStump:
    def __init__(self):
        self.polarity = 1
        self.feature_idx = None
        self.threshold = None
        self.alpha = None

    def predict(self, X):
        feature_values = X[:, self.feature_idx]
        return np.where(feature_values < self.threshold if self.polarity == 1 
                       else feature_values > self.threshold, -1, 1)

class AdaBoost:
    def __init__(self, n_estimators=50):
        self.n_estimators = n_estimators
        self.stumps = []
        
    def fit(self, X, y):
        n_samples, n_features = X.shape
        sample_weights = np.ones(n_samples) / n_samples
            
        # Convert y to {-1, 1} once
        y_mean = np.mean(y)
        y_ = np.where(y > y_mean, 1, -1)
        
        for _ in range(self.n_estimators):
            stump = DecisionStump()
            min_error = float('inf')
            
            # Randomly select features to consider (33% of features)
            n_features_to_try = max(1, n_features // 3)
            feature_indices = np.random.choice(n_features, n_features_to_try, replace=False)
            
            for feature_idx in feature_indices:
                feature_values = X[:, feature_idx]
                
                # Use percentile-based thresholds instead of all unique values
                thresholds = np.percentile(feature_values, [25, 50, 75])
                
                for threshold in thresholds:
                    for polarity in [-1, 1]:
                        predictions = np.where(feature_values < threshold if polarity == 1 
                                            else feature_values > threshold, -1, 1)
                        
                        error = np.sum(sample_weights * (y_ != predictions))
                        
                        if error < min_error:
                            min_error = error
                            stump.polarity = polarity
                            stump.threshold = threshold
                            stump.feature_idx = feature_idx
            
            # Calculate stump weight (alpha)
            EPS = 1e-10
            stump.alpha = 0.5 * np.log((1.0 - min_error + EPS) / (min_error + EPS))
            
            # Update sample weights
            predictions = stump.predict(X)
            sample_weights *= np.exp(-stump.alpha * y_ * predictions)
            sample_weights /= np.sum(sample_weights)  # Normalize
            
            self.stumps.append(stump)
    
    def predict(self, X):
        predictions = np.zeros(X.shape[0])
        for stump in self.stumps:
            predictions += stump.alpha * stump.predict(X)
        return np.sign(predictions)

# Load and prepare a subset of the California Housing dataset
data = fetch_california_housing()
# Use only 5000 samples for faster execution
n_samples = 5000
indices = np.random.choice(len(data.data), n_samples, replace=False)
X, y = data.data[indices], data.target[indices]

# Split and scale the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train and evaluate
adaboost = AdaBoost(n_estimators=20)  # Reduced number of estimators
adaboost.fit(X_train_scaled, y_train)

# Make predictions
y_pred_train = adaboost.predict(X_train_scaled)
y_pred_test = adaboost.predict(X_test_scaled)

# Calculate accuracy
train_accuracy = np.mean(np.where(y_train > np.mean(y_train), 1, -1) == y_pred_train)
test_accuracy = np.mean(np.where(y_test > np.mean(y_test), 1, -1) == y_pred_test)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")

Training Accuracy: 0.8057
Testing Accuracy: 0.7930
