<a href="https://colab.research.google.com/github/Amulyanrao7777/ML/blob/main/lab2_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict
import matplotlib.pyplot as plt

class DecisionStump:
    """One-level decision tree used as the weak learner in AdaBoost.

    The stump compares a single feature column against a threshold and
    answers +1 or -1. All attributes start unset and are filled in by
    whoever trains the stump.
    """

    def __init__(self):
        self.feature_name = None   # label used only when printing the rule
        self.feature_index = None  # column of X the rule looks at
        self.threshold = None      # cut point on that column
        # polarity == 1: values >= threshold vote +1; any other value flips the rule
        self.polarity = 1

    def predict(self, X):
        """Apply the stump rule to every row of X and return a float array of +1 / -1."""
        labels = np.ones(X.shape[0])
        column = X[:, self.feature_index]

        # Select the rows that fall on the "-1" side of the threshold.
        if self.polarity == 1:
            negative_side = column < self.threshold
        else:
            negative_side = column >= self.threshold
        labels[negative_side] = -1

        return labels

    def __repr__(self):
        if self.polarity == 1:
            operator = ">="
        else:
            operator = "<"
        return f"If {self.feature_name} {operator} {self.threshold} → +1 (Yes), else -1 (No)"




In [3]:
class AdaBoost:
    """AdaBoost binary classifier built from decision stumps.

    Each boosting round picks the stump with the lowest weighted error,
    gives it a vote weight (alpha), and re-weights the samples so that the
    misclassified ones count more in the next round. Labels are expected
    to be +1 / -1.

    Attributes:
        n_estimators: maximum number of boosting rounds.
        stumps: weak learners kept by the last call to ``fit``.
        alphas: vote weight of each stump, aligned with ``stumps``.
        verbose: when True, ``fit`` prints a round-by-round trace.
        training_history: one dict per stored stump with the keys
            'round', 'stump', 'alpha', 'error' and 'weights'.
    """

    # Finite stand-in for the alpha of a zero-error stump (the formula diverges).
    _PERFECT_ALPHA = 10.0

    def __init__(self, n_estimators=5, verbose=True):
        self.n_estimators = n_estimators
        self.stumps = []
        self.alphas = []
        self.verbose = verbose
        self.training_history = []

    def fit(self, X, y, feature_names=None):
        """
        Train AdaBoost classifier

        Any model left over from a previous call is discarded first, so
        fitting the same instance twice does not accumulate stumps.

        Parameters:
        X: feature matrix (n_samples, n_features)
        y: labels (+1 or -1)
        feature_names: list of feature names for display

        Returns:
        self, so calls can be chained.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples, n_features = X.shape

        # Sample weights restart from uniform below, so stumps from an earlier
        # fit would not belong to this boosting sequence: start from scratch.
        self.stumps = []
        self.alphas = []
        self.training_history = []

        # Initialize weights uniformly
        weights = np.ones(n_samples) / n_samples

        if feature_names is None:
            feature_names = [f"Feature_{i}" for i in range(n_features)]

        for t in range(self.n_estimators):
            round_num = t + 1
            if self.verbose:
                print(f"\n{'='*80}")
                print(f"ROUND {round_num}")
                print(f"{'='*80}")
                print(f"Current weights: {weights}")

            # Find best stump
            stump, min_error, best_predictions = self._find_best_stump(
                X, y, weights, feature_names
            )

            if self.verbose:
                print(f"\nBest stump: {stump}")
                print(f"Weighted error: ε_{round_num} = {min_error:.4f}")

            # Check for perfect classifier
            if min_error == 0:
                if self.verbose:
                    print(f"\nPerfect classifier found! Error = 0")
                    print(f"α_{round_num} → ∞, stopping AdaBoost")
                # A perfect stump dominates every vote; use a large finite alpha.
                alpha = self._PERFECT_ALPHA
                self.stumps.append(stump)
                self.alphas.append(alpha)
                # Weights are not updated in this branch; record them as they are
                # so the history stays aligned with the stored stumps.
                self.training_history.append({
                    'round': round_num,
                    'stump': stump,
                    'alpha': alpha,
                    'error': min_error,
                    'weights': weights.copy()
                })
                break

            # Check if error is too high (random guessing or worse)
            if min_error >= 0.5:
                if self.verbose:
                    print(f"\nError >= 0.5, stopping (weak learner not better than random)")
                break

            # Calculate alpha (model weight); the epsilon guards the division
            # against a vanishingly small error.
            alpha = 0.5 * np.log((1 - min_error) / (min_error + 1e-10))

            if self.verbose:
                print(f"α_{round_num} = 0.5 × ln((1-{min_error:.4f})/{min_error:.4f}) = {alpha:.4f}")
                print(f"e^(+α_{round_num}) ≈ {np.exp(alpha):.4f}")
                print(f"e^(-α_{round_num}) ≈ {np.exp(-alpha):.4f}")

            # Update weights
            weights = self._update_weights(weights, y, best_predictions, alpha)

            if self.verbose:
                self._print_weight_update(y, best_predictions, weights, alpha, round_num)

            # Store stump and alpha
            self.stumps.append(stump)
            self.alphas.append(alpha)

            # Store history
            self.training_history.append({
                'round': round_num,
                'stump': stump,
                'alpha': alpha,
                'error': min_error,
                'weights': weights.copy()
            })

            # Check accuracy on training set
            train_pred = self.predict(X)
            train_acc = np.mean(train_pred == y)
            if self.verbose:
                print(f"\nCumulative training accuracy after round {round_num}: {train_acc:.2%}")

        return self

    def _find_best_stump(self, X, y, weights, feature_names):
        """Find the best decision stump (weak learner).

        Exhaustively tries every feature, every candidate threshold and both
        polarities, and keeps the first combination with the strictly lowest
        weighted error.

        Returns:
        (best_stump, min_error, best_predictions). If X has no columns the
        stump and predictions are None and the error is infinity.
        """
        n_samples, n_features = X.shape
        min_error = float('inf')
        best_stump = None
        best_predictions = None

        for feature_idx in range(n_features):
            feature_values = X[:, feature_idx]
            unique_values = np.unique(feature_values)

            # Candidate thresholds: every observed value plus the midpoint
            # between each pair of consecutive observed values.
            midpoints = (unique_values[:-1] + unique_values[1:]) / 2
            thresholds = np.unique(list(unique_values) + list(midpoints))

            # Try each threshold with both polarities
            for threshold in thresholds:
                for polarity in [1, -1]:
                    predictions = np.ones(n_samples)
                    if polarity == 1:
                        predictions[feature_values < threshold] = -1
                    else:
                        predictions[feature_values >= threshold] = -1

                    # Weighted error = total weight of the misclassified samples
                    error = np.sum(weights[predictions != y])

                    # Strict '<' keeps the earliest candidate on ties
                    if error < min_error:
                        min_error = error
                        best_stump = DecisionStump()
                        best_stump.feature_index = feature_idx
                        best_stump.threshold = threshold
                        best_stump.polarity = polarity
                        best_stump.feature_name = feature_names[feature_idx]
                        best_predictions = predictions

        return best_stump, min_error, best_predictions

    def _update_weights(self, weights, y, predictions, alpha):
        """Update sample weights based on classification errors.

        Returns a new, normalized weight array; the input is not modified.
        """
        # w_i * e^(-alpha * y_i * h(x_i)): shrink correct samples, grow wrong ones
        new_weights = weights * np.exp(-alpha * y * predictions)

        # Normalize weights to sum to 1
        new_weights /= np.sum(new_weights)

        return new_weights

    def _print_weight_update(self, y, predictions, new_weights, alpha, round_num):
        """Print detailed weight update information"""
        print(f"\nWeight updates for Round {round_num}:")
        print(f"{'Row':<5} {'Correct?':<10} {'Update':<20} {'New Weight':<15}")
        print("-" * 60)

        for i in range(len(y)):
            is_correct = y[i] == predictions[i]
            correct = "✓" if is_correct else "✗"
            multiplier = np.exp(-alpha) if is_correct else np.exp(alpha)
            update_str = f"× {multiplier:.4f}"
            print(f"{i+1:<5} {correct:<10} {update_str:<20} {new_weights[i]:.4f}")

        print(f"\nWeights sum: {np.sum(new_weights):.4f}")

    def _weighted_scores(self, X):
        """Return the alpha-weighted sum of all stump votes for each row of X."""
        X = np.asarray(X)
        scores = np.zeros(X.shape[0])
        for stump, alpha in zip(self.stumps, self.alphas):
            scores += alpha * stump.predict(X)
        return scores

    def predict(self, X):
        """Make predictions using the ensemble of stumps.

        Returns the sign of the weighted vote. A score of exactly 0 (a tie,
        or an ensemble with no stumps) yields 0 rather than +1 / -1.
        """
        return np.sign(self._weighted_scores(X))

    def predict_with_scores(self, X):
        """Return both predictions and weighted scores"""
        scores = self._weighted_scores(X)
        return np.sign(scores), scores

    def print_final_model(self):
        """Print the final ensemble model"""
        print(f"\n{'='*80}")
        print("FINAL ADABOOST MODEL")
        print(f"{'='*80}")
        print(f"\nNumber of weak learners: {len(self.stumps)}")
        print(f"\nH(x) = sign(", end="")

        terms = [
            f"{alpha:.4f} × h_{i}(x)"
            for i, (_, alpha) in enumerate(zip(self.stumps, self.alphas), start=1)
        ]
        print(" + ".join(terms) + ")")

        print("\nWhere:")
        for i, stump in enumerate(self.stumps, start=1):
            print(f"  h_{i}(x): {stump}")

        print(f"\nTotal alpha weights by feature:")
        feature_weights = {}
        for stump, alpha in zip(self.stumps, self.alphas):
            fname = stump.feature_name
            feature_weights[fname] = feature_weights.get(fname, 0) + alpha

        # Largest accumulated alpha first
        for fname, total_alpha in sorted(feature_weights.items(), key=lambda x: -x[1]):
            print(f"  {fname}: {total_alpha:.4f}")

In [4]:
def encode_data(df, target_col='Illness'):
    """Return a copy of df whose target column is recoded for AdaBoost.

    'Yes' becomes +1 and 'No' becomes -1 in ``target_col``; the input frame
    is left untouched.
    """
    label_to_sign = {'Yes': 1, 'No': -1}
    encoded = df.copy()
    encoded[target_col] = encoded[target_col].map(label_to_sign)
    return encoded

In [5]:
def example_1_perfect_classifier():
    """Example 1 on noise-free data.

    Builds the five-row illness dataset, trains a 3-round AdaBoost model on
    the Age, Income and Smoking columns, then prints a per-row comparison of
    true and predicted labels, the accuracy and the final model.
    """
    rule = "=" * 80
    print(f"\n{rule}")
    print("EXAMPLE 1: PERFECT CLASSIFIER (NO NOISE)")
    print(rule)

    # In this table the ill rows are exactly the rows with Smoking > 0.
    df = pd.DataFrame({
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Age': [38, 52, 45, 29, 61],
        'Income': [420000, 360000, 780000, 300000, 500000],
        'Smoking': [0, 5, 0, 12, 8],
        'Illness': ['No', 'Yes', 'No', 'Yes', 'Yes']
    })

    print("\nDataset:")
    print(df)

    # Gender is not part of the feature matrix.
    feature_names = ['Age', 'Income', 'Smoking']
    df_encoded = encode_data(df)
    X = df_encoded[feature_names].values
    y = df_encoded['Illness'].values

    clf = AdaBoost(n_estimators=3, verbose=True)
    clf.fit(X, y, feature_names)
    predictions = clf.predict(X)

    print(f"\n{rule}")
    print("FINAL PREDICTIONS")
    print(rule)
    print(f"{'Row':<5} {'True y':<10} {'Predicted':<10} {'Correct?':<10}")
    print("-" * 40)
    for row_num, (truth, guess) in enumerate(zip(y, predictions), start=1):
        mark = "✓" if truth == guess else "✗"
        print(f"{row_num:<5} {truth:<10} {guess:<10} {mark:<10}")

    accuracy = np.mean(predictions == y)
    print(f"\nFinal Accuracy: {accuracy:.2%}")

    clf.print_final_model()

In [6]:
def example_1_with_noise():
    """Example 1 with a noisy label: row 3 is an ill non-smoker.

    Trains a 3-round AdaBoost model on Age, Income and Smoking, prints the
    per-row weighted scores and predictions, the accuracy and the final
    model.

    Returns:
    (clf, X, y, feature_names) so the caller can reuse the fitted model and
    the encoded data.
    """
    rule = "=" * 80
    print(f"\n{rule}")
    print("EXAMPLE 1: WITH NOISE (ROW 3 MODIFIED)")
    print(rule)

    df = pd.DataFrame({
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Age': [38, 52, 45, 29, 61],
        'Income': [420000, 360000, 780000, 300000, 500000],
        'Smoking': [0, 5, 0, 12, 8],
        # Third label is 'Yes' even though Smoking is 0 for that row (the noise).
        'Illness': ['No', 'Yes', 'Yes', 'Yes', 'Yes']
    })

    print("\nDataset (with noise at Row 3):")
    print(df)

    feature_names = ['Age', 'Income', 'Smoking']
    df_encoded = encode_data(df)
    X = df_encoded[feature_names].values
    y = df_encoded['Illness'].values

    # Three boosting rounds
    clf = AdaBoost(n_estimators=3, verbose=True)
    clf.fit(X, y, feature_names)

    predictions, scores = clf.predict_with_scores(X)

    print(f"\n{rule}")
    print("FINAL PREDICTIONS WITH WEIGHTED SCORES")
    print(rule)
    print(f"{'Row':<5} {'True y':<10} {'Score':<15} {'Predicted':<10} {'Correct?':<10}")
    print("-" * 60)
    for row_num, (truth, score, guess) in enumerate(zip(y, scores, predictions), start=1):
        mark = "✓" if truth == guess else "✗"
        print(f"{row_num:<5} {truth:<10} {score:<15.4f} {guess:<10} {mark:<10}")

    accuracy = np.mean(predictions == y)
    print(f"\nAccuracy after 3 rounds: {accuracy:.2%}")

    clf.print_final_model()

    return clf, X, y, feature_names

In [8]:
def continue_rounds_4_5(clf, X, y, feature_names):
    """Homework: show the model after rounds 4 and 5.

    A fresh 5-round AdaBoost model is trained from scratch on (X, y) and its
    per-row scores, predictions, accuracy and final model are printed.

    NOTE(review): the ``clf`` argument is accepted but never used here; the
    previously fitted model is not extended.
    """
    rule = "=" * 80
    print(f"\n{rule}")
    print("HOMEWORK: ROUNDS 4 AND 5")
    print(rule)

    # Train a new 5-round model (rounds 1-3 are recomputed along the way).
    clf_extended = AdaBoost(n_estimators=5, verbose=True)
    clf_extended.fit(X, y, feature_names)

    predictions, scores = clf_extended.predict_with_scores(X)

    print(f"\n{rule}")
    print("FINAL PREDICTIONS AFTER 5 ROUNDS")
    print(rule)
    print(f"{'Row':<5} {'True y':<10} {'Score':<15} {'Predicted':<10} {'Correct?':<10}")
    print("-" * 60)
    for row_num in range(1, len(y) + 1):
        truth = y[row_num - 1]
        guess = predictions[row_num - 1]
        mark = "✓" if truth == guess else "✗"
        print(f"{row_num:<5} {truth:<10} {scores[row_num - 1]:<15.4f} {guess:<10} {mark:<10}")

    accuracy = np.mean(predictions == y)
    print(f"\nFinal Accuracy after 5 rounds: {accuracy:.2%}")

    clf_extended.print_final_model()


def example_2():
    """Example 2: AdaBoost on the BMI dataset.

    Trains a model of up to five rounds on the Age, Income and BMI columns,
    then prints per-row weighted scores and predictions, the accuracy and
    the final model.
    """
    rule = "=" * 80
    print(f"\n{rule}")
    print("EXAMPLE 2: BMI DATASET")
    print(rule)

    df = pd.DataFrame({
        'Gender': ['Female', 'Male', 'Male', 'Female', 'Male'],
        'Age': [33, 57, 41, 49, 36],
        'Income': [480000, 320000, 900000, 540000, 450000],
        'BMI': [22.1, 29.5, 24.0, 31.2, 27.8],
        'Illness': ['No', 'Yes', 'No', 'Yes', 'No']
    })

    print("\nDataset:")
    print(df)

    # Gender is not part of the feature matrix.
    feature_names = ['Age', 'Income', 'BMI']
    df_encoded = encode_data(df)
    X = df_encoded[feature_names].values
    y = df_encoded['Illness'].values

    clf = AdaBoost(n_estimators=5, verbose=True)
    clf.fit(X, y, feature_names)

    predictions, scores = clf.predict_with_scores(X)

    print(f"\n{rule}")
    print("FINAL PREDICTIONS")
    print(rule)
    print(f"{'Row':<5} {'True y':<10} {'Score':<15} {'Predicted':<10} {'Correct?':<10}")
    print("-" * 60)
    for row_num, (truth, score, guess) in enumerate(zip(y, scores, predictions), start=1):
        mark = "✓" if truth == guess else "✗"
        print(f"{row_num:<5} {truth:<10} {score:<15.4f} {guess:<10} {mark:<10}")

    accuracy = np.mean(predictions == y)
    print(f"\nFinal Accuracy: {accuracy:.2%}")

    clf.print_final_model()

In [9]:
if __name__ == "__main__":
    # Example 1 on clean data
    example_1_perfect_classifier()

    print(f"\n\n{'#' * 80}\n\n")

    # Example 1 with the noisy third row (3 rounds); keep what it returns
    # so the homework step below can be fed the same data.
    clf, X, y, feature_names = example_1_with_noise()

    print(f"\n\n{'#' * 80}\n\n")

    # Homework: the same noisy data taken through rounds 4 and 5
    continue_rounds_4_5(clf, X, y, feature_names)

    print(f"\n\n{'#' * 80}\n\n")

    # Example 2 on the BMI dataset
    example_2()


EXAMPLE 1: PERFECT CLASSIFIER (NO NOISE)

Dataset:
   Gender  Age  Income  Smoking Illness
0    Male   38  420000        0      No
1  Female   52  360000        5     Yes
2    Male   45  780000        0      No
3  Female   29  300000       12     Yes
4    Male   61  500000        8     Yes

ROUND 1
Current weights: [0.2 0.2 0.2 0.2 0.2]

Best stump: If Smoking >= 2.5 → +1 (Yes), else -1 (No)
Weighted error: ε_1 = 0.0000

Perfect classifier found! Error = 0
α_1 → ∞, stopping AdaBoost

FINAL PREDICTIONS
Row   True y     Predicted  Correct?  
----------------------------------------
1     -1         -1.0       ✓         
2     1          1.0        ✓         
3     -1         -1.0       ✓         
4     1          1.0        ✓         
5     1          1.0        ✓         

Final Accuracy: 100.00%

FINAL ADABOOST MODEL

Number of weak learners: 1

H(x) = sign(10.0000 × h_1(x))

Where:
  h_1(x): If Smoking >= 2.5 → +1 (Yes), else -1 (No)

Total alpha weights by feature:
  Smoking: 10.000