# **Question 2 - Perceptron Experiments (Stagewise Implementation)**  
This notebook implements Question 2 in a **stagewise manner**, meaning:  
- **(a)** Best weight initialization method is found first.  
- **(b)** Then, the best threshold (θ) is determined using that initialization.  
- **(c)** Finally, the best learning rate (η) is found using the optimal init and θ values.  

---



In [2]:
# Import required libraries
import numpy as np
import os
import time
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import accuracy_score
import pandas as pd


def read_char(path):
    """Convert a text file (9x7 character image) into a 1D binary vector."""
    with open(path, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]
    data = np.array([[1 if c == '#' else 0 for c in line] for line in lines], dtype=float)
    return data.flatten()

def load_dataset(folder):
    """
    Load dataset of character text files.
    A → label +1
    others → label -1
    """
    X, y, names = [], [], []
    for fname in sorted(os.listdir(folder)):
        if not fname.endswith(".txt"):
            continue
        label = fname[0].upper()
        target = 1 if label == 'A' else -1
        x = read_char(os.path.join(folder, fname))
        X.append(x)
        y.append(target)
        names.append(fname)
    return np.array(X), np.array(y), names


## Utility Functions
These helper functions include:
- `init_weights()`: initialize weights with different strategies
- `train_one_epoch()`: single-epoch perceptron update rule
- `train_perceptron()`: multiple-epoch training
- `load_dataset()`: placeholder for your dataset loading function


In [3]:
def init_weights(mode, dim, seed=None):
    rng = np.random.default_rng(seed)
    if mode == "zeros":
        return np.zeros(dim), 0.0
    elif mode == "uniform":
        return rng.uniform(-0.5, 0.5, dim), float(rng.uniform(-0.5, 0.5))
    elif mode == "normal":
        return rng.normal(0, 0.01, dim), float(rng.normal(0, 0.01))
    elif mode == "xavier":
        std = np.sqrt(2.0 / (dim + 1.0))
        return rng.normal(0, std, dim), float(rng.normal(0, std))
    elif mode == "kaiming":
        std = np.sqrt(2.0 / dim)
        return rng.normal(0, std, dim), float(rng.normal(0, std))
    else:
        raise ValueError("Unknown initialization mode")


def train_one_epoch(X, y, w, b, lr, theta=0.0):
    errors = 0
    for xi, target in zip(X, y):
        z = np.dot(w, xi) + b
        y_pred = 1 if z >= theta else -1
        if y_pred != target:
            w += lr * target * xi
            b += lr * target
            errors += 1
    preds = np.where(np.dot(X, w.T) + b >= theta, 1, -1)
    error_rate = np.mean(preds != y)
    return w, b, errors, error_rate


def train_perceptron(X, y, w_init, b_init, lr=0.1, theta=0.0, max_epochs=100):
    w, b = w_init.copy(), float(b_init)
    epoch_updates, epoch_errors = [], []
    start_time = time.time()
    converged_epoch = None

    for epoch in range(1, max_epochs + 1):
        w, b, updates, err_rate = train_one_epoch(X, y, w, b, lr, theta)
        epoch_updates.append(updates)
        epoch_errors.append(err_rate)
        if updates == 0 and converged_epoch is None:
            converged_epoch = epoch
            break

    elapsed = time.time() - start_time
    return {
        "w": w, "b": b,
        "epoch_updates": epoch_updates,
        "epoch_errors": epoch_errors,
        "converged_epoch": converged_epoch or max_epochs,
        "elapsed": elapsed
    }


## Stagewise Experiment Function
This function runs the experiments in three stages:
1. Find the best initialization
2. Find the best threshold θ
3. Find the best learning rate η


In [4]:
def experiment_stagewise(train_folder="Characters-TrainSet",
                         test_folder="Characters-TestSet",
                         init_modes=["zeros","uniform","normal","xavier","kaiming"],
                         theta_list=[-1.0, -0.5, 0.0, 0.5, 1.0],
                         eta_list=[0.01, 0.05, 0.1, 0.3],
                         max_epochs=30,
                         seed_base=123):

    X_train, y_train, _ = load_dataset(train_folder)
    X_test, y_test, _ = load_dataset(test_folder)
    dim = X_train.shape[1]

    # -------------------------
    # (A) Initialization Stage
    # -------------------------
    print("\n=== Stage A: Finding best initialization mode ===")
    init_results = []

    for i_mode, mode in enumerate(init_modes):
        seed = seed_base + i_mode * 100
        w0, b0 = init_weights(mode, dim, seed)
        result = train_perceptron(X_train, y_train, w0, b0, lr=0.1, theta=0.0, max_epochs=max_epochs)

        y_pred_test = np.where(np.dot(X_test, result["w"]) + result["b"] >= 0.0, 1, -1)
        test_err = np.mean(y_pred_test != y_test)
        init_results.append((mode, test_err, result))

        print(f"{mode:7s} → test error={test_err*100:.2f}% conv={result['converged_epoch']}")

    best_mode, best_err, best_result = min(init_results, key=lambda x: x[1])
    print(f"\n✅ Best init mode: {best_mode} (error={best_err*100:.2f}%)")

    # -------------------------
    # (B) Threshold Stage
    # -------------------------
    print("\n=== Stage B: Finding best threshold (θ) ===")
    theta_results = []
    for theta in theta_list:
        w0, b0 = init_weights(best_mode, dim, seed_base)
        result = train_perceptron(X_train, y_train, w0, b0, lr=0.1, theta=theta, max_epochs=max_epochs)

        y_pred_test = np.where(np.dot(X_test, result["w"]) + result["b"] >= theta, 1, -1)
        test_err = np.mean(y_pred_test != y_test)
        theta_results.append((theta, test_err, result))

        print(f"θ={theta:+.2f} → test error={test_err*100:.2f}% conv={result['converged_epoch']}")

    best_theta, best_theta_err, best_theta_result = min(theta_results, key=lambda x: x[1])
    print(f"\n✅ Best θ: {best_theta:+.2f} (error={best_theta_err*100:.2f}%)")

    # -------------------------
    # (C) Learning Rate Stage
    # -------------------------
    print("\n=== Stage C: Evaluating learning rates (η) ===")
    eta_results = []
    for eta in eta_list:
        w0, b0 = init_weights(best_mode, dim, seed_base)
        result = train_perceptron(X_train, y_train, w0, b0, lr=eta, theta=best_theta, max_epochs=max_epochs)

        y_pred_test = np.where(np.dot(X_test, result["w"]) + result["b"] >= best_theta, 1, -1)
        test_err = np.mean(y_pred_test != y_test)
        eta_results.append((eta, test_err, result))

        print(f"η={eta:.3f} → test error={test_err*100:.2f}% conv={result['converged_epoch']}")

    best_eta, best_eta_err, best_eta_result = min(eta_results, key=lambda x: x[1])
    print(f"\n✅ Best η: {best_eta} (error={best_eta_err*100:.2f}%)")

    # -------------------------
    # Final Summary
    # -------------------------
    best_summary = {
        "best_init": best_mode,
        "best_theta": best_theta,
        "best_eta": best_eta,
        "final_test_error": best_eta_err,
        "converged_epoch": best_eta_result["converged_epoch"],
        "elapsed": best_eta_result["elapsed"]
    }

    print("\n=== Final Best Configuration ===")
    for k, v in best_summary.items():
        print(f"{k:15s}: {v}")
    
    return best_summary, {
        "init_results": init_results,
        "theta_results": theta_results,
        "eta_results": eta_results
    }


## Running the Experiment
Now we can run the full pipeline and observe the best initialization, threshold, and learning rate.


In [5]:
best_summary, all_results = experiment_stagewise(
    train_folder="Characters-TrainSet",
    test_folder="Characters-TestSet",
    max_epochs=30
)



=== Stage A: Finding best initialization mode ===
zeros   → test error=4.76% conv=3
uniform → test error=14.29% conv=3
normal  → test error=0.00% conv=3
xavier  → test error=0.00% conv=3
kaiming → test error=0.00% conv=3

✅ Best init mode: normal (error=0.00%)

=== Stage B: Finding best threshold (θ) ===
θ=-1.00 → test error=0.00% conv=3
θ=-0.50 → test error=9.52% conv=3
θ=+0.00 → test error=0.00% conv=3
θ=+0.50 → test error=4.76% conv=4
θ=+1.00 → test error=9.52% conv=3

✅ Best θ: -1.00 (error=0.00%)

=== Stage C: Evaluating learning rates (η) ===
η=0.010 → test error=19.05% conv=4
η=0.050 → test error=4.76% conv=3
η=0.100 → test error=0.00% conv=3
η=0.300 → test error=0.00% conv=5

✅ Best η: 0.1 (error=0.00%)

=== Final Best Configuration ===
best_init      : normal
best_theta     : -1.0
best_eta       : 0.1
final_test_error: 0.0
converged_epoch: 3
elapsed        : 0.0


## Notes
- **Best Init Mode:** chosen from five initialization methods.
- **Best θ:** selected based on minimal test error using the chosen init.
- **Best η:** chosen with the previous best parameters.
- The final summary shows total training time, convergence epoch, and test error.


Ensure we have the best params

In [6]:
# Try to use existing best_summary; if not present, compute it (this may take time)
try:
    best_summary  # if defined in notebook previously
    print("Found existing best_summary:", best_summary)
except NameError:
    print("best_summary not found — running stagewise search (this may take some time)...")
    best_summary, all_results = experiment_stagewise(
        train_folder="Characters-TrainSet",
        test_folder="Characters-TestSet",
        max_epochs=30
    )

# Extract best params
best_init = best_summary["best_init"]
best_theta = best_summary["best_theta"]
best_eta = best_summary["best_eta"]
print("\nBest config (used for next steps):", best_init, best_theta, best_eta)


Found existing best_summary: {'best_init': 'normal', 'best_theta': -1.0, 'best_eta': 0.1, 'final_test_error': np.float64(0.0), 'converged_epoch': 3, 'elapsed': 0.0}

Best config (used for next steps): normal -1.0 0.1


Important functions for this section (train perceptron already available)

In [7]:
def evaluate_model(w, b, X, y, theta):
    """Return error rate and accuracy for given weights/bias and threshold."""
    preds = np.where(np.dot(X, w) + b >= theta, 1, -1)
    err = np.mean(preds != y)
    acc = accuracy_score(y, preds)
    return {"error": err, "accuracy": acc, "preds": preds}

# -------------------------
# Adaline (Batch Gradient Descent) - manual implementation
# -------------------------
def train_adaline_batch(X, y, w_init, b_init, lr=0.01, max_epochs=1000, tol=1e-6):
    """
    Batch gradient descent Adaline that minimizes MSE:
      z = w·x + b
      error = y - z
      MSE = mean(error^2)
    Update (batch):
      w <- w + lr * mean((y - z) * x)  (note: sign depends on convention)
      b <- b + lr * mean(y - z)
    Returns dict with final w,b, mse_list, elapsed, converged_epoch
    """
    w = w_init.copy().astype(float)
    b = float(b_init)
    N = X.shape[0]
    mse_list = []
    start = time.time()
    converged_epoch = None

    for epoch in range(1, max_epochs+1):
        z = X.dot(w) + b               # shape (N,)
        errors = y - z                 # shape (N,)
        mse = np.mean(errors**2)
        mse_list.append(mse)

        # gradient of 0.5*MSE wrt w is -mean(error * x) if MSE = mean((y - z)^2)
        # We want w <- w + lr * mean(error * x)
        grad_w = np.mean((errors)[:, None] * X, axis=0)  # shape (D,)
        grad_b = np.mean(errors)

        w += lr * grad_w
        b += lr * grad_b

        if epoch > 1 and abs(mse_list[-2] - mse_list[-1]) < tol:
            converged_epoch = epoch
            break

    elapsed = time.time() - start
    return {"w": w, "b": b, "mse_list": mse_list,
            "converged_epoch": converged_epoch or max_epochs,
            "elapsed": elapsed}

# -------------------------
# Adaline via sklearn.SGDRegressor wrapper (just train & return continuous regressor)
# -------------------------
def train_adaline_sklearn(X, y, learning_rate=0.01, max_iter=1000):
    """
    Train sklearn SGDRegressor with squared loss; targets are expected as -1/+1.
    Returns trained regressor and elapsed time.
    """
    reg = SGDRegressor(loss='squared_error', learning_rate='constant', eta0=learning_rate,
                       max_iter=max_iter, tol=1e-6, random_state=42)
    start = time.time()
    reg.fit(X, y)
    elapsed = time.time() - start
    return reg, elapsed


Perceptron + Adaline

In [8]:
import numpy as np
import time
from collections import defaultdict

# === Load dataset again ===
X_train, y_train, _ = load_dataset("Characters-TrainSet")
X_test, y_test, _ = load_dataset("Characters-TestSet")

# === Use best hyperparameters from previous experiment ===
best_init = "normal"
best_theta = -1.0
best_eta = 0.1

# Initialize weights reproducibly
dim = X_train.shape[1]
seed = 999
w0, b0 = init_weights(best_init, dim, seed)

# === Train Perceptron ===
perc_res = train_perceptron(X_train, y_train, w0, b0, lr=best_eta, theta=best_theta, max_epochs=200)

# Evaluation helper
def evaluate_model(w, b, X, y, theta):
    preds = np.where(np.dot(X, w) + b >= theta, 1, -1)
    accuracy = np.mean(preds == y)
    error = 1 - accuracy
    return {"accuracy": accuracy, "error": error}

perc_train_eval = evaluate_model(perc_res["w"], perc_res["b"], X_train, y_train, best_theta)
perc_test_eval  = evaluate_model(perc_res["w"], perc_res["b"], X_test,  y_test,  best_theta)

print("=== Perceptron Results ===")
print(f"Converged Epoch: {perc_res['converged_epoch']} | Time: {perc_res['elapsed']:.4f}s")
print(f"Train Accuracy: {perc_train_eval['accuracy']*100:.2f}% | Test Accuracy: {perc_test_eval['accuracy']*100:.2f}%")

# ============================================================
#                   Adaline implementation
# ============================================================

def train_adaline(X, y, w_init, b_init, lr=0.1, theta=0.0, max_epochs=100):
    w, b = w_init.copy(), float(b_init)
    errors, losses = [], []
    start_time = time.time()

    for epoch in range(1, max_epochs + 1):
        # Linear output (no step function)
        y_net = np.dot(X, w) + b
        error = y - y_net
        w += lr * np.dot(X.T, error) / len(X)
        b += lr * np.mean(error)

        # Mean squared error
        mse = np.mean(error**2)
        losses.append(mse)

        # convergence criterion
        if mse < 1e-4:
            break

    elapsed = time.time() - start_time
    return {"w": w, "b": b, "losses": losses, "epochs": epoch, "elapsed": elapsed}

# Train Adaline
w0, b0 = init_weights(best_init, dim, seed)
adaline_res = train_adaline(X_train, y_train, w0, b0, lr=best_eta, theta=best_theta, max_epochs=200)

# Evaluate Adaline
adaline_train_eval = evaluate_model(adaline_res["w"], adaline_res["b"], X_train, y_train, best_theta)
adaline_test_eval  = evaluate_model(adaline_res["w"], adaline_res["b"], X_test,  y_test,  best_theta)

print("\n=== Adaline Results ===")
print(f"Epochs: {adaline_res['epochs']} | Time: {adaline_res['elapsed']:.4f}s")
print(f"Train Accuracy: {adaline_train_eval['accuracy']*100:.2f}% | Test Accuracy: {adaline_test_eval['accuracy']*100:.2f}%")

# ============================================================
#                   Comparison Summary
# ============================================================
import pandas as pd

summary = pd.DataFrame([
    {
        "Model": "Perceptron",
        "Train Accuracy (%)": perc_train_eval["accuracy"]*100,
        "Test Accuracy (%)": perc_test_eval["accuracy"]*100,
        "Epochs": perc_res["converged_epoch"],
        "Time (s)": perc_res["elapsed"]
    },
    {
        "Model": "Adaline",
        "Train Accuracy (%)": adaline_train_eval["accuracy"]*100,
        "Test Accuracy (%)": adaline_test_eval["accuracy"]*100,
        "Epochs": adaline_res["epochs"],
        "Time (s)": adaline_res["elapsed"]
    }
])

display(summary.round(3))


=== Perceptron Results ===
Converged Epoch: 3 | Time: 0.0000s
Train Accuracy: 100.00% | Test Accuracy: 100.00%

=== Adaline Results ===
Epochs: 200 | Time: 0.0125s
Train Accuracy: 66.67% | Test Accuracy: 33.33%


Unnamed: 0,Model,Train Accuracy (%),Test Accuracy (%),Epochs,Time (s)
0,Perceptron,100.0,100.0,3,0.0
1,Adaline,66.667,33.333,200,0.013


feature projection

In [9]:
import numpy as np
import os

def load_and_project(folder):
    """
    Reads all character text files from a folder and returns
    the projected feature matrix (row+col sums) and labels.
    """
    X, y, names = [], [], []
    for fname in sorted(os.listdir(folder)):
        if not fname.endswith(".txt"):
            continue
        path = os.path.join(folder, fname)
        label = fname[0].upper()  # e.g., 'A1.txt' → 'A'
        with open(path, "r") as f:
            lines = [list(line.strip()) for line in f.readlines() if line.strip()]
        # Convert to binary 0/1
        arr = np.array([[1 if ch == "#" else 0 for ch in row] for row in lines])
        # Compute row & column projections
        row_sum = np.sum(arr, axis=1)
        col_sum = np.sum(arr, axis=0)
        features = np.concatenate([row_sum, col_sum])
        X.append(features)
        y.append(label)
        names.append(fname)
    return np.array(X, dtype=float), np.array(y), names

# Example usage:
X_proj_train, y_proj_train, _ = load_and_project("Characters-TrainSet")
X_proj_test, y_proj_test, _ = load_and_project("Characters-TestSet")

print("Train shape:", X_proj_train.shape)
print("Example feature vector:\n", X_proj_train[0])


Train shape: (21, 16)
Example feature vector:
 [2. 1. 1. 2. 2. 5. 2. 2. 6. 1. 4. 5. 4. 4. 4. 1.]


In [10]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encode class labels (A → 0, B → 1, …)
enc = LabelEncoder()
y_train_enc = enc.fit_transform(y_proj_train)
y_test_enc  = enc.transform(y_proj_test)

# Normalize features
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_proj_train)
X_test_s  = scaler.transform(X_proj_test)


In [11]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Train Perceptron
perc_proj = Perceptron(max_iter=1000, eta0=0.1, random_state=42)
perc_proj.fit(X_train_s, y_train_enc)

# Predict
y_pred_proj = perc_proj.predict(X_test_s)

# Evaluate
acc_proj = accuracy_score(y_test_enc, y_pred_proj)
print(f"Accuracy (Projected Features): {acc_proj*100:.2f}%")
print("Confusion Matrix:\n", confusion_matrix(y_test_enc, y_pred_proj))
print("Classification Report:\n", classification_report(y_test_enc, y_pred_proj, zero_division=0))


Accuracy (Projected Features): 61.90%
Confusion Matrix:
 [[2 0 0 0 1 0 0]
 [0 2 0 0 1 0 0]
 [0 0 2 0 1 0 0]
 [0 0 0 2 0 1 0]
 [0 0 0 0 3 0 0]
 [0 0 0 0 3 0 0]
 [0 0 0 0 1 0 2]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       1.00      0.67      0.80         3
           2       1.00      0.67      0.80         3
           3       1.00      0.67      0.80         3
           4       0.30      1.00      0.46         3
           5       0.00      0.00      0.00         3
           6       1.00      0.67      0.80         3

    accuracy                           0.62        21
   macro avg       0.76      0.62      0.64        21
weighted avg       0.76      0.62      0.64        21

