# Logistic Regression (Binary Classification) from Scratch
**Objective:** Implement Binary Logistic Regression using only NumPy (Batch Gradient Descent), including evaluation metrics and decision boundary visualization.

## Setup

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Optional: pandas for display if useful, though main logic is pure numpy
# import pandas as pd

# Seed NumPy's global (legacy) RNG so every np.random.* call in this notebook
# produces the same values on each run.
np.random.seed(42)

## Problem Setup
**Logistic Regression** predicts the probability that an instance belongs to the positive class (label 1); the probability of class 0 is its complement.

**Model:**
$$z = Xw + b$$
$$\hat{y} = \sigma(z) = \frac{1}{1 + e^{-z}}$$

**Cost Function (Binary Cross-Entropy / Log-Loss):**
$$J(w, b) = -\frac{1}{m} \sum_{i=1}^{m} [y^{(i)} \log(\hat{y}^{(i)}) + (1 - y^{(i)}) \log(1 - \hat{y}^{(i)})]$$

**Gradients:**
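The derivatives used by Gradient Descent have exactly the same form as in Linear Regression (with the $\frac{1}{2m}$-scaled MSE loss), because the derivative of the Sigmoid cancels against the derivative of the Cross-Entropy, leaving $\frac{\partial J}{\partial z} = \frac{1}{m}(\hat{y} - y)$: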
*   $\frac{\partial J}{\partial w} = \frac{1}{m} X^T (\hat{y} - y)$
*   $\frac{\partial J}{\partial b} = \frac{1}{m} \sum_{i=1}^{m} (\hat{y}^{(i)} - y^{(i)})$

## Data

In [None]:
# Synthetic dataset: two 2-D Gaussian blobs, one per class.
n_samples = 200
n_per_class = n_samples // 2

# Class 0: standard normal draws scaled by 1.5 and shifted to (2, 2), labelled 0.
X0 = 1.5 * np.random.randn(n_per_class, 2) + [2, 2]
y0 = np.zeros((n_per_class, 1))

# Class 1: same spread, shifted to (6, 6), labelled 1.
X1 = 1.5 * np.random.randn(n_per_class, 2) + [6, 6]
y1 = np.ones((n_per_class, 1))

# Stack the two classes row-wise; labels stay as an (n_samples, 1) column.
X = np.concatenate([X0, X1], axis=0)
y = np.concatenate([y0, y1], axis=0)

# Shuffle features and labels with one shared permutation so rows stay paired.
indices = np.arange(n_samples)
np.random.shuffle(indices)
X, y = X[indices], y[indices]

# 80/20 train/test split on the shuffled rows.
split_idx = int(0.8 * n_samples)
X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

# Scatter plot of the training points, one colour per class.
train_labels = y_train.flatten()
plt.figure(figsize=(8, 6))
for class_id, colour, legend_text in (
    (0, 'red', 'Class 0 (Train)'),
    (1, 'blue', 'Class 1 (Train)'),
):
    class_points = X_train[train_labels == class_id]
    plt.scatter(class_points[:, 0], class_points[:, 1], color=colour, label=legend_text)
plt.title('Synthetic Binary Classification Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## Implementation (NumPy)

In [None]:
def sigmoid(z):
    """Numerically stable logistic function ``1 / (1 + exp(-z))``.

    Args:
        z: Scalar or array-like of logits.

    Returns:
        The element-wise sigmoid as float: a NumPy scalar for scalar input,
        otherwise an ndarray with the same shape as ``z``.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) always lies in (0, 1], so it cannot overflow; the naive
    # exp(-z) overflows to inf (with a RuntimeWarning) for very negative z.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)) -- same value.
    out = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # np.where returns a 0-d array for scalar input; unwrap it to a scalar.
    return out[()] if out.ndim == 0 else out

def predict_proba(X, w, b):
    """Return the model's probability of class 1 for every row of ``X``.

    Computes the linear score ``X.w + b`` and passes it through ``sigmoid``.

    Args:
        X: Feature matrix (one sample per row).
        w: Weight vector; ``fit_gd`` produces it with shape (n, 1).
        b: Scalar bias.
    """
    logits = np.dot(X, w) + b
    return sigmoid(logits)

def predict(X, w, b, threshold=0.5):
    """Return hard 0/1 labels for every row of ``X``.

    A sample is labelled 1 when its predicted probability is greater than or
    equal to ``threshold``, otherwise 0. The result is an integer array.
    """
    is_positive = predict_proba(X, w, b) >= threshold
    return is_positive.astype(int)

def log_loss(y_true, y_prob):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Args:
        y_true: Array of 0/1 labels.
        y_prob: Array of predicted probabilities for class 1.

    Returns:
        The average of ``-[y*log(p) + (1-y)*log(1-p)]`` over all elements.
    """
    # Keep probabilities strictly inside (0, 1) so neither log sees 0.
    tiny = 1e-15
    p = np.clip(y_prob, tiny, 1 - tiny)
    positive_term = y_true * np.log(p)
    negative_term = (1 - y_true) * np.log(1 - p)
    return -np.mean(positive_term + negative_term)

def accuracy(y_true, y_pred):
    """Fraction of predictions that equal the true labels.

    Args:
        y_true: Array-like of true labels.
        y_pred: Array-like of predicted labels.

    Returns:
        The mean of the element-wise equality between the two inputs.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Same number of labels but different layout, e.g. (m, 1) vs (m,): a plain
    # `==` would broadcast to an (m, m) matrix and yield a meaningless score,
    # so compare the flattened arrays element by element instead.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_true, y_pred = y_true.ravel(), y_pred.ravel()
    return np.mean(y_true == y_pred)

def precision_recall_f1(y_true, y_pred):
    """Compute precision, recall and F1 for the positive class (label 1).

    Args:
        y_true: Array-like of true 0/1 labels.
        y_pred: Array-like of predicted 0/1 labels.

    Returns:
        Tuple ``(precision, recall, f1)``. Any metric whose denominator is
        zero is reported as 0.0.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Same number of labels but different layout, e.g. (m, 1) vs (m,): the
    # boolean `&` below would broadcast to (m, m) and inflate every count,
    # so align the arrays element by element first.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_true, y_pred = y_true.ravel(), y_pred.ravel()

    actual_pos = y_true == 1
    predicted_pos = y_pred == 1
    tp = np.sum(actual_pos & predicted_pos)
    fp = np.sum((y_true == 0) & predicted_pos)
    fn = np.sum(actual_pos & (y_pred == 0))

    # Fall back to 0.0 (float, like the normal path) when a ratio is undefined.
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0.0
    return precision, recall, f1

def fit_gd(X, y, lr=0.1, epochs=1000, normalize=True):
    """Train binary logistic regression with batch gradient descent.

    Args:
        X: Feature matrix of shape (m, n).
        y: Binary labels, either a column of shape (m, 1) or a flat (m,) array.
        lr: Learning rate (step size) of each update.
        epochs: Number of full-batch gradient steps.
        normalize: If True, standardize each feature with the mean and
            standard deviation computed from ``X`` before training.

    Returns:
        Tuple ``(w, b, history, (mean, std))`` where ``w`` has shape (n, 1),
        ``b`` is the scalar bias, ``history`` holds the log-loss measured
        before each update, and ``(mean, std)`` are the per-feature statistics
        to apply to new data (zeros and ones when ``normalize`` is False).

    Raises:
        ValueError: If ``X`` is not 2-D or ``y`` does not hold one label per row.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (samples x features), got shape {X.shape}")
    m, n = X.shape

    # Force labels into an (m, 1) column. With a flat (m,) vector,
    # `y_prob - y` would broadcast to (m, m) and break the weight update.
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    if y.shape[0] != m:
        raise ValueError(f"X has {m} samples but y has {y.shape[0]} labels")

    history = []

    # Identity statistics by default so callers can always apply (x - mean) / std.
    mean, std = np.zeros(n), np.ones(n)
    X_train = X  # never modified in place below, so no copy is needed

    if normalize:
        mean = np.mean(X, axis=0)
        std = np.std(X, axis=0) + 1e-8  # eps guards against constant features
        X_train = (X - mean) / std

    # Start from the all-zero model.
    w = np.zeros((n, 1))
    b = 0.0

    for epoch in range(epochs):
        # Forward pass: class-1 probabilities under the current parameters.
        y_prob = predict_proba(X_train, w, b)

        # Record the loss before this epoch's update.
        history.append(log_loss(y, y_prob))

        # Gradients of the mean log-loss with respect to w and b.
        error = y_prob - y
        dw = (1 / m) * X_train.T.dot(error)
        db = (1 / m) * np.sum(error)

        # Gradient-descent step.
        w -= lr * dw
        b -= lr * db

    return w, b, history, (mean, std)

## Experiments

In [None]:
# Each config trains one model; one loss-curve subplot is drawn per config.
configs = [
    {"name": "Small LR, No Norm", "lr": 0.01, "norm": False, "epochs": 1000},
    {"name": "Med LR, Norm", "lr": 0.1, "norm": True, "epochs": 1000},
    {"name": "High LR (Unstable)", "lr": 5.0, "norm": True, "epochs": 1000}
]

# Size the figure from the number of configs (6 inches per panel) so that
# adding or removing a config does not break the subplot grid.
plt.figure(figsize=(6 * len(configs), 5))

# Most accurate normalized model seen so far, stored as (w, b, (mean, std)).
best_model = None
best_acc = 0

for i, conf in enumerate(configs):
    w, b, hist, stats = fit_gd(X_train, y_train, lr=conf['lr'], epochs=conf['epochs'], normalize=conf['norm'])

    # Evaluate on Test, applying the training-set statistics when the model
    # was trained on normalized features.
    mean, std = stats
    X_test_eval = (X_test - mean) / std if conf['norm'] else X_test

    y_pred = predict(X_test_eval, w, b)
    acc = accuracy(y_test, y_pred)
    prec, rec, f1 = precision_recall_f1(y_test, y_pred)

    print(f"Experiment: {conf['name']}")
    print(f"  Final Acc: {acc:.4f} | Prec: {prec:.2f} | Rec: {rec:.2f} | F1: {f1:.2f}")
    print(f"  Final Loss: {hist[-1]:.4f}\n")

    # Keep the best normalized model for visualization. Selection is by test
    # accuracy; the strict '>' means the earliest config wins ties.
    if conf['norm'] and (best_model is None or acc > best_acc):
        best_acc = acc
        best_model = (w, b, stats)

    # Plot this config's loss curve in its own panel.
    plt.subplot(1, len(configs), i + 1)
    plt.plot(hist)
    plt.title(f"{conf['name']} (Acc: {acc:.2f})")
    plt.xlabel('Epochs')
    plt.ylabel('Log Loss')
    plt.grid(True)

plt.tight_layout()
plt.show()

## Decision Boundary (2D)

In [None]:
if best_model is not None:
    # Parameters and normalization statistics of the selected model.
    w_star, b_star, stats_star = best_model
    mean, std = stats_star

    # Grid covering the data range with a 1-unit margin, sampled every 0.1.
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, 0.1),
        np.arange(y_min, y_max, 0.1),
    )

    # Flatten the grid to (num_points, 2) and apply the same normalization
    # that was used during training.
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    grid_points_norm = (grid_points - mean) / std

    # Classify every grid point, then fold the labels back into grid shape.
    Z = predict(grid_points_norm, w_star, b_star).reshape(xx.shape)

    plt.figure(figsize=(10, 8))
    plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')

    # Overlay all samples (train and test), one colour per class.
    all_labels = y.flatten()
    for class_id, colour, legend_text in ((0, 'red', 'Class 0'), (1, 'blue', 'Class 1')):
        class_points = X[all_labels == class_id]
        plt.scatter(class_points[:, 0], class_points[:, 1], c=colour, edgecolor='k', label=legend_text)

    plt.title("Decision Boundary (Normalized GD Model)")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.legend()
    plt.show()
else:
    print("Run experiments first to get the best model.")

## Threshold Analysis

In [None]:
# Using the best model on test set
if best_model is None:
    print("Run experiments first to get the best model.")
else:
    # Unpack the model here instead of relying on w_star / b_star / mean / std
    # left behind by the decision-boundary cell, which only defines them
    # inside its `else` branch.
    w_star, b_star, (mean, std) = best_model

    # Apply the training-set normalization statistics to the test features.
    X_test_norm = (X_test - mean) / std
    thresholds = [0.3, 0.5, 0.7]

    print("Threshold Performance Analysis:")
    print("-"*40)
    for thr in thresholds:
        # Same probabilities each time; only the decision cut-off changes.
        y_pred_thr = predict(X_test_norm, w_star, b_star, threshold=thr)
        prec, rec, _ = precision_recall_f1(y_test, y_pred_thr)
        print(f"Threshold {thr}: Precision: {prec:.2f} | Recall: {rec:.2f}")

    print("-"*40)
    print("Insight: Lowering threshold increases Recall (catches more positives) but drops Precision (more false alarms).")

## Results & Takeaways
*   **Normalization:** Crucial for efficient gradient descent, especially when features have different scales or non-zero means.
*   **Convergence:** With appropriate LR (0.1) and normalization, loss decreases smoothly.
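*   **High Learning Rate:** Can cause oscillation or overshoot even though Log-Loss is convex (unlike MSE for classification): convexity rules out bad local minima, but it does not stop an overly large step from jumping past the minimum.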
*   **Log-Loss vs MSE:** Log-Loss is the standard for classification because, combined with the Sigmoid, it is convex in $(w, b)$, so any local minimum is a global minimum; MSE combined with a Sigmoid gives a non-convex error surface.
*   **Limitations:** Linear decision boundary. Cannot solve XOR or complex non-linear separations without feature engineering.

## Next Steps
*   Explore **Support Vector Machines (SVM)** for better margin maximization and kernel tricks.
*   [Go to SVM Notebook](./svm-kernels.ipynb)