In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# **Data exploration**

In [16]:
# Load the Kaggle Digit Recognizer training CSV into a DataFrame.
dt = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')

In [17]:
# Preview the first rows: a `label` column followed by pixel0..pixel783.
dt.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Check the table size as (rows, columns).
dt.shape

(42000, 785)

# **Data loading and preprocessing**


In [19]:
# Fix the global NumPy seed so the shuffle below (and every later np.random
# draw, e.g. the weight initialisation) is identical on each Restart & Run All.
np.random.seed(42)

# np.array(dt) copies the frame's values, so the in-place shuffle below does
# not reorder `dt` itself.
data = np.array(dt)
m, n = data.shape  # m samples; n = 1 label column + the pixel columns
np.random.shuffle(data)  # shuffles the rows in place

# The first 1000 shuffled rows are held out for testing. After the transpose
# each column is one sample: row 0 holds the labels, rows 1..n-1 the pixels.
data_test = data[0:1000].T
Y_test = data_test[0]
X_test = data_test[1:n]
X_test = X_test / 255  # scale pixels to [0, 1] (assumes 8-bit values, 0-255)

# Every remaining row is used for training, with the same layout and scaling.
data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255  # normalize for better training

In [20]:
# Sanity check: each column of X_train is one sample's flattened pixel vector.
X_train[:,0].shape

(784,)

In [21]:
def init_param():
    """Create the initial weights and biases of the 784-128-64-10 network.

    Weights are drawn from a standard normal distribution and scaled by
    ``sqrt(2 / fan_in)`` (He initialisation, suited to ReLU layers).
    Biases start at zero.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)`` where each ``Wk`` has shape
        ``(fan_out, fan_in)`` and each ``bk`` has shape ``(fan_out, 1)``.
    """
    layer_dims = [(784, 128), (128, 64), (64, 10)]  # (fan_in, fan_out) per layer

    params = []
    for fan_in, fan_out in layer_dims:
        # Layers are built in order, so the global RNG is consumed as W1, W2, W3.
        params.append(np.random.randn(fan_out, fan_in) * np.sqrt(2.0 / fan_in))
        params.append(np.zeros((fan_out, 1)))

    W1, b1, W2, b2, W3, b3 = params
    return W1, b1, W2, b2, W3, b3

def ReLu(X):
    """Rectified linear unit: element-wise ``max(0, x)`` of ``X``."""
    rectified = np.maximum(0, X)
    return rectified

def softmax(Z):
    """Column-wise softmax of ``Z``.

    The maximum of each column is subtracted before exponentiating, so the
    largest exponent is ``exp(0)`` and large logits cannot overflow.

    Returns:
        Array shaped like ``Z`` whose columns (axis 0) each sum to 1.
    """
    col_max = np.max(Z, axis=0, keepdims=True)
    weights = np.exp(Z - col_max)
    return weights / np.sum(weights, axis=0, keepdims=True)

def forward_prop(W1, b1, W2, b2, W3, b3, X):
    """Run one forward pass through the three-layer network.

    Two ReLU hidden layers are followed by a softmax output layer.

    Args:
        W1, b1, W2, b2, W3, b3: Weights and biases of the three layers.
        X: Input batch. Assumed to hold one sample per column, as
            ``X_train`` does in this notebook.

    Returns:
        Tuple ``(Z1, A1, Z2, A2, Z3, A3)``: the pre-activation and activation
        of each layer. ``A3`` is the softmax output.
    """
    hidden1_pre = np.dot(W1, X) + b1
    hidden1_act = ReLu(hidden1_pre)

    hidden2_pre = np.dot(W2, hidden1_act) + b2
    hidden2_act = ReLu(hidden2_pre)

    logits = np.dot(W3, hidden2_act) + b3
    probs = softmax(logits)

    return hidden1_pre, hidden1_act, hidden2_pre, hidden2_act, logits, probs


def one_hot(Y, num_classes=10):
    """One-hot encode integer class labels, one column per sample.

    Args:
        Y: 1-D array-like of integer labels in ``[0, num_classes)``. The
            labels are used as indices, so they must have an integer dtype.
        num_classes: Number of classes, i.e. rows of the result. Defaults to
            10, the number of digit classes used by this notebook.

    Returns:
        Float array of shape ``(num_classes, number_of_labels)`` where column
        ``j`` is all zeros except for a 1 in row ``Y[j]``.
    """
    labels = np.asarray(Y)  # also accepts plain lists/tuples
    # Build the matrix samples-as-rows: row i gets a 1 in column labels[i].
    encoded = np.zeros((labels.size, num_classes))
    encoded[np.arange(labels.size), labels] = 1
    # Transpose so samples are columns, matching the network's activations.
    return encoded.T

def drv_relu(X):
    """Derivative of ReLU: 1.0 where ``X > 0`` and 0.0 elsewhere."""
    positive_mask = X > 0
    return positive_mask.astype(float)

def backward_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y, lambda_reg=0.01):
    """Compute the gradients of all weights and biases for one batch.

    The output error is ``A3 - one_hot(Y)``. It is propagated back through
    the two ReLU layers, and every weight gradient gets an L2 term
    ``(lambda_reg / m) * W``.

    Args:
        Z1, A1, Z2, A2, Z3, A3: Values returned by ``forward_prop``
            (``Z3`` is accepted but not used here).
        W1, W2, W3: Current weight matrices.
        X: Input batch that produced the activations.
        Y: Integer labels; ``Y.size`` is used as the batch size ``m``.
        lambda_reg: L2 regularisation strength.

    Returns:
        Tuple ``(dW1, db1, dW2, db2, dW3, db3)``.
    """
    m = Y.size
    inv_m = 1 / m
    reg_scale = lambda_reg / m

    def layer_grads(dZ, A_prev, W):
        # Batch-averaged gradient plus the L2 penalty term for the weights.
        dW = inv_m * dZ.dot(A_prev.T) + reg_scale * W
        db = inv_m * np.sum(dZ, axis=1, keepdims=True)
        return dW, db

    # Output layer
    dZ3 = A3 - one_hot(Y)
    dW3, db3 = layer_grads(dZ3, A2, W3)

    # Second hidden layer
    dZ2 = W3.T.dot(dZ3) * drv_relu(Z2)
    dW2, db2 = layer_grads(dZ2, A1, W2)

    # First hidden layer
    dZ1 = W2.T.dot(dZ2) * drv_relu(Z1)
    dW1, db1 = layer_grads(dZ1, X, W1)

    return dW1, db1, dW2, db2, dW3, db3

def update_param(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, a):
    """Apply one gradient-descent step with learning rate ``a``.

    Each parameter ``p`` is replaced by ``p - a * dp``. The inputs are not
    modified; new values are returned.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)`` of updated parameters.
    """
    params = (W1, b1, W2, b2, W3, b3)
    grads = (dW1, db1, dW2, db2, dW3, db3)
    return tuple(param - a * grad for param, grad in zip(params, grads))
    

In [22]:
def get_predictions(A3):
    """Return, for each column of ``A3``, the row index of its largest value.

    With ``A3`` being the softmax output, this is the predicted class of
    each sample.
    """
    predicted_classes = np.argmax(A3, axis=0)
    return predicted_classes

def get_accuracy(prediction, Y):
    """Print a sample of predictions next to the labels and return accuracy.

    When there are more than 10 predictions, only the first 10 predictions
    and labels are printed; otherwise both are printed in full.

    Returns:
        Fraction of positions where ``prediction == Y``, relative to ``Y.size``.
    """
    truncate = len(prediction) > 10
    shown_pred = prediction[:10] if truncate else prediction
    shown_true = Y[:10] if truncate else Y
    print("Predictions:", shown_pred, "True:", shown_true)

    accuracy = np.sum(prediction == Y) / Y.size
    print(f'Accuracy: {accuracy:.4f}')
    return accuracy

In [23]:
def gradient_descent_enhanced(X, Y, iterations=200, initial_alpha=0.01, lambda_reg=0.01):
    """Train the network with full-batch gradient descent and cosine LR decay.

    Args:
        X: Training inputs, passed unchanged to ``forward_prop``.
        Y: Integer training labels.
        iterations: Number of gradient steps. Also the period over which the
            learning rate decays.
        initial_alpha: Learning rate at iteration 0.
        lambda_reg: L2 regularisation strength passed to ``backward_prop``.

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3, accuracies)``. ``accuracies`` holds
        the training accuracy logged every 50th iteration.
    """
    params = init_param()
    accuracies = []

    for step in range(iterations):
        # Learning rate decay (cosine annealing): from initial_alpha towards 0.
        cosine_factor = 0.5 * (1 + np.cos(np.pi * step / iterations))
        learning_rate = initial_alpha * cosine_factor

        Z1, A1, Z2, A2, Z3, A3 = forward_prop(*params, X)
        W1, _, W2, _, W3, _ = params
        grads = backward_prop(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y, lambda_reg)
        params = update_param(*params, *grads, learning_rate)

        if step % 50 == 0:
            # A3 comes from the forward pass made before this step's update.
            accuracy = get_accuracy(get_predictions(A3), Y)
            accuracies.append(accuracy)
            print(f'Iteration: {step}, LR: {learning_rate:.6f}, Acc: {accuracy:.4f}')

    W1, b1, W2, b2, W3, b3 = params
    return W1, b1, W2, b2, W3, b3, accuracies

# **Test function (Cross-Validation):**

In [24]:
def enhanced_cross_validation(X, Y, k_folds=3, iterations=150, learning_rate=0.015, lambda_reg=0.01):
    """Estimate accuracy with shuffled k-fold cross-validation.

    For each fold a fresh network is trained on the other folds with
    ``gradient_descent_enhanced`` and scored on the held-out fold.

    Args:
        X: Inputs with one sample per column (columns are indexed per fold).
        Y: Integer labels, indexed with the same sample indices.
        k_folds: Number of folds.
        iterations: Training iterations per fold.
        learning_rate: Initial learning rate per fold.
        lambda_reg: L2 regularisation strength.

    Returns:
        Tuple ``(fold_accuracies, mean_accuracy)``.
    """
    from sklearn.model_selection import KFold

    splitter = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    fold_accuracies = []

    print(f"Enhanced {k_folds}-fold cross validation...")
    print(f"Parameters: {iterations} iterations, LR: {learning_rate}, Lambda: {lambda_reg}")

    # KFold splits along the first axis, so it is given samples-as-rows (X.T).
    for fold, (train_index, val_index) in enumerate(splitter.split(X.T), 1):
        # Train on this fold's training samples; the accuracy log is discarded.
        *fold_params, _ = gradient_descent_enhanced(
            X[:, train_index], Y[train_index], iterations, learning_rate, lambda_reg
        )

        # Score on the held-out samples; only the softmax output is needed.
        A3_val = forward_prop(*fold_params, X[:, val_index])[-1]
        val_accuracy = get_accuracy(get_predictions(A3_val), Y[val_index])

        print(f"Fold {fold}: {val_accuracy:.4f}")
        fold_accuracies.append(val_accuracy)

    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)
    lowest, highest = np.min(fold_accuracies), np.max(fold_accuracies)

    print("\nEnhanced Cross-Validation Results:")
    print(f"Mean Accuracy: {mean_accuracy:.4f} (±{std_accuracy:.4f})")
    print(f"Range: {lowest:.4f} - {highest:.4f}")

    return fold_accuracies, mean_accuracy



## Hyperparameter search: trying different configurations

In [25]:
def find_best_parameters(X=None, Y=None, param_combinations=None, k_folds=3):
    """Try several hyperparameter combinations and return the best one.

    Each combination is scored by the mean accuracy reported by
    ``enhanced_cross_validation``.

    Args:
        X: Inputs with one sample per column. Defaults to the notebook-level
            ``X_train`` when omitted.
        Y: Integer labels. Defaults to the notebook-level ``Y_train``.
        param_combinations: Iterable of dicts with the keys
            ``'learning_rate'``, ``'lambda_reg'`` and ``'iterations'``.
            Defaults to the four combinations listed below.
        k_folds: Number of cross-validation folds per combination.

    Returns:
        The dict from ``param_combinations`` with the highest mean accuracy.

    Raises:
        ValueError: If ``param_combinations`` is empty.
    """
    # Fall back to the notebook's training split so a bare call still works.
    if X is None:
        X = X_train
    if Y is None:
        Y = Y_train
    if param_combinations is None:
        param_combinations = [
            {'learning_rate': 0.015, 'lambda_reg': 0.01, 'iterations': 150},
            {'learning_rate': 0.02, 'lambda_reg': 0.005, 'iterations': 200},
            {'learning_rate': 0.01, 'lambda_reg': 0.02, 'iterations': 180},
            {'learning_rate': 0.012, 'lambda_reg': 0.015, 'iterations': 160}
        ]
    param_combinations = list(param_combinations)
    if not param_combinations:
        raise ValueError("param_combinations must contain at least one entry")

    # Start below any reachable accuracy so the first scored combination
    # always becomes the incumbent, even if its accuracy is 0.
    best_accuracy = -np.inf
    best_params = None

    for i, params in enumerate(param_combinations, 1):
        print(f"\n🔧 Trying parameter set {i}: {params}")
        _, mean_acc = enhanced_cross_validation(
            X, Y,
            k_folds=k_folds,
            iterations=params['iterations'],
            learning_rate=params['learning_rate'],
            lambda_reg=params['lambda_reg']
        )

        if mean_acc > best_accuracy:
            best_accuracy = mean_acc
            best_params = params

    print(f"\n🎯 Best parameters: {best_params}")
    print(f"Best cross-validation accuracy: {best_accuracy:.4f}")

    return best_params


## **TEST MODEL**

In [26]:
def test_model(X_test, Y_test, W1, b1, W2, b2, W3, b3):
    """Score trained parameters on a test set and print a short report.

    The report shows the sample count, the number of correct and wrong
    predictions, the accuracy, and the first 20 predictions next to the
    first 20 labels.

    Args:
        X_test: Test inputs, passed unchanged to ``forward_prop``.
        Y_test: Integer test labels.
        W1, b1, W2, b2, W3, b3: Trained network parameters.

    Returns:
        Tuple ``(test_accuracy, test_predictions)``.
    """
    banner = "="*60
    print("\n" + banner)
    print("🧪 MODEL TESTING ON HELD-OUT TEST SET")
    print(banner)

    # Only the softmax output (last element) of the forward pass is needed.
    A3_test = forward_prop(W1, b1, W2, b2, W3, b3, X_test)[-1]
    test_predictions = get_predictions(A3_test)

    # Count hits and misses once and reuse them in the report.
    n_correct = np.sum(test_predictions == Y_test)
    n_wrong = np.sum(test_predictions != Y_test)
    test_accuracy = n_correct / Y_test.size

    print(f"Test Samples: {Y_test.size}")
    print(f"Correct Predictions: {n_correct}")
    print(f"Wrong Predictions: {n_wrong}")
    print(f"Test Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

    print("\nSample Predictions (first 20):")
    print("Predicted:", test_predictions[:20])
    print("Actual:    ", Y_test[:20])

    return test_accuracy, test_predictions

# **Run the model** 

## Train final model on the full training split


In [27]:
# Fit the final network on the training split (X_train / Y_train). The
# returned weights are the ones evaluated on the held-out test set later.
print("\nTraining final model on full dataset...")
W1, b1, W2, b2, W3, b3, train_accuracies = gradient_descent_enhanced(
    X_train, Y_train, iterations=200, initial_alpha=0.015, lambda_reg=0.01
)




Training final model on full dataset...
Predictions: [8 4 3 4 4 3 8 6 0 3] True: [7 5 4 8 4 2 2 3 7 0]
Accuracy: 0.0730
Iteration: 0, LR: 0.015000, Acc: 0.0730
Predictions: [8 3 4 3 4 0 8 2 7 0] True: [7 5 4 8 4 2 2 3 7 0]
Accuracy: 0.5572
Iteration: 50, LR: 0.012803, Acc: 0.5572
Predictions: [7 3 4 3 4 0 2 2 7 0] True: [7 5 4 8 4 2 2 3 7 0]
Accuracy: 0.6918
Iteration: 100, LR: 0.007500, Acc: 0.6918
Predictions: [7 3 4 3 4 0 2 2 7 0] True: [7 5 4 8 4 2 2 3 7 0]
Accuracy: 0.7242
Iteration: 150, LR: 0.002197, Acc: 0.7242


In [28]:
# 3-fold cross-validation on the training split. Each fold trains a fresh
# network, so this does not touch the final model's W1..b3.
print("Running enhanced cross-validation...")
enhanced_accuracies, enhanced_mean = enhanced_cross_validation(
    X_train, Y_train, k_folds=3, iterations=150, learning_rate=0.015, lambda_reg=0.01
)

Running enhanced cross-validation...
Enhanced 3-fold cross validation...
Parameters: 150 iterations, LR: 0.015, Lambda: 0.01
Predictions: [8 2 8 7 7 4 7 8 2 7] True: [7 4 8 2 7 0 9 1 2 5]
Accuracy: 0.1209
Iteration: 0, LR: 0.015000, Acc: 0.1209
Predictions: [7 4 8 8 7 0 7 8 2 7] True: [7 4 8 2 7 0 9 1 2 5]
Accuracy: 0.4642
Iteration: 50, LR: 0.011250, Acc: 0.4642
Predictions: [7 4 8 7 7 0 7 1 2 7] True: [7 4 8 2 7 0 9 1 2 5]
Accuracy: 0.6319
Iteration: 100, LR: 0.003750, Acc: 0.6319
Predictions: [8 4 1 8 0 0 4 7 8 7] True: [5 4 2 3 5 0 0 7 8 7]
Accuracy: 0.6440
Fold 1: 0.6440
Predictions: [2 2 3 3 6 2 3 8 7 3] True: [5 4 4 2 3 0 9 1 2 5]
Accuracy: 0.1314
Iteration: 0, LR: 0.015000, Acc: 0.1314
Predictions: [5 4 4 3 3 0 9 8 2 3] True: [5 4 4 2 3 0 9 1 2 5]
Accuracy: 0.4683
Iteration: 50, LR: 0.011250, Acc: 0.4683
Predictions: [5 4 4 3 3 0 9 8 2 3] True: [5 4 4 2 3 0 9 1 2 5]
Accuracy: 0.6226
Iteration: 100, LR: 0.003750, Acc: 0.6226
Predictions: [7 8 0 7 5 8 8 0 0 4] True: [7 8 2 7 5 1 

## Test on held-out test set


In [29]:
# Evaluate the final model on the 1000 held-out samples.
print("\nTesting final model...")
test_accuracy, test_predictions = test_model(X_test, Y_test, W1, b1, W2, b2, W3, b3)

# The cross-validation folds trained for 150 iterations on two thirds of the
# training split, while the final model trained for 200 iterations on all of
# it, so the two numbers below are not directly comparable.
print("\n🎯 Final Results:")
print(f"Cross-Validation Accuracy: {enhanced_mean:.4f}")
print(f"Test Set Accuracy: {test_accuracy:.4f}")


Testing final model...

🧪 MODEL TESTING ON HELD-OUT TEST SET
Test Samples: 1000
Correct Predictions: 749
Wrong Predictions: 251
Test Accuracy: 0.7490 (74.90%)

Sample Predictions (first 20):
Predicted: [3 4 9 8 4 7 8 8 4 6 6 9 6 7 9 9 8 4 2 1]
Actual:     [3 4 9 5 4 7 8 6 5 6 6 9 6 7 9 9 6 4 2 1]

🎯 Final Results:
Cross-Validation Accuracy: 0.6601
Test Set Accuracy: 0.7490
