In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import joblib

def load_data(train_path, test_path):
    """
    Read the MNIST train/test CSV files and return scaled features with labels.

    Both files are read without a header row. In each file, column 0 is taken
    as the label and every remaining column as a feature; features are divided
    by 255.0.

    Parameters:
        train_path (str): Path to the training CSV file.
        test_path (str): Path to the testing CSV file.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as NumPy arrays.
    """
    def _features_and_labels(frame):
        # Column 0 is the label; all remaining columns are pixel values.
        pixels = frame.iloc[:, 1:].values / 255.0
        labels = frame.iloc[:, 0].values
        return pixels, labels

    print(f"Loading training data from '{train_path}'...")
    X_train, y_train = _features_and_labels(pd.read_csv(train_path, header=None))

    print(f"Loading test data from '{test_path}'...")
    X_test, y_test = _features_and_labels(pd.read_csv(test_path, header=None))

    print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")
    return X_train, y_train, X_test, y_test

def train_and_evaluate_models(X_train, y_train, X_test, y_test):
    """
    Fit three scikit-learn classifiers and score each one on the test split.

    The models are trained in a fixed order: Logistic Regression, KNN (k=3),
    then Perceptron. A banner is printed before each fit and the test
    accuracy is printed after each evaluation.

    Parameters:
        X_train, y_train: Training data and labels.
        X_test, y_test: Test data and labels.

    Returns:
        dict: Maps 'Logistic Regression', 'KNN' and 'Perceptron' to a
        (fitted_model, test_accuracy) tuple.
    """
    # Each entry: (result key, label shown in the "Training ..." banner,
    # unfitted estimator). The KNN banner label differs from its result key.
    candidates = [
        ('Logistic Regression', "Logistic Regression",
         LogisticRegression(max_iter=1000, random_state=42)),
        ('KNN', "K-Nearest Neighbors (k=3)",
         KNeighborsClassifier(n_neighbors=3)),
        ('Perceptron', "Perceptron",
         Perceptron(max_iter=1000, tol=1e-3, random_state=42)),
    ]

    results = {}
    for key, banner_label, estimator in candidates:
        print(f"\nTraining {banner_label}...")
        estimator.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, estimator.predict(X_test))
        results[key] = (estimator, accuracy)
        print(f"{key} Accuracy: {accuracy:.4f}")

    return results

def save_best_model(results, accuracy_threshold=0.90, save_path='best_model.pkl'):
    """
    Save the best performing model if it achieves or exceeds the accuracy threshold.

    The best model is the entry with the highest accuracy; on a tie, the entry
    that appears first in ``results`` wins. When ``results`` is empty there is
    nothing to compare, so a message is printed and nothing is saved.

    Parameters:
        results (dict): Maps model name to a (model_instance, accuracy) tuple.
        accuracy_threshold (float): Minimum accuracy required to save the model.
        save_path (str): File path to save the model.

    Returns:
        None
    """
    # max() raises ValueError on an empty iterable, so bail out early with a
    # clear message instead of failing with an opaque error.
    if not results:
        print("\nNo trained models were provided; nothing to save.")
        return

    best_model_name, (best_model, best_acc) = max(
        results.items(), key=lambda item: item[1][1]
    )

    print(f"\nBest model: {best_model_name} with accuracy {best_acc:.4f}")
    if best_acc >= accuracy_threshold:
        joblib.dump(best_model, save_path)
        print(f"Saved best model '{best_model_name}' to '{save_path}'")
    else:
        print(f"No model achieved the accuracy threshold of {accuracy_threshold*100:.1f}%.")

def main():
    """Run the full pipeline: load the CSVs, train and score the models, save the best one."""
    # Dataset CSV files, given as (train, test).
    dataset_paths = ('mnist_train.csv', 'mnist_test.csv')

    # load_data returns (X_train, y_train, X_test, y_test), which is the
    # positional order train_and_evaluate_models expects.
    splits = load_data(*dataset_paths)
    model_results = train_and_evaluate_models(*splits)

    # Persist the top scorer only if it meets the default accuracy threshold.
    save_best_model(model_results)

# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()


Loading training data from 'mnist_train.csv'...
Loading test data from 'mnist_test.csv'...
Training samples: 60000, Test samples: 10000

Training Logistic Regression...
Logistic Regression Accuracy: 0.9262

Training K-Nearest Neighbors (k=3)...
KNN Accuracy: 0.9705

Training Perceptron...
Perceptron Accuracy: 0.8633

Best model: KNN with accuracy 0.9705
Saved best model 'KNN' to 'best_model.pkl'


### Methods

* Loaded the MNIST dataset from CSV files containing flattened 28x28 grayscale images.
* Normalized pixel values to the range [0, 1] by dividing by 255.
* Trained three classification algorithms using their scikit-learn implementations:

  * Logistic Regression (`max_iter=1000`)
  * K-Nearest Neighbors (k=3)
  * Perceptron (`max_iter=1000`, `tol=1e-3`)
* Trained each model on 60,000 training samples and evaluated on 10,000 test samples.

---

### Results

| Model               | Test Accuracy |
| ------------------- | ------------- |
| Logistic Regression | 92.62%        |
| K-Nearest Neighbors | 97.05%        |
| Perceptron          | 86.33%        |

* The KNN model achieved the highest accuracy of 97.05% on the test set.
* The best model (KNN) was saved to `best_model.pkl` for future use.

---

### Analysis

* KNN outperformed both Logistic Regression and Perceptron on this dataset.
* Logistic Regression performed well with over 90% accuracy but was less accurate than KNN.
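* Perceptron showed the lowest accuracy. Logistic Regression is also a linear model, so linearity alone does not explain the gap; a more likely cause is the simplicity of the perceptron's mistake-driven update rule, which does not optimize a smooth, regularized loss the way Logistic Regression does.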
* The high accuracy of KNN suggests that instance-based learning is effective for MNIST digit classification.

---

### Conclusion

The K-Nearest Neighbors classifier is the best-performing model among those tested, achieving 97.05% accuracy on MNIST. Logistic Regression also performed strongly (92.62%) but was less accurate. Perceptron lagged behind at 86.33%, below the 90% target. These results demonstrate that classical ML algorithms can effectively classify handwritten digits without deep learning: two of the three models (KNN and Logistic Regression) met and exceeded the 90% accuracy target.