### Display the versions of the libraries used for reference purposes.

In [1]:
import sys
import numpy as np
import tensorflow as tf
import sklearn
import torch
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import notebook
import os

# Report the interpreter, notebook, and library versions, one line each,
# so the environment that produced the results below is on record.
for label, version in (
    ("Python", sys.version),
    ("Jupyter Notebook", notebook.__version__),
    ("NumPy", np.__version__),
    ("TensorFlow", tf.__version__),
    ("Torch", torch.__version__),
    ("Scikit-learn", sklearn.__version__),
):
    print(f"{label} version: {version}")

# Every later cell reads from / writes to this folder; create it up front.
os.makedirs("Models and Data splits", exist_ok=True)

Python version: 3.12.9 | packaged by Anaconda, Inc. | (main, Feb  6 2025, 18:49:16) [MSC v.1929 64 bit (AMD64)]
Jupyter Notebook version: 7.3.2
NumPy version: 2.0.1
TensorFlow version: 2.19.0
Torch version: 2.6.0+cu126
Scikit-learn version: 1.6.1


### Loading the MNIST dataset, splitting it, and saving the splits

In [4]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import joblib
import os

# -------------------------
# 1. Fetch MNIST from OpenML and extract plain arrays
# -------------------------
mnist = fetch_openml('mnist_784', version=1)
X = mnist.data.values                  # shape (70000, 784)
y = mnist.target.astype(int).values    # labels cast to int

# 2. Stratified 80/20 train/test split with a fixed seed
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
(X_train_not_scaled, X_test_not_scaled,
 y_train_not_scaled, y_test_not_scaled) = split_parts

# 3. Persist the raw (unscaled) splits
raw_splits = (
    X_train_not_scaled, X_test_not_scaled,
    y_train_not_scaled, y_test_not_scaled,
)
joblib.dump(raw_splits, 'Models and Data splits/data_[ORIGINAL] Train_Test_Splits.pkl')

# 4. Rescale pixels by dividing by 255 and persist the scaled splits.
#    The labels are shared with the raw splits. The dump call stays the
#    cell's last expression so its return value is displayed.
X_train_scaled = X_train_not_scaled / 255.0
X_test_scaled = X_test_not_scaled / 255.0
joblib.dump(
    (X_train_scaled, X_test_scaled, y_train_not_scaled, y_test_not_scaled),
    'Models and Data splits/data_[SCALED] Train_Test_Splits.pkl',
)

['Models and Data splits/data_[SCALED] Train_Test_Splits.pkl']

### Training Random Forests

In [11]:
import os
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import warnings

# Seed NumPy's global RNG for reproducibility
np.random.seed(42)

# ─────────────── PATHS ────────────────────────────────────────────────────
DATA_PKL         = r"Models and Data splits/data_[SCALED] Train_Test_Splits.pkl"
OUTPUT_MODEL_DIR = "Models and Data splits"
OUTPUT_MODEL_PATH = os.path.join(OUTPUT_MODEL_DIR, "random_forest.pkl")

# Ensure output directory exists
os.makedirs(OUTPUT_MODEL_DIR, exist_ok=True)

# ─────────── Load Data ─────────────────────────────────────────────────────
# The pickle holds a 4-tuple: (X_train, X_test, y_train, y_test).
# On failure we print a short hint and re-raise instead of calling exit():
# inside a notebook exit() can shut the kernel down, whereas re-raising stops
# this cell with a full traceback and leaves the session usable.
try:
    data = joblib.load(DATA_PKL)
    X_train, X_test, y_train, y_test = data
except FileNotFoundError:
    print(f"Error: Data file not found at {DATA_PKL}. Please check the path.")
    raise
except Exception as e:
    print(f"Error loading data from {DATA_PKL}: {e}")
    raise

# Rescale only when the training pixels exceed 1.0 (i.e. look like [0, 255]).
# A warning is emitted on both paths so the run records which one was taken.
needs_scaling = X_train.max() > 1.0
if not needs_scaling:
    warnings.warn("Data is already scaled to [0,1]; proceeding.")
else:
    X_train = X_train.astype(np.float32) / 255.0
    X_test = X_test.astype(np.float32) / 255.0
    warnings.warn("Data appeared in [0,255]; normalized to [0,1].")

# ─────────── Reshape Data ──────────────────────────────────────────────────
# RandomForestClassifier expects 2D data (n_samples, n_features), so 3D
# image stacks (n_samples, 28, 28) are flattened to one row per sample;
# anything else is passed through untouched.
if X_train.ndim != 3:
    X_train_flat = X_train  # assumed to be flat already
    print("Training data is already flat.")
else:
    n_samples_train = len(X_train)
    X_train_flat = X_train.reshape(n_samples_train, -1)  # -1 infers 28*28 = 784
    print(f"Training data reshaped from {X_train.shape} to {X_train_flat.shape}")

if X_test.ndim != 3:
    X_test_flat = X_test  # assumed to be flat already
    print("Test data is already flat.")
else:
    n_samples_test = len(X_test)
    X_test_flat = X_test.reshape(n_samples_test, -1)
    print(f"Test data reshaped from {X_test.shape} to {X_test_flat.shape}")

# ─────────── Train Random Forest Classifier ────────────────────────────────
print("\nTraining Random Forest Classifier for all 10 digits...")

# Hyperparameters are gathered in one place so they are easy to tune.
rf_params = {
    "n_estimators": 100,   # number of trees in the forest
    "random_state": 42,    # fixed seed for reproducibility
    "n_jobs": -1,          # use all available CPU cores
}
rf_classifier = RandomForestClassifier(**rf_params)
rf_classifier.fit(X_train_flat, y_train)

print("Random Forest Classifier training complete.")

# ─────────── Evaluate Model ────────────────────────────────────────────────
print("\nEvaluating Random Forest Classifier...")
y_pred = rf_classifier.predict(X_test_flat)

# Overall accuracy first, then the per-class breakdown.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ─────────── Save Model ────────────────────────────────────────────────────
# Best-effort save: a failure is reported but does not stop the cell, so the
# importance listing below still runs.
try:
    joblib.dump(rf_classifier, OUTPUT_MODEL_PATH)
    print(f"\nRandom Forest Classifier saved successfully to: {OUTPUT_MODEL_PATH}")
except Exception as e:
    print(f"Error saving Random Forest Classifier: {e}")

print("\nFeature importances (top 196):")
# Rank pixels from most to least important and keep the first 196.
feature_importances = rf_classifier.feature_importances_
sorted_indices = np.flip(np.argsort(feature_importances))  # descending order
top_196_indices = sorted_indices[:196]
top_196_importances = feature_importances[top_196_indices]

# A flat pixel index maps to (row, col) on the 28-wide image grid.
for rank, (idx, importance) in enumerate(zip(top_196_indices, top_196_importances), start=1):
    row, col = divmod(idx, 28)
    print(f"{rank}. Pixel {idx} (row {row}, col {col}): {importance:.4f}")

# The model can be reloaded later with:
# loaded_rf = joblib.load(OUTPUT_MODEL_PATH)



Training data is already flat.
Test data is already flat.

Training Random Forest Classifier for all 10 digits...
Random Forest Classifier training complete.

Evaluating Random Forest Classifier...
Accuracy on test set: 0.9672

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1381
           1       0.98      0.98      0.98      1575
           2       0.96      0.97      0.97      1398
           3       0.96      0.96      0.96      1428
           4       0.97      0.96      0.96      1365
           5       0.97      0.96      0.96      1263
           6       0.97      0.98      0.98      1375
           7       0.97      0.97      0.97      1459
           8       0.96      0.96      0.96      1365
           9       0.94      0.94      0.94      1391

    accuracy                           0.97     14000
   macro avg       0.97      0.97      0.97     14000
weighted avg       0.97      0.97      0.97  

### Training LeNet

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import joblib
import numpy as np
from sklearn.metrics import accuracy_score
import os

# 1. Define LeNet architecture with Tanh activations and average pooling
class LeNet(nn.Module):
    """LeNet-style CNN with tanh activations and average pooling.

    Two 5x5 convolutions, each followed by tanh and 2x2 average pooling,
    feed three fully connected layers that produce 10 class logits.
    The first linear layer is sized for a 16x4x4 feature map, which is what
    a 1x28x28 input yields (28 -> 24 -> 12 -> 8 -> 4).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)        # 28x28 -> 24x24
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)  # halves H and W
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5)       # 12x12 -> 8x8
        # After the second pooling the map is 4x4, hence 16 * 4 * 4 inputs.
        self.fc1 = nn.Linear(16 * 4 * 4, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return class logits for a batch of single-channel 28x28 images.

        Raises:
            ValueError: if the feature map after the second pooling is not
                4x4, i.e. the input images were not 28x28.
        """
        x = self.pool(torch.tanh(self.conv1(x)))
        x = self.pool(torch.tanh(self.conv2(x)))
        # Without this guard, view(-1, 256) would silently regroup elements
        # across samples for other input sizes and return the wrong number
        # of rows instead of failing.
        if x.size(-1) != 4 or x.size(-2) != 4:
            raise ValueError("LeNet expects 28x28 inputs (4x4 feature map after pooling).")
        x = x.view(-1, 16 * 4 * 4)
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        x = self.fc3(x)
        return x

# Seed PyTorch before any weights are initialised or batches are shuffled,
# so that repeated runs start from the same state (matches the fixed seeds
# used for the data split and the random forest). Some GPU kernels may still
# be non-deterministic.
torch.manual_seed(42)

# 2. Load the scaled splits: (X_train, X_test, y_train, y_test)
X_train, X_test, y_train, y_test = joblib.load('Models and Data splits/data_[SCALED] Train_Test_Splits.pkl')

# 3. Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 4. Prepare datasets and dataloaders
batch_size = 64

# Convert numpy arrays to torch tensors and reshape for CNN input (N,1,28,28)
X_train_tensor = torch.tensor(X_train.reshape(-1,1,28,28), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.reshape(-1,1,28,28), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Only the training loader shuffles; the test loader keeps the stored order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 5. Initialize model, loss, optimizer
model = LeNet().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)  # SGD optimizer as original paper

# 6. Training loop: plain mini-batch SGD, reporting the mean per-sample
#    training loss after every epoch.
epochs = 100
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # The criterion returns a batch mean; weight it by the batch size so
        # a smaller final batch does not skew the epoch average.
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}")

# 7. Evaluation on test set
model.eval()
all_preds = []
all_labels = []

# Gradients are not needed for inference.
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        outputs = model(inputs)
        # Predicted class = index of the largest logit; moved to CPU for sklearn.
        preds = torch.argmax(outputs, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

accuracy = accuracy_score(all_labels, all_preds)
print(f"\nTest Accuracy: {accuracy:.4f}")

# 8. Save trained model as TorchScript
# The path is held in one variable so the confirmation message always names
# the file that was actually written (the old message named a different
# path and filename).
LENET_MODEL_PATH = "Models and Data splits/lenet.pt"
model = torch.jit.script(model)
torch.jit.save(model, LENET_MODEL_PATH)
print(f"Model saved to {LENET_MODEL_PATH}")

Using device: cuda
Epoch 1/100, Loss: 1.9044
Epoch 2/100, Loss: 0.6547
Epoch 3/100, Loss: 0.4120
Epoch 4/100, Loss: 0.3415
Epoch 5/100, Loss: 0.2986
Epoch 6/100, Loss: 0.2643
Epoch 7/100, Loss: 0.2352
Epoch 8/100, Loss: 0.2100
Epoch 9/100, Loss: 0.1884
Epoch 10/100, Loss: 0.1697
Epoch 11/100, Loss: 0.1540
Epoch 12/100, Loss: 0.1402
Epoch 13/100, Loss: 0.1288
Epoch 14/100, Loss: 0.1191
Epoch 15/100, Loss: 0.1105
Epoch 16/100, Loss: 0.1032
Epoch 17/100, Loss: 0.0967
Epoch 18/100, Loss: 0.0910
Epoch 19/100, Loss: 0.0863
Epoch 20/100, Loss: 0.0819
Epoch 21/100, Loss: 0.0780
Epoch 22/100, Loss: 0.0744
Epoch 23/100, Loss: 0.0714
Epoch 24/100, Loss: 0.0686
Epoch 25/100, Loss: 0.0658
Epoch 26/100, Loss: 0.0634
Epoch 27/100, Loss: 0.0613
Epoch 28/100, Loss: 0.0594
Epoch 29/100, Loss: 0.0574
Epoch 30/100, Loss: 0.0559
Epoch 31/100, Loss: 0.0542
Epoch 32/100, Loss: 0.0528
Epoch 33/100, Loss: 0.0513
Epoch 34/100, Loss: 0.0500
Epoch 35/100, Loss: 0.0487
Epoch 36/100, Loss: 0.0476
Epoch 37/100, Loss

### LeNet performance

In [6]:
import torch
import joblib
import numpy as np
from sklearn.metrics import classification_report

# Load the scaled data; only the test split is used below.
X_train_scaled, X_test_scaled, y_train, y_test = joblib.load(
    'Models and Data splits/data_[SCALED] Train_Test_Splits.pkl'
)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Convert test data to torch tensor and reshape
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

# Reshape from [N, 784] to [N, 1, 28, 28] if needed
if X_test_tensor.ndim == 2 and X_test_tensor.shape[1] == 784:
    X_test_tensor = X_test_tensor.view(-1, 1, 28, 28)

# Move input to the correct device
X_test_tensor = X_test_tensor.to(device)

# Load the scripted model directly onto the selected device.
# map_location remaps the stored tensors, so a model saved from a CUDA
# session can still be loaded when this cell falls back to the CPU.
model = torch.jit.load('Models and Data splits/lenet.pt', map_location=device)
model.to(device)
model.eval()

# Run predictions on the whole test set in a single forward pass
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, y_pred = torch.max(outputs, 1)

# Move predictions to CPU for sklearn
y_pred = y_pred.cpu().numpy()

# Print classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1381
           1       0.99      0.99      0.99      1575
           2       0.99      0.98      0.99      1398
           3       0.99      0.98      0.99      1428
           4       0.99      0.99      0.99      1365
           5       0.99      0.99      0.99      1263
           6       0.99      0.99      0.99      1375
           7       0.99      0.99      0.99      1459
           8       0.99      0.98      0.98      1365
           9       0.98      0.99      0.99      1391

    accuracy                           0.99     14000
   macro avg       0.99      0.99      0.99     14000
weighted avg       0.99      0.99      0.99     14000



### Feature Importance

In [1]:
import joblib
import os
import numpy as np

# Load the saved forest and read its per-pixel importance scores.
RF_ALL_CLASSES_PATH = os.path.normpath("Models and Data splits/random_forest.pkl")
rf_all_classes = joblib.load(RF_ALL_CLASSES_PATH)
feature_importances = rf_all_classes.feature_importances_

# NOTE(review): argsort sorts ascending, so despite the "top_" prefix these
# are the 588 pixels with the LOWEST importance. That looks intentional
# (588 = 784 - 196, the complement of the top-196 list) - confirm before
# renaming or reversing the order.
ascending_order = np.argsort(feature_importances)
top_588_indices = ascending_order[:588]
top_588_importances = feature_importances[top_588_indices]

# A flat pixel index maps to (row, col) on the 28-wide image grid.
for rank, (idx, importance) in enumerate(zip(top_588_indices, top_588_importances), start=1):
    row, col = divmod(idx, 28)
    print(f"{rank}. Pixel {idx} (row {row}, col {col}): {importance:.4f}")

1. Pixel 16 (row 0, col 16): 0.0000
2. Pixel 17 (row 0, col 17): 0.0000
3. Pixel 18 (row 0, col 18): 0.0000
4. Pixel 19 (row 0, col 19): 0.0000
5. Pixel 20 (row 0, col 20): 0.0000
6. Pixel 21 (row 0, col 21): 0.0000
7. Pixel 22 (row 0, col 22): 0.0000
8. Pixel 23 (row 0, col 23): 0.0000
9. Pixel 24 (row 0, col 24): 0.0000
10. Pixel 25 (row 0, col 25): 0.0000
11. Pixel 26 (row 0, col 26): 0.0000
12. Pixel 27 (row 0, col 27): 0.0000
13. Pixel 28 (row 1, col 0): 0.0000
14. Pixel 29 (row 1, col 1): 0.0000
15. Pixel 30 (row 1, col 2): 0.0000
16. Pixel 31 (row 1, col 3): 0.0000
17. Pixel 32 (row 1, col 4): 0.0000
18. Pixel 33 (row 1, col 5): 0.0000
19. Pixel 34 (row 1, col 6): 0.0000
20. Pixel 756 (row 27, col 0): 0.0000
21. Pixel 60 (row 2, col 4): 0.0000
22. Pixel 61 (row 2, col 5): 0.0000
23. Pixel 57 (row 2, col 1): 0.0000
24. Pixel 752 (row 26, col 24): 0.0000
25. Pixel 56 (row 2, col 0): 0.0000
26. Pixel 55 (row 1, col 27): 0.0000
27. Pixel 53 (row 1, col 25): 0.0000
28. Pixel 54 (row 