# GridSearchCV with Multithreading
Made by: Wilfredo Aaron Sosa Ramos

In [1]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from tensorflow.keras.datasets import mnist
import joblib
import threading
from joblib import parallel_backend

# Fetch the MNIST train/test splits as (images, labels) pairs
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Flatten every image into a single row per sample (28x28 -> 784), then
# cast to float32 and divide by 255 to normalize the pixel values
x_train = x_train.reshape(len(x_train), -1).astype('float32') / 255
x_test = x_test.reshape(len(x_test), -1).astype('float32') / 255

# Two-step estimator: feature standardization followed by a random forest
# whose seed is fixed via random_state=42
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)

# Search space for the grid search; the 'classifier__' prefix routes each
# setting to the pipeline step named 'classifier'
param_grid = dict(
    classifier__n_estimators=[50, 100, 200],
    classifier__max_depth=[None, 10, 20],
    classifier__min_samples_split=[2, 5, 10],
)

# Configure GridSearchCV with multithreading
def run_grid_search():
    """Grid-search the pipeline, save the best model, and report test results.

    Uses the module-level ``pipeline``, ``param_grid``, ``x_train``,
    ``y_train``, ``x_test`` and ``y_test``. The fitted best estimator is
    written to ``best_mnist_classifier.pkl``; the best hyperparameters and a
    classification report for the test split are printed. Returns ``None``.
    """
    search_options = {
        'estimator': pipeline,
        'param_grid': param_grid,
        'scoring': 'accuracy',
        'cv': 3,
        'n_jobs': -1,  # -1 asks for all available cores
        'verbose': 2,
    }
    searcher = GridSearchCV(**search_options)

    print("Starting GridSearchCV...")
    # Fit while joblib's 'threading' backend is the active parallel backend
    with parallel_backend('threading'):
        searcher.fit(x_train, y_train)

    # Persist the winning estimator before reporting on it
    winner = searcher.best_estimator_
    joblib.dump(winner, 'best_mnist_classifier.pkl')

    print("Best hyperparameters:", searcher.best_params_)

    # Score the winning estimator on the held-out test split
    predictions = winner.predict(x_test)
    report = classification_report(y_test, predictions)
    print("\nClassification Report:\n", report)

# Run the GridSearch in a separate thread.
# join() is called right after start(), so the calling thread blocks here
# until run_grid_search() has returned.
grid_search_thread = threading.Thread(target=run_grid_search)
grid_search_thread.start()
grid_search_thread.join()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
Starting GridSearchCV...
Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=50; total time=  28.9s
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=50; total time=  29.4s
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=50; total time=  26.6s
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  51.7s
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  52.9s
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=100; total time=  53.2s
[CV] END classifier__max_d