In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [13]:
# Class to store and save the model details along with evaluation metrics
class ModelObject:
    """Record bundling a tuned model with the metadata describing it.

    Attributes:
        model_name: Display name of the model.
        model: The model object itself.
        params: Parameters the tuning started from.
        best_params: Parameters chosen by tuning.
        evaluation_metrics: Metrics collected for the model.
        version: Version tag of this record.
    """

    def __init__(self, model_name, model, params, best_params, evaluation_metrics, version):
        self.model_name, self.model = model_name, model
        self.params, self.best_params = params, best_params
        self.evaluation_metrics = evaluation_metrics
        self.version = version

    def log_details(self):
        """Return a four-line, newline-terminated summary of this record."""
        summary_lines = (
            f"Model: {self.model_name} (Version: {self.version})",
            f"Initial Parameters: {self.params}",
            f"Best Parameters after tuning: {self.best_params}",
            f"Evaluation Metrics: {self.evaluation_metrics}",
        )
        return "".join(f"{line}\n" for line in summary_lines)

    def save(self, save_path):
        """Dump this whole object to ``save_path`` with joblib and print the path."""
        joblib.dump(self, save_path)
        print(f"Model saved at: {save_path}")

In [2]:
# Base Class for Dataset Handling
class Dataset:
    """Holds the Iris data and its train/test partition.

    All attributes are ``None`` until ``load_data`` (for ``data``/``target``)
    and ``preprocess`` (for the four split arrays) have been called.
    """

    def __init__(self):
        self.data = None
        self.target = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def load_data(self):
        """Load the Iris dataset into ``self.data`` and ``self.target``."""
        iris = load_iris()
        self.data = iris.data
        self.target = iris.target

    def preprocess(self, test_size=0.2, random_state=42):
        """Split the loaded data into train and test partitions.

        Args:
            test_size: Forwarded to ``train_test_split``; defaults to the
                previously hard-coded 0.2.
            random_state: Seed forwarded to ``train_test_split``; defaults to
                the previously hard-coded 42 so results stay reproducible.

        Raises:
            ValueError: If ``load_data`` has not populated the dataset yet.
        """
        # Fail with an actionable message instead of handing None to sklearn.
        if self.data is None or self.target is None:
            raise ValueError("No data loaded; call load_data() before preprocess().")

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.data, self.target, test_size=test_size, random_state=random_state
        )

In [3]:
# Base Class for Model Selection and Tuning
class ModelSelector:
    """Tunes a fixed set of candidate classifiers and keeps the best one.

    Attributes:
        models: Mapping of display name -> unfitted estimator to tune.
        best_model: Fitted estimator of the winning candidate
            (``None`` until ``select_model`` has run).
        best_model_object: ``ModelObject`` wrapping the winner
            (``None`` until ``select_model`` has run).
        version: Version tag stamped on the produced ``ModelObject``.
    """

    def __init__(self):
        self.models = {
            'RandomForest': RandomForestClassifier(),
            'SVM': SVC(),
            'LogisticRegression': LogisticRegression(max_iter=200)
        }
        self.best_model = None
        self.best_model_object = None
        self.version = 1  # Versioning starts at 1

    def _run_grid_search(self, model, param_grid, X_train, y_train):
        """Fit a 5-fold ``GridSearchCV`` for one candidate and return it."""
        grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)
        return grid_search

    def hyperparameter_tuning(self, model, param_grid, X_train, y_train):
        """Grid-search ``param_grid`` for ``model`` and return the best estimator.

        Args:
            model: Unfitted estimator to tune.
            param_grid: Parameter grid handed to ``GridSearchCV``.
            X_train: Training features.
            y_train: Training labels.

        Returns:
            The ``best_estimator_`` of the fitted grid search.
        """
        return self._run_grid_search(model, param_grid, X_train, y_train).best_estimator_

    def select_model(self, X_train, y_train, X_test=None, y_test=None):
        """Tune every candidate and return the best one as a ``ModelObject``.

        Candidates are ranked by accuracy on ``X_test``/``y_test`` when both
        are supplied. Without test data they are ranked by the grid search's
        mean cross-validated score, so the two-argument call works and the
        test split stays untouched for the final evaluation.

        Args:
            X_train: Training features.
            y_train: Training labels.
            X_test: Optional held-out features used for ranking.
            y_test: Optional held-out labels used for ranking.

        Returns:
            The ``ModelObject`` describing the winning candidate; it is also
            stored on ``self.best_model_object``.

        Raises:
            ValueError: If only one of ``X_test``/``y_test`` is given.
            RuntimeError: If there is no candidate model to select from.
        """
        if (X_test is None) != (y_test is None):
            raise ValueError("X_test and y_test must be provided together.")
        has_test_data = X_test is not None

        # Define parameter grids for each model
        param_grids = {
            'RandomForest': {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 7]},
            'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
            'LogisticRegression': {'C': [0.01, 0.1, 1]}
        }

        # Start below any reachable score so a candidate scoring 0 still wins
        # over "nothing selected".
        best_score = float("-inf")
        best_object = None
        best_estimator = None

        for model_name, model in self.models.items():
            print(f"Tuning {model_name}...")
            grid_search = self._run_grid_search(model, param_grids[model_name], X_train, y_train)
            tuned_model = grid_search.best_estimator_
            best_params = grid_search.best_params_

            # No `scoring` is passed to GridSearchCV, so this is the
            # estimator's default score averaged over the folds.
            evaluation_metrics = {"cv_score": grid_search.best_score_}

            if has_test_data:
                # Evaluate on test data
                y_pred = tuned_model.predict(X_test)
                score = accuracy_score(y_test, y_pred)
                evaluation_metrics["accuracy"] = score
                evaluation_metrics["classification_report"] = classification_report(
                    y_test, y_pred, output_dict=True
                )
                print(f"{model_name} Test Accuracy: {score}")
            else:
                score = grid_search.best_score_
                print(f"{model_name} score: {score}")

            # Strict '>' keeps the earliest candidate on ties.
            if score > best_score:
                best_score = score
                best_estimator = tuned_model
                best_object = ModelObject(
                    model_name=model_name,
                    model=tuned_model,
                    params=param_grids[model_name],
                    best_params=best_params,
                    evaluation_metrics=evaluation_metrics,
                    version=self.version
                )

        if best_object is None:
            raise RuntimeError("No candidate models were available to select from.")

        self.best_model = best_estimator
        self.best_model_object = best_object

        print(f"Best Model: {self.best_model_object.model_name}")
        return self.best_model_object


In [4]:
# Base Class for Evaluation
class Evaluator:
    """Scores a fitted model against a labelled test set."""

    def __init__(self, model):
        # Anything exposing predict(X) works; evaluate() calls nothing else on it.
        self.model = model

    def evaluate(self, X_test, y_test):
        """Predict on ``X_test`` and compare against ``y_test``.

        Returns:
            A ``(accuracy, report)`` tuple produced by ``accuracy_score`` and
            ``classification_report`` respectively.
        """
        predictions = self.model.predict(X_test)
        return (
            accuracy_score(y_test, predictions),
            classification_report(y_test, predictions),
        )


In [5]:
# Main AutoML Pipeline
class AutoMLPipeline:
    """End-to-end driver: load data, select a model, evaluate it, save it.

    Attributes:
        dataset: ``Dataset`` providing the train/test split.
        model_selector: ``ModelSelector`` that tunes and ranks candidates.
        evaluator: ``Evaluator`` for the selected model (``None`` until ``run``).
    """

    def __init__(self):
        self.dataset = Dataset()
        self.model_selector = ModelSelector()
        self.evaluator = None

    def run(self, save_path="best_model.pkl"):
        """Execute every pipeline stage in order, printing progress.

        Args:
            save_path: Where the selected model object is written with joblib;
                defaults to the previously hard-coded ``best_model.pkl``.
        """
        # Load and preprocess data
        print("Loading and Preprocessing Data...")
        self.dataset.load_data()
        self.dataset.preprocess()

        # Model Selection
        print("Selecting the best model...")
        best_model = self.model_selector.select_model(self.dataset.X_train, self.dataset.y_train)

        # Evaluation
        # select_model returns a ModelObject wrapper, which has no predict();
        # the Evaluator needs the fitted estimator stored on its .model attribute.
        print("Evaluating the best model...")
        self.evaluator = Evaluator(best_model.model)
        accuracy, report = self.evaluator.evaluate(self.dataset.X_test, self.dataset.y_test)

        print(f"Test Accuracy: {accuracy}")
        print(f"Classification Report: \n{report}")

        # Save the best model (the whole wrapper, so its metadata travels with it)
        print("Saving the best model...")
        joblib.dump(best_model, save_path)
        print(f"Model saved as {save_path}")


In [6]:
# Run the AutoML pipeline end to end when this cell/script is executed directly.
# `pipeline` is left bound at module level so later cells can inspect it.
if __name__ == "__main__":
    pipeline = AutoMLPipeline()
    pipeline.run()

Loading and Preprocessing Data...
Selecting the best model...
Tuning RandomForest...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
RandomForest score: 0.975
Tuning SVM...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
SVM score: 0.975
Tuning LogisticRegression...
Fitting 5 folds for each of 3 candidates, totalling 15 fits
LogisticRegression score: 0.975
Best Model: RandomForestClassifier(max_depth=3, n_estimators=10)
Evaluating the best model...
Test Accuracy: 1.0
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Saving the best model...
Model saved as best_model.pkl
