In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [None]:
# Class to store and save the model details along with evaluation metrics
class ModelObject:
    """Bundle a trained model with the metadata needed to identify and audit it.

    The constructor stores its arguments unchanged as attributes of the same
    name: ``model_name``, ``model``, ``params``, ``best_params``,
    ``evaluation_metrics`` and ``version``.
    """

    def __init__(self, model_name, model, params, best_params, evaluation_metrics, version):
        self.model_name = model_name
        self.model = model
        # Reported as "Initial Parameters" by log_details().
        self.params = params
        self.best_params = best_params
        self.evaluation_metrics = evaluation_metrics
        self.version = version

    def log_details(self):
        """Return a four-line, newline-terminated summary of the stored metadata.

        Nothing is printed or logged here; the caller decides where the text goes.
        """
        return (
            f"Model: {self.model_name} (Version: {self.version})\n"
            f"Initial Parameters: {self.params}\n"
            f"Best Parameters after tuning: {self.best_params}\n"
            f"Evaluation Metrics: {self.evaluation_metrics}\n"
        )

    def save(self, save_path):
        """Serialize this whole object (model plus metadata) with ``joblib.dump``.

        Args:
            save_path: Destination passed to ``joblib.dump``. When it is a
                string or ``os.PathLike`` that contains a directory component,
                that directory is created first so saving into a folder that
                does not exist yet (e.g. ``models/x.pkl``) works.

        Note:
            The instance itself is pickled, so the ``ModelObject`` class must
            be defined/importable in the process that later loads the file.
        """
        # Only path-like targets have a parent directory; anything else is
        # handed to joblib untouched, exactly as before.
        if isinstance(save_path, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(save_path))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
        joblib.dump(self, save_path)
        print(f"Model saved at: {save_path}")

In [None]:
# Base Class for Dataset Handling
class Dataset:
    """Hold the Iris data and its train/test split.

    All attributes start as ``None``. ``load_data()`` fills ``data`` and
    ``target``; ``preprocess()`` fills ``X_train``, ``X_test``, ``y_train``
    and ``y_test``.
    """

    def __init__(self):
        self.data = None
        self.target = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def load_data(self):
        """Load the Iris dataset and keep its ``data`` and ``target`` arrays."""
        iris = load_iris()
        self.data = iris.data
        self.target = iris.target

    def preprocess(self, test_size=0.2, random_state=42):
        """Split the loaded data into train and test partitions.

        Args:
            test_size: Forwarded to ``train_test_split``; defaults to the
                previously hard-coded 0.2.
            random_state: Seed forwarded to ``train_test_split``; defaults to
                the previously hard-coded 42 so the split stays reproducible.

        Raises:
            RuntimeError: If ``load_data()`` has not populated the dataset yet.
        """
        # Fail early with an actionable message instead of passing None into
        # train_test_split.
        if self.data is None or self.target is None:
            raise RuntimeError("No data loaded: call load_data() before preprocess().")

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.data, self.target, test_size=test_size, random_state=random_state
        )

In [None]:
# Base Class for Model Selection and Tuning
class ModelSelector:
    """Tune a fixed set of classifiers, keep the most accurate one, and save it.

    Attributes:
        models: Mapping of display name to an unfitted estimator.
        best_model_object: ``ModelObject`` for the winner of the last
            ``select_model()`` call, or ``None`` before any selection.
        version: Number used for the next saved file; starts at 1 and is
            incremented by every successful ``save_best_model()`` call.
    """

    def __init__(self, random_state=42):
        """Create the candidate estimators.

        Args:
            random_state: Seed for the random forest so repeated runs give the
                same forest (and therefore the same selection).
        """
        self.models = {
            'RandomForest': RandomForestClassifier(random_state=random_state),
            'SVM': SVC(),
            'LogisticRegression': LogisticRegression(max_iter=200)
        }
        self.best_model_object = None
        self.version = 1  # Versioning starts at 1

    def hyperparameter_tuning(self, model, param_grid, X_train, y_train):
        """Grid-search ``param_grid`` for ``model`` with 5-fold cross-validation.

        Returns:
            Tuple ``(best_estimator, best_params)`` taken from the fitted
            ``GridSearchCV``.
        """
        grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)
        return grid_search.best_estimator_, grid_search.best_params_

    def select_model(self, X_train, y_train, X_test, y_test):
        """Tune every candidate and keep the one with the highest test accuracy.

        Ties keep the earlier candidate (strict ``>`` comparison, iteration in
        ``self.models`` order).

        Returns:
            The ``ModelObject`` describing the winning model; it is also stored
            in ``self.best_model_object``.

        Raises:
            ValueError: If ``self.models`` is empty, so nothing could be evaluated.
        """
        # NOTE(review): the winner is chosen on the test split, so the accuracy
        # stored for it is an optimistic estimate of generalization.
        param_grids = {
            'RandomForest': {'n_estimators': [10, 50, 100], 'max_depth': [3, 5, 7]},
            'SVM': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
            'LogisticRegression': {'C': [0.01, 0.1, 1]}
        }

        # Start below any possible accuracy so that a candidate scoring 0.0 is
        # still selectable, and build the winner locally so a stale result from
        # a previous call is never reported by mistake.
        best_score = float("-inf")
        best_candidate = None
        for model_name, model in self.models.items():
            print(f"Tuning {model_name}...")
            tuned_model, best_params = self.hyperparameter_tuning(
                model, param_grids[model_name], X_train, y_train
            )

            # Evaluate on test data
            y_pred = tuned_model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            evaluation_metrics = classification_report(y_test, y_pred, output_dict=True)

            print(f"{model_name} Test Accuracy: {accuracy}")

            if accuracy > best_score:
                best_score = accuracy
                best_candidate = ModelObject(
                    model_name=model_name,
                    model=tuned_model,
                    params=param_grids[model_name],
                    best_params=best_params,
                    evaluation_metrics={"accuracy": accuracy, "classification_report": evaluation_metrics},
                    version=self.version
                )

        if best_candidate is None:
            raise ValueError("No candidate models were evaluated; self.models is empty.")

        self.best_model_object = best_candidate
        print(f"Best Model: {self.best_model_object.model_name}")
        return self.best_model_object

    def save_best_model(self):
        """Save the selected model as ``<model_name>_v<version>.pkl``.

        The stored object's ``version`` is synchronized with the number used in
        the file name before saving, then the counter is advanced.

        Returns:
            The path that was written, or ``None`` when no model has been
            selected yet (in which case nothing is saved).
        """
        if self.best_model_object is None:
            return None

        # Keep the pickled metadata and the file name in agreement even when
        # the same selection is saved more than once.
        self.best_model_object.version = self.version
        save_path = f"{self.best_model_object.model_name}_v{self.version}.pkl"
        self.best_model_object.save(save_path)
        self.version += 1  # Increment the version for the next save
        return save_path


In [None]:
# Main AutoML Pipeline
class AutoMLPipeline:
    """Run the whole workflow: load data, split it, pick the best model, save it."""

    def __init__(self):
        self.dataset = Dataset()
        self.model_selector = ModelSelector()

    def run(self):
        """Execute the pipeline end to end.

        Steps, in order: ``dataset.load_data()``, ``dataset.preprocess()``,
        ``model_selector.select_model(...)`` on the resulting split, then
        ``model_selector.save_best_model()``.

        Returns:
            Whatever ``select_model`` returned for the winning model, so the
            caller can inspect it without reloading the saved file.
        """
        # Load and preprocess data
        print("Loading and Preprocessing Data...")
        self.dataset.load_data()
        self.dataset.preprocess()

        # Model Selection and Evaluation
        print("Selecting the best model...")
        best_model = self.model_selector.select_model(
            self.dataset.X_train, self.dataset.y_train,
            self.dataset.X_test, self.dataset.y_test
        )

        # Save the best model with versioning
        self.model_selector.save_best_model()

        return best_model


In [None]:
# Run the AutoML pipeline end to end: load Iris, split it, tune and evaluate
# the candidate models, then save the best one as "<model_name>_v<version>.pkl"
# in the current working directory.
# The guard is true when this runs as the main module (a notebook kernel or a
# script) and false when the code is imported, so importing does not train.
if __name__ == "__main__":
    # `pipeline` stays available afterwards for interactive inspection, e.g.
    # pipeline.model_selector.best_model_object.
    pipeline = AutoMLPipeline()
    pipeline.run()

In [None]:
import joblib

# Saved ModelObject to inspect (replace with your file path).
# The pipeline names files "<model_name>_v<version>.pkl", so this default only
# exists if RandomForest was the model selected on the first save.
model_file_path = 'RandomForest_v1.pkl'

# joblib.load unpickles the stored ModelObject instance: the ModelObject class
# must already be defined in this session, and only trusted files should be
# loaded because unpickling can execute arbitrary code.
loaded_model_object = joblib.load(model_file_path)

# Print the stored metadata, one field per line.
print(
    f"Model Name: {loaded_model_object.model_name}",
    f"Version: {loaded_model_object.version}",
    f"Parameters: {loaded_model_object.params}",
    f"Best Parameters: {loaded_model_object.best_params}",
    f"Evaluation Metrics: {loaded_model_object.evaluation_metrics}",
    sep="\n",
)

# The fitted estimator itself; use its own sklearn methods from here on.
model = loaded_model_object.model
print(f"Model: {model}")


#### **Pipeline Overview**
This AutoML pipeline automates the process of training, tuning, evaluating, and saving machine learning models with key information. The pipeline performs the following tasks:
1. **Data Preprocessing**: Loading the Iris dataset and splitting it into training and testing sets.
2. **Model Selection**: Training multiple machine learning models (RandomForest, SVM, and LogisticRegression), performing hyperparameter tuning, and selecting the best-performing model.
3. **Model Saving**: The best model is saved as a versioned object along with its name, parameters, best hyperparameters, evaluation metrics (accuracy and classification report), and version number.
4. **Version Control**: Each best model is saved with a version number, allowing users to keep track of different versions of the models.

#### **Key Components**

1. **`ModelObject` Class**:
   - Stores the details of the trained model, including the model name, initial and best parameters, evaluation metrics, and version number.
   - It has a `log_details()` method to log important information about the model, and a `save()` method to save the object using `joblib`.

2. **`Dataset` Class**:
   - Handles loading and splitting the data.
   - In this case, it loads the Iris dataset and splits it into training and test sets.

3. **`ModelSelector` Class**:
   - Defines the available models (`RandomForest`, `SVM`, and `LogisticRegression`).
   - Uses `GridSearchCV` to perform hyperparameter tuning on each model and selects the model with the highest test accuracy.
   - Saves the best model using the `ModelObject` class, while keeping track of the versioning.

4. **`AutoMLPipeline` Class**:
   - Orchestrates the entire process.
   - Runs data preprocessing, model selection, and model saving.

5. **Version Control**:
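   - The pipeline saves the best model with a version number, starting from 1 and incrementing with each save made by the same `ModelSelector` instance. The counter lives in memory only, so a fresh run starts again at 1 and overwrites an existing file with the same name.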

#### **How It Works**:
1. The pipeline loads the Iris dataset and splits it into training (80%) and test (20%) sets; no other preprocessing is applied.
2. Three models (`RandomForest`, `SVM`, and `LogisticRegression`) are trained using a grid search for hyperparameter tuning.
3. The model with the best accuracy is selected and its details (name, parameters, evaluation metrics) are saved in a versioned object using `joblib`.
4. Each saved model is tagged with a version number for easy version control and tracking.

# AutoML
In a typical **AutoML** workflow, steps such as **data visualization, feature engineering, attribute selection, and exploratory data analysis (EDA)** are usually treated as part of the **data scientist's or data analyst's** role rather than as part of the AutoML system itself.


### 1. **Data Visualization**:
   - **Purpose**: Data visualization is typically used to understand the distribution of features, detect outliers, and identify patterns before modeling. 
   - **Role**: It helps data scientists or analysts understand the data better and decide on potential feature transformations or selections.
   - **Why not in AutoML**: Visualization is often manual and interpretative. It’s done as a pre-processing step before applying an AutoML system.

### 2. **Feature Engineering**:
   - **Purpose**: Transforming or creating new features from raw data, such as normalizing, encoding categorical variables, or creating interaction terms.
   - **Role**: This is where data analysts and scientists come in to apply domain knowledge to craft features that will improve model performance.
   - **Why not in AutoML**: AutoML systems might automate some basic feature engineering tasks (e.g., scaling, one-hot encoding), but domain-specific feature engineering is usually performed outside the AutoML process by data scientists.

### 3. **Attribute/Feature Selection**:
   - **Purpose**: Selecting the most relevant features to improve model accuracy and avoid overfitting.
   - **Role**: Some AutoML frameworks offer automatic feature selection, but typically, data scientists analyze feature importance to make more intelligent selections.
   - **Why not in AutoML**: While automated feature selection can happen as part of AutoML, more advanced or domain-specific decisions are generally made by data scientists manually.

### 4. **Exploratory Data Analysis (EDA)**:
   - **Purpose**: Understanding the data distributions, relationships, and anomalies through statistical and graphical analysis.
   - **Role**: EDA is crucial for gaining insights into the dataset before building models. It helps in decision-making for data cleaning, feature engineering, and model selection.
   - **Why not in AutoML**: EDA is largely interpretive and manual. AutoML focuses on training, tuning, and evaluating models based on the processed data, but doesn't perform in-depth statistical or graphical analysis.

---

### **Conclusion**:
- **Data Analysts or Data Scientists** typically handle **EDA, feature engineering, attribute selection, and visualization** before feeding the data into an AutoML system. 
- **ML Engineers or AutoML Systems** take preprocessed data and focus on **model selection, hyperparameter tuning, training, evaluation, and model deployment**.

AutoML helps by automating the **model-building process** (which includes tasks like hyperparameter tuning, cross-validation, and model evaluation), but it generally expects **cleaned and pre-processed data** as input, which is where the data scientist’s role comes in.

---

### **README File**

Here's a `README.md` file for your project:

```markdown
# AutoML Pipeline for Model Selection and Versioning

## Overview

This project implements an automated machine learning (AutoML) pipeline to select the best model, tune its hyperparameters, and save it with detailed information, including model name, parameters, evaluation metrics, and version number. The pipeline uses Python and scikit-learn and is designed for simplicity and extendability.

The pipeline currently supports:
- Random Forest
- Support Vector Machine (SVM)
- Logistic Regression

The best model is saved as a versioned object, making it easy to track changes over time.

## Project Structure

```
.
├── automl_pipeline.py   # Main code file for the AutoML pipeline
├── README.md            # Project description and usage
└── <ModelName>_v<N>.pkl # Versioned model files, written to the working directory when the pipeline runs
```

## How It Works

1. **Dataset**: The pipeline uses the Iris dataset (loaded from `sklearn`).
2. **Model Selection**: The pipeline trains three models (`RandomForest`, `SVM`, and `LogisticRegression`), tunes their hyperparameters using grid search, and selects the best model based on test accuracy.
3. **Model Saving**: The best model is saved as an object, which contains:
   - Model Name
   - Initial and Best Hyperparameters
   - Evaluation Metrics (Accuracy and Classification Report)
   - Version Number
4. **Version Control**: Each saved model is assigned a version number, allowing you to keep track of different model versions easily.

## Installation

1. Clone this repository:
   ```bash
   git clone <repository-url>
   cd <repository-folder>
   ```

2. Install dependencies:
   ```bash
   pip install -r requirements.txt
   ```

   The key dependencies are:
   - `scikit-learn`
   - `joblib`

## Running the Pipeline

Run the pipeline using the following command:

```bash
python automl_pipeline.py
```

The pipeline will:
- Load and preprocess the Iris dataset.
- Train, tune, and evaluate three different models.
- Select the best-performing model and save it as a versioned object in the current working directory.

## Saved Models

Each best model is saved in the current working directory (the directory the pipeline is run from) with a filename of the form `<ModelName>_v<version>.pkl`, for example:
```
RandomForest_v1.pkl
SVM_v2.pkl
```

These files contain all the information about the model, including:
- Model Name
- Parameters and Best Parameters
- Evaluation Metrics
- Version Number

## Loading and Inspecting Saved Models

To load a saved model and inspect its contents, use the following code:

```python
import joblib

# Load the saved model
model_file = 'RandomForest_v1.pkl'  # Replace with the actual file path (files are written to the working directory)
loaded_model = joblib.load(model_file)

# Access the model details
print(f"Model Name: {loaded_model.model_name}")
print(f"Version: {loaded_model.version}")
print(f"Parameters: {loaded_model.params}")
print(f"Best Parameters: {loaded_model.best_params}")
print(f"Evaluation Metrics: {loaded_model.evaluation_metrics}")

# Access the actual trained model
trained_model = loaded_model.model
print(trained_model)
```

## Key Features

- **Automated Model Selection**: Automatically trains and tunes multiple models to select the best one.
- **Versioned Model Saving**: Each model is saved with a version number for easy tracking.
- **Hyperparameter Tuning**: Uses grid search for hyperparameter tuning to find the best configuration for each model.
- **Model Evaluation**: Reports accuracy and detailed classification metrics for the best model.

## Future Work

- Add support for additional machine learning algorithms.
- Automate feature selection and lightweight feature engineering.
- Integrate with other datasets for a more general pipeline.

## License

This project is licensed under the MIT License. Feel free to use and modify the code.
```

---

### **Explanation of the `README.md`**:
1. **Overview**: Explains the purpose and functionality of the AutoML pipeline.
2. **Installation**: Instructions to install the necessary libraries (`scikit-learn` and `joblib`).
3. **Running the Pipeline**: How to run the pipeline and what happens when you do.
4. **Saved Models**: Describes how models are saved with version control and how to load them.
5. **Key Features**: Summarizes what the pipeline offers in terms of automated model selection and version control.
6. **Future Work**: Suggests possible future extensions for the project.

This file will serve as a helpful guide for anyone using your code, detailing how to run the pipeline, save models, and reload them for inspection.

Let me know if you'd like any adjustments or additional information!