# EazyML Explainable AI Template

## Define Imports

In [None]:
!pip install --upgrade eazyml-xai
!pip install --upgrade eazyml-automl
!pip install gdown python-dotenv

In [None]:
import os
from eazyml_xai import (
    ez_init,
    ez_explain
)

from eazyml import ez_display_df
import gdown
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

from dotenv import load_dotenv
# Load key=value pairs from a local `.env` file (when one exists) into the process
# environment, so that `os.getenv('EAZYML_ACCESS_KEY')` can pick up the access key.
load_dotenv()

## 1. Initialize EazyML

The `ez_init` function uses the `EAZYML_ACCESS_KEY` environment variable for authentication. If the variable is not set, it defaults to a trial license.

In [None]:
# Look the access key up in the environment; `None` is passed through unchanged
# when the variable is not set.
access_key = os.getenv('EAZYML_ACCESS_KEY')
ez_init(access_key=access_key)

## 2. Define Dataset Files and Outcome Variable

In [None]:
# Identifier of the shared folder fetched by gdown.
# NOTE(review): presumably this creates the local `data/` directory that the
# train/test file paths point at — confirm against the remote folder name.
dataset_folder_id = '1DJtU6gI929GdEEZ3F_7w5LMnT90VvYI7'
gdown.download_folder(id=dataset_folder_id)

In [None]:
# Directory that holds the CSV files consumed by the EazyML APIs
data_dir = 'data'

# Training and test files, resolved relative to the working directory
train_file_path = os.path.join(data_dir, 'IRIS_Train.csv')
test_file_path = os.path.join(data_dir, 'IRIS_Test.csv')

# Name of the column holding the outcome of interest
outcome = 'species'

## 3. Dataset Information

The dataset used in this notebook is the **Iris Dataset**, which is a well-known dataset in machine learning and statistics. It contains data about 150 iris flowers, with four features (sepal length, sepal width, petal length, and petal width) and the species of the flower (setosa, versicolor, or virginica).

You can find more details and download the dataset from Kaggle using the following link:

[Kaggle Iris Dataset](https://www.kaggle.com/datasets/uciml/iris)

### Columns in the Dataset:
- **sepal_length**: Sepal length of the flower (cm)
- **sepal_width**: Sepal width of the flower (cm)
- **petal_length**: Petal length of the flower (cm)
- **petal_width**: Petal width of the flower (cm)
- **species**: Species of the iris flower (setosa, versicolor, virginica)

### 3.1 Display the Dataset

Below is a preview of the dataset:

In [None]:
# Read the training CSV into a DataFrame
train = pd.read_csv(train_file_path)

# Preview the first five rows (the default row count for `head`)
train.head(n=5)

## 4. Implement Preprocessing Steps in a Preprocessor Class and Apply to the Training Data

### 4.1 Implementing Preprocessing Steps within a Custom Preprocessor Class

In [None]:
class UnifiedPreprocessor:
    """Preprocess features and the outcome with transformers fitted on training data.

    Features:
      * numeric columns are mean-imputed and then standardized;
      * object-dtype columns are one-hot encoded (first category dropped).
    Outcome:
      * class labels are mapped to integer codes with a ``LabelEncoder``.

    Call ``fit`` (or ``fit_transform``) once on the training data, then reuse
    ``transform`` on any frame that has the same feature columns.
    """

    def __init__(self):
        self.numerical_imputer = SimpleImputer(strategy='mean')
        self.scaler = StandardScaler()
        # `sparse_output` replaced the `sparse` keyword in scikit-learn 1.2 and the
        # old name was removed in 1.4, where `sparse=False` raises a TypeError.
        self.categorical_encoder = OneHotEncoder(drop='first', sparse_output=False)
        self.label_encoder = LabelEncoder()
        # Retained only so the attribute keeps existing; it is never fitted or
        # used by this class (the outcome is label-encoded, not scaled).
        self.target_scaler = StandardScaler()
        self.fitted = False  # To track whether preprocessing objects are fitted

    def fit(self, X, y=None):
        """Fit the feature transformers on ``X`` and, if given, the label encoder on ``y``.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame; numeric and object-dtype columns are detected from it.
        y : array-like, optional
            Outcome labels. Needed before ``transform(X, y)`` or
            ``inverse_transform_outcome`` can be used.

        Returns
        -------
        UnifiedPreprocessor
            The fitted instance, so calls can be chained.
        """
        # Split columns into numerical and categorical
        self.numerical_columns = X.select_dtypes(include=[np.number]).columns
        self.categorical_columns = X.select_dtypes(include=[object]).columns

        # Only fit a transformer when it has at least one column to work on
        if len(self.numerical_columns) > 0:
            numeric_frame = X[self.numerical_columns]
            self.numerical_imputer.fit(numeric_frame)
            # Fit on the raw values as a plain array: `transform` feeds the scaler
            # the imputer's ndarray output, so fitting on a DataFrame would make
            # scikit-learn warn about missing feature names at transform time.
            self.scaler.fit(numeric_frame.to_numpy())
        if len(self.categorical_columns) > 0:
            self.categorical_encoder.fit(X[self.categorical_columns])

        # Fit transformer for the target variable (if provided)
        if y is not None:
            # LabelEncoder expects a 1-D array of labels
            self.label_encoder.fit(np.asarray(y).ravel())

        self.fitted = True
        return self

    def transform(self, X, y=None):
        """Apply the fitted transformers.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the feature columns seen during ``fit``.
        y : array-like, optional
            Outcome labels to encode alongside the features.

        Returns
        -------
        pandas.DataFrame or tuple
            The transformed features (indexed like ``X``); when ``y`` is given,
            a ``(features, encoded_y)`` tuple where ``encoded_y`` is a 1-D array.

        Raises
        ------
        ValueError
            If ``fit`` has not been called yet.
        """
        if not self.fitted:
            raise ValueError("Preprocessor is not fitted yet. Call 'fit' first.")

        blocks = []
        all_feature_names = []

        # Numerical features: impute missing values, then standardize
        if len(self.numerical_columns) > 0:
            X_numerical = self.numerical_imputer.transform(X[self.numerical_columns])
            blocks.append(self.scaler.transform(X_numerical))
            all_feature_names.extend(self.numerical_columns)

        # Categorical features: one-hot encode and pick up the generated names
        if len(self.categorical_columns) > 0:
            blocks.append(self.categorical_encoder.transform(X[self.categorical_columns]))
            all_feature_names.extend(
                self.categorical_encoder.get_feature_names_out(self.categorical_columns)
            )

        # Combine the transformed pieces (numeric first, then categorical)
        if blocks:
            X_transformed = np.hstack(blocks)
        else:
            X_transformed = np.empty((len(X), 0))
        X_transformed_df = pd.DataFrame(X_transformed, columns=all_feature_names, index=X.index)

        # Transform the target variable (if provided)
        if y is not None:
            y_transformed = self.label_encoder.transform(np.asarray(y).ravel())
            return X_transformed_df, y_transformed

        return X_transformed_df

    def fit_transform(self, X, y=None):
        """Fit on ``X`` (and ``y``) and return the result of ``transform(X, y)``."""
        self.fit(X, y)
        return self.transform(X, y)

    def inverse_transform_outcome(self, y):
        """
        Map encoded outcome values back to the original class labels.

        Raises
        ------
        ValueError
            If the preprocessor, or its label encoder, has not been fitted.
        """
        if not self.fitted:
            raise ValueError("Preprocessor is not fitted yet. Call 'fit' first.")
        if not hasattr(self.label_encoder, 'classes_'):
            raise ValueError("Outcome encoder is not fitted yet. Call 'fit' with 'y' first.")
        # The outcome was label-encoded in `fit`, so decoding must go through the
        # same encoder (the target scaler is never fitted).
        return self.label_encoder.inverse_transform(np.asarray(y).ravel())

### 4.2 Reading the Datasets and Dropping Unnecessary Columns

In [None]:
# Columns to leave out of modelling (none for this dataset)
discard_columns = []

# Read the training data and drop the discarded columns in one step
train = pd.read_csv(train_file_path).drop(columns=discard_columns)

### 4.3 Applying Preprocessing to the Training Data for Model Fitting

In [None]:
# Separate the feature columns from the outcome column of the training frame
X = train.drop(columns=outcome)
y = train[outcome]

# Fit the preprocessor on the training data and transform that same data;
# `fit_transform` is `fit` followed by `transform` on the same inputs.
preprocessor = UnifiedPreprocessor()
X_train_transformed, y_train_transformed = preprocessor.fit_transform(X, y)

## 5. Training Bagging Classifier Model

In [None]:
# Base learner: a decision tree with every hyperparameter spelled out explicitly
base_tree = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    max_leaf_nodes=None,
    class_weight=None,
    random_state=None,
)

# Bagging ensemble of 5 such trees with a fixed seed on the ensemble
model_name = BaggingClassifier(
    estimator=base_tree,
    n_estimators=5,
    max_samples=1.0,
    max_features=1.0,
    bootstrap=True,
    bootstrap_features=False,
    oob_score=False,
    warm_start=False,
    n_jobs=None,
    random_state=42,
    verbose=0,
)

# Train on the preprocessed features and the encoded outcome
model = model_name.fit(X_train_transformed, y_train_transformed)

## 6. Get Explanations

### 6.1 Get Explanations for Top 2 Points

In [None]:
# Ask for explanations of records 1 and 2, handing over the fitted preprocessor
# together with the trained model.
options = {
    'record_number': [1, 2],
    'preprocessor': preprocessor,
}
response = ez_explain(
    train_file_path,
    outcome,
    test_file_path,
    model,
    options=options,
)

### 6.2 Display Explanation DataFrame

In [None]:
# One row per explanation record. Passing the list of dicts straight to DataFrame
# aligns values by key rather than by position, and also copes with an empty
# list (where looking at element 0 for the column names would raise IndexError).
explanations = response['explanations']
ex_df = pd.DataFrame(explanations)
ez_display_df(ex_df)