### Install eazyml-counterfactual library

In [None]:
!pip install --upgrade eazyml-counterfactual
!pip install gdown python-dotenv

### Import Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import eazyml as ez
from eazyml_counterfactual import (
        ez_cf_inference,
        ez_init        
)
import gdown

from dotenv import load_dotenv
load_dotenv()

# Scikit-learn libraries for model building
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

### Initialize EazyML
The `ez_init` function uses the `EAZYML_ACCESS_KEY` environment variable for authentication. If the variable is not set, it defaults to a trial license.

In [None]:
# Look up the access key in the process environment; when the variable is
# absent the lookup yields None, which is handed to ez_init unchanged.
ez_init(os.environ.get('EAZYML_ACCESS_KEY'))

## 1. Download the dataset and specify the outcome variable

### 1.1. Download dataset

In [None]:
# Fetch the shared Google Drive folder that holds the example datasets.
# NOTE(review): the next cell reads the files from ./data — presumably that is
# the name of the downloaded folder; confirm after the first run.
gdown.download_folder(
    id='1p7Udh2MjKyJPxI47FS89VowAz9ZEq_hG',
)

### 1.2. Define dataset files and outcome variable

In [None]:
# Name of the column the model will learn to predict
outcome = "House_Price"

# Locations of the training and test workbooks inside the data folder
train_file = os.path.join('data', "House Price Prediction - Train Data.xlsx")
test_file = os.path.join('data', "House Price Prediction - Test Data.xlsx")

# Read both workbooks into DataFrames (training first, then test)
train_df, test_df = map(pd.read_excel, (train_file, test_file))

# Preview the top rows of the training data
ez.ez_display_df(train_df.head())

## 2. Custom Modeling with Scikit-learn

### 2.1. Unified Preprocessing Class for Regression

In [None]:
class UnifiedRegressorPreprocessor:
    """Preprocess features and target for a regression model.

    Numerical columns are mean-imputed and then standardized, object-dtype
    columns are one-hot encoded (first level dropped), and the target is
    standardized with its own scaler so that predictions can be mapped back
    to the original units via `inverse_transform_outcome`.
    """

    def __init__(self):
        self.numerical_imputer = SimpleImputer(strategy="mean")
        self.scaler = StandardScaler()
        self.categorical_encoder = self._make_dense_encoder()
        self.target_scaler = StandardScaler()
        self.fitted = False

    @staticmethod
    def _make_dense_encoder():
        """Build a dense-output one-hot encoder on old and new scikit-learn."""
        try:
            # Newer scikit-learn spells the option `sparse_output`; the old
            # `sparse` keyword was removed and raises TypeError there.
            return OneHotEncoder(drop="first", sparse_output=False)
        except TypeError:
            # Older releases only know the original `sparse` keyword.
            return OneHotEncoder(drop="first", sparse=False)

    def fit(self, X, y=None):
        """Learn the preprocessing parameters from the training data.

        Args:
            X: DataFrame of features; numeric and object-dtype columns are
                detected automatically and remembered for `transform`.
            y: Optional target values; when given, the target scaler is
                fitted as well.

        Returns:
            The fitted preprocessor itself, so calls can be chained.
        """
        self.numerical_columns = X.select_dtypes(include=[np.number]).columns
        self.categorical_columns = X.select_dtypes(include=[object]).columns

        X_num = X[self.numerical_columns]
        self.numerical_imputer.fit(X_num)
        # Fit the scaler on the imputer's output, i.e. on exactly the kind of
        # input `transform` feeds it, so statistics and input type match.
        self.scaler.fit(self.numerical_imputer.transform(X_num))
        self.categorical_encoder.fit(X[self.categorical_columns])

        if y is not None:
            self.target_scaler.fit(np.array(y).reshape(-1, 1))

        self.fitted = True
        return self

    def transform(self, X, y=None):
        """Apply the fitted transformations.

        Args:
            X: DataFrame containing the columns seen during `fit`.
            y: Optional target values to scale alongside the features.

        Returns:
            The transformed feature DataFrame (same index as `X`), or a tuple
            `(features, scaled_target)` when `y` is provided.

        Raises:
            ValueError: If `fit` has not been called yet.
        """
        if not self.fitted:
            raise ValueError("Preprocessor not fitted. Call 'fit' first.")

        X_num = self.scaler.transform(self.numerical_imputer.transform(X[self.numerical_columns]))
        X_cat = self.categorical_encoder.transform(X[self.categorical_columns])
        feature_names = list(self.numerical_columns) + list(self.categorical_encoder.get_feature_names_out())

        # Numerical block first, encoded categorical block second.
        X_transformed_df = pd.DataFrame(np.hstack((X_num, X_cat)), columns=feature_names, index=X.index)

        if y is not None:
            y_transformed = self.target_scaler.transform(np.array(y).reshape(-1, 1)).flatten()
            return X_transformed_df, y_transformed

        return X_transformed_df

    def inverse_transform_outcome(self, y):
        """Map scaled target values back to the original target scale."""
        return self.target_scaler.inverse_transform(np.array(y).reshape(-1, 1)).flatten()

    def fit_transform(self, X, y=None):
        """Fit on `X` (and `y`, if given) and return the transformed data."""
        self.fit(X, y)
        return self.transform(X, y)


### 2.2. Train and Evaluate Linear Regression Model

In [None]:
# Separate features from the target column in both datasets
X_train = train_df.drop(columns=[outcome])
y_train = train_df[outcome]
X_test = test_df.drop(columns=[outcome])
y_test = test_df[outcome]

# Learn the preprocessing on the training data only, then reuse it for the test data
preprocessor = UnifiedRegressorPreprocessor()
X_train_transformed, y_train_transformed = preprocessor.fit_transform(X_train, y_train)
X_test_transformed, y_test_transformed = preprocessor.transform(X_test, y_test)

# Fit a linear regression on the transformed features and scaled target
model = LinearRegression().fit(X_train_transformed, y_train_transformed)

# Predict in the scaled target space, then map back to the original units
y_pred_transformed = model.predict(X_test_transformed)
y_pred = preprocessor.inverse_transform_outcome(y_pred_transformed)

# Copy of the test data with the predictions attached as an extra column
predicted_df = test_df.assign(**{f"Predicted {outcome}": y_pred})

# Show the first ten predictions
print("\nTest DataFrame with Predictions:")
display(predicted_df.head(10))

# Error metrics computed on the original (unscaled) target
metrics = {
    "RMSE": np.sqrt(mean_squared_error(y_test, y_pred)),
    "MAE": mean_absolute_error(y_test, y_pred),
    "R2 Score": r2_score(y_test, y_pred),
}

print("\nModel Performance Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.2f}")
    

## 3. EazyML Counterfactual Inference

### 3.1. Define Counterfactual Inference Configuration

In [None]:
# Features passed to counterfactual inference
selected_features = [
    'Square_Footage',
    'Num_Bedrooms',
    'Num_Bathrooms',
    'Year_Built',
    'Lot_Size',
    'Garage_Size',
    'Neighborhood_Quality',
]

# Features that must stay fixed; every other selected feature may be varied
invariants = ['Year_Built']
variants = [name for name in selected_features if name not in invariants]

# Options handed to ez_cf_inference
cf_options = {
    "variants": variants,
    "outcome_ordinality": "maximize",  # Desired action
    "train_data": train_file,
    "preprocessor": preprocessor,
}

### 3.2. Perform Counterfactual Inference

In [None]:
# Select one test record by index label, kept as a one-row DataFrame
test_index_no = 0
test_data = predicted_df.loc[[test_index_no], :]

# Run counterfactual inference for that record using the trained model
result, optimal_transition_df = ez_cf_inference(
    test_data=test_data,
    outcome=outcome,
    selected_features=selected_features,
    model_info=model,
    options=cf_options,
)

### 3.3. Display Results

In [None]:
# Render the `result` object returned by ez_cf_inference as JSON.
# Per the notebook's original note, it summarizes whether an optimal
# transition was found.
ez.ez_display_json(result)

In [None]:
# Render the `optimal_transition_df` table returned by ez_cf_inference.
# Per the notebook's original note, it details the feature changes needed to
# achieve the optimal outcome.
ez.ez_display_df(optimal_transition_df)