In [1]:
'''
Key Changes
I. Added get_feature_names_out to Custom Transformers:
    a. Both GroupAverageTransformer and CostPerSquareFootTransformer now implement get_feature_names_out to ensure feature names are propagated correctly.
    b. This is critical for interpretability and downstream usage.
II. Feature Name Extraction in Pipeline:
    a. The ColumnTransformer automatically generates feature names.
    b. Feature names from custom transformers are dynamically appended.
III. Clear Separation of Steps:
    a. Each custom transformation step operates on a pandas DataFrame, maintaining readability and industry conventions.
IV. Improved Logging:
    a. Model coefficients are logged in MLflow, mapped back to their feature names for interpretability.
V. Ensured Robustness:
    a. Each transformation avoids modifying the original DataFrame by creating copies.
    b. The pipeline is reusable and modular.

Why This Approach?
I. Industry Best Practices:
    a. Uses pandas for transformations and retains feature names throughout the pipeline.
    b. Dynamically handles feature name extraction, avoiding manual mapping.
II. Reusability and Interpretability:
    a. The pipeline can be reused across datasets.
    b. Feature names ensure interpretability of the model coefficients.
III. Robust and Scalable:
    a. Handles missing or unseen categories during inference.
    b. Easily extensible for additional features or transformations.'''

import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Toy housing dataset: one row per property (size, sale price, neighborhood label)
data = pd.DataFrame(
    [
        (1500, 300000, 'A'),
        (2000, 400000, 'B'),
        (2500, 500000, 'A'),
        (1800, 350000, 'B'),
        (3000, 600000, 'A'),
    ],
    columns=['square_footage', 'price', 'neighborhood'],
)

# Separate the target from the predictors, then hold out 20% of the rows
# for evaluation (fixed seed so the split is reproducible)
y = data.loc[:, 'price']
X = data.loc[:, ['square_footage', 'neighborhood']]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Custom transformer for group averages
class GroupAverageTransformer(BaseEstimator, TransformerMixin):
    """Append the per-group mean of one column as a new feature.

    ``fit`` learns the mean of ``target_col`` for every value of
    ``group_col`` together with the overall mean of ``target_col``.
    ``transform`` maps those means onto the rows of a DataFrame; rows whose
    group was not seen during ``fit`` receive the overall mean instead.

    Parameters
    ----------
    group_col : str
        Name of the column whose values define the groups.
    target_col : str
        Name of the column that is averaged within each group.
    new_feature_name : str
        Name of the column written by ``transform``.

    Attributes
    ----------
    group_averages_ : dict
        Mapping ``group value -> mean of target_col``, learned in ``fit``.
    global_mean_ : float
        Mean of ``target_col`` over all rows passed to ``fit``.
    feature_names_in_ : ndarray of object
        Column names of the DataFrame passed to ``fit``.
    """

    def __init__(self, group_col, target_col, new_feature_name):
        # Only constructor parameters are stored here. Learned state (the
        # trailing-underscore attributes) is created exclusively by fit(),
        # so an unfitted instance does not look fitted.
        self.group_col = group_col
        self.target_col = target_col
        self.new_feature_name = new_feature_name

    def fit(self, X, y=None):
        """Learn the group means and the global mean from DataFrame ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        # Nothing is written to X here, so no defensive copy is needed.
        self.group_averages_ = (
            X.groupby(self.group_col)[self.target_col].mean().to_dict()
        )
        self.global_mean_ = X[self.target_col].mean()
        # Remembered so get_feature_names_out() works without arguments.
        self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``new_feature_name`` column set."""
        X = X.copy()  # Avoid modifying the caller's DataFrame
        group_means = X[self.group_col].map(self.group_averages_)
        # Groups unseen during fit map to NaN; fall back to the global mean.
        X[self.new_feature_name] = group_means.fillna(self.global_mean_)
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the output column names as a list.

        Parameters
        ----------
        input_features : sequence of str or None, default=None
            Input column names (list, tuple or array). When ``None``, the
            names recorded during ``fit`` are used.

        Raises
        ------
        ValueError
            If ``input_features`` is ``None`` and the transformer has not
            been fitted.
        """
        if input_features is None:
            input_features = getattr(self, "feature_names_in_", None)
            if input_features is None:
                raise ValueError(
                    "input_features must be provided when "
                    "GroupAverageTransformer has not been fitted."
                )
        # list() also accepts ndarrays, for which `+` would not concatenate.
        names = list(input_features)
        # transform() overwrites an existing column of the same name in
        # place, so only a genuinely new column extends the name list.
        if self.new_feature_name not in names:
            names.append(self.new_feature_name)
        return names

# Custom transformer for cost per square foot
class CostPerSquareFootTransformer(BaseEstimator, TransformerMixin):
    """Append a ``cost_per_square_foot`` column derived from the training target.

    ``fit`` stores the mean of ``y``. ``transform`` adds
    ``cost_per_square_foot = square_footage / (mean_price_ + 1e-9)``, so the
    target is only used through its training mean and never row by row.

    NOTE(review): as written the feature is square footage divided by the
    mean price, i.e. a constant rescaling of ``square_footage`` rather than
    a price per square foot. Confirm this is intended; the formula is kept
    unchanged so existing results are preserved.

    Attributes
    ----------
    mean_price_ : float
        Mean of ``y`` seen during ``fit``.
    feature_names_in_ : ndarray of object
        Column names of ``X`` seen during ``fit`` (only set when ``X`` has a
        ``columns`` attribute).
    """

    # No __init__: the transformer has no hyper-parameters, and learned state
    # (trailing-underscore attributes) is created exclusively by fit().

    def fit(self, X, y=None):
        """Store the mean of the target ``y``.

        Raises
        ------
        ValueError
            If ``y`` is ``None``; the transformer cannot be fitted without
            target values.
        """
        if y is None:
            raise ValueError(
                "CostPerSquareFootTransformer.fit requires y (the target "
                "values) to compute the mean price."
            )
        # Use the object's own mean() when it has one, so pandas/numpy inputs
        # behave as before; plain sequences go through np.mean.
        self.mean_price_ = y.mean() if hasattr(y, "mean") else np.mean(y)
        # Remembered so get_feature_names_out() works without arguments.
        columns = getattr(X, "columns", None)
        if columns is not None:
            self.feature_names_in_ = np.asarray(columns, dtype=object)
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the ``cost_per_square_foot`` column set."""
        X = X.copy()  # Avoid modifying the caller's DataFrame
        # The small epsilon guards against division by a zero mean price.
        X['cost_per_square_foot'] = X['square_footage'] / (self.mean_price_ + 1e-9)
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the output column names as a list.

        Parameters
        ----------
        input_features : sequence of str or None, default=None
            Input column names (list, tuple or array). When ``None``, the
            names recorded during ``fit`` are used.

        Raises
        ------
        ValueError
            If ``input_features`` is ``None`` and no column names were
            recorded by ``fit``.
        """
        if input_features is None:
            input_features = getattr(self, "feature_names_in_", None)
            if input_features is None:
                raise ValueError(
                    "input_features must be provided when "
                    "CostPerSquareFootTransformer has not been fitted on a "
                    "DataFrame."
                )
        # list() also accepts ndarrays, for which `+` would not concatenate.
        names = list(input_features)
        # transform() overwrites an existing column of the same name in
        # place, so only a genuinely new column extends the name list.
        if 'cost_per_square_foot' not in names:
            names.append('cost_per_square_foot')
        return names

# Column-wise preprocessing: standard-scale the numeric inputs, one-hot
# encode the categorical one (dropping the first category), and pass every
# other column through unchanged.
numeric_step = (
    'num',
    StandardScaler(),
    ['square_footage', 'cost_per_square_foot'],
)
categorical_step = (
    'cat',
    OneHotEncoder(sparse_output=False, drop='first'),
    ['neighborhood'],
)
preprocessor = ColumnTransformer(
    [numeric_step, categorical_step],
    remainder='passthrough',
)

# End-to-end pipeline: the two feature-engineering steps run first, then the
# column preprocessor, and finally the linear regressor.
group_average_step = GroupAverageTransformer(
    new_feature_name='avg_sqft_per_neighborhood',
    target_col='square_footage',
    group_col='neighborhood',
)
pipeline = Pipeline([
    ('group_avg', group_average_step),
    ('cost_per_sqft_transformer', CostPerSquareFootTransformer()),
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

# Train the model with MLflow tracking: fit, evaluate on the held-out rows,
# then log metrics, the fitted pipeline and the model coefficients.
with mlflow.start_run():
    # Fit the pipeline
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Calculate metrics
    # RMSE is taken as sqrt(MSE) instead of passing `squared=False`, which
    # was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    # NOTE: R^2 is undefined (NaN) when the test split holds fewer than two
    # samples, which is the case for a 20% split of a five-row dataset.
    r2 = r2_score(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Log the pipeline
    mlflow.sklearn.log_model(pipeline, "pipeline_model")

    # Log model coefficients, keyed by the preprocessor's output feature names
    model = pipeline.named_steps['model']
    if hasattr(model, 'coef_'):
        feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
        # Plain str/float values keep the logged parameter readable; numpy
        # scalars would otherwise be rendered with their type wrapper.
        coefficients = {
            str(name): float(coef)
            for name, coef in zip(feature_names, model.coef_)
        }
        mlflow.log_param("coefficients", coefficients)

    print(f"Run complete: RMSE={rmse}, R2={r2}")


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Run complete: RMSE=9999.999999999302, R2=nan
