# In [None]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_squared_error, r2_score

# Select the MLflow experiment that the runs started below are recorded under
mlflow.set_experiment(experiment_name="House Pricing Prediction")

# Custom transformer for cost_per_square_foot
class CostPerSquareFootTransformer(BaseEstimator, TransformerMixin):
    """Add a ``cost_per_square_foot`` column learned from the training prices.

    The previous implementation divided by ``y`` inside ``transform``. A
    scikit-learn ``Pipeline`` calls ``transform(X)`` without ``y``, so that
    division was by ``None`` and raised ``TypeError``. The target is also
    unavailable at prediction time, and using it row-by-row would leak the
    label into the features.

    Instead, ``fit`` learns the mean price per square foot of every group
    (by default every neighborhood) from the training data, and ``transform``
    looks that value up. Groups not seen during ``fit`` fall back to the
    overall training mean.

    Parameters
    ----------
    group_col : str or None, default='neighborhood'
        Column of ``X`` used to group rows. With ``None`` every row receives
        the overall training mean.

    Attributes
    ----------
    global_cost_ : float
        Mean price per square foot over all usable training rows.
    cost_by_group_ : dict
        Mean price per square foot for each value of ``group_col``.
    """

    def __init__(self, group_col='neighborhood'):
        self.group_col = group_col

    def fit(self, X, y=None):
        """Learn the per-group mean price per square foot.

        Parameters
        ----------
        X : DataFrame with a ``square_footage`` column (and ``group_col``).
        y : array-like of prices, one per row of ``X``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is missing or no row yields a finite price per square
            foot.
        """
        import numpy as np  # local import keeps the block self-contained

        if y is None:
            raise ValueError(
                "CostPerSquareFootTransformer.fit requires y (the prices)."
            )

        sqft = X['square_footage'].to_numpy(dtype=float)
        price = np.asarray(y, dtype=float).ravel()
        # Rows with zero or missing square footage produce inf/NaN; mark them
        # as NaN so they are excluded from every mean below.
        with np.errstate(divide='ignore', invalid='ignore'):
            cost = price / sqft
        cost[~np.isfinite(cost)] = np.nan

        usable = cost[~np.isnan(cost)]
        if usable.size == 0:
            raise ValueError(
                "No training row has a finite price per square foot."
            )
        self.global_cost_ = float(usable.mean())

        if self.group_col is None:
            self.cost_by_group_ = {}
        else:
            # astype(object) keeps categorical columns from adding unobserved
            # categories to the groupby result.
            per_row = X[self.group_col].astype(object).to_frame(name='group')
            per_row['cost'] = cost
            self.cost_by_group_ = (
                per_row.groupby('group')['cost'].mean().dropna().to_dict()
            )
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with a ``cost_per_square_foot`` column.

        ``y`` is accepted for API compatibility and ignored.
        """
        X = X.copy()
        if self.group_col is None:
            X['cost_per_square_foot'] = self.global_cost_
        else:
            looked_up = X[self.group_col].astype(object).map(self.cost_by_group_)
            # Groups unseen during fit map to NaN; use the overall mean there.
            X['cost_per_square_foot'] = (
                looked_up.astype(float).fillna(self.global_cost_)
            )
        return X

# Preprocessing for numerical and categorical features.
# The numeric columns are standardized and the neighborhood is one-hot
# encoded; any other column is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['square_footage', 'cost_per_square_foot']),
        # handle_unknown='ignore': a neighborhood that only shows up at
        # prediction time is encoded as all zeros instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['neighborhood']),
    ],
    remainder='drop',
)

# Final pipeline: feature engineering -> preprocessing -> linear model.
# The feature-engineering step must run first because the preprocessor
# selects the 'cost_per_square_foot' column it creates.
pipeline = Pipeline(steps=[
    ('feature_engineering', CostPerSquareFootTransformer()),
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between executions.
from sklearn.model_selection import train_test_split

feature_columns = ['square_footage', 'neighborhood']
X_train, X_test, y_train, y_test = train_test_split(
    data[feature_columns], data['price'], test_size=0.2, random_state=42
)

# Train the model with MLflow tracking: parameters, metrics and the fitted
# pipeline are all recorded under a single run.
with mlflow.start_run():
    # Log parameters (pipeline components)
    mlflow.log_param("model", "LinearRegression")
    mlflow.log_param("preprocessor", "StandardScaler and OneHotEncoder")
    mlflow.log_param("custom_transformer", "CostPerSquareFootTransformer")

    # Fit the pipeline
    pipeline.fit(X_train, y_train)

    # Make predictions on the held-out split
    y_pred = pipeline.predict(X_test)

    # Calculate metrics.
    # RMSE is the square root of the MSE. The `squared=False` keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided
    # here to work on both old and new versions.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    r2 = r2_score(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Log the entire pipeline as an artifact
    mlflow.sklearn.log_model(pipeline, "pipeline_model")

    # Report the results; the run itself is closed when the `with` block exits
    print(f"Run complete: RMSE={rmse}, R2={r2}")

# Load the logged model (for testing or deployment).
# The URI previously contained the literal placeholder "<run_id>", which is
# not a valid run id and made load_model fail. Resolve the id of the most
# recent run instead.
last_run = mlflow.last_active_run()
if last_run is None:
    raise RuntimeError(
        "No MLflow run found; train and log the pipeline before loading it."
    )
logged_model = f"runs:/{last_run.info.run_id}/pipeline_model"
loaded_pipeline = mlflow.sklearn.load_model(logged_model)

# Make predictions with the loaded model
y_loaded_pred = loaded_pipeline.predict(X_test)
