In [11]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Sample data
data = pd.DataFrame({
    'square_footage': [1500, 2000, 2500, 1800, 3000],
    'price': [300000, 400000, 500000, 350000, 600000],
    'neighborhood': ['A', 'B', 'A', 'B', 'A']
})

# Train-test split
X = data[['square_footage', 'neighborhood']]
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Custom transformer for cost_per_square_foot
class CostPerSquareFootTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        print("fitting!!")
        if y is None:
            raise ValueError("Target values (y) must not be None during fit.")
        self.mean_price_ = y.mean()  # Save mean price
        return self

    def transform(self, X, y=None):
        print("transforming!!")
        X = X.copy()
        # Use the mean price as a proxy for cost_per_square_foot
        X['cost_per_square_foot'] = X['square_footage'] / (self.mean_price_ + 1e-9)
        return X

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['square_footage', 'cost_per_square_foot']),
        ('cat', OneHotEncoder(), ['neighborhood'])
    ],
    remainder='drop'
)

# Full pipeline
pipeline = Pipeline(steps=[
    ('feature_engineering', CostPerSquareFootTransformer()),  # Custom feature engineering
    ('preprocessor', preprocessor),  # Preprocessing
    ('model', LinearRegression())  # Model
])

# Train the model with MLflow tracking
with mlflow.start_run():
    # Fit the pipeline
    print("hi!")
    pipeline.fit(X_train, y_train)
    print("bye!")
    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Calculate metrics
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    r2 = r2_score(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Log the entire pipeline
    mlflow.sklearn.log_model(pipeline, "pipeline_model")

    # End the run
    print(f"Run complete: RMSE={rmse}, R2={r2}")




Run complete: RMSE=9999.999999999942, R2=nan
