In [1]:

import pandas as pd
import numpy as np
import warnings
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature
import warnings
warnings.filterwarnings('ignore')
import pickle
import mlflow.pyfunc

# Suppress all warnings for cleaner output
warnings.filterwarnings("ignore")


In [2]:
# Quick sanity check of the raw CSV: count NaN entries per column.
# NOTE(review): isna() only counts genuine NaN values. The preparation code
# later in this notebook replaces '?' with NaN before converting 'horsepower'
# to numeric, so missing horsepower values are presumably stored as the string
# '?' and would not be counted here -- confirm against the file.
df = pd.read_csv('auto_mpg.csv')
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
origin          0
car name        0
dtype: int64

In [None]:


# --- 1. Data Preparation ---
def load_and_prepare_data():
    """
    Load the Auto MPG dataset from the local CSV and prepare it for modeling.

    Processing steps:
      1. Read 'auto_mpg.csv' from the current working directory.
      2. Turn '?' placeholders into NaN, convert 'horsepower' to numeric and
         fill its missing values with the column median.
      3. Drop the free-text 'car name' column.
      4. One-hot encode 'origin' (columns prefixed with 'origin_').
      5. Split into train/test sets (80/20, random_state=42).
      6. Fit a StandardScaler on the training features only.

    Returns:
        tuple: (X_train, X_test, y_train, y_test, scaler, feature_columns)
            X_train / X_test are returned UNSCALED; `scaler` is already fitted
            on X_train, so callers apply `scaler.transform(...)` themselves.
            `feature_columns` is the column index of X_train.
    """
    # Load the dataset
    df = pd.read_csv('auto_mpg.csv')

    # Handle missing values in 'horsepower' ('?' marks a missing entry)
    df = df.replace('?', np.nan)
    df['horsepower'] = pd.to_numeric(df['horsepower'])

    # Impute missing values with the median.
    # NOTE: the median is computed over the full dataset, i.e. before the
    # train/test split below.
    df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

    # Drop the 'car name' column as it is not useful for prediction
    df = df.drop('car name', axis=1)

    # One-hot encode the 'origin' column
    df = pd.get_dummies(df, columns=['origin'], prefix='origin')

    # Define features (X) and target (y)
    X = df.drop('mpg', axis=1)
    y = df['mpg']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the scaler on the training features only. The scaled arrays are not
    # needed here (the unscaled frames are returned), so just fit instead of
    # computing and discarding transformed copies.
    scaler = StandardScaler()
    scaler.fit(X_train)

    return X_train, X_test, y_train, y_test, scaler, X_train.columns


def evaluate_model(model, X_test, y_test):
    """
    Score a wrapped model on held-out data.

    The model is invoked as `model.predict(None, X_test)`: the first positional
    argument is the context slot of the custom MLflow pyfunc interface, for
    which None is supplied here.

    Returns:
        tuple: (rmse, mae, r2) computed from y_test and the predictions.
    """
    # None fills the context argument of the pyfunc-style predict()
    y_pred = model.predict(None, X_test)

    mse = mean_squared_error(y_test, y_pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(y_test, y_pred),
        r2_score(y_test, y_pred),
    )

# --- 2. Train Models & MLflow Tracking ---
def train_and_track_models(X_train, X_test, y_train, y_test, scaler, models_to_train):
    """
    Train each candidate model, track it with MLflow, and return the best one.

    One MLflow run is created per entry of `models_to_train` inside the
    "AutoMPGRegression" experiment. For each run the model's parameters, the
    test-set metrics (rmse, mae, r2_score) and a pyfunc model wrapping the
    scaler and the fitted estimator are logged. The best model is the one with
    the lowest test RMSE.

    Args:
        X_train, X_test: Unscaled feature sets. Training data is scaled here
            with `scaler.transform`; the wrapper receives unscaled data.
        y_train, y_test: Target values matching the feature sets.
        scaler: An already-fitted scaler exposing `transform`.
        models_to_train: Mapping of display name -> unfitted estimator.

    Returns:
        tuple: (best_run_id, best_model_name, best_rmse, best_raw_model, scaler).
            If `models_to_train` is empty, the run id, name and model are None
            and best_rmse is float('inf').
    """
    experiment_name = "AutoMPGRegression"
    mlflow.set_experiment(experiment_name)
    print(f"MLflow experiment '{experiment_name}' is set.")

    best_rmse = float('inf')
    best_run_id = None
    best_model_name = None
    best_raw_model = None

    # Loop through each model to train and track
    for model_name, model in models_to_train.items():
        with mlflow.start_run(run_name=model_name) as run:
            print(f"\n--- Training {model_name} ---")

            # Log model parameters
            if hasattr(model, 'get_params'):
                mlflow.log_params(model.get_params())

            # Train the model on the scaled data
            model.fit(scaler.transform(X_train), y_train)

            # Create the custom wrapper model for MLflow logging.
            # NOTE(review): ModelWrapper is not defined in the cells visible
            # here -- confirm it is defined before this function is called.
            custom_model = ModelWrapper(scaler, model)

            # Evaluate the model using the wrapped model
            rmse, mae, r2 = evaluate_model(custom_model, X_test, y_test)

            # Log metrics
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("r2_score", r2)

            mlflow.log_param("model_type", model_name)

            # Infer the model signature for MLflow
            signature = infer_signature(X_train, custom_model.predict(None, X_train))

            print(f"  RMSE: {rmse:.4f}")
            print(f"  MAE: {mae:.4f}")
            print(f"  R2 Score: {r2:.4f}")

            # Log the custom wrapped model to MLflow. Only a handful of rows
            # are stored as the input example; passing the whole training set
            # would persist all of it alongside the model artifact.
            mlflow.pyfunc.log_model(
                python_model=custom_model,
                artifact_path="MPGestimator",
                signature=signature,
                input_example=X_train[:5],
            )

            # Keep track of the best model so far (lowest test RMSE)
            if rmse < best_rmse:
                best_rmse = rmse
                best_run_id = run.info.run_id
                best_model_name = model_name
                best_raw_model = model

    print("\nTraining complete. All runs are logged in MLflow.")
    return best_run_id, best_model_name, best_rmse, best_raw_model, scaler

# --- 3. Register the Best Model in the MLflow Model Registry ---
def register_best_model(run_id, best_model_name, best_rmse):
    """
    Register the winning run's model in the MLflow Model Registry.

    The model logged under the "MPGestimator" artifact path of `run_id` is
    registered under the name "AutoMPGRegressor". When `run_id` is falsy
    nothing is registered and only a notice is printed.
    """
    # Guard: without a run there is nothing to register
    if not run_id:
        print("No models were trained to be registered.")
        return

    print(f"\nRegistering the best model '{best_model_name}' (RMSE: {best_rmse:.4f}) to the MLflow Model Registry...")

    registered_model = mlflow.register_model(
        model_uri=f"runs:/{run_id}/MPGestimator",
        name="AutoMPGRegressor",
    )

    print(f"Model '{registered_model.name}' version {registered_model.version} is registered.")

# --- 4. Pickle the Best Model to a Local File ---
def save_model_to_bin(model, scaler, filename):
    """
    Pickle the model together with its scaler into a single local file.

    The file holds one dictionary with the keys 'model' and 'scaler', so both
    objects can be restored together at prediction time.
    """
    bundle = dict(model=model, scaler=scaler)

    with open(filename, 'wb') as sink:
        pickle.dump(bundle, sink)

    print(f"\nBest model and scaler saved to {filename}")


if __name__ == "__main__":
    # Step 1 -- load the CSV, clean it, split it and fit the scaler
    X_train, X_test, y_train, y_test, scaler, _ = load_and_prepare_data()

    # Candidate regressors, keyed by the run name used in MLflow
    models_to_train = dict([
        ('Linear Regression', LinearRegression()),
        ('Random Forest Regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
        ('Gradient Boosting Regressor', GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)),
        ('Support Vector Regressor', SVR(kernel='rbf', C=100, gamma=0.1)),
        ('K-Neighbors Regressor', KNeighborsRegressor(n_neighbors=5)),
    ])

    # Step 2 -- train every candidate and log each run to MLflow
    best_run_id, best_model_name, best_rmse, best_raw_model, final_scaler = train_and_track_models(
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
        scaler=scaler,
        models_to_train=models_to_train,
    )

    # Step 3 -- put the winning run's model into the MLflow Model Registry
    register_best_model(run_id=best_run_id, best_model_name=best_model_name, best_rmse=best_rmse)

    # Step 4 -- also keep a local pickle of the best estimator plus the scaler
    save_model_to_bin(model=best_raw_model, scaler=final_scaler, filename='best_autompg_model.bin')

    print("\n--- Process Complete ---")
    print("To view your experiment results, run `mlflow ui` in your terminal and navigate to http://localhost:5000.")
