In [2]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [3]:
import pandas as pd

# Define column names from the dataset description
column_names = [
    'mpg', 'cylinders', 'displacement', 'horsepower',
    'weight', 'acceleration', 'model_year', 'origin', 'car_name'
]

# Load dataset from UCI and assign column names
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data',
    sep='\s+',
    names=column_names
)

# Replace '?' in horsepower with NaN
df['horsepower'] = df['horsepower'].replace('?', np.nan).astype(float)

# Drop rows with missing values
df.dropna(inplace=True)

# Drop 'car name' column (not useful for modeling)
df.drop('car_name', axis=1, inplace=True)

# Map 'origin' to categorical names
df['origin'] = df['origin'].map({1: 'USA', 2: 'Europe', 3: 'Japan'})


In [4]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,USA
1,15.0,8,350.0,165.0,3693.0,11.5,70,USA
2,18.0,8,318.0,150.0,3436.0,11.0,70,USA
3,16.0,8,304.0,150.0,3433.0,12.0,70,USA
4,17.0,8,302.0,140.0,3449.0,10.5,70,USA


In [5]:
X = df.drop('mpg', axis=1)  # Features
y = df['mpg']               # Target variable


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [7]:
numeric_features = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year']
categorical_features = ['origin']


In [8]:
# Numeric pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical pipeline
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(drop='first'))  # Avoids dummy variable trap
])


In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


In [10]:
# Fit on training data, transform both train and test
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print("✅ Preprocessing complete.")
print("Processed training shape:", X_train_processed.shape)
print("Processed test shape:", X_test_processed.shape)


✅ Preprocessing complete.
Processed training shape: (313, 8)
Processed test shape: (79, 8)


In [11]:
#Start the mlflow context
mlflow.set_tracking_uri("http://localhost:5555")
mlflow.set_experiment("CapstoneAUTOMgp")

<Experiment: artifact_location='mlflow-artifacts:/3', creation_time=1755027237410, experiment_id='3', last_update_time=1755027237410, lifecycle_stage='active', name='CapstoneAUTOMgp', tags={}>

In [21]:
import mlflow
import mlflow.sklearn
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from mlflow.models import infer_signature

# Models to train
models = {
    'LinearRegression': LinearRegression(),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'SVR': SVR()
}

# 🔁 Loop through models and log to MLflow
best_rmse = float('inf')
best_run_id = None
mlflow.set_experiment("Capstone_Model_Comparison")

for name, model in models.items():
    with mlflow.start_run(run_name=name) as run:
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        pipeline.fit(X_train, y_train)
        preds = pipeline.predict(X_test)

        rmse = mean_squared_error(y_test, preds)
        mae = mean_absolute_error(y_test, preds)
        r2 = r2_score(y_test, preds)
        signature = infer_signature(X_test, preds)

        mlflow.log_param("model", name)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="model",
            input_example=X_test.iloc[:5],
            signature=signature
        )

        print(f"{name} - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.2f}")

        if rmse < best_rmse:
            best_model = model
            best_rmse = rmse
            best_run_id = run.info.run_id
            # Create the best pipeline again
            best_pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('model', best_model)
            ])
            
# Fit again to make sure it's trained (or save the already fitted one if needed)
best_pipeline.fit(X_train, y_train)

# Save as .bin
import pickle
with open("best_model_pipeline.bin", "wb") as f:
    pickle.dump(best_pipeline, f)
# ✅ Register the best model
model_uri = f"runs:/{best_run_id}/model"
registered_model_name = "CapstoneAUTOMgp_BestModel"

mlflow.register_model(model_uri, registered_model_name)
print(f"✅ Registered best model from run {best_run_id} with RMSE {best_rmse:.2f} as '{registered_model_name}'")



LinearRegression - RMSE: 10.60, MAE: 2.46, R2: 0.79
🏃 View run LinearRegression at: http://localhost:5555/#/experiments/4/runs/c935b7601ae6456093860c61618defcd
🧪 View experiment at: http://localhost:5555/#/experiments/4




RandomForest - RMSE: 5.81, MAE: 1.72, R2: 0.89
🏃 View run RandomForest at: http://localhost:5555/#/experiments/4/runs/f3147a93e20e4bb6943be14cfa59b39a
🧪 View experiment at: http://localhost:5555/#/experiments/4




DecisionTree - RMSE: 11.39, MAE: 2.31, R2: 0.78
🏃 View run DecisionTree at: http://localhost:5555/#/experiments/4/runs/71a6a7ddf92740828db605f6bab2a568
🧪 View experiment at: http://localhost:5555/#/experiments/4




GradientBoosting - RMSE: 6.19, MAE: 1.78, R2: 0.88
🏃 View run GradientBoosting at: http://localhost:5555/#/experiments/4/runs/8be3962a32f7463ca6154811935225d9
🧪 View experiment at: http://localhost:5555/#/experiments/4
SVR - RMSE: 9.08, MAE: 2.03, R2: 0.82
🏃 View run SVR at: http://localhost:5555/#/experiments/4/runs/6c0c1b75050a4a239c7906b06bcf4112
🧪 View experiment at: http://localhost:5555/#/experiments/4


Registered model 'CapstoneAUTOMgp_BestModel' already exists. Creating a new version of this model...
2025/08/13 06:01:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CapstoneAUTOMgp_BestModel, version 26


✅ Registered best model from run f3147a93e20e4bb6943be14cfa59b39a with RMSE 5.81 as 'CapstoneAUTOMgp_BestModel'


Created version '26' of model 'CapstoneAUTOMgp_BestModel'.


In [16]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

run_id = "2cd3a325d31d4f41ac7ba15c0584309d"  # replace this with the best_run_id
model_uri = f"runs:/{run_id}/model"
registered_model_name = "CapstoneAUTOMgp"

mlflow.register_model(model_uri, registered_model_name)


Successfully registered model 'CapstoneAUTOMgp'.
2025/08/13 05:39:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: CapstoneAUTOMgp, version 1
Created version '1' of model 'CapstoneAUTOMgp'.


<ModelVersion: aliases=[], creation_timestamp=1755063564464, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1755063564464, metrics=None, model_id=None, name='CapstoneAUTOMgp', params=None, run_id='2cd3a325d31d4f41ac7ba15c0584309d', run_link='', source='models:/m-e9de5c2608984dda9ad77af644204605', status='READY', status_message=None, tags={}, user_id='', version='1'>

In [20]:
import pickle

with open('Rf_model.bin', 'wb') as f_out:

    pickle.dump((best_model), f_out)