# <center> Training Models with MLflow </center>

## <center> Building housing price models and tracking everything </center>

# The Goal

Now that we have our data prepped, it's time to train some models. We'll build housing price prediction models and use MLflow to track everything - experiments, parameters, metrics, and model artifacts.

**We're building models that will serve as the foundation for our MLOps monitoring pipeline, where we can detect when performance starts to degrade over time.**

# My Experience with MLOps
Working with model deployment in production environments has taught me the importance of proper experiment tracking and model versioning. MLflow provides the infrastructure needed to maintain model lineage and compare experiments systematically.

# Building Our Models

## Setting Up

In [None]:
#required libraries
import sys
import os
sys.path.append(os.path.join(os.getcwd(), '..', 'src'))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import mlflow
import mlflow.sklearn
import mlflow.xgboost
from mlflow.tracking import MlflowClient
import optuna
from optuna.integration import MLflowCallback

#our custom modules
from data_loader import DataLoader
from model_trainer import ModelTrainer

# notebook housekeeping: silence library warnings and seed NumPy's global RNG
import warnings

warnings.filterwarnings('ignore')
np.random.seed(42)

# confirm the environment is ready and record the library versions in use
print("✅ All libraries loaded successfully!")
print("MLflow version: " + mlflow.__version__)
print("Optuna version: " + optuna.__version__)

## Getting Our Data Ready

In [None]:
# build the project's data loader
data_loader = DataLoader()

# read the preprocessed training split first, then the validation split
print("Loading preprocessed training data...")
train_data, val_data = (
    data_loader.load_processed_data(filename)
    for filename in ("train_data.csv", "validation_data.csv")
)

print("Training data shape: {}".format(train_data.shape))
print("Validation data shape: {}".format(val_data.shape))

# column-by-column overview (dtypes and non-null counts) of the training frame
train_data.info()

In [None]:
# peek at the first rows of the training frame
print("First few rows:", train_data.head(), sep="\n")

# summary statistics of the target column
print("\nTarget variable stats:", train_data['MedHouseVal'].describe(), sep="\n")

In [None]:
# set up our model wrapper, requesting the XGBoost variant
# NOTE(review): `HousingPriceModel` is not among the names imported in the
# setup cell above (that cell imports `ModelTrainer` from `model_trainer`) —
# confirm where this class is defined, otherwise this line raises NameError.
model = HousingPriceModel(model_type="xgboost")

# separate features from target
print("Setting up features and target...")

# prepare_features is unpacked as a (features, target) pair for each split
X_train, y_train = model.prepare_features(train_data)
X_val, y_val = model.prepare_features(val_data)

# sanity-check that features and targets line up for both splits
print(f"Training features: {X_train.shape}")
print(f"Training target: {y_train.shape}")
print(f"Validation features: {X_val.shape}")
print(f"Validation target: {y_val.shape}")

# presumably populated by prepare_features — verify against the model class
print(f"\nFeatures we're using: {model.feature_columns}")

In [None]:
# Compare train vs. validation feature distributions, one subplot per feature.
# The grid is sized from the number of features (4 per row) so that no feature
# is silently dropped and no empty axes are drawn; with 8 features this is the
# same 2x4, 16x8-inch layout as before.
feature_names = list(X_train.columns)
n_cols = 4
n_rows = max(1, -(-len(feature_names) // n_cols))  # ceiling division
fig, axes = plt.subplots(
    n_rows, n_cols, figsize=(4 * n_cols, 4 * n_rows), squeeze=False
)
axes = axes.ravel()

for ax, col in zip(axes, feature_names):
    ax.hist(X_train[col], bins=30, alpha=0.7, color='skyblue', label='Train')
    ax.hist(X_val[col], bins=30, alpha=0.7, color='orange', label='Validation')
    ax.set_title(f'{col}')
    ax.legend()

# hide leftover axes when the feature count is not a multiple of n_cols
for ax in axes[len(feature_names):]:
    ax.set_visible(False)

plt.tight_layout()
plt.suptitle('Feature Distributions: Train vs Validation', y=1.02, fontsize=16)
plt.show()

# MLflow Setup

Time to set up experiment tracking. This will help us compare models and keep track of what works.

In [None]:
# configure MLflow
mlflow.set_tracking_uri("../mlruns")  # local tracking
experiment_name = "housing_price_prediction"

# Create or get the experiment. Look it up first and only create it when it is
# missing: this avoids a bare `except:` that would swallow unrelated failures
# (e.g. an unusable tracking location) and then surface them as a confusing
# AttributeError on `None`. Genuine errors from MLflow now propagate as-is.
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
    print(f"Created new experiment: {experiment_name}")
else:
    experiment_id = experiment.experiment_id
    print(f"Using existing experiment: {experiment_name}")

print(f"Experiment ID: {experiment_id}")
print(f"Tracking URI: {mlflow.get_tracking_uri()}")

# <center>XGBoost Optimization</center>

In [None]:
# XGBoost hyperparameter search bounds, stored as (low, high) pairs
search_space_xgb = dict(
    learning_rate=(0.01, 0.3),
    n_estimators=(100, 1000),
    max_depth=(3, 10),
    subsample=(0.6, 1.0),
    colsample_bytree=(0.6, 1.0),
    reg_alpha=(0, 10),
    reg_lambda=(0, 10),
)

In [None]:
#define objective function for optuna
def objective(trial):
    """Train one XGBoost candidate and return its validation MSE.

    Hyperparameter bounds are read from the notebook-level ``search_space_xgb``
    dict (``name -> (low, high)``) so the ranges live in a single place instead
    of being duplicated here. ``n_estimators`` and ``max_depth`` are sampled as
    integers; every other entry is sampled as a float.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Mean squared error of the fitted candidate on ``X_val`` / ``y_val``
        (lower is better).
    """
    integer_params = {'n_estimators', 'max_depth'}

    params = {}
    for name, (low, high) in search_space_xgb.items():
        if name in integer_params:
            params[name] = trial.suggest_int(name, low, high)
        else:
            params[name] = trial.suggest_float(name, low, high)
    # fixed seed so trials differ only by their sampled hyperparameters
    params['random_state'] = 42

    # named `candidate` rather than `model` to avoid shadowing the notebook's
    # feature-preparation `model` object
    candidate = XGBRegressor(**params)
    candidate.fit(X_train, y_train)

    val_pred = candidate.predict(X_val)
    return mean_squared_error(y_val, val_pred)

#create optuna study
# Seed the sampler explicitly: seeding NumPy's global RNG in the setup cell
# does not make Optuna's sampler deterministic, so without this the sampled
# trials (and therefore the best parameters) change on every run. TPESampler
# is the sampler create_study uses by default, so the algorithm is unchanged.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

#optimize hyperparameters (each trial fits one model and scores validation MSE)
study.optimize(objective, n_trials=50)

In [None]:
# best hyperparameters found by the Optuna study for XGBoost
# (left as the final expression so the notebook displays the dict)
study.best_params

### Train and Validate

In [None]:
# rebuild the regressor from the winning trial: best_params holds exactly the
# seven hyperparameters sampled in the objective, so unpack them as keywords
# and pin the same random seed that was used during the search
xgb_tuned = XGBRegressor(**study.best_params, random_state=42)

In [None]:
# fit the tuned regressor on the training split
xgb_tuned.fit(X=X_train, y=y_train)

In [None]:
# validation MSE of the tuned model (displayed as the cell's output)
mean_squared_error(y_true=y_val, y_pred=xgb_tuned.predict(X_val))

In [None]:
# Feature importances for XGBoost, keeping only features with non-zero
# importance. Build the Series over ALL columns first and filter afterwards:
# passing the full importance array together with an already-filtered index
# raises a length-mismatch ValueError as soon as any importance is exactly 0.
all_importances = pd.Series(xgb_tuned.feature_importances_, index=X_train.columns)
feature_imp_xgb = all_importances[all_importances != 0]

# display in descending order (feature_imp_xgb itself keeps column order)
feature_imp_xgb.sort_values(ascending=False)

In [None]:
# horizontal bar chart of feature importances; ascending argsort means the
# most important feature is drawn last and therefore ends up at the top
importance_values = xgb_tuned.feature_importances_
ascending_order = importance_values.argsort()
plt.barh(X_train.columns[ascending_order], importance_values[ascending_order])
plt.xlabel("XGBoostRegressor Feature Importance")

In [None]:
# predictions of the tuned model on the training split, then the validation split
y_train_pred, y_val_pred = (
    xgb_tuned.predict(features) for features in (X_train, X_val)
)

In [None]:
# mean squared error per split, computed from the predictions made above
mse_train, mse_val = (
    mean_squared_error(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_val, y_val_pred))
)

# a large gap between the two numbers would point to overfitting
print("XGBoost Train MSE:", mse_train)
print("XGBoost Validation MSE:", mse_val)

In [None]:
# saving the trained model
import os

import joblib

model_path = '../models/xgboost_model.pkl'
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists (a no-op when it is already there)
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(xgb_tuned, model_path)
print("Model saved successfully!")

In [None]:
# baseline snapshot for monitoring: training features with the target placed
# alongside them as extra column(s), written to CSV without the index
baseline_data = pd.concat((X_train, y_train), axis="columns")
baseline_data.to_csv(path_or_buf='../data/processed/baseline_data.csv', index=False)
print("Baseline data saved for monitoring purposes.")

### Summary

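We successfully trained an XGBoost regression model whose hyperparameters were tuned with Optuna (50 trials, minimizing validation MSE). The model achieved decent performance on the housing price prediction task (see the train and validation MSE reported above), and the fitted model was saved to `../models/xgboost_model.pkl`. We've also set up the foundation for continuous monitoring by configuring an MLflow experiment for tracking, and the training features and target have been written to `../data/processed/baseline_data.csv` as the baseline for drift detection in our MLOps pipeline.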