In [2]:
# Import necessary libraries for modeling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

# Read the processed insurance dataset into the working DataFrame
df = pd.read_csv("../data/processed/medical_insurance_processed.csv")

# Sanity check: preview the leading rows, then report (rows, columns)
print("First 5 rows of processed dataset:", df.head(), sep="\n")
print(f"\nDataset shape: {df.shape}")

First 5 rows of processed dataset:
   age     bmi  children      charges  sex_male  smoker_yes  region_northwest  \
0   19  27.900         0  16884.92400     False        True             False   
1   18  33.770         1   1725.55230      True       False             False   
2   28  33.000         3   4449.46200      True       False             False   
3   33  22.705         0  21984.47061      True       False              True   
4   32  28.880         0   3866.85520      True       False              True   

   region_southeast  region_southwest  age_smoker  bmi_smoker  age_bmi  
0             False              True          19        27.9  530.100  
1              True             False           0         0.0  607.860  
2              True             False           0         0.0  924.000  
3             False             False           0         0.0  749.265  
4             False             False           0         0.0  924.160  

Dataset shape: (2772, 12)


In [3]:
# --- Data Splitting ---

# Target is the 'charges' column; every other column of df is a predictor.
y = df['charges']
X = df.drop(columns='charges')

# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
# NOTE(review): if df contains duplicated rows, copies of the same record can
# land on both sides of the split and inflate test scores - confirm upstream.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the dimensions of each partition
print("Training set shapes:")
print(f"X_train: {X_train.shape}")
print(f"y_train: {y_train.shape}")

print("\nTesting set shapes:")
print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")

# Preview the training features to confirm the column layout
print("\nFirst 5 rows of X_train:", X_train.head(), sep="\n")

Training set shapes:
X_train: (2217, 11)
y_train: (2217,)

Testing set shapes:
X_test: (555, 11)
y_test: (555,)

First 5 rows of X_train:
      age    bmi  children  sex_male  smoker_yes  region_northwest  \
1864   21  36.85         0      True       False             False   
1997   38  34.80         2     False       False             False   
1336   21  25.80         0     False       False             False   
655    52  25.30         2     False        True             False   
261    20  26.84         1     False        True             False   

      region_southeast  region_southwest  age_smoker  bmi_smoker  age_bmi  
1864              True             False           0        0.00   773.85  
1997             False              True           0        0.00  1322.40  
1336             False              True           0        0.00   541.80  
655               True             False          52       25.30  1315.60  
261               True             False          20       26

In [4]:
# --- Model Training: Linear Regression ---

# Fit a linear model on the training split
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Predict charges for the held-out rows
y_pred_lr = lr_model.predict(X_test)

# Test-set error metrics; RMSE is derived from MSE so both stay consistent
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Print the metric summary, one metric per line
print(
    "Linear Regression Model Performance:",
    f"Mean Squared Error (MSE): {mse_lr:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse_lr:.2f}",
    f"Mean Absolute Error (MAE): {mae_lr:.2f}",
    f"R-squared (R2) Score: {r2_lr:.4f}",
    sep="\n",
)

# Side-by-side look at the first ten test rows: true charge vs. prediction.
# The Series is converted to a plain array so both columns share a 0..9 index.
print("\nFirst 10 Actual vs Predicted Charges:")
comparison_df = pd.DataFrame({
    'Actual': y_test.iloc[:10].to_numpy(),
    'Predicted': y_pred_lr[:10],
})
print(comparison_df)

Linear Regression Model Performance:
Mean Squared Error (MSE): 26137763.57
Root Mean Squared Error (RMSE): 5112.51
Mean Absolute Error (MAE): 2895.21
R-squared (R2) Score: 0.8297

First 10 Actual vs Predicted Charges:
        Actual     Predicted
0   8988.15875  10898.135568
1  28101.33305  31760.878005
2  12032.32600  12280.563740
3   1682.59700   2295.525640
4   3393.35635   3951.951211
5  24106.91255  26423.413894
6   4746.34400   6383.683373
7  47269.85400  51471.708410
8   8556.90700  10277.152633
9   2639.04290   3742.176263


In [5]:
# --- Model Training: Random Forest Regressor ---

# Build a 100-tree forest with a fixed seed (reproducible) and fit it on the
# training split; fit() returns the estimator itself.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict charges for the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Test-set error metrics; RMSE is derived from MSE so both stay consistent
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

# Print the metric summary, one metric per line
print(
    "Random Forest Regressor Model Performance:",
    f"Mean Squared Error (MSE): {mse_rf:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse_rf:.2f}",
    f"Mean Absolute Error (MAE): {mae_rf:.2f}",
    f"R-squared (R2) Score: {r2_rf:.4f}",
    sep="\n",
)

# Side-by-side look at the first ten test rows: true charge vs. prediction.
# The Series is converted to a plain array so both columns share a 0..9 index.
print("\nFirst 10 Actual vs Predicted Charges (Random Forest):")
comparison_df_rf = pd.DataFrame({
    'Actual': y_test.iloc[:10].to_numpy(),
    'Predicted': y_pred_rf[:10],
})
print(comparison_df_rf)

Random Forest Regressor Model Performance:
Mean Squared Error (MSE): 7624973.14
Root Mean Squared Error (RMSE): 2761.34
Mean Absolute Error (MAE): 1325.71
R-squared (R2) Score: 0.9503

First 10 Actual vs Predicted Charges (Random Forest):
        Actual     Predicted
0   8988.15875   9597.677216
1  28101.33305  27952.079509
2  12032.32600  12434.033191
3   1682.59700   2024.754422
4   3393.35635   4719.297501
5  24106.91255  23408.146039
6   4746.34400   5278.003712
7  47269.85400  47302.240584
8   8556.90700  11312.978299
9   2639.04290   3119.671920


In [18]:
# --- MLflow Integration ---
# Points MLflow at '<project>/mlruns' and resolves the experiment ID that the
# logging cells below pass to mlflow.start_run().

# 1. Import MLflow and path helpers
import mlflow
import mlflow.sklearn
import os
from pathlib import Path

# 2. Set the MLflow tracking URI
# The notebook is expected to run from '<project>/notebooks/', so the project
# root is the parent of the working directory. Adjust if your layout differs.
current_working_dir = os.getcwd()
print(f"Current working directory: {current_working_dir}")

project_dir = os.path.dirname(current_working_dir)
print(f"Project directory: {project_dir}")

# Runs are stored in '<project>/mlruns'. Path.as_uri() yields a well-formed
# file URI on every platform; prefixing 'file:///' by hand produced
# 'file:////...' for POSIX absolute paths.
mlruns_path = os.path.join(project_dir, "mlruns")
tracking_uri = Path(mlruns_path).as_uri()

mlflow.set_tracking_uri(tracking_uri)

# 3. Display the tracking URI to confirm
print("MLflow Tracking URI set in notebook:", mlflow.get_tracking_uri())

# 4. Resolve the experiment: look it up first and create it only when missing.
# Checking explicitly (instead of wrapping create_experiment in a bare
# 'except:') means unrelated failures such as an unwritable store or a
# keyboard interrupt surface as-is rather than being mistaken for
# "experiment already exists".
experiment_name = "Medical_Insurance_Cost_Prediction"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    experiment_id = mlflow.create_experiment(experiment_name)
    print(f"Created new experiment: {experiment_name} with ID: {experiment_id}")
else:
    experiment_id = experiment.experiment_id
    print(f"Using existing experiment: {experiment_name} with ID: {experiment_id}")

# --- End MLflow Setup ---

Current working directory: c:\Users\absah\Desktop\medical-insurance-cost-prediction\notebooks
Project directory: c:\Users\absah\Desktop\medical-insurance-cost-prediction
MLflow Tracking URI set in notebook: file:///c:/Users/absah/Desktop/medical-insurance-cost-prediction/mlruns
Using existing experiment: Medical_Insurance_Cost_Prediction with ID: 115075237773415468


In [19]:
# --- Log Random Forest Model to MLflow (Improved) ---
# Records the already-fitted rf_model, its hyperparameters and the test-set
# metrics (mse_rf, rmse_rf, mae_rf, r2_rf) computed in the Random Forest
# training cell as a single MLflow run.

with mlflow.start_run(experiment_id=experiment_id, run_name="Random_Forest_Baseline") as run:
    # Read the hyperparameters from the estimator itself so the logged values
    # cannot drift from what was actually trained.
    mlflow.log_param("model_type", type(rf_model).__name__)
    mlflow.log_param("n_estimators", rf_model.n_estimators)
    mlflow.log_param("random_state", rf_model.random_state)

    # Log model metrics from the earlier Random Forest evaluation
    mlflow.log_metric("mse", mse_rf)
    mlflow.log_metric("rmse", rmse_rf)
    mlflow.log_metric("mae", mae_rf)
    mlflow.log_metric("r2_score", r2_rf)

    # Log the trained model as an artifact. The input example (first 5 rows of
    # X_train) lets MLflow infer the model signature.
    mlflow.sklearn.log_model(sk_model=rf_model, artifact_path="model", input_example=X_train.head())

    # Print confirmation and the Run ID for reference
    print("Random Forest model logged to MLflow successfully!")
    print(f"Run ID: {run.info.run_id}")

# Keep the Run ID under a dedicated name: later cells use it to compare models
# and to build the 'runs:/<id>/model' URI for registration.
rf_run_id = run.info.run_id



Random Forest model logged to MLflow successfully!
Run ID: 4b16c04cc334458fb51a0e2f44ee7b04


In [8]:
# --- Train and Log Linear Regression Model to MLflow ---

# Fit a fresh linear model on the training split (fit() returns the estimator)
lr_model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows and compute the test-set metrics
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
rmse_lr = np.sqrt(mse_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

# Record everything about this model in its own MLflow run
with mlflow.start_run(experiment_id=experiment_id, run_name="Linear_Regression_Baseline") as lr_run:
    # Only the model type is logged as a parameter for this model
    mlflow.log_param("model_type", "LinearRegression")

    # Same metric keys as the Random Forest run so the two are comparable
    lr_metrics = (
        ("mse", mse_lr),
        ("rmse", rmse_lr),
        ("mae", mae_lr),
        ("r2_score", r2_lr),
    )
    for metric_key, metric_value in lr_metrics:
        mlflow.log_metric(metric_key, metric_value)

    # Store the fitted model; the input example lets MLflow infer a signature
    mlflow.sklearn.log_model(sk_model=lr_model, artifact_path="model", input_example=X_train.head())

    print("Linear Regression model logged to MLflow successfully!")
    print(f"Run ID: {lr_run.info.run_id}")

# Keep the Run ID for potential later use
lr_run_id = lr_run.info.run_id

# Summary of both models so far. rf_run_id and the *_rf metrics come from the
# Random Forest cells and must already exist in the kernel.
comparison_rows = (
    ("Random Forest", rf_run_id, r2_rf, rmse_rf, mae_rf),
    ("Linear Regression", lr_run_id, r2_lr, rmse_lr, mae_lr),
)
report_sections = [
    f"{label} (Run ID: {run_id}):\n"
    f"  R2 Score: {r2_value:.4f}\n"
    f"  RMSE: ${rmse_value:.2f}\n"
    f"  MAE: ${mae_value:.2f}"
    for label, run_id, r2_value, rmse_value, mae_value in comparison_rows
]
print("\n--- Model Performance Comparison ---")
# A blank line separates the two model sections
print("\n\n".join(report_sections))



Linear Regression model logged to MLflow successfully!
Run ID: cb261fcdb1784aa0a30fbbee5c7d63ee

--- Model Performance Comparison ---
Random Forest (Run ID: 8a781e4b2e4c4d509a765dda2b8c0ee3):
  R2 Score: 0.9503
  RMSE: $2761.34
  MAE: $1325.71

Linear Regression (Run ID: cb261fcdb1784aa0a30fbbee5c7d63ee):
  R2 Score: 0.8297
  RMSE: $5112.51
  MAE: $2895.21


In [9]:
# --- Register the Best Model (Random Forest) in MLflow Model Registry ---

# 1. Name under which the model is stored in the registry
model_name = "Medical_Insurance_Cost_Predictor"

# 2. URI of the model artifact logged by the Random Forest run
registered_model_uri = f"runs:/{rf_run_id}/model"

# mlflow.register_model() creates the registered model on first use and adds a
# new version when the name already exists, so no "already exists" handling is
# needed. Any MlflowException raised here is a genuine failure and propagates.
model_details = mlflow.register_model(model_uri=registered_model_uri, name=model_name)
print(f"Model registered successfully as '{model_name}' with version {model_details.version}")

# 3. Confirm the registration by printing details
print("\nRegistered Model Details:")
print(f"  Name: {model_details.name}")
print(f"  Version: {model_details.version}")
print(f"  Creation Timestamp: {model_details.creation_timestamp}")

Successfully registered model 'Medical_Insurance_Cost_Predictor'.


Model registered successfully as 'Medical_Insurance_Cost_Predictor' with version 1

Registered Model Details:
  Name: Medical_Insurance_Cost_Predictor
  Version: 1
  Creation Timestamp: 1754541350186


Created version '1' of model 'Medical_Insurance_Cost_Predictor'.


In [10]:
import mlflow

# Query the registry through the tracking URI configured in the MLflow setup
# cell. The URI is deliberately not reset here: "file:./mlruns" resolves
# relative to the notebook's working directory, which is a different store
# from '<project>/mlruns', so the model registered above would not be found.

target_model_name = "Medical_Insurance_Cost_Predictor"

# List all registered models and note whether the target is among them
print("Checking registered models...")
registered_models = mlflow.search_registered_models()
found = False
for model in registered_models:
    # A registered model may have no versions yet; guard before indexing.
    versions = model.latest_versions or []
    if versions:
        latest_version = max(versions, key=lambda mv: int(mv.version)).version
    else:
        latest_version = "none"
    print(f"  Model Name: {model.name}, Latest Version: {latest_version}")
    if model.name == target_model_name:
        found = True
        print(f"  -> Found the target model: {model.name}")

if found:
    print("\nModel 'Medical_Insurance_Cost_Predictor' is registered.")
else:
    print("\nModel 'Medical_Insurance_Cost_Predictor' NOT found. Need to register it.")

# If not found, you can re-register it using rf_run_id from earlier in the
# notebook. Uncomment and run the block below if needed. (Kept as comments
# rather than a bare string so it is not echoed as cell output.)
#
# if not found:
#     try:
#         model_details = mlflow.register_model(
#             model_uri=f"runs:/{rf_run_id}/model",
#             name="Medical_Insurance_Cost_Predictor"
#         )
#         print(f"\nModel successfully re-registered as 'Medical_Insurance_Cost_Predictor' with version {model_details.version}")
#     except Exception as e:
#         print(f"\nError re-registering model: {e}")

Checking registered models...
  Model Name: Medical_Insurance_Cost_Predictor, Latest Version: 1
  -> Found the target model: Medical_Insurance_Cost_Predictor

Model 'Medical_Insurance_Cost_Predictor' is registered.


'\nif not found:\n    try:\n        model_details = mlflow.register_model(\n            model_uri=f"runs:/{rf_run_id}/model",\n            name="Medical_Insurance_Cost_Predictor"\n        )\n        print(f"\nModel successfully re-registered as \'Medical_Insurance_Cost_Predictor\' with version {model_details.version}")\n    except Exception as e:\n        print(f"\nError re-registering model: {e}")\n'

In [13]:

import mlflow

# Look the model up through the tracking URI configured in the MLflow setup
# cell. The URI is deliberately not reset here: "file:./mlruns" resolves
# relative to the notebook's working directory, which is a different store
# from '<project>/mlruns'.

# Find the target model in the registry and print how to load it
print("Checking registered models...")
registered_models = mlflow.search_registered_models()
target_model_name = "Medical_Insurance_Cost_Predictor"
found = False
for model in registered_models:
    if model.name != target_model_name:
        continue
    found = True
    print(f"  Model Name: {model.name}")
    versions = model.latest_versions or []
    if not versions:
        # Registered name exists but nothing has been versioned under it yet
        print("  No versions have been created for this model yet.")
        break
    # Pick the highest version number rather than assuming list position
    version_info = max(versions, key=lambda mv: int(mv.version))
    print(f"  Latest Version: {version_info.version}")
    print(f"  Run ID associated with this version: {version_info.run_id}")
    print(f"  Model URI for loading: models:/{model.name}/{version_info.version}")
    break

if not found:
    print(f"Model '{target_model_name}' NOT found in the registry.")


Checking registered models...
  Model Name: Medical_Insurance_Cost_Predictor
  Latest Version: 1
  Run ID associated with this version: 8a781e4b2e4c4d509a765dda2b8c0ee3
  Model URI for loading: models:/Medical_Insurance_Cost_Predictor/1


In [14]:
# Show the Random Forest run's ID. Uses rf_run_id (saved right after that run
# was logged) instead of the generic `run` name, which any later
# `with mlflow.start_run(...) as run:` would silently rebind.
print(f"Run ID: {rf_run_id}")

Run ID: 8a781e4b2e4c4d509a765dda2b8c0ee3


In [15]:
# Repeat of the previous check. Uses rf_run_id (saved right after the Random
# Forest run was logged) instead of the generic `run` name, which any later
# `with mlflow.start_run(...) as run:` would silently rebind.
print(f"Run ID: {rf_run_id}")

Run ID: 8a781e4b2e4c4d509a765dda2b8c0ee3
