# Model Training with MLflow Tracking

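This notebook trains two baseline regression models (linear regression and a random forest) to predict absenteeism time in hours, and tracks each run's parameters, metrics, and model artifacts with MLflow.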

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn
import warnings
warnings.filterwarnings('ignore')

# Load the cleaned data
df = pd.read_csv('../data/processed/absenteeism_cleaned.csv')

# Separate the regression target from the predictors.
# Besides the target and the row identifier, the Has_*/Is_* columns are
# excluded; they are presumably duplicates of 'Disciplinary failure',
# 'Social drinker' and 'Social smoker' (verify against the cleaning step).
target = 'Absenteeism time in hours'
features_to_drop = [
    target,
    'ID',
    'Has_Disciplinary_Failure',
    'Is_Social_Drinker',
    'Is_Social_Smoker',
]  # Drop duplicates
y = df[target]
X = df.drop(columns=features_to_drop)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize features: statistics are learned from the training split only
# and then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nFeatures used: {list(X.columns)}")

Training set: (592, 19)
Test set: (148, 19)

Features used: ['Reason for absence', 'Month of absence', 'Day of the week', 'Seasons', 'Transportation expense', 'Distance from Residence to Work', 'Service time', 'Age', 'Work load Average/day', 'Hit target', 'Disciplinary failure', 'Education', 'Son', 'Social drinker', 'Social smoker', 'Pet', 'Weight', 'Height', 'Body mass index']


In [7]:
# Set MLflow experiment - MUST run this cell first before any training!
import os
from pathlib import Path

import mlflow

# Resolve the tracking store without hardcoding a machine-specific path:
#   1. use MLFLOW_TRACKING_URI when the environment provides one;
#   2. otherwise fall back to <project root>/mlruns, located relative to the
#      notebook's working directory the same way the data is read from ../data.
# NOTE(review): assumes the notebook runs from a directory directly under the
# project root - confirm if the notebook is moved.
default_tracking_dir = Path("..", "mlruns").resolve()
tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", default_tracking_dir.as_uri())
mlflow.set_tracking_uri(tracking_uri)

# Create/set experiment (created on first use, reused afterwards)
experiment = mlflow.set_experiment("absenteeism-prediction")
print(f"Experiment ID: {experiment.experiment_id}")
print(f"Artifact Location: {experiment.artifact_location}")

# Function to evaluate models
def evaluate_model(y_true, y_pred):
    """Score regression predictions against the observed targets.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned element-wise with ``y_true``.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, root mean squared
        error (square root of sklearn's mean squared error) and the R² score.
    """
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )

# Confirm that the tracking setup finished and which experiment is active
print(
    "\nMLflow tracking setup complete!",
    "Experiment: absenteeism-prediction",
    sep="\n",
)

2025/10/01 14:16:09 INFO mlflow.tracking.fluent: Experiment with name 'absenteeism-prediction' does not exist. Creating a new experiment.


Experiment ID: 385053282433394744
Artifact Location: file:///C:/Users/Alexis/mlops-absenteeism-project/mlruns/385053282433394744

MLflow tracking setup complete!
Experiment: absenteeism-prediction


In [8]:
from sklearn.linear_model import LinearRegression

# Baseline 1: ordinary least squares, tracked as its own MLflow run
with mlflow.start_run(run_name="Linear_Regression"):
    # Train model on the standardized training features
    lr_model = LinearRegression()
    lr_model.fit(X_train_scaled, y_train)

    # Predictions on both splits so train/test gaps are visible
    y_pred_train = lr_model.predict(X_train_scaled)
    y_pred_test = lr_model.predict(X_test_scaled)

    # Evaluate
    train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_pred_train)
    test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_pred_test)

    # Log parameters and metrics (metrics are sent as a single batch)
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_metrics({
        "train_mae": train_mae,
        "train_rmse": train_rmse,
        "train_r2": train_r2,
        "test_mae": test_mae,
        "test_rmse": test_rmse,
        "test_r2": test_r2,
    })

    # Log model
    mlflow.sklearn.log_model(lr_model, "model")
    # The model was fit on standardized inputs, so it is only reusable together
    # with the fitted scaler; persist the scaler alongside it in the same run.
    mlflow.sklearn.log_model(scaler, "scaler")

    print(f"Linear Regression Results:")
    print(f"  Train - MAE: {train_mae:.2f}, RMSE: {train_rmse:.2f}, R2: {train_r2:.3f}")
    print(f"  Test  - MAE: {test_mae:.2f}, RMSE: {test_rmse:.2f}, R2: {test_r2:.3f}")



Linear Regression Results:
  Train - MAE: 6.14, RMSE: 12.81, R2: 0.156
  Test  - MAE: 5.44, RMSE: 9.96, R2: 0.090


In [9]:
from sklearn.ensemble import RandomForestRegressor

# Single source of truth for the hyperparameters: the same dict configures the
# estimator and is logged to MLflow, so the tracked values cannot drift from
# the ones actually used (and the seed is recorded too).
rf_params = {"n_estimators": 100, "max_depth": 10, "random_state": 42}

# Baseline 2: random forest, tracked as its own MLflow run
with mlflow.start_run(run_name="Random_Forest"):
    # Train model on the standardized training features
    rf_model = RandomForestRegressor(**rf_params)
    rf_model.fit(X_train_scaled, y_train)

    # Predictions on both splits so train/test gaps are visible
    y_pred_train = rf_model.predict(X_train_scaled)
    y_pred_test = rf_model.predict(X_test_scaled)

    # Evaluate
    train_mae, train_rmse, train_r2 = evaluate_model(y_train, y_pred_train)
    test_mae, test_rmse, test_r2 = evaluate_model(y_test, y_pred_test)

    # Log parameters and metrics (metrics are sent as a single batch)
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_params(rf_params)
    mlflow.log_metrics({
        "train_mae": train_mae,
        "train_rmse": train_rmse,
        "train_r2": train_r2,
        "test_mae": test_mae,
        "test_rmse": test_rmse,
        "test_r2": test_r2,
    })

    # Log model
    mlflow.sklearn.log_model(rf_model, "model")
    # The model was fit on standardized inputs, so it is only reusable together
    # with the fitted scaler; persist the scaler alongside it in the same run.
    mlflow.sklearn.log_model(scaler, "scaler")

    print(f"Random Forest Results:")
    print(f"  Train - MAE: {train_mae:.2f}, RMSE: {train_rmse:.2f}, R2: {train_r2:.3f}")
    print(f"  Test  - MAE: {test_mae:.2f}, RMSE: {test_rmse:.2f}, R2: {test_r2:.3f}")



Random Forest Results:
  Train - MAE: 2.66, RMSE: 5.70, R2: 0.833
  Test  - MAE: 5.33, RMSE: 12.28, R2: -0.384


In [10]:
# Get the best run
# search_runs returns every run recorded in the experiment, including runs
# from earlier executions of this notebook.
runs = mlflow.search_runs(experiment_names=["absenteeism-prediction"])

# The metric column only exists once at least one run has logged test_r2, and
# individual runs may still lack it (NaN). Keep only comparable runs and fail
# with a clear message instead of an opaque KeyError/ValueError from idxmax.
if "metrics.test_r2" in runs.columns:
    scored_runs = runs.dropna(subset=["metrics.test_r2"])
else:
    scored_runs = runs.iloc[0:0]

if scored_runs.empty:
    raise RuntimeError(
        "No runs with a 'test_r2' metric found in experiment "
        "'absenteeism-prediction'; run the training cells first."
    )

# Highest test R² wins
best_run = scored_runs.loc[scored_runs['metrics.test_r2'].idxmax()]

print("="*50)
print("BEST MODEL SUMMARY")
print("="*50)
print(f"Model Type: {best_run['params.model_type']}")
print(f"Test R²: {best_run['metrics.test_r2']:.3f}")
print(f"Test MAE: {best_run['metrics.test_mae']:.2f} hours")
print(f"Test RMSE: {best_run['metrics.test_rmse']:.2f} hours")
print("\nTo view all experiments in MLflow UI, run: mlflow ui")

BEST MODEL SUMMARY
Model Type: LinearRegression
Test R²: 0.090
Test MAE: 5.44 hours
Test RMSE: 9.96 hours

To view all experiments in MLflow UI, run: mlflow ui
