# Hard Drive Failure Prediction - MLOps Experiment Tracking

This notebook demonstrates:
- **Iteration 1**: Baseline Logistic Regression model
- **Iteration 2**: Improved Random Forest model
- MLflow experiment tracking for both iterations

In [None]:
# Import required libraries
import os
import warnings

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

## 1. Load and Explore Data

In [None]:
# Read the cleaned dataset produced in CW1, report its size and columns,
# then preview the first rows as the cell output.
df = pd.read_csv('../data/processed/cleaned_hdd_from_faulty.csv')
print(f"Dataset shape: {df.shape}", f"\nColumn names: {df.columns.tolist()}", sep="\n")
df.head()

In [None]:
# Class balance of the target: per-class counts, then the share of failures
failure_flags = df['failure']
print("Target distribution:")
print(failure_flags.value_counts())
print(f"\nFailure rate: {failure_flags.mean()*100:.2f}%")

In [None]:
# Print the frame's summary (column names, dtypes and non-null counts) to
# spot missing values or unexpected types before modeling
df.info()

## 2. Prepare Features and Target

In [None]:
# Model inputs: the preprocessed columns carried over from CW1
feature_columns = [
    'capacity_bytes',
    'lifetime',
    'model_encoded',
]

# Feature matrix and target column
X, y = df[feature_columns], df['failure']

print(f"Features shape: {X.shape}", f"Target shape: {y.shape}", sep="\n")
print("\nFeature statistics:")
X.describe()

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded (random_state=42) so it is reproducible across runs.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set size: {len(X_train)}", f"Test set size: {len(X_test)}", sep="\n")
print("\nTraining target distribution:")
print(y_train.value_counts(normalize=True))

## 3. Setup MLflow Tracking

In [None]:
# Point MLflow at the tracking server and select the experiment for all runs
experiment_name = "hdd_failure_prediction"

# Local MLflow server running in Docker
mlflow.set_tracking_uri("http://127.0.0.1:8080")
# Create or set experiment
mlflow.set_experiment(experiment_name)

print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}", f"Experiment: {experiment_name}", sep="\n")

In [None]:
# Helper that scores a fitted classifier on held-out data (no logging here)
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on a test set.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Test features passed to the estimator.
        y_test: True labels for ``X_test``.

    Returns:
        Tuple ``(metrics, y_pred, y_pred_proba)``. ``metrics`` maps
        'accuracy', 'precision', 'recall', 'f1_score' and 'roc_auc' to
        their scores, ``y_pred`` holds the output of ``predict`` and
        ``y_pred_proba`` is column 1 of ``predict_proba``.
    """
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # Scorers that work on hard labels and share the zero_division=0 setting
    label_scorers = {
        'precision': precision_score,
        'recall': recall_score,
        'f1_score': f1_score,
    }

    metrics = {'accuracy': accuracy_score(y_test, y_pred)}
    for name, scorer in label_scorers.items():
        metrics[name] = scorer(y_test, y_pred, zero_division=0)
    # ROC AUC is computed from the probability scores, not the hard labels
    metrics['roc_auc'] = roc_auc_score(y_test, y_pred_proba)

    return metrics, y_pred, y_pred_proba

def plot_confusion_matrix(y_test, y_pred, title):
    """Plot a confusion matrix heatmap, save it as a PNG and display it.

    The file name is derived from ``title`` (spaces replaced by underscores,
    lower-cased) and written under ``../reports/figures/``. The directory is
    created when it does not exist yet.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.
        title: Figure title; also the basis of the output file name.

    Returns:
        str: Path of the saved PNG file.
    """
    # Build the output path once so the saved file and the returned path
    # cannot drift apart
    figure_path = f'../reports/figures/{title.replace(" ", "_").lower()}.png'
    # savefig does not create missing parent directories
    os.makedirs(os.path.dirname(figure_path), exist_ok=True)

    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(title)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    plt.savefig(figure_path, dpi=150, bbox_inches='tight')
    plt.show()
    return figure_path

## 4. ITERATION 1: Baseline Logistic Regression Model

For our baseline, we use Logistic Regression - a simple, interpretable model that works well for binary classification.

In [None]:
# ITERATION 1: Logistic Regression Baseline
# Trains the baseline and records its parameters, test metrics and the
# fitted model in a dedicated MLflow run.
with mlflow.start_run(run_name="iteration_1_logistic_regression") as run_lr:

    # Model hyperparameters
    # Named once so the value logged to MLflow and the value passed to the
    # estimator cannot diverge
    max_iter = 1000

    # Log parameters (including max_iter, the only non-default model setting)
    mlflow.log_params({
        "algorithm": "LogisticRegression",
        "features": ",".join(feature_columns),
        "n_features": len(feature_columns),
        "max_iter": max_iter,
        "test_size": 0.2,
        "random_state": 42,
        "iteration": 1,
    })

    # Train model
    model_lr = LogisticRegression(random_state=42, max_iter=max_iter)
    model_lr.fit(X_train, y_train)

    # Evaluate on the held-out test set
    metrics_lr, y_pred_lr, y_pred_proba_lr = evaluate_model(model_lr, X_test, y_test)

    # Log metrics
    mlflow.log_metrics(metrics_lr)

    # Log model under the artifact path "model"
    mlflow.sklearn.log_model(model_lr, "model")

    # Print results
    print("=" * 50)
    print("ITERATION 1: Logistic Regression Results")
    print("=" * 50)
    for metric_name, metric_value in metrics_lr.items():
        print(f"{metric_name}: {metric_value:.4f}")

    # Keep the run id for the final summary cell
    run_id_lr = run_lr.info.run_id
    print(f"\nMLflow Run ID: {run_id_lr}")

In [None]:
# Confusion matrix of the baseline's test predictions; the cell output is
# the path of the saved figure
plot_confusion_matrix(
    y_test=y_test,
    y_pred=y_pred_lr,
    title="Iteration 1 Logistic Regression Confusion Matrix",
)

## 5. ITERATION 2: Improved Random Forest Model

For iteration 2, we use Random Forest - an ensemble method that typically provides better performance for complex patterns.

In [None]:
# ITERATION 2: Random Forest (Improved Model)
# Trains the Random Forest and records its parameters, test metrics, fitted
# model and feature importances in a dedicated MLflow run.
with mlflow.start_run(run_name="iteration_2_random_forest") as run_rf:

    # Model hyperparameters
    n_estimators = 100
    max_depth = 10
    min_samples_split = 5

    # Log parameters
    mlflow.log_params({
        "algorithm": "RandomForest",
        "features": ",".join(feature_columns),
        "n_features": len(feature_columns),
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "test_size": 0.2,
        "random_state": 42,
        "iteration": 2,
    })

    # Train model
    model_rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42,
        n_jobs=-1
    )
    model_rf.fit(X_train, y_train)

    # Evaluate on the held-out test set
    metrics_rf, y_pred_rf, y_pred_proba_rf = evaluate_model(model_rf, X_test, y_test)

    # Log metrics
    mlflow.log_metrics(metrics_rf)

    # Log model under the artifact path "model" (used later for registration)
    mlflow.sklearn.log_model(model_rf, "model")

    # Log feature importance
    feature_importance = pd.DataFrame({
        'feature': feature_columns,
        'importance': model_rf.feature_importances_
    }).sort_values('importance', ascending=False)
    # Store the table with the run so it is tracked, not only printed below
    mlflow.log_text(feature_importance.to_csv(index=False), "feature_importance.csv")

    # Print results
    print("=" * 50)
    print("ITERATION 2: Random Forest Results")
    print("=" * 50)
    for metric_name, metric_value in metrics_rf.items():
        print(f"{metric_name}: {metric_value:.4f}")

    print("\nFeature Importance:")
    print(feature_importance)

    # Keep the run id for model registration and the final summary cell
    run_id_rf = run_rf.info.run_id
    print(f"\nMLflow Run ID: {run_id_rf}")

In [None]:
# Confusion matrix of the Random Forest's test predictions; the cell output
# is the path of the saved figure
plot_confusion_matrix(
    y_test=y_test,
    y_pred=y_pred_rf,
    title="Iteration 2 Random Forest Confusion Matrix",
)

In [None]:
# Plot feature importance
# Horizontal bars, one per feature, in the order of the feature_importance
# table built in the Random Forest run (sorted by descending importance).
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Random Forest Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
# savefig does not create missing parent directories
os.makedirs('../reports/figures', exist_ok=True)
plt.savefig('../reports/figures/feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Compare Iterations

In [None]:
# Compare both iterations
# Pair the two result sets by metric name instead of relying on both dicts
# having the same key order.
metric_names = list(metrics_lr.keys())
comparison = pd.DataFrame({
    'Metric': metric_names,
    'Iteration 1 (Logistic Regression)': [metrics_lr[name] for name in metric_names],
    'Iteration 2 (Random Forest)': [metrics_rf[name] for name in metric_names]
})
comparison['Improvement'] = comparison['Iteration 2 (Random Forest)'] - comparison['Iteration 1 (Logistic Regression)']
# A baseline score of 0 (e.g. precision/recall when the baseline predicts no
# failures) has no meaningful relative change; report NaN instead of inf.
baseline_nonzero = comparison['Iteration 1 (Logistic Regression)'].replace(0, np.nan)
comparison['Improvement %'] = (comparison['Improvement'] / baseline_nonzero * 100).round(2)

print("=" * 70)
print("MODEL COMPARISON: Iteration 1 vs Iteration 2")
print("=" * 70)
print(comparison.to_string(index=False))

In [None]:
# Visualize comparison
# Grouped bar chart: one pair of bars per metric, baseline on the left and
# Random Forest on the right.
fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(comparison['Metric']))
width = 0.35

# Shift each series by half a bar width so the pair is centred on its tick
bars1 = ax.bar(x - width/2, comparison['Iteration 1 (Logistic Regression)'], width, label='Iteration 1 (LR)', color='steelblue')
bars2 = ax.bar(x + width/2, comparison['Iteration 2 (Random Forest)'], width, label='Iteration 2 (RF)', color='darkorange')

ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Model Performance Comparison: Iteration 1 vs Iteration 2')
ax.set_xticks(x)
ax.set_xticklabels(comparison['Metric'])
ax.legend()
# Headroom above 1.0 leaves space for the value labels
ax.set_ylim(0, 1.1)

# Add value labels on bars (both series share the same label style)
for bar in (*bars1, *bars2):
    height = bar.get_height()
    ax.annotate(f'{height:.3f}', xy=(bar.get_x() + bar.get_width()/2, height),
                xytext=(0, 3), textcoords="offset points", ha='center', va='bottom', fontsize=8)

plt.tight_layout()
# savefig does not create missing parent directories
os.makedirs('../reports/figures', exist_ok=True)
plt.savefig('../reports/figures/model_comparison.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Register Best Model in MLflow Model Registry

In [None]:
# Add the Random Forest run's logged model to the MLflow Model Registry
model_name = "hdd_failure_predictor"

# The URI points at the artifact logged as "model" in the Random Forest run
model_uri = f"runs:/{run_id_rf}/model"
registered_model = mlflow.register_model(model_uri=model_uri, name=model_name)

print(f"Model registered: {model_name}", f"Version: {registered_model.version}", sep="\n")

In [None]:
# Move the newly registered model version into the "Staging" stage
# NOTE(review): model version stages are reported as deprecated in recent
# MLflow 2.x releases in favour of aliases - confirm against the installed version
from mlflow.tracking import MlflowClient

client = MlflowClient()

staging_request = dict(
    name=model_name,
    version=registered_model.version,
    stage="Staging",
)
client.transition_model_version_stage(**staging_request)

print(f"Model {model_name} version {registered_model.version} transitioned to Staging")

## 8. Summary

### Key Findings:
- **Iteration 1 (Logistic Regression)**: Baseline model providing initial performance benchmarks
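- **Iteration 2 (Random Forest)**: Ensemble model intended to improve on the baseline through better handling of non-linear relationships; see the comparison table in Section 6 for the measured change in each metric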

### MLflow Tracking:
- Both iterations logged to MLflow experiment: `hdd_failure_prediction`
- Parameters, metrics, and models tracked for comparison
- Best model (Random Forest, Iteration 2) registered in the MLflow Model Registry as `hdd_failure_predictor` and transitioned to Staging

### Next Steps:
1. Export to production Python script
2. Create testing scripts for reproducibility and performance regression
3. Deploy model using MLflow serving

In [None]:
# Closing banner: where to find the runs and what this notebook produced
banner = "=" * 60
print("\n" + banner)
print("EXPERIMENT COMPLETE")
print(banner)
# Report the URI MLflow is actually configured with rather than repeating the
# literal, so this line stays correct if the tracking server changes
print(f"\nView experiments at: {mlflow.get_tracking_uri()}")
print(f"Experiment name: {experiment_name}")
print(f"\nIteration 1 Run ID: {run_id_lr}")
print(f"Iteration 2 Run ID: {run_id_rf}")
print(f"\nRegistered Model: {model_name}")