# End-to-End Training Pipeline for FLIR+SCD41 Fire Detection System

This notebook demonstrates the complete workflow for training the FLIR+SCD41 fire detection system:
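1. Dataset generation
2. Feature extraction
3. Dataset storage
4. Data splitting
5. Model training (XGBoost and LSTM)
6. Ensemble weight calculation
7. Model evaluation on the test set
8. Model saving
9. Results visualization
10. Training summary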

## System Overview
The system uses:
- FLIR Lepton 3.5 thermal camera (15 features)
- Sensirion SCD41 CO₂ sensor (3 features)
- Total: 18 features for fire detection

In [None]:
# Import required libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# Add project root to path so the `src.*` packages below can be imported.
# The root is taken to be two directory levels above the current working
# directory.
# NOTE(review): this assumes the notebook is launched from a folder two
# levels below the repository root - confirm.
project_root = os.path.dirname(os.path.dirname(os.getcwd()))
if project_root not in sys.path:
    # Re-running this cell must not pile up duplicate sys.path entries.
    sys.path.append(project_root)

# Import our custom modules
from src.data_generation.flir_scd41.flir_data_generator import FlirDataGenerator
from src.data_generation.flir_scd41.scd41_data_generator import Scd41DataGenerator
from src.feature_engineering.extractors.flir_thermal_extractor import FlirThermalExtractor
from src.feature_engineering.extractors.scd41_gas_extractor import Scd41GasExtractor
from src.training.flir_scd41.train_flir_scd41_model import FlirScd41XGBoost, FlirScd41LSTM

print("✅ All libraries imported successfully")

## 1. Dataset Generation

Generate synthetic training data for FLIR+SCD41 sensors

In [None]:
# Generate the raw synthetic sensor data
print("🔄 Generating synthetic FLIR+SCD41 dataset...")

# Sample count shared by both sensors (50K samples for training)
num_samples = 50000

# Instantiate one generator per sensor
flir_generator = FlirDataGenerator()
scd41_generator = Scd41DataGenerator()

# Ask each generator for the same number of samples
flir_data = flir_generator.generate_data(num_samples)
scd41_data = scd41_generator.generate_data(num_samples)

# Report how much data came back and in what shape
flir_count, scd41_count = len(flir_data), len(scd41_data)
print(f"✅ Generated {flir_count} FLIR samples and {scd41_count} SCD41 samples")
print(f"FLIR data shape: {flir_data.shape}")
print(f"SCD41 data shape: {scd41_data.shape}")

## 2. Feature Extraction

Extract features from raw sensor data

In [None]:
# Turn every raw sensor sample into a feature vector
print("🔄 Extracting features from sensor data...")

# One extractor per sensor
flir_extractor = FlirThermalExtractor()
scd41_extractor = Scd41GasExtractor()

# Accumulate per-sample feature vectors; sample k of the FLIR data is paired
# with sample k of the SCD41 data, and the FLIR length drives the loop.
flir_rows = []
scd41_rows = []
sample_count = len(flir_data)

for sample_idx in range(sample_count):
    # Thermal features (expected: 15 per sample)
    flir_rows.append(flir_extractor.extract_features(flir_data[sample_idx]))

    # Gas features (expected: 3 per sample)
    scd41_rows.append(scd41_extractor.extract_features(scd41_data[sample_idx]))

# Stack the per-sample vectors into arrays
flir_features = np.array(flir_rows)
scd41_features = np.array(scd41_rows)

print(f"✅ Extracted FLIR features: {flir_features.shape}")
print(f"✅ Extracted SCD41 features: {scd41_features.shape}")

## 3. Dataset Storage

Combine the FLIR and SCD41 features, attach synthetic labels, and save the resulting dataset to disk

In [None]:
# Assemble the feature matrix, attach labels and write the dataset to disk
print("💾 Combining features and creating dataset...")

# Column names, in the same order as the concatenated feature matrix:
# 15 FLIR thermal features followed by 3 SCD41 gas features
feature_names = [
    't_mean', 't_std', 't_max', 't_p95', 't_hot_area_pct',
    't_hot_largest_blob_pct', 't_grad_mean', 't_grad_std',
    't_diff_mean', 't_diff_std', 'flow_mag_mean', 'flow_mag_std',
    'tproxy_val', 'tproxy_delta', 'tproxy_vel',
    'gas_val', 'gas_delta', 'gas_vel'
]

# Place the two feature blocks side by side (15 FLIR + 3 SCD41 = 18 columns)
all_features = np.concatenate([flir_features, scd41_features], axis=1)

# Placeholder labels (fire detected or not).
# NOTE: these are drawn at random with a fixed seed and do not depend on the
# feature values; in a real scenario the labels would come from the data
# generation process.
np.random.seed(42)
fire_probability = 0.15  # 15% of samples are fire events
class_probabilities = [1-fire_probability, fire_probability]
labels = np.random.choice([0, 1], size=len(all_features), p=class_probabilities)

# Tabular view: one row per sample, label in the last column
df = pd.DataFrame(all_features, columns=feature_names)
df['fire_detected'] = labels

# Persist as CSV under <project_root>/data/flir_scd41
data_dir = os.path.join(project_root, 'data', 'flir_scd41')
os.makedirs(data_dir, exist_ok=True)
dataset_path = os.path.join(data_dir, 'flir_scd41_dataset.csv')
df.to_csv(dataset_path, index=False)

# Report location, size and class balance
fire_count = sum(labels)
print(f"✅ Dataset saved to {dataset_path}")
print(f"Dataset shape: {df.shape}")
print(f"Fire samples: {fire_count} ({fire_count/len(labels)*100:.2f}%)")

## 4. Data Splitting

Split the dataset into training, validation, and test sets

In [None]:
# Split dataset
print("📊 Splitting dataset into train/validation/test sets...")

# Separate features and labels
X = df.drop('fire_detected', axis=1).values
y = df['fire_detected'].values

# Split into train (70%), validation (15%), test (15%).
# Step 1: hold out 15% of all samples as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)

# Step 2: carve the validation set out of the remainder. The validation set is
# requested as an absolute sample count equal to the test set size, so both
# hold-out sets match exactly. The rounded ratio 0.176 (an approximation of
# 0.15/0.85) used previously left them with different sizes.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=len(y_test), random_state=42, stratify=y_temp
)

print(f"Train set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")


def _save_split(features, targets, filename):
    """Write one split to `data_dir` as CSV and return its DataFrame.

    Args:
        features: 2-D array with one column per entry of `feature_names`.
        targets: Labels aligned with the rows of `features`.
        filename: CSV file name created inside `data_dir`.

    Returns:
        The DataFrame that was written (features plus 'fire_detected').
    """
    split_df = pd.DataFrame(features, columns=feature_names)
    split_df['fire_detected'] = targets
    split_df.to_csv(os.path.join(data_dir, filename), index=False)
    return split_df


# Save splits
train_df = _save_split(X_train, y_train, 'train.csv')
val_df = _save_split(X_val, y_val, 'val.csv')
test_df = _save_split(X_test, y_test, 'test.csv')

print("✅ Dataset splits saved to disk")

## 5. Model Training

Train multiple models: XGBoost and LSTM

In [None]:
# Fit the XGBoost model and score it on the train and validation splits
print("🚀 Training XGBoost model...")

# Hyperparameters handed to the project's XGBoost wrapper
xgb_params = {
    'max_depth': 6,
    'eta': 0.3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}
xgb_model = FlirScd41XGBoost(**xgb_params)
xgb_model.fit(X_train, y_train)

# Hard predictions for both splits, plus scores for the validation AUC
xgb_train_pred = xgb_model.predict(X_train)
xgb_val_pred = xgb_model.predict(X_val)
xgb_val_pred_proba = xgb_model.predict_proba(X_val)


def _xgb_label_metrics(true_labels, predicted_labels):
    """Return accuracy, F1, precision and recall for hard predictions."""
    return {
        'accuracy': accuracy_score(true_labels, predicted_labels),
        'f1_score': f1_score(true_labels, predicted_labels),
        'precision': precision_score(true_labels, predicted_labels),
        'recall': recall_score(true_labels, predicted_labels)
    }


xgb_train_metrics = _xgb_label_metrics(y_train, xgb_train_pred)

xgb_val_metrics = _xgb_label_metrics(y_val, xgb_val_pred)
# AUC needs both classes present in the validation labels; otherwise report 0.0
if len(np.unique(y_val)) > 1:
    xgb_val_metrics['auc'] = roc_auc_score(y_val, xgb_val_pred_proba)
else:
    xgb_val_metrics['auc'] = 0.0

print("XGBoost Training Metrics:")
for metric, value in xgb_train_metrics.items():
    print(f"  {metric}: {value:.4f}")

print("\nXGBoost Validation Metrics:")
for metric, value in xgb_val_metrics.items():
    print(f"  {metric}: {value:.4f}")

In [None]:
# Set up the LSTM model, its device, loss and optimizer
print("\n🚀 Training LSTM model...")

# Network taking 18 input features and producing 2 class outputs
lstm_model = FlirScd41LSTM(input_size=18, hidden_size=64, num_layers=2, num_classes=2)

# Run on the GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
lstm_model.to(device)
print(f"Using device: {device}")

# Cross-entropy loss optimized with Adam at a learning rate of 1e-3
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.001)
# Create datasets and data loaders
class FlirScd41Dataset(Dataset):
    """In-memory dataset pairing feature rows with integer class labels.

    Features are stored as a float tensor in `data` and labels as a long
    tensor in `labels`; item `idx` is the pair `(data[idx], labels[idx])`.
    """

    def __init__(self, data, labels):
        # Convert once up front so indexing later is a plain tensor lookup
        self.labels = torch.LongTensor(labels)
        self.data = torch.FloatTensor(data)

    def __len__(self):
        # Number of samples = size of the first dimension of the features
        return self.data.shape[0]

    def __getitem__(self, idx):
        features = self.data[idx]
        target = self.labels[idx]
        return features, target

# Wrap the train/validation splits for mini-batch iteration
train_dataset = FlirScd41Dataset(X_train, y_train)
val_dataset = FlirScd41Dataset(X_val, y_val)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Training loop
num_epochs = 30

# Best-epoch bookkeeping. The starting accuracy is below any reachable value
# so the first epoch always writes a checkpoint, even if its validation
# accuracy is 0.0.
best_val_acc = -1.0
best_val_preds = []
best_val_targets = []
best_lstm_path = os.path.join(data_dir, 'best_lstm_model.pth')

for epoch in range(num_epochs):
    # Training
    lstm_model.train()
    train_loss = 0.0
    train_preds = []
    train_targets = []

    for batch_data, batch_labels in train_loader:
        batch_data, batch_labels = batch_data.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        outputs = lstm_model(batch_data)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
        train_targets.extend(batch_labels.cpu().numpy())

    # Validation
    lstm_model.eval()
    val_loss = 0.0
    val_preds = []
    val_targets = []

    with torch.no_grad():
        for batch_data, batch_labels in val_loader:
            batch_data, batch_labels = batch_data.to(device), batch_labels.to(device)

            outputs = lstm_model(batch_data)
            loss = criterion(outputs, batch_labels)

            val_loss += loss.item()
            val_preds.extend(torch.argmax(outputs, dim=1).cpu().numpy())
            val_targets.extend(batch_labels.cpu().numpy())

    # Calculate metrics
    train_acc = accuracy_score(train_targets, train_preds)
    val_acc = accuracy_score(val_targets, val_preds)

    # Save best model together with the predictions it produced, so the
    # metrics reported after training describe the checkpoint on disk.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_val_preds = list(val_preds)
        best_val_targets = list(val_targets)
        torch.save(lstm_model.state_dict(), best_lstm_path)

    # Print progress every 5 epochs
    if (epoch + 1) % 5 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}]')
        print(f'  Train Loss: {train_loss/len(train_loader):.4f}, Train Accuracy: {train_acc:.4f}')
        print(f'  Validation Loss: {val_loss/len(val_loader):.4f}, Validation Accuracy: {val_acc:.4f}')

# Restore the best checkpoint. Without this the in-memory model holds the
# last epoch's weights, so later evaluation would describe a different model
# than the one saved as 'best_lstm_model.pth'.
lstm_model.load_state_dict(torch.load(best_lstm_path, map_location=device))
lstm_model.eval()

# Final LSTM metrics: every entry comes from the best validation epoch
# (previously accuracy came from the best epoch and the rest from the last).
lstm_val_metrics = {
    'accuracy': best_val_acc,
    'f1_score': f1_score(best_val_targets, best_val_preds),
    'precision': precision_score(best_val_targets, best_val_preds),
    'recall': recall_score(best_val_targets, best_val_preds)
}

print("\nLSTM Validation Metrics:")
for metric, value in lstm_val_metrics.items():
    print(f"  {metric}: {value:.4f}")

## 6. Ensemble Weight Calculation

Calculate heuristic ensemble weights from each model's validation accuracy

In [None]:
# Gather the validation scores that drive the ensemble weighting
print("⚖️ Calculating ensemble weights...")

# Validation accuracy is the performance measure used for both models
xgb_score, lstm_score = xgb_val_metrics['accuracy'], lstm_val_metrics['accuracy']

print(f"XGBoost validation accuracy: {xgb_score:.4f}")
print(f"LSTM validation accuracy: {lstm_score:.4f}")

# Method 1: Performance-based weighting (exponential scaling)
def calculate_performance_weights(scores, scaling_factor=2.0):
    """Turn model performance scores into ensemble weights that sum to 1.

    Scores are min-max normalized to [0, 1], passed through
    ``exp(scaling_factor * normalized_score)`` and divided by their total.

    Note: because of the min-max step, two models with different scores are
    always normalized to 0 and 1, so their weights depend only on
    ``scaling_factor`` and not on how far apart the scores are.

    Args:
        scores: Iterable of numeric performance scores, one per model.
        scaling_factor: Multiplier applied inside the exponential; larger
            values shift more weight to the best-scoring model.

    Returns:
        A list of weights in the same order as ``scores``. Equal scores give
        equal weights; empty input gives an empty list.
    """
    scores = list(scores)
    if not scores:
        # Nothing to weight; min()/max() would raise on an empty sequence
        return []

    # Normalize scores to [0, 1] range
    min_score = min(scores)
    max_score = max(scores)

    if max_score == min_score:
        # All models have same performance, equal weights
        return [1.0 / len(scores)] * len(scores)

    score_range = max_score - min_score
    normalized_scores = [(score - min_score) / score_range for score in scores]

    # Apply exponential scaling
    weighted_scores = [np.exp(scaling_factor * score) for score in normalized_scores]

    # Normalize to sum to 1
    total_weight = sum(weighted_scores)
    return [w / total_weight for w in weighted_scores]

# Derive the ensemble weights from the two validation scores
scaling_factor = 2.0  # same value as the function default, named so it is recorded below
model_scores = [xgb_score, lstm_score]
ensemble_weights = calculate_performance_weights(model_scores, scaling_factor=scaling_factor)

# Weights come back in the same order as model_scores: XGBoost first, LSTM second
xgb_weight, lstm_weight = ensemble_weights[0], ensemble_weights[1]
print(f"\nEnsemble weights:")
print(f"  XGBoost weight: {xgb_weight:.4f}")
print(f"  LSTM weight: {lstm_weight:.4f}")

# Record the weights together with the inputs and method that produced them
weights_data = {
    'models': ['xgboost', 'lstm'],
    'weights': ensemble_weights,
    'validation_scores': {
        'xgboost': xgb_score,
        'lstm': lstm_score
    },
    'calculation_method': 'performance_based_exponential_scaling',
    'scaling_factor': scaling_factor
}

# Write the record as JSON next to the dataset files
weights_path = os.path.join(data_dir, 'ensemble_weights.json')
with open(weights_path, 'w') as weights_file:
    weights_file.write(json.dumps(weights_data, indent=2))

print(f"\n✅ Ensemble weights saved to {weights_path}")

## 7. Model Evaluation on Test Set

Evaluate XGBoost, the LSTM, and their weighted ensemble on the test set

In [None]:
# Score each model, and their weighted blend, on the held-out test split
print("🧪 Evaluating models on test set...")


def _score_test_predictions(predicted_labels, predicted_scores):
    """Return accuracy, F1, precision, recall and AUC against `y_test`."""
    return {
        'accuracy': accuracy_score(y_test, predicted_labels),
        'f1_score': f1_score(y_test, predicted_labels),
        'precision': precision_score(y_test, predicted_labels),
        'recall': recall_score(y_test, predicted_labels),
        'auc': roc_auc_score(y_test, predicted_scores)
    }


# XGBoost: hard labels plus scores
xgb_test_pred = xgb_model.predict(X_test)
xgb_test_pred_proba = xgb_model.predict_proba(X_test)

# LSTM: one forward pass over the whole test set, gradients disabled
lstm_model.eval()
with torch.no_grad():
    test_data = torch.FloatTensor(X_test).to(device)
    lstm_outputs = lstm_model(test_data)
    # Softmax probability of class index 1
    lstm_test_pred_proba = torch.softmax(lstm_outputs, dim=1)[:, 1].cpu().numpy()
# Threshold the class-1 probability at 0.5 to get hard labels
lstm_test_pred = (lstm_test_pred_proba > 0.5).astype(int)

# Ensemble: weighted average of the two models' scores, thresholded at 0.5
xgb_weight, lstm_weight = ensemble_weights[0], ensemble_weights[1]
ensemble_pred_proba = (
    xgb_weight * xgb_test_pred_proba +
    lstm_weight * lstm_test_pred_proba
)
ensemble_test_pred = (ensemble_pred_proba > 0.5).astype(int)

# Same metric set for all three predictors
xgb_test_metrics = _score_test_predictions(xgb_test_pred, xgb_test_pred_proba)
lstm_test_metrics = _score_test_predictions(lstm_test_pred, lstm_test_pred_proba)
ensemble_test_metrics = _score_test_predictions(ensemble_test_pred, ensemble_pred_proba)

# Report, one section per predictor
print("Test Set Performance:")
for header, model_metrics in (
    ("\nXGBoost:", xgb_test_metrics),
    ("\nLSTM:", lstm_test_metrics),
    ("\nEnsemble:", ensemble_test_metrics),
):
    print(header)
    for metric, value in model_metrics.items():
        print(f"  {metric}: {value:.4f}")

## 8. Model Saving

Save all trained models and components

In [None]:
# Persist the trained artifacts and a JSON manifest describing them
print("💾 Saving trained models...")


def _artifact_path(filename):
    """Return the path of `filename` inside the output directory `data_dir`."""
    return os.path.join(data_dir, filename)


# XGBoost: model file first, then the scaler held by the wrapper
xgb_model_path = _artifact_path('flir_scd41_xgboost_model.json')
xgb_model.model.save_model(xgb_model_path)

xgb_scaler_path = _artifact_path('flir_scd41_xgboost_scaler.joblib')
joblib.dump(xgb_model.scaler, xgb_scaler_path)

# LSTM: the checkpoint was already written during training; only its path is needed
lstm_model_path = _artifact_path('best_lstm_model.pth')

# Manifest: where each artifact lives plus the test metrics of each predictor
xgboost_entry = {
    'model_path': xgb_model_path,
    'scaler_path': xgb_scaler_path,
    'metrics': xgb_test_metrics
}
lstm_entry = {
    'model_path': lstm_model_path,
    'metrics': lstm_test_metrics
}
ensemble_entry = {
    'weights_path': weights_path,
    'metrics': ensemble_test_metrics
}
model_info = {
    'xgboost': xgboost_entry,
    'lstm': lstm_entry,
    'ensemble': ensemble_entry,
    'feature_names': feature_names,
    'training_date': datetime.now().isoformat()
}

model_info_path = _artifact_path('model_info.json')
with open(model_info_path, 'w') as info_file:
    info_file.write(json.dumps(model_info, indent=2))

print(f"✅ XGBoost model saved to {xgb_model_path}")
print(f"✅ XGBoost scaler saved to {xgb_scaler_path}")
print(f"✅ LSTM model saved to {lstm_model_path}")
print(f"✅ Model information saved to {model_info_path}")

## 9. Results Visualization

Visualize model performance and results

In [None]:
# Draw a grouped bar chart comparing the three predictors on the test set
print("📊 Creating performance visualization...")

# Metrics shown on the x-axis, in display order
metrics = ['accuracy', 'f1_score', 'precision', 'recall', 'auc']

# Pull each predictor's scores in that same order
xgb_scores, lstm_scores, ensemble_scores = [], [], []
for metric_name in metrics:
    xgb_scores.append(xgb_test_metrics[metric_name])
    lstm_scores.append(lstm_test_metrics[metric_name])
    ensemble_scores.append(ensemble_test_metrics[metric_name])

# One group per metric; the three bars of a group sit side by side
width = 0.25
x = np.arange(len(metrics))

fig, ax = plt.subplots(figsize=(12, 6))
bars1 = ax.bar(x - width, xgb_scores, width, label='XGBoost', color='skyblue')
bars2 = ax.bar(x, lstm_scores, width, label='LSTM', color='lightcoral')
bars3 = ax.bar(x + width, ensemble_scores, width, label='Ensemble', color='lightgreen')

# Title, axis labels, tick labels, legend and a fixed 0-1 score range
ax.set_title('FLIR+SCD41 Fire Detection Model Performance Comparison')
ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.set_ylim(0, 1)
ax.legend()

# Add value labels on bars
def add_value_labels(bars, axes=None):
    """Write each bar's height, to three decimals, just above the bar.

    Args:
        bars: Iterable of bar artists providing ``get_height()``,
            ``get_x()`` and ``get_width()``.
        axes: Axes object whose ``annotate`` method is used. When omitted,
            the notebook-level ``ax`` is used, so existing one-argument
            calls behave as before.
    """
    target_axes = ax if axes is None else axes
    for bar in bars:
        height = bar.get_height()
        # Anchor at the top centre of the bar, then nudge the text 3 points up
        target_axes.annotate(f'{height:.3f}',
                             xy=(bar.get_x() + bar.get_width() / 2, height),
                             xytext=(0, 3),
                             textcoords="offset points",
                             ha='center', va='bottom')

# Label every bar group in drawing order, then render the figure
for bar_group in (bars1, bars2, bars3):
    add_value_labels(bar_group)

plt.tight_layout()
plt.show()

# Print summary
print("\n🏆 Model Performance Summary:")
print(f"XGBoost Accuracy: {xgb_test_metrics['accuracy']:.4f}")
print(f"LSTM Accuracy: {lstm_test_metrics['accuracy']:.4f}")
print(f"Ensemble Accuracy: {ensemble_test_metrics['accuracy']:.4f}")

# Pick the winner by test accuracy. The ensemble wins only when strictly
# better than both single models; XGBoost wins only when strictly better
# than the LSTM; every remaining case (ties included) goes to the LSTM.
xgb_accuracy = xgb_test_metrics['accuracy']
lstm_accuracy = lstm_test_metrics['accuracy']
ensemble_accuracy = ensemble_test_metrics['accuracy']

if ensemble_accuracy > max(xgb_accuracy, lstm_accuracy):
    best_model_name = 'Ensemble'
elif xgb_accuracy > lstm_accuracy:
    best_model_name = 'XGBoost'
else:
    best_model_name = 'LSTM'
print(f"\nBest Model: {best_model_name}")

## 10. Training Summary

Summary of the entire training process

In [None]:
# Final recap: dataset size, split sizes, test accuracy, weights and output files
fire_count = sum(labels)
fire_percent = fire_count/len(labels)*100
xgb_weight, lstm_weight = ensemble_weights[0], ensemble_weights[1]

# Paths of the three split files, rebuilt from the output directory
train_csv_path = os.path.join(data_dir, 'train.csv')
val_csv_path = os.path.join(data_dir, 'val.csv')
test_csv_path = os.path.join(data_dir, 'test.csv')

print("🏁 Training Process Summary")
print("="*50)

# Dataset and split sizes
print(f"Dataset Size: {len(df):,} samples")
print(f"Features: {len(feature_names)} (15 FLIR + 3 SCD41)")
print(f"Fire Samples: {fire_count:,} ({fire_percent:.2f}%)")
print(f"Training Samples: {len(X_train):,}")
print(f"Validation Samples: {len(X_val):,}")
print(f"Test Samples: {len(X_test):,}")
print()

# Test-set accuracy of each predictor
print("Model Performance (Test Set):")
print(f"  XGBoost Accuracy: {xgb_test_metrics['accuracy']:.4f}")
print(f"  LSTM Accuracy: {lstm_test_metrics['accuracy']:.4f}")
print(f"  Ensemble Accuracy: {ensemble_test_metrics['accuracy']:.4f}")
print()

# Weights used for the ensemble
print("Ensemble Weights:")
print(f"  XGBoost: {xgb_weight:.4f}")
print(f"  LSTM: {lstm_weight:.4f}")
print()

# Everything written to disk by the pipeline
print("Files Saved:")
print(f"  Dataset: {dataset_path}")
print(f"  Train Split: {train_csv_path}")
print(f"  Validation Split: {val_csv_path}")
print(f"  Test Split: {test_csv_path}")
print(f"  XGBoost Model: {xgb_model_path}")
print(f"  LSTM Model: {lstm_model_path}")
print(f"  Ensemble Weights: {weights_path}")
print(f"  Model Info: {model_info_path}")
print()
print("✅ End-to-end training pipeline completed successfully!")