## 1. Load Saved Model

Load the best-performing model (HistGradientBoosting) from disk.

In [None]:
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
import json

# Resolve where the serialized pipeline lives on disk
models_dir = Path('models')
model_path = models_dir / 'histgradientboosting_model.joblib'

# Deserialize the trained pipeline stored at model_path
print(f"Loading model from {model_path}...")
best_model = joblib.load(model_path)

# Status report: one print call, one line per message (blank line before the
# component list comes from the leading "\n" of the last message)
print(
    "Model loaded successfully!",
    f"Model type: {type(best_model)}",
    "\nPipeline components:",
    sep="\n",
)

# List every pipeline stage together with the class that implements it
for step_name, estimator in best_model.named_steps.items():
    print(f"  - {step_name}: {type(estimator).__name__}")

## 2. Load Reference Data & Scent Mapping

Load the original dataset to understand scent labels and create a mapping for interpretable predictions.

In [None]:
# Read the original dataset; it supplies the scent labels used below
data_path = Path('data/initial-smell-dataset.csv')
df = pd.read_csv(data_path)

# One row per distinct (scent_id, scent_name) pair, ordered by scent_id
scent_mapping = (
    df[['scent_id', 'scent_name']]
    .drop_duplicates()
    .sort_values('scent_id')
)

# Forward (ID -> name) and reverse (name -> ID) lookup tables
id_to_name = {
    sid: label
    for sid, label in zip(scent_mapping['scent_id'], scent_mapping['scent_name'])
}
name_to_id = {label: sid for sid, label in id_to_name.items()}

print("Scent ID to Name Mapping:")
for sid, label in id_to_name.items():
    print(f"  {int(sid)}: {label}")

# Columns a new sample must provide before it can be passed to the model
required_columns = [
    'trial_number',
    'phase',
    'time_s',
    'temp_C',
    'humidity_pct',
    'pressure_kPa',
    'gas_bme',
    'srawVoc',
    'srawNox',
    'NO2',
    'ethanol',
    'VOC_multichannel',
    'COandH2',
]
print(f"\nRequired columns for prediction: {required_columns}")

## 3. Batch Prediction Example

Make predictions on a batch of new sensor readings.

In [None]:
# Simulated sensor readings standing in for live IoT data.
# Each tuple below is one sample; its fields follow the order of `columns=`.
new_data = pd.DataFrame(
    [
        (5, 'exposure', 50.0, 23.5, 35.5, 100.95, 35.2, 25000, 14700, 320, 480, 500, 1000),
        (5, 'exposure', 75.0, 23.5, 36.0, 100.94, 32.5, 24900, 14750, 350, 520, 540, 1005),
        (5, 'exposure', 100.0, 23.6, 36.5, 100.93, 31.8, 24800, 14800, 370, 550, 580, 1010),
        (6, 'baseline', 10.0, 22.8, 32.5, 100.92, 110.5, 30500, 14600, 220, 370, 390, 850),
        (6, 'baseline', 20.0, 23.0, 33.0, 100.93, 112.0, 30600, 14580, 225, 375, 395, 855),
        (6, 'baseline', 30.0, 23.2, 33.5, 100.94, 113.5, 30700, 14600, 228, 380, 400, 860),
    ],
    columns=[
        'trial_number',
        'phase',
        'time_s',
        'temp_C',
        'humidity_pct',
        'pressure_kPa',
        'gas_bme',
        'srawVoc',
        'srawNox',
        'NO2',
        'ethanol',
        'VOC_multichannel',
        'COandH2',
    ],
)

print("New sensor data (batch of 6 samples):")
print(new_data)

### 3.1 Make Predictions

In [None]:
# Make predictions
print("Making predictions...\n")

predicted_ids = best_model.predict(new_data)

# Class probabilities are optional: the loaded model may not expose
# predict_proba, and the call itself may fail. Treat both as "no confidence
# available" rather than aborting the cell.
predicted_proba = None
if hasattr(best_model, 'predict_proba'):
    try:
        predicted_proba = best_model.predict_proba(new_data)
    except Exception as exc:
        # Deliberately best-effort, but catch Exception (not a bare except) so
        # KeyboardInterrupt/SystemExit still propagate, and say why the
        # confidence column will be missing.
        print(f"Warning: predict_proba failed ({type(exc).__name__}: {exc}); "
              "continuing without confidence scores.")

# Later cells read this flag to decide whether to report confidence
has_proba = predicted_proba is not None

# Create results dataframe: one row per input sample
results_df = pd.DataFrame({
    'Sample': range(1, len(new_data) + 1),
    'Phase': new_data['phase'],
    'Gas_BME': new_data['gas_bme'],
    'Predicted_ID': predicted_ids,
    'Predicted_Scent': [id_to_name[pid] for pid in predicted_ids]
})

# Add confidence if available: the highest class probability of each sample
if has_proba:
    max_proba = np.max(predicted_proba, axis=1)
    results_df['Confidence'] = max_proba

print("Prediction Results:")
print(results_df.to_string(index=False))

if has_proba:
    print(f"\nAverage confidence: {results_df['Confidence'].mean():.4f}")
    print(f"Min confidence: {results_df['Confidence'].min():.4f}")
    print(f"Max confidence: {results_df['Confidence'].max():.4f}")

## 4. Single Sample Prediction (Real-time)

Example of predicting a single new sensor reading in real time.

In [None]:
# Simulate a single sensor reading from IoT device
single_reading = pd.DataFrame({
    'trial_number': [7],
    'phase': ['exposure'],
    'time_s': [80.0],
    'temp_C': [23.4],
    'humidity_pct': [36.2],
    'pressure_kPa': [100.94],
    'gas_bme': [33.5],
    'srawVoc': [24950],
    'srawNox': [14780],
    'NO2': [340],
    'ethanol': [510],
    'VOC_multichannel': [550],
    'COandH2': [1008]
})

# Predict
predicted_scent_id = best_model.predict(single_reading)[0]
predicted_scent_name = id_to_name[predicted_scent_id]

# The prediction line is printed whether or not probabilities are available
print(f"\nPredicted Scent: {predicted_scent_name} (ID: {int(predicted_scent_id)})")

# Get confidence if available
if has_proba:
    proba = best_model.predict_proba(single_reading)[0]

    # predict_proba columns are ordered like the model's classes_, which is
    # not guaranteed to equal scent_id - 1 (IDs may not start at 1 or may have
    # gaps). Map each class label to its column explicitly.
    model_classes = getattr(best_model, 'classes_', None)
    if model_classes is not None:
        class_to_col = {cls: col for col, cls in enumerate(model_classes)}
    else:
        # No classes_ exposed: fall back to the original "IDs start at 1 and
        # are contiguous" assumption.
        class_to_col = {sid: int(sid - 1) for sid in id_to_name}

    confidence = proba[class_to_col[predicted_scent_id]]
    print(f"Confidence: {confidence:.4f}")
    print(f"\nProbability distribution across all scents:")
    for scent_id in sorted(id_to_name.keys()):
        idx = class_to_col.get(scent_id)
        if idx is None:
            # Scent exists in the reference data but is not a model class,
            # so there is no probability column to report.
            continue
        print(f"  {id_to_name[scent_id]}: {proba[idx]:.4f}")

## 5. Prediction as JSON (API Ready)

Format predictions as JSON for API integration.

In [None]:
# Create a prediction response in JSON format (API ready)
def _to_builtin(value):
    """Convert a NumPy scalar to the equivalent Python scalar; pass others through."""
    return value.item() if isinstance(value, np.generic) else value


def format_prediction_json(sensor_data: pd.DataFrame, predictions: np.ndarray, model, id_to_name: dict):
    """
    Format prediction results as JSON for API responses.

    Args:
        sensor_data: DataFrame with sensor readings; must contain the
            'phase' and 'time_s' columns. Rows are matched to predictions
            by position.
        predictions: Array of predicted scent IDs, one per row of sensor_data
        model: Trained model pipeline. If it provides predict_proba, the
            per-class probabilities and the confidence of the predicted class
            are added to every result.
        id_to_name: Dict mapping scent_id to scent_name

    Returns:
        JSON-serializable dict with keys 'status', 'model',
        'num_predictions' and 'predictions' (one dict per sample).

    Raises:
        KeyError: If a predicted ID is missing from id_to_name, or (when
            probabilities are available) from the model's classes.
    """
    # Probabilities are best-effort: any failure just omits them from the
    # response. Exception (not a bare except) keeps KeyboardInterrupt and
    # SystemExit propagating.
    try:
        probas = model.predict_proba(sensor_data)
    except Exception:
        probas = None
    has_proba = probas is not None

    # predict_proba columns follow model.classes_, not scent_id - 1, so map
    # class label -> column explicitly. Without classes_ fall back to the
    # legacy "IDs start at 1 and are contiguous" assumption.
    class_to_col = {}
    if has_proba:
        model_classes = getattr(model, 'classes_', None)
        if model_classes is not None:
            class_to_col = {cls: col for col, cls in enumerate(model_classes)}
        else:
            class_to_col = {sid: int(sid - 1) for sid in id_to_name}

    results = []
    for idx, pred_id in enumerate(predictions):
        row_values = sensor_data.iloc[idx]
        result = {
            'sample_index': idx,
            'predicted_scent_id': int(pred_id),
            'predicted_scent_name': id_to_name[pred_id],
            # NumPy scalars (e.g. int64 from an integer time column) are not
            # accepted by json.dumps, so convert them to Python scalars.
            'phase': _to_builtin(row_values['phase']),
            'timestamp': _to_builtin(row_values['time_s'])
        }

        if has_proba:
            row_proba = probas[idx]
            proba_dict = {}
            for scent_id, scent_name in id_to_name.items():
                col = class_to_col.get(scent_id)
                # Skip scents that have no probability column in this model
                if col is not None and 0 <= col < len(row_proba):
                    proba_dict[scent_name] = float(row_proba[col])
            result['probabilities'] = proba_dict
            result['confidence'] = float(row_proba[class_to_col[pred_id]])

        results.append(result)

    return {
        'status': 'success',
        'model': 'HistGradientBoosting',
        'num_predictions': len(predictions),
        'predictions': results
    }

# Build the API-style payload for the batch that was predicted above
json_response = format_prediction_json(
    sensor_data=new_data,
    predictions=predicted_ids,
    model=best_model,
    id_to_name=id_to_name,
)

# Show the payload as indented JSON text
print("JSON Response (API format):")
print(json.dumps(json_response, indent=2))

## 6. FastAPI Server Helper

Reference code for serving the model from a FastAPI server for real-time inference.

In [None]:
# Example FastAPI integration code (for reference, not to run).
# It is kept as a string and only printed; the id_to_name literal inside it is
# a placeholder that must be filled in from the dataset.
fastapi_example = '''
from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import pandas as pd

app = FastAPI()

# Load model at startup
model = joblib.load('models/histgradientboosting_model.joblib')
id_to_name = {1: 'apple', 2: 'banana', 3: 'cherry', ...}  # From dataset

class SensorReading(BaseModel):
    trial_number: int
    phase: str
    time_s: float
    temp_C: float
    humidity_pct: float
    pressure_kPa: float
    gas_bme: float
    srawVoc: int
    srawNox: int
    NO2: int
    ethanol: int
    VOC_multichannel: int
    COandH2: int

@app.post('/predict')
def predict(reading: SensorReading):
    # Convert to DataFrame
    df = pd.DataFrame([reading.dict()])
    
    # Make prediction
    pred_id = model.predict(df)[0]
    proba = model.predict_proba(df)[0]

    # predict_proba columns follow model.classes_, not scent_id - 1
    col = list(model.classes_).index(pred_id)

    return {
        'predicted_scent_id': int(pred_id),
        'predicted_scent_name': id_to_name[pred_id],
        'confidence': float(proba[col])
    }
'''

print("FastAPI Integration Example:")
print(fastapi_example)

## 7. Performance Monitoring

Monitor model predictions for drift detection and logging.

In [None]:
import datetime

# Simulate real-time predictions and log them
prediction_log = []

# predict_proba columns are ordered like the model's classes_; build the
# label -> column lookup once instead of assuming column == scent_id - 1.
model_classes = getattr(best_model, 'classes_', None)
if model_classes is not None:
    class_to_col = {cls: col for col, cls in enumerate(model_classes)}
else:
    class_to_col = None  # fall back to the legacy scent_id - 1 indexing below

for i in range(3):
    # Simulate new reading (slice keeps a one-row DataFrame for the pipeline)
    sample = new_data.iloc[i:i+1]

    # Predict
    pred_id = best_model.predict(sample)[0]

    # Get confidence if available
    if has_proba:
        proba = best_model.predict_proba(sample)[0]
        col = class_to_col[pred_id] if class_to_col is not None else int(pred_id - 1)
        confidence = float(proba[col])
    else:
        confidence = None

    # Log entry
    log_entry = {
        'timestamp': datetime.datetime.now().isoformat(),
        'sample_index': i,
        'phase': sample['phase'].values[0],
        'gas_bme': float(sample['gas_bme'].values[0]),
        'predicted_scent_id': int(pred_id),
        'predicted_scent': id_to_name[pred_id],
        'confidence': confidence
    }

    prediction_log.append(log_entry)

    # A conditional cannot live inside an f-string format spec (the text after
    # ':' is passed verbatim to format() and raises ValueError), so build the
    # confidence text first. `is not None` keeps a genuine 0.0 printable.
    confidence_text = f"{confidence:.4f}" if confidence is not None else 'N/A'
    print(f"[{log_entry['timestamp']}] Predicted {log_entry['predicted_scent']} "
          f"(confidence: {confidence_text})")

# Convert log to DataFrame for analysis
log_df = pd.DataFrame(prediction_log)

print("\nPrediction Log (first 3 samples):")
print(log_df.to_string(index=False))

## 8. Best Practices for Deployment

Summary of deployment best practices.

In [None]:
# Human-readable deployment checklist; it is only printed, never parsed.
deployment_guide = """
=================================================================
BEST PRACTICES FOR SMELL DETECTION MODEL DEPLOYMENT
=================================================================

1. MODEL VERSIONING
   - Save model with version tag: histgradientboosting_v1.0.joblib
   - Keep training configuration for reproducibility
   - Log model metrics at save time

2. INPUT VALIDATION
   - Check all required columns are present
   - Validate sensor value ranges (e.g., temp_C between -10 and 60°C)
   - Handle missing values gracefully

3. OUTPUT INTERPRETATION
   - Always return prediction confidence/probability
   - Flag low-confidence predictions (<0.90) for review
   - Include timestamp and sensor readings in response

4. ERROR HANDLING
   - Catch preprocessing errors (e.g., unknown categorical values)
   - Return informative error messages
   - Log all errors for debugging

5. PERFORMANCE MONITORING
   - Log all predictions for drift detection
   - Monitor confidence score distribution
   - Compare with historical performance
   - Retrain quarterly or when accuracy drops >2%

6. LATENCY OPTIMIZATION
   - Batch predictions when possible (<10ms per sample)
   - Cache model in memory (don't reload each request)
   - Use async endpoints for high-frequency predictions

7. DATA FRESHNESS
   - Ensure sensor data is recent (<1 second old)
   - Handle out-of-order predictions
   - Retrain if sensor calibration changes

8. SECURITY
   - Protect model endpoint with authentication
   - Validate input size (max batch: 1000 samples)
   - Rate limit requests per user/device

=================================================================
"""

print(deployment_guide)