# Wildfire Model

In [49]:
import os
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import mlflow
import logging
import pickle
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [50]:
# Additional imports for MLflow tracking: model signatures and per-metric scoring helpers
import mlflow
from mlflow.models.signature import infer_signature
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create the local MLflow store directory and its .trash folder before MLflow is pointed at it
os.makedirs("./mlflow", exist_ok=True)
os.makedirs("./mlflow/.trash", exist_ok=True)

# Use a file-based tracking store under ./mlflow (relative to the current working directory)
mlflow.set_tracking_uri("file:./mlflow")

# Select the experiment that subsequent runs are recorded under
# (the recorded cell output shows MLflow creating it when it does not exist yet)
mlflow.set_experiment("wildfire-prediction-test")

# Training helper for pre-split data with MLflow tracking.
# NOTE: a later cell defines another train_model(df, ...) that replaces this one once it runs.
def train_model(X_train, y_train, X_test, y_test):
    """Fit a random forest on pre-split data and record the run in MLflow.

    NOTE: a later cell in this notebook defines another ``train_model`` that
    takes a single DataFrame; once that cell has run, it replaces this function.

    Args:
        X_train: Training features as a pandas DataFrame (its ``columns`` are
            used to label the feature importances).
        y_train: Training labels.
        X_test: Held-out features used for the logged metrics.
        y_test: Held-out labels.

    Returns:
        tuple: ``(model, feature_importance)`` where ``feature_importance`` is a
        DataFrame indexed by feature name with a single ``importance`` column,
        sorted in descending order.
    """
    import shutil  # local import: only needed to clear a stale model directory

    # Create and train the model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predictions on the held-out split drive every metric below
    y_pred = model.predict(X_test)

    # Feature importance, most important first
    feature_importance = pd.DataFrame(
        model.feature_importances_,
        index=X_train.columns,
        columns=['importance']
    ).sort_values('importance', ascending=False)

    # Persist the importances locally; this also creates the parent "models" directory
    importance_dir = "models/feature_importance"
    importance_csv = f"{importance_dir}/feature_importance.csv"
    os.makedirs(importance_dir, exist_ok=True)
    feature_importance.to_csv(importance_csv)

    with mlflow.start_run():
        # Log the parameters actually held by the fitted estimator
        mlflow.log_param("n_estimators", model.n_estimators)
        mlflow.log_param("random_state", model.random_state)

        # Weighted averages so class imbalance is reflected in the scores
        mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
        mlflow.log_metric("precision", precision_score(y_test, y_pred, average='weighted'))
        mlflow.log_metric("recall", recall_score(y_test, y_pred, average='weighted'))
        mlflow.log_metric("f1", f1_score(y_test, y_pred, average='weighted'))

        mlflow.log_artifact(importance_csv)

        # Log model with a signature inferred from the training inputs and predictions
        signature = infer_signature(X_train, y_pred)
        mlflow.sklearn.log_model(model, "model", signature=signature)

        # Save model locally. save_model refuses to write into an existing,
        # non-empty directory, so remove the copy left by a previous call first.
        local_model_dir = "models/wildfire_model"
        if os.path.isdir(local_model_dir):
            shutil.rmtree(local_model_dir)
        mlflow.sklearn.save_model(model, local_model_dir)

    return model, feature_importance

# Your prediction function can remain the same

2025/05/20 07:45:12 INFO mlflow.tracking.fluent: Experiment with name 'wildfire-prediction-test' does not exist. Creating a new experiment.


In [51]:
# Configure logging
# Emit INFO and above with timestamp, logger name and level in front of each message
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger shared by the functions defined in the following cells
logger = logging.getLogger(__name__)

In [52]:
def load_data(data_path):
    """Load the merged dataset from a JSON file.

    Args:
        data_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document, exactly as ``json.load`` returns it.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    logger.info(f"Loading data from {data_path}")
    # Read as UTF-8 explicitly so non-ASCII content does not depend on the
    # platform's locale encoding.
    with open(data_path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [53]:
def preprocess_data(data):
    """Flatten raw records into a numeric training DataFrame.

    Each record is expected to carry an ``event`` mapping plus optional
    ``weather`` and ``vegetation`` mappings. Records whose weather or vegetation
    entry is missing or null are skipped.

    Args:
        data: Iterable of record dicts.

    Returns:
        pandas.DataFrame: One row per kept record with the event, weather and
        vegetation feature columns followed by the ``high_severity`` label
        (1 when confidence > 80 and frp > 20, else 0). Missing feature values
        are replaced by the column median; a feature column with no values at
        all is filled with 0.0.

    Raises:
        KeyError: If a kept record lacks ``event`` or one of the event fields.
        ValueError: If an event field cannot be converted to ``float``.
    """
    logger.info("Preprocessing data")

    flattened_records = []

    for record in data:
        # Skip records with missing weather or vegetation data. ``.get`` also
        # covers records where the key is absent, not only explicit nulls.
        weather = record.get('weather')
        vegetation = record.get('vegetation')
        if weather is None or vegetation is None:
            continue

        event = record['event']
        # Parsed once: both values are features and also define the label.
        confidence = float(event['confidence'])
        frp = float(event['frp'])

        flattened_records.append({
            # Event features
            'lat': float(event['lat']),
            'lon': float(event['lon']),
            'brightness': float(event['brightness']),
            'scan': float(event['scan']),
            'track': float(event['track']),
            'confidence': confidence,
            'bright_t31': float(event['bright_t31']),
            'frp': frp,
            'daynight': 1 if event['daynight'] == "D" else 0,  # Day=1, Night=0

            # Weather features (None when the measurement is absent)
            'tavg': weather.get('tavg'),
            'tmin': weather.get('tmin'),
            'tmax': weather.get('tmax'),
            'prcp': weather.get('prcp'),
            'wspd': weather.get('wspd'),
            'pres': weather.get('pres'),

            # Vegetation features
            'ndvi': vegetation.get('ndvi'),
            'evi': vegetation.get('evi'),

            # Target: high severity fire (confidence > 80 and frp > 20).
            # NOTE(review): the label is a deterministic function of the
            # 'confidence' and 'frp' feature columns above, so a model trained
            # on all non-target columns can reproduce it from those two alone.
            'high_severity': 1 if (confidence > 80 and frp > 20) else 0,
        })

    df = pd.DataFrame(flattened_records)

    # Impute missing feature values; the label column is never touched.
    for col in df.columns:
        if col == 'high_severity':
            continue
        column = df[col]
        if column.isnull().all():
            # No value to take a median from (such a column may also have
            # object dtype); use a constant so no None/NaN is left behind.
            df[col] = 0.0
        elif column.dtype in [np.float64, np.int64]:
            df[col] = column.fillna(column.median())

    logger.info(f"Preprocessed data shape: {df.shape}")
    return df


In [54]:
def train_model(df, test_size=0.2, random_state=42):
    """Train a random forest on the preprocessed data with MLflow tracking.

    The frame is split into train/test parts, a ``RandomForestClassifier`` is
    fitted, and parameters, metrics, feature importances (CSV and PNG) and the
    model itself are logged to an MLflow run. The model is additionally pickled
    under ``../models`` (timestamped copy plus ``wildfire_model_latest.pkl``).

    Args:
        df: DataFrame containing a ``high_severity`` label column; every other
            column is used as a feature.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed used for both the split and the forest.

    Returns:
        tuple: ``(model, accuracy)`` with the fitted classifier and its accuracy
        on the held-out split.
    """
    from datetime import datetime  # local import: not part of the notebook's import cell

    logger.info("Training model")

    # Define features and target
    features = [col for col in df.columns if col != 'high_severity']
    X = df[features]
    y = df['high_severity']

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    n_estimators = 100
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=6,
        max_features=3,
        random_state=random_state,
        verbose=0  # Keep sklearn's verbose off as we're using tqdm
    )

    with mlflow.start_run(run_name="random-forest-wildfire") as run:
        # Log parameters, read back from the estimator so the logged values
        # cannot drift from the ones actually used
        mlflow.log_param("test_size", test_size)
        mlflow.log_param("random_state", random_state)
        mlflow.log_param("n_estimators", model.n_estimators)
        mlflow.log_param("max_depth", model.max_depth)
        mlflow.log_param("max_features", model.max_features)
        mlflow.log_param("features", features)

        # RandomForestClassifier.fit takes no progress callback, so the bar can
        # only be completed once fitting has finished.
        print("Training Random Forest model...")
        with tqdm(total=n_estimators, desc="Training Progress") as pbar:
            model.fit(X_train, y_train)
            pbar.update(n_estimators)

        # Evaluate in roughly ten batches so the progress bar is meaningful
        print("Evaluating model...")
        n_test = len(X_test)
        batch_size = max(1, n_test // 10)
        batch_predictions = []
        with tqdm(total=n_test, desc="Evaluation Progress") as pbar:
            for start in range(0, n_test, batch_size):
                batch = X_test.iloc[start:start + batch_size]
                batch_predictions.append(model.predict(batch))
                pbar.update(len(batch))
        y_pred = np.concatenate(batch_predictions)

        # Overall accuracy plus every per-class / averaged metric of the report
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        report_dict = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in report_dict.items():
            # Scalar entries (e.g. the top-level accuracy) are skipped here
            if isinstance(metrics, dict):
                for metric_name, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric_name}", value)

        # Feature importance, most important first
        importance_df = pd.DataFrame({
            'Feature': features,
            'Importance': model.feature_importances_
        }).sort_values('Importance', ascending=False)

        artifacts_dir = "../artifacts/feature_importance"
        os.makedirs(artifacts_dir, exist_ok=True)

        # Timestamped file names keep the artifacts of separate runs apart
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        importance_path = f"{artifacts_dir}/feature_importance_{timestamp}.csv"
        importance_df.to_csv(importance_path, index=False)
        mlflow.log_artifact(importance_path)

        # Plot the ten most important features; best-effort when matplotlib is absent
        plt_path = f"{artifacts_dir}/feature_importance_{timestamp}.png"
        try:
            import matplotlib.pyplot as plt
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.barh(importance_df['Feature'][:10], importance_df['Importance'][:10])
            ax.set_xlabel('Importance')
            ax.set_ylabel('Feature')
            ax.set_title('Top 10 Feature Importance')
            fig.tight_layout()
            fig.savefig(plt_path)
            plt.close(fig)

            mlflow.log_artifact(plt_path)
        except ImportError:
            logger.warning("Matplotlib not installed. Skipping feature importance visualization.")

        # Create model signature for proper schema tracking
        signature = infer_signature(X_train, y_pred)

        # A one-row DataFrame keeps each column's dtype; converting a single row
        # to a Series/dict would turn the integer column into floats and no
        # longer match the signature.
        mlflow.sklearn.log_model(
            model,
            "random_forest_model",
            signature=signature,
            input_example=X_train.iloc[:1]
        )

        # Also save model to disk outside of MLflow: a timestamped copy and a
        # fixed-name "latest" copy for easy access
        output_dir = "../models"
        os.makedirs(output_dir, exist_ok=True)
        timestamped_path = f"{output_dir}/wildfire_model_{timestamp}.pkl"
        latest_path = f"{output_dir}/wildfire_model_latest.pkl"
        for model_path in (timestamped_path, latest_path):
            with open(model_path, "wb") as f:
                pickle.dump(model, f)

        logger.info(f"Model saved to {output_dir}/wildfire_model_{timestamp}.pkl")
        logger.info(f"Model also saved as {output_dir}/wildfire_model_latest.pkl")
        logger.info(f"Model logged to MLflow with run_id: {run.info.run_id}")

    return model, accuracy

In [55]:
def main():
    """Run the training pipeline end to end: load, preprocess, train, report.

    Returns:
        tuple: ``(model, accuracy)`` as returned by ``train_model``.
    """
    logger.info("Starting wildfire prediction model training")

    # Each stage feeds the next: raw records -> feature frame -> fitted model
    raw_records = load_data("../data/merged_complete.json")
    feature_frame = preprocess_data(raw_records)
    model, accuracy = train_model(feature_frame)

    logger.info(f"Wildfire prediction model training completed with accuracy: {accuracy:.4f}")
    logger.info(f"View experiments at: http://localhost:5000 (after starting mlflow ui)")

    return model, accuracy


# Train immediately when this code is executed as the main module
if __name__ == "__main__":
    main()

Training Random Forest model...


Training Progress:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluating model...


Evaluation Progress:   0%|          | 0/48 [00:00<?, ?it/s]



# Predictions

In [56]:
import os
import pickle
import json
import pandas as pd
import numpy as np
import logging


In [57]:
# Configure logging
# Same logging setup as the training section, repeated so this section can run on its own
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
logger = logging.getLogger(__name__)

In [58]:

def load_model(model_path):
    """Read a pickled model back from disk.

    Args:
        model_path: Path of the pickle file to open.

    Returns:
        The object stored in the pickle file.
    """
    logger.info(f"Loading model from {model_path}")
    # NOTE(review): unpickling can execute arbitrary code, so only load files
    # that were produced by a trusted training run.
    with open(model_path, mode='rb') as model_file:
        return pickle.load(model_file)

In [59]:
def prepare_input(data):
    """Flatten one or more raw records into the feature DataFrame used for prediction.

    Args:
        data: A single record dict or a list of record dicts. Each record must
            have an ``event`` mapping; ``weather`` and ``vegetation`` may be
            missing or null.

    Returns:
        pandas.DataFrame: One row per record with the event, weather and
        vegetation feature columns. Absent weather/vegetation keys take fixed
        default values. Null values inside a column are replaced by the median
        of that column within this batch; if a column has no value at all, the
        fixed default is used (0.0 for event columns).

    Raises:
        KeyError: If a record lacks ``event`` or one of the event fields.
        ValueError: If an event field cannot be converted to ``float``.
    """
    # If data is a single record, convert to list
    if isinstance(data, dict):
        data = [data]

    # Defaults for measurements that are unavailable
    fallback_values = {
        'tavg': 25.0, 'tmin': 20.0, 'tmax': 30.0,
        'prcp': 0.0, 'wspd': 10.0, 'pres': 1010.0,
        'ndvi': 0.5, 'evi': 0.4,
    }
    weather_keys = ('tavg', 'tmin', 'tmax', 'prcp', 'wspd', 'pres')
    vegetation_keys = ('ndvi', 'evi')

    flattened_records = []

    for record in data:
        # Handle cases where weather or vegetation is None
        weather = record.get('weather') or {}
        vegetation = record.get('vegetation') or {}
        event = record['event']

        flat_record = {
            # Event features
            'lat': float(event['lat']),
            'lon': float(event['lon']),
            'brightness': float(event['brightness']),
            'scan': float(event['scan']),
            'track': float(event['track']),
            'confidence': float(event['confidence']),
            'bright_t31': float(event['bright_t31']),
            'frp': float(event['frp']),
            'daynight': 1 if event['daynight'] == "D" else 0,  # Day=1, Night=0
        }
        # Weather and vegetation features (default when the key is absent; a
        # key that is present with a null value is handled after the frame is built)
        for key in weather_keys:
            flat_record[key] = weather.get(key, fallback_values[key])
        for key in vegetation_keys:
            flat_record[key] = vegetation.get(key, fallback_values[key])

        flattened_records.append(flat_record)

    df = pd.DataFrame(flattened_records)

    for col in df.columns:
        column = df[col]
        if column.isnull().all():
            # No usable value in the whole column, e.g. a single record whose
            # measurement is null. Such a column can have object dtype, which
            # the numeric branch below would skip and leave as None.
            df[col] = fallback_values.get(col, 0.0)
        elif column.dtype in [np.float64, np.int64]:
            # NOTE(review): the median is taken over this batch, so the value
            # filled in depends on which records are predicted together.
            df[col] = column.fillna(column.median())

    return df

In [60]:
def predict(model, input_data):
    """Make predictions using the trained model.

    Args:
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``.
        input_data: A single record dict or a list of record dicts; each needs
            an ``event`` mapping with ``lat``, ``lon``, ``date``, ``frp`` and
            ``confidence`` plus whatever ``prepare_input`` requires.

    Returns:
        list[dict]: One result per input record with the keys ``lat``, ``lon``,
        ``date``, ``high_severity`` (bool), ``probability`` (float, probability
        of class 1), ``frp`` and ``confidence``. Empty input yields ``[]``.
    """
    logger.info("Making predictions")

    # Accept a single record as well as a list; iterating a bare dict in the
    # result loop below would walk its keys instead of records.
    records = [input_data] if isinstance(input_data, dict) else list(input_data)
    if not records:
        return []

    # Prepare input data
    df = prepare_input(records)

    # Make predictions
    predictions = model.predict(df)
    class_probabilities = model.predict_proba(df)

    # Pick the column belonging to class 1 by label. A model fitted on data
    # without any positive example has no such column.
    classes = list(model.classes_)
    if 1 in classes:
        probabilities = class_probabilities[:, classes.index(1)]
    else:
        probabilities = [0.0] * len(records)

    # Create results
    results = []
    for i, record in enumerate(records):
        event = record['event']
        results.append({
            'lat': event['lat'],
            'lon': event['lon'],
            'date': event['date'],
            'high_severity': bool(predictions[i]),
            'probability': float(probabilities[i]),
            'frp': float(event['frp']),
            'confidence': float(event['confidence'])
        })

    return results


In [61]:
# Notebook-friendly prediction entry point:
# paths are passed as function arguments, so no command line arguments are required

def run_predictions(input_path="../data/merged_complete.json",
                    output_path="../predictions/wildfire_predictions.json",
                    model_path="../models/wildfire_model_latest.pkl",
                    max_records=10):
    """Run predictions from within the notebook environment.

    Loads the pickled model and the JSON input, predicts on the leading records,
    writes the results as JSON and prints a short sample.

    Args:
        input_path: JSON file holding a list of records.
        output_path: JSON file the results are written to; its directory is
            created when needed.
        model_path: Pickle file of the trained model.
        max_records: Number of leading records to predict on. ``None`` means
            all records. Defaults to 10.

    Returns:
        list[dict]: The prediction results as returned by ``predict``.

    Raises:
        Exception: Any error raised while loading, predicting or saving is
            logged and re-raised unchanged.
    """
    logger.info(f"Starting prediction process with input: {input_path}")

    # Make sure output directory exists. A bare file name has no directory
    # part, and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    try:
        # Load model
        model = load_model(model_path)

        # Load input data
        with open(input_path, 'r', encoding='utf-8') as f:
            input_data = json.load(f)

        # Take a subset for testing if the input is large
        test_data = input_data if max_records is None else input_data[:max_records]

        # Make predictions (the bar completes in one step because predict()
        # handles the whole batch at once)
        print("Making predictions...")
        with tqdm(total=len(test_data), desc="Prediction Progress") as pbar:
            results = predict(model, test_data)
            pbar.update(len(test_data))

        # Save results
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        logger.info(f"Predictions saved to {output_path}")

        # Display a sample of the results
        print("\nSample prediction results:")
        for i, result in enumerate(results[:5]):  # Show first 5 results
            print(f"Record {i+1}: High Severity = {result['high_severity']} (Probability: {result['probability']:.2f})")

        return results

    except Exception as e:
        # logger.exception keeps the traceback alongside the message
        logger.exception(f"Error during prediction: {str(e)}")
        raise

# Run the prediction step with its default paths (the default model_path is the
# "latest" pickle) and keep the returned results in `predictions`
predictions = run_predictions()

Making predictions...


Prediction Progress:   0%|          | 0/10 [00:00<?, ?it/s]


Sample prediction results:
Record 1: High Severity = False (Probability: 0.17)
Record 2: High Severity = False (Probability: 0.00)
Record 3: High Severity = False (Probability: 0.00)
Record 4: High Severity = False (Probability: 0.00)
Record 5: High Severity = False (Probability: 0.04)


# Run

In [62]:
# The functions defined in the cells above are called directly; nothing is imported from a module

def run_pipeline():
    """Train a model and then predict on a small sample, in one go.

    Returns:
        tuple: ``(model, results)`` with the fitted model and the prediction
        results for the first ten raw records.
    """
    logger.info("Starting wildfire prediction pipeline")

    # Training half: raw JSON records -> feature frame -> fitted model
    data = load_data("../data/merged_complete.json")
    df = preprocess_data(data)
    model, accuracy = train_model(df)
    logger.info(f"Model trained with accuracy: {accuracy:.4f}")

    # Prediction half: demonstrate on the first 10 raw records only
    results = predict(model, data[:10])

    # Show the first five predictions
    print("\nSample predictions:")
    for position, result in enumerate(results[:5], start=1):
        print(f"Record {position}: High Severity = {result['high_severity']} (Probability: {result['probability']:.2f})")

    logger.info("Wildfire prediction pipeline completed")

    return model, results

# Run training followed by sample predictions; keeps the fitted model and the
# prediction results in `model` and `results`
model, results = run_pipeline()

Training Random Forest model...


Training Progress:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluating model...


Evaluation Progress:   0%|          | 0/48 [00:00<?, ?it/s]




Sample predictions:
Record 1: High Severity = False (Probability: 0.17)
Record 2: High Severity = False (Probability: 0.00)
Record 3: High Severity = False (Probability: 0.00)
Record 4: High Severity = False (Probability: 0.00)
Record 5: High Severity = False (Probability: 0.04)
