# NASA Event Classification

A simple machine learning model to classify NASA EONET (Earth Observatory Natural Event Tracker) events.



In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import plotly.express as px
from dotenv import load_dotenv
import snowflake.connector
from pathlib import Path

load_dotenv()

SNOWFLAKE_CONFIG = {
    "account": os.getenv("SNOWFLAKE_ACCOUNT"),
    "user": os.getenv("SNOWFLAKE_USER"),
    "password": os.getenv("SNOWFLAKE_PASSWORD"),
    "warehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
    "database": os.getenv("SNOWFLAKE_DATABASE"),
    "schema": os.getenv("SNOWFLAKE_SCHEMA")
}

# Create directory for saving models
MODELS_DIR = Path("models")
MODELS_DIR.mkdir(exist_ok=True)


In [2]:
def load_events(days=30):
    """Load EONET events from Snowflake"""
    try:
        conn = snowflake.connector.connect(**SNOWFLAKE_CONFIG)

        query = f"""
            WITH event_data AS (
                SELECT 
                    EVENT_TIME,
                    EVENT_ID,
                    TITLE,
                    CATEGORIES[0]:title::STRING as CATEGORY,
                    GEOMETRIES[0]:coordinates[0]::FLOAT as LONGITUDE,
                    GEOMETRIES[0]:coordinates[1]::FLOAT as LATITUDE
                FROM PUBLIC_STAGING.STG_EONET_EVENTS
                WHERE EVENT_TIME >= DATEADD(days, -{days}, CURRENT_TIMESTAMP())
                  AND ARRAY_SIZE(CATEGORIES) > 0
                  AND ARRAY_SIZE(GEOMETRIES) > 0
            )
            SELECT *
            FROM event_data
            WHERE LONGITUDE IS NOT NULL
              AND LATITUDE IS NOT NULL
        """
        df = pd.read_sql(query, conn)
        print(f"Loaded {len(df)} events")
        
        df['EVENT_TIME'] = pd.to_datetime(df['EVENT_TIME'])

        df = df[
            df['LONGITUDE'].between(-180, 180) & 
            df['LATITUDE'].between(-90, 90)
        ]
        
        df = df.drop_duplicates(subset=['EVENT_ID'])
        
        print(f"After cleaning: {len(df)} valid events")
        print("\nEvent categories found:")
        print(df['CATEGORY'].value_counts())
        
        return df
        
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        raise
    finally:
        if 'conn' in locals():
            conn.close()

df_events = load_events()


  df = pd.read_sql(query, conn)


Loaded 2316 events
After cleaning: 1535 valid events

Event categories found:
CATEGORY
Wildfires           1506
Volcanoes             17
Sea and Lake Ice      10
Severe Storms          2
Name: count, dtype: int64


In [3]:
def prepare_data(df):
    """Create features and prepare data for modeling"""
    df['month'] = df['EVENT_TIME'].dt.month
    df['day_of_year'] = df['EVENT_TIME'].dt.dayofyear

    df['region'] = pd.cut(
        df['LATITUDE'], 
        bins=[-90, -30, 0, 30, 90],
        labels=['South', 'Tropical South', 'Tropical North', 'North']
    )

    le = LabelEncoder()
    df['category_encoded'] = le.fit_transform(df['CATEGORY'])
    df['region_encoded'] = LabelEncoder().fit_transform(df['region'])
    
    # create feature matrix
    X = df[[
        'month', 'day_of_year', 'LONGITUDE', 'LATITUDE', 'region_encoded'
    ]].copy()
    
    y = df['category_encoded']
    

    print("\nData Summary:")
    print(f"Total samples: {len(df)}")
    print("\nCategories:")
    for cat, count in df['CATEGORY'].value_counts().items():
        print(f"  {cat}: {count}")
    
    return X, y, le

X, y, label_encoder = prepare_data(df_events)

# split into train -simple test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


Data Summary:
Total samples: 1535

Categories:
  Wildfires: 1506
  Volcanoes: 17
  Sea and Lake Ice: 10
  Severe Storms: 2


In [None]:
# train and evaluate model
def train_and_evaluate():
    """Train and evaluate a Random Forest classifier"""
    print("Training Random Forest classifier...")
    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        class_weight='balanced',
        random_state=42
    )
    model.fit(X_train, y_train)
    
    # mmake predictions
    print("Making predictions...")
    y_pred = model.predict(X_test)
    
    print("\nModel Results:")
    print("=============")
    print("\nClassification Report:")
    print(classification_report(
        y_test, y_pred,
        target_names=label_encoder.classes_,
        zero_division=0
    ))
    

    importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nFeature Importance:")
    for _, row in importance.iterrows():
        print(f"  {row['feature']}: {row['importance']:.3f}")
    
    # save model using joblib
    try:
        import joblib
        model_path = MODELS_DIR / 'event_classifier.joblib'
        joblib.dump(model, model_path)
        print(f"\nModel saved to {model_path}")
    except Exception as e:
        print(f"Warning: Could not save model: {str(e)}")
    
    return model, y_pred


model, predictions = train_and_evaluate()


Training Random Forest classifier...
Making predictions...

Model Results:

Classification Report:
                  precision    recall  f1-score   support

Sea and Lake Ice       1.00      1.00      1.00         2
   Severe Storms       0.00      0.00      0.00         0
       Volcanoes       1.00      0.75      0.86         4
       Wildfires       1.00      1.00      1.00       301

        accuracy                           1.00       307
       macro avg       0.75      0.69      0.71       307
    weighted avg       1.00      1.00      1.00       307


Feature Importance:
  LATITUDE: 0.482
  LONGITUDE: 0.371
  region_encoded: 0.146
  month: 0.000
  day_of_year: 0.000

Model saved to models\event_classifier.joblib


In [None]:
def plot_results():
    # 1- Event distribution
    fig1 = px.bar(
        df_events['CATEGORY'].value_counts(),
        title='Event Categories',
        labels={'index': 'Category', 'value': 'Count'}
    )
    fig1.show()
    
    # 2- geographic distribution
    fig2 = px.scatter(
        df_events,
        x='LONGITUDE',
        y='LATITUDE',
        color='CATEGORY',
        title='Event Locations',
        hover_data=['TITLE'],
        width=800,
        height=500
    )
    fig2.update_layout(
        showlegend=True,
        legend_title="Event Type"
    )
    fig2.show()
    
    # 3- confusion matrix
    # Convert numeric predictions back to categories
    actual_categories = label_encoder.inverse_transform(y_test)
    predicted_categories = label_encoder.inverse_transform(predictions)
    
    conf_matrix = pd.crosstab(
        actual_categories,
        predicted_categories,
        rownames=['Actual'],
        colnames=['Predicted']
    )
    
    # make heatmap
    fig3 = px.imshow(
        conf_matrix,
        title="Confusion Matrix",
        labels=dict(x="Predicted Category", y="Actual Category", color="Count"),
        aspect='auto',
        color_continuous_scale='RdBu_r'
    )
    
    fig3.update_layout(
        width=800,
        height=600,
        xaxis_tickangle=-45
    )
    fig3.show()

    # accuracy score
    accuracy = (actual_categories == predicted_categories).mean()
    print(f"\nOverall Accuracy: {accuracy:.2%}")

plot_results()



Overall Accuracy: 99.67%


In [None]:
# save predictions and metadata to Snowflake
def save_predictions_and_metadata():
    """Save model predictions and metadata to Snowflake"""
    conn = None
    try:
        # make predictions on all data
        all_predictions = model.predict(X)
        all_probabilities = model.predict_proba(X)

        results_df = pd.DataFrame({
            'event_id': df_events['EVENT_ID'],
            'event_time': df_events['EVENT_TIME'].dt.strftime('%Y-%m-%d %H:%M:%S'), 
            'actual_category': df_events['CATEGORY'],
            'predicted_category': label_encoder.inverse_transform(all_predictions),
            'confidence': all_probabilities.max(axis=1),
            'model_version': '1.0.0',
            'features_used': ','.join(X.columns)
        })

        metrics = {
            'accuracy': (predictions == y_test).mean(),
            'n_estimators': model.n_estimators,
            'max_depth': model.max_depth,
            'feature_count': len(X.columns),
            'training_samples': len(X_train),
            'test_samples': len(X_test)
        }
        
        print("\nModel Metrics:")
        for key, value in metrics.items():
            print(f"{key}: {value}")
        
         #save to snowflake
        conn = snowflake.connector.connect(**SNOWFLAKE_CONFIG)
        cur = conn.cursor()

        cur.execute("""
            CREATE TABLE IF NOT EXISTS PUBLIC.EVENT_PREDICTIONS (
                EVENT_ID VARCHAR,
                EVENT_TIME TIMESTAMP,
                ACTUAL_CATEGORY VARCHAR,
                PREDICTED_CATEGORY VARCHAR,
                CONFIDENCE FLOAT,
                MODEL_VERSION VARCHAR,
                FEATURES_USED VARCHAR,
                PREDICTION_TIMESTAMP TIMESTAMP DEFAULT CURRENT_TIMESTAMP()
            )
        """)
        
        cur.execute("""
            CREATE TABLE IF NOT EXISTS PUBLIC.MODEL_METADATA (
                MODEL_VERSION VARCHAR,
                ACCURACY FLOAT,
                N_ESTIMATORS INTEGER,
                MAX_DEPTH INTEGER,
                FEATURE_COUNT INTEGER,
                TRAINING_SAMPLES INTEGER,
                TEST_SAMPLES INTEGER,
                TRAINING_TIMESTAMP TIMESTAMP DEFAULT CURRENT_TIMESTAMP()
            )
        """)

        cur.execute("""
            INSERT INTO MODEL_METADATA (
                MODEL_VERSION, ACCURACY, N_ESTIMATORS, MAX_DEPTH,
                FEATURE_COUNT, TRAINING_SAMPLES, TEST_SAMPLES
            ) VALUES (%s, %s, %s, %s, %s, %s, %s)
        """, (
            '1.0.0',
            float(metrics['accuracy']),
            int(metrics['n_estimators']),
            int(metrics['max_depth']),
            int(metrics['feature_count']),
            int(metrics['training_samples']),
            int(metrics['test_samples'])
        ))
        
        batch_size = 1000
        for i in range(0, len(results_df), batch_size):
            batch = results_df.iloc[i:i + batch_size]
            values = batch[['event_id', 'event_time', 'actual_category',
                          'predicted_category', 'confidence', 'model_version',
                          'features_used']].values.tolist()
            
            cur.executemany("""
                INSERT INTO EVENT_PREDICTIONS (
                    EVENT_ID, EVENT_TIME, ACTUAL_CATEGORY,
                    PREDICTED_CATEGORY, CONFIDENCE, MODEL_VERSION,
                    FEATURES_USED
                ) VALUES (%s, %s, %s, %s, %s, %s, %s)
            """, values)
            
            print(f"Saved {len(batch)} predictions")
        
        conn.commit()
        print(f"\nSaved {len(results_df)} predictions to Snowflake")
        
    except Exception as e:
        if conn:
            conn.rollback()
        print(f"Error saving to Snowflake: {str(e)}")
        raise
    finally:
        if conn:
            conn.close()

try:
    save_predictions_and_metadata()
except Exception as e:
    print(f"Failed to save results: {str(e)}")
    raise


Model Metrics:
accuracy: 0.996742671009772
n_estimators: 100
max_depth: 10
feature_count: 5
training_samples: 1228
test_samples: 307
Saved 1000 predictions
Saved 535 predictions

Saved 1535 predictions to Snowflake
