In [3]:
import pandas as pd
import numpy as np

# Read the downloaded CSV export into the raw working frame
df = pd.read_csv(filepath_or_buffer='Data.csv')

def engineer_eonet_features(data):
    """Derive location and seasonality features from raw event rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Geometry_Coordinates_1' (a whitespace-separated
        'Longitude Latitude' string) and 'Date' (anything accepted by
        ``pd.to_datetime``).

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns 'Longitude', 'Latitude', 'Month',
        'Month_Sin', 'Month_Cos', 'Hemisphere' and 'Equator_Dist'. Rows whose
        longitude or latitude is missing or not numeric are removed. The
        input frame is left untouched.

    Raises
    ------
    KeyError
        If 'Geometry_Coordinates_1' or 'Date' is absent from ``data``.
    """
    # Work on a copy so the caller's raw frame is not silently modified;
    # this keeps the notebook cell idempotent on re-run.
    features = data.copy()

    # 1. Expand Geometry_Coordinates_1 into Lon/Lat
    # The format is 'Longitude Latitude'. reindex guarantees both columns
    # exist even when no row splits into two tokens.
    coords = (
        features['Geometry_Coordinates_1']
        .str.split(expand=True)
        .reindex(columns=[0, 1])
    )
    # Coerce malformed tokens to NaN so the final dropna removes those rows
    # instead of the whole call failing on one bad string.
    features['Longitude'] = pd.to_numeric(coords[0], errors='coerce')
    features['Latitude'] = pd.to_numeric(coords[1], errors='coerce')

    # 2. Extract Time-Based Features
    features['Date'] = pd.to_datetime(features['Date'])
    features['Month'] = features['Date'].dt.month

    # 3. Cyclic Encoding for Months (period of 12, so December sits next to January)
    month_angle = 2 * np.pi * features['Month'] / 12
    features['Month_Sin'] = np.sin(month_angle)
    features['Month_Cos'] = np.cos(month_angle)

    # 4. Hemisphere Logic: 1 for latitude >= 0, otherwise 0
    features['Hemisphere'] = np.where(features['Latitude'] >= 0, 1, 0)

    # 5. Distance from Equator (Climate driver): absolute latitude
    features['Equator_Dist'] = features['Latitude'].abs()

    return features.dropna(subset=['Longitude', 'Latitude'])

# Build the engineered-feature frame used by the modelling cells
df_clean = engineer_eonet_features(data=df)

Cross-Validation: Repeated Stratified K-Fold

In [4]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Define features and target
FEATURE_COLS = ['Longitude', 'Latitude', 'Month_Sin', 'Month_Cos', 'Equator_Dist', 'Hemisphere']
TARGET_COL = 'Category_title'

# engineer_eonet_features only drops rows with missing coordinates, so rows with
# a missing date (NaN month encodings) or a missing label may still be present.
# Remove them explicitly so every CV fold is fitted and scored on complete rows.
model_rows = df_clean.dropna(subset=FEATURE_COLS + [TARGET_COL])
X = model_rows[FEATURE_COLS]
y = model_rows[TARGET_COL]

# Create Pipeline: Scale -> Model
# Random forests split on per-feature thresholds, so standardising the inputs
# does not help them split. The scaler is retained only so the pipeline keeps
# working unchanged if 'clf' is swapped for a scale-sensitive estimator.
cv_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Define Repeated CV Strategy (5 Folds, 3 Repeats)
# NOTE(review): if the CSV holds several rows per event (presumably repeated
# observations at nearby coordinates -- TODO confirm), stratified folds can place
# near-duplicate rows in both train and test and inflate accuracy. If an event
# identifier column exists, consider a group-aware splitter such as
# StratifiedGroupKFold.
rkf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

# Execute CV (5 folds x 3 repeats = 15 accuracy scores)
scores = cross_val_score(cv_pipeline, X, y, cv=rkf, scoring='accuracy')

print(f"Mean CV Accuracy: {scores.mean():.2%}")
# The "+/-" figure is two standard deviations of the fold scores: a spread
# heuristic, not a formal confidence interval, because repeated-CV folds share
# training data and their scores are correlated.
print(f"Confidence Interval: +/- {scores.std() * 2:.2%}")



Mean CV Accuracy: 99.46%
Confidence Interval: +/- 0.18%
