# Weather Data Processing - Classification Example

This notebook demonstrates how to use the WeatherDataProcessor for classification tasks (predicting rain occurrence 7 days ahead).


In [None]:
# Import the local development version
import sys
import os
sys.path.insert(0, os.path.abspath('../src'))

# Import directly from the module to avoid conflicts with installed package
from brayam_pineda_ml.weather_data_processor import WeatherDataProcessor
import pandas as pd
import numpy as np


## 1. Initialize the Weather Data Processor


In [None]:
# Location settings for Sydney, gathered in one place so they are easy to swap
# for another city.
sydney_location = {
    "lat": -33.8678,                 # Sydney latitude
    "lon": 151.2073,                 # Sydney longitude
    "timezone": "Australia/Sydney",
}

# Build the processor from the location settings above
processor = WeatherDataProcessor(**sydney_location)


## 2. Run the Processing Pipeline


In [None]:
# Arguments for the end-to-end pipeline run (classification task)
pipeline_args = dict(
    start_date="2016-01-01",
    end_date="2024-12-31",
    task_type="classification",
    target_name="target_rain",
)
data = processor.process_full_pipeline(**pipeline_args)

# Report the shape of each feature split returned by the pipeline
for split_label, features_key in (
    ("Training", "X_train"),
    ("Validation", "X_val"),
    ("Test", "X_test"),
):
    print(f"{split_label} set shape: {data[features_key].shape}")

# Summarise the feature count and the name of the prediction target
print(f"Number of features: {len(data['feature_names'])}")
print(f"Target variable: {data['target_name']}")


## 3. Examine the Processed Data


In [None]:
# Look at the target variable distribution
print("Target variable distribution:")
print(data['y_train'].value_counts())

# "\n" (single backslash) emits a real blank line before the summary; the
# previous "\\n" printed the two characters backslash + n literally.
# NOTE(review): the mean equals the positive-class share only if the target is
# encoded as 0/1 — confirm against the processor's output.
print(f"\nClass balance: {data['y_train'].mean():.3f} (proportion of positive class)")


## 4. Train a Model (Example with XGBoost)


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, accuracy_score

# Hyperparameters for the XGBoost classifier; random_state pins the seed
xgb_params = {
    "n_estimators": 100,
    "max_depth": 6,
    "learning_rate": 0.1,
    "random_state": 42,
}

# Fit the classifier on the training split only
model = XGBClassifier(**xgb_params)
model.fit(data['X_train'], data['y_train'])


def _predict_labels_and_scores(features):
    """Return (predicted labels, column 1 of predict_proba) for `features`."""
    labels = model.predict(features)
    scores = model.predict_proba(features)[:, 1]
    return labels, scores


# Predictions for every split, kept under the names used further down
y_train_pred, y_train_proba = _predict_labels_and_scores(data['X_train'])
y_val_pred, y_val_proba = _predict_labels_and_scores(data['X_val'])
y_test_pred, y_test_proba = _predict_labels_and_scores(data['X_test'])

# Evaluate performance
def evaluate_classification(y_true, y_pred, y_proba, set_name):
    """Print a one-line metric summary for one data split.

    Args:
        y_true: Ground-truth labels for the split.
        y_pred: Predicted labels; used for F1 and accuracy.
        y_proba: Predicted scores; used for ROC AUC and average precision
            (in this notebook, column 1 of ``predict_proba``).
        set_name: Label printed at the start of the line (e.g. "Training").

    Returns:
        None. The summary is written to stdout only.
    """
    # Score-based metrics first, then label-based ones; the insertion order of
    # this dict is also the order in which the metrics are printed.
    metrics = {
        "AUC": roc_auc_score(y_true, y_proba),
        "PR-AUC": average_precision_score(y_true, y_proba),
        "F1": f1_score(y_true, y_pred),
        "Accuracy": accuracy_score(y_true, y_pred),
    }
    summary = ", ".join(f"{label}: {value:.3f}" for label, value in metrics.items())
    print(f"{set_name} - {summary}")

# Report the same metrics for each split, in train -> validation -> test order
for split_name, split_truth, split_pred, split_proba in (
    ("Training", data['y_train'], y_train_pred, y_train_proba),
    ("Validation", data['y_val'], y_val_pred, y_val_proba),
    ("Test", data['y_test'], y_test_pred, y_test_proba),
):
    evaluate_classification(split_truth, split_pred, split_proba, split_name)
