# Weather Data Processing Example

This notebook demonstrates how to use the `WeatherDataProcessor` module to replicate the data processing workflow from the original experiment notebook.


## 1. Import Required Libraries


In [16]:
# Import the WeatherDataProcessor from the local module
import sys
import os

# Make the local `src` directory importable.
# The membership check keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path each time.
# NOTE: the path is resolved against the current working directory, so the
# notebook has to be started from the directory that contains `src`.
src_dir = os.path.join(os.getcwd(), 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

from brayam_pineda_ml import WeatherDataProcessor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Set up plotting
# Apply matplotlib's built-in 'default' style sheet, then make seaborn's
# "husl" palette the active color palette for the plots drawn below.
plt.style.use('default')
sns.set_palette("husl")


## 2. Initialize Weather Data Processor

The `WeatherDataProcessor` is initialized with Sydney coordinates and timezone, matching the original experiment setup.


In [17]:
# Initialize the weather data processor with Sydney coordinates.
# The location settings live in one mapping and are handed to the
# constructor as keyword arguments.
sydney_location = {
    "lat": -33.8678,                 # Sydney latitude
    "lon": 151.2073,                 # Sydney longitude
    "timezone": "Australia/Sydney",
}
processor = WeatherDataProcessor(**sydney_location)

print("WeatherDataProcessor initialized for Sydney, Australia")
# Echo the settings back from the instance to confirm what it stored.
for label, value in (
    ("Latitude", processor.lat),
    ("Longitude", processor.lon),
    ("Timezone", processor.timezone),
):
    print(f"{label}: {value}")


WeatherDataProcessor initialized for Sydney, Australia
Latitude: -33.8678
Longitude: 151.2073
Timezone: Australia/Sydney


## 3. Complete Pipeline Processing

The `process_full_pipeline` method replicates the entire data processing workflow from the original notebook in a single call.


In [18]:
# Run the complete pipeline for classification (rain prediction).
#
# This cell has to be executed: the comparison and summary cells near the end
# of the notebook read data['feature_names'], data['X_train'], data['X_val']
# and data['X_test'], and raise NameError on a fresh kernel if `data` was
# never created.
#
# NOTE(review): the pipeline presumably performs its own request to the
# weather archive API (the message below lists "fetching data") — be mindful
# of rate limits when re-running.
print("Running complete data processing pipeline...")
print("This includes: fetching data, creating targets, feature engineering, splitting, scaling, and imputation")

data = processor.process_full_pipeline(
    start_date="2016-01-01",
    end_date="2024-12-31",
    task_type="classification",
    target_name="target_rain"
)

print("\n✅ Data processing complete!")
print(f"Training set: {data['X_train'].shape}")
print(f"Validation set: {data['X_val'].shape}")
print(f"Test set: {data['X_test'].shape}")
print(f"Number of features: {len(data['feature_names'])}")
print(f"Target variable: {data['target_name']}")


## 4. Explore the Processed Data

Let's examine the processed data to understand what features were created.


In [19]:
# NOTE: this exploration cell is disabled. Every statement below reads the
# `data` dict returned by processor.process_full_pipeline() (Section 3), so
# only uncomment it once that pipeline call has been run in this kernel.
# # Display basic information about the processed data
# print("=== TRAINING DATA INFO ===")
# print(f"Shape: {data['X_train'].shape}")
# print(f"Features: {list(data['X_train'].columns)[:10]}...")  # Show first 10 features
# print(f"Target distribution: {data['y_train'].value_counts().to_dict()}")

# print("\n=== FEATURE TYPES ===")
# feature_types = data['X_train'].dtypes.value_counts()
# print(feature_types)

# print("\n=== MISSING VALUES ===")
# missing_values = data['X_train'].isnull().sum()
# print(f"Features with missing values: {(missing_values > 0).sum()}")
# if (missing_values > 0).any():
#     print(missing_values[missing_values > 0].head())


## 5. Step-by-Step Processing Example

Now let's demonstrate how to use individual methods for more granular control over the processing pipeline.


In [20]:
# Step 1: Fetch raw weather data
#
# The archive API rate-limits clients (HTTP 429), and this notebook asks for
# the same date range in several cells. The raw frame is therefore cached on
# disk after the first successful download, and re-runs load the cache
# instead of calling the API again. Delete the cache file to force a refresh.
# The cache is a pickle written by this notebook itself; do not load pickle
# files obtained from untrusted sources.
print("=== STEP 1: FETCHING WEATHER DATA ===")
start_date, end_date = "2016-01-01", "2024-12-31"
raw_cache_path = os.path.join(
    os.getcwd(),
    f"weather_raw_{processor.lat}_{processor.lon}_{start_date}_{end_date}.pkl",
)

if os.path.exists(raw_cache_path):
    raw_data = pd.read_pickle(raw_cache_path)
    print(f"Loaded cached raw data from {raw_cache_path}")
else:
    raw_data = processor.fetch_weather_data(start_date, end_date)
    # Pickle keeps the column dtypes exactly as the processor returned them.
    raw_data.to_pickle(raw_cache_path)

print(f"Raw data shape: {raw_data.shape}")
print(f"Columns: {list(raw_data.columns)}")
print(f"Date range: {raw_data['time'].min()} to {raw_data['time'].max()}")


[32m2025-09-30 17:45:12.852[0m | [1mINFO    [0m | [36mbrayam_pineda_ml.weather_data_processor[0m:[36mfetch_weather_data[0m:[36m64[0m - [1mFetching weather data from 2016-01-01 to 2024-12-31[0m


=== STEP 1: FETCHING WEATHER DATA ===


HTTPError: 429 Client Error: Too Many Requests for url: https://archive-api.open-meteo.com/v1/archive?latitude=-33.8678&longitude=151.2073&start_date=2016-01-01&end_date=2024-12-31&daily=weather_code,temperature_2m_max,temperature_2m_min,apparent_temperature_max,apparent_temperature_min,precipitation_sum,rain_sum,snowfall_sum,precipitation_hours,wind_speed_10m_max,wind_gusts_10m_max,wind_direction_10m_dominant,shortwave_radiation_sum,et0_fao_evapotranspiration,sunshine_duration,daylight_duration&timezone=Australia/Sydney

In [None]:
# Step 2: Create classification target
print("=== STEP 2: CREATING CLASSIFICATION TARGET ===")

# NOTE(review): presumably the label marks rain above `threshold` within the
# next `horizon_days` days — confirm against the processor implementation.
target_settings = dict(target_name="target_rain", threshold=0.1, horizon_days=7)
data_with_target = processor.create_classification_target(raw_data, **target_settings)

# Summarise the freshly created label column.
rain_target = data_with_target['target_rain']
print(f"Data with target shape: {data_with_target.shape}")
print(f"Target distribution: {rain_target.value_counts().to_dict()}")
print(f"Target percentage: {rain_target.mean():.2%}")


[32m2025-09-30 17:36:00.085[0m | [1mINFO    [0m | [36mbrayam_pineda_ml.weather_data_processor[0m:[36mcreate_classification_target[0m:[36m128[0m - [1mCreated classification target 'target_rain' with 3281 samples[0m


=== STEP 2: CREATING CLASSIFICATION TARGET ===
Data with target shape: (3281, 19)
Target distribution: {1: 1682, 0: 1599}
Target percentage: 51.26%


In [None]:
# Step 3: Add temporal features

# Columns to retain, grouped by kind and concatenated in the original order.
temperature_cols = [
    "temperature_2m_max", "temperature_2m_min",
    "apparent_temperature_max", "apparent_temperature_min",
]
wind_cols = ["wind_speed_10m_max", "wind_gusts_10m_max"]
radiation_cols = ["shortwave_radiation_sum", "sunshine_duration", "et0_fao_evapotranspiration"]
context_cols = [
    "weather_code", "wind_direction_10m_dominant",
    "time", "precipitation_sum", "rain_sum", "target_rain",
]
features_keep = temperature_cols + wind_cols + radiation_cols + context_cols

print("=== STEP 3: ADDING TEMPORAL FEATURES ===")
data_with_temporal = processor.add_temporal_features(
    data_with_target,
    keep_columns=features_keep,
)
print("Added temporal features: year, month, season")

season_counts = data_with_temporal['season'].value_counts().to_dict()
print(f"Season distribution: {season_counts}")


[32m2025-09-30 17:36:00.109[0m | [1mINFO    [0m | [36mbrayam_pineda_ml.weather_data_processor[0m:[36madd_temporal_features[0m:[36m174[0m - [1mAdded temporal features and filtered to keep: ['et0_fao_evapotranspiration', 'precipitation_sum', 'wind_speed_10m_max', 'year', 'shortwave_radiation_sum', 'season', 'wind_gusts_10m_max', 'sunshine_duration', 'weather_code', 'wind_direction_10m_dominant', 'temperature_2m_max', 'time', 'apparent_temperature_max', 'apparent_temperature_min', 'rain_sum', 'month', 'temperature_2m_min', 'target_rain'][0m


=== STEP 3: ADDING TEMPORAL FEATURES ===
Added temporal features: year, month, season
Season distribution: {'Autumn': 828, 'Winter': 828, 'Spring': 819, 'Summer': 806}


In [None]:
# Inspect column dtypes and non-null counts after the temporal-feature step.
data_with_temporal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3281 entries, 0 to 3280
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   et0_fao_evapotranspiration   3281 non-null   float64       
 1   precipitation_sum            3281 non-null   float64       
 2   wind_speed_10m_max           3281 non-null   float64       
 3   year                         3281 non-null   int32         
 4   shortwave_radiation_sum      3281 non-null   float64       
 5   season                       3281 non-null   category      
 6   wind_gusts_10m_max           3281 non-null   float64       
 7   sunshine_duration            3281 non-null   float64       
 8   weather_code                 3281 non-null   int64         
 9   wind_direction_10m_dominant  3281 non-null   int64         
 10  temperature_2m_max           3281 non-null   float64       
 11  time                         3281 non-null 

In [None]:
# NOTE: this step is disabled. Its result (`data_with_lags`) is not consumed
# by the following cells — Step 5 builds its rolling features directly from
# `data_with_temporal` — so leaving it commented out does not break the chain.
# # Step 4: Create lag features
# print("=== STEP 4: CREATING LAG FEATURES ===")
# data_with_lags = processor.create_lag_features(
#     data_with_temporal, 
#     target_col="rain_sum", 
#     lags=[1, 2, 3, 7]
# )
# print(f"Added lag features for rain_sum: [1, 2, 3, 7] days")
# print(f"New lag columns: {[col for col in data_with_lags.columns if 'lag' in col]}")


[32m2025-09-30 17:35:28.897[0m | [1mINFO    [0m | [36mbrayam_pineda_ml.weather_data_processor[0m:[36mcreate_lag_features[0m:[36m198[0m - [1mCreated lag features for rain_sum: [1, 2, 3, 7][0m


=== STEP 4: CREATING LAG FEATURES ===
Added lag features for rain_sum: [1, 2, 3, 7] days
New lag columns: ['lag1', 'lag2', 'lag3', 'lag7']


In [None]:
# Step 5: Create rolling features
print("=== STEP 5: CREATING ROLLING FEATURES ===")
data_with_rolling = processor.create_rolling_features(data_with_temporal, windows=[3, 7, 14, 30])
print("Added rolling features for windows: [3, 7, 14, 30] days")

# A column is reported as "rolling" when its name contains one of the
# window tags below.
window_tags = ('3d', '7d', '14d', '30d')
rolling_columns = []
for column_name in data_with_rolling.columns:
    if any(tag in column_name for tag in window_tags):
        rolling_columns.append(column_name)
print(f"New rolling columns: {rolling_columns}")


[32m2025-09-30 17:36:11.622[0m | [1mINFO    [0m | [36mbrayam_pineda_ml.weather_data_processor[0m:[36mcreate_rolling_features[0m:[36m228[0m - [1mCreated rolling features for windows: [3, 7, 14, 30][0m


=== STEP 5: CREATING ROLLING FEATURES ===
Added rolling features for windows: [3, 7, 14, 30] days
New rolling columns: ['precip_3d_sum', 'precip_3d_avg', 'precip_3d_std', 'precip_3d_max', 'rain_days_3d', 'precip_7d_sum', 'precip_7d_avg', 'precip_7d_std', 'precip_7d_max', 'rain_days_7d', 'precip_14d_sum', 'precip_14d_avg', 'precip_14d_std', 'precip_14d_max', 'rain_days_14d', 'precip_30d_sum', 'precip_30d_avg', 'precip_30d_std', 'precip_30d_max', 'rain_days_30d']


In [None]:
# Step 6: Create advanced features
print("=== STEP 6: CREATING ADVANCED FEATURES ===")
data_with_advanced = processor.create_advanced_features(data_with_rolling)
print("Added advanced meteorological features")

total_columns = len(data_with_advanced.columns)
print(f"Total features now: {total_columns}")

# Show a sample of the new columns by matching a few name fragments.
advanced_markers = ('temp_range', 'storm_potential', 'instability')
advanced_examples = []
for column_name in data_with_advanced.columns:
    if any(marker in column_name for marker in advanced_markers):
        advanced_examples.append(column_name)
print(f"Advanced feature examples: {advanced_examples}")


In [None]:
# Step 7: Encode categorical features
print("=== STEP 7: ENCODING CATEGORICAL FEATURES ===")
data_encoded = processor.encode_categorical_features(data_with_advanced, categorical_cols=["season"])
print("Encoded categorical features: season")

# List every resulting column whose name mentions the encoded variable.
season_columns = list(filter(lambda column_name: 'season' in column_name, data_encoded.columns))
print(f"Season dummy columns: {season_columns}")


In [None]:
# Step 8: Split data chronologically
print("=== STEP 8: SPLITTING DATA CHRONOLOGICALLY ===")
split_result = processor.split_time_series_data(
    data_encoded,
    target_col="target_rain",
    train_ratio=0.7,
    val_ratio=0.15,
)
# The processor returns the three feature frames followed by the three targets.
X_train, X_val, X_test, y_train, y_val, y_test = split_result

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
for split_name, split_target in (("Train", y_train), ("Val", y_val), ("Test", y_test)):
    print(f"Target distributions - {split_name}: {split_target.value_counts().to_dict()}")


In [None]:
# Step 9: Impute missing values
print("=== STEP 9: IMPUTING MISSING VALUES ===")
imputed_splits = processor.impute_missing_values(X_train, X_val, X_test, strategy="mean")
X_train_imp, X_val_imp, X_test_imp = imputed_splits
print("Imputed missing values using mean strategy")

# Total count of NaN cells left in the training frame after imputation.
remaining_missing = X_train_imp.isnull().sum().sum()
print(f"Missing values after imputation: {remaining_missing}")


In [None]:
# Step 10: Scale features
print("=== STEP 10: SCALING FEATURES ===")
scaled_splits = processor.scale_features(X_train_imp, X_val_imp, X_test_imp, method="standard")
X_train_scaled, X_val_scaled, X_test_scaled = scaled_splits

print("Scaled features using standard scaling")
print(f"Scaled data shape: {X_train_scaled.shape}")
print("Feature scaling statistics (first 5 features):")
# Summary statistics for the first five columns of the scaled training frame.
first_five_scaled = X_train_scaled.iloc[:, :5]
print(first_five_scaled.describe())


## 6. Model Training and Evaluation

Now let's train a simple model to demonstrate the complete workflow.


In [None]:
# Train a simple Random Forest model
print("=== TRAINING RANDOM FOREST MODEL ===")
rf_params = {
    "n_estimators": 100,
    "max_depth": 10,
    "random_state": 42,   # fixed seed so repeated runs give the same forest
    "n_jobs": -1,
}
rf_model = RandomForestClassifier(**rf_params)

# Train on processed data
rf_model.fit(X_train_scaled, y_train)
print("Model trained successfully!")

# Hard class predictions for train / validation / test, in that order.
y_train_pred, y_val_pred, y_test_pred = (
    rf_model.predict(split_features)
    for split_features in (X_train_scaled, X_val_scaled, X_test_scaled)
)

# Class-1 probabilities (second column of predict_proba) for the same splits.
y_train_proba, y_val_proba, y_test_proba = (
    rf_model.predict_proba(split_features)[:, 1]
    for split_features in (X_train_scaled, X_val_scaled, X_test_scaled)
)


In [None]:
# Evaluate model performance
print("=== MODEL PERFORMANCE EVALUATION ===")


def _classification_scores(y_true, y_pred, y_proba):
    """Return (AUROC, F1, accuracy) for one data split.

    AUROC is computed from the class-1 probabilities; F1 and accuracy are
    computed from the hard class predictions.
    """
    return (
        roc_auc_score(y_true, y_proba),
        f1_score(y_true, y_pred),
        accuracy_score(y_true, y_pred),
    )


# Training, validation and test performance
train_auc, train_f1, train_acc = _classification_scores(y_train, y_train_pred, y_train_proba)
val_auc, val_f1, val_acc = _classification_scores(y_val, y_val_pred, y_val_proba)
test_auc, test_f1, test_acc = _classification_scores(y_test, y_test_pred, y_test_proba)

print(f"Training - AUROC: {train_auc:.4f}, F1: {train_f1:.4f}, Accuracy: {train_acc:.4f}")
print(f"Validation - AUROC: {val_auc:.4f}, F1: {val_f1:.4f}, Accuracy: {val_acc:.4f}")
print(f"Test - AUROC: {test_auc:.4f}, F1: {test_f1:.4f}, Accuracy: {test_acc:.4f}")

# Check for overfitting: a train AUROC more than 0.05 above the validation
# AUROC is reported as a warning.
overfitting_gap = train_auc - val_auc
print(f"\nOverfitting gap (Train AUC - Val AUC): {overfitting_gap:.4f}")
overfitting_message = (
    "⚠️  Warning: Potential overfitting detected!"
    if overfitting_gap > 0.05
    else "✅ No significant overfitting detected"
)
print(overfitting_message)


## 7. Feature Importance Analysis

Let's examine which features are most important for the model.


In [None]:
# Get feature importance: pair each training column with the forest's
# importance score and rank from most to least important.
importance_table = pd.DataFrame({
    'feature': X_train_scaled.columns,
    'importance': rf_model.feature_importances_
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("=== TOP 20 MOST IMPORTANT FEATURES ===")
print(feature_importance.head(20))

# Plot feature importance as a horizontal bar chart of the top 15 features.
top_features = feature_importance.head(15)
bar_positions = range(len(top_features))

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Feature Importance')
ax.set_title('Top 15 Most Important Features')
ax.invert_yaxis()  # most important feature at the top
fig.tight_layout()
plt.show()


## 8. Regression Example

Let's also demonstrate how to use the processor for regression tasks.


In [None]:
# Example: Regression pipeline for precipitation prediction
print("=== REGRESSION PIPELINE EXAMPLE ===")

# Create a new processor instance for regression (same Sydney settings).
reg_processor = WeatherDataProcessor(lat=-33.8678, lon=151.2073, timezone="Australia/Sydney")

# Run regression pipeline
regression_options = dict(
    start_date="2016-01-01",
    end_date="2024-12-31",
    task_type="regression",
    target_name="precip_3day_next",
)
reg_data = reg_processor.process_full_pipeline(**regression_options)

reg_train_shape = reg_data['X_train'].shape
reg_val_shape = reg_data['X_val'].shape
reg_test_shape = reg_data['X_test'].shape
print(f"Regression data - Train: {reg_train_shape}, Val: {reg_val_shape}, Test: {reg_test_shape}")
print(f"Target variable: {reg_data['target_name']}")
print(f"Target statistics: {reg_data['y_train'].describe()}")


## 9. Summary and Benefits

The `WeatherDataProcessor` module provides several key benefits:

### **Complete Pipeline Processing**
- Single method call handles the entire workflow
- Consistent with the original notebook processing
- Reduces code duplication and errors

### **Modular Design**
- Individual methods for each processing step
- Full control over the processing pipeline
- Easy to customize and extend

### **Built-in Best Practices**
- Chronological data splitting for time series
- Proper scaling and imputation
- Advanced feature engineering
- Categorical encoding

### **Flexibility**
- Supports both classification and regression tasks
- Configurable parameters for all methods
- Easy to adapt for different locations and time periods


## 10. Feature Selection Example

Now let's demonstrate the feature selection capability using the specific variables from the original experiment notebook.


In [None]:
# Define the specific features to keep (from your original notebook).
# The groups are concatenated in a fixed order, so the resulting list is
# element-for-element the same as a single flat literal.
temperature_cols = [
    "temperature_2m_max", "temperature_2m_min",
    "apparent_temperature_max", "apparent_temperature_min",
]
wind_cols = ["wind_speed_10m_max", "wind_gusts_10m_max"]
# Radiation / sunshine / evapotranspiration
radiation_cols = ["shortwave_radiation_sum", "sunshine_duration", "et0_fao_evapotranspiration"]
# Categorical / context
context_cols = [
    "weather_code", "wind_direction_10m_dominant",
    "time", "precipitation_sum", "rain_sum", "target_rain",
]
features_keep = [*temperature_cols, *wind_cols, *radiation_cols, *context_cols]

print("=== FEATURE SELECTION EXAMPLE ===")
print(f"Features to keep: {features_keep}")
print(f"Total features to keep: {len(features_keep)}")


In [None]:
# Create a new processor for feature selection example
feature_processor = WeatherDataProcessor(lat=-33.8678, lon=151.2073, timezone="Australia/Sydney")

# Run pipeline with feature selection
print("Running pipeline with feature selection...")
selection_options = dict(
    start_date="2016-01-01",
    end_date="2024-12-31",
    task_type="classification",
    target_name="target_rain",
    keep_columns=features_keep,  # This is the new parameter!
)
selected_data = feature_processor.process_full_pipeline(**selection_options)

print("\n✅ Feature selection complete!")
for split_label, split_key in (
    ("training", 'X_train'),
    ("validation", 'X_val'),
    ("test", 'X_test'),
):
    print(f"Selected {split_label} set: {selected_data[split_key].shape}")

selected_names = selected_data['feature_names']
print(f"Number of selected features: {len(selected_names)}")
print(f"Selected features: {selected_names}")


In [None]:
# Compare with the full pipeline (without feature selection).
# Reads `data` (result of the full pipeline run) and `selected_data`
# (result of the feature-selection run); both must exist in the kernel.
print("=== COMPARISON: WITH vs WITHOUT FEATURE SELECTION ===")
full_feature_names = data['feature_names']
selected_feature_names = selected_data['feature_names']

print(f"Full pipeline features: {len(full_feature_names)}")
print(f"Selected features: {len(selected_feature_names)}")
print(f"Reduction: {len(full_feature_names) - len(selected_feature_names)} features removed")

# Show the difference in feature sets
full_features = set(full_feature_names)
selected_features = set(selected_feature_names)
removed_features = full_features.difference(selected_features)

print(f"\nRemoved features ({len(removed_features)}):")
for removed_name in sorted(removed_features):
    print(f"  - {removed_name}")


In [None]:
# Example: Using add_temporal_features with feature selection
print("=== INDIVIDUAL METHOD WITH FEATURE SELECTION ===")

# Fetch raw data
raw_data = feature_processor.fetch_weather_data("2016-01-01", "2024-12-31")

# Create target
data_with_target = feature_processor.create_classification_target(
    raw_data,
    target_name="target_rain",
    threshold=0.1,
    horizon_days=7
)

# Add temporal features AND filter to keep only specified columns
data_filtered = feature_processor.add_temporal_features(
    data_with_target,
    keep_columns=features_keep
)

# add_temporal_features both ADDS columns (year, month, season) and DROPS the
# ones not listed in keep_columns, so subtracting the two column counts
# under-reports how many were removed. Compare the column sets instead.
columns_before = set(data_with_target.columns)
columns_after = set(data_filtered.columns)
removed_columns = sorted(columns_before - columns_after)
added_columns = sorted(columns_after - columns_before)

print(f"Original data shape: {data_with_target.shape}")
print(f"Filtered data shape: {data_filtered.shape}")
print(f"Columns kept: {list(data_filtered.columns)}")
print(f"Features removed: {len(removed_columns)} {removed_columns}")
print(f"Features added: {len(added_columns)} {added_columns}")


In [None]:
# Recap of the run, including the feature-selection additions.
# Reads `data` (full pipeline result) and `test_auc` (model evaluation cell).
print("=== UPDATED PROCESSING SUMMARY ===")
print("✅ Successfully processed weather data from 2016-2024")

engineered_feature_count = len(data['feature_names'])
print(f"✅ Created {engineered_feature_count} features through advanced engineering")
print("✅ NEW: Feature selection capability added to add_temporal_features()")
print("✅ NEW: Can filter to keep only specific columns during processing")

# Row counts of the three chronological splits.
n_train, n_val, n_test = (data[split_key].shape[0] for split_key in ('X_train', 'X_val', 'X_test'))
print(f"✅ Split data chronologically: {n_train} train, {n_val} val, {n_test} test")
print("✅ Applied scaling, imputation, and encoding")
print(f"✅ Trained model with AUROC: {test_auc:.4f} on test set")
print("\nThe WeatherDataProcessor module now includes feature selection capabilities!")
print("This allows you to replicate your original notebook's feature selection approach.")


In [None]:
# Recap of the base workflow.
# Reads `data` (full pipeline result) and `test_auc` (model evaluation cell).
print("=== PROCESSING SUMMARY ===")
print("✅ Successfully processed weather data from 2016-2024")

engineered_feature_count = len(data['feature_names'])
print(f"✅ Created {engineered_feature_count} features through advanced engineering")

# Row counts of the three chronological splits.
n_train, n_val, n_test = (data[split_key].shape[0] for split_key in ('X_train', 'X_val', 'X_test'))
print(f"✅ Split data chronologically: {n_train} train, {n_val} val, {n_test} test")
print("✅ Applied scaling, imputation, and encoding")
print(f"✅ Trained model with AUROC: {test_auc:.4f} on test set")
print("\nThe WeatherDataProcessor module successfully replicates the original notebook workflow!")
print("This demonstrates how the custom package streamlines the data processing pipeline.")
