In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pathlib import Path

# Load the reduced dataset, parse its date column, and derive the month
PROCESSED = Path("../../data/processed")
df = (
    pd.read_csv(PROCESSED / "reduced_dataset.csv")
    # Dates are parsed with an explicit YYYY-MM-DD format
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%m-%d'))
    # Month number taken from the parsed date; used later for stratification
    .assign(month=lambda frame: frame['date'].dt.month)
)

# Quick summary of what was loaded
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Total samples: {len(df):,}")
print(f"Months distribution:\n{df['month'].value_counts().sort_index()}")

def month_stratified_split(df, test_size=0.2, val_size=0.2, random_state=42):
    """
    Create train/validation/test splits stratified by month and outage class.

    Rows are stratified on the combination of the ``month`` and
    ``outage_occurred`` columns, so each split keeps roughly the same share
    of every month/class pair as the input.

    Args:
        df: DataFrame containing ``month`` and ``outage_occurred`` columns.
            It is copied first, so the caller's frame is not modified.
        test_size: Fraction of all rows to place in the test split.
        val_size: Fraction of all rows to place in the validation split.
        random_state: Seed passed to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train_df, val_df, test_df)``; the temporary stratification
        column is not present in any of them.

    Raises:
        ValueError: If ``test_size`` is not strictly between 0 and 1, or if
            ``val_size`` does not leave a non-empty validation and training
            portion of the rows remaining after the test split.
    """
    # Validate up front so a bad combination fails with a clear message
    # instead of a complaint about a derived size inside the second split.
    if not 0 < test_size < 1:
        raise ValueError(
            f"test_size must be strictly between 0 and 1, got {test_size!r}"
        )

    # Validation fraction expressed relative to what is left after the
    # test rows have been removed.
    val_relative_size = val_size / (1 - test_size)
    if not 0 < val_relative_size < 1:
        raise ValueError(
            "val_size must be positive and test_size + val_size must be "
            f"below 1, got test_size={test_size!r}, val_size={val_size!r}"
        )

    # Create a combined stratification column: month + class
    df = df.copy()
    df['month_class'] = df['month'].astype(str) + '_' + df['outage_occurred'].astype(str)

    print(f"Unique month-class combinations: {df['month_class'].nunique()}")

    # First split: separate test set stratified by month_class
    temp_df, test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df['month_class'],
        random_state=random_state
    )

    # Second split: separate validation set from remaining data
    train_df, val_df = train_test_split(
        temp_df,
        test_size=val_relative_size,
        stratify=temp_df['month_class'],
        random_state=random_state
    )

    # Drop the temporary stratification column. New frames are returned
    # instead of dropping in place: the split results are selections taken
    # from another frame, and mutating them in place can raise
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    train_df, val_df, test_df = (
        split_df.drop(columns=['month_class'])
        for split_df in (train_df, val_df, test_df)
    )

    return train_df, val_df, test_df

# Build the three splits using the function's default proportions
train_df, val_df, test_df = month_stratified_split(df)

# Report each split's size and its share of the full dataset
print(f"\n=== Month-Stratified Split ===")
for split_label, split_frame in (
    ("Training", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"{split_label} set: {len(split_frame):,} samples ({len(split_frame)/len(df):.1%})")

# Analyze month distribution in each split
def analyze_month_distribution(train_df, val_df, test_df):
    """
    Summarize month and class proportions for each split.

    For every split this prints, and records, the normalized frequency of
    each ``month`` value (ordered by month) and of each ``outage_occurred``
    value.

    Returns:
        A list with one dict per split, in the order Training, Validation,
        Test. Each dict has the keys ``split``, ``samples``,
        ``month_distribution`` and ``class_distribution``.
    """
    named_splits = {
        'Training': train_df,
        'Validation': val_df,
        'Test': test_df,
    }
    summaries = []

    for label, frame in named_splits.items():
        # Relative frequencies as plain dicts; months sorted by month number
        month_share = frame['month'].value_counts(normalize=True).sort_index().to_dict()
        class_share = frame['outage_occurred'].value_counts(normalize=True).to_dict()

        print(f"\n{label} Split:")
        print(f"Month distribution: {month_share}")
        print(f"Class distribution: {class_share}")

        summaries.append({
            'split': label,
            'samples': len(frame),
            'month_distribution': month_share,
            'class_distribution': class_share,
        })

    return summaries

# Print and keep the per-split month/class proportions
analysis = analyze_month_distribution(
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
)

Date range: 2014-01-01 00:00:00 to 2014-12-31 00:00:00
Total samples: 366,460
Months distribution:
month
1     31124
2     28112
3     31124
4     30120
5     31124
6     30120
7     31124
8     31124
9     30120
10    31124
11    30120
12    31124
Name: count, dtype: int64
Unique month-class combinations: 14

=== Month-Stratified Split ===
Training set: 219,876 samples (60.0%)
Validation set: 73,292 samples (20.0%)
Test set: 73,292 samples (20.0%)

Training Split:
Month distribution: {1: 0.08492968764212556, 2: 0.0767159671815023, 3: 0.08492968764212556, 4: 0.0821917808219178, 5: 0.08492968764212556, 6: 0.0821917808219178, 7: 0.08492968764212556, 8: 0.08492968764212556, 9: 0.0821917808219178, 10: 0.08493423566009933, 11: 0.08219632883989157, 12: 0.08492968764212556}
Class distribution: {0: 0.9061471010933435, 1: 0.09385289890665648}

Validation Split:
Month distribution: {1: 0.08493423566009933, 2: 0.07670687114555477, 3: 0.08493423566009933, 4: 0.0821917808219178, 5: 0.08493423566009

## Data Split Summary

### Dataset Characteristics
- **Monthly Distribution**: Relatively balanced across months (about 7.7-8.5% per month; February is the lowest because it has the fewest days)
- **Class Imbalance**: ~90.6% non-outage (Class 0) vs ~9.4% outage (Class 1) events

### Split Configuration
- **Training Set**: 219,876 samples (60%)
- **Validation Set**: 73,292 samples (20%)
- **Test Set**: 73,292 samples (20%)

### Methodological Strengths
- **Seasonal Representation**: Each split contains proportional representation of all 12 months
- **Reduced Seasonal Bias**: Every split covers the entire year's patterns, so models learn from all months, which reduces the risk of overfitting to specific months
- **Class Distribution Preservation**: Maintains consistent non-outage/outage ratios (90.6%/9.4%) across all splits