In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pathlib import Path
import json

# Read the reduced dataset, parsing the 'date' column as datetimes
PROCESSED = Path("../../data/processed")
reduced_csv = PROCESSED / "reduced_dataset.csv"
df = pd.read_csv(reduced_csv, parse_dates=['date'])

date_col = df['date']
print(f"Date range: {date_col.min()} to {date_col.max()}")
print(f"Total samples: {len(df):,}")

# Calendar month and year are used below as stratification keys
df['month'] = date_col.dt.month
df['year'] = date_col.dt.year

# Row counts per calendar month and per year, in ascending key order
month_counts = df['month'].value_counts().sort_index()
year_counts = df['year'].value_counts().sort_index()
print(f"Months distribution:\n{month_counts}")
print(f"Years distribution:\n{year_counts}")

def stratified_train_val_test_split(df, test_size=0.2, val_size=0.2, random_state=42, save_splits=True, output_dir=None):
    """
    Split a dataframe into train, validation, and test sets.

    Rows are stratified on the combination of year, month and
    ``outage_occurred`` so that each split keeps the seasonal and class
    distribution of the full dataset.

    NOTE(review): ``train_test_split`` shuffles rows, so observations from the
    same period end up in all three splits -- confirm this is acceptable if
    features are time-dependent.

    Args:
        df: DataFrame with the columns ``year``, ``month`` and
            ``outage_occurred``. It is copied, never modified.
        test_size: Fraction of ALL rows assigned to the test set (0 < x < 1).
        val_size: Fraction of ALL rows assigned to the validation set
            (0 < x < 1). ``test_size + val_size`` must be below 1.
        random_state: Seed passed to both ``train_test_split`` calls.
        save_splits: When True, write ``train.csv``, ``val.csv``, ``test.csv``
            and ``split_info.json`` to ``output_dir``.
        output_dir: Directory for the saved files. Defaults to
            ``PROCESSED / "stratified_splits_2014_2020"``.

    Returns:
        Tuple ``(train_df, val_df, test_df)``; the helper columns
        ``stratify_col``, ``month`` and ``year`` are removed from each.

    Raises:
        ValueError: If the requested fractions are out of range or leave no
            rows for training. ``train_test_split`` may also raise it, e.g.
            when a stratum has too few rows to be divided.
        KeyError: If a required column is missing from ``df``.
    """
    # Fail early with a clear message instead of deep inside scikit-learn.
    if not (0 < test_size < 1 and 0 < val_size < 1):
        raise ValueError(
            f"test_size and val_size must be fractions in (0, 1); "
            f"got test_size={test_size}, val_size={val_size}"
        )
    if test_size + val_size >= 1:
        raise ValueError(
            f"test_size + val_size must be < 1 to leave rows for training; "
            f"got {test_size} + {val_size}"
        )

    required_cols = ('year', 'month', 'outage_occurred')
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(f"DataFrame is missing required column(s): {missing_cols}")

    df = df.copy()
    df['stratify_col'] = df['year'].astype(str) + "_" + df['month'].astype(str) + "_" + df['outage_occurred'].astype(str)

    # Split off test set
    temp_df, test_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df['stratify_col'],
        random_state=random_state
    )

    # Split remaining into train and validation. val_size is expressed as a
    # share of the full dataset, so rescale it to the remaining rows.
    val_relative_size = val_size / (1 - test_size)
    train_df, val_df = train_test_split(
        temp_df,
        test_size=val_relative_size,
        stratify=temp_df['stratify_col'],
        random_state=random_state
    )

    # Drop helper columns. A non-inplace drop avoids mutating objects handed
    # back by train_test_split (and the SettingWithCopyWarning that can cause).
    helper_cols = ['stratify_col', 'month', 'year']
    train_df, val_df, test_df = (
        part.drop(columns=helper_cols) for part in (train_df, val_df, test_df)
    )

    if save_splits:
        if output_dir is None:
            output_dir = PROCESSED / "stratified_splits_2014_2020"
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        for filename, part in (("train.csv", train_df), ("val.csv", val_df), ("test.csv", test_df)):
            part.to_csv(output_dir / filename, index=False)

        total = len(df)
        split_info = {
            "total_samples": total,
            "train_samples": len(train_df),
            "val_samples": len(val_df),
            "test_samples": len(test_df),
            "split_ratios": {
                "train": len(train_df) / total,
                "val": len(val_df) / total,
                "test": len(test_df) / total,
            },
        }
        with open(output_dir / "split_info.json", "w", encoding="utf-8") as f:
            json.dump(split_info, f, indent=2)
        print(f"Splits saved to: {output_dir}")

    return train_df, val_df, test_df

# Build the three partitions with the function's default settings
train_df, val_df, test_df = stratified_train_val_test_split(df)

# Report the size and class balance of every partition
splits_by_name = {'Train': train_df, 'Validation': val_df, 'Test': test_df}
for name, split in splits_by_name.items():
    class_shares = split['outage_occurred'].value_counts(normalize=True).to_dict()
    print(f"\n{name} split: {len(split):,} samples")
    print("Class distribution:", class_shares)


Date range: 2014-01-01 00:00:00 to 2020-12-31 00:00:00
Total samples: 3,976,135
Months distribution:
month
1     337435
2     307890
3     337435
4     326550
5     337435
6     326550
7     337435
8     337435
9     326550
10    337435
11    326550
12    337435
Name: count, dtype: int64
Years distribution:
year
2014    567575
2015    567575
2016    569130
2017    567575
2018    567575
2019    567575
2020    569130
Name: count, dtype: int64
Splits saved to: ..\..\data\processed\stratified_splits_2014_2020

Train split: 2,385,681 samples
Class distribution: {0: 0.5009475281900639, 1: 0.499052471809936}

Validation split: 795,227 samples
Class distribution: {0: 0.5009462706874892, 1: 0.49905372931251074}

Test split: 795,227 samples
Class distribution: {0: 0.5009462706874892, 1: 0.49905372931251074}
