In [2]:
from pathlib import Path
try:
    import google.colab
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    PROJECT_ROOT = Path("/content/drive/MyDrive/ABT_Global/AI-Studio-Project")
except ImportError:
    PROJECT_ROOT = Path("../..").resolve()
PROCESSED = PROJECT_ROOT / "data" / "processed"
PROCESSED.mkdir(parents=True, exist_ok=True)
print(f"Processed: {PROCESSED}")

Mounted at /content/drive
Processed: /content/drive/MyDrive/ABT_Global/AI-Studio-Project/data/processed


## Environment Setup

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import json

# Load your reduced dataset using dynamic paths
df = pd.read_csv(PROCESSED / "reduced_dataset.csv")

# Convert date column to datetime and extract month
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['month'] = df['date'].dt.month

print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Total samples: {len(df):,}")
print(f"Months distribution:\n{df['month'].value_counts().sort_index()}")

def month_stratified_split(df, test_size=0.2, val_size=0.2, random_state=42, save_splits=True):
    """
    Performs a stratified split of the DataFrame into training, validation, and test sets.
    Stratification is done based on both 'month' and 'outage' columns to ensure
    representative distributions in each split.

    Args:
        df (pd.DataFrame): The input DataFrame containing 'date', 'month', and 'outage' columns.
        test_size (float): The proportion of the dataset to include in the test split.
        val_size (float): The proportion of the dataset to include in the validation split.
        random_state (int): Controls the shuffling applied to the data before applying the split.
        save_splits (bool): If True, saves the split DataFrames to CSV files in the PROCESSED directory.

    Returns:
        tuple: A tuple containing the training, validation, and test DataFrames.
    """
    # Ensure 'outage' column exists for stratification
    if 'outage_occurred' not in df.columns:
        print("Warning: 'outage' column not found. Stratification will only be done by 'month'.")
        df['stratify_col'] = df['month'].astype(str)
    else:
        # Create a combined stratification column for month and outage
        df['stratify_col'] = df['month'].astype(str) + '_' + df['outage_occurred'].astype(str)

    # First split: train_val and test set
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df['stratify_col']
    )

    # Calculate adjusted validation size for the second split
    # val_size is relative to the original DataFrame, we need it relative to train_val_df
    val_size_adjusted = val_size / (1 - test_size)

    # Second split: train and validation set from train_val_df
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_size_adjusted,
        random_state=random_state,
        stratify=train_val_df['stratify_col']
    )

    # Drop the temporary stratification column from all splits
    train_df = train_df.drop(columns=['stratify_col'])
    val_df = val_df.drop(columns=['stratify_col'])
    test_df = test_df.drop(columns=['stratify_col'])

    print("\n--- Data Split Summary ---")
    print(f"Train samples: {len(train_df):,}")
    print(f"Validation samples: {len(val_df):,}")
    print(f"Test samples: {len(test_df):,}")

    print("\nMonth Distribution in Splits:")
    print("Train (%):\n", train_df['month'].value_counts(normalize=True).sort_index() * 100)
    print("Validation (%):\n", val_df['month'].value_counts(normalize=True).sort_index() * 100)
    print("Test (%):\n", test_df['month'].value_counts(normalize=True).sort_index() * 100)

    if 'outage' in df.columns:
        print("\nOutage Distribution in Splits:")
        print("Train (%):\n", train_df['outage_occurred'].value_counts(normalize=True).sort_index() * 100)
        print("Validation (%):\n", val_df['outage_occurred'].value_counts(normalize=True).sort_index() * 100)
        print("Test (%):\n", test_df['outage_occurred'].value_counts(normalize=True).sort_index() * 100)

    if save_splits:
        PROCESSED.mkdir(parents=True, exist_ok=True) # Ensure directory exists
        train_df.to_csv(PROCESSED / "train_df.csv", index=False)
        val_df.to_csv(PROCESSED / "val_df.csv", index=False)
        test_df.to_csv(PROCESSED / "test_df.csv", index=False)
        print(f"\nSplits saved to {PROCESSED}/")

    return train_df, val_df, test_df

Date range: 2014-01-01 00:00:00 to 2020-12-31 00:00:00
Total samples: 3,976,135
Months distribution:
month
1     337435
2     307890
3     337435
4     326550
5     337435
6     326550
7     337435
8     337435
9     326550
10    337435
11    326550
12    337435
Name: count, dtype: int64


In [6]:
train_df, val_df, test_df = month_stratified_split(df, test_size=0.2, val_size=0.2, random_state=42, save_splits=True)



--- Data Split Summary ---
Train samples: 2,385,681
Validation samples: 795,227
Test samples: 795,227

Month Distribution in Splits:
Train (%):
 month
1     8.486508
2     7.743449
3     8.486508
4     8.212749
5     8.486508
6     8.212749
7     8.486508
8     8.486508
9     8.212749
10    8.486508
11    8.212749
12    8.486508
Name: proportion, dtype: float64
Validation (%):
 month
1     8.486508
2     7.743449
3     8.486508
4     8.212749
5     8.486508
6     8.212749
7     8.486508
8     8.486508
9     8.212749
10    8.486508
11    8.212749
12    8.486508
Name: proportion, dtype: float64
Test (%):
 month
1     8.486508
2     7.743449
3     8.486508
4     8.212749
5     8.486508
6     8.212749
7     8.486508
8     8.486508
9     8.212749
10    8.486508
11    8.212749
12    8.486508
Name: proportion, dtype: float64

Splits saved to /content/drive/MyDrive/ABT_Global/AI-Studio-Project/data/processed/


## Data Split Summary

### Dataset Characteristics
- **Monthly Distribution**: Relatively balanced across months (8.2-8.5% per month)
- **Class Imbalance**: ~90.6% non-outage (Class 0) vs ~9.4% outage (Class 1) events

### Split Configuration
- **Training Set**: (60%)
- **Validation Set**: (20%)
- **Test Set**: (20%)

### Methodological Strengths
- **Seasonal Representation**: Each split contains proportional representation of all 12 months
- **Reduced Seasonal Bias**: Models learn from entire year's patterns across all splits which prevents overfitting to specific months
- **Class Distribution Preservation**: Maintains consistent outage/non-outage ratios (90.6%/9.4%)