
# 01 — Dimensionality Reduction

**Milestone 2** · Power Outage Prediction

This notebook reduces the dataset's dimensionality by selecting a compact, **leak-free** set of features suitable for modeling. It supports multiple selection strategies (univariate F-test, Mutual Information), correlation pruning, and optional PCA on continuous features. It saves a reduced dataset and an ordered list of final features for downstream modeling.


In [2]:

# === Setup ===
import json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore")

# Project paths (this notebook expected to live in notebooks/milestone_2/)
PROCESSED = Path("../../data/processed")
OUTPUTS = Path("../../notebooks/outputs/milestone_2")
OUTPUTS.mkdir(parents=True, exist_ok=True)

print("Processed dir:", PROCESSED.resolve())
print("Outputs dir  :", OUTPUTS.resolve())


Processed dir: /Users/ayahalmusaddy/projects/AI-Studio-Project/data/processed
Outputs dir  : /Users/ayahalmusaddy/projects/AI-Studio-Project/notebooks/outputs/milestone_2


## Load data

In [3]:

# Try to load the modeling matrix produced in earlier notebooks.
# Fallback to integrated dataset if the modeling matrix isn't available.
model_df_path = PROCESSED / "modeling_matrix.parquet"
integrated_csv = PROCESSED / "integrated_dataset.csv"

if model_df_path.exists():
    df = pd.read_parquet(model_df_path)
    source = "modeling_matrix.parquet"
elif integrated_csv.exists():
    df = pd.read_csv(integrated_csv)
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'])
    source = "integrated_dataset.csv"
else:
    raise FileNotFoundError("No data found in ../../data/processed (need modeling_matrix.parquet or integrated_dataset.csv).")

print(f"Loaded: {source} with shape {df.shape}")
print("Columns:", list(df.columns)[:12], "...")


Loaded: integrated_dataset.csv with shape (3973578, 17)
Columns: ['fips_code', 'date', 'prcp', 'tmax', 'tmin', 'outage_occurred', 'customers_out', 'county', 'state', 'run_start_time', 'year', 'month'] ...


## Define target and exclude leakage / IDs

In [4]:

# Target
if 'outage_occurred' not in df.columns:
    raise KeyError("Column 'outage_occurred' is required as the classification target.")

y = df['outage_occurred'].astype(int)

# Columns we never use as features (IDs, text labels, or leakage)
always_exclude = {
    # IDs / metadata
    'fips_code', 'county', 'state', 'region', 'date', 'county_name',
    # Leakage (only known after event or derived from target)
    'run_start_time', 'customers_out', 'outage_hour',
    'state_risk_score', 'region_risk_score', 'state_risk_category', 'high_risk_state'
}

# Often excluded for time-based evaluation; keep cyclical encodings instead
optional_exclude = {'year'}  # edit if you decide to include raw year

exclude_cols = [c for c in df.columns if c in always_exclude or c in optional_exclude or c == 'outage_occurred']

X = df.drop(columns=[c for c in exclude_cols if c in df.columns]).copy()
print("Initial feature columns:", X.shape[1])


Initial feature columns: 7


## Preprocess: encode categoricals and impute missing

In [5]:

# Encode remaining object/category columns
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Impute missing: numeric -> median; others -> mode
for col in X.columns:
    if X[col].isna().any():
        if pd.api.types.is_numeric_dtype(X[col]):
            X[col] = X[col].fillna(X[col].median())
        else:
            X[col] = X[col].fillna(X[col].mode().iloc[0])

print("Post-preprocess shape:", X.shape)
print("Any NA left:", int(X.isna().sum().sum()))


Post-preprocess shape: (3973578, 7)
Any NA left: 0


## Univariate selection: F-test and Mutual Information

In [6]:

K = 30  # desired top-k from each univariate method (adjust later if needed)

# F-test (assumes roughly linear/normal relationship)
f_selector = SelectKBest(score_func=f_classif, k=min(K, X.shape[1]))
f_selector.fit(X, y)
f_scores = f_selector.scores_
f_rank = pd.Series(f_scores, index=X.columns).sort_values(ascending=False).rename("f_score")

# Mutual Information (captures nonlinear dependencies)
mi_selector = SelectKBest(score_func=mutual_info_classif, k=min(K, X.shape[1]))
mi_selector.fit(X, y)
mi_scores = mi_selector.scores_
mi_rank = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False).rename("mi_score")

# Combine into a table
rank_df = pd.concat([f_rank, mi_rank], axis=1).fillna(0)
rank_df['rank_sum'] = rank_df['f_score'].rank(ascending=False) + rank_df['mi_score'].rank(ascending=False)
rank_df = rank_df.sort_values(['rank_sum'], ascending=True)

rank_df.to_csv(PROCESSED / "feature_scores_univariate.csv", index=True)
print("Saved univariate scores ->", PROCESSED / "feature_scores_univariate.csv")

top_f  = f_rank.head(K).index.tolist()
top_mi = mi_rank.head(K).index.tolist()
print("Top-F (k):", len(top_f))
print("Top-MI(k):", len(top_mi))

# Candidate pools
cand_union = list(dict.fromkeys(top_f + top_mi))  # preserve order, dedup
cand_inter = [c for c in top_f if c in top_mi]
print("Union size:", len(cand_union), "| Intersection size:", len(cand_inter))


Saved univariate scores -> ../../data/processed/feature_scores_univariate.csv
Top-F (k): 7
Top-MI(k): 7
Union size: 7 | Intersection size: 7


## Correlation pruning (remove highly collinear features)

In [7]:

def correlation_prune(cols, corr_threshold=0.90):
    if len(cols) <= 1:
        return cols
    C = X[cols].corr().abs()
    # Upper triangle mask
    upper = C.where(np.triu(np.ones(C.shape), k=1).astype(bool))
    to_drop = set()
    for c in upper.columns:
        if c in to_drop: 
            continue
        high = upper[c][upper[c] > corr_threshold].index.tolist()
        to_drop.update(high)
    kept = [c for c in cols if c not in to_drop]
    return kept

union_pruned = correlation_prune(cand_union, corr_threshold=0.90)
inter_pruned = correlation_prune(cand_inter, corr_threshold=0.90)
print("After pruning -> union:", len(union_pruned), "intersection:", len(inter_pruned))


After pruning -> union: 6 intersection: 6


## Optional PCA on continuous features (for reference)

In [8]:

# We'll only run PCA on numeric columns as an auxiliary representation.
# Save explained-variance info; do not force components into final feature set by default.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
if len(num_cols) >= 2:
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X[num_cols])
    pca = PCA(n_components=None, random_state=42)
    pca.fit(Xs)
    evr = pca.explained_variance_ratio_
    cum_evr = np.cumsum(evr)

    # Choose #components to reach >=95% variance
    n95 = int(np.searchsorted(cum_evr, 0.95) + 1)
    pca_info = pd.DataFrame({
        "component": np.arange(1, len(evr)+1),
        "explained_variance_ratio": evr,
        "cumulative_evr": cum_evr
    })
    pca_info.to_csv(PROCESSED / "pca_explained_variance.csv", index=False)
    print(f"PCA numeric cols: {len(num_cols)} | components to reach 95% var: {n95}")
    print("Saved PCA EVR ->", PROCESSED / "pca_explained_variance.csv")
else:
    print("Not enough numeric columns for PCA (skipped).")


PCA numeric cols: 7 | components to reach 95% var: 6
Saved PCA EVR -> ../../data/processed/pca_explained_variance.csv


## Final feature selection (rule-based consolidation)

In [9]:

# Consolidation rule:
# 1) Start with intersection (robust across methods).
# 2) If intersection < 12, add best from union until target size.
TARGET_K = 18

final_feats = inter_pruned.copy()
for c in union_pruned:
    if len(final_feats) >= TARGET_K:
        break
    if c not in final_feats:
        final_feats.append(c)

print("Final feature count:", len(final_feats))
print("Final features:", final_feats)
with open(PROCESSED / "final_features.json", "w") as f:
    json.dump(final_feats, f, indent=2)
print("Saved ->", PROCESSED / "final_features.json")


Final feature count: 6
Final features: ['day_of_week', 'month', 'tmin', 'day_name', 'season', 'prcp']
Saved -> ../../data/processed/final_features.json


## Save reduced dataset

In [11]:

reduced = X[final_feats].copy()
reduced['outage_occurred'] = y.values

# Keep date and fips_code if available for later evaluation/analysis
for extra in ['date','fips_code']:
    if extra in df.columns and extra not in reduced.columns:
        reduced[extra] = df[extra]

reduced_path = PROCESSED / "reduced_dataset.csv"
reduced.to_csv(reduced_path, index=False)
print("Saved reduced dataset ->", reduced_path, reduced.shape)


Saved reduced dataset -> ../../data/processed/reduced_dataset.csv (3973578, 9)


## Notes / Decisions


- **Leakage removed:** `run_start_time`, `customers_out`, `outage_hour`, target-derived risk scores; IDs and labels removed.
- **Univariate filters:** ANOVA F-test (linear) and Mutual Information (nonlinear). We kept `K=30` from each, then consolidated.
- **Correlation pruning:** removed one of any feature pair with |corr| > 0.90.
- **PCA (optional):** variance report saved; components not enforced in final set to preserve interpretability.
- **Final feature count:** `TARGET_K = 18` by default; adjust if needed.
- **Artifacts saved:**  
  - `../../data/processed/feature_scores_univariate.csv`  
  - `../../data/processed/pca_explained_variance.csv` (if applicable)  
  - `../../data/processed/final_features.json`  
  - `../../data/processed/reduced_dataset.parquet`
