In [None]:
# 1. Imports and Configuration

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    roc_auc_score,
    average_precision_score,
    make_scorer
)
from sklearn.preprocessing import StandardScaler

# Plot styling: seaborn "whitegrid" theme with notebook-sized fonts
sns.set(style="whitegrid", context="notebook")

# Configuration
DATE_COL = "date"            # name of the timestamp column in the CSV
SPLIT_DATE = "2022-01-01"    # rows before this date train the model; the rest are held out
RANDOM_STATE = 42            # seed handed to the decision-tree estimators

# Load dataset
df = pd.read_csv("master_dataset_ml_ready_labelled.csv")
df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors='coerce')

# errors='coerce' silently converts unparseable dates to NaT. Such rows cannot be
# placed in time: sort_values() would park them at the end of the frame, where
# they would feed the lag / rolling / next-day-target computations before the
# later dropna() removes them. Drop them up front and say so.
n_bad_dates = int(df[DATE_COL].isna().sum())
if n_bad_dates:
    print(f"Warning: dropping {n_bad_dates} rows with missing/unparseable '{DATE_COL}' values.")
    df = df[df[DATE_COL].notna()]

# Chronological order is required by every shift/diff/rolling feature downstream
df = df.sort_values(by=DATE_COL).reset_index(drop=True)

# Dataset description
description = """
### Enhanced Decision Tree Notebook
Changes from original:
- Binary target: PriceUp (1 if next day's price increases)
- Engineered features: curve_spread, storage_change, momentum indicators
- Chronological train/test split at 2022-01-01
- TimeSeriesSplit cross-validation
- Multiple metrics: Accuracy, F1, ROC-AUC, PR-AUC
"""
print(description)
df.head()

In [None]:
# 2. Feature Engineering & Dataset Description
#
# Adds the binary target and the engineered features to `df`, then removes every
# row that has no valid label or has a NaN in any column.

# --- Feature Engineering ---
# 1. Target: Binary Next-Day Direction (PriceUp)
df['return'] = df['spot_price'].pct_change()

# Tomorrow's return aligned to today's row. It is NaN wherever tomorrow is
# unknown (always the final row). `NaN > 0` evaluates to False, so without the
# explicit mask below that row would be silently labelled "Down (0)" and would
# survive dropna().
next_day_return = df['return'].shift(-1)
df['PriceUp'] = (next_day_return > 0).astype(int)

# 2. Synthetic Features
# A. Curve Spread (Futures curve shape): second contract minus first contract
df['curve_spread'] = df['contract_2_price'] - df['contract_1_price']

# B. Storage Change: difference versus the value 7 rows earlier
#    (equals a 7-day delta only if rows are consecutive days - TODO confirm)
df['storage_bcf_change_7d'] = df['storage_bcf'].diff(7)

# C. Aggregated Weather
# Exclude the aggregates themselves: on a re-run of this cell 'HDD_total' and
# 'CDD_total' already exist and would otherwise be averaged into themselves.
weather_aggregates = {'HDD_total', 'CDD_total'}
hdd_cols = [c for c in df.columns if c.startswith('HDD_') and c not in weather_aggregates]
cdd_cols = [c for c in df.columns if c.startswith('CDD_') and c not in weather_aggregates]

# NOTE: despite the "_total" suffix these are row-wise means across the regional columns
df['HDD_total'] = df[hdd_cols].mean(axis=1)
df['CDD_total'] = df[cdd_cols].mean(axis=1)
df['net_weather'] = df['HDD_total'] - df['CDD_total']

# D. Momentum / Returns Features
df['ret_1'] = df['return'].shift(1)       # Yesterday's return
df['ret_3'] = df['return'].rolling(3).mean()    # rolling means include the current row's return
df['ret_5'] = df['return'].rolling(5).mean()
df['ret_10'] = df['return'].rolling(10).mean()

# 3. Cleanup - Drop rows without a known next-day return (no valid label) and
#    rows with NaNs created by lags/rolling windows
initial_shape = df.shape
df = df[next_day_return.notna()].dropna()
print(f"Dropped {initial_shape[0] - df.shape[0]} rows due to lag/rolling NaN creation.")

print("\nShape (rows, columns):", df.shape)
print("\nNew columns added:", ['curve_spread', 'storage_bcf_change_7d', 'HDD_total', 
                               'CDD_total', 'net_weather', 'ret_1', 'ret_3', 'ret_5', 'ret_10'])

print("\nBinary Target Distribution (PriceUp):")
print(df["PriceUp"].value_counts())
print(df["PriceUp"].value_counts(normalize=True))

In [None]:
# 3.1 Verify no missing values remain
#
# Sanity check after the cleanup step: list any column that still has NaNs.

print("Missing values per column after feature engineering cleanup:")
missing = df.isna().sum()
columns_with_gaps = missing[missing > 0]

if columns_with_gaps.empty:
    print("No missing values!")
else:
    print(columns_with_gaps)

print("\nFinal shape:", df.shape)

In [None]:
# 3.2 Define features and target (Binary: PriceUp)
#
# Builds the feature matrix X (numeric columns only) and the label vector y.

# Columns kept out of X (leakage and non-predictive)
cols_to_drop = [
    DATE_COL,
    'PriceUp',                 # the label itself
    'spot_price',              # Drop to avoid leakage (target derived from this)
    'price_movement_scaled',   # presumably labels shipped with the CSV - verify
    'price_movement_raw',      # presumably labels shipped with the CSV - verify
    'return',                  # This is basically the target, just unshifted
]

y = df['PriceUp']

# errors='ignore' tolerates listed columns that are absent from the CSV;
# select_dtypes then guarantees X is strictly numeric.
X = (
    df.drop(columns=cols_to_drop, errors='ignore')
      .select_dtypes(include=[np.number])
)

feature_names = X.columns.tolist()
print(f"Feature columns ({len(feature_names)} features):")
print(feature_names)
print("\nX shape:", X.shape)
print("y shape:", y.shape)

In [None]:
# 3.3 Chronological Train-Test Split (proper for time-series)

# Chronological split at SPLIT_DATE: strictly earlier rows train the model,
# rows on or after that date are held out for testing.
mask_train = df[DATE_COL] < SPLIT_DATE
mask_test = df[DATE_COL] >= SPLIT_DATE

X_train, y_train = X[mask_train], y[mask_train]
X_test, y_test = X[mask_test], y[mask_test]

print(f"Train Dates: < {SPLIT_DATE} | Shape: {X_train.shape}")
print(f"Test Dates: >= {SPLIT_DATE} | Shape: {X_test.shape}")

# Report the label balance of each partition
for split_name, split_labels in (("training", y_train), ("test", y_test)):
    print(f"\nClass distribution in {split_name} set:")
    print(split_labels.value_counts(normalize=True))

# Note: Decision Trees don't require scaling, but we keep it available.
# The scaler is fitted on the training rows only and then applied to both sets.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# 4. Baseline Decision Tree (with multi-metric evaluation)
#
# Reference model: a tree with default hyperparameters apart from the seed and
# class_weight="balanced" (handles class imbalance).

dt_baseline = DecisionTreeClassifier(class_weight="balanced", random_state=RANDOM_STATE)
dt_baseline.fit(X_train, y_train)

# Hard labels feed accuracy / F1; the second predict_proba column feeds the
# threshold-free metrics (ROC-AUC, PR-AUC).
y_probs_baseline = dt_baseline.predict_proba(X_test)[:, 1]
y_pred_baseline = dt_baseline.predict(X_test)

# Multi-metric evaluation
baseline_acc = accuracy_score(y_test, y_pred_baseline)
baseline_f1 = f1_score(y_test, y_pred_baseline, pos_label=1)
baseline_roc = roc_auc_score(y_test, y_probs_baseline)
baseline_pr = average_precision_score(y_test, y_probs_baseline)

print("=== Baseline Decision Tree Results ===")
print(f"Accuracy:          {baseline_acc:.4f}")
print(f"F1 Score (Up):     {baseline_f1:.4f}")
print(f"ROC-AUC Score:     {baseline_roc:.4f}")
print(f"PR-AUC Score:      {baseline_pr:.4f}")

baseline_report = classification_report(
    y_test,
    y_pred_baseline,
    target_names=['Down (0)', 'Up (1)'],
)
print("\nClassification report (baseline):")
print(baseline_report)

In [None]:
# Baseline Confusion Matrix (Binary)
# labels=[0, 1] pins the row/column order so the tick labels below always match.
cm_baseline = confusion_matrix(y_test, y_pred_baseline, labels=[0, 1])
class_labels = ['Down (0)', 'Up (1)']

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm_baseline,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("Baseline Decision Tree - Confusion Matrix")
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.tight_layout()
plt.show()

In [None]:
# 5.1 GridSearchCV with TimeSeriesSplit (optimizing F1 score)
#
# Exhaustive hyperparameter search over the decision tree, scored by F1 on the
# positive class and validated with forward-chaining time-series folds.

# Use TimeSeriesSplit for proper time-series cross-validation:
# each fold validates on rows that come after the rows it trained on.
tscv = TimeSeriesSplit(n_splits=3)  # 3 folds for efficiency

param_grid_dt = {
    "max_depth": [3, 5, 7, 10, None],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 8],
    "class_weight": [None, "balanced", {0: 1.0, 1: 2.0}, {0: 1.0, 1: 2.5}]
}

# Optimization metric: F1 of the positive class (PriceUp = 1)
f1_scorer = make_scorer(f1_score, pos_label=1)

dt = DecisionTreeClassifier(random_state=RANDOM_STATE)

print("--- Tuning Decision Tree via GridSearchCV (TimeSeriesSplit) ---")
# Derive the grid size from the grid itself so the printed counts cannot drift
# out of sync when param_grid_dt is edited.
n_combos = int(np.prod([len(options) for options in param_grid_dt.values()]))
print(f"Parameter combinations: {n_combos}")
print(f"With {tscv.n_splits} CV folds = {n_combos * tscv.n_splits} model fits\n")

grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid_dt,
    scoring=f1_scorer,  # Optimize for F1 of the positive class
    cv=tscv,
    n_jobs=-1,          # use all available cores
    verbose=1,
    refit=True          # refit the best configuration on the full training set
)

grid_dt.fit(X_train, y_train)

print(f"\nBest parameters: {grid_dt.best_params_}")
print(f"Best CV F1 Score: {grid_dt.best_score_:.4f}")

In [None]:
# 5.2 Evaluate tuned tree on test set (multi-metric)
#
# Scores the refitted best estimator from the grid search on the held-out rows.

dt_best = grid_dt.best_estimator_

# Second predict_proba column drives ROC-AUC / PR-AUC; hard labels drive accuracy / F1
y_probs_tuned = dt_best.predict_proba(X_test)[:, 1]
y_pred_tuned = dt_best.predict(X_test)

# Multi-metric evaluation
tuned_acc = accuracy_score(y_test, y_pred_tuned)
tuned_f1 = f1_score(y_test, y_pred_tuned, pos_label=1)
tuned_roc = roc_auc_score(y_test, y_probs_tuned)
tuned_pr = average_precision_score(y_test, y_probs_tuned)

print("=== Tuned Decision Tree Results (Test Set) ===")
print(f"Accuracy:          {tuned_acc:.4f}")
print(f"F1 Score (Up):     {tuned_f1:.4f}")
print(f"ROC-AUC Score:     {tuned_roc:.4f}")
print(f"PR-AUC Score:      {tuned_pr:.4f}")

tuned_report = classification_report(
    y_test,
    y_pred_tuned,
    target_names=['Down (0)', 'Up (1)'],
)
print("\nClassification report (tuned):")
print(tuned_report)

In [None]:
# Tuned Confusion Matrix (Binary)
# labels=[0, 1] pins the row/column order so the tick labels below always match.
cm_tuned = confusion_matrix(y_test, y_pred_tuned, labels=[0, 1])
class_labels = ['Down (0)', 'Up (1)']

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm_tuned,
    annot=True,
    fmt="d",
    cmap="Greens",
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title("Tuned Decision Tree - Confusion Matrix")
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.tight_layout()
plt.show()

In [None]:
# Multi-metric comparison: Baseline vs Tuned
# One row per model, one column per test-set metric.
comparison_df = pd.DataFrame({
    'Model': ['DT Baseline', 'DT Tuned'],
    'Accuracy': [baseline_acc, tuned_acc],
    'F1 (Up)': [baseline_f1, tuned_f1],
    'ROC-AUC': [baseline_roc, tuned_roc],
    'PR-AUC': [baseline_pr, tuned_pr]
})

print("="*70)
print("MODEL COMPARISON: Baseline vs Tuned")
print("="*70)
display(comparison_df)

# Visualize comparison - all 4 metrics, one panel each
fig, axes = plt.subplots(1, 4, figsize=(16, 4))
metrics = ['Accuracy', 'F1 (Up)', 'ROC-AUC', 'PR-AUC']
colors = ['#3498db', '#2ecc71']

for ax, metric in zip(axes, metrics):
    values = comparison_df[metric].values
    bars = ax.bar(['Baseline', 'Tuned'], values, color=colors)
    ax.set_title(metric, fontsize=12, fontweight='bold')

    # 20% headroom for the value labels. If every bar is 0 (e.g. F1 when no
    # "Up" is ever predicted) the limits would collapse to (0, 0), so fall
    # back to a unit-height axis.
    y_top = max(values) * 1.2
    ax.set_ylim(0, y_top if y_top > 0 else 1.0)

    # Annotate each bar with its value, just above the bar top
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
               f'{val:.3f}', ha='center', va='bottom', fontsize=10)

plt.suptitle('Decision Tree: Baseline vs Tuned Performance', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# 6.1 Feature importances (including engineered features)
#
# Ranks the tuned tree's importances and flags which of the top features
# were created in the feature-engineering cell.

importances = dt_best.feature_importances_
feat_imp = pd.Series(importances, index=X.columns).sort_values(ascending=False)
top_features = feat_imp.head(15)

print("Top 15 features by importance:")
display(top_features)

# Highlight engineered features: red bars for engineered, blue for the rest
engineered = ['curve_spread', 'storage_bcf_change_7d', 'HDD_total', 'CDD_total', 
              'net_weather', 'ret_1', 'ret_3', 'ret_5', 'ret_10']
colors = np.where(top_features.index.isin(engineered), '#e74c3c', '#3498db').tolist()

fig, ax = plt.subplots(figsize=(10, 6))
top_features.plot(kind="bar", color=colors, ax=ax)
ax.set_title("Top 15 Feature Importances (Tuned Decision Tree)\nRed = Engineered Features", fontsize=12)
ax.set_ylabel("Importance")
fig.tight_layout()
plt.show()

# Show engineered features importance (skipping any that never made it into X)
print("\nEngineered Features Importance:")
present_engineered = [name for name in engineered if name in feat_imp.index]
for name in present_engineered:
    print(f"  {name}: {feat_imp[name]:.4f}")

In [None]:
# 6.2 Ablation Study: Remove redundant features and re-evaluate
#
# Refits a tree with the tuned hyperparameters on a narrower feature set and
# scores it on the same test rows.

# Features to remove based on redundancy hypotheses
cols_to_remove = [
    'CDD_PA', 'CDD_IL', 'HDD_PA', 'HDD_IL',  # Regional weather redundancy
    'contract_1_price', 'contract_2_price'    # Price levels (keep curve_spread)
]

print(f"Removing features: {cols_to_remove}")

# errors='ignore' skips any listed column that is not present in X
X_reduced = X.drop(columns=cols_to_remove, errors='ignore')

print(f"Original features: {X.shape[1]}")
print(f"Reduced features: {X_reduced.shape[1]}")
print(f"\nRemaining features: {list(X_reduced.columns)}")

# Apply same chronological split (reuses the boolean masks from the split cell)
X_train_r = X_reduced[mask_train]
X_test_r = X_reduced[mask_test]

# Use best params from tuned model
best_params = grid_dt.best_params_
dt_reduced = DecisionTreeClassifier(random_state=RANDOM_STATE, **best_params)
dt_reduced.fit(X_train_r, y_train)

y_probs_reduced = dt_reduced.predict_proba(X_test_r)[:, 1]
y_pred_reduced = dt_reduced.predict(X_test_r)

# Multi-metric evaluation
reduced_acc = accuracy_score(y_test, y_pred_reduced)
reduced_f1 = f1_score(y_test, y_pred_reduced, pos_label=1)
reduced_roc = roc_auc_score(y_test, y_probs_reduced)
reduced_pr = average_precision_score(y_test, y_probs_reduced)

print("\n=== Decision Tree (Reduced Features) Results ===")
print(f"Accuracy:          {reduced_acc:.4f}")
print(f"F1 Score (Up):     {reduced_f1:.4f}")
print(f"ROC-AUC Score:     {reduced_roc:.4f}")
print(f"PR-AUC Score:      {reduced_pr:.4f}")

reduced_report = classification_report(
    y_test,
    y_pred_reduced,
    target_names=['Down (0)', 'Up (1)'],
)
print("\nClassification report (reduced features):")
print(reduced_report)

In [None]:
# 6.3 Final Model Comparison: All 3 variants
#
# Side-by-side test-set metrics for the baseline, tuned and reduced-feature trees.

final_comparison = pd.DataFrame({
    'Model': ['DT Baseline', 'DT Tuned', 'DT Reduced Features'],
    'Features': [X.shape[1], X.shape[1], X_reduced.shape[1]],
    'Accuracy': [baseline_acc, tuned_acc, reduced_acc],
    'F1 (Up)': [baseline_f1, tuned_f1, reduced_f1],
    'ROC-AUC': [baseline_roc, tuned_roc, reduced_roc],
    'PR-AUC': [baseline_pr, tuned_pr, reduced_pr]
})

print("="*80)
print("FINAL MODEL COMPARISON (ALL VARIANTS)")
print("="*80)
display(final_comparison)

# Visualize - All 3 models, all 4 metrics (one panel per metric)
fig, axes = plt.subplots(1, 4, figsize=(18, 5))
metrics = ['Accuracy', 'F1 (Up)', 'ROC-AUC', 'PR-AUC']
colors = ['#3498db', '#e74c3c', '#2ecc71']

for ax, metric in zip(axes, metrics):
    values = final_comparison[metric].values
    bars = ax.bar(['Baseline', 'Tuned', 'Reduced'], values, color=colors)
    ax.set_title(metric, fontsize=12, fontweight='bold')

    # 20% headroom for the value labels. If every bar is 0 (e.g. F1 when no
    # "Up" is ever predicted) the limits would collapse to (0, 0), so fall
    # back to a unit-height axis.
    y_top = max(values) * 1.2
    ax.set_ylim(0, y_top if y_top > 0 else 1.0)

    # Annotate each bar with its value, just above the bar top
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
               f'{val:.3f}', ha='center', va='bottom', fontsize=9)

plt.suptitle('Decision Tree Model Performance Comparison (Binary Classification)', 
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# Print best parameters
print("\n" + "="*80)
print("BEST GRIDSEARCHCV PARAMETERS")
print("="*80)
for param, value in grid_dt.best_params_.items():
    print(f"  {param}: {value}")

In [None]:
# 7. Visualize a shallow decision tree for presentation
#
# Fits a deliberately small tree (for display only, not for evaluation) and
# prints the closing summary of the notebook.

dt_for_plot = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    class_weight="balanced",
    max_depth=3,          # small depth so it fits on slide
    min_samples_leaf=10
)
dt_for_plot.fit(X_train, y_train)

plt.figure(figsize=(20, 10))
plot_tree(
    dt_for_plot,
    # Pass a plain list: some scikit-learn releases validate this parameter
    # strictly and may reject a pandas Index.
    feature_names=list(X.columns),
    class_names=["Down (0)", "Up (1)"],  # Binary classes
    filled=True,
    rounded=True,
    fontsize=9
)
plt.title("Simplified Decision Tree (max_depth=3) - Binary Classification", fontsize=14)
plt.tight_layout()
plt.show()

# Pick the winner by test-set F1 on the "Up" class. max() returns the first
# maximal key, so the dict order is the tie-break order: Tuned, then Reduced,
# then Baseline.
f1_by_model = {
    'Tuned': tuned_f1,
    'Reduced': reduced_f1,
    'Baseline': baseline_f1,
}
best_model_name = max(f1_by_model, key=f1_by_model.get)
best_model_f1 = f1_by_model[best_model_name]

# Summary
print("\n" + "="*80)
print("SUMMARY")
print("="*80)
print(f"""
This enhanced Decision Tree notebook incorporates:
1. Binary target (PriceUp) for next-day direction prediction
2. Engineered features: curve_spread, storage_change, momentum indicators (ret_1/3/5/10)
3. Chronological train/test split at {SPLIT_DATE}
4. TimeSeriesSplit cross-validation (proper for time-series)
5. GridSearchCV optimizing F1 score (for minority class)
6. Multiple evaluation metrics: Accuracy, F1, ROC-AUC, PR-AUC
7. Ablation study with reduced features

Best performing model: {best_model_name} Decision Tree
Best F1 Score (Up class): {best_model_f1:.4f}
""")