# ML Trading System - Interaktiv Debug Notebook

Detta notebook ger dig kraftfulla verktyg för att debugga och analysera ML-tradingsystemet med visualiseringar istället för konsolutskrifter.

## 1. Setup och Konfiguration

In [None]:
# Import libraries and make sure a FRED API key is available
import getpass
import os

# SECURITY: API keys must not be committed with the notebook. The key is taken
# from the FRED_API_KEY environment variable; if it is missing, ask for it
# interactively (the input is not echoed and is not saved in the notebook).
# NOTE(review): a key used to be hard-coded here - treat it as leaked and rotate it.
if not os.environ.get('FRED_API_KEY'):
    os.environ['FRED_API_KEY'] = getpass.getpass('FRED API key: ')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Configure plotting defaults
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
# Silence library warnings so they do not clutter the notebook output
warnings.filterwarnings('ignore')

print("✅ Setup klart!")

## 2. Ladda och Analysera Rådata

In [None]:
# Load the raw data, then derive features and labels from it
from src.fetch_data import fetch_sp500_from_fred
from src.features import build_feature_set
from src.labels import make_std_labels
from src.config import LABEL_HORIZON

print("🔄 Laddar S&P 500 data...")
df_raw = fetch_sp500_from_fred(start="2000-01-01")

print("🔄 Bygger features...")
df_features = build_feature_set(df_raw)

print("🔄 Skapar labels...")
df_labeled = make_std_labels(df_features, horizon=LABEL_HORIZON)

# Every column that is not listed here is treated as a model feature
exclude = {"Date", "Close", "label", "fwd_return", "vol_h"}
feature_cols = [name for name in df_labeled.columns if name not in exclude]

# Summary of what was loaded
print("✅ Data laddad:")
print("  - Rådata shape:", df_raw.shape)
print("  - Efter features:", df_features.shape)
print("  - Efter labels:", df_labeled.shape)
print("  - Antal features:", len(feature_cols))
print("  - Datum range:", df_raw['Date'].min(), "till", df_raw['Date'].max())

## 3. Data Overview Visualisering

In [None]:
# Build the 2x2 overview figure
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. S&P 500 price over time
axes[0,0].plot(pd.to_datetime(df_raw['Date']), df_raw['Close'], linewidth=1)
axes[0,0].set_title('S&P 500 Pris Över Tid')
axes[0,0].set_ylabel('Pris')
axes[0,0].tick_params(axis='x', rotation=45)
axes[0,0].grid(True, alpha=0.3)

# 2. Label distribution
# value_counts() orders by frequency, so sort by label value; otherwise the
# colours and the count annotations depend on which class is more common.
label_counts = df_labeled['label'].value_counts().sort_index()
bar_colors = ['red' if lbl == 0 else 'green' for lbl in label_counts.index]
axes[0,1].bar(label_counts.index, label_counts.values, color=bar_colors)
axes[0,1].set_title('Label Distribution')
axes[0,1].set_xlabel('Label (0=Ned, 1=Upp)')
axes[0,1].set_ylabel('Antal')
for lbl, cnt in label_counts.items():
    # Bars are drawn at x == label value, so annotate there rather than at
    # the enumeration position.
    axes[0,1].text(lbl, cnt + 50, str(cnt), ha='center')

# 3. NaN pattern (100 evenly spaced rows)
nan_data = df_labeled[feature_cols + ['label']].isnull()
sample_indices = np.linspace(0, len(nan_data)-1, 100, dtype=int)
sns.heatmap(nan_data.iloc[sample_indices].T, ax=axes[1,0], cbar=True, 
           yticklabels=True, xticklabels=False, cmap='Reds')
axes[1,0].set_title('NaN Mönster (100 samplade rader)')

# 4. Feature correlations (first ten features at most; slicing already
#    handles lists shorter than ten)
sample_features = feature_cols[:10]
corr_sample = df_labeled[sample_features].dropna().corr()
sns.heatmap(corr_sample, ax=axes[1,1], cmap='coolwarm', center=0,
           square=True, cbar=True, annot=True, fmt='.2f')
axes[1,1].set_title(f'Feature Korrelationer (Top {len(sample_features)})')

plt.tight_layout()
plt.show()

print(f"📊 Data kvalitet:")
print(f"  - Label balans: {dict(label_counts)}")
print(f"  - NaN per kolumn: {df_labeled[feature_cols].isnull().sum().sum()} totalt")

## 4. Detaljerad Feature Analys

In [None]:
# Per-feature summary statistics
feature_data = df_labeled[feature_cols]

# Boolean mask of missing values, reused for both NaN columns below
missing = feature_data.isna()

# One row per feature, one column per statistic
stats = pd.DataFrame({
    'count': feature_data.count(),
    'mean': feature_data.mean(),
    'std': feature_data.std(),
    'min': feature_data.min(),
    'max': feature_data.max(),
    'nan_count': missing.sum(),
    'nan_pct': (missing.sum() / len(feature_data) * 100).round(2),
})

print("📈 FEATURE STATISTIK:")
display(stats.round(4))

In [None]:
# Plot one histogram per feature on a grid with four columns
n_features = len(feature_cols)
n_cols = 4
# Keep at least one row: plt.subplots() rejects a grid with zero rows,
# which would otherwise happen when there are no features.
n_rows = max(1, (n_features + n_cols - 1) // n_cols)

# squeeze=False always yields a 2-D array of axes, so a single flatten()
# works for every grid shape.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, feature_cols):
    data = feature_data[col].dropna()
    ax.hist(data, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    ax.set_title(f'{col}\n(μ={data.mean():.3f}, σ={data.std():.3f})')
    ax.grid(True, alpha=0.3)
    ax.axvline(data.mean(), color='red', linestyle='--', alpha=0.8, label='Mean')

# Hide the grid cells that did not receive a feature
for ax in axes[n_features:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

## 5. ML Model Debug - Single Training

In [None]:
# Train a single model on a small sample to sanity-check the ML pipeline
from src.model import make_mlp_bagging, fit_predict

print("🤖 SINGLE MODEL TRAINING DEBUG")
print("=" * 40)

# Keep only rows with complete features and label, then take the first rows
df_clean = df_labeled.dropna(subset=feature_cols+["label"])
sample_size = min(2000, len(df_clean))
df_sample = df_clean.iloc[:sample_size]

X = df_sample[feature_cols].values
y = df_sample["label"].values.astype(int)

# Count how often each label value occurs in the sample
label_values, label_freqs = np.unique(y, return_counts=True)

print("📊 Sample info:")
print(f"  - Sample size: {X.shape}")
print(f"  - Features: {X.shape[1]}")
print(f"  - Label distribution: {dict(zip(label_values, label_freqs))}")
print(f"  - Feature matrix stats: min={X.min():.3f}, max={X.max():.3f}, mean={X.mean():.3f}")

# Ordered 70/30 split: the first 70% of rows train, the remainder tests
split_idx = int(0.7 * len(X))
X_train = X[:split_idx]
X_test = X[split_idx:]
y_train = y[:split_idx]
y_test = y[split_idx:]

print("\n🔄 Training model...")
clf = make_mlp_bagging()
y_pred, y_proba, trained_clf = fit_predict(clf, X_train, y_train, X_test)

# Fraction of test rows where the prediction equals the label
accuracy = (y_pred == y_test).mean()
print(f"✅ Model tränad! Accuracy: {accuracy:.3f}")

In [None]:
# Visualise the results of the single-model run
from sklearn.metrics import confusion_matrix, classification_report

# Both classes are pinned explicitly so the plots and the report keep a fixed
# 2-class layout even when the model (or the test slice) only contains one
# of them; without this the tick labels / target_names no longer match.
class_ids = [0, 1]

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. Prediction distribution (minlength guarantees one bar per class)
pred_counts = np.bincount(y_pred.astype(int), minlength=len(class_ids))
axes[0,0].bar(range(len(pred_counts)), pred_counts, color=['red', 'green'])
axes[0,0].set_title(f'Predictions (Accuracy: {accuracy:.3f})')
axes[0,0].set_xlabel('Predicted Class')
axes[0,0].set_ylabel('Count')
for i, v in enumerate(pred_counts):
    axes[0,0].text(i, v + 5, str(v), ha='center')

# 2. Probability histogram
axes[0,1].hist(y_proba, bins=30, alpha=0.7, color='purple', edgecolor='black')
axes[0,1].set_title('Prediction Probabilities')
axes[0,1].set_xlabel('P(Class=1)')
axes[0,1].set_ylabel('Frequency')
axes[0,1].axvline(0.5, color='red', linestyle='--', label='Decision Threshold')
axes[0,1].legend()

# 3. Confusion matrix (always 2x2 thanks to the explicit labels)
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
sns.heatmap(cm, annot=True, fmt='d', ax=axes[1,0], cmap='Blues',
           xticklabels=['Pred 0', 'Pred 1'], yticklabels=['True 0', 'True 1'])
axes[1,0].set_title('Confusion Matrix')

# 4. Predicted probability split by the actual class
for actual_class in class_ids:
    mask = (y_test == actual_class)
    class_proba = y_proba[mask]
    axes[1,1].hist(class_proba, alpha=0.6, bins=20,
                  label=f'Actual {actual_class}', 
                  color='red' if actual_class == 0 else 'green')
axes[1,1].legend()
axes[1,1].set_title('Probability Distribution by Actual Class')
axes[1,1].set_xlabel('P(Class=1)')
axes[1,1].axvline(0.5, color='black', linestyle='--', alpha=0.5)

plt.tight_layout()
plt.show()

# Classification report; labels= keeps target_names aligned with the classes
print("📊 CLASSIFICATION REPORT:")
print(classification_report(y_test, y_pred, labels=class_ids,
                            target_names=['Class 0 (Ned)', 'Class 1 (Upp)']))

## 6. Rolling Predictions Debug

In [None]:
# Run the rolling train/predict pipeline on a small sample and summarise it
from src.train_predict import rolling_train_predict

print("🔄 ROLLING PREDICTIONS DEBUG")
print("=" * 40)

# Limit the number of rows so the debug run stays fast
max_rows = min(1000, len(df_clean))
df_rolling_sample = df_clean.head(max_rows).reset_index(drop=True)

print(f"📊 Rolling sample: {df_rolling_sample.shape}")

try:
    df_pred = rolling_train_predict(df_rolling_sample, feature_cols=feature_cols)

    # Columns present in the output but not in the input
    new_columns = set(df_pred.columns).difference(df_rolling_sample.columns)

    print("✅ Rolling predictions klart!")
    print(f"  - Output shape: {df_pred.shape}")
    print(f"  - Nya kolumner: {new_columns}")

    # Summarise predicted classes, ignoring rows without a prediction
    if 'pred' in df_pred.columns:
        pred_clean = df_pred.dropna(subset=['pred'])
        pred_counts = pred_clean['pred'].value_counts()
        print(f"  - Prediction distribution: {dict(pred_counts)}")
        print(f"  - Giltiga predictions: {len(pred_clean)} av {len(df_pred)}")

    # Summarise predicted probabilities, if any are available
    if 'proba' in df_pred.columns:
        proba_clean = df_pred.dropna(subset=['proba'])
        if not proba_clean.empty:
            proba_values = proba_clean['proba']
            proba_summary = (
                f"min={proba_values.min():.3f}, "
                f"max={proba_values.max():.3f}, mean={proba_values.mean():.3f}"
            )
            print(f"  - Probability stats: {proba_summary}")

except Exception as e:
    # Debug aid: report the failure with a full traceback instead of
    # aborting the notebook run.
    print(f"❌ ERROR i rolling prediction: {e}")
    import traceback
    traceback.print_exc()

## 7. Interaktiv Feature Explorer

Kör cellerna nedan för att interaktivt utforska features:

In [None]:
# Helper for plotting a single feature
def explore_feature(feature_name):
    """Plot and summarise a single feature column.

    Reads the notebook-level ``feature_cols`` and ``df_labeled``. If
    ``feature_name`` is not in ``feature_cols``, a message listing the
    available features is printed and the function returns ``None``.
    Otherwise it draws three panels (the values in row order, a histogram,
    and histograms split by label 0/1) and prints mean, std and count of
    the feature for each label.
    """
    if feature_name not in feature_cols:
        print(f"❌ Feature '{feature_name}' finns inte!")
        print(f"Tillgängliga: {feature_cols}")
        return

    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    series_ax, hist_ax, label_ax = axes

    # Keep only rows where both the feature and the label are present
    rows = df_labeled.dropna(subset=[feature_name, 'label'])
    values = rows[feature_name]

    # Feature values per label, computed once and reused for plot and stats
    by_label = {
        lbl: rows[rows['label'] == lbl][feature_name]
        for lbl in (0, 1)
    }

    # Panel 1: values in row order
    series_ax.plot(range(len(rows)), values, linewidth=1)
    series_ax.set_title(f'{feature_name} Time Series')
    series_ax.set_xlabel('Time Index')
    series_ax.grid(True, alpha=0.3)

    # Panel 2: overall distribution with the mean marked
    hist_ax.hist(values, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
    hist_ax.set_title(f'{feature_name} Distribution')
    hist_ax.set_xlabel('Value')
    hist_ax.grid(True, alpha=0.3)
    mean_val = values.mean()
    hist_ax.axvline(mean_val, color='red', linestyle='--', label=f'Mean: {mean_val:.3f}')
    hist_ax.legend()

    # Panel 3: distribution per label (0 in red, 1 in green)
    for lbl, subset in by_label.items():
        label_ax.hist(subset, alpha=0.6, bins=20,
                      label=f'Label {lbl} (n={len(subset)})',
                      color='red' if lbl == 0 else 'green')
    label_ax.legend()
    label_ax.set_title(f'{feature_name} by Label')
    label_ax.set_xlabel('Value')
    label_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Per-label summary statistics
    print(f"📊 {feature_name} Statistik:")
    for lbl, subset in by_label.items():
        print(f"  Label {lbl}: mean={subset.mean():.4f}, std={subset.std():.4f}, n={len(subset)}")

# Print a numbered list of every feature that can be explored
print("🔍 TILLGÄNGLIGA FEATURES FÖR EXPLORATION:")
for position, name in enumerate(feature_cols, start=1):
    print(f"{position:2d}. {name}")

In [None]:
# Example: explore one feature.
# Set feature_to_explore to the name of the feature you want to inspect;
# by default the first feature is used ("Close" when there are none).
if feature_cols:
    feature_to_explore = feature_cols[0]
else:
    feature_to_explore = "Close"
explore_feature(feature_to_explore)

## 8. Sammanfattning och Nästa Steg

Detta notebook ger dig kraftfulla verktyg för ML-debugging:

### ✅ Vad vi kontrollerat:
- **Datakvalitet**: shapes, NaN-mönster, distributions
- **Featureanalys**: statistik, korrelationer, outliers
- **ML training**: model performance, predictions, probabilities
- **Rolling predictions**: pipeline funktionalitet

### 🔧 Debug-verktyg du nu har:
1. **Visualiseringar** istället för konsol-utskrifter
2. **Interaktiv feature exploration** 
3. **Model performance analysis**
4. **Data quality checks**

### 📈 Nästa steg:
1. Kör `explore_feature()` för olika features
2. Experimentera med andra model parameters
3. Analysera feature importance
4. Testa olika time windows för rolling predictions