# 05 - Feature Engineering

**Purpose**: Create derived features for modeling and deeper analysis.

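**Features to Create**:
1. **Matchup features**: Trophy diff, elixir diff, card level diff, spell count diff
2. **Deck complexity**: Weighted score based on elixir, spell count, legendary count
3. **Archetype indicators**: Beatdown, cycle, spell-heavy flags
4. **Card synergy scores**: Based on historical win rates of card pairs
5. **Trophy brackets**: Categorical variables for skill levels
6. **Tower damage features**: Crown diff, close-game flag, three-crown-win flag

Note: items 2 and 4 have no dedicated section in this notebook; sections 2-5 below cover items 1, 3, 5 and 6.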

**Output**: Clean feature matrix saved as Parquet (`artifacts/model_features.parquet`) for modeling.

In [None]:
import sys, os, duckdb, pandas as pd, numpy as np

# The project root is taken to be the parent of the current working
# directory; its src/ folder is placed first on sys.path so that the
# project-local modules can be imported.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, os.path.join(PROJECT_ROOT, 'src'))

# Prefer the Parquet copy of the battles data (faster to read); use the CSV
# copy when no Parquet file is present.
_parquet_path = os.path.join(PROJECT_ROOT, 'battles.parquet')
DATA_PATH = (
    _parquet_path
    if os.path.exists(_parquet_path)
    else os.path.join(PROJECT_ROOT, 'battles.csv')
)

from duckdb_utils import get_connection, create_battles_view, query_to_df, save_to_parquet, create_sample
from feature_engineering import (
    create_card_level_features,
    create_deck_archetype_features,
    create_trophy_bracket_features,
    create_matchup_features,
    create_tower_damage_features
)

# Obtain a connection object from the project's duckdb_utils helper; it is
# reused by later cells (e.g. when creating the sample).
con = get_connection()
# Make the battles file at DATA_PATH available on that connection
# (presumably as a queryable view, judging by the helper's name -- confirm
# in duckdb_utils).
create_battles_view(con, DATA_PATH)

## 1. Load Base Data

Work with a 10% sample of the battles data while developing the feature engineering steps.

In [None]:
# Reuse the cached 10% sample when it already exists on disk; otherwise build
# it through the project helper.
sample_path = 'artifacts/sample_battles_10pct.parquet'
sample_file = os.path.join(PROJECT_ROOT, sample_path)

if os.path.exists(sample_file):
    print("Loading existing sample...")
    sample = pd.read_parquet(sample_file)
    print(f"✓ Sample loaded from {sample_path}")
else:
    print("Creating 10% sample...")
    # NOTE(review): the helper receives the *relative* path while the
    # existence check above uses the PROJECT_ROOT-joined one -- this assumes
    # create_sample resolves output_path against the project root; confirm.
    sample = create_sample(con, sample_pct=10, output_path=sample_path)
    print(f"✓ Sample created and saved to {sample_path}")

# Basic size report for the loaded sample.
n_battles = len(sample)
print(f"Sample size: {n_battles:,} battles")
n_columns = len(sample.columns)
print(f"Columns: {n_columns}")
memory_mb = sample.memory_usage(deep=True).sum() / 1024**2
print(f"Memory usage: {memory_mb:.1f} MB")

## 2. Create Matchup Features

In [None]:
# Derive winner-vs-loser comparison columns from the sampled battles.
print("Creating matchup features...")
sample_features = create_matchup_features(sample)

# Announce the columns this step is expected to add.
for line in (
    "✓ Matchup features created:",
    "  - trophy_diff: Winner trophy advantage",
    "  - elixir_diff: Winner elixir cost advantage",
    "  - card_level_diff: Winner card level advantage",
    "  - spell_diff: Winner spell count advantage",
):
    print(line)

# Quick sanity check: mean differences across the sample.
print("\nSample stats:")
mean_trophy_diff = sample_features['trophy_diff'].mean()
print(f"  Average trophy diff: {mean_trophy_diff:.1f}")
mean_elixir_diff = sample_features['elixir_diff'].mean()
print(f"  Average elixir diff: {mean_elixir_diff:.2f}")

## 3. Create Deck Archetype Features

In [None]:
# Add archetype indicators for both sides of every battle (winner first,
# then loser, each call building on the previous result).
print("Creating archetype features...")
for side in ('winner', 'loser'):
    sample_features = create_deck_archetype_features(sample_features, player=side)

print("✓ Archetype features created for both players:")

# Collect archetype-related columns by case-insensitive name fragment.
archetype_markers = ('archetype', 'beatdown', 'cycle', 'heavy', 'siege')
archetype_cols = [
    col for col in sample_features.columns
    if any(marker in col.lower() for marker in archetype_markers)
]
for col in sorted(archetype_cols):
    print(f"  - {col}")

# Show archetype distribution for winners.
# The name-based filter above can also pick up non-flag columns (for example
# a string label column whose name contains "archetype"); summing such a
# column and dividing by the row count would raise or print nonsense, so the
# percentage summary is restricted to numeric/boolean columns.
winner_archetypes = [col for col in archetype_cols if 'winner' in col]
winner_flag_cols = [
    col for col in winner_archetypes
    if pd.api.types.is_bool_dtype(sample_features[col])
    or pd.api.types.is_numeric_dtype(sample_features[col])
]
if winner_flag_cols:
    print("\nWinner archetype distribution:")
    n_rows = len(sample_features)
    # Only the first five flags are shown to keep the output short.
    for col in winner_flag_cols[:5]:
        pct = sample_features[col].sum() / n_rows * 100
        print(f"  {col}: {pct:.1f}% of decks")

## 4. Create Trophy Bracket Features

In [None]:
# Bucket every battle into a trophy bracket.
print("Creating trophy bracket features...")
sample_features = create_trophy_bracket_features(sample_features)

print("✓ Trophy bracket feature created")

# Report how many battles fall into each bracket, in bracket order. The
# report is skipped when the helper did not produce a 'trophy_bracket' column.
if 'trophy_bracket' in sample_features.columns:
    print("\nTrophy bracket distribution:")
    bracket_counts = sample_features['trophy_bracket'].value_counts().sort_index()
    for bracket, n_in_bracket in bracket_counts.items():
        share = n_in_bracket / len(sample_features) * 100
        print(f"  {bracket}: {n_in_bracket:,} battles ({share:.1f}%)")

## 5. Create Tower Damage Features

In [None]:
# Derive crown-based outcome features.
print("Creating tower damage features...")
sample_features = create_tower_damage_features(sample_features)

# Announce the columns this step is expected to add.
for line in (
    "✓ Tower damage features created:",
    "  - crown_diff: Crown advantage",
    "  - close_game: Boolean for 1-crown wins",
    "  - three_crown_win: Boolean for 3-crown wins",
):
    print(line)

# Report the share of battles carrying each flag; a flag that the helper did
# not produce is silently skipped.
available_columns = sample_features.columns

if 'close_game' in available_columns:
    close_pct = sample_features['close_game'].sum() / len(sample_features) * 100
    print(f"\nClose games (1-crown): {close_pct:.1f}%")

if 'three_crown_win' in available_columns:
    three_crown_pct = sample_features['three_crown_win'].sum() / len(sample_features) * 100
    print(f"3-crown wins: {three_crown_pct:.1f}%")

## 6. Save Feature Matrix

In [None]:
# Persist the engineered feature matrix for the modeling notebook.
# The relative path is defined once: the helper receives it as-is, while the
# PROJECT_ROOT-joined form is used for the messages and the size lookup.
# NOTE(review): this assumes save_to_parquet resolves relative paths against
# the project root -- confirm in duckdb_utils.
relative_output = 'artifacts/model_features.parquet'
output_path = os.path.join(PROJECT_ROOT, relative_output)
print(f"Saving feature matrix to {output_path}...")

save_to_parquet(sample_features, relative_output)

print("\n✓ Feature matrix saved!")
print(f"  Shape: {sample_features.shape}")
size_mb = os.path.getsize(output_path) / 1024**2
print(f"  File size: {size_mb:.1f} MB")
print(f"  Total features: {len(sample_features.columns)}")

## 7. Feature Summary

In [None]:
# List all engineered features, identified by name fragment.
# ('crown_diff' needs no entry of its own: it is already matched by '_diff'.)
feature_markers = (
    '_diff', '_heavy', '_beatdown', '_cycle',
    'bracket', 'close_game', 'archetype', 'siege',
    'three_crown',
)
engineered_cols = [
    col for col in sample_features.columns
    if any(marker in col for marker in feature_markers)
]

print(f"✓ Engineered features ({len(engineered_cols)}):")
# An empty feature frame has no first row to show; print a placeholder
# instead of letting .iloc[0] raise IndexError.
has_rows = len(sample_features) > 0
for col in sorted(engineered_cols):
    # Get data type and sample value
    column = sample_features[col]
    dtype = column.dtype
    sample_val = column.iloc[0] if has_rows else 'n/a'
    print(f"  - {col:<30} ({dtype}, e.g., {sample_val})")

print("\n✓ Feature engineering complete!")
print("  Ready for modeling in Notebook 06")