# FPL Feature Engineering & Preprocessing

This notebook demonstrates:
1. Feature engineering for player performance prediction
2. Creating rolling averages and lag features
3. Form metrics and temporal features
4. Data preprocessing pipeline
5. Temporal train/test split (the final gameweeks are held out for testing)
6. Preparing ML-ready datasets

## 1. Setup

In [None]:
import sys
from pathlib import Path
import warnings

# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from the libraries used below; consider narrowing it.
warnings.filterwarnings('ignore')

# Add project root so the `src` package can be imported.
# Assumes the kernel's working directory sits directly beneath the project
# root — TODO confirm for other launch locations.
project_root = Path.cwd().parent
# Guarded so that re-running this cell does not keep appending the same
# entry to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Project modules
from src.preprocessing import (
    FPLDataLoader,
    FPLFeatureEngineer,
    FPLPreprocessor,
    prepare_training_data,
    get_feature_columns_by_type,
    POSITIONS
)

# Settings: plot theme, default figure size, and how many columns pandas shows
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
pd.set_option('display.max_columns', 50)

print("✓ Setup complete!")

## 2. Load Data

In [None]:
# One loader instance serves all three tables
loader = FPLDataLoader()

# Season to load — change as needed
SEASON = '2023-24'

print(f"Loading data for season: {SEASON}")

# Per-gameweek records, player table, and team table for the chosen season
gameweeks_df = loader.load_gameweeks(SEASON)
players_df = loader.load_players(SEASON)
teams_df = loader.load_teams(SEASON)

# Report how many rows each table contains
print(
    f"\n✓ Loaded:",
    f"  Gameweeks: {len(gameweeks_df):,} records",
    f"  Players: {len(players_df):,}",
    f"  Teams: {len(teams_df):,}",
    sep="\n",
)

In [None]:
# Quick look at the raw gameweek table: column names, dimensions, first rows
print(
    "Gameweek data columns:",
    gameweeks_df.columns.tolist(),
    f"\nShape: {gameweeks_df.shape}",
    f"\nSample:",
    sep="\n",
)
gameweeks_df.head()

## 3. Feature Engineering

In [None]:
# Feature engineer instance (project class)
engineer = FPLFeatureEngineer()

# Order rows by player, then by gameweek, before building features
gameweeks_df = gameweeks_df.sort_values(by=['element', 'round'])

print("Creating all features...")
print("="*60)

# Build the full engineered feature set from gameweeks plus team/player lookups
features_df = engineer.create_all_features(gameweeks_df, teams_df=teams_df, players_df=players_df)

# Column counts are read after the call, exactly where the summary needs them
print(f"\nFeatures created!")
print(f"  Original columns: {gameweeks_df.shape[1]}")
print(f"  With features: {features_df.shape[1]}")
print(f"  New features: {features_df.shape[1] - gameweeks_df.shape[1]}")

### 3.1 Examine Created Features

In [None]:
# Group the engineered columns by feature type (project helper)
feature_types = get_feature_columns_by_type(features_df)

print("FEATURE GROUPS")
print("="*60)
for group_name, group_cols in feature_types.items():
    # Empty groups are not listed
    if not group_cols:
        continue
    print(f"\n{group_name.upper()} ({len(group_cols)} features):")
    # Preview at most the first ten names, then summarise the remainder
    print(f"  {', '.join(group_cols[:10])}")
    overflow = len(group_cols) - 10
    if overflow > 0:
        print(f"  ... and {overflow} more")

In [None]:
# Preview a hand-picked set of engineered columns
print("Sample of engineered features:")
sample_cols = [
    'element',
    'round',
    'total_points',
    'total_points_rolling_3',
    'total_points_rolling_5',
    'form_weighted',
    'consistency_5',
    'form_trend',
    'goal_involvement',
    'points_per_90',
]
# Keep only the names that were actually produced, so the selection cannot fail
existing_cols = [name for name in sample_cols if name in features_df.columns]
features_df[existing_cols].head(15)

### 3.2 Visualize Rolling Features

In [None]:
# Plot one player's season in a 2x2 grid: points vs rolling averages, form
# metrics, goal involvement, and minutes played. Each panel is drawn only when
# the columns it needs are present in the engineered frame.
if 'element' not in features_df.columns:
    print("Element column not found")
else:
    # Use the player with the most rows so the series are as long as possible.
    # value_counts() is empty for an empty frame; fall back to None so the
    # selection below is simply empty instead of raising IndexError.
    appearance_counts = features_df['element'].value_counts()
    sample_player_id = appearance_counts.index[0] if len(appearance_counts) else None
    player_data = features_df[features_df['element'] == sample_player_id].copy()

    # Line plots connect points in row order, so enforce gameweek order here
    # (stable sort keeps the existing order of rows that share a round).
    if 'round' in player_data.columns:
        player_data = player_data.sort_values('round', kind='mergesort')

    def has_columns(*names):
        """Return True when every named column exists in player_data."""
        return all(name in player_data.columns for name in names)

    if len(player_data) > 10:
        fig, axes = plt.subplots(2, 2, figsize=(16, 10))

        # Top-left: actual points with the 3- and 5-game rolling columns
        if has_columns('round', 'total_points', 'total_points_rolling_3', 'total_points_rolling_5'):
            ax_points = axes[0, 0]
            ax_points.plot(player_data['round'], player_data['total_points'],
                           'o-', alpha=0.5, label='Actual', markersize=4)
            ax_points.plot(player_data['round'], player_data['total_points_rolling_3'],
                           '-', linewidth=2, label='3-game avg')
            ax_points.plot(player_data['round'], player_data['total_points_rolling_5'],
                           '-', linewidth=2, label='5-game avg')
            ax_points.set_title(f'Player {sample_player_id}: Points & Rolling Averages', fontweight='bold')
            ax_points.set_xlabel('Gameweek')
            ax_points.set_ylabel('Points')
            ax_points.legend()
            ax_points.grid(alpha=0.3)

        # Top-right: weighted form (left axis) and form trend (right axis)
        if has_columns('round', 'form_weighted', 'form_trend'):
            ax2 = axes[0, 1]
            # Explicit colour so the line matches the blue axis label/ticks
            ax2.plot(player_data['round'], player_data['form_weighted'],
                     '-o', color='blue', linewidth=2, markersize=4, label='Weighted Form')
            ax2.set_ylabel('Weighted Form', color='blue')
            ax2.tick_params(axis='y', labelcolor='blue')

            ax2_twin = ax2.twinx()
            ax2_twin.plot(player_data['round'], player_data['form_trend'],
                          '-s', color='red', linewidth=2, markersize=4, label='Form Trend')
            ax2_twin.set_ylabel('Form Trend', color='red')
            ax2_twin.tick_params(axis='y', labelcolor='red')
            ax2_twin.axhline(0, color='gray', linestyle='--', alpha=0.3)

            ax2.set_title('Form Metrics', fontweight='bold')
            ax2.set_xlabel('Gameweek')
            ax2.grid(alpha=0.3)

            # The two lines live on different axes; merge their handles so the
            # labels set above actually appear in a single legend.
            left_handles, left_labels = ax2.get_legend_handles_labels()
            right_handles, right_labels = ax2_twin.get_legend_handles_labels()
            ax2_twin.legend(left_handles + right_handles, left_labels + right_labels, loc='best')

        # Bottom-left: stacked goals/assists bars with a goal-involvement line
        if has_columns('round', 'goals_scored', 'assists', 'goal_involvement'):
            ax_goals = axes[1, 0]
            ax_goals.bar(player_data['round'], player_data['goals_scored'],
                         alpha=0.7, label='Goals', color='red')
            ax_goals.bar(player_data['round'], player_data['assists'],
                         bottom=player_data['goals_scored'],
                         alpha=0.7, label='Assists', color='blue')
            # Prefer the rolling column; when it is absent the raw per-game
            # value is plotted, so the legend label must say so.
            if 'goal_involvement_rolling_3' in player_data.columns:
                involvement = player_data['goal_involvement_rolling_3']
                involvement_label = '3-game avg'
            else:
                involvement = player_data['goal_involvement']
                involvement_label = 'Goal involvement'
            ax_goals.plot(player_data['round'], involvement,
                          'k-', linewidth=2, label=involvement_label)
            ax_goals.set_title('Goal Involvement', fontweight='bold')
            ax_goals.set_xlabel('Gameweek')
            ax_goals.set_ylabel('Count')
            ax_goals.legend()
            ax_goals.grid(alpha=0.3)

        # Bottom-right: minutes per gameweek with 60- and 90-minute guides
        if 'minutes' in player_data.columns:
            ax_minutes = axes[1, 1]
            ax_minutes.bar(player_data['round'], player_data['minutes'],
                           alpha=0.7, color='green')
            ax_minutes.axhline(90, color='red', linestyle='--', label='Full 90')
            ax_minutes.axhline(60, color='orange', linestyle='--', label='60 min')
            ax_minutes.set_title('Minutes Played', fontweight='bold')
            ax_minutes.set_xlabel('Gameweek')
            ax_minutes.set_ylabel('Minutes')
            ax_minutes.legend()
            ax_minutes.grid(alpha=0.3)

        plt.tight_layout()
        plt.show()
    else:
        print("Not enough data to visualize")

## 4. Prepare Training Data

In [None]:
# Build the training-ready frame and report how many rows were dropped
print("Preparing training data...")
print("="*60)

# Remove first few games per player (incomplete rolling features)
training_ready_df = prepare_training_data(
    features_df,
    target_col='total_points',
    drop_first_n_games=5
)

# Row counts are read after the call, as the summary below needs them
n_before = len(features_df)
n_after = len(training_ready_df)
# Guard the percentage: an empty input frame would otherwise raise
# ZeroDivisionError inside the f-string.
removed_pct = (1 - n_after / n_before) * 100 if n_before else 0.0

print(f"\n✓ Training data prepared")
print(f"  Original records: {n_before:,}")
print(f"  Training-ready records: {n_after:,}")
print(f"  Removed: {n_before - n_after:,} ({removed_pct:.1f}%)")

In [None]:
# Percentage of null entries per column of the training-ready frame
null_counts = training_ready_df.isnull().sum()
missing_pct = null_counts / len(training_ready_df) * 100

# Keep only columns that have at least one null, worst first
missing_cols = missing_pct[missing_pct > 0].sort_values(ascending=False)

if missing_cols.empty:
    print("\n✓ No missing values!")
else:
    print("\nColumns with missing values:")
    print(missing_cols.head(10))

## 5. Preprocessing Pipeline

In [None]:
# Preprocessor configured with the 'standard' scaler option
preprocessor = FPLPreprocessor(scaler_type='standard')

print("\nRunning preprocessing pipeline...")
print("="*60)

# Options for the full pipeline run
pipeline_options = dict(
    target_col='total_points',
    temporal_split=True,   # Use temporal split for time series
    test_rounds=5,         # Last 5 gameweeks for testing
    scale=True,
    handle_missing=True,
)
processed_data = preprocessor.prepare_for_training(training_ready_df, **pipeline_options)

# Pull the four splits out of the returned mapping
X_train, X_test, y_train, y_test = (
    processed_data[key] for key in ('X_train', 'X_test', 'y_train', 'y_test')
)

divider = '=' * 60
print(f"\n{divider}")
print("FINAL DATASETS")
print(f"{divider}")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

# Summary statistics of the target in each split
print(f"\nTarget distribution (train):")
print(y_train.describe())
print(f"\nTarget distribution (test):")
print(y_test.describe())

In [None]:
# Compare the train and test splits: target histograms (left) and relative
# split sizes (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_hist, ax_pie = axes

# Compute one set of bin edges over both splits. Passing bins=30 to each
# hist() call separately would give each split its own edges, so the overlaid
# bars would not line up and could not be compared bin for bin.
combined_targets = np.concatenate([np.asarray(y_train).ravel(), np.asarray(y_test).ravel()])
shared_bins = np.histogram_bin_edges(combined_targets, bins=30)

# Target distribution
ax_hist.hist(y_train, bins=shared_bins, alpha=0.7, label='Train', edgecolor='black')
ax_hist.hist(y_test, bins=shared_bins, alpha=0.7, label='Test', edgecolor='black')
ax_hist.set_title('Target Distribution (Total Points)', fontsize=12, fontweight='bold')
ax_hist.set_xlabel('Points')
ax_hist.set_ylabel('Frequency')
ax_hist.legend()
ax_hist.grid(alpha=0.3)

# Dataset sizes
sizes = [len(X_train), len(X_test)]
labels = ['Train', 'Test']
ax_pie.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90,
           colors=['steelblue', 'orange'])
ax_pie.set_title('Train/Test Split', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

## 6. Feature Analysis

In [None]:
# Rank features by absolute Pearson correlation with the training target
print("Top features correlated with target:")
print("="*60)

# Correlate each feature column directly with the target. This avoids building
# the full feature-by-feature correlation matrix just to read one column, and
# restricting to numeric/bool columns keeps it working when X_train carries
# non-numeric columns (DataFrame.corr raises on those in pandas >= 2.0).
numeric_features = X_train.select_dtypes(include=['number', 'bool'])
# Re-index the target positionally onto X_train's index so corrwith's label
# alignment cannot silently drop or mismatch rows.
target = pd.Series(np.asarray(y_train).ravel(), index=X_train.index, name='target')

correlations = (
    numeric_features
    .corrwith(target)
    .abs()
    .sort_values(ascending=False)
    .rename('target')
)

print("\nTop 20 features by correlation:")
print(correlations.head(20))

# Visualize the same top 20, strongest at the top of the chart
fig, ax = plt.subplots(figsize=(10, 8))
correlations.head(20).plot(kind='barh', color='steelblue', ax=ax)
ax.set_xlabel('Absolute Correlation with Target')
ax.set_title('Top 20 Features by Correlation', fontsize=14, fontweight='bold')
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Average the per-feature absolute correlations within each feature group
feature_groups = get_feature_columns_by_type(X_train)

print("\nAverage correlation by feature group:")
print("="*60)

group_correlations = {}
for group_name, members in feature_groups.items():
    # Groups without any columns are left out of the table and the chart
    if not members:
        continue
    in_group = correlations.index.isin(members)
    mean_abs_corr = correlations[in_group].mean()
    group_correlations[group_name] = mean_abs_corr
    print(f"{group_name:15s}: {mean_abs_corr:.4f}")

# Horizontal bar chart of the group averages, smallest first
plt.figure(figsize=(10, 6))
group_series = pd.Series(group_correlations).sort_values()
group_series.plot(kind='barh', color='green', alpha=0.7)
plt.xlabel('Average Absolute Correlation')
plt.title('Feature Group Importance', fontsize=14, fontweight='bold')
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Save Processed Data

In [None]:
# Write the train/test splits and the full engineered frame to disk as CSV
from configs.config import PROCESSED_DATA_DIR

# One sub-directory per season
output_dir = PROCESSED_DATA_DIR / SEASON
output_dir.mkdir(parents=True, exist_ok=True)

# Feature matrices
for filename, matrix in {'X_train.csv': X_train, 'X_test.csv': X_test}.items():
    matrix.to_csv(output_dir / filename, index=False)

# Targets, written under an explicit 'total_points' header
for filename, series in {'y_train.csv': y_train, 'y_test.csv': y_test}.items():
    series.to_csv(output_dir / filename, index=False, header=['total_points'])

# Feature-engineered full dataset (before splitting)
training_ready_df.to_csv(output_dir / 'features_complete.csv', index=False)

print(
    f"✓ Data saved to: {output_dir}",
    f"\nFiles created:",
    f"  - X_train.csv ({X_train.shape})",
    f"  - X_test.csv ({X_test.shape})",
    f"  - y_train.csv ({len(y_train)} records)",
    f"  - y_test.csv ({len(y_test)} records)",
    f"  - features_complete.csv ({training_ready_df.shape})",
    sep="\n",
)

In [None]:
# Persist the preprocessing pipeline next to the models
from configs.config import MODELS_DIR

pipeline_path = MODELS_DIR / f'preprocessor_{SEASON}.pkl'
# Create the destination directory up front; nothing visible in this notebook
# guarantees it exists, and whether save_pipeline creates it is not shown here.
pipeline_path.parent.mkdir(parents=True, exist_ok=True)
preprocessor.save_pipeline(str(pipeline_path))

print(f"\n✓ Preprocessing pipeline saved to: {pipeline_path}")

## 8. Summary

### Features Created:
1. **Rolling Features**: Moving averages (3, 5, 10 games) for key metrics
2. **Lag Features**: Previous gameweek values (1, 2, 3 lags)
3. **Form Features**: Weighted form, consistency, trend
4. **Attacking Features**: Goal involvement, xG, xA, creativity
5. **Defensive Features**: Clean sheets, goals conceded, saves
6. **Opponent Features**: Difficulty ratings, team strength
7. **Home/Away Features**: Location-specific performance
8. **Value Features**: Points per cost, transfer trends

### Data Ready For:
- Training ML models (Random Forest, XGBoost, LightGBM)
- Position-specific predictions
- Temporal validation
- Comparison with research paper benchmarks

### Next Steps:
1. Train baseline models
2. Hyperparameter tuning
3. Model evaluation and comparison
4. Feature selection and importance analysis
5. Deploy for predictions