In [1]:
# %% [markdown]
"""
# Football Player Data Splitting
*Advanced stratified splitting for player scouting or modeling*
"""

# %%
# Imports and setup
from pathlib import Path
import pandas as pd
import sys
import logging
import yaml

# Add src directory to path so the project-local `data_splitter` module
# imported further down can be found.
# NOTE(review): this is a machine-specific absolute path; the notebook will
# only run as-is on a machine with this exact directory layout.
src_path = Path(r"C:\Users\Nanaba\Desktop\football_player_scouting_ml\src")

# Guard the append: without it, every re-run of this cell in the same kernel
# pushes another copy of the same directory onto sys.path.
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# %%
# Pull in the project-local splitter class (resolved through the src
# directory that was appended to sys.path above the import).
# On failure the ImportError is reported and then re-raised so the
# notebook stops here instead of failing later with a NameError.
try:
    from data_splitter import ProspectDataSplitter
except ImportError as import_error:
    print(f" Import Error: {import_error}")
    raise
else:
    print(" Successfully imported DataSplitter")

# %%
# Locations used by the rest of the notebook:
#   data_dir        - root of the project's data folder
#   processed_path  - cleaned player CSV that is loaded and split
#   output_dir      - versioned folder ("v1") that receives the splits
# NOTE(review): data_dir is a machine-specific absolute path.
data_dir = Path(r"C:\Users\Nanaba\Desktop\football_player_scouting_ml\data")
processed_path = data_dir.joinpath("processed", "fifa_players_cleaned.csv")
output_dir = data_dir.joinpath("splits", "v1")

# %%
# Build the splitter used by every later step.
# The settings are collected in one place so they are easy to spot and tune:
#   test_size / val_size - fractions handed to the splitter for the test and
#                          validation sets (the recorded run below produced
#                          9126 rows each out of 45629)
#   random_state         - fixed seed value passed to the splitter
print("\n Initializing DataSplitter...")
split_settings = {
    "test_size": 0.2,
    "val_size": 0.2,
    "random_state": 42,
}
splitter = ProspectDataSplitter(**split_settings)

# %%
# Read the processed player table through the splitter's own loader.
# Any failure is reported and then re-raised, so the notebook does not
# continue with a missing or invalid dataset.
print(f"\n Loading data from {processed_path}...")
try:
    df = splitter.load_and_validate(processed_path)
    loaded_shape = df.shape
    print(f" Successfully loaded data with shape: {loaded_shape}")
except Exception as load_error:
    print(f" Data validation failed: {load_error}")
    raise

# %%
# Split the loaded dataset into train / validation / test frames, then print
# a size summary and compare the share of the most common positions across
# the three splits. Any failure is reported and re-raised.
print("\n Splitting data with stratification...")
try:
    train_df, val_df, test_df = splitter.split_data(df)
    split_frames = {'Train': train_df, 'Validation': val_df, 'Test': test_df}

    # Print split summary.
    # The loop variable must not be called `df`: that would rebind the full
    # dataset loaded earlier to the last split, so re-running this cell
    # would silently split the test set instead of the whole dataset.
    print("\n Split Summary:")
    print(f"{'Set':<12} {'Samples':<10}")
    print("-" * 30)
    for split_name, split_frame in split_frames.items():
        print(f"{split_name:<12} {len(split_frame):<10}")

    # Position distribution check.
    # The three most frequent positions are taken from the training split and
    # every split is reindexed to those same positions. Taking an independent
    # top 3 per split could select different positions per column and leave
    # NaN gaps, which makes the columns impossible to compare.
    print("\n Position Distribution in Splits (top 3 positions):")
    top_positions = (
        train_df['player_positions'].value_counts(normalize=True).head(3).index
    )
    pos_counts = pd.concat(
        [
            split_frame['player_positions']
            .value_counts(normalize=True)
            .reindex(top_positions)
            for split_frame in split_frames.values()
        ],
        axis=1,
        keys=list(split_frames),
    )
    display(pos_counts.style.format("{:.1%}"))

except Exception as split_error:
    print(f" Splitting failed: {split_error}")
    raise

# %%
# Persist the three splits via the splitter, then read back the YAML
# metadata file from the output folder and print a few headline values.
# Any failure (saving, reading, or a missing metadata key) is reported
# and re-raised.
print(f"\n Saving splits and metadata to {output_dir}...")
try:
    all_splits = (train_df, val_df, test_df)
    splitter.save_splits_with_scouting_metadata(all_splits, output_dir)

    # Load the metadata file back (safe_load: plain data only)
    metadata_file = output_dir / 'scouting_split_metadata.yaml'
    metadata = yaml.safe_load(metadata_file.read_text())

    print("\n Metadata Summary:")
    print(f"- Split Date: {metadata['split_date']}")
    # Looked up only after the date line so that output ordering on a
    # missing key stays the same as a direct nested lookup per line.
    train_stats = metadata['split_stats']['train']
    print(f"- Train Samples: {train_stats['total_samples']}")
    print(f"- Mean Potential: {train_stats['potential_stats']['mean']:.1f}")

except Exception as save_error:
    print(f" Failed to save splits: {save_error}")
    raise

# %%
# Final sanity check: compare the mean of a few feature columns across the
# three splits, one column per split. This step is best-effort: a failure
# is printed as a warning and the notebook carries on.
print("\n Running final quality checks...")
try:
    print("\n Key Attribute Averages (sample scouting features):")
    features = ['technical_composite', 'physical_composite', 'potential']
    per_split_means = [
        split_frame[features].mean()
        for split_frame in (train_df, val_df, test_df)
    ]
    scouting_stats = pd.concat(
        per_split_means,
        axis=1,
        keys=['Train', 'Validation', 'Test'],
    )
    styled_stats = scouting_stats.style.format("{:.1f}").background_gradient(axis=1)
    display(styled_stats)

except Exception as check_error:
    print(f" Validation warning: {check_error}")

# %%
# Closing summary: one line per split with its row count.
print("\n Data splitting completed successfully!")
for split_label, split_frame in (
    ("Train", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"   - {split_label} set: {len(split_frame)} players")


 Successfully imported DataSplitter

 Initializing DataSplitter...

 Loading data from C:\Users\Nanaba\Desktop\football_player_scouting_ml\data\processed\fifa_players_cleaned.csv...


2025-07-25 23:31:05,277 - INFO - Loaded prospect data with shape (45629, 66)


 Successfully loaded data with shape: (45629, 66)

 Splitting data with stratification...

 Split Summary:
Set          Samples   
------------------------------
Train        27377     
Validation   9126      
Test         9126      

 Position Distribution in Splits (top 3 positions):


Unnamed: 0_level_0,Train,Validation,Test
player_positions,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CB,12.2%,12.6%,12.9%
GK,11.2%,11.2%,11.2%
ST,10.8%,11.0%,11.1%



 Saving splits and metadata to C:\Users\Nanaba\Desktop\football_player_scouting_ml\data\splits\v1...


2025-07-25 23:31:08,257 - INFO - Saved scouting splits to C:\Users\Nanaba\Desktop\football_player_scouting_ml\data\splits\v1



 Metadata Summary:
- Split Date: 2025-07-25T23:31:06.864062
- Train Samples: 27377
- Mean Potential: 69.0

 Running final quality checks...

 Key Attribute Averages (sample scouting features):

 Data splitting completed successfully!
   - Train set: 27377 players
   - Validation set: 9126 players
   - Test set: 9126 players
