In [2]:
# Step 1: Set up paths and imports
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import os
from scipy import sparse
import logging
from datetime import datetime

# Configure logging for this session: INFO level, "time - level - message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Put the project's src directory at the front of sys.path so the local
# feature_scaling module can be imported.
# NOTE(review): this is a machine-specific absolute path — confirm before
# running on another machine.
src_path = os.path.abspath(r"C:\Users\Nanaba\Desktop\football_player_scouting_ml\src")
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Import the preprocessor. On failure, log what src_path contains to help
# diagnose a wrong path, then re-raise the original ImportError.
try:
    from feature_scaling import FootballFeaturePreprocessor
except ImportError as e:
    logger.error(f" Import Error: {e}")
    # Listing a missing/unreadable directory raises OSError; catch it so the
    # diagnostic step can never replace the ImportError being reported.
    try:
        src_contents = os.listdir(src_path)
    except OSError as list_error:
        logger.error(f"\nCould not list src directory {src_path}: {list_error}")
    else:
        logger.error("\nContents of src directory:")
        logger.error(src_contents)
    raise
else:
    logger.info(" Successfully imported FootballFeaturePreprocessor")

# Step 2: Define file paths with versioning
# Outputs are written to a per-day subfolder named vYYYYMMDD under data_dir.
current_date = datetime.now().strftime("%Y%m%d")
data_dir = Path(r"C:\Users\Nanaba\Desktop\football_player_scouting_ml\data\processed")
# NOTE(review): split_path is not used anywhere in this cell and resolves to
# ...\processed\processed — confirm whether it is still needed.
split_path = data_dir / "processed"
processed_path = data_dir / f"v{current_date}"
processed_path.mkdir(parents=True, exist_ok=True)

# File paths for input data.
# The CSVs are deliberately NOT read here: Step 3 loads them inside its
# try/except, and reading them here as well would load every file twice and
# let a read failure escape without the "Data loading failed" log entry.
train_file = data_dir / "fifa_players_train.csv"
test_file = data_dir / "fifa_players_test.csv"

# File paths for processed output
train_processed_file = processed_path / "fifa_players_train_processed.feather"
test_processed_file = processed_path / "fifa_players_test_processed.feather"
feature_report_file = processed_path / "feature_quality_report.csv"


# Step 3: Read the train and test CSVs into DataFrames
# low_memory=False makes pandas parse each file in one pass instead of in
# internal chunks, which avoids mixed-dtype inference (it does not reduce
# memory use).
logger.info("\n Loading datasets...")
try:
    train_df = pd.read_csv(train_file, low_memory=False)
    test_df = pd.read_csv(test_file, low_memory=False)
except Exception:
    # Log the message with the active traceback, then propagate the failure.
    logger.exception("Data loading failed")
    raise
else:
    logger.info(f" Training data loaded: {len(train_df)} records")
    logger.info(f" Test data loaded: {len(test_df)} records")

# Step 4: Build the feature preprocessor
logger.info("\n Initializing FootballFeaturePreprocessor...")

# Keyword options handed to FootballFeaturePreprocessor. Their exact effect is
# defined in feature_scaling (not visible here); notes below are hedged.
preprocessor_options = {
    'target': 'is_prospect',   # name of the label column used in this notebook
    'model_type': 'auto',      # presumably lets the preprocessor pick its strategy — confirm
    'max_categories': 75,      # presumably a cap on categories per categorical feature — confirm
    'n_components': None,      # presumably None disables dimensionality reduction — confirm
}
preprocessor = FootballFeaturePreprocessor(**preprocessor_options)

# Step 5: Create prospect target if needed
# Only train_df and test_df are loaded in this cell (there is no validation
# frame), so only those two are handled. Each frame is checked on its own
# because Step 6 selects 'is_prospect' from both of them.
frames_missing_target = [
    frame for frame in (train_df, test_df) if 'is_prospect' not in frame.columns
]
if frames_missing_target:
    logger.info("Creating prospect target variable...")
    for split_frame in frames_missing_target:
        # Adds the column in place on the original train_df / test_df object.
        split_frame['is_prospect'] = preprocessor.create_prospect_target(split_frame)
    logger.info(f"Prospect counts - Train: {train_df['is_prospect'].sum()}, Test: {test_df['is_prospect'].sum()}")

# Columns treated as targets; every other column of train_df is a feature.
target_cols = ['overall', 'potential', 'is_prospect']
feature_cols = [name for name in train_df.columns if name not in target_cols]

# Split each frame into feature and target parts. The .copy() calls make the
# pieces independent of train_df / test_df.
X_train, y_train = (
    train_df.loc[:, feature_cols].copy(),
    train_df.loc[:, target_cols].copy(),
)
X_test, y_test = (
    test_df.loc[:, feature_cols].copy(),
    test_df.loc[:, target_cols].copy(),
)

# Step 6: Fit and transform data
# Number of rows handed to the preprocessor per transform call further below.
chunk_size = 2000

# The preprocessor is fitted on the training features only.
logger.info("\n🔧 Fitting preprocessor on training data...")
preprocessor.fit(X_train)

# Write the preprocessor's feature quality report next to the processed data.
feature_report = preprocessor.get_feature_quality_report()
feature_report.to_csv(path_or_buf=feature_report_file)
logger.info(f" Feature quality report saved to {feature_report_file}")

# Announce the chunked transform that follows.
logger.info(f"\n Processing data in chunks of {chunk_size}...")

def process_in_chunks(X, name, transformer=None, size=None):
    """Transform ``X`` slice by slice and return one combined frame.

    Args:
        X: DataFrame of rows to transform.
        name: Label used only in the progress log messages (e.g. 'training').
        transformer: Object exposing ``transform(chunk)`` that returns a
            pandas object. Defaults to the module-level ``preprocessor``.
        size: Number of rows per slice. Defaults to the module-level
            ``chunk_size``.

    Returns:
        The transformed slices stacked in input order with a fresh
        0..n-1 index.

    Raises:
        ValueError: If ``size`` is smaller than 1 or ``X`` has no rows.
    """
    # Fall back to the notebook globals so existing two-argument calls work.
    transformer = preprocessor if transformer is None else transformer
    size = chunk_size if size is None else size

    total = len(X)
    if size < 1:
        raise ValueError(f"Chunk size must be at least 1, got {size}")
    if total == 0:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise ValueError(f"No {name} records to process")

    log = logging.getLogger(__name__)
    chunks = []
    for start in range(0, total, size):
        stop = min(start + size, total)
        chunks.append(transformer.transform(X.iloc[start:stop]))
        log.info(f"Processed {stop}/{total} {name} records")

    # Each transformed slice may carry its own 0..k index; without
    # ignore_index the combined frame could have repeated labels, which breaks
    # index-aligned joins with the targets and Feather export.
    return pd.concat(chunks, ignore_index=True)

# Transform both splits with the fitted preprocessor (training goes through
# the chunked helper, test is transformed in a single call).
X_train_processed = process_in_chunks(X_train, 'training')
X_test_processed = preprocessor.transform(X_test)

# Combine processed features with targets
# pd.concat(axis=1) pairs rows by index label, not by position. Both sides are
# therefore reset to a plain 0..n-1 index so rows are matched positionally even
# if the processed frames carry repeated or non-default labels. A row-count
# mismatch would otherwise be padded with NaN silently, so it is rejected.
for split_label, processed_part, target_part in (
    ('training', X_train_processed, y_train),
    ('test', X_test_processed, y_test),
):
    if len(processed_part) != len(target_part):
        raise ValueError(
            f"Row count mismatch for {split_label} data: "
            f"{len(processed_part)} processed rows vs {len(target_part)} target rows"
        )

train_processed = pd.concat(
    [X_train_processed.reset_index(drop=True), y_train.reset_index(drop=True)],
    axis=1,
)
test_processed = pd.concat(
    [X_test_processed.reset_index(drop=True), y_test.reset_index(drop=True)],
    axis=1,
)

# Convert sparse columns to dense (noted by the original author as a fix for
# Feather saving).
# Sparse columns are detected with an isinstance check on the dtype;
# pd.api.types.is_sparse is deprecated in recent pandas releases.
for processed_frame in (train_processed, test_processed):
    # Collect the names first, then replace each sparse column in place.
    sparse_cols = [
        col
        for col, dtype in processed_frame.dtypes.items()
        if isinstance(dtype, pd.SparseDtype)
    ]
    for col in sparse_cols:
        processed_frame[col] = processed_frame[col].sparse.to_dense()

# Step 7: Save processed datasets
# Both frames are written in Feather format to the versioned output folder;
# the confirmation messages are logged only after both writes have returned.
logger.info("\n Saving processed datasets...")
for frame_to_save, destination in (
    (train_processed, train_processed_file),
    (test_processed, test_processed_file),
):
    frame_to_save.to_feather(destination)
logger.info(f" Training data saved to {train_processed_file}")
logger.info(f" Test data saved to {test_processed_file}")

logger.info("\n Preprocessing complete!")

2025-08-19 03:14:43,544 - INFO -  Successfully imported FootballFeaturePreprocessor
2025-08-19 03:14:43,917 - INFO - 
 Loading datasets...
2025-08-19 03:14:44,271 - INFO -  Training data loaded: 27377 records
2025-08-19 03:14:44,272 - INFO -  Test data loaded: 9126 records
2025-08-19 03:14:44,273 - INFO - 
 Initializing FootballFeaturePreprocessor...
2025-08-19 03:14:44,307 - INFO - 
🔧 Fitting preprocessor on training data...
2025-08-19 03:14:44,617 - INFO - Feature quality analysis completed
2025-08-19 03:14:49,247 - INFO - Preprocessor fitted with 180 features
2025-08-19 03:14:49,253 - INFO -  Feature quality report saved to C:\Users\Nanaba\Desktop\football_player_scouting_ml\data\processed\v20250819\feature_quality_report.csv
2025-08-19 03:14:49,254 - INFO - 
 Processing data in chunks of 2000...
2025-08-19 03:14:49,499 - INFO - Processed 2000/27377 training records
2025-08-19 03:14:49,747 - INFO - Processed 4000/27377 training records
2025-08-19 03:14:49,974 - INFO - Processed 6000