# Football Player Value Prediction - LightGBM Model

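This notebook predicts football players' market value: it cleans and splits the dataset, applies the modular preprocessing pipeline (`preprocessing.py`), and trains a LightGBM model tuned with GridSearchCV (`lightgbm_model.py`).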

## 1. Import Libraries & Modules

In [None]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

from preprocessing import FootballDataPreprocessor, remove_outliers
from lightgbm_model import (
    LightGBMTrainer, 
    plot_predictions, 
    plot_residuals, 
    plot_feature_importance,
    print_metrics
)

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*".
# Try the new name first and fall back to the pre-3.6 name so the notebook's
# first cell does not fail on older Matplotlib installs. If neither style is
# available the default style is kept (purely cosmetic).
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette("husl")

print("Libraries imported successfully!")

## 2. Load Data

In [None]:
# Read the raw player table from the CSV next to the notebook. `df` is kept
# unmodified so later cells can report the original sample count.
df = pd.read_csv('football_players_dataset.csv')

banner = "=" * 80
n_rows, n_cols = df.shape
n_missing = df.isna().sum().sum()  # total NaN cells across all columns

print(banner, "DATASET OVERVIEW", banner, sep="\n")
print(f"Shape: {df.shape}")
print(f"Samples: {n_rows:,}")
# Counts every column in the file, including the `market_value` target.
print(f"Features: {n_cols}")
print(f"\nMissing values: {n_missing}")
print(f"\nFirst 5 rows:")
display(df.head())

# Summary statistics of the regression target.
print(f"\nTarget variable statistics:")
display(df['market_value'].describe())

## 3. Data Cleaning

In [None]:
banner = "=" * 80
print(banner, "DATA CLEANING", banner, sep="\n")

# Drop outliers with the project helper (n_std=3) and keep the result under a
# new name so the raw `df` stays available.
# NOTE(review): this runs on the full dataset before the train/val/test split,
# so whatever statistics `remove_outliers` uses also see the future test rows
# — confirm this is intended.
original_size = len(df)
df_clean = remove_outliers(df, n_std=3)
removed = original_size - len(df_clean)
removed_pct = removed / original_size * 100

print(f"\nOriginal samples: {original_size:,}")
print(f"After outlier removal: {len(df_clean):,}")
print(f"Removed: {removed:,} samples ({removed_pct:.2f}%)")

## 4. Train/Val/Test Split

In [None]:
banner = "=" * 80
print(banner, "DATA SPLITTING", banner, sep="\n")

# Two-stage split with a fixed seed: first hold out 20% as the test set, then
# take 20% of the remainder as validation. Relative to the cleaned data this is
# roughly 64% train / 16% validation / 20% test.
train_val, test = train_test_split(df_clean, test_size=0.20, random_state=42)
train, val = train_test_split(train_val, test_size=0.20, random_state=42)

# Report each split's size and its share of the cleaned dataset.
print()
for label, part in (("Train set:", train), ("Val set:", val), ("Test set:", test)):
    share = len(part) / len(df_clean) * 100
    print(f"{label:<10} {len(part):,} samples ({share:.1f}%)")

## 5. Preprocessing (Feature Engineering + Scaling)

In [None]:
banner = "=" * 80
print(banner, "PREPROCESSING & FEATURE ENGINEERING", banner, sep="\n")

preprocessor = FootballDataPreprocessor(corr_threshold=0.05)

# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the validation and test splits.
X_train = preprocessor.fit_transform(train, scale=True)
X_val = preprocessor.transform(val, scale=True)
X_test = preprocessor.transform(test, scale=True)

# Targets are taken straight from the un-transformed split frames.
y_train, y_val, y_test = (
    split['market_value'].values for split in (train, val, test)
)

print(f"\nSelected features: {len(preprocessor.selected_features)}")
# NOTE(review): the heading assumes `selected_features` is ordered by
# correlation with the target — confirm in the preprocessing module.
print(f"\nTop 10 features by correlation:")
for rank, feature_name in enumerate(preprocessor.selected_features[:10], start=1):
    print(f"  {rank:2d}. {feature_name}")

print(f"\nX_train shape: {X_train.shape}")
print(f"X_val shape:   {X_val.shape}")
print(f"X_test shape:  {X_test.shape}")

## 6. LightGBM Training with GridSearchCV

In [None]:
print("=" * 80)
print("LIGHTGBM TRAINING - GRIDSEARCHCV")
print("=" * 80)

# Exhaustive hyperparameter grid: 3 values for each of 7 tuned parameters, i.e.
# 3**7 = 2,187 candidate configurations. With cv_folds=5 an exhaustive k-fold
# search would fit 10,935 models, so expect this cell to be slow.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 70],
    'min_child_samples': [20, 30, 40],
    'subsample': [0.8, 0.9, 1.0],
    # LightGBM only applies row subsampling (bagging) when `subsample_freq` is
    # non-zero; with its default of 0 the three `subsample` values above would
    # train identical models. A single value keeps the grid size unchanged.
    # NOTE(review): assumes LightGBMTrainer forwards this grid unchanged to an
    # LGBMRegressor-based search and does not already set subsample_freq —
    # confirm in lightgbm_model.py.
    'subsample_freq': [1],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

trainer = LightGBMTrainer(param_grid=param_grid, cv_folds=5, random_state=42)

# Run the search on the training split; the validation split is passed to the
# trainer as well (how it is used is defined in lightgbm_model.py).
trainer.train_with_gridsearch(
    X_train, y_train, 
    X_val, y_val,
    verbose=True
)

## 7. Evaluation on Test Set

In [None]:
# Score the tuned model on the held-out test split. `metrics` and `y_pred` are
# reused by the plotting, saving and report cells below.
metrics, y_pred = trainer.evaluate(X_test, y_test)
# Print the test metrics together with the trainer's cross-validation scores.
print_metrics(metrics, trainer.cv_scores)

## 8. Visualizations

In [None]:
# Actual vs. predicted values on the test split; the figure path is listed in
# the final report as lgbm_predictions.png.
plot_predictions(
    y_test, y_pred,
    title="LightGBM - Actual vs Predicted",
    save_path="lgbm_predictions.png"
)

In [None]:
# Residual diagnostics for the test-split predictions; the figure path is
# listed in the final report as lgbm_residuals.png.
plot_residuals(
    y_test, y_pred,
    save_path="lgbm_residuals.png"
)

In [None]:
# Number of features shown. Defined once so the table, the plot and the
# printed heading cannot drift apart.
TOP_N_FEATURES = 20

# Feature names come from the fitted preprocessor so the importances are
# labelled with the columns the model was trained on.
importance_df = trainer.get_feature_importance(
    preprocessor.selected_features, top_n=TOP_N_FEATURES
)
plot_feature_importance(
    importance_df,
    top_n=TOP_N_FEATURES,
    save_path="lgbm_feature_importance.png"
)

print(f"\nTop {TOP_N_FEATURES} Feature Importances:")
display(importance_df)

## 9. Save Model & Preprocessor

In [None]:
print("=" * 80)
print("SAVING MODEL & PREPROCESSOR")
print("=" * 80)

# Persist the tuned model and the fitted preprocessor through their own
# save() methods.
trainer.save('lightgbm_model.pkl')
print("Saved: lightgbm_model.pkl")

preprocessor.save('preprocessor.pkl')
print("Saved: preprocessor.pkl")

# Run metadata stored next to the model: feature list, split sizes, test
# metrics and the outcome of the hyperparameter search.
metadata = {
    'model_name': 'LightGBM',
    'n_features': len(preprocessor.selected_features),
    'feature_names': preprocessor.selected_features,
    'n_train': len(X_train),
    'n_val': len(X_val),
    'n_test': len(X_test),
    'test_metrics': metrics,
    'best_params': trainer.best_params,
    'cv_scores': trainer.cv_scores
}

# joblib writes a pickle-based file; only load it back from trusted sources.
joblib.dump(metadata, 'lightgbm_metadata.pkl')
print("Saved: lightgbm_metadata.pkl")

print("\nAll files saved successfully!")

## 10. Final Report

In [None]:
print("=" * 80)
print("LIGHTGBM - FINAL REPORT")
print("=" * 80)

# Build the plain-text report in three parts: dataset/feature/model summary,
# one line per tuned hyperparameter, then test performance and output files.
# Split percentages are relative to the cleaned dataset.
# NOTE(review): "5-Fold" is hard-coded here and must match the cv_folds value
# passed to LightGBMTrainer in the training cell.
report = f"""
Dataset:
  Total samples:      {len(df):,}
  After cleaning:     {len(df_clean):,}
  Train:              {len(train):,} ({len(train)/len(df_clean)*100:.1f}%)
  Validation:         {len(val):,} ({len(val)/len(df_clean)*100:.1f}%)
  Test:               {len(test):,} ({len(test)/len(df_clean)*100:.1f}%)

Features:
  Selected features:  {len(preprocessor.selected_features)}
  Correlation threshold: {preprocessor.corr_threshold}

Model:
  Algorithm:          LightGBM
  Validation:         5-Fold Cross-Validation
  Hyperparameter tuning: GridSearchCV

Best Parameters:
"""

for param, value in trainer.best_params.items():
    report += f"  {param}: {value}\n"

# NOTE(review): the "€...M" formatting assumes market_value is expressed in
# millions of euros — confirm against the dataset.
report += f"""
Performance:
  Test R²:            {metrics['r2']:.4f}
  Test RMSE:          €{metrics['rmse']:.2f}M
  Test MAE:           €{metrics['mae']:.2f}M
  CV R² (mean±std):   {trainer.cv_scores.mean():.4f} ± {trainer.cv_scores.std():.4f}

Files Generated:
  - lightgbm_model.pkl
  - preprocessor.pkl
  - lightgbm_metadata.pkl
  - lgbm_predictions.png
  - lgbm_residuals.png
  - lgbm_feature_importance.png
"""

print(report)

# The report contains non-ASCII characters (², €, ±). Write it as UTF-8
# explicitly, because the platform default encoding may be unable to encode
# them or may produce a file other tools misread.
with open('lightgbm_report.txt', 'w', encoding='utf-8') as f:
    f.write(report)

print("\nSaved: lightgbm_report.txt")
print("=" * 80)