In [1]:
!pip install xgboost



In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, f1_score, accuracy_score
from sklearn.metrics import make_scorer, f1_score, accuracy_score, mean_absolute_error  # Import mean_absolute_error
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [4]:
# Load datasets
def load_data(
    data_dir='/content/drive/MyDrive/CollabData/Player_Prediction',
    seasons=('20_21', '21_22', '22_23', '23_24', '24_25'),
):
    """Read one per-season player-stats CSV for every entry in ``seasons``.

    Parameters
    ----------
    data_dir : str or path-like
        Directory containing the ``df_<season>.csv`` files. The default is the
        path that used to be hard-coded here.
    seasons : sequence of str
        Season tags; each one is read from ``<data_dir>/df_<season>.csv``.

    Returns
    -------
    tuple of pandas.DataFrame
        One frame per season, in ``seasons`` order. With the default arguments
        this is the same 5-tuple as before (20_21, 21_22, 22_23, 23_24, 24_25).

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when a season file does not exist.
    """
    # Strip a trailing slash so both "dir" and "dir/" yield the same file path.
    base = str(data_dir).rstrip('/')
    return tuple(pd.read_csv(f'{base}/df_{season}.csv') for season in seasons)

In [5]:
# Preprocess data for XGBoost
def preprocess_data_xgb(df):
    """Derive the XGBoost feature matrix and regression target from player stats.

    Parameters
    ----------
    df : pandas.DataFrame
        Player rows containing at least the columns ``Player``, ``MP``,
        ``Gls``, ``Ast``, ``PK``, ``xG``, ``xAG``, ``npxG``, ``PrgC``,
        ``PrgP``, ``PrgR``, ``Tkl``, ``Int`` and ``Blocks``. The caller's
        frame is not modified; all work happens on a copy.

    Returns
    -------
    tuple (pandas.DataFrame, pandas.Series)
        The feature columns and the ``Performance_Index`` target, both on a
        fresh ``0..n-1`` index.

    Raises
    ------
    KeyError
        If any required column is absent; the message lists every missing one.
    """
    # reset_index() returns a new frame (so the input is never mutated) and
    # replaces duplicated labels such as those produced by pd.concat.
    df = df.reset_index()

    required = (
        "Player", "MP", "Gls", "Ast", "PK", "xG", "xAG", "npxG",
        "PrgC", "PrgP", "PrgR", "Tkl", "Int", "Blocks",
    )
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"preprocess_data_xgb: missing required columns {missing}")

    # Missing stats are treated as zero contributions.
    df = df.fillna(0)

    # Feature engineering: combined goal-contribution columns.
    df['G+A'] = df['Gls'] + df['Ast']
    df['G+A-PK'] = df['G+A'] - df['PK']
    df['xG+xAG'] = df['xG'] + df['xAG']
    df['npxG+xAG'] = df['npxG'] + df['xAG']

    # Weighted Performance Index (weights sum to 1.0).
    df['Performance_Index'] = (
        df['G+A-PK'] * 0.35 +    # Emphasis on actual goal contributions
        df['xG+xAG'] * 0.25 +    # Expected goal contributions
        df['PrgC'] * 0.15 +      # Progressive carries
        df['PrgP'] * 0.15 +      # Progressive passes
        df['PrgR'] * 0.1         # Progressive receptions
    )

    # Row-to-row change of the index within each player, in the frame's row
    # order; a player's first row has no predecessor and gets 0.
    trend = df.groupby('Player')['Performance_Index'].diff().fillna(0)

    # Future potential: playing time (MP scaled by mean MP + 1) plus the trend.
    df['Future_Potential'] = (
        (df['MP'] / (df['MP'].mean() + 1)) * 0.4 +
        trend * 0.6
    )

    # NOTE(review): "Performance_Index" is also the target returned below and
    # "Future_Potential" is derived from it, so the model is given its own
    # label as an input (target leakage). The column set is kept unchanged for
    # compatibility - confirm whether these two should be dropped.
    features = [
        "MP", "Gls", "Ast", "G+A-PK", "xG", "xAG", "xG+xAG",
        "npxG", "npxG+xAG", "PrgC", "PrgP", "PrgR", "Tkl", "Int", "Blocks",
        "Performance_Index", "Future_Potential"
    ]

    return df[features], df["Performance_Index"]

In [6]:
# Train XGBoost model
def train_xgb(X_train, y_train, **xgb_params):
    """Fit an ``xgb.XGBRegressor`` on the training data and return it.

    Parameters
    ----------
    X_train : array-like
        Training features, passed straight to ``XGBRegressor.fit``.
    y_train : array-like
        Training target, passed straight to ``XGBRegressor.fit``.
    **xgb_params
        Optional keyword overrides forwarded to ``xgb.XGBRegressor`` (for
        example ``random_state=42`` or ``n_estimators=300``). Anything not
        overridden keeps the defaults below, so ``train_xgb(X, y)`` builds
        the same model as before.

    Returns
    -------
    xgb.XGBRegressor
        The fitted model.
    """
    params = {
        'objective': 'reg:squarederror',  # Regression objective
        'n_estimators': 100,              # Number of boosting rounds (trees)
        'learning_rate': 0.1,             # Step size shrinkage used in updates to prevent overfitting
        'max_depth': 3,                   # Maximum depth of a tree
        'subsample': 0.8,                 # Subsample ratio of the training instances
        'colsample_bytree': 0.8,          # Subsample ratio of columns when constructing each tree
    }
    # Caller-supplied values win over the defaults.
    params.update(xgb_params)

    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train)
    return model

In [7]:
def evaluate_xgboost(model, X_test, y_test, df_test=None):
    """Predict on ``X_test`` and return the predictions as a NumPy array.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
        The fitted model (e.g. the regressor returned by ``train_xgb``).
    X_test : array-like
        Features to predict on.
    y_test : array-like
        Accepted for interface compatibility; not used by this function.
    df_test : sized object, optional
        When given, the predictions are aligned to ``len(df_test)``: a shorter
        prediction array is padded by repeating its last value, a longer one
        is truncated. When omitted, the raw predictions are returned.

    Returns
    -------
    numpy.ndarray
        The (optionally length-aligned) predictions.
    """
    # model.predict already returns host-side values; the former
    # model.eval() / torch.no_grad() / .cpu() calls were PyTorch idioms that
    # raise on an XGBoost estimator and on NumPy arrays.
    predictions = np.asarray(model.predict(X_test))

    if df_test is None:
        return predictions

    # Row count of the test frame; no particular column is required.
    num_test_samples = len(df_test)

    if len(predictions) < num_test_samples:
        predictions = np.pad(predictions, (0, num_test_samples - len(predictions)), mode='edge')
    elif len(predictions) > num_test_samples:
        predictions = predictions[:num_test_samples]  # Truncate excess values
    return predictions


# Alias kept for call sites that use the shorter name.
evaluate_xgb = evaluate_xgboost

In [8]:
# Main workflow
def main():
    """Train on every season but the last, predict the last, report results.

    Loads the season frames, fits the XGBoost regressor on all but the final
    one, predicts the final one, passes predictions and actual target values
    to ``display_metrics``, and prints the predictions.
    """
    # load_data() returns the seasons in chronological order; the last frame
    # is held out as the test season.
    *train_frames, df_test = load_data()
    df_train = pd.concat(train_frames)

    X_train, y_train = preprocess_data_xgb(df_train)
    X_test, y_test = preprocess_data_xgb(df_test)

    model = train_xgb(X_train, y_train)
    predictions = evaluate_xgboost(model, X_test, y_test, df_test)

    # The model is fitted on y_train (Performance_Index), so it must be scored
    # against the same quantity on the test season - not against raw 'G+A'.
    actual = y_test.to_numpy()

    # NOTE(review): display_metrics is not defined in the visible part of this
    # notebook - confirm it is defined or imported before this cell runs.
    display_metrics(predictions, actual)

    print(predictions)

In [None]:
# Execute the workflow only when this code runs as the top-level program
# (__name__ == "__main__"); importing it as a module does not call main().
if __name__ == "__main__":
    main()