In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import joblib

In [2]:
# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
# Load and clean data
df = pd.read_csv("Unseen_data.csv", encoding='latin1')
df.columns = df.columns.str.strip()

In [4]:
# Handle missing values
print(f'Initial Missing Values:\n{df.isnull().sum()}')

Initial Missing Values:
Rk                   0
Edited Name          0
Original Name        0
Matching Name        0
Market Value Euro    0
Nation               0
Pos                  0
Squad                0
Comp                 0
Born                 0
MP                   0
90s                  0
Goals                0
Shots                0
SoT                  0
G/Sh                 0
G/SoT                0
ShoDist              0
ShoFK                0
ShoPK                0
PKatt                0
PasTotCmp            0
PasTotAtt            0
PasTotDist           0
PasTotPrgDist        0
PasShoCmp            0
PasShoAtt            0
PasMedCmp            0
PasMedAtt            0
PasLonCmp            0
PasLonAtt            0
Assists              0
PasAss               0
Pas3rd               0
PPA                  0
CrsPA                0
PasProg              0
PasLive              0
PasDead              0
PasFK                0
TB                   0
Sw                   0
PasCrs    

In [5]:
# Encode with LabelEncoder
le = LabelEncoder()
df['Pos_Cat'] = le.fit_transform(df['Pos'])
df['Nation_Cat'] = le.fit_transform(df['Nation'])
df['Squad_Cat'] = le.fit_transform(df['Squad'])
df['Comp_Cat'] = le.fit_transform(df['Comp'])

In [6]:
# Select numeric columns
numeric_df = df.select_dtypes(include=[np.number])

In [7]:
# Scale features using RobustScaler
scaler = RobustScaler()
features_to_scale = numeric_df.drop(columns=['Pos_Cat','Nation_Cat','Squad_Cat','Comp_Cat'])
scaled_features = scaler.fit_transform(features_to_scale)
scaled_numeric_df = pd.DataFrame(scaled_features, columns=features_to_scale.columns)

In [8]:
# Shift the scaled data to ensure no negative values
min_value = scaled_features.min()
shifted_scaled_features = scaled_features - min_value

In [9]:
# Convert back to DataFrame
scaled_numeric_df = pd.DataFrame(shifted_scaled_features, columns=features_to_scale.columns)

In [10]:
# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [11]:
# Save the min_value for future use
np.save('min_value.npy', min_value)

In [12]:
# Add non-scaled columns
# scaled_numeric_df['Market Value Euro'] = df['Market Value Euro']
scaled_numeric_df['Pos_Cat'] = df['Pos_Cat']
scaled_numeric_df['Nation_Cat'] = df['Nation_Cat']
scaled_numeric_df['Squad_Cat'] = df['Squad_Cat']
scaled_numeric_df['Comp_Cat'] = df['Comp_Cat']

In [13]:
# Drop 'Rk' if exists in the dataset
if 'Rk' in scaled_numeric_df.columns:
    scaled_numeric_df = scaled_numeric_df.drop('Rk', axis=1)

In [14]:
scaled_numeric_df.head()

Unnamed: 0,Market Value Euro,Born,MP,90s,Goals,Shots,SoT,G/Sh,G/SoT,ShoDist,ShoFK,ShoPK,PKatt,PasTotCmp,PasTotAtt,PasTotDist,PasTotPrgDist,PasShoCmp,PasShoAtt,PasMedCmp,PasMedAtt,PasLonCmp,PasLonAtt,Assists,PasAss,Pas3rd,PPA,CrsPA,PasProg,PasLive,PasDead,PasFK,TB,Sw,PasCrs,CK,CkIn,CkOut,CkStr,TI,PasCmp,PasOff,PasBlocks,SCA,ScaPassLive,ScaPassDead,ScaDrib,ScaSh,ScaFld,ScaDef,GCA,GcaPassLive,GcaPassDead,GcaDrib,GcaSh,GcaFld,GcaDef,Tkl,TklWon,TklDef3rd,TklMid3rd,TklAtt3rd,TklDri,TklDriAtt,TklDriPast,Blocks,BlkSh,BlkPass,Int,Clr,Err,Touches,TouDefPen,TouDef3rd,TouMid3rd,TouAtt3rd,TouAttPen,TouLive,Carries,CarTotDist,CarPrgDist,CarProg,Car3rd,CPA,CarMis,CarDis,Rec,RecProg,CrdY,CrdR,2CrdY,Fls,Fld,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,Goals Per Match,ToAtt,ToSuc,ToTkl,Pos_Cat,Nation_Cat,Squad_Cat,Comp_Cat
0,2.557545,3.128973,3.257545,3.63322,2.370045,2.223119,2.290878,2.375727,2.315121,3.085635,2.557545,2.557545,2.557545,2.626352,2.68946,2.512168,2.772272,2.987777,2.921181,2.566136,2.633816,2.421384,2.589779,3.057545,2.439898,2.309269,3.127493,3.178234,2.680893,2.388391,4.692208,2.706244,3.224211,2.309545,2.961054,2.557545,2.557545,2.557545,2.557545,9.314688,2.626352,3.207545,3.227448,2.290878,2.245758,3.057545,3.249852,3.200402,2.986116,2.557545,2.695476,3.007545,2.557545,2.557545,2.557545,2.557545,2.557545,2.906299,2.759792,3.429885,2.54192,2.028133,3.489876,2.859132,2.375727,3.753717,3.690878,3.609568,2.827618,3.064862,2.557545,2.72949,3.408916,3.202506,2.426158,2.512718,2.314733,2.445013,2.581213,2.868404,3.114195,3.188613,3.069356,3.003973,2.436666,2.495045,2.483925,2.32472,2.86789,2.557545,2.557545,2.221096,3.12665,2.514066,2.961054,2.759792,2.557545,2.617545,2.587545,2.055735,2.054655,2.640252,2.557545,2.557545,2.557545,2.557545,0,32,67,3
1,2.557545,1.27183,3.257545,3.69268,2.745045,2.308364,2.490878,3.375727,3.315121,2.883387,2.557545,2.557545,2.557545,2.841948,2.774566,3.069688,3.183177,2.13894,2.01209,3.365105,3.311782,2.999509,2.937032,2.557545,2.09676,2.718464,2.143037,2.316165,2.583976,2.866002,2.801934,3.90327,2.557545,3.165545,2.17158,2.557545,2.557545,2.557545,2.557545,2.67183,2.841948,2.857545,2.382787,1.992211,1.926366,2.557545,3.249852,2.486116,2.557545,3.843259,2.2472,2.207545,2.557545,2.587545,2.557545,2.557545,2.557545,2.699894,3.018219,3.08946,2.557545,2.116368,2.331981,2.073418,2.084817,2.969028,3.9131,2.557545,3.820318,3.557545,2.557545,2.697816,4.338209,3.791653,2.564844,1.608721,2.295564,2.787724,2.687722,2.922959,3.012389,2.318063,2.337072,2.27183,2.22238,2.1131,2.723189,2.149148,2.523062,2.587545,2.557545,2.529507,2.224211,2.383632,2.17158,3.018219,2.557545,2.587545,2.557545,2.354891,2.950608,2.467319,2.557545,2.557545,2.557545,2.557545,0,60,74,2
2,2.557545,3.128973,3.107545,3.384572,2.620045,2.387053,2.490878,2.921181,2.921181,3.063163,2.557545,2.557545,2.557545,3.630939,3.370311,3.490438,2.60139,3.569173,3.455272,3.657201,3.498223,3.238348,2.740695,2.557545,2.400682,2.883982,2.443555,2.454096,2.689703,3.676948,2.230861,2.512935,3.001989,2.477545,2.347018,2.557545,2.557545,2.557545,2.557545,2.478973,3.630939,2.457545,2.343953,2.434878,2.435872,2.557545,2.557545,3.343259,3.628973,3.557545,2.281683,2.407545,2.557545,2.557545,2.557545,2.557545,2.557545,2.799538,2.84968,2.738396,3.04192,2.557545,2.783109,3.128973,3.345424,2.126923,2.246434,2.395695,2.90791,2.264862,2.557545,3.290576,2.528685,3.165297,3.740027,2.206405,2.222081,3.575448,3.740977,3.353222,2.861321,2.350425,2.337072,2.539688,2.436666,2.856156,3.821348,2.40869,3.523062,2.667545,2.627545,2.847264,2.671366,2.383632,2.347018,2.84968,2.557545,2.557545,2.557545,2.834987,2.291649,2.106417,2.557545,2.557545,2.557545,2.557545,2,41,22,2
3,2.557545,2.128973,3.257545,3.676464,2.370045,2.550987,2.540878,2.375727,2.315121,3.32159,2.557545,2.557545,2.557545,2.933692,2.893715,2.952517,2.722806,2.848242,2.807545,2.926961,2.854155,3.133438,2.972197,3.057545,2.753623,3.371338,2.557545,2.936855,3.227148,3.114759,2.260787,2.557545,4.557545,3.029545,2.59848,2.715439,2.557545,2.557545,2.557545,2.478973,2.933692,3.157545,3.159487,2.728211,2.740054,3.057545,2.788314,2.486116,4.057545,5.128973,2.660993,2.807545,2.557545,2.557545,2.557545,2.587545,2.557545,3.909858,3.962039,3.355417,4.29192,3.116368,4.452282,5.367069,5.418151,2.78721,2.357545,3.135579,3.418859,2.162423,2.557545,2.892386,2.14196,2.607157,3.630537,2.258703,2.170963,3.11509,3.279438,2.585337,2.534556,2.356898,2.652033,2.432545,2.44216,2.856156,3.134232,2.435407,2.936855,2.557545,2.557545,2.622965,3.313642,2.514066,2.59848,3.962039,2.557545,2.557545,2.557545,2.861525,2.031533,1.768071,2.557545,2.557545,2.557545,2.557545,2,36,51,2
4,2.557545,3.128973,1.607545,1.930518,2.370045,1.954266,2.190878,2.375727,2.315121,0.782264,2.557545,2.557545,2.557545,1.250205,1.285204,1.391375,1.564852,1.185452,1.171181,1.612528,1.701613,1.738348,1.596373,2.557545,1.861466,1.592028,1.956508,2.316165,1.385739,1.273963,2.048817,2.081708,2.557545,1.837545,3.306083,2.557545,2.557545,2.557545,2.557545,2.37183,1.250205,2.107545,1.528419,1.656211,1.606974,2.557545,2.557545,2.27183,2.557545,2.557545,2.143752,2.207545,2.557545,2.557545,2.557545,2.557545,2.557545,1.368933,1.456421,1.770311,1.620045,2.028133,1.745515,3.009926,3.999969,3.09343,2.157545,3.609568,1.550245,1.996569,2.557545,1.191029,1.948598,1.757545,1.418859,2.73685,3.940931,1.227621,1.640385,2.030525,1.411404,1.428095,3.337072,2.27183,6.370732,1.842267,1.47779,2.843804,2.005821,2.557545,2.557545,3.183713,1.630715,2.383632,3.306083,1.456421,2.557545,2.557545,2.557545,1.686616,2.939048,1.444763,2.557545,2.557545,2.557545,2.557545,1,36,77,2


In [15]:
# Create a mapping between 'pos' and 'pos_cat'
pos_mapping = df[['Pos', 'Pos_Cat']].drop_duplicates().sort_values(by='Pos_Cat').reset_index(drop=True)

print("Mapping between 'pos' and 'pos_cat':")
print(pos_mapping)

Mapping between 'pos' and 'pos_cat':
  Pos  Pos_Cat
0  DF        0
1  FW        1
2  MF        2


In [16]:
import numpy as np
import pandas as pd

# Sample data: scaled_numeric_df is your dataframe with scaled numeric features

# Set the threshold for collinearity
threshold = 0.95

# Compute the correlation matrix
corr_matrix = scaled_numeric_df.corr().abs()

# Initialize the set of columns to keep
columns_to_keep = set(scaled_numeric_df.columns)

# Loop through the correlation matrix and identify columns to drop
for i in range(len(corr_matrix.columns)):
    for j in range(i + 1, len(corr_matrix.columns)):
        if corr_matrix.iloc[i, j] > threshold:
            colname_i = corr_matrix.columns[i]
            colname_j = corr_matrix.columns[j]
            # Check which column to drop based on the current set of columns to keep
            if colname_i in columns_to_keep and colname_j in columns_to_keep:
                # Drop one of the columns (e.g., column j)
                columns_to_keep.remove(colname_j)

# Create a new dataframe with only the selected columns
scaled_numeric_df = scaled_numeric_df[list(columns_to_keep)]

# Output the result
print(f"Dropped columns due to high correlation: {set(scaled_numeric_df.columns) - columns_to_keep}")
print(f"Remaining columns: {columns_to_keep}")

Dropped columns due to high correlation: set()
Remaining columns: {'Recov', 'TklAtt3rd', 'BlkPass', 'TklWon', 'PasFK', 'Comp_Cat', 'PasLonAtt', 'TouMid3rd', 'ToAtt', 'GcaSh', 'GcaDrib', 'BlkSh', 'TouAttPen', 'PasShoCmp', 'CK', 'ShoPK', 'PKcon', 'PasAss', 'GcaDef', 'PasTotPrgDist', 'ScaPassLive', 'TB', 'PasCrs', 'Nation_Cat', 'G/Sh', 'ScaDef', 'Pos_Cat', 'Fld', 'TklDriPast', 'SoT', 'PasDead', 'G/SoT', 'ScaSh', 'TklDef3rd', 'PasLonCmp', 'PasMedCmp', 'ScaFld', 'GCA', 'Goals Per Match', 'CrdY', 'TklDriAtt', 'GcaFld', 'ScaDrib', 'CkStr', 'TI', 'Off', 'GcaPassDead', 'AerLost', 'ShoFK', 'Int', 'TouDefPen', 'OG', 'ShoDist', 'PasOff', 'CarDis', 'Carries', 'Pas3rd', 'TklMid3rd', 'CkIn', 'Assists', 'MP', 'TouDef3rd', 'GcaPassLive', 'PPA', 'PasTotCmp', 'Tkl', 'PasBlocks', 'Car3rd', 'Touches', 'PKatt', 'CPA', 'ScaPassDead', 'AerWon', 'CarProg', 'CarTotDist', 'Born', '2CrdY', 'CkOut', 'TklDri', 'SCA', 'Blocks', 'Goals', 'Clr', 'ToSuc', 'Err', 'PKwon', 'Shots', 'Sw', 'ToTkl', 'TouAtt3rd', 'RecProg', 

In [17]:
scaled_numeric_df.head()

Unnamed: 0,Recov,TklAtt3rd,BlkPass,TklWon,PasFK,Comp_Cat,PasLonAtt,TouMid3rd,ToAtt,GcaSh,GcaDrib,BlkSh,TouAttPen,PasShoCmp,CK,ShoPK,PKcon,PasAss,GcaDef,PasTotPrgDist,ScaPassLive,TB,PasCrs,Nation_Cat,G/Sh,ScaDef,Pos_Cat,Fld,TklDriPast,SoT,PasDead,G/SoT,ScaSh,TklDef3rd,PasLonCmp,PasMedCmp,ScaFld,GCA,Goals Per Match,CrdY,TklDriAtt,GcaFld,ScaDrib,CkStr,TI,Off,GcaPassDead,AerLost,ShoFK,Int,TouDefPen,OG,ShoDist,PasOff,CarDis,Carries,Pas3rd,TklMid3rd,CkIn,Assists,MP,TouDef3rd,GcaPassLive,PPA,PasTotCmp,Tkl,PasBlocks,Car3rd,Touches,PKatt,CPA,ScaPassDead,AerWon,CarProg,CarTotDist,Born,2CrdY,CkOut,TklDri,SCA,Blocks,Goals,Clr,ToSuc,Err,PKwon,Shots,Sw,ToTkl,TouAtt3rd,RecProg,Fls,90s,CrdR,CrsPA,CarMis,CarPrgDist,Market Value Euro,Squad_Cat,PasProg
0,2.055735,2.028133,3.609568,2.759792,2.706244,3,2.589779,2.426158,2.557545,2.557545,2.557545,3.690878,2.314733,2.987777,2.557545,2.557545,2.617545,2.439898,2.557545,2.772272,2.245758,3.224211,2.961054,32,2.375727,2.557545,0,3.12665,2.375727,2.290878,4.692208,2.315121,3.200402,3.429885,2.421384,2.566136,2.986116,2.695476,2.557545,2.86789,2.859132,2.557545,3.249852,2.557545,9.314688,2.514066,2.557545,2.640252,2.557545,2.827618,3.408916,2.587545,3.085635,3.207545,2.495045,2.581213,2.309269,2.54192,2.557545,3.057545,3.257545,3.202506,3.007545,3.127493,2.626352,2.906299,3.227448,3.069356,2.72949,2.557545,3.003973,3.057545,2.054655,3.188613,2.868404,3.128973,2.557545,2.557545,3.489876,2.290878,3.753717,2.370045,3.064862,2.557545,2.557545,2.557545,2.223119,2.309545,2.557545,2.512718,2.32472,2.221096,3.63322,2.557545,3.178234,2.436666,3.114195,2.557545,67,2.680893
1,2.354891,2.116368,2.557545,3.018219,3.90327,2,2.937032,2.564844,2.557545,2.557545,2.587545,3.9131,2.295564,2.13894,2.557545,2.557545,2.587545,2.09676,2.557545,3.183177,1.926366,2.557545,2.17158,60,3.375727,3.843259,0,2.224211,2.084817,2.490878,2.801934,3.315121,2.486116,3.08946,2.999509,3.365105,2.557545,2.2472,2.557545,2.523062,2.073418,2.557545,3.249852,2.557545,2.67183,2.383632,2.557545,2.467319,2.557545,3.820318,4.338209,2.557545,2.883387,2.857545,2.1131,2.687722,2.718464,2.557545,2.557545,2.557545,3.257545,3.791653,2.207545,2.143037,2.841948,2.699894,2.382787,2.337072,2.697816,2.557545,2.27183,2.557545,2.950608,2.318063,2.922959,1.27183,2.557545,2.557545,2.331981,1.992211,2.969028,2.745045,3.557545,2.557545,2.557545,2.557545,2.308364,3.165545,2.557545,1.608721,2.149148,2.529507,3.69268,2.587545,2.316165,2.22238,3.012389,2.557545,74,2.583976
2,2.834987,2.557545,2.395695,2.84968,2.512935,2,2.740695,3.740027,2.557545,2.557545,2.557545,2.246434,2.222081,3.569173,2.557545,2.557545,2.557545,2.400682,2.557545,2.60139,2.435872,3.001989,2.347018,41,2.921181,3.557545,2,2.671366,3.345424,2.490878,2.230861,2.921181,3.343259,2.738396,3.238348,3.657201,3.628973,2.281683,2.557545,3.523062,3.128973,2.557545,2.557545,2.557545,2.478973,2.383632,2.557545,2.106417,2.557545,2.90791,2.528685,2.557545,3.063163,2.457545,2.856156,3.740977,2.883982,3.04192,2.557545,2.557545,3.107545,3.165297,2.407545,2.443555,3.630939,2.799538,2.343953,2.337072,3.290576,2.557545,2.539688,2.557545,2.291649,2.350425,3.353222,3.128973,2.627545,2.557545,2.783109,2.434878,2.126923,2.620045,2.264862,2.557545,2.557545,2.557545,2.387053,2.477545,2.557545,2.206405,2.40869,2.847264,3.384572,2.667545,2.454096,2.436666,2.861321,2.557545,22,2.689703
3,2.861525,3.116368,3.135579,3.962039,2.557545,2,2.972197,3.630537,2.557545,2.557545,2.557545,2.357545,2.170963,2.848242,2.715439,2.557545,2.557545,2.753623,2.557545,2.722806,2.740054,4.557545,2.59848,36,2.375727,5.128973,2,3.313642,5.418151,2.540878,2.260787,2.315121,2.486116,3.355417,3.133438,2.926961,4.057545,2.660993,2.557545,2.936855,5.367069,2.587545,2.788314,2.557545,2.478973,2.514066,2.557545,1.768071,2.557545,3.418859,2.14196,2.557545,3.32159,3.157545,2.856156,3.279438,3.371338,4.29192,2.557545,3.057545,3.257545,2.607157,2.807545,2.557545,2.933692,3.909858,3.159487,2.652033,2.892386,2.557545,2.432545,3.057545,2.031533,2.356898,2.585337,2.128973,2.557545,2.557545,4.452282,2.728211,2.78721,2.370045,2.162423,2.557545,2.557545,2.557545,2.550987,3.029545,2.557545,2.258703,2.435407,2.622965,3.676464,2.557545,2.936855,2.44216,2.534556,2.557545,51,3.227148
4,1.686616,2.028133,3.609568,1.456421,2.081708,2,1.596373,1.418859,2.557545,2.557545,2.557545,2.157545,3.940931,1.185452,2.557545,2.557545,2.557545,1.861466,2.557545,1.564852,1.606974,2.557545,3.306083,36,2.375727,2.557545,1,1.630715,3.999969,2.190878,2.048817,2.315121,2.27183,1.770311,1.738348,1.612528,2.557545,2.143752,2.557545,2.005821,3.009926,2.557545,2.557545,2.557545,2.37183,2.383632,2.557545,1.444763,2.557545,1.550245,1.948598,2.557545,0.782264,2.107545,1.842267,1.640385,1.592028,1.620045,2.557545,2.557545,1.607545,1.757545,2.207545,1.956508,1.250205,1.368933,1.528419,3.337072,1.191029,2.557545,2.27183,2.557545,2.939048,1.428095,2.030525,3.128973,2.557545,2.557545,1.745515,1.656211,3.09343,2.370045,1.996569,2.557545,2.557545,2.557545,1.954266,1.837545,2.557545,2.73685,2.843804,3.183713,1.930518,2.557545,2.316165,6.370732,1.411404,2.557545,77,1.385739


In [18]:
# Save the DataFrame to a CSV file
scaled_numeric_df.to_csv('scaled_numeric_unseen.csv', index=False)

In [19]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, matthews_corrcoef, log_loss


# Load the processed unseen data
unseen_data = pd.read_csv('scaled_numeric_unseen.csv')

# Extract the labels if available
y_test_true = unseen_data['Pos_Cat']

# Drop non-feature columns if they exist
X_unseen = unseen_data.drop(columns=['Pos_Cat', 'Nation_Cat', 'Squad_Cat', 'Comp_Cat'], errors='ignore')

# Ensure top 13 features
top_13_features = [
    'TouDef3rd', 'TI', 'Clr', 'PasTotPrgDist', 'PasDead',
    'TouDefPen', 'PasMedCmp', 'TouAtt3rd', 'RecProg',
    'TouAttPen', 'PasTotCmp', 'TouMid3rd', 'Shots'
]

# Ensure only top 13 features are used
X_unseen = X_unseen[top_13_features]


# Load the trained XGBoost model
best_xgb_model = joblib.load('best_xgboost_model.pkl')


# Make predictions on the unseen data
y_unseen_pred = best_xgb_model.predict(X_unseen)

# Convert predictions to DataFrame
predictions_df = pd.DataFrame({
    'Predicted_Pos_Cat': y_unseen_pred
})

# Add actual results to predictions DataFrame if available
if 'Pos_Cat' in unseen_data.columns:
    results_comparison = predictions_df.copy()
    results_comparison['Actual_Pos_Cat'] = y_test_true.reset_index(drop=True)

    # Display the comparison
    print(results_comparison.head())

    # Extract actual labels and predicted labels
    y_true = results_comparison['Actual_Pos_Cat']
    y_pred = results_comparison['Predicted_Pos_Cat']

    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy:.4f}')

    # Calculate confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print('Confusion Matrix:')
    print(cm)

    # Print classification report
    report = classification_report(y_true, y_pred)
    print('Classification Report:')
    print(report)

    # Optionally, save the predictions
    predictions_df.to_csv('xgb_predictions.csv', index=False)

    # Save metrics
    with open('xgb_confusion_matrix.txt', 'w') as f:
        f.write(str(cm))

    with open('xgb_classification_report.txt', 'w') as f:
        f.write(report)
else:
    print(predictions_df.head())
    predictions_df.to_csv('xgb_predictions.csv', index=False)

   Predicted_Pos_Cat  Actual_Pos_Cat
0                  0               0
1                  0               0
2                  2               2
3                  2               2
4                  1               1
Accuracy: 0.8531
Confusion Matrix:
[[991  20  32]
 [ 11 606 138]
 [ 67 129 709]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1043
           1       0.80      0.80      0.80       755
           2       0.81      0.78      0.79       905

    accuracy                           0.85      2703
   macro avg       0.85      0.85      0.85      2703
weighted avg       0.85      0.85      0.85      2703



In [20]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, matthews_corrcoef, log_loss

# Load the processed unseen data
unseen_data = pd.read_csv('scaled_numeric_unseen.csv')

# Extract the labels if available
y_test_true = unseen_data['Pos_Cat']

# Drop non-feature columns if they exist
X_unseen = unseen_data.drop(columns=['Pos_Cat', 'Nation_Cat', 'Squad_Cat', 'Comp_Cat'], errors='ignore')

# Ensure top 13 features
top_13_features = [
    'TouDef3rd', 'TI', 'Clr', 'PasTotPrgDist', 'PasDead',
    'TouDefPen', 'PasMedCmp', 'TouAtt3rd', 'RecProg',
    'TouAttPen', 'PasTotCmp', 'TouMid3rd', 'Shots'
]

# Ensure only top 13 features are used
X_unseen = X_unseen[top_13_features]

# Load the trained XGBoost model
best_xgb_model = joblib.load('best_xgboost_model.pkl')

# Make predictions on the unseen data
y_unseen_pred = best_xgb_model.predict(X_unseen)
y_unseen_pred_proba = best_xgb_model.predict_proba(X_unseen)

# Convert predictions to DataFrame
predictions_df = pd.DataFrame({
    'Predicted_Pos_Cat': y_unseen_pred
})

# Add actual results to predictions DataFrame if available
if 'Pos_Cat' in unseen_data.columns:
    results_comparison = predictions_df.copy()
    results_comparison['Actual_Pos_Cat'] = y_test_true.reset_index(drop=True)

    # Display the comparison
    print(results_comparison.head())

    # Extract actual labels and predicted labels
    y_true = results_comparison['Actual_Pos_Cat']
    y_pred = results_comparison['Predicted_Pos_Cat']

    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy:.4f}')

    # Calculate confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print('Confusion Matrix:')
    print(cm)

    # Print classification report
    report = classification_report(y_true, y_pred)
    print('Classification Report:')
    print(report)

    # Calculate ROC AUC score
    roc_auc = roc_auc_score(pd.get_dummies(y_true), y_unseen_pred_proba, multi_class='ovr')
    print(f'ROC AUC: {roc_auc:.4f}')

    # Calculate Matthews Correlation Coefficient (MCC)
    mcc = matthews_corrcoef(y_true, y_pred)
    print(f'Matthews Correlation Coefficient (MCC): {mcc:.4f}')

    # Calculate Log Loss
    logloss = log_loss(y_true, y_unseen_pred_proba)
    print(f'Log Loss: {logloss:.4f}')

    # Optionally, save the predictions
    predictions_df.to_csv('xgb_predictions.csv', index=False)

    # Save metrics
    with open('xgb_confusion_matrix.txt', 'w') as f:
        f.write(str(cm))

    with open('xgb_classification_report.txt', 'w') as f:
        f.write(report)

    with open('xgb_performance_metrics.txt', 'w') as f:
        f.write(f'Accuracy: {accuracy:.4f}\n')
        f.write(f'ROC AUC: {roc_auc:.4f}\n')
        f.write(f'Matthews Correlation Coefficient (MCC): {mcc:.4f}\n')
        f.write(f'Log Loss: {logloss:.4f}\n')
else:
    print(predictions_df.head())
    predictions_df.to_csv('xgb_predictions.csv', index=False)


   Predicted_Pos_Cat  Actual_Pos_Cat
0                  0               0
1                  0               0
2                  2               2
3                  2               2
4                  1               1
Accuracy: 0.8531
Confusion Matrix:
[[991  20  32]
 [ 11 606 138]
 [ 67 129 709]]
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1043
           1       0.80      0.80      0.80       755
           2       0.81      0.78      0.79       905

    accuracy                           0.85      2703
   macro avg       0.85      0.85      0.85      2703
weighted avg       0.85      0.85      0.85      2703

ROC AUC: 0.9609
Matthews Correlation Coefficient (MCC): 0.7777
Log Loss: 0.3651
