In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
import joblib

In [3]:
# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
# Load and clean data
df = pd.read_csv("Unseen_data.csv", encoding='latin1')
df.columns = df.columns.str.strip()

In [5]:
# Handle missing values
print(f'Initial Missing Values:\n{df.isnull().sum()}')

Initial Missing Values:
Rk                   0
Edited Name          0
Original Name        0
Matching Name        0
Market Value Euro    0
Nation               0
Pos                  0
Squad                0
Comp                 0
Born                 0
MP                   0
90s                  0
Goals                0
Shots                0
SoT                  0
G/Sh                 0
G/SoT                0
ShoDist              0
ShoFK                0
ShoPK                0
PKatt                0
PasTotCmp            0
PasTotAtt            0
PasTotDist           0
PasTotPrgDist        0
PasShoCmp            0
PasShoAtt            0
PasMedCmp            0
PasMedAtt            0
PasLonCmp            0
PasLonAtt            0
Assists              0
PasAss               0
Pas3rd               0
PPA                  0
CrsPA                0
PasProg              0
PasLive              0
PasDead              0
PasFK                0
TB                   0
Sw                   0
PasCrs    

In [6]:
# Encode with LabelEncoder
le = LabelEncoder()
df['Pos_Cat'] = le.fit_transform(df['Pos'])
df['Nation_Cat'] = le.fit_transform(df['Nation'])
df['Squad_Cat'] = le.fit_transform(df['Squad'])
df['Comp_Cat'] = le.fit_transform(df['Comp'])

In [7]:
# Select numeric columns
numeric_df = df.select_dtypes(include=[np.number])

In [8]:
# Extract numeric features (assuming numeric_df is already defined)
features_to_scale = numeric_df.drop(columns=['Pos_Cat', 'Nation_Cat', 'Squad_Cat', 'Comp_Cat'])
# Scale features using RobustScaler
scaler = RobustScaler()
scaled_features = scaler.fit_transform(features_to_scale)

In [9]:
# No need to shift scaled features as RobustScaler already scales them
scaled_numeric_df = pd.DataFrame(scaled_features, columns=features_to_scale.columns)

In [10]:
# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [11]:
# Add non-scaled columns
# scaled_numeric_df['Market Value Euro'] = df['Market Value Euro']
scaled_numeric_df['Pos_Cat'] = df['Pos_Cat']
scaled_numeric_df['Nation_Cat'] = df['Nation_Cat']
scaled_numeric_df['Squad_Cat'] = df['Squad_Cat']
scaled_numeric_df['Comp_Cat'] = df['Comp_Cat']

In [12]:
# Drop 'Rk' if exists in the dataset
if 'Rk' in scaled_numeric_df.columns:
    scaled_numeric_df = scaled_numeric_df.drop('Rk', axis=1)

In [13]:
scaled_numeric_df.head()

Unnamed: 0,Market Value Euro,Born,MP,90s,Goals,Shots,SoT,G/Sh,G/SoT,ShoDist,ShoFK,ShoPK,PKatt,PasTotCmp,PasTotAtt,PasTotDist,PasTotPrgDist,PasShoCmp,PasShoAtt,PasMedCmp,PasMedAtt,PasLonCmp,PasLonAtt,Assists,PasAss,Pas3rd,PPA,CrsPA,PasProg,PasLive,PasDead,PasFK,TB,Sw,PasCrs,CK,CkIn,CkOut,CkStr,TI,PasCmp,PasOff,PasBlocks,SCA,ScaPassLive,ScaPassDead,ScaDrib,ScaSh,ScaFld,ScaDef,GCA,GcaPassLive,GcaPassDead,GcaDrib,GcaSh,GcaFld,GcaDef,Tkl,TklWon,TklDef3rd,TklMid3rd,TklAtt3rd,TklDri,TklDriAtt,TklDriPast,Blocks,BlkSh,BlkPass,Int,Clr,Err,Touches,TouDefPen,TouDef3rd,TouMid3rd,TouAtt3rd,TouAttPen,TouLive,Carries,CarTotDist,CarPrgDist,CarProg,Car3rd,CPA,CarMis,CarDis,Rec,RecProg,CrdY,CrdR,2CrdY,Fls,Fld,Off,Crs,TklW,PKwon,PKcon,OG,Recov,AerWon,AerLost,Goals Per Match,ToAtt,ToSuc,ToTkl,Pos_Cat,Nation_Cat,Squad_Cat,Comp_Cat
0,0.0,0.571429,0.7,1.075676,-0.1875,-0.334426,-0.266667,-0.181818,-0.242424,0.52809,0.0,0.0,0.0,0.068807,0.131915,-0.045377,0.214727,0.430233,0.363636,0.008591,0.076271,-0.136161,0.032234,0.5,-0.117647,-0.248276,0.569948,0.62069,0.123348,-0.169154,2.134663,0.148699,0.666667,-0.248,0.403509,0.0,0.0,0.0,0.0,6.757143,0.068807,0.65,0.669903,-0.266667,-0.311787,0.5,0.692308,0.642857,0.428571,0.0,0.137931,0.45,0.0,0.0,0.0,0.0,0.0,0.348754,0.202247,0.87234,-0.015625,-0.529412,0.932331,0.301587,-0.181818,1.196172,1.133333,1.052023,0.270073,0.507317,0.0,0.171946,0.851371,0.644961,-0.131387,-0.044826,-0.242812,-0.112532,0.023669,0.310859,0.55665,0.631068,0.511811,0.446429,-0.120879,-0.0625,-0.07362,-0.232824,0.310345,0.0,0.0,-0.336449,0.569106,-0.043478,0.403509,0.202247,0.0,0.06,0.03,-0.501809,-0.50289,0.082707,0.0,0.0,0.0,0.0,0,32,67,3
1,0.0,-1.285714,0.7,1.135135,0.1875,-0.24918,-0.066667,0.818182,0.757576,0.325843,0.0,0.0,0.0,0.284404,0.217021,0.512143,0.625632,-0.418605,-0.545455,0.80756,0.754237,0.441964,0.379487,0.0,-0.460784,0.16092,-0.414508,-0.241379,0.026432,0.308458,0.244389,1.345725,0.0,0.608,-0.385965,0.0,0.0,0.0,0.0,0.114286,0.284404,0.3,-0.174757,-0.565333,-0.631179,0.0,0.692308,-0.071429,0.0,1.285714,-0.310345,-0.35,0.0,0.03,0.0,0.0,0.0,0.142349,0.460674,0.531915,0.0,-0.441176,-0.225564,-0.484127,-0.472727,0.411483,1.355556,0.0,1.262774,1.0,0.0,0.140271,1.780664,1.234109,0.007299,-0.948823,-0.261981,0.230179,0.130178,0.365414,0.454844,-0.239482,-0.220472,-0.285714,-0.335165,-0.444444,0.165644,-0.408397,-0.034483,0.03,0.0,-0.028037,-0.333333,-0.173913,-0.385965,0.460674,0.0,0.03,0.0,-0.202654,0.393064,-0.090226,0.0,0.0,0.0,0.0,0,60,74,2
2,0.0,0.571429,0.55,0.827027,0.0625,-0.170492,-0.066667,0.363636,0.363636,0.505618,0.0,0.0,0.0,1.073394,0.812766,0.932893,0.043845,1.011628,0.897727,1.099656,0.940678,0.680804,0.18315,0.0,-0.156863,0.326437,-0.11399,-0.103448,0.132159,1.119403,-0.326683,-0.04461,0.444444,-0.08,-0.210526,0.0,0.0,0.0,0.0,-0.078571,1.073394,-0.1,-0.213592,-0.122667,-0.121673,0.0,0.0,0.785714,1.071429,1.0,-0.275862,-0.15,0.0,0.0,0.0,0.0,0.0,0.241993,0.292135,0.180851,0.484375,0.0,0.225564,0.571429,0.787879,-0.430622,-0.311111,-0.16185,0.350365,-0.292683,0.0,0.733032,-0.02886,0.607752,1.182482,-0.351139,-0.335463,1.017903,1.183432,0.795677,0.303777,-0.20712,-0.220472,-0.017857,-0.120879,0.298611,1.263804,-0.148855,0.965517,0.11,0.07,0.28972,0.113821,-0.173913,-0.210526,0.292135,0.0,0.0,0.0,0.277443,-0.265896,-0.451128,0.0,0.0,0.0,0.0,2,41,22,2
3,0.0,-0.428571,0.7,1.118919,-0.1875,-0.006557,-0.016667,-0.181818,-0.242424,0.764045,0.0,0.0,0.0,0.376147,0.33617,0.394972,0.165261,0.290698,0.25,0.369416,0.29661,0.575893,0.414652,0.5,0.196078,0.813793,0.0,0.37931,0.669604,0.557214,-0.296758,0.0,2.0,0.472,0.040936,0.157895,0.0,0.0,0.0,-0.078571,0.376147,0.6,0.601942,0.170667,0.18251,0.5,0.230769,-0.071429,1.5,2.571429,0.103448,0.25,0.0,0.0,0.0,0.03,0.0,1.352313,1.404494,0.797872,1.734375,0.558824,1.894737,2.809524,2.860606,0.229665,-0.2,0.578035,0.861314,-0.395122,0.0,0.334842,-0.415584,0.049612,1.072993,-0.298842,-0.386581,0.557545,0.721893,0.027792,-0.022989,-0.200647,0.094488,-0.125,-0.115385,0.298611,0.576687,-0.122137,0.37931,0.0,0.0,0.065421,0.756098,-0.043478,0.040936,1.404494,0.0,0.0,0.0,0.303981,-0.526012,-0.789474,0.0,0.0,0.0,0.0,2,36,51,2
4,0.0,0.571429,-0.95,-0.627027,-0.1875,-0.603279,-0.366667,-0.181818,-0.242424,-1.775281,0.0,0.0,0.0,-1.307339,-1.27234,-1.16617,-0.992693,-1.372093,-1.386364,-0.945017,-0.855932,-0.819196,-0.961172,0.0,-0.696078,-0.965517,-0.601036,-0.241379,-1.171806,-1.283582,-0.508728,-0.475836,0.0,-0.72,0.748538,0.0,0.0,0.0,0.0,-0.185714,-1.307339,-0.45,-1.029126,-0.901333,-0.95057,0.0,0.0,-0.285714,0.0,0.0,-0.413793,-0.35,0.0,0.0,0.0,0.0,0.0,-1.188612,-1.101124,-0.787234,-0.9375,-0.529412,-0.81203,0.452381,1.442424,0.535885,-0.4,1.052023,-1.007299,-0.560976,0.0,-1.366516,-0.608947,-0.8,-1.138686,0.179305,1.383387,-1.329923,-0.91716,-0.52702,-1.146141,-1.12945,0.779528,-0.285714,3.813187,-0.715278,-1.079755,0.28626,-0.551724,0.0,0.0,0.626168,-0.926829,-0.173913,0.748538,-1.101124,0.0,0.0,0.0,-0.870929,0.381503,-1.112782,0.0,0.0,0.0,0.0,1,36,77,2


In [14]:
# Create a mapping between 'pos' and 'pos_cat'
pos_mapping = df[['Pos', 'Pos_Cat']].drop_duplicates().sort_values(by='Pos_Cat').reset_index(drop=True)

print("Mapping between 'pos' and 'pos_cat':")
print(pos_mapping)

Mapping between 'pos' and 'pos_cat':
  Pos  Pos_Cat
0  DF        0
1  FW        1
2  MF        2


In [15]:
import numpy as np
import pandas as pd

# Sample data: scaled_numeric_df is your dataframe with scaled numeric features

# Set the threshold for collinearity
threshold = 0.95

# Compute the correlation matrix
corr_matrix = scaled_numeric_df.corr().abs()

# Initialize the set of columns to keep
columns_to_keep = set(scaled_numeric_df.columns)

# Loop through the correlation matrix and identify columns to drop
for i in range(len(corr_matrix.columns)):
    for j in range(i + 1, len(corr_matrix.columns)):
        if corr_matrix.iloc[i, j] > threshold:
            colname_i = corr_matrix.columns[i]
            colname_j = corr_matrix.columns[j]
            # Check which column to drop based on the current set of columns to keep
            if colname_i in columns_to_keep and colname_j in columns_to_keep:
                # Drop one of the columns (e.g., column j)
                columns_to_keep.remove(colname_j)

# Create a new dataframe with only the selected columns
scaled_numeric_df = scaled_numeric_df[list(columns_to_keep)]

# Output the result
print(f"Dropped columns due to high correlation: {set(scaled_numeric_df.columns) - columns_to_keep}")
print(f"Remaining columns: {columns_to_keep}")

Dropped columns due to high correlation: set()
Remaining columns: {'GcaDef', 'TouDef3rd', 'TouDefPen', 'PasTotPrgDist', 'ShoFK', 'Pas3rd', 'AerWon', 'Pos_Cat', 'BlkSh', 'Err', 'PasFK', 'GcaPassLive', 'PKatt', 'BlkPass', 'ScaFld', 'ScaDrib', 'ShoPK', 'Carries', '90s', 'ScaSh', 'TouMid3rd', 'CPA', 'CarPrgDist', 'PasLonAtt', 'Squad_Cat', 'TI', 'ScaPassDead', 'CkOut', 'OG', 'ScaPassLive', 'ScaDef', 'TB', 'PasTotCmp', 'TklDriAtt', 'TklDriPast', 'PasOff', 'PKcon', 'CrdY', 'Fld', 'PasProg', 'Sw', 'Nation_Cat', 'Clr', 'TklDri', 'G/Sh', 'TouAttPen', 'Shots', 'CrdR', 'Recov', 'ToAtt', 'Int', 'Off', 'RecProg', 'TklMid3rd', 'CkIn', 'PasBlocks', 'GcaDrib', 'Assists', 'Market Value Euro', 'PasShoCmp', 'PasCrs', 'PKwon', 'CkStr', 'ToTkl', 'GcaPassDead', 'TklAtt3rd', 'Comp_Cat', 'PasAss', 'ToSuc', 'Blocks', 'PasMedCmp', 'Goals Per Match', 'ShoDist', 'GCA', 'G/SoT', 'PasLonCmp', '2CrdY', 'CarTotDist', 'CarDis', 'TouAtt3rd', 'Goals', 'CrsPA', 'TklWon', 'CK', 'CarProg', 'GcaSh', 'Tkl', 'AerLost', 'GcaFld

In [16]:
scaled_numeric_df.head()

Unnamed: 0,GcaDef,TouDef3rd,TouDefPen,PasTotPrgDist,ShoFK,Pas3rd,AerWon,Pos_Cat,BlkSh,Err,PasFK,GcaPassLive,PKatt,BlkPass,ScaFld,ScaDrib,ShoPK,Carries,90s,ScaSh,TouMid3rd,CPA,CarPrgDist,PasLonAtt,Squad_Cat,TI,ScaPassDead,CkOut,OG,ScaPassLive,ScaDef,TB,PasTotCmp,TklDriAtt,TklDriPast,PasOff,PKcon,CrdY,Fld,PasProg,Sw,Nation_Cat,Clr,TklDri,G/Sh,TouAttPen,Shots,CrdR,Recov,ToAtt,Int,Off,RecProg,TklMid3rd,CkIn,PasBlocks,GcaDrib,Assists,Market Value Euro,PasShoCmp,PasCrs,PKwon,CkStr,ToTkl,GcaPassDead,TklAtt3rd,Comp_Cat,PasAss,ToSuc,Blocks,PasMedCmp,Goals Per Match,ShoDist,GCA,G/SoT,PasLonCmp,2CrdY,CarTotDist,CarDis,TouAtt3rd,Goals,CrsPA,TklWon,CK,CarProg,GcaSh,Tkl,AerLost,GcaFld,MP,PasDead,SoT,TklDef3rd,CarMis,PPA,Fls,SCA,Born,Touches,Car3rd
0,0.0,0.644961,0.851371,0.214727,0.0,-0.248276,-0.50289,0,1.133333,0.0,0.148699,0.45,0.0,1.052023,0.428571,0.692308,0.0,0.023669,1.075676,0.642857,-0.131387,0.446429,0.55665,0.032234,67,6.757143,0.5,0.0,0.03,-0.311787,0.0,0.666667,0.068807,0.301587,-0.181818,0.65,0.06,0.310345,0.569106,0.123348,-0.248,32,0.507317,0.932331,-0.181818,-0.242812,-0.334426,0.0,-0.501809,0.0,0.270073,-0.043478,-0.232824,-0.015625,0.0,0.669903,0.0,0.5,0.0,0.430233,0.403509,0.0,0.0,0.0,0.0,-0.529412,3,-0.117647,0.0,1.196172,0.008591,0.0,0.52809,0.137931,-0.242424,-0.136161,0.0,0.310859,-0.0625,-0.044826,-0.1875,0.62069,0.202247,0.0,0.631068,0.0,0.348754,0.082707,0.0,0.7,2.134663,-0.266667,0.87234,-0.120879,0.569948,-0.336449,-0.266667,0.571429,0.171946,0.511811
1,0.0,1.234109,1.780664,0.625632,0.0,0.16092,0.393064,0,1.355556,0.0,1.345725,-0.35,0.0,0.0,0.0,0.692308,0.0,0.130178,1.135135,-0.071429,0.007299,-0.285714,0.454844,0.379487,74,0.114286,0.0,0.0,0.0,-0.631179,1.285714,0.0,0.284404,-0.484127,-0.472727,0.3,0.03,-0.034483,-0.333333,0.026432,0.608,60,1.0,-0.225564,0.818182,-0.261981,-0.24918,0.03,-0.202654,0.0,1.262774,-0.173913,-0.408397,0.0,0.0,-0.174757,0.03,0.0,0.0,-0.418605,-0.385965,0.0,0.0,0.0,0.0,-0.441176,2,-0.460784,0.0,0.411483,0.80756,0.0,0.325843,-0.310345,0.757576,0.441964,0.0,0.365414,-0.444444,-0.948823,0.1875,-0.241379,0.460674,0.0,-0.239482,0.0,0.142349,-0.090226,0.0,0.7,0.244389,-0.066667,0.531915,-0.335165,-0.414508,-0.028037,-0.565333,-1.285714,0.140271,-0.220472
2,0.0,0.607752,-0.02886,0.043845,0.0,0.326437,-0.265896,2,-0.311111,0.0,-0.04461,-0.15,0.0,-0.16185,1.071429,0.0,0.0,1.183432,0.827027,0.785714,1.182482,-0.017857,0.303777,0.18315,22,-0.078571,0.0,0.0,0.0,-0.121673,1.0,0.444444,1.073394,0.571429,0.787879,-0.1,0.0,0.965517,0.113821,0.132159,-0.08,41,-0.292683,0.225564,0.363636,-0.335463,-0.170492,0.11,0.277443,0.0,0.350365,-0.173913,-0.148855,0.484375,0.0,-0.213592,0.0,0.0,0.0,1.011628,-0.210526,0.0,0.0,0.0,0.0,0.0,2,-0.156863,0.0,-0.430622,1.099656,0.0,0.505618,-0.275862,0.363636,0.680804,0.07,0.795677,0.298611,-0.351139,0.0625,-0.103448,0.292135,0.0,-0.20712,0.0,0.241993,-0.451128,0.0,0.55,-0.326683,-0.066667,0.180851,-0.120879,-0.11399,0.28972,-0.122667,0.571429,0.733032,-0.220472
3,0.0,0.049612,-0.415584,0.165261,0.0,0.813793,-0.526012,2,-0.2,0.0,0.0,0.25,0.0,0.578035,1.5,0.230769,0.0,0.721893,1.118919,-0.071429,1.072993,-0.125,-0.022989,0.414652,51,-0.078571,0.5,0.0,0.0,0.18251,2.571429,2.0,0.376147,2.809524,2.860606,0.6,0.0,0.37931,0.756098,0.669604,0.472,36,-0.395122,1.894737,-0.181818,-0.386581,-0.006557,0.0,0.303981,0.0,0.861314,-0.043478,-0.122137,1.734375,0.0,0.601942,0.0,0.5,0.0,0.290698,0.040936,0.0,0.0,0.0,0.0,0.558824,2,0.196078,0.0,0.229665,0.369416,0.0,0.764045,0.103448,-0.242424,0.575893,0.0,0.027792,0.298611,-0.298842,-0.1875,0.37931,1.404494,0.157895,-0.200647,0.0,1.352313,-0.789474,0.03,0.7,-0.296758,-0.016667,0.797872,-0.115385,0.0,0.065421,0.170667,-0.428571,0.334842,0.094488
4,0.0,-0.8,-0.608947,-0.992693,0.0,-0.965517,0.381503,1,-0.4,0.0,-0.475836,-0.35,0.0,1.052023,0.0,0.0,0.0,-0.91716,-0.627027,-0.285714,-1.138686,-0.285714,-1.146141,-0.961172,77,-0.185714,0.0,0.0,0.0,-0.95057,0.0,0.0,-1.307339,0.452381,1.442424,-0.45,0.0,-0.551724,-0.926829,-1.171806,-0.72,36,-0.560976,-0.81203,-0.181818,1.383387,-0.603279,0.0,-0.870929,0.0,-1.007299,-0.173913,0.28626,-0.9375,0.0,-1.029126,0.0,0.0,0.0,-1.372093,0.748538,0.0,0.0,0.0,0.0,-0.529412,2,-0.696078,0.0,0.535885,-0.945017,0.0,-1.775281,-0.413793,-0.242424,-0.819196,0.0,-0.52702,-0.715278,0.179305,-0.1875,-0.241379,-1.101124,0.0,-1.12945,0.0,-1.188612,-1.112782,0.0,-0.95,-0.508728,-0.366667,-0.787234,3.813187,-0.601036,0.626168,-0.901333,0.571429,-1.366516,0.779528


In [17]:
# Save the DataFrame to a CSV file
scaled_numeric_df.to_csv('scaled_numeric_unseen.csv', index=False)

In [18]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, matthews_corrcoef, log_loss


# Load the processed unseen data
unseen_data = pd.read_csv('scaled_numeric_unseen.csv')

# Extract the labels if available
y_test_true = unseen_data['Pos_Cat']

# Drop non-feature columns if they exist
X_unseen = unseen_data.drop(columns=['Pos_Cat', 'Nation_Cat', 'Squad_Cat', 'Comp_Cat'], errors='ignore')

# Ensure top n features
top_n_features = [
    'TouDef3rd',
    'TI',
    'TouDefPen',
    'PasTotPrgDist',
    'Clr',
    'PasDead',
    'PasMedCmp',
    'TouAtt3rd',
    'TouMid3rd',
    'Shots',
    'TouAttPen',
    'RecProg',
    'PasTotCmp',
    'Pas3rd',
    'CarMis',
    'Touches',
    'SCA',
    'PasFK',
    'PasLonAtt'
]

# Ensure only top n features are used
X_unseen = X_unseen[top_n_features]


# Load the trained XGBoost model
best_xgb_model = joblib.load('XGBoost_best_model.pkl')


# Make predictions on the unseen data
y_unseen_pred = best_xgb_model.predict(X_unseen)

# Convert predictions to DataFrame
predictions_df = pd.DataFrame({
    'Predicted_Pos_Cat': y_unseen_pred
})

# Add actual results to predictions DataFrame if available
if 'Pos_Cat' in unseen_data.columns:
    results_comparison = predictions_df.copy()
    results_comparison['Actual_Pos_Cat'] = y_test_true.reset_index(drop=True)

    # Display the comparison
    print(results_comparison.head())

    # Extract actual labels and predicted labels
    y_true = results_comparison['Actual_Pos_Cat']
    y_pred = results_comparison['Predicted_Pos_Cat']

    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy:.4f}')

    # Calculate confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print('Confusion Matrix:')
    print(cm)

    # Print classification report
    report = classification_report(y_true, y_pred)
    print('Classification Report:')
    print(report)

   Predicted_Pos_Cat  Actual_Pos_Cat
0                  0               0
1                  0               0
2                  2               2
3                  2               2
4                  1               1
Accuracy: 0.8672
Confusion Matrix:
[[987  10  46]
 [  6 619 130]
 [ 32 135 738]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      1043
           1       0.81      0.82      0.82       755
           2       0.81      0.82      0.81       905

    accuracy                           0.87      2703
   macro avg       0.86      0.86      0.86      2703
weighted avg       0.87      0.87      0.87      2703



In [20]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, matthews_corrcoef, log_loss

# Load the processed unseen data
unseen_data = pd.read_csv('scaled_numeric_unseen.csv')

# Extract the labels if available
y_test_true = unseen_data['Pos_Cat']

# Drop non-feature columns if they exist
X_unseen = unseen_data.drop(columns=['Pos_Cat', 'Nation_Cat', 'Squad_Cat', 'Comp_Cat'], errors='ignore')

# Ensure top n features
top_n_features =  [
    'TouDef3rd',
    'TI',
    'TouDefPen',
    'PasTotPrgDist',
    'Clr',
    'PasDead',
    'PasMedCmp',
    'TouAtt3rd',
    'TouMid3rd',
    'Shots',
    'TouAttPen',
    'RecProg',
    'PasTotCmp',
    'Pas3rd',
    'CarMis',
    'Touches',
    'SCA',
    'PasFK',
    'PasLonAtt'
]

# Ensure only top n features are used
X_unseen = X_unseen[top_n_features]

# Load the trained XGBoost model
best_xgb_model = joblib.load('XGBoost_best_model.pkl')

# Make predictions on the unseen data
y_unseen_pred = best_xgb_model.predict(X_unseen)
y_unseen_pred_proba = best_xgb_model.predict_proba(X_unseen)

# Convert predictions to DataFrame
predictions_df = pd.DataFrame({
    'Predicted_Pos_Cat': y_unseen_pred
})

# Add actual results to predictions DataFrame if available
if 'Pos_Cat' in unseen_data.columns:
    results_comparison = predictions_df.copy()
    results_comparison['Actual_Pos_Cat'] = y_test_true.reset_index(drop=True)

    # Display the comparison
    print(results_comparison.head())

    # Extract actual labels and predicted labels
    y_true = results_comparison['Actual_Pos_Cat']
    y_pred = results_comparison['Predicted_Pos_Cat']

    # Calculate accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy:.4f}')

    # Calculate confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    print('Confusion Matrix:')
    print(cm)


    # Calculate ROC AUC score
    roc_auc = roc_auc_score(pd.get_dummies(y_true), y_unseen_pred_proba, multi_class='ovr')
    print(f'ROC AUC: {roc_auc:.4f}')

    # Calculate Matthews Correlation Coefficient (MCC)
    mcc = matthews_corrcoef(y_true, y_pred)
    print(f'Matthews Correlation Coefficient (MCC): {mcc:.4f}')

    # Calculate Log Loss
    logloss = log_loss(y_true, y_unseen_pred_proba)
    print(f'Log Loss: {logloss:.4f}')

   Predicted_Pos_Cat  Actual_Pos_Cat
0                  0               0
1                  0               0
2                  2               2
3                  2               2
4                  1               1
Accuracy: 0.8672
Confusion Matrix:
[[987  10  46]
 [  6 619 130]
 [ 32 135 738]]
ROC AUC: 0.9688
Matthews Correlation Coefficient (MCC): 0.7993
Log Loss: 0.3180
