In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from sklearn.datasets import load_iris, load_digits, load_wine, load_breast_cancer

# Install extra third-party packages into the kernel's environment.
# None of these packages is imported in the import list above, and no versions
# are pinned, so the installed releases can differ from run to run.
# NOTE(review): presumably needed by later cells -- confirm, pin versions, and
# restart the kernel after installing (see the message printed below).
%pip install tensorflow xgboost lightgbm catboost

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Load the Brasil Serie A table and discard its 'Age' column in one step.
# The frame is rebuilt from the CSV on every run, so this cell can be re-run safely.
brasil_df = (
    pd.read_csv('data/Fixed_Brasil_SerieA.csv')
    .drop(columns=['Age'])
)
brasil_df.head()

Unnamed: 0,Player,Gls/90,G/Sh,G/SoT,SoT%,SoT/90,Sh/90,G-PK/90,PK/90,PKatt/90,Dist,MP,Min,90s,Starts,Subs,unSub
0,Abel Hernández,0.3,0.13,0.33,37.5,0.9,2.3,0.3,0.0,0.0,12.1,23.0,925.0,10.3,10.0,13.0,5.0
1,Adailson Dadá,0.1,0.04,0.18,22.9,0.4,1.7,0.1,0.0,0.0,23.1,33.0,2508.0,27.9,30.0,3.0,0.0
2,Ademir Santos,0.275,0.0975,0.2675,37.15,0.875,2.4,0.225,0.05,0.05,17.95,32.5,1732.5,19.25,18.25,14.25,1.25
3,Adrián Martínez,0.4,0.15,0.4,37.0,0.9,2.4,0.4,0.0,0.0,13.1,22.0,1000.0,11.1,10.0,12.0,4.0
4,Adson,0.15,0.095,0.29,34.8,0.65,1.9,0.15,0.0,0.0,19.25,22.5,1231.5,13.7,13.5,9.0,2.0


In [3]:
# Load the Big5 forwards table.
# thousands=',' makes pandas parse values written like "2,458" as numbers
# instead of text. A column read as text is not numeric dtype, and the
# per-player averaging cell further down keeps only numeric columns.
# NOTE(review): the aggregated preview below has no 'Min' column although the
# raw frame does, which suggests 'Min' is currently read as text -- confirm
# with big5_df.dtypes after loading.
big5_df = pd.read_csv('data/Big5CombinedForwards.csv', thousands=',')
# Show every column when displaying wide frames (applies to the whole session).
pd.set_option('display.max_columns', None)
big5_df.head()

Unnamed: 0,Rk,Player,Gls/90,G/Sh,G/SoT,SoT%,SoT/90,Sh/90,PK/90,PKatt/90,Dist,Season,Age,Nation,Team,Comp,MP,Min,90s,Starts,Subs,unSub,Pos
0,1,Robert Lewandowski,1.5,0.26,0.59,43.8,2.1,4.7,0.3,0.3,13.8,2020-2021,31,pl POL,Bayern Munich,de Bundesliga,29,2458,27.3,28,1,0,FW
1,2,Luis Muriel,1.4,0.24,0.56,42.9,2.3,5.3,0.1,0.2,18.0,2020-2021,29,co COL,Atalanta,it Serie A,36,1436,16.0,16,20,1,FW
2,3,Paco Alcácer,1.3,0.33,0.65,51.0,1.9,3.8,0.1,0.1,16.0,2018-2019,24,es ESP,Dortmund,de Bundesliga,26,1211,13.5,11,15,2,FW
3,4,Mateo Retegui,1.3,0.29,0.72,39.7,1.6,4.1,0.1,0.2,13.1,2024-2025,25,it ITA,Atalanta,it Serie A,23,1371,15.2,19,4,0,FW
4,5,Luis Muriel,1.3,0.17,0.35,48.6,2.4,5.0,0.4,0.5,19.9,2019-2020,28,co COL,Atalanta,it Serie A,34,1260,14.0,10,24,1,FW


In [4]:
# Remove the descriptive columns listed below, keeping the player name and
# the statistics columns.
# errors='ignore' keeps this cell re-runnable: big5_df is reassigned from
# itself, so without it a second run raises KeyError because the columns
# have already been removed.
big5_df = big5_df.drop(
    columns=['Comp', 'Nation', 'Team', 'Season', 'Rk', 'Pos', 'Age'],
    errors='ignore',
)
big5_df.head()

Unnamed: 0,Player,Gls/90,G/Sh,G/SoT,SoT%,SoT/90,Sh/90,PK/90,PKatt/90,Dist,MP,Min,90s,Starts,Subs,unSub
0,Robert Lewandowski,1.5,0.26,0.59,43.8,2.1,4.7,0.3,0.3,13.8,29,2458,27.3,28,1,0
1,Luis Muriel,1.4,0.24,0.56,42.9,2.3,5.3,0.1,0.2,18.0,36,1436,16.0,16,20,1
2,Paco Alcácer,1.3,0.33,0.65,51.0,1.9,3.8,0.1,0.1,16.0,26,1211,13.5,11,15,2
3,Mateo Retegui,1.3,0.29,0.72,39.7,1.6,4.1,0.1,0.2,13.1,23,1371,15.2,19,4,0
4,Luis Muriel,1.3,0.17,0.35,48.6,2.4,5.0,0.4,0.5,19.9,34,1260,14.0,10,24,1


In [7]:
# Collapse the Big5 table to one row per player: when a player is listed more
# than once, each numeric column is replaced by its mean over that player's rows.
# Only numeric-dtype columns are aggregated; any other column except 'Player'
# is not carried into the result.
big5_df = (
    big5_df
    .groupby('Player')
    .agg(dict.fromkeys(big5_df.select_dtypes(include=np.number).columns, 'mean'))
    .reset_index()
)
big5_df.head()

Unnamed: 0,Player,Gls/90,G/Sh,G/SoT,SoT%,SoT/90,Sh/90,PK/90,PKatt/90,Dist,MP,90s,Starts,Subs,unSub
0,Aaron Connolly,0.2,0.08,0.23,34.2,0.9,2.7,0.0,0.0,16.2,24.0,14.0,14.0,10.0,4.0
1,Aaron Hunt,0.1,0.07,0.29,24.1,0.3,1.3,0.0,0.0,24.2,28.0,23.1,26.0,2.0,0.0
2,Aaron Lennon,0.1,0.17,0.5,33.3,0.2,0.7,0.0,0.0,20.1,28.0,17.2,17.0,11.0,7.0
3,Aaron Leya Iseka,0.3,0.13,0.345,37.3,0.6,1.6,0.1,0.15,14.95,24.5,13.6,13.5,11.0,5.0
4,Aaron Mooy,0.1,0.055,0.2,27.6,0.4,1.5,0.0,0.0,22.1,30.0,24.55,25.0,5.0,2.5
