Il presente Notebook illustra un esempio di Regressione, svolto con una Support Vector Machine Lineare.

In [1]:
import sklearn
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
data_1722 = pd.read_csv("../../dataset/Training Set 1722.csv")
data_2223 = pd.read_csv("../../dataset/Test Set 2223.csv")
data_total = pd.concat([data_1722, data_2223], axis=0)

# Caricamento e Preparazione del Dataset

## Feature Selection e Rimozione degli Outlier (Partite Giocate & Minuti Giocati)

In [5]:
data_total = data_total[["Player", "Pos", "Squad", "Age", "Season", "Goals", "xG", "Shots on Target", "Shots", "Att Pen", "Offsides",
                  "GCA", "Carries into Penalty Area", "PK Attempted", "PK Made", "Att 3rd", "GCA TO to Goal", "Take-Ons Attempted", "Take-Ons Successful",
                  "GCA Shot to Goal", "Goals Scored while on Pitch", "Carries into Final 1/3", "xGS while on Pitch", "Matches Played", "G/Shots on Target",
                   "G/Shot", "Minutes", "Shots on Target%", "Shots on Target/90", "Shots/90", "Mid 3rd", "Def 3rd", "Def Pen"]]

data_total = data_total.dropna(how="any")
data_total = data_total[(data_total["Matches Played"] >= 5) & (data_total["Minutes"] >= 343)]
data_total.reset_index(drop=True, inplace=True)

In [6]:
X_train = data_total[data_total["Season"] != 2223]
X_test = data_total[data_total["Season"] == 2223]

## Scaling

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [8]:
X_train_to_scale = X_train[["xG", "Shots on Target", "Shots", "Att Pen", "Offsides",
                  "GCA", "Carries into Penalty Area", "PK Attempted", "PK Made", "Att 3rd", "GCA TO to Goal", "Take-Ons Attempted", "Take-Ons Successful",
                  "GCA Shot to Goal", "Goals Scored while on Pitch", "Carries into Final 1/3", "xGS while on Pitch", "Matches Played", "G/Shots on Target",
                   "G/Shot", "Minutes", "Shots on Target%", "Shots on Target/90", "Shots/90", "Mid 3rd", "Def 3rd", "Def Pen"]]

X_test_to_scale = X_test[["xG", "Shots on Target", "Shots", "Att Pen", "Offsides",
                  "GCA", "Carries into Penalty Area", "PK Attempted", "PK Made", "Att 3rd", "GCA TO to Goal", "Take-Ons Attempted", "Take-Ons Successful",
                  "GCA Shot to Goal", "Goals Scored while on Pitch", "Carries into Final 1/3", "xGS while on Pitch", "Matches Played", "G/Shots on Target",
                   "G/Shot", "Minutes", "Shots on Target%", "Shots on Target/90", "Shots/90", "Mid 3rd", "Def 3rd", "Def Pen"]]

In [9]:
X_train_scaled = scaler.fit_transform(X_train_to_scale)
X_test_scaled = scaler.transform(X_test_to_scale)

## Riduzione di Dimensionalità

In [10]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 0.95)

In [11]:
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [12]:
print("Dimensioni del Training Set nello Spazio delle Feature: ", X_train_to_scale.shape)
print("Dimensioni del Training Set nello Spazio Ridotto: ", X_train_pca.shape)

print()

print("Dimensioni del Test Set nello Spazio delle Feature: ", X_test_to_scale.shape)
print("Dimensioni del Test Set nello Spazio Ridotto: ", X_test_pca.shape)

Dimensioni del Training Set nello Spazio delle Feature:  (10403, 27)
Dimensioni del Training Set nello Spazio Ridotto:  (10403, 12)

Dimensioni del Test Set nello Spazio delle Feature:  (2133, 27)
Dimensioni del Test Set nello Spazio Ridotto:  (2133, 12)


# AgglomerativeClustering

In [13]:
from sklearn.cluster import AgglomerativeClustering

In [14]:
agg_clustering = AgglomerativeClustering(n_clusters=5)

In [15]:
from datetime import datetime

In [16]:
start = datetime.now()
agg_clustering.fit(X_train_pca)
end = datetime.now()
print("Total Time: " + str(end-start))

Total Time: 0:00:03.042026


# Regressione con LinearSVR

La predizione per un nuovo giocatore viene calcolata con il seguente Algoritmo:

1. Dato il giocatore di input vengono recuperati tutti i suoi datapoint "vecchio", ovvero tutti i suoi dati relativi alle stagioni precedenti.
2. Per ognuno dei datapoint "vecchi" viene calcolata la distanza tra il datapoint di input ed il datapoint "vecchio".
3. Le distanze vengono normalizzate: a seconda di quanto dista il datapoint "nuovo" dal datapoint "vecchio", verranno recuperati un certo numero di neighbors del datapoint "vecchio" dal suo Cluster di appartenenza.
4. Viene costruito il per la Regressione: qui saranno presenti tutti i datapoint "vecchi" e tutti i loro neighbors, in proporzione rispetto alle distanze normalizzate.
5. Il Dataset ottenuto è utilizzato per addestrare una Support Vector Machine Lineare, utilizzata come Regressore.

In [17]:
from sklearn.neighbors import NearestNeighbors

In [18]:
def build_regression_dataset(vector, agg_clustering, verbose=False):
    
    ## STEP 1: Recupero i "Vecchi Datapoint"
    player_name = vector["Player"].values[0]
    old_neighbors = X_train[X_train["Player"] == player_name]
    if verbose:
        print("Ecco i dati precedenti sul Giocatore da te inserito:")
        display(old_neighbors)
    
    ## STEP 2: Calcolo la distanze del Nuovo Datapoint dai Vecchi Datapoint
    nn_tot = round(3*np.sqrt(X_train_scaled.shape[0]))
    
    vector = vector[["xG", "Shots on Target", "Shots", "Att Pen", "Offsides",
                  "GCA", "Carries into Penalty Area", "PK Attempted", "PK Made", "Att 3rd", "GCA TO to Goal", "Take-Ons Attempted", "Take-Ons Successful",
                  "GCA Shot to Goal", "Goals Scored while on Pitch", "Carries into Final 1/3", "xGS while on Pitch", "Matches Played", "G/Shots on Target",
                   "G/Shot", "Minutes", "Shots on Target%", "Shots on Target/90", "Shots/90", "Mid 3rd", "Def 3rd", "Def Pen"]]
    scaled_vector = scaler.transform(vector)

    distances = np.empty(shape=(old_neighbors.shape[0]))
    for idx, i in zip(old_neighbors.index.values, range(0, old_neighbors.shape[0])):
        neig = X_train_scaled[idx]
        distances[i] = np.linalg.norm((scaled_vector-neig), ord=2)
        
    ## STEP 3: Normalizzo le Distanze
    distances = 1/distances
    sum = np.sum(distances)
    for i in range(0, len(distances)):
        distances[i] = distances[i] / sum
    distances = np.round(distances*nn_tot, decimals=0).astype(int)
    nn_tot = np.sum(distances)
    
    ## STEP 4: Costruisco i Dataset per la Regressione
    X_reg_dataset = np.zeros(shape=(nn_tot,27)) # Contiene i dati (Feature scalate) relativi ai giocatori identificati come "simili" 
    y_reg_dataset = np.zeros(shape=(nn_tot,1))  # Contiene i gol segnati dai giocatori identificati come "simili"
    
    ds_counter = 0
    for idx,dist in zip(old_neighbors.index.values, range(0,len(distances))):
        # Ogni datapoint "vecchio" del giocatore di input è considerabile come un suo "simile".
        X_reg_dataset[ds_counter] = X_train_scaled[idx]
        y_reg_dataset[ds_counter] = old_neighbors.loc[idx]["Goals"]
        ds_counter += 1
        
        # Recuperiamo tutto il Cluster a cui appartiene il datapoint "vecchio" che stiamo considerando.
        label = agg_clustering.labels_[idx]
        indexes = np.where(agg_clustering.labels_ == label)[0].tolist()
        cluster = X_train_scaled[indexes]
        
        # A seconda della distanza "giocatore di input-datapoint vecchio" recuperiamo i Nearest Neighbors del datapoint vecchio.
        neighbors_finder = NearestNeighbors(n_neighbors=distances[dist], metric='minkowski', p=2)
        neighbors_finder.fit(cluster)
        
        neighbors_found = neighbors_finder.kneighbors(X_reg_dataset[ds_counter-1].reshape(1, -1), return_distance=False)[0]
        neighbors_found = neighbors_found[1:] # Si rimuove il primo neighbor, perché è il datapoint vecchio stesso (che è già nel dataset)
        
        for i in neighbors_found:
            dataset_index = indexes[i]
            X_reg_dataset[ds_counter] = X_train_scaled[dataset_index]
            y_reg_dataset[ds_counter] = X_train.loc[dataset_index]["Goals"]
            ds_counter += 1
            
    return scaled_vector, X_reg_dataset, y_reg_dataset

In [19]:
from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Testing su 200 Giocatori
I seguenti giocatori sono stati selezionati in modo casuale.

In [20]:
giocatori_da_testare = ['Achraf Hakimi', 'Adam Lallana', 'Adam Marušić', 'Ademola Lookman', 'Adrien Rabiot', 'Aleksandar Kolarov',
'Aleksandr Golovin', 'Aleksei Miranchuk', 'Alessandro Bastoni', 'Alexandre Lacazette', 'Álvaro Morata', 'André Schürrle', 'André Silva',
'Andrea Belotti', 'Andreas Cornelius', 'Ángel Di María', 'Ante Rebić', 'Antoine Griezmann', 'Antonín Barák', 'Antonio Candreva',
'Antonio Sanabria', 'Arjen Robben', 'Arkadiusz Milik', 'Armando Izzo', 'Ashley Young', 'Axel Witsel', 'Aymeric Laporte',
'Brahim Díaz', 'Bram Nuytinck', 'Breel Embolo', 'Bruno Fernandes', 'Bryan Cristante', 'Bukayo Saka', 'Çağlar Söyüncü',
'Carlos Soler', 'Casemiro', 'César Azpilicueta', 'Charalambos Lykogiannis', 'Chris Smalling', 'Christian Eriksen', 'Christopher Nkunku',
'Cristian Ansaldi', 'Cristiano Biraghi', 'Cristiano Ronaldo', 'Dan-Axel Zagadou', 'Danilo', 'Danny da Costa', 'Darko Lazović', 'David Luiz',
'Davide Faraoni', 'Davide Zappacosta', 'Diego Laxalt', 'Domenico Berardi', 'Domenico Criscito', 'Duván Zapata', 'Eden Hazard',
'Edin Džeko', 'Edinson Cavani', 'Emil Forsberg', 'Emre Can', 'Fabio Depaoli', 'Federico Bernardeschi', 'Federico Chiesa', 'Federico Dimarco',
'Felipe Anderson', 'Filip Đuričić', 'Filip Kostić', 'Francesco Acerbi', 'Gabriel Strefezza', 'Georginio Wijnaldum', 'Gerard Deulofeu',
'Gian Marco Ferrari', 'Gianluca Mancini', 'Giorgos Kyriakopoulos', 'Giovanni Simeone', 'Gleison Bremer', 'Gonzalo Higuaín',
'Hakan Çalhanoğlu', 'Hans Hateboer', 'Hirving Lozano', 'Houssem Aouar', 'İlkay Gündoğan', 'Ivan Perišić', 'Jack Grealish',
'Jakub Jankto', 'James Milner', 'James Rodríguez', 'James Ward-Prowse', 'Jens Stryger Larsen', 'Jeremie Boga', 'Jeremie Frimpong',
'João Cancelo', 'João Pedro', 'Joël Matip', 'Jordan Veretout', 'Jordi Alba', 'Jorginho', 'José Luis Palomino', 'Josip Brekalo',
'Josip Iličić', 'Juan Bernat', 'Julian Draxler', 'Julian Weigl', 'Kalidou Koulibaly', 'Karim Bellarabi', 'Kerem Demirbay',
'Kevin De Bruyne', 'Kevin Volland', 'Kingsley Coman', 'Krzysztof Piątek', 'Lautaro Martínez', 'Layvin Kurzawa', 'Lazar Samardzic',
'Leandro Paredes', 'Leon Bailey', 'Lionel Messi', 'Lorenzo De Silvestri', 'Lorenzo Pellegrini', 'Lorenzo Venuti', 'Lucas Hernández',
'Lucas Leiva', 'Lucas Martínez Quarta', 'Luis Alberto', 'Luis Muriel', 'Luiz Felipe', 'Luka Jović', 'Luka Modrić', 'Manuel Akanji',
'Manuel Lazzari', 'Manuel Locatelli', 'Marcel Sabitzer', 'Marcelo Brozović', 'Marco Asensio', 'Marco Reus', 'Mario Pašalić',
'Martin Ødegaard', 'Matteo Darmian', 'Matteo Pessina', 'Matteo Politano', 'Mattia Zaccagni', 'Mesut Özil', 'Michy Batshuayi',
'Miguel Veloso', 'Mikel Oyarzabal', 'Nadiem Amiri', 'Nemanja Matić', 'Neymar', 'Nico Schlotterbeck', 'Nicolò Zaniolo',
'Nikola Milenković', 'Nikola Vlašić', 'Oleksandr Zinchenko', 'Oscar Hiljemark', 'Ousmane Dembélé', 'Pablo Marí', 'Paco Alcácer',
'Papu Gómez', 'Paulo Dybala', 'Phil Foden', 'Pierre Højbjerg', 'Pierre Kalulu', 'Pierre-Emerick Aubameyang', 'Piotr Zieliński',
'Rade Krunić', 'Raphaël Guerreiro', 'Raphaël Varane', 'Renato Sanches','Riccardo Orsolini', 'Rick Karsdorp', 'Roberto Pereyra',
'Roberto Soriano', 'Robin Gosens', 'Rodrigo De Paul', 'Roger Ibanez', 'Romelu Lukaku', 'Ruslan Malinovskyi', 'Samuel Umtiti',
'Sandro Tonali', 'Saúl Ñíguez', 'Seko Fofana', 'Sergej Milinković-Savić', 'Sergio Ramos', 'Silvan Widmer', 'Šime Vrsaljko',
'Simon Kjær', 'Simone Verdi', 'Stefan de Vrij', 'Stefan Savić', 'Stevan Jovetić', 'Steven Bergwijn', 'Steven Nzonzi',
'Theo Hernández', 'Thiago Alcántara', 'Thomas Müller', 'Thorgan Hazard', 'Timo Werner', 'Tomáš Souček', 'Trent Alexander-Arnold',
'Victor Osimhen', 'Virgil van Dijk', 'Weston McKennie', 'Yusuf Yazıcı']

In [21]:
giocatori_test_dataset = X_test['Player'].tolist()
y_predicted = []
y_actual = []

report_200 = pd.DataFrame(columns=["Player", "Squadra", "Ruolo", "Goal Segnati 22/23", "Goal Predetti 22/23", "Diff"])

for player in giocatori_da_testare:
    if player not in giocatori_test_dataset:
        continue
    else:
        vector = X_test[(X_test["Player"] == player)]
        squad = vector["Squad"].values[0]
        pos = vector["Pos"].values[0]
        
        goals_scored = vector["Goals"].values.astype(int)[0]
        y_actual.append(goals_scored)
        
        scaled_vector, X_reg_dataset, y_reg_dataset = build_regression_dataset(vector, agg_clustering)
        
        regressor = LinearSVR()
        hp = {
            'epsilon': [0.0, 0.1, 0.2, 0.3, 0.5],
            'C': [0.1, 1, 10, 50, 100]
        }
        grid = GridSearchCV(regressor, hp, scoring='neg_mean_squared_error', cv=5)
        grid.fit(X=X_reg_dataset, y=y_reg_dataset)
        
        regressor = LinearSVR(C=grid.best_params_['C'], epsilon=grid.best_params_['epsilon'])
        regressor.fit(X=X_reg_dataset, y=y_reg_dataset)
        
        goals_predicted = regressor.predict(scaled_vector)[0]
        if goals_predicted < 0:
            goals_predicted = 0 # Attenzione! Il modello può prevedere un numero negativo di Goal segnati...ma ciò non ha senso!
        y_predicted.append(goals_predicted)
        
        diff = np.abs(goals_scored-goals_predicted)
        
        report_200.loc[len(report_200)] = [player, squad, pos, goals_scored, goals_predicted, diff]

In [22]:
display(report_200)

Unnamed: 0,Player,Squadra,Ruolo,Goal Segnati 22/23,Goal Predetti 22/23,Diff
0,Achraf Hakimi,Paris S-G,"DF,MF",5,4.553893,0.446107
1,Adam Lallana,Brighton,"MF,FW",2,1.124656,0.875344
2,Adam Marušić,Lazio,DF,0,0.000000,0.000000
3,Ademola Lookman,Atalanta,"FW,MF",13,10.653590,2.346410
4,Adrien Rabiot,Juventus,MF,8,6.812509,1.187491
...,...,...,...,...,...,...
168,Tomáš Souček,West Ham,MF,2,2.286229,0.286229
169,Trent Alexander-Arnold,Liverpool,DF,2,1.511479,0.488521
170,Victor Osimhen,Napoli,FW,26,23.323564,2.676436
171,Virgil van Dijk,Liverpool,DF,3,2.954423,0.045577


In [23]:
# Esegui per materializzare il Report_200
report_200.to_excel("reports/Report_200 LinearSVR x AgglomerativeClustering.xlsx", index=False)

In [24]:
print("Errore Quadratico Medio:", mean_squared_error(y_actual, y_predicted))
print("Errore Assoluto Medio:", mean_absolute_error(y_actual, y_predicted))
print("R2 Score: ", r2_score(y_actual, y_predicted))

Errore Quadratico Medio: 0.8879115926312486
Errore Assoluto Medio: 0.5723761357680078
R2 Score:  0.9593497222769941


## Testing sull'intero Test Set

In [25]:
giocatori_tot = X_test["Player"].tolist()
giocatori_da_testare = []

for player in giocatori_tot:
    old_neighbors = X_train[X_train["Player"] == player]
    if (old_neighbors.shape[0] > 0):
        giocatori_da_testare.append(player)

In [26]:
y_predicted = []
y_actual = []
full_report = pd.DataFrame(columns=["Player", "Squadra", "Ruolo", "Goal Segnati 22/23", "Goal Predetti 22/23", "Diff"])
feature_importance = pd.DataFrame(columns=["xG", "Shots on Target", "Shots", "Att Pen", "Offsides",
                  "GCA", "Carries into Penalty Area", "PK Attempted", "PK Made", "Att 3rd", "GCA TO to Goal", "Take-Ons Attempted", "Take-Ons Successful",
                  "GCA Shot to Goal", "Goals Scored while on Pitch", "Carries into Final 1/3", "xGS while on Pitch", "Matches Played", "G/Shots on Target",
                   "G/Shot", "Minutes", "Shots on Target%", "Shots on Target/90", "Shots/90", "Mid 3rd", "Def 3rd", "Def Pen"])

for player in giocatori_da_testare:
    if player not in giocatori_test_dataset:
        continue
    else:
        vector = X_test[(X_test["Player"] == player)]
        squad = vector["Squad"].values[0]
        pos = vector["Pos"].values[0]
        
        goals_scored = vector["Goals"].values.astype(int)[0]
        y_actual.append(goals_scored)
        
        scaled_vector, X_reg_dataset, y_reg_dataset = build_regression_dataset(vector, agg_clustering)
        
        regressor = LinearSVR()
        hp = {
            'epsilon': [0.0, 0.1, 0.2, 0.3, 0.5],
            'C': [0.1, 1, 10, 50, 100]
        }
        grid = GridSearchCV(regressor, hp, scoring='neg_mean_squared_error', cv=5)
        grid.fit(X=X_reg_dataset, y=y_reg_dataset)
        
        regressor = LinearSVR(C=grid.best_params_['C'], epsilon=grid.best_params_['epsilon'])
        regressor.fit(X=X_reg_dataset, y=y_reg_dataset)
        
        goals_predicted = regressor.predict(scaled_vector)[0]
        if goals_predicted < 0:
            goals_predicted = 0 # Attenzione! Il modello può prevedere un numero negativo di Goal segnati...ma ciò non ha senso!
        y_predicted.append(goals_predicted)
        
        diff = np.abs(goals_scored-goals_predicted)
        
        full_report.loc[len(full_report)] = [player, squad, pos, goals_scored, goals_predicted, diff]
        feature_importance.loc[len(feature_importance)] = regressor.coef_

In [27]:
# Esegui per materializzare il Full_Report
writer = pd.ExcelWriter("reports/Full Report LinearSVR x AgglomerativeClustering.xlsx")
full_report.to_excel(writer, sheet_name="Risultati", index=False)
feature_importance.to_excel(writer, sheet_name="Feature Importance", index=False)
writer.close()

In [28]:
print("Errore Quadratico Medio:", mean_squared_error(y_actual, y_predicted))
print("Errore Assoluto Medio:", mean_absolute_error(y_actual, y_predicted))
print("R2 Score: ", r2_score(y_actual, y_predicted))

Errore Quadratico Medio: 1.0807201623385823
Errore Assoluto Medio: 0.503078479092301
R2 Score:  0.9188476661892078
