In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model.logistic import LogisticRegression
from sklearn.preprocessing import StandardScaler #per la standardizzazione dei dati
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report



## Data import

Qui vengono importati i dati provenienti dal dataset. In particolare vengono costruiti due DataFrame della libreria pandas: uno per ogni sheet del file excel.

In [2]:
dataset_full_path = "./"
dataset_filename = "dataset.xlsx"

players = pd.read_excel(dataset_full_path + dataset_filename)
injuries = pd.read_excel(dataset_full_path + dataset_filename)

### Data filtering

Vengono selezionati solamente i dati relativi all'anno 2015.

Il dataset è ordinato utilizzando come chiave la stagione (Season_ID), quindi si è sicuri del fatto che le prime 741 righe del primo sheet, e le prime 734 del secondo sheet sono relative alla stagione del 2015.

In [3]:
season = 2015
players_in_specific_season = players[players["Season_ID"] == season]

# metà degli injuries
injuries_head_size = int(len(injuries[injuries["Season_ID"] == season]) / 2)
injuries_in_specific_season = injuries[injuries["Season_ID"] == season].head(injuries_head_size)

players_in_specific_season.drop([
        "Surname",
        "First_name",
        "Player",
        "Club",         # feature non numerica
        "POSITION",     # feature non numerica
        "Date_of_birth",# c'è già l'anno di nascita
    ],
    axis = 'columns',
    inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [4]:
players_in_specific_season

Unnamed: 0,Player_ID,Season_ID,Starting_11,Goals,Assists,Owngoals,Substitutions_on,Substitutions_off,Yellow_cards,Red_yellow_cards,Red_cards,Penalty_goals,Year_of_birth,Age
0,10,2015,14,7,8,0,10,9,3,0,0,1,1978,37
1,2865,2015,22,0,1,0,4,11,5,0,0,0,1984,31
2,3417,2015,28,0,0,0,0,1,7,0,0,0,1981,34
3,3507,2015,0,0,0,0,1,0,0,0,0,0,1977,38
4,3713,2015,35,13,4,0,2,20,4,0,0,1,1979,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740,256361,2015,1,0,0,0,5,0,0,0,0,0,1996,19
2226,57644,2015,0,0,0,0,0,0,0,0,0,0,1989,26
2237,197473,2015,13,0,0,0,0,9,4,1,0,0,1995,20
2238,37941,2015,3,0,1,0,6,1,0,0,0,0,1985,30


In [5]:
injuries_in_specific_season

Unnamed: 0,Player_ID,Surname,First_name,Player,Club,Season_ID,Starting_11,Goals,Assists,Owngoals,Substitutions_on,Substitutions_off,Yellow_cards,Red_yellow_cards,Red_cards,Penalty_goals,POSITION,Date_of_birth,Year_of_birth,Age
0,10,Klose,Miroslav,Miroslav Klose,SS Lazio,2015,14,7,8,0,10,9,3,0,0,1,Centre-Forward,1978-06-09,1978,37
1,2865,Lichtsteiner,Stephan,Stephan Lichtsteiner,Juventus FC,2015,22,0,1,0,4,11,5,0,0,0,Right-Back,1984-01-16,1984,31
2,3417,Burdisso,Nicolás,Nicolás Burdisso,Genoa CFC,2015,28,0,0,0,0,1,7,0,0,0,Centre-Back,1981-04-12,1981,34
3,3507,Abbiati,Christian,Christian Abbiati,AC Milan,2015,0,0,0,0,1,0,0,0,0,0,Goalkeeper,1977-07-08,1977,38
4,3713,Maccarone,Massimo,Massimo Maccarone,FC Empoli,2015,35,13,4,0,2,20,4,0,0,1,Centre-Forward,1979-09-06,1979,36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,129505,Trotta,Marcello,Marcello Trotta,US Sassuolo,2015,1,1,0,0,7,1,0,0,0,0,Centre-Forward,1992-09-29,1992,23
368,129893,Wagué,Molla,Molla Wagué,Udinese Calcio,2015,20,0,0,0,1,2,5,1,0,0,Centre-Back,1991-02-21,1991,24
369,130348,Crivello,Roberto,Roberto Crivello,Frosinone Calcio,2015,16,0,1,0,1,1,5,0,0,0,Left-Back,1991-09-14,1991,24
370,130360,Verdi,Simone,Simone Verdi,Carpi FC 1909,2015,3,3,0,0,5,3,1,0,0,2,Right Winger,1992-07-12,1992,23


## Training set preparing

In [6]:
training_set = []

injuries_in_specific_season_player_ids = injuries_in_specific_season["Player_ID"]

# labeling per un addestramento supervisionato
for index, player in players_in_specific_season.iterrows():
    training_set_datapoint = player
    # inizializza la classe a 0: giocatore non infortunato
    training_set_datapoint["label"] = 0
    
    for injury_player_id in injuries_in_specific_season_player_ids:
        # se il giocatore si è infortunato, assegna la classe 1
        if player["Player_ID"] == injury_player_id:
            training_set_datapoint["label"] = 1
            # migliora le performance
            break

    training_set.append(training_set_datapoint)

training_set = pd.DataFrame(training_set)

In [7]:
training_set

Unnamed: 0,Player_ID,Season_ID,Starting_11,Goals,Assists,Owngoals,Substitutions_on,Substitutions_off,Yellow_cards,Red_yellow_cards,Red_cards,Penalty_goals,Year_of_birth,Age,label
0,10,2015,14,7,8,0,10,9,3,0,0,1,1978,37,1
1,2865,2015,22,0,1,0,4,11,5,0,0,0,1984,31,1
2,3417,2015,28,0,0,0,0,1,7,0,0,0,1981,34,1
3,3507,2015,0,0,0,0,1,0,0,0,0,0,1977,38,1
4,3713,2015,35,13,4,0,2,20,4,0,0,1,1979,36,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740,256361,2015,1,0,0,0,5,0,0,0,0,0,1996,19,0
2226,57644,2015,0,0,0,0,0,0,0,0,0,0,1989,26,0
2237,197473,2015,13,0,0,0,0,9,4,1,0,0,1995,20,0
2238,37941,2015,3,0,1,0,6,1,0,0,0,0,1985,30,1


In [8]:
training_set["label"].value_counts()

1    408
0    337
Name: label, dtype: int64

## Training

In [9]:
# preparazione del training set per sklearn
x_train, x_test, y_train, y_test = train_test_split(training_set, training_set["label"], test_size = 0.3, random_state = 12345)

# istanziamento dell'oggetto per il modello
lr = LogisticRegression()

# training
lr.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Testing

In [10]:
# test
pred = lr.predict(x_test)

In [11]:
pred

array([1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0], dtype=int64)

## Evaluation

In [12]:
#matrice di confusione
confusion_matrix = np.matrix(confusion_matrix(pred, y_test))
confusion_matrix = pd.DataFrame(confusion_matrix, columns = ["predicted as 1", "predicted as 0"])
confusion_matrix.index = [1, 0]

In [13]:
confusion_matrix

Unnamed: 0,predicted as 1,predicted as 0
1,85,12
0,11,116


In [14]:
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88        97
           1       0.91      0.91      0.91       127

    accuracy                           0.90       224
   macro avg       0.90      0.89      0.90       224
weighted avg       0.90      0.90      0.90       224

