In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split

## Current hyperparameters
**GF**: Gradient Boosting
* Best params: {'n_estimators': 50, 'max_depth': 5, 'min_samples_leaf': 0.01, 'min_samples_split': 50} 
* Score: 0.4106145251396648

**GS**: Gradient Boosting
* Best params: {'n_estimators': 50, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 50}
* Score: 0.45251396648044695
* Note: in the "Gol subiti" run below, LDA scores higher on this target (0.48044692737430167).

**1X2**: LDA
* Score: 0.5921787709497207

**GG/NG**: Gradient Boosting
* Best params: {'n_estimators': 200, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 200} 
* Score: 0.6005586592178771

**O-U**: Gradient Boosting
* Best params: {'n_estimators': 50, 'max_depth': 7, 'min_samples_leaf': 0.01, 'min_samples_split': 100}
* Score: 0.6368715083798883

In [2]:
# Load one CSV per season and stack them into a single frame.
year = ['2019', '2020', '2021', '2022', '2023']
sheet = []

for ls in year:
    # The first CSV column is used as the index; it is discarded by
    # ignore_index=True in the concat below.
    year_df = pd.read_csv(f'../data/dataframes/{ls}.csv', index_col=0)
    sheet.append(year_df)

datafinal = pd.concat(sheet, ignore_index=True)

# Cast both goal-count targets to int and cap them at 4, so any count
# above 4 is folded into the top class. clip() does this in a single
# vectorised pass instead of a per-row .loc loop.
datafinal['yGf'] = datafinal['yGf'].astype(int).clip(upper=4)
datafinal['yGs'] = datafinal['yGs'].astype(int).clip(upper=4)

datafinal

Unnamed: 0,Partita,Rt-5h,Rt-4h,Rt-3h,Rt-2h,Rt-1h,Classifica h,Rt-5a,Rt-4a,Rt-3a,...,Gf-3a,Gf-2a,Gf-1a,Gs-5a,Gs-4a,Gs-3a,Gs-2a,Gs-1a,yGf,yGs
0,Juventus-Spal,2.0,2.0,1.0,2.0,2.0,2.0,0.0,0.0,2.0,...,2.0,0.0,1.0,3.0,1.0,1.0,3.0,3.0,2,0
1,Sampdoria-Inter,0.0,0.0,0.0,2.0,0.0,20.0,2.0,2.0,2.0,...,1.0,2.0,1.0,0.0,1.0,0.0,0.0,0.0,1,3
2,Sassuolo-Atalanta,0.0,2.0,0.0,2.0,0.0,10.0,2.0,0.0,2.0,...,2.0,2.0,2.0,2.0,3.0,1.0,2.0,0.0,1,4
3,Napoli-Brescia,2.0,0.0,2.0,2.0,0.0,4.0,2.0,0.0,0.0,...,3.0,1.0,1.0,0.0,1.0,4.0,0.0,2.0,2,1
4,Lazio-Genoa,2.0,1.0,0.0,2.0,0.0,9.0,1.0,2.0,0.0,...,1.0,1.0,0.0,3.0,1.0,2.0,3.0,0.0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1425,Fiorentina-Verona,0.0,2.0,0.0,2.0,1.0,7.0,0.0,0.0,1.0,...,2.0,3.0,1.0,3.0,1.0,2.0,3.0,1.0,1,0
1426,Udinese-Sassuolo,2.0,1.0,0.0,1.0,0.0,17.0,0.0,1.0,2.0,...,4.0,1.0,1.0,2.0,2.0,3.0,2.0,2.0,2,2
1427,Bologna-Roma,2.0,0.0,2.0,1.0,2.0,5.0,2.0,1.0,2.0,...,3.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0,2,0
1428,Lazio-Inter,0.0,1.0,0.0,2.0,1.0,10.0,2.0,2.0,1.0,...,1.0,3.0,4.0,1.0,0.0,1.0,0.0,0.0,0,2


In [3]:
def Xy_split(dataframe, target):
    """Split a match dataframe into a feature matrix and a target vector.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'Partita', 'yGf' and 'yGs'. Those three
        columns are removed from the features. The input is not modified.
    target : str
        Which target to build:
        'Gf'    -> the 'yGf' column as-is
        'Gs'    -> the 'yGs' column as-is
        '1X2'   -> 'V' if yGf > yGs, 'N' if they are equal, 'P' otherwise
        'GG-NG' -> 'GG' if both yGf and yGs are non-zero, 'NG' otherwise
        'O-U'   -> 'O' if yGf + yGs > 2, 'U' otherwise

    Returns
    -------
    tuple
        (X, y): X is a new DataFrame of features and y is a Series aligned
        on the index of ``dataframe``.

    Raises
    ------
    ValueError
        If ``target`` is not one of the values listed above.
    """
    X = dataframe.drop(['yGf', 'yGs', 'Partita'], axis=1)
    scored = dataframe['yGf']
    conceded = dataframe['yGs']

    if target == 'Gf':
        return X, scored.copy()
    if target == 'Gs':
        return X, conceded.copy()

    # Class labels are computed for the whole column at once. Writing
    # strings row by row into an integer-initialised column forces pandas
    # to upcast the column on assignment, which newer versions warn about.
    if target == '1X2':
        labels = np.where(scored > conceded, 'V',
                          np.where(scored == conceded, 'N', 'P'))
    elif target == 'GG-NG':
        labels = np.where((scored != 0) & (conceded != 0), 'GG', 'NG')
    elif target == 'O-U':
        labels = np.where(scored + conceded > 2, 'O', 'U')
    else:
        raise ValueError(f"Invalid target variable: {target!r}")

    # A pre-existing 'Classe' column would be a stale label, not a feature.
    X = X.drop(columns='Classe', errors='ignore')
    y = pd.Series(labels, index=dataframe.index, name='Classe')
    return X, y

In [4]:
def KNNed(X, y):
    """Scan k for a k-nearest-neighbours classifier on an unshuffled hold-out split.

    The data is split with ``train_test_split(..., shuffle=False)``, so the
    held-out part is the tail of the rows. Candidate values of k are
    1, 11, 21, ... up to 991, limited to the number of training rows.

    Parameters
    ----------
    X : feature matrix
    y : target vector

    Returns
    -------
    str
        'Best k: <k>, Score: <score>' for the first k reaching the highest
        hold-out score.

    NOTE(review): k is chosen on the same hold-out part the score is
    reported on, so the reported score is an optimistic estimate.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

    K = np.arange(1, 1000, 10)
    # A classifier cannot be queried for more neighbours than it was fitted
    # on; without this cap a small dataset makes scoring raise.
    K = K[K <= len(X_train)]

    kscore = []
    for k in K:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        kscore.append(knn.score(X_test, y_test))
    return f'Best k: {K[np.argmax(kscore)]}, Score: {np.max(kscore)}'

In [5]:
def RFed(X, y):
    """Grid-search a random forest on an unshuffled hold-out split.

    Every combination of the n_estimators, min_samples_leaf and
    min_samples_split values listed below is fitted on the training part
    (random_state=66) and scored on the held-out part.

    Returns
    -------
    str
        'Best params: <dict>, Score: <score>' for the first combination
        reaching the highest hold-out score.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

    tree_counts = [50, 100, 200]
    leaf_sizes = [0.01, 0.05, 0.1, 1]
    split_sizes = [50, 100, 200]

    # Enumerate combinations with tree count outermost and split size
    # innermost; ties between equal scores are resolved by this order.
    candidates = [
        {'n_estimators': n_trees,
         'min_samples_leaf': leaf,
         'min_samples_split': split}
        for n_trees in tree_counts
        for leaf in leaf_sizes
        for split in split_sizes
    ]

    scores = []
    for params in candidates:
        forest = RandomForestClassifier(random_state=66, n_jobs=-1, **params)
        forest.fit(X_train, y_train)
        scores.append(forest.score(X_test, y_test))

    winner = np.argmax(scores)
    return f'Best params: {candidates[winner]}, Score: {np.max(scores)}'

In [6]:
def MBed(X, y):
    """Fit a multinomial naive Bayes model and return its hold-out score.

    The split is made without shuffling, so the held-out part is the tail
    of the rows. The returned value is whatever ``MultinomialNB.score``
    yields on that part.
    """
    train_X, test_X, train_y, test_y = train_test_split(X, y, shuffle=False)
    # fit() returns the estimator itself, so the calls can be chained.
    return MultinomialNB().fit(train_X, train_y).score(test_X, test_y)

In [7]:
def LDAed(X, y):
    """Fit linear discriminant analysis and return its hold-out score.

    The split is made without shuffling, so the held-out part is the tail
    of the rows. The returned value is whatever
    ``LinearDiscriminantAnalysis.score`` yields on that part.
    """
    train_X, test_X, train_y, test_y = train_test_split(X, y, shuffle=False)
    # fit() returns the estimator itself, so the calls can be chained.
    return LinearDiscriminantAnalysis().fit(train_X, train_y).score(test_X, test_y)

In [12]:
def GBed(X, y):
    """Grid-search gradient boosting on an unshuffled hold-out split.

    Every combination of the n_estimators, max_depth, min_samples_leaf and
    min_samples_split values listed below is fitted on the training part
    (random_state=66, subsample=0.8) and scored on the held-out part.

    Returns
    -------
    str
        'Best params: <dict>, Score: <score>' for the first combination
        reaching the highest hold-out score.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False)

    stage_counts = [50, 100, 200]
    depths = [3, 5, 7]
    leaf_sizes = [0.01, 0.1, 1]
    split_sizes = [50, 100, 200]

    # Enumerate combinations with stage count outermost, then leaf size,
    # split size, and depth innermost; ties are resolved by this order.
    candidates = [
        {'n_estimators': n_stages,
         'max_depth': depth,
         'min_samples_leaf': leaf,
         'min_samples_split': split}
        for n_stages in stage_counts
        for leaf in leaf_sizes
        for split in split_sizes
        for depth in depths
    ]

    scores = []
    for params in candidates:
        booster = GradientBoostingClassifier(random_state=66,
                                             subsample=0.8,
                                             **params)
        booster.fit(X_train, y_train)
        scores.append(booster.score(X_test, y_test))

    winner = np.argmax(scores)
    return f'Best params: {candidates[winner]}, Score: {np.max(scores)}'

In [13]:
def print_result():
    """Print every model's hold-out result for the notebook-level ``X`` and ``y``.

    Takes no arguments: ``X`` and ``y`` are read from the enclosing
    (notebook) namespace, so they must be assigned before calling.
    """
    evaluations = (
        ('Multinomial NB :', MBed),
        ('KNN :', KNNed),
        ('LDA :', LDAed),
        ('Random Forest: ', RFed),
        ('Gradient Boosting :', GBed),
    )
    for label, evaluate in evaluations:
        print(label, evaluate(X, y))

# Gol fatti

In [14]:
# Target 'Gf': y is the 'yGf' column. X and y are bound at notebook level
# because print_result() reads them from there.
df = datafinal.copy()
X, y = Xy_split(df, 'Gf')
print_result()

Multinomial NB : 0.2905027932960894
KNN : Best k: 71, Score: 0.36312849162011174
LDA : 0.3659217877094972
Random Forest:  Best params: {'n_estimators': 50, 'min_samples_leaf': 1, 'min_samples_split': 50}, Score: 0.36312849162011174
Gradient Boosting : Best params: {'n_estimators': 50, 'max_depth': 5, 'min_samples_leaf': 0.01, 'min_samples_split': 50}, Score: 0.4106145251396648


# Gol subiti

In [15]:
# Target 'Gs': y is the 'yGs' column. X and y are bound at notebook level
# because print_result() reads them from there.
df = datafinal.copy()
X, y = Xy_split(df, 'Gs')
print_result()

Multinomial NB : 0.28212290502793297
KNN : Best k: 221, Score: 0.41899441340782123
LDA : 0.48044692737430167
Random Forest:  Best params: {'n_estimators': 50, 'min_samples_leaf': 0.01, 'min_samples_split': 50}, Score: 0.4134078212290503
Gradient Boosting : Best params: {'n_estimators': 50, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 50}, Score: 0.45251396648044695


# 1X2

In [16]:
# Target '1X2': three classes 'V' / 'N' / 'P' derived from comparing yGf with yGs.
# X and y are bound at notebook level because print_result() reads them from there.
df = datafinal.copy()
X, y = Xy_split(df, '1X2')
print_result()

Multinomial NB : 0.48044692737430167
KNN : Best k: 821, Score: 0.505586592178771
LDA : 0.5921787709497207
Random Forest:  Best params: {'n_estimators': 50, 'min_samples_leaf': 1, 'min_samples_split': 50}, Score: 0.5195530726256983
Gradient Boosting : Best params: {'n_estimators': 50, 'max_depth': 3, 'min_samples_leaf': 0.1, 'min_samples_split': 50}, Score: 0.5418994413407822


# Gol - NoGol

In [17]:
# Target 'GG-NG': 'GG' when both yGf and yGs are non-zero, otherwise 'NG'.
# X and y are bound at notebook level because print_result() reads them from there.
df = datafinal.copy()
X, y = Xy_split(df, 'GG-NG')
print_result()

Multinomial NB : 0.49441340782122906
KNN : Best k: 21, Score: 0.5446927374301676
LDA : 0.5698324022346368
Random Forest:  Best params: {'n_estimators': 50, 'min_samples_leaf': 0.01, 'min_samples_split': 50}, Score: 0.5391061452513967
Gradient Boosting : Best params: {'n_estimators': 200, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 200}, Score: 0.6005586592178771


# Under - Over 2.5

In [18]:
# Target 'O-U': 'O' when yGf + yGs is greater than 2, otherwise 'U'.
# X and y are bound at notebook level because print_result() reads them from there.
df = datafinal.copy()
X, y = Xy_split(df, 'O-U')
print_result()

Multinomial NB : 0.4776536312849162
KNN : Best k: 11, Score: 0.5139664804469274
LDA : 0.6089385474860335
Random Forest:  Best params: {'n_estimators': 50, 'min_samples_leaf': 1, 'min_samples_split': 50}, Score: 0.5782122905027933
Gradient Boosting : Best params: {'n_estimators': 50, 'max_depth': 7, 'min_samples_leaf': 0.01, 'min_samples_split': 100}, Score: 0.6368715083798883
