In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.gridspec as grid_spec
import matplotlib.patches as mpatches
import random
import warnings
import platform
import time
import re

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, learning_curve
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import naive_bayes
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB

%matplotlib inline
warnings.filterwarnings("ignore")

In [2]:
tables=pd.read_excel("C:/Users/Don Augwin/OneDrive/Documents/serie a club stats 2001-2022.xlsx")
clubstats=pd.read_excel("C:/Users/Don Augwin/OneDrive/Documents/serie a.xlsx")

In [3]:
tables.head()

Unnamed: 0,Rk(2001-2022),Squad,MP,W,D,L,GF,GA,GD,Pts,Pts/MP,Attendance,Top Team Scorer,Goalkeeper,Notes,season,SEASON
0,18,Ancona,34,2,7,25,21,70,-49,13,0.38,13234,Cristian Bucchi - 5,Sergio Marcon,Relegated 2,2003-04,2003
1,10,Ascoli,38,9,16,13,43,53,-10,43,1.13,12435,Marco Ferrante - 8,Ferdinando Coppola,,2005-06,2005
2,19,Ascoli,38,5,12,21,36,67,-31,27,0.71,6829,"Saša Bjelanović, Andrea Soncin - 7",Gianluca Pagliuca,Relegated,2006-07,2006
3,9,Atalanta,34,12,9,13,41,50,-9,45,1.32,16706,Cristiano Doni - 16,Massimo Taibi,,2001-02,2001
4,15,Atalanta,34,8,14,12,35,47,-12,38,1.12,16350,Cristiano Doni - 10,Massimo Taibi,"Relegated, → Relegation play-off",2002-03,2002


In [4]:
clubstats.head()

Unnamed: 0,Squad,Players count,Age,Gls,Ast,Goals,PK,PKatt,CrdY,CrdR,...,G&PK,G+A&PK,Fouls,Subs,SoT/90,SoT,G/SoT,Saves,Cleansheets,SEASON
0,Ancona,41,28.8,21,14,18,3,4,71,6,...,0.53,0.94,586,80,4.0,136,0.13,160,4,2003
1,Ascoli,27,26.5,40,24,37,3,3,75,8,...,0.97,1.61,764,103,4.55,173,0.21,129,6,2005
2,Ascoli,32,28.0,35,17,32,3,6,116,8,...,0.84,1.29,844,111,3.76,143,0.22,152,5,2006
3,Atalanta,27,27.0,40,25,36,4,4,61,5,...,1.06,1.79,612,88,3.79,129,0.28,105,7,2001
4,Atalanta,28,27.2,34,24,31,3,3,67,4,...,0.91,1.62,679,85,3.82,130,0.24,118,8,2002


In [5]:
# Join two data frames
data = pd.merge(tables, clubstats, on=['Squad'],how='outer')

# To get total number of games
data['total_games'] = data['MP']

# Add top4 indicator 
data['is_top4'] = data['Rk(2001-2022)'].apply(lambda x: 1 if (x <= 4) else 0)

# Since I want to predict Top 4 for season 2019/20, I will exclude this season.
current_season = data[data['season'] == '2021-22']
past_seasons = data[data['season'] != '2021-22'].reset_index(drop=True)

In [6]:
data.head()

Unnamed: 0,Rk(2001-2022),Squad,MP,W,D,L,GF,GA,GD,Pts,...,Fouls,Subs,SoT/90,SoT,G/SoT,Saves,Cleansheets,SEASON_y,total_games,is_top4
0,18.0,Ancona,34.0,2.0,7.0,25.0,21.0,70.0,-49.0,13.0,...,,,,,,,,,34.0,0
1,10.0,Ascoli,38.0,9.0,16.0,13.0,43.0,53.0,-10.0,43.0,...,,,,,,,,,38.0,0
2,19.0,Ascoli,38.0,5.0,12.0,21.0,36.0,67.0,-31.0,27.0,...,,,,,,,,,38.0,0
3,9.0,Atalanta,34.0,12.0,9.0,13.0,41.0,50.0,-9.0,45.0,...,,,,,,,,,34.0,0
4,15.0,Atalanta,34.0,8.0,14.0,12.0,35.0,47.0,-12.0,38.0,...,,,,,,,,,34.0,0


In [7]:
def train(X, y, X_valid, y_valid):
    nb = naive_bayes.GaussianNB()
    lsvm = LinearSVC(random_state=2021)
    logit = LogisticRegression(random_state=2021)
    
    nb.fit(X, y)
    lsvm.fit(X, y)
    logit.fit(X, y)
    
    nb_preds = nb.predict(X_valid)
    lsvm_preds = lsvm.predict(X_valid)
    logit_preds = logit.predict(X_valid)
    
    print((f'F1 Scores \n Naive Bayes: {f1_score(y_valid, nb_preds)} \n' 
           f' Linear SVM: {f1_score(y_valid, lsvm_preds)} \n' 
           f' Logitstic Regression: {f1_score(y_valid, logit_preds)}'))
    
    return [nb, lsvm, logit]


def plot_learning(X, y):
    nb = naive_bayes.GaussianNB()
    lsvm = LinearSVC(random_state=2021)
    logit = LogisticRegression(random_state=2021)
    
    models = [nb, lsvm, logit]
    
    fig, ax = plt.subplots(1,3, figsize=(20,5))
    
    for i in range(3):
        train_sizes, train_score, valid_scores = learning_curve(models[i], X, y, cv=5)
        ax.flatten()[i].plot(train_sizes, np.mean(train_score, axis=1), 'o-', color=b4_color, label='Training')
        ax.flatten()[i].plot(train_sizes, np.mean(valid_scores, axis=1), 'o-', color=t4_color, label='CV')
        ax.flatten()[i].legend()
        ax.flatten()[i].set_title(f'{models[i]}')

In [8]:
t_cols = list(tables.columns)
t_cols.append('is_top4')

c_cols = list(clubstats.columns)
c_cols.append('is_top4')

t_cols
c_cols

['Squad',
 'Players count',
 'Age',
 'Gls',
 'Ast',
 'Goals',
 'PK',
 'PKatt',
 'CrdY',
 'CrdR',
 'Goals ratio',
 'Ast ratio',
 'G+A',
 'G&PK',
 'G+A&PK',
 'Fouls',
 'Subs',
 'SoT/90',
 'SoT',
 'G/SoT',
 'Saves',
 'Cleansheets',
 'SEASON',
 'is_top4']

In [12]:

# Separate the features and target variable
X = tables.drop(['Squad','Rk(2001-2022)','Pts/MP','Attendance','Top Team Scorer','Goalkeeper','Notes','season','SEASON'], axis=1)
y = tables['Rk(2001-2022)']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [17]:
# Train and evaluate a Logistic Regression model
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
print('Logistic Regression model performance:')
print('Accuracy:', accuracy_score(y_test, lr_preds))
print('Precision:', precision_score(y_test, lr_preds, average='macro'))
print('Recall:', recall_score(y_test, lr_preds, average='macro'))
print('F1 score:', f1_score(y_test, lr_preds, average='macro'))



Logistic Regression model performance:
Accuracy: 0.26506024096385544
Precision: 0.2569642857142857
Recall: 0.2867857142857143
F1 score: 0.2508116883116883


In [18]:
# Train and evaluate a Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)
print('Naive Bayes model performance:')
print('Accuracy:', accuracy_score(y_test, nb_preds))
print('Precision:', precision_score(y_test, nb_preds, average='macro'))
print('Recall:', recall_score(y_test, nb_preds, average='macro'))
print('F1 score:', f1_score(y_test, nb_preds, average='macro'))

# Train and evaluate a Linear SVM model
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)
print('Linear SVM model performance:')
print('Accuracy:', accuracy_score(y_test, svm_preds))
print('Precision:', precision_score(y_test, svm_preds, average='macro'))
print('Recall:', recall_score(y_test, svm_preds, average='macro'))
print('F1 score:', f1_score(y_test, svm_preds, average='macro'))


Naive Bayes model performance:
Accuracy: 0.20481927710843373
Precision: 0.24843434343434345
Recall: 0.1992857142857143
F1 score: 0.18519952269952272
Linear SVM model performance:
Accuracy: 0.08433734939759036
Precision: 0.06300607287449393
Recall: 0.12456140350877191
F1 score: 0.05987696514012304
