In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split


In [2]:
import numpy as np
 
def get_fitting_summary(regressor, data, target):
    predictions = regressor.predict(data)
    errors_squared = (predictions - target) ** 2
    
    print('Mean Squared Error:', round(np.mean(errors_squared), 2), 'degrees.')

    score = regressor.score(data, target)
    print('R2:', round(score, 3))

In [3]:
df = pd.read_csv("data/ml/df.csv")

In [4]:
df["diff_h_a"] = df.goals_home-df.goals_away
df["home_code"] = df.home.astype("category").cat.codes
df["away_code"] = df.away.astype("category").cat.codes
teams_dict = dict( zip( df.home.astype("category").cat.codes , df.home ) )


In [5]:
target_name = "diff_h_a"
not_usefull_columns = [
    target_name,
    'home',
    'away',
    'goals_home',
    'goals_away'    
    ]

data = df.drop(columns=not_usefull_columns)
target = df[target_name]

data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=42)

In [6]:
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline


float_columns_selector = selector(dtype_include="float")
int_columns_selector = selector(dtype_include="int")

numerical_columns = float_columns_selector(data) 
categorical_columns = int_columns_selector(data) 

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()


preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])

In [7]:
# Fitting SVR to the dataset
from sklearn.svm import SVR

print("Support Vector Regression")
regressor = SVR(kernel = 'rbf')
model = make_pipeline(preprocessor, regressor)
model.fit(data_train, target_train)
get_fitting_summary(model, data, target)

Support Vector Regression


ValueError: Input X contains NaN.
SVR does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [8]:
# Fitting Multi-layer Perceptron regressor to the dataset
from sklearn.neural_network import MLPRegressor
print("Multi-layer Perceptron Regression")

regressor = MLPRegressor(hidden_layer_sizes=(20,20,20), activation='relu', solver='adam', max_iter=500)
model = make_pipeline(preprocessor, regressor)
model.fit(data_train, target_train)
get_fitting_summary(model, data, target)

Multi-layer Perceptron Regression


ValueError: Input X contains NaN.
MLPRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# Fitting DecisionTreeRegressor to the dataset
from sklearn.tree import DecisionTreeRegressor

print("Decision Tree Regression")
regressor = DecisionTreeRegressor(random_state = 0)
model = make_pipeline(preprocessor, regressor)
model.fit(data_train, target_train)
get_fitting_summary(model, data, target)

Decision Tree Regression
Mean Squared Error: 2.14 degrees.
R2: 0.399


In [None]:
# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor

print("Random Forest Regression")
regressor = RandomForestRegressor(n_estimators = 10000, random_state = 42)
model = make_pipeline(preprocessor, regressor)
model.fit(data_train, target_train)
get_fitting_summary(model, data, target)

Random Forest Regression
Mean Squared Error: 1.14 degrees.
R2: 0.679


In [None]:
df["forecast"] = model.predict(data) 

In [None]:

import joblib
joblib.dump(model, 'data/ml/my_model.pkl', compress = 3 )


['data/ml/my_model.pkl']

In [None]:
prediction_summary = df[[
    "home",
    "away",
    "diff_h_a",
    "forecast",
    ]]

In [None]:
points_end_season = { team: 0 for team in teams_dict.values()}

In [None]:
for index, row in prediction_summary.iterrows():
    if row.forecast > 1:
        points_end_season[row.home] += 3
        points_end_season[row.away] += 0
    elif  row.forecast < -1:
        points_end_season[row.home] += 0
        points_end_season[row.away] += 3
    else:
        points_end_season[row.home] += 1
        points_end_season[row.away] += 1
    

In [None]:
results = sorted( ((v,k) for k,v in points_end_season.items()), reverse=True)
for index, (points, team) in enumerate(results):
    print(f"{index + 1 }. {team} ({points}) ")


1. Manchester City (80) 
2. Liverpool (72) 
3. Manchester United (67) 
4. Arsenal (63) 
5. Chelsea (61) 
6. Tottenham Hotspur (59) 
7. West Ham United (53) 
8. Leeds United (51) 
9. Leicester City (48) 
10. Aston Villa (46) 
11. Everton (41) 
12. Burnley (39) 
13. Brighton & Hove Albion (38) 
14. Wolverhampton Wanderers (33) 
15. Southampton (32) 
16. Newcastle United (32) 
17. Crystal Palace (31) 
18. West Bromwich Albion (25) 
19. Fulham (25) 
20. Sheffield United (22) 
