In [30]:
import pandas as pd 
from sklearn.model_selection import train_test_split


In [31]:
import numpy as np
 
def get_fitting_summary(regressor, data, target):
    predictions = regressor.predict(data)
    errors_squared = (predictions - target) ** 2
    
    print('Mean Squared Error:', round(np.mean(errors_squared), 2), 'degrees.')

    score = regressor.score(data, target)
    print('R2:', round(score, 3))

In [32]:
df = pd.read_csv("data/ml/df.csv")

In [33]:
df["diff_h_a"] = df.goals_home-df.goals_away
df["home_code"] = df.home.astype("category").cat.codes
df["away_code"] = df.away.astype("category").cat.codes
teams_dict = dict( zip( df.home.astype("category").cat.codes , df.home ) )


In [35]:
target_name = "diff_h_a"
not_usefull_columns = [
    target_name,
    'Season',
    'home',
    'away',
    'goals_home',
    'goals_away'    
    ]

data = df.drop(columns=not_usefull_columns)
target = df[target_name]

data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=42)

In [36]:
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline


float_columns_selector = selector(dtype_include="float")
int_columns_selector = selector(dtype_include="int")

numerical_columns = float_columns_selector(data) 
categorical_columns = int_columns_selector(data) 

categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()


preprocessor = ColumnTransformer([
    ('one-hot-encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])

In [37]:
# Fitting SVR to the dataset
from sklearn.svm import SVR

print("Support Vector Regression")
regressor = SVR(kernel = 'rbf')
model = make_pipeline(preprocessor, regressor)
model.fit(data_train, target_train)
get_fitting_summary(model, data, target)

Support Vector Regression
Mean Squared Error: 2.71 degrees.
R2: 0.224


In [38]:
# Fitting Multi-layer Perceptron regressor to the dataset
from sklearn.neural_network import MLPRegressor
print("Multi-layer Perceptron Regression")

regressor = MLPRegressor(hidden_layer_sizes=(20,20,20), activation='relu', solver='adam', max_iter=500)
model = make_pipeline(preprocessor, regressor)
model.fit(data_train, target_train)
get_fitting_summary(model, data, target)

Multi-layer Perceptron Regression
Mean Squared Error: 2.38 degrees.
R2: 0.318


In [39]:
# Fitting DecisionTreeRegressor to the dataset
from sklearn.tree import DecisionTreeRegressor

print("Decision Tree Regression")
regressor = DecisionTreeRegressor(random_state = 0)
model = make_pipeline(preprocessor, regressor)
model.fit(data_train, target_train)
get_fitting_summary(model, data, target)

Decision Tree Regression
Mean Squared Error: 1.49 degrees.
R2: 0.573


In [40]:
# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor

print("Random Forest Regression")
regressor = RandomForestRegressor(n_estimators = 10000, random_state = 42)
model = make_pipeline(preprocessor, regressor)
model.fit(data_train, target_train)
get_fitting_summary(model, data, target)

Random Forest Regression
Mean Squared Error: 1.12 degrees.
R2: 0.68


In [41]:
df["forecast"] = model.predict(data) 

In [42]:

import joblib
joblib.dump(model, 'data/ml/my_model.pkl', compress = 3 )


['data/ml/my_model.pkl']

In [43]:

importances = regressor.feature_importances_
for i,j in zip(importances, list(data)):
    print("{}: {}".format(
        j,
        round(i, 2)
    ))



home_reserve_overall: 0.1
home_subs_overall: 0.1
home_titular_overall: 0.15
home_attack_overall: 0.09
home_defend_overall: 0.08
away_reserve_overall: 0.1
away_subs_overall: 0.12
away_titular_overall: 0.12
away_attack_overall: 0.07
away_defend_overall: 0.09


In [44]:
prediction_summary = df[[
    "home",
    "away",
    "diff_h_a",
    "forecast",
    ]]

In [45]:
points_end_season = { team: 0 for team in teams_dict.values()}

In [46]:
for index, row in prediction_summary.iterrows():
    if row.forecast > 1:
        points_end_season[row.home] += 3
        points_end_season[row.away] += 0
    elif  row.forecast < -1:
        points_end_season[row.home] += 0
        points_end_season[row.away] += 3
    else:
        points_end_season[row.home] += 1
        points_end_season[row.away] += 1
    

In [47]:
results = sorted( ((v,k) for k,v in points_end_season.items()), reverse=True)
for index, (points, team) in enumerate(results):
    print(f"{index + 1 }. {team} ({points}) ")


1. Manchester City (555) 
2. Liverpool (473) 
3. Chelsea (451) 
4. Tottenham Hotspur (439) 
5. Manchester United (436) 
6. Arsenal (415) 
7. Leicester City (305) 
8. Everton (287) 
9. West Ham United (277) 
10. Southampton (274) 
11. Crystal Palace (243) 
12. Newcastle United (197) 
13. Burnley (161) 
14. West Bromwich Albion (138) 
15. Watford (138) 
16. AFC Bournemouth (137) 
17. Brighton & Hove Albion (132) 
18. Stoke City (128) 
19. Aston Villa (125) 
20. Wolverhampton Wanderers (119) 
21. Swansea City (112) 
22. Sunderland (79) 
23. Huddersfield Town (54) 
24. Sheffield United (52) 
25. Fulham (49) 
26. Leeds United (42) 
27. Cardiff City (34) 
28. Hull City (31) 
29. Middlesbrough (26) 
30. Norwich City (22) 
