In [1]:
import pandas as pd 
import numpy as np
import joblib

from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor



In [2]:
ML_PATH = "data/ml"

In [3]:
 
def get_fitting_summary(regressor, data, target):
    predictions = regressor.predict(data)
    errors_squared = (predictions - target) ** 2
    
    print('Mean Squared Error:', round(np.mean(errors_squared), 2), 'degrees.')

    score = regressor.score(data, target)
    print('R2:', round(score, 3))

In [18]:
def load_database():
    df = pd.read_csv(f"{ML_PATH}/df_historical.csv")
    df["diff_h_a"] = df.goals_home-df.goals_away
    df["home_code"] = df.home.astype("category").cat.codes
    df["away_code"] = df.away.astype("category").cat.codes


    return df 

In [28]:
def split_database(df):
    target_name = "diff_h_a"
    not_usefull_columns = [
        target_name,
        'Season',
        'home',
        'away',
        'goals_home',
        'goals_away'    
        ]

    features = df.drop(columns=not_usefull_columns)
    target = df[target_name]
    return features, target
    

In [20]:

def get_encoder(data):
    float_columns_selector = selector(dtype_include="float")
    int_columns_selector = selector(dtype_include="int")

    numerical_columns = float_columns_selector(data) 
    categorical_columns = int_columns_selector(data) 

    categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
    numerical_preprocessor = StandardScaler()


    preprocessor = ColumnTransformer([
        ('one-hot-encoder', categorical_preprocessor, categorical_columns),
        ('standard_scaler', numerical_preprocessor, numerical_columns)])
    return preprocessor

In [29]:
df = load_database()
features, target = split_database(df)
preprocessor = get_encoder(features)

In [30]:
# Fitting Random Forest Regression to the dataset

print("Random Forest Regression")
regressor = RandomForestRegressor(n_estimators = 10000, random_state = 42)
model = make_pipeline(preprocessor, regressor)
model.fit(features, target)

Random Forest Regression


In [48]:

joblib.dump(model, f'{ML_PATH}/random_forest.pkl', compress = 3 )


['data/ml/random_forest.pkl']

In [49]:

importances = regressor.feature_importances_
for i,j in zip(importances, list(features)):
    print("{}: {}".format(
        j,
        round(i, 2)
    ))



home_reserve_overall: 0.1
home_subs_overall: 0.1
home_titular_overall: 0.17
home_attack_overall: 0.08
home_defend_overall: 0.08
away_reserve_overall: 0.1
away_subs_overall: 0.1
away_titular_overall: 0.11
away_attack_overall: 0.08
away_defend_overall: 0.09


In [46]:
import json

teams_dict = dict( zip( df.home , features.home_code.astype("category").cat.codes  ) )
with open(f"{ML_PATH}/teams_dictionary.json", 'w') as f:
    json.dump(teams_dict, f)