In [1]:
import pandas as pd 
import numpy as np
import joblib

from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor



In [2]:
ML_PATH = "data/ml"

In [3]:
 
def get_fitting_summary(regressor, data, target):
    predictions = regressor.predict(data)
    errors_squared = (predictions - target) ** 2
    
    print('Mean Squared Error:', round(np.mean(errors_squared), 2), 'degrees.')

    score = regressor.score(data, target)
    print('R2:', round(score, 3))

In [4]:
def load_database():
    df = pd.read_csv(f"{ML_PATH}/df_history.csv")
    df["diff_h_a"] = df.GF-df.GA
    return df 

In [5]:
def split_database(df):
    target_name = "diff_h_a"
    not_usefull_columns = [
        target_name,
        'home',
        'away',
        'GF',
        'GA',    
        ]

    features = df.drop(columns=not_usefull_columns)
    target = df[target_name]
    return features, target
    

In [6]:

def get_encoder(data, categoical_columns, numeric_columns):

    float_columns_selector = selector(dtype_include="float")
    int_columns_selector = selector(dtype_include="int")
    

    numerical = data[numeric_columns]
    categorical = data[categoical_columns]

    categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
    numerical_preprocessor = StandardScaler()

    preprocessor = ColumnTransformer([
        ('one-hot-encoder', categorical_preprocessor, categoical_columns),
        ('standard_scaler', numerical_preprocessor, numeric_columns)])
    return preprocessor

In [8]:
numeric_column = [
    "home_last_wins",
    "home_last_draws",
    "home_last_loses",
    "away_last_wins",
    "away_last_draws",
    "away_last_loses",
]
categoric_column = [
    "home_code",
    "away_code",
]

df = load_database()
features, target = split_database(df)
preprocessor = get_encoder(features, categoric_column, numeric_column)

In [32]:
# Fitting Random Forest Regression to the dataset

print("Random Forest Regression")
regressor = RandomForestRegressor(n_estimators = 150, random_state = 42)
model = make_pipeline(preprocessor, regressor)
model.fit(features, target)

Random Forest Regression


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('one-hot-encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['home_code', 'away_code']),
                                                 ('standard_scaler',
                                                  StandardScaler(),
                                                  ['home_last_wins',
                                                   'home_last_draws',
                                                   'home_last_loses',
                                                   'away_last_wins',
                                                   'away_last_draws',
                                                   'away_last_loses'])])),
                ('randomforestregressor',
                 RandomForestRegressor(n_estimators=150, random_state=42))])

In [33]:
processed_features = preprocessor.fit_transform(features)

predictions = model.predict(features)
errors_squared = (predictions - target) ** 2

print('Mean Squared Error:', round(np.mean(errors_squared), 2), 'degrees.')

score = regressor.score(processed_features, target)
print('R2:', round(score, 3))

Mean Squared Error: 0.59 degrees.
R2: 0.821


In [34]:

joblib.dump(model, f'{ML_PATH}/random_forest.pkl', compress = 3 )


['data/ml/random_forest.pkl']

In [35]:

importances = regressor.feature_importances_
for value, var in zip(importances, list(features)):
    print(f"{var}: {value * 1000 :.4}")



home_code: 11.74
away_code: 10.27
home_last_wins: 1.697
home_last_draws: 4.127
home_last_loses: 8.441
away_last_wins: 1.326
away_last_draws: 7.439
away_last_loses: 4.528
