In [1]:
import pandas as pd 
import numpy as np
import joblib

from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor



In [2]:
ML_PATH = "data/ml"

In [3]:
 
def get_fitting_summary(regressor, data, target):
    predictions = regressor.predict(data)
    errors_squared = (predictions - target) ** 2
    
    print('Mean Squared Error:', round(np.mean(errors_squared), 2), 'degrees.')

    score = regressor.score(data, target)
    print('R2:', round(score, 3))

In [4]:
def load_database():
    df = pd.read_csv(f"{ML_PATH}/df_history.csv")
    df["diff_h_a"] = df.GF-df.GA
    


    return df 

In [5]:
def split_database(df):
    target_name = "diff_h_a"
    not_usefull_columns = [
        target_name,
        'home',
        'away',
        'GF',
        'GA',    
        ]

    features = df.drop(columns=not_usefull_columns)
    target = df[target_name]
    return features, target
    

In [6]:

def get_encoder(data, categoical_columns, numeric_columns):

    float_columns_selector = selector(dtype_include="float")
    int_columns_selector = selector(dtype_include="int")
    

    numerical = data[numeric_columns]
    categorical = data[categoical_columns]

    categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
    numerical_preprocessor = StandardScaler()

    preprocessor = ColumnTransformer([
        ('one-hot-encoder', categorical_preprocessor, categoical_columns),
        ('standard_scaler', numerical_preprocessor, numeric_columns)])
    return preprocessor

In [7]:
numeric_column = [
    "last_wins",
    "last_draws",
    "last_loses",
]
categoric_column = [
    "home_code",
    "away_code",
]

df = load_database()
features, target = split_database(df)
preprocessor = get_encoder(features, categoric_column, numeric_column)

In [12]:
# Fitting Random Forest Regression to the dataset

print("Random Forest Regression")
regressor = RandomForestRegressor(n_estimators = 100, random_state = 42)
model = make_pipeline(preprocessor, regressor)
model.fit(features, target)

Random Forest Regression


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('one-hot-encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['home_code', 'away_code']),
                                                 ('standard_scaler',
                                                  StandardScaler(),
                                                  ['last_wins', 'last_draws',
                                                   'last_loses'])])),
                ('randomforestregressor',
                 RandomForestRegressor(random_state=42))])

In [14]:
predictions = model.predict(features)
errors_squared = (predictions - target) ** 2

print('Mean Squared Error:', round(np.mean(errors_squared), 2), 'degrees.')

score = regressor.score(features, target)
print('R2:', round(score, 3))

Mean Squared Error: 1.31 degrees.


ValueError: Number of features of the model must match the input. Model n_features is 103 and input n_features is 5 

In [15]:
model.

ValueError: Number of features of the model must match the input. Model n_features is 103 and input n_features is 5 

In [10]:
get_fitting_summary(regressor, features, target)

ValueError: Number of features of the model must match the input. Model n_features is 103 and input n_features is 5 

In [None]:

joblib.dump(model, f'{ML_PATH}/random_forest.pkl', compress = 3 )


['data/ml/random_forest.pkl']

In [None]:

importances = regressor.feature_importances_
for i,j in zip(importances, list(features)):
    print("{}: {}".format(
        j,
        round(i, 2)
    ))



home_reserve_overall: 0.1
home_subs_overall: 0.1
home_titular_overall: 0.17
home_attack_overall: 0.08
home_defend_overall: 0.08
away_reserve_overall: 0.1
away_subs_overall: 0.1
away_titular_overall: 0.11
away_attack_overall: 0.08
away_defend_overall: 0.09


In [None]:
import json

teams_dict = dict( zip( df.home , features.home_code.astype("category").cat.codes  ) )
with open(f"{ML_PATH}/teams_dictionary.json", 'w') as f:
    json.dump(teams_dict, f)