In [149]:
import glob
import os

import pandas as pd

In [150]:
# glob.glob returns matches in arbitrary, filesystem-dependent order. Sorting the
# paths fixes the row order of the concatenated frame, so the seeded
# train_test_split calls further down pick the same rows on every machine and run.
csv_files = sorted(glob.glob("../data/clean/csv/*.csv"))

In [151]:
# Fail early with an actionable message: with an empty list pd.concat raises the
# opaque "No objects to concatenate" ValueError, which hides the real cause
# (a wrong working directory or a missing/empty data folder).
if not csv_files:
    raise ValueError(
        "csv_files is empty: no CSV files matched the input glob pattern; "
        "check that the cleaned data directory exists and is populated."
    )

# Read every matched CSV and stack them into one frame; ignore_index=True
# discards the per-file row labels and assigns a fresh 0..n-1 index.
df_list = [pd.read_csv(file) for file in csv_files]
df = pd.concat(df_list, ignore_index=True)

In [152]:
# Sanity check: (rows, columns) of the combined frame.
df.shape

(1791, 22)

In [153]:
# Regression target for the home model: the MarketAverageHomeWinOdds column.
y_home_odds = df.loc[:, "MarketAverageHomeWinOdds"]

In [154]:
# Feature columns shared by every odds model trained in this notebook.
# NOTE(review): HomeTeam / AwayTeam are fed to StandardScaler further down, so they
# are presumably already numeric-encoded in the cleaned CSVs -- confirm.
win_feature_columns = [
    "HomeTeam",
    "AwayTeam",
    "HomeLastMatch",
    "AwayLastMatch",
    "HomeAvgGoals",
    "AwayAvgGoals",
    "HomeWinStreak",
    "AwayWinStreak",
    "HomeAvgConceded",
    "AwayAvgConceded",
    "HomeGoalDiffAvg",
    "AwayGoalDiffAvg",
]
X_win = df[win_feature_columns]

In [155]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# reproducible for a given row order.
home_split = train_test_split(
    X_win,
    y_home_odds,
    test_size=0.2,
    random_state=10,
)
X_train_home, X_test_home, y_train_home, y_test_home = home_split

In [156]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean / standard deviation from the training rows only,
# then apply that same transform to both partitions so that no test-set
# statistics leak into the scaler.
scaler_home = StandardScaler().fit(X_train_home)
X_train_home_scaled = scaler_home.transform(X_train_home)
X_test_home_scaled = scaler_home.transform(X_test_home)

In [157]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# Gradient-boosted regressor for the home-win odds; fit() returns the estimator
# itself, so construction and training are chained.
home_odds_model = GradientBoostingRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    random_state=10,
).fit(X_train_home_scaled, y_train_home)

# Score on the held-out rows with mean absolute error (same units as the target).
y_home_pred = home_odds_model.predict(X_test_home_scaled)
home_odds_mae = mean_absolute_error(y_true=y_test_home, y_pred=y_home_pred)

In [158]:
# Hold-out mean absolute error of the home-odds model, in the units of the target.
home_odds_mae

0.7939479726041732

In [159]:
# Display the home-odds target series (pandas elides the middle rows).
y_home_odds

0       1.55
1       1.32
2       1.47
3       2.08
4       2.58
        ... 
1786    3.80
1787    3.22
1788    6.85
1789    3.36
1790    2.71
Name: MarketAverageHomeWinOdds, Length: 1791, dtype: float64

In [160]:
import joblib

# joblib.dump does not create missing parent directories. This is the first write
# into ../data/models, so make sure the folder exists; otherwise the cell fails on
# a fresh checkout under Restart & Run All.
os.makedirs("../data/models", exist_ok=True)

# The model was fit on StandardScaler-transformed features, so at inference time it
# must be paired with scaler_home (persisted in the final cell of this notebook).
joblib.dump(home_odds_model, "../data/models/home_odds_model.joblib")

['../data/models/home_odds_model.joblib']

In [161]:
# Targets for the two remaining models: away-win odds and over-2.5-goals odds.
away_target_column = "MarketAverageAwayWinOdds"
over25_target_column = "MarketAverageOver2.5Goals"

y_away_odds = df[away_target_column]
y_over25_odds = df[over25_target_column]

In [162]:
# Same features, test fraction and seed as the home split; only the target differs.
away_split = train_test_split(
    X_win,
    y_away_odds,
    test_size=0.2,
    random_state=10,
)
X_train_away, X_test_away, y_train_away, y_test_away = away_split

over25_split = train_test_split(
    X_win,
    y_over25_odds,
    test_size=0.2,
    random_state=10,
)
X_train_over25_odds, X_test_over25_odds, y_train_over25_odds, y_test_over25_odds = over25_split

In [163]:
# One scaler per target model, each fitted on its own training partition only.
scaler_away = StandardScaler().fit(X_train_away)
X_train_away_scaled = scaler_away.transform(X_train_away)
X_test_away_scaled = scaler_away.transform(X_test_away)

scaler_over25_odds = StandardScaler().fit(X_train_over25_odds)
X_train_over25_odds_scaled = scaler_over25_odds.transform(X_train_over25_odds)
X_test_over25_odds_scaled = scaler_over25_odds.transform(X_test_over25_odds)


In [164]:
def _fit_and_score_gbr(X_train, y_train, X_test, y_test):
    """Fit a gradient-boosted regressor and score it on the hold-out rows.

    Uses the same hyper-parameters as the home-odds model
    (500 trees, learning rate 0.05, depth 5, seed 10).

    Returns:
        (model, predictions, mae): the fitted estimator, its predictions for
        ``X_test``, and the mean absolute error of those predictions
        against ``y_test``.
    """
    model = GradientBoostingRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=5,
        random_state=10,
    )
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return model, predictions, mean_absolute_error(y_test, predictions)


# Away-win odds model.
away_odds_model, y_away_pred, away_odds_mae = _fit_and_score_gbr(
    X_train_away_scaled, y_train_away, X_test_away_scaled, y_test_away
)

# Over-2.5-goals odds model.
over25_odds_model, y_over25_odds_pred, over25_odds_mae = _fit_and_score_gbr(
    X_train_over25_odds_scaled,
    y_train_over25_odds,
    X_test_over25_odds_scaled,
    y_test_over25_odds,
)

In [165]:
# Hold-out mean absolute error of the away-odds model, in the units of the target.
away_odds_mae

1.3269060715839047

In [166]:
# Displays every hold-out prediction of the away-odds model.
# NOTE(review): the recorded output contains a prediction below 1.0 (0.6578...);
# if the target is decimal odds, such a value is not a valid price -- consider
# clipping the predictions or modelling a transformed target. Confirm with the data owner.
y_away_pred

array([ 1.22456298,  4.22785007,  4.03749679,  4.43037465,  2.91839624,
        3.72697327,  5.76225652,  3.08298104,  7.9739163 ,  2.79236702,
        3.37187775,  9.22613247,  6.5688978 , 13.17577878,  4.5397565 ,
        4.48961412,  6.52274746, 11.63945578,  2.77723889,  2.54024648,
        4.05968835,  4.05560394,  3.36687774,  2.33908413, 11.69744971,
        4.19168907,  6.63771855,  3.79068994,  2.41974045,  2.15693705,
        4.02433326,  0.65781358,  2.48320372,  3.07783074,  3.02213476,
        3.3654826 ,  1.68351771,  4.88408546, 12.43965906,  5.8739536 ,
        2.78607323,  2.53033969, 13.13068749,  4.07984055,  2.97676221,
        4.21630585,  2.08080102,  2.89262814,  3.14376941,  4.56781752,
        2.65113042,  5.00648272,  7.4285481 ,  3.12429524,  4.24552326,
        1.47704318,  4.57242849,  6.31540592,  7.97533029,  2.18797772,
        6.22604666,  2.84979604,  5.71267716,  1.54043275,  3.64277254,
        7.44146403,  2.50857483,  4.95062649,  3.06622357,  2.50

In [167]:
# Hold-out mean absolute error of the over-2.5-goals odds model, in the units of the target.
over25_odds_mae

0.1470069807610046

In [168]:
# Displays every hold-out prediction of the over-2.5-goals odds model.
y_over25_odds_pred

array([1.50635235, 2.14284794, 1.7401179 , 1.84428163, 1.90360026,
       1.98618619, 1.76846677, 1.81091303, 1.86964726, 2.0771505 ,
       1.65699368, 1.62431396, 1.77977683, 1.54603685, 1.88902246,
       1.94381279, 1.51409406, 1.54742292, 2.12215604, 2.06533752,
       2.11107508, 1.58077098, 1.83542691, 1.75830628, 1.40587093,
       1.61519477, 1.73770921, 2.02510383, 1.97809085, 1.61655733,
       1.76234479, 1.51095179, 1.54522206, 1.98171776, 1.98642996,
       2.12386876, 1.76349655, 1.72130233, 1.40963675, 1.64737892,
       1.92862075, 1.88614111, 1.38080856, 1.86799275, 1.818832  ,
       1.73719123, 1.85378256, 1.51406986, 1.98099716, 2.08063555,
       1.8710514 , 1.76180601, 1.64640311, 1.99350287, 1.56933985,
       1.64590006, 1.56729199, 1.58423013, 1.81608294, 1.78151247,
       1.63388453, 1.72816281, 1.67852717, 1.70811638, 1.59870754,
       1.62028056, 1.68852161, 1.86763977, 2.18443905, 1.82556981,
       1.7379277 , 1.72735862, 1.61973287, 1.69976307, 2.00226

In [169]:
# Persist every fitted estimator and the scaler each one was trained with.
# The home model is written again here, overwriting the file saved earlier.
artifacts_to_save = [
    (home_odds_model, "../data/models/home_odds_model.joblib"),
    (away_odds_model, "../data/models/away_odds_model.joblib"),
    (over25_odds_model, "../data/models/over25_odds_model.joblib"),
    (scaler_away, "../data/models/scaler_away_odds.pkl"),
    (scaler_over25_odds, "../data/models/scaler_over25_odds.pkl"),
    (scaler_home, "../data/models/scaler_home_odds.pkl"),
]
for fitted_object, output_path in artifacts_to_save:
    last_written = joblib.dump(fitted_object, output_path)

# Show the path list returned by the final dump as the cell output.
last_written

['../data/models/scaler_home_odds.pkl']