In [1]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [2]:
conn = sqlite3.connect('MatchDB.db')
df = pd.read_sql_query("SELECT * FROM Expected_Games", conn)

In [3]:
df = df.drop(['HomeTeam', 'AwayTeam', 'League', 'Country', 'date', 'id', 'match_id', 'HGames', 'AGames'], axis=1)
df = df.dropna()

In [4]:
home_df = df.drop(['HGoals', 'AGoals', 'xAwayPoss', 'xAwayShots', 'xAwaySonT', 'xAwaySoffT', 'xHomeBS', 'xAwayCor', 'xHomeOff', 'xAwayOff', 'xHomeFoul', 'xHomeYellow', 'xAwayYellow', 'xHomeRed', 'xAwayRed', 'xAwayPass', 'xAwayAccPass', 'xAwayPassOff', 'xAwayAccLongB', 'xAwayAccLongBpercent', 'xAwayAccCross', 'xAwayAccCrosspercent', 'xAwaySuccDribb', 'xAwaySuccDribbpercent', 'xHomeDuelsW', 'xHomeTackW', 'xHomeTackWpercent', 'xHomeInt', 'xHomeClear'], axis = 1)
away_df = df.drop(['HGoals', 'AGoals', 'xHomePoss', 'xHomeShots', 'xHomeSonT', 'xHomeSoffT', 'xAwayBS', 'xHomeCor', 'xHomeOff', 'xAwayOff', 'xAwayFoul', 'xHomeYellow', 'xAwayYellow', 'xHomeRed', 'xAwayRed', 'xHomePass', 'xHomeAccPass', 'xHomePassOff', 'xHomeAccLongB', 'xHomeAccLongBpercent', 'xHomeAccCross', 'xHomeAccCrosspercent', 'xHomeSuccDribb', 'xHomeSuccDribbpercent', 'xAwayDuelsW', 'xAwayTackW', 'xAwayTackWpercent', 'xAwayInt', 'xAwayClear'], axis = 1)
home_targ = df['HGoals']
away_targ = df['AGoals']

In [5]:
print(home_df.shape)
print(away_df.shape)

(25883, 21)
(25883, 21)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(home_df, home_targ, test_size=0.2, random_state=42)

In [7]:
my_pipe = Pipeline ([
        ('imputer', SimpleImputer(strategy='median')),
        ('std_scaler', StandardScaler()),
])
X_train = my_pipe.fit_transform(X_train)

In [8]:
param_grid = [{'n_estimators': [2, 5, 10, 100]}]

forest_reg = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(forest_reg, param_grid, cv=3,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(X_train, y_train)

In [9]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.819176,0.02331,0.004195,0.00029,2,{'n_estimators': 2},-2.330411,-2.275427,-2.319219,-2.308353,0.023726,4,-0.659483,-0.693495,-0.667071,-0.67335,0.014578
1,2.077798,0.017556,0.008927,0.000744,5,{'n_estimators': 5},-1.835943,-1.787644,-1.818968,-1.814185,0.020006,3,-0.375923,-0.395488,-0.385653,-0.385688,0.007988
2,4.258989,0.123661,0.015329,0.000888,10,{'n_estimators': 10},-1.686184,-1.641849,-1.643159,-1.657064,0.020598,2,-0.280005,-0.294804,-0.29464,-0.289816,0.006938
3,40.061162,0.628843,0.140749,0.000622,100,{'n_estimators': 100},-1.55095,-1.481102,-1.475202,-1.502418,0.034402,1,-0.205514,-0.210393,-0.211989,-0.209299,0.002754


In [10]:
grid_search.best_params_

{'n_estimators': 100}

In [11]:
X_test = my_pipe.transform(X_test)

In [12]:
prediction = grid_search.predict(X_test)

In [13]:
mse = mean_squared_error(y_test, prediction)
rmse = np.sqrt(mse)
rmse

1.2214519047633396

In [14]:
mean_value = df["HGoals"].mean()
mean_value

1.481551597573697

In [15]:
rmse / mean_value

0.8244410162721861

In [16]:
from sklearn.linear_model import LinearRegression

In [17]:
reg = LinearRegression().fit(X_train, y_train)

In [18]:
score = reg.score(X_train, y_train)

In [19]:
result = reg.predict(X_test)

In [31]:
y_test.value_counts()

1    1690
2    1326
0    1194
3     607
4     234
5      91
6      28
7       6
8       1
Name: HGoals, dtype: int64

In [40]:
game = 5000
print(result[game])
print(y_test[game])

1.2077943406583036


KeyError: 5000