In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import poisson  
from scipy.interpolate import *
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import mean_squared_error

In [2]:
matches = pd.read_csv(R'C:matches.csv')
stats = pd.read_csv(R'C:stats.csv')
bets = pd.read_csv(R'C:bets.csv')
elo = pd.read_csv(R'C:Elo.txt')
fifa = pd.read_csv(R'C:Fifa.txt')



In [3]:
def matches_data_preprocessing(matches, stats,  LEAGUE_ID = 148):
    
    ProcessedData = matches
    ProcessedStats =stats
    ProcessedData = ProcessedData[ProcessedData.match_status == 'Finished']
    ProcessedData = ProcessedData[ProcessedData.league_id == LEAGUE_ID]
    ProcessedData = ProcessedData.drop_duplicates(subset='match_id', keep='first')
    ProcessedData['total_score'] = ProcessedData['match_hometeam_score'] + ProcessedData['match_awayteam_score']
    
    conditions = [
    (ProcessedData['match_hometeam_score'] - ProcessedData['match_awayteam_score'] == 0),
    (ProcessedData['match_hometeam_score'] - ProcessedData['match_awayteam_score'] > 0),
    (ProcessedData['match_hometeam_score'] - ProcessedData['match_awayteam_score'] < 0)]
    choices = ['0', '1', '2']
    
    ProcessedData['Match_Result_Flag'] = np.select(conditions, choices, default='Null')
    
    ProcessedData = ProcessedData.merge(ProcessedStats, how='left', left_on='match_id', right_on='match_id', \
                            suffixes=(False, False))[ProcessedData.columns.tolist() \
                            + ['home_CornerKicks','home_ShotsonGoal','away_CornerKicks','away_ShotsonGoal']]
    
 
    
    return ProcessedData

In [4]:
def details_data_preprocessing(bets, matches, bets_type =  ['odd_1','odd_2','odd_x'], remove_bookmaker = [] , RemoveOlderThan = 2592000):
    
    ProcessedBets =  bets
    remove_bookmaker  = ProcessedBets[['odd_bookmakers', 'value']].groupby(['odd_bookmakers'])['value'] \
                             .count() \
                             .reset_index(name='count') \
                             .sort_values(['count'], ascending=True) \
                             .head(5)

    ProcessedBets = ProcessedBets.drop_duplicates(subset=['match_id','odd_bookmakers', 'variable', 'odd_epoch'], keep='first')
    ProcessedBets = ProcessedBets[ProcessedBets['variable'].isin(bets_type)]
    ProcessedBets = ProcessedBets.merge(matches,
                                    how='left', left_on='match_id', right_on='match_id'
                                    , suffixes=(False, False))[ProcessedBets.columns.tolist() \
                            + ['epoch']]
    #ProcessedBets = ProcessedBets [ProcessedBets['odd_epoch'] - ProcessedBets['epoch']<= 2592000  ]
    ProcessedBets = ProcessedBets[~ProcessedBets['odd_bookmakers'].isin(remove_bookmaker)]
    bets_pivot = pd.pivot_table(ProcessedBets,index=['match_id','odd_bookmakers','odd_epoch'] ,columns='variable',values='value').reset_index()
    bets_pivot['prob_odd_1'] = 1/bets_pivot['odd_1']
    bets_pivot['prob_odd_x'] = 1/bets_pivot['odd_x']
    bets_pivot['prob_odd_2'] = 1/bets_pivot['odd_2']
    
    return bets_pivot
    

                             
    

In [5]:
def feature_extraction_last_matches(data, nmatch=5):
    
    df = pd.DataFrame()
    hometeam_exact_last_3 = []
    hometeam_exact_last_5 = []
    awayteam_exact_last_3 = []
    awayteam_exact_last_5 = []
    home_shots_3 = []
    home_shots_5 = []
    away_shots_3 = []
    away_shots_5 = []
    home_corner_3 = []
    home_corner_5 = []
    away_corner_3 = []
    away_corner_5 = []
    match_id = []
    for i in data['match_hometeam_id'].unique():
        data_created = data[data['match_hometeam_id']==i]
        for y in data_created['epoch']:
            data_epoch = data_created[data_created['epoch'] <= y ]
            data_last5 = data_epoch.sort_values(by=['epoch'],  ascending=False).head(nmatch)
            data_last3 = data_epoch.sort_values(by=['epoch'],  ascending=False).head(3)
            hometeam_exact_last_3.append(data_last3['match_hometeam_score'].mean())
            hometeam_exact_last_5.append(data_last5['match_hometeam_score'].mean())
            match_id.append(data_created[data_created['epoch'] == y ]['match_id'].values[0])
    
    for i in data['match_awayteam_id'].unique():
        data_created = data[data['match_awayteam_id']==i]
        for y in data_created['epoch']:
            data_epoch = data_created[data_created['epoch'] <= y ]
            data_last5 = data_epoch.sort_values(by=['epoch'],  ascending=False).head(nmatch)
            data_last3 = data_epoch.sort_values(by=['epoch'],  ascending=False).head(3)
            awayteam_exact_last_3.append(data_last3['match_awayteam_score'].mean())
            awayteam_exact_last_5.append(data_last5['match_awayteam_score'].mean())
            
    for i in data['match_awayteam_id'].unique():
        data_created = data[data['match_awayteam_id']==i]
        for y in data_created['epoch']:
            data_epoch = data_created[data_created['epoch'] <= y ]
            data_last5 = data_epoch.sort_values(by=['epoch'],  ascending=False).head(nmatch)
            data_last3 = data_epoch.sort_values(by=['epoch'],  ascending=False).head(3)
            home_shots_3.append(data_last3['home_ShotsonGoal'].mean())
            home_shots_5.append(data_last5['home_ShotsonGoal'].mean())
    
    for i in data['match_awayteam_id'].unique():
        data_created = data[data['match_awayteam_id']==i]
        for y in data_created['epoch']:
            data_epoch = data_created[data_created['epoch'] <= y ]
            data_last5 = data_epoch.sort_values(by=['epoch'],  ascending=False).head(nmatch)
            data_last3 = data_epoch.sort_values(by=['epoch'],  ascending=False).head(3)
            away_shots_3.append(data_last3['away_ShotsonGoal'].mean())
            away_shots_5.append(data_last5['away_ShotsonGoal'].mean())
            
    for i in data['match_awayteam_id'].unique():
        data_created = data[data['match_awayteam_id']==i]
        for y in data_created['epoch']:
            data_epoch = data_created[data_created['epoch'] <= y ]
            data_last5 = data_epoch.sort_values(by=['epoch'],  ascending=False).head(nmatch)
            data_last3 = data_epoch.sort_values(by=['epoch'],  ascending=False).head(3)
            home_corner_3.append(data_last3['home_CornerKicks'].mean())
            home_corner_5.append(data_last5['home_CornerKicks'].mean())
    
    for i in data['match_awayteam_id'].unique():
        data_created = data[data['match_awayteam_id']==i]
        for y in data_created['epoch']:
            data_epoch = data_created[data_created['epoch'] <= y ]
            data_last5 = data_epoch.sort_values(by=['epoch'],  ascending=False).head(nmatch)
            data_last3 = data_epoch.sort_values(by=['epoch'],  ascending=False).head(3)
            away_corner_3.append(data_last3['away_CornerKicks'].mean())
            away_corner_5.append(data_last5['away_CornerKicks'].mean())
            
            
    df['hometeam_exact_last_3'] = hometeam_exact_last_3
    df['hometeam_exact_last_5'] = hometeam_exact_last_5
    df['awayteam_exact_last_3'] = awayteam_exact_last_3
    df['awayteam_exact_last_5'] = awayteam_exact_last_5
    df['home_shots_3'] = home_shots_3
    df['home_shots_5'] = home_shots_5
    df['away_shots_3'] = away_shots_3
    df['away_shots_5'] = away_shots_5
    df['home_corner_3'] = home_corner_3
    df['home_corner_5'] = home_corner_5
    df['away_corner_3'] = away_corner_3
    df['away_corner_5'] = away_corner_5
    df['match_id'] = match_id
    
    
    
    return(df)

In [6]:
def feature_extraction_total_score(data):
    df_total = pd.DataFrame()
    
    for i in  data['match_id']:
        a = data[data['match_id']==i][['match_hometeam_id', 'match_awayteam_id', 'epoch', 'match_id']]
        identical_matches = data[(data['epoch']<a['epoch'].values[0]) & \
                        ((data['match_hometeam_id'] == a['match_hometeam_id'].values[0])| \
                        (data['match_hometeam_id'] == a['match_awayteam_id'].values[0])) & \
                        ((data['match_awayteam_id'] == a['match_awayteam_id'].values[0]) | \
                        (data['match_awayteam_id'] == a['match_hometeam_id'].values[0]))]
        a['prev_match_avg_score'] =  identical_matches['total_score'].mean()

        df_total = df_total.append(a)
        
    return (df_total)

In [7]:
matches = matches_data_preprocessing(matches, stats)

In [8]:
matches = matches.merge (elo, how='left', left_on = ['match_awayteam_id'], right_on = ['team_id'],suffixes = ('_away','_home'))
matches = matches.merge (elo, how='left', left_on = ['match_hometeam_id'], right_on = ['team_id'],suffixes = ('_away','_home'))
matches = matches.merge (fifa, how='left', left_on = ['match_awayteam_id'], right_on = ['team_id'],suffixes = ('_away','_home'))
matches = matches.merge (fifa, how='left', left_on = ['match_hometeam_id'], right_on = ['team_id'],suffixes = ('_away','_home'))






In [9]:
matches.columns

Index(['match_awayteam_id', 'match_hometeam_id', 'match_id', 'epoch',
       'match_status', 'match_live', 'match_hometeam_name',
       'match_awayteam_name', 'match_hometeam_score', 'match_awayteam_score',
       'match_hometeam_halftime_score', 'match_awayteam_halftime_score',
       'match_hometeam_extra_score', 'match_awayteam_extra_score',
       'match_hometeam_penalty_score', 'match_awayteam_penalty_score',
       'league_id', 'total_score', 'Match_Result_Flag', 'home_CornerKicks',
       'home_ShotsonGoal', 'away_CornerKicks', 'away_ShotsonGoal', 'rank_away',
       'team_away', 'team_id_away', 'elo_away', 'rank_home', 'team_home',
       'team_id_home', 'elo_home', 'team_id_away', 'team_away', 'ATT_away',
       'MID_away', 'DEF_away', 'OVR_away', 'team_id_home', 'team_home',
       'ATT_home', 'MID_home', 'DEF_home', 'OVR_home'],
      dtype='object')

In [10]:
cols = ['rank_away', 'team_away', 'team_id_away', 'rank_home', 'team_home','team_id_home', 'team_id_away', 'team_away',  'team_id_home', 'team_home']
matches = matches.drop(columns=cols)

In [11]:
matches.columns

Index(['match_awayteam_id', 'match_hometeam_id', 'match_id', 'epoch',
       'match_status', 'match_live', 'match_hometeam_name',
       'match_awayteam_name', 'match_hometeam_score', 'match_awayteam_score',
       'match_hometeam_halftime_score', 'match_awayteam_halftime_score',
       'match_hometeam_extra_score', 'match_awayteam_extra_score',
       'match_hometeam_penalty_score', 'match_awayteam_penalty_score',
       'league_id', 'total_score', 'Match_Result_Flag', 'home_CornerKicks',
       'home_ShotsonGoal', 'away_CornerKicks', 'away_ShotsonGoal', 'elo_away',
       'elo_home', 'ATT_away', 'MID_away', 'DEF_away', 'OVR_away', 'ATT_home',
       'MID_home', 'DEF_home', 'OVR_home'],
      dtype='object')

In [12]:
odds = details_data_preprocessing(bets,matches)

In [13]:
matches_featured_first = feature_extraction_last_matches(matches, nmatch=5)

In [14]:
matches_featured_second = feature_extraction_total_score(matches)

In [15]:
results = pd.merge(matches_featured_second, matches_featured_first, how='left', on='match_id')

In [16]:
odds =odds[['match_id', 'prob_odd_1', 'prob_odd_x', 'prob_odd_2']].groupby(['match_id'])[['prob_odd_1', 'prob_odd_x', 'prob_odd_2']].mean().reset_index() 

In [17]:
results.head()

Unnamed: 0,match_hometeam_id,match_awayteam_id,epoch,match_id,prev_match_avg_score,hometeam_exact_last_3,hometeam_exact_last_5,awayteam_exact_last_3,awayteam_exact_last_5,home_shots_3,home_shots_5,away_shots_3,away_shots_5,home_corner_3,home_corner_5,away_corner_3,away_corner_5
0,2619,2614,1505561400,13331,,0.0,0.0,1.0,1.0,3.0,3.0,4.0,4.0,5.0,5.0,5.0,5.0
1,2623,2626,1505570400,13329,,0.0,0.0,1.333333,1.2,5.666667,6.0,4.666667,3.8,4.0,7.2,10.666667,7.2
2,2621,2629,1505570400,13327,,1.0,1.0,2.0,2.2,5.0,5.0,7.666667,7.2,3.333333,3.4,7.666667,8.8
3,2654,2641,1505570400,13456,,0.0,0.0,1.333333,1.4,4.333333,4.2,3.0,2.6,7.666667,6.6,5.0,4.8
4,2616,2617,1505651400,13324,,0.0,0.0,1.0,0.6,5.666667,4.4,2.666667,2.4,7.0,6.4,3.666667,4.6


In [18]:
results = results.merge(matches, how='left', on=['match_id','match_hometeam_id', 'match_awayteam_id', 'epoch'], suffixes=(False, False))[results.columns.tolist() + ['Match_Result_Flag', 'elo_away',
       'elo_home', 'ATT_away', 'MID_away', 'DEF_away', 'OVR_away', 'ATT_home',
       'MID_home', 'DEF_home', 'OVR_home']]


In [19]:
results = results.merge(odds, how='left', on='match_id', suffixes=(False, False))[results.columns.tolist() + ['prob_odd_1', 'prob_odd_x', 'prob_odd_2']]


In [20]:
matches.head(2)

Unnamed: 0,match_awayteam_id,match_hometeam_id,match_id,epoch,match_status,match_live,match_hometeam_name,match_awayteam_name,match_hometeam_score,match_awayteam_score,...,elo_away,elo_home,ATT_away,MID_away,DEF_away,OVR_away,ATT_home,MID_home,DEF_home,OVR_home
0,2614,2619,13331,1505561400,Finished,0,Crystal Palace,Southampton,0.0,1.0,...,1642,1743,71.0,76.0,77.0,76.0,76.0,78.0,76.0,77.0
1,2626,2623,13329,1505570400,Finished,0,Watford,Manchester City,0.0,6.0,...,1993,1627,87.0,87.0,83.0,86.0,78.0,78.0,76.0,77.0


In [21]:
results = results.fillna(results.mean())

In [22]:
results.head(2)

Unnamed: 0,match_hometeam_id,match_awayteam_id,epoch,match_id,prev_match_avg_score,hometeam_exact_last_3,hometeam_exact_last_5,awayteam_exact_last_3,awayteam_exact_last_5,home_shots_3,...,MID_away,DEF_away,OVR_away,ATT_home,MID_home,DEF_home,OVR_home,prob_odd_1,prob_odd_x,prob_odd_2
0,2619,2614,1505561400,13331,2.76669,0.0,0.0,1.0,1.0,3.0,...,76.0,77.0,76.0,76.0,78.0,76.0,77.0,0.505297,0.253288,0.29747
1,2623,2626,1505570400,13329,2.76669,0.0,0.0,1.333333,1.2,5.666667,...,87.0,83.0,86.0,78.0,78.0,76.0,77.0,0.505297,0.253288,0.29747


In [23]:
results.columns

Index(['match_hometeam_id', 'match_awayteam_id', 'epoch', 'match_id',
       'prev_match_avg_score', 'hometeam_exact_last_3',
       'hometeam_exact_last_5', 'awayteam_exact_last_3',
       'awayteam_exact_last_5', 'home_shots_3', 'home_shots_5', 'away_shots_3',
       'away_shots_5', 'home_corner_3', 'home_corner_5', 'away_corner_3',
       'away_corner_5', 'Match_Result_Flag', 'elo_away', 'elo_home',
       'ATT_away', 'MID_away', 'DEF_away', 'OVR_away', 'ATT_home', 'MID_home',
       'DEF_home', 'OVR_home', 'prob_odd_1', 'prob_odd_x', 'prob_odd_2'],
      dtype='object')

In [24]:
def mean_absolute_percentage_error(y_true, y_pred): 
    #y_true, y_pred = np.array(y_true), np.array(y_pred)
    _abs = np.abs((y_true - y_pred) / (y_true + 1))
    return np.mean(_abs) * 100

In [25]:

def rmse(y_true, y_pred):
    rms = np.sqrt(mean_squared_error(y_true, y_pred.columns.max))
    return rms

In [26]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


# Train test split
X_train, X_test, y_train, y_test = train_test_split(results.drop(columns = ['match_hometeam_id', 'match_awayteam_id', 'epoch', 'match_id']), 
                                                        results['Match_Result_Flag'], 
                                                        test_size=0.2)


# Hyper Parameter Tuning

# Random Forest HPT
param_grid = {
    'learning_rate':[0.001], ##, 0.005, 0.01, 0.05, 0.1, 0.15, 0.5], 
    'max_depth': [3, 5 ],##, 7, 9, 11, 13, 15],
    'n_estimators':[100] ##, 200, 300, 400]
}
# Create a based model
rf = GradientBoostingClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5, n_jobs = -1, verbose = 2, scoring='accuracy')

grid_search.fit(results.drop(columns = ['match_hometeam_id', 'match_awayteam_id', 'epoch', 'match_id']),
                results['Match_Result_Flag'])
        
grid_search.best_params_

best_grid_rf = grid_search.best_estimator_


# Fitting the Model by Using the Best Parameters
classifier = best_grid_rf
classifier.fit(X_train, y_train) 


# Prediction
y_pred = classifier.predict_proba(X_test)
print('Tuned learning_rate = ' + str(grid_search.best_params_['learning_rate']))
print('Tuned max_depth = ' + str(grid_search.best_params_['max_depth']))
print('Tuned n_estimators = ' + str(grid_search.best_params_['n_estimators']))


# Prediction
y_pred_train = classifier.predict_proba(X_train)




Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   11.2s finished


Tuned learning_rate = 0.001
Tuned max_depth = 3
Tuned n_estimators = 100


In [None]:
odds = details_data_preprocessing(bets, matches, bets_type =  ['odd_1','odd_2','odd_x'], remove_bookmaker = [] , RemoveOlderThan = 2592000)

In [39]:
y_pred[0][0]

0.49773246029413065

In [51]:
y_test.ilo[]

5575     0
3517     0
13474    1
11209    1
3052     1
6713     2
5553     0
6103     1
1140     1
2098     0
11736    1
5797     1
6374     0
12798    2
60       2
11504    1
7968     1
11503    1
13193    1
9361     1
2290     1
9400     1
2933     1
8962     2
3549     0
2496     1
7868     1
13331    1
3239     1
10657    1
        ..
1190     1
5698     1
6206     0
7509     2
9085     1
9957     0
7663     1
10999    1
3735     2
3668     0
9766     0
5488     0
8786     1
38       2
6668     0
13223    2
4149     0
11449    1
12803    2
9974     1
12226    2
2040     0
8662     1
9775     0
9075     1
4778     1
1741     0
5574     0
10735    0
4903     1
Name: Match_Result_Flag, Length: 2721, dtype: object

In [44]:
def rps (Prob_1, Prob_2, Prob_0, actual):
    if(actual==0):
        rps = (Prob_1 - 0)^2 + (Prob_1 + Prob_0 - 1) ^2
    if (actual==1):
        rps = (Prob_1 -1)^2 + (Prob_1 + Prob_0 - 1) ^2
    if (actual==2): 
        rps = (Prob_1 -1)^2 + (Prob_1 + Prob_0 - 1) ^2
        
    return (rps/2)

In [49]:
rps (y_pred[0][0], y_pred[0][1], y_pred[0][2], y_test[1])

KeyError: 1