In [1]:
import pandas as pd 
import numpy as np
from sklearn.metrics import accuracy_score, precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [469]:
matches = pd.read_csv("version3.csv")

In [2]:
def odds_numeric(matches):
    odds = matches['hOddsFraction']
    nOdds = []
    for odd in odds:
        split = odd.split("/")
        nOdds.append(round(int(split[0]) / int(split[1]),2))
    
    matches['h_nOdds'] = nOdds
    
    
    odds = matches['aOddsFraction']
    nOdds = []
    for odd in odds:
        split = odd.split("/")
        nOdds.append(round(int(split[0]) / int(split[1]),2))
    
    matches['a_nOdds'] = nOdds
    return matches


In [3]:
def encodes(matches):
    matches['hTeamCode'] = matches['hTeamName'].astype('category').cat.codes
    matches['aTeamCode'] = matches['aTeamName'].astype('category').cat.codes
    return matches
def possession(matches):
    matches['hBall possession'] = matches['hBall possession'].str.rstrip('%').astype(float) / 100
    matches['aBall possession'] = matches['aBall possession'].str.rstrip('%').astype(float) / 100
    return matches

In [4]:
def date_time_codes(matches):
    matches['dateTime'] = pd.to_datetime(matches['dateTime'])
    matches['dateCode'] = matches['dateTime'].dt.strftime('%Y%m%d').astype(int)
    matches['timeCode'] = matches['dateTime'].dt.strftime('%H%m%s').astype(int)
    return matches

In [5]:
def bet_fail(matches):
    matches['betFail'] = np.where(
        ((matches['hWin'] == 0) & (matches['h_nOdds'] < 1)) |
        ((matches['hWin'] == 1) & (matches['h_nOdds'] > 1)),
        0,  
        1   
    )
    return matches


In [6]:
def init(matches):
    matches = odds_numeric(matches)
    matches = encodes(matches)
    matches = date_time_codes(matches)
    matches = bet_fail(matches)
    matches = possession(matches)
    return matches

In [475]:
matches = init(matches)


In [483]:
matches.columns

Index(['hTotal shots', 'aTotal shots', 'hFouls', 'aFouls', 'hExpected goals',
       'aExpected goals', 'hFree kicks', 'aFree kicks', 'hTackles', 'aTackles',
       'hYellow cards', 'aYellow cards', 'hPasses', 'aPasses', 'hBig chances',
       'aBig chances', 'hRed cards', 'aRed cards', 'hGoalkeeper saves',
       'aGoalkeeper saves', 'hBall possession', 'aBall possession',
       'hCorner kicks', 'aCorner kicks', 'hOddsFraction', 'aOddsFraction',
       'hScore', 'aScore', 'aWin', 'draw', 'hWin', 'hVsTeamWins',
       'aVsTeamWins', 'aVsDraws', 'hVsManWins', 'aVsManWins', 'dateTime',
       'hTeamName', 'aTeamName', 'h_nOdds', 'a_nOdds', 'hTeamCode',
       'aTeamCode', 'dateCode', 'timeCode', 'betFail'],
      dtype='object')

In [17]:
def rolling_averages (group,cols,new_cols):
    group = group.sort_values('timeCode')
    rolling_stats = group[cols].rolling(3,closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    group = get_form(group)
    return group

def weighted_form(arr):
    weights = np.arange(1, len(arr) + 1)
    
    ## need to account for larger match form 

    return np.sum(arr * weights) / np.sum(weights)
    
## use neural networks 
def get_form (group):
    group['hForm'] = group['hWin'].rolling(3,closed='left').apply(weighted_form, raw=True)
    group['aForm'] = group['aWin'].rolling(3,closed='left').apply(weighted_form, raw=True)
    group = group.dropna(subset=['hForm','aForm'])
    return group

In [14]:
def init_rolling(matches):
    cols = 'hBall possession,aBall possession,hExpected goals,aExpected goals,hBig chances,aBig chances,hTotal shots,aTotal shots,hGoalkeeper saves,aGoalkeeper saves,hCorner kicks,aCorner kicks,hFouls,aFouls,hPasses,aPasses,hTackles,aTackles,hFree kicks,aFree kicks,hYellow cards,aYellow cards,aRed cards,hRed cards,hScore,aScore'.split(",")
    new_cols = [f"{c}_rolling" for c in cols]
    grouped_matches = matches.groupby('hTeamName')
    matches_rolling = matches.groupby('hTeamName').apply (lambda x: rolling_averages(x,cols,new_cols))
    matches_rolling.droplevel('hTeamName')
    return matches_rolling

In [20]:
def make_prediction (data,predictors):
    rf = RandomForestClassifier (n_estimators = 50, min_samples_split = 10, random_state = 1)
    X = data[predictors]
    y = data['betFail']
    X_train,X_test,y_train,y_test = train_test_split (X,y,random_state = 0)
    rf.fit (X_train,y_train)
    preds = rf.predict(X_test)
    acc = accuracy_score (y_test,preds)
    precision = precision_score (y_test,preds)
    return y_test,preds

In [10]:
new_predictors = ['h_nOdds',
 'hTeamCode',
 'aTeamCode',
 'dateCode',
 'timeCode',
 'a_nOdds',
 'hBall possession_rolling',
 'aBall possession_rolling',
 'hExpected goals_rolling',
 'aExpected goals_rolling',
 'hBig chances_rolling',
 'aBig chances_rolling',
 'hTotal shots_rolling',
 'aTotal shots_rolling',
 'hGoalkeeper saves_rolling',
 'aGoalkeeper saves_rolling',
 'hCorner kicks_rolling',
 'aCorner kicks_rolling',
 'hFouls_rolling',
 'aFouls_rolling',
 'hPasses_rolling',
 'aPasses_rolling',
 'hTackles_rolling',
 'aTackles_rolling',
 'hFree kicks_rolling',
 'aFree kicks_rolling',
 'hYellow cards_rolling',
 'aYellow cards_rolling',
 'aRed cards_rolling',
 'hRed cards_rolling',
 'hScore_rolling',
 'aScore_rolling',
 'hForm',
 'aForm',
 'hVsTeamWins',
 'aVsTeamWins',
 'aVsDraws',
 'hVsManWins',
 'aVsManWins']


#matches_rolling['betFail'].value_counts()

In [23]:
matches = pd.read_csv("version3.csv")
matches = init(matches)

matches_rolling = init_rolling (matches)
y_test,preds = make_prediction (matches_rolling,new_predictors)
#new_predictors
new_predictors

  matches_rolling = matches.groupby('hTeamName').apply (lambda x: rolling_averages(x,cols,new_cols))


['h_nOdds',
 'hTeamCode',
 'aTeamCode',
 'dateCode',
 'timeCode',
 'a_nOdds',
 'hBall possession_rolling',
 'aBall possession_rolling',
 'hExpected goals_rolling',
 'aExpected goals_rolling',
 'hBig chances_rolling',
 'aBig chances_rolling',
 'hTotal shots_rolling',
 'aTotal shots_rolling',
 'hGoalkeeper saves_rolling',
 'aGoalkeeper saves_rolling',
 'hCorner kicks_rolling',
 'aCorner kicks_rolling',
 'hFouls_rolling',
 'aFouls_rolling',
 'hPasses_rolling',
 'aPasses_rolling',
 'hTackles_rolling',
 'aTackles_rolling',
 'hFree kicks_rolling',
 'aFree kicks_rolling',
 'hYellow cards_rolling',
 'aYellow cards_rolling',
 'aRed cards_rolling',
 'hRed cards_rolling',
 'hScore_rolling',
 'aScore_rolling',
 'hForm',
 'aForm',
 'hVsTeamWins',
 'aVsTeamWins',
 'aVsDraws',
 'hVsManWins',
 'aVsManWins']

In [378]:
matches_rolling.to_csv('df.csv',index=False)

In [22]:
from sklearn.metrics import classification_report, confusion_matrix

print("Target distribution:")
print(matches_rolling['betFail'].value_counts(normalize=True))

print("\nConfusion matrix:")
print(confusion_matrix(y_test, preds))

print("\nClassification report:")
print(classification_report(y_test, preds))

Target distribution:
betFail
1    0.62234
0    0.37766
Name: proportion, dtype: float64

Confusion matrix:
[[22  0]
 [ 0 25]]

Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        22
           1       1.00      1.00      1.00        25

    accuracy                           1.00        47
   macro avg       1.00      1.00      1.00        47
weighted avg       1.00      1.00      1.00        47

