# Predicting AFL Winners
Creating a model to predict winners in 2022

## Imports and cleaning

In [1]:
import pandas as pd
import numpy as np

In [2]:
games_df = pd.read_csv('data/games.csv')
stats_df = pd.read_csv('data/stats.csv')
players_df = pd.read_csv('data/players.csv')

In [3]:
# Replace all NaN rainfall values with 0, assuming no reading
games_df['rainfall'].replace(np.NaN, 0.0, inplace = True)

In [4]:
import datetime
# games_df['date'] = pd.to_datetime(games_df['date'])
games_df.head()
# games_df[games_df['date'] == '18-Mar-2021']


games_df['date'] = games_df['date'].apply(lambda x: datetime.datetime.strptime(x, '%d-%b-%Y').strftime('%d-%b-%y'))


In [5]:
games_df['date'] = pd.to_datetime(games_df['date'])

### Import and clean odds dataset

In [6]:
odds = pd.read_csv('data/odds.csv')  
odds.columns = odds.iloc[0]
odds = odds.drop(index = 0)
odds.drop(columns = ['Total Score Open', 'Total Score Min',
       'Total Score Max', 'Total Score Close', 'Total Score Over Open',
       'Total Score Over Min', 'Total Score Over Max',
       'Total Score Over Close', 'Total Score Under Open',
       'Total Score Under Min', 'Total Score Under Max',
       'Total Score Under Close', 'Notes', 'Bookmakers Surveyed', 'Home Odds Open', 'Home Odds Min',
       'Home Odds Max', 'Home Odds Close', 'Away Odds Open', 'Away Odds Min',
       'Away Odds Max', 'Away Odds Close', 'Home Line Open', 'Home Line Min',
       'Home Line Max', 'Home Line Close', 'Away Line Open', 'Away Line Min',
       'Away Line Max', 'Away Line Close', 'Home Line Odds Open',
       'Home Line Odds Min', 'Home Line Odds Max', 'Home Line Odds Close',
       'Away Line Odds Open', 'Away Line Odds Min', 'Away Line Odds Max',
       'Away Line Odds Close', 'Play Off Game?', 'Kick Off (local)', 'Home Goals',
       'Home Behinds', 'Away Goals', 'Away Behinds'], inplace = True)

In [7]:
odds = odds.rename(columns = {
    'Date': 'date', 'Home Team': 'home_team', 'Away Team': 'away_team', 'Venue': 'venue', 'Home Score': 'home_score', 'Away Score': 'away_score',
       'Home Odds': 'home_odds', 'Away Odds': 'away_odds'
})

In [8]:
odds = odds.drop(index = odds[odds.isnull().any(axis=1)].index)

In [9]:
# create a column where if home_odds > away_odds, return 1 else return 0 (1 = correct, 0 = incorrect)
odds['home_odds'] = odds['home_odds'].apply(lambda x: float(x.replace('\t', ' ').strip()))
odds['away_odds'] = odds['away_odds'].apply(lambda x: float(x.replace('\t', ' ').strip()))

In [10]:
odds[['home_score', 'away_score']] = odds[['home_score', 'away_score']].astype(int)

odds['odds_on_home_team'] = odds['home_odds'] < odds['away_odds']

odds['odds_on_home_team'] = odds['odds_on_home_team'].apply(lambda x: 1 if x == True else 0)

odds['home_team_win'] = odds['home_score'] > odds['away_score']

odds['home_team_win'] = odds['home_team_win'].apply(lambda x: 1 if x == True else 0)

## Model Building

### Process

Process: 
1. Clean data
2. Scale
3. Baseline model
4. Add in featureseasiest first
5. Retest model
6. Repeat 4, 5

Features
- odds✅
- cumulative total games per game✅
- start time
- cumulative age of all players
- win/loss streak
- distanced travelled to play the game (this would account for melb teams playing in melb and derbies)


- Predicting home_team_win (use predict proba)

FLOW
from sklearn import SomeModel

- mdl = Model()
- mdl.fit(X_train,y_train)
- mdl.score(X_test,y_test)
- mdl.predict(X_new)

### Baseline Model

In [11]:
df_odds = odds.copy()
df_odds['date'] = pd.to_datetime(df_odds['date'])
df_odds = df_odds[(df_odds['date'] > '31-Dec-11') & (df_odds['date'] < '31-Dec-21')]

In [12]:
from sklearn.model_selection import train_test_split
X = df_odds[['home_odds', 'away_odds']]
y = df_odds['home_team_win']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [52]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

scaler = StandardScaler() 
X_train_scaled = scaler.fit_transform(X_train.values)

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_reg = LogisticRegression()

base_scores = cross_val_score(log_reg, X_train_scaled, y_train, cv=10) # Cross validate

scores = {}
scores['base'] = base_scores.mean()

scores

{'base': 0.7217410848067126}

In [54]:
# Train the model on the Training data
log_reg.fit(X_train.values, y_train.values)

# Score the model on the Test data
test_score = log_reg.score(X_test.values, y_test.values)

X_new = scaler.transform([[10, 1]])
y_pred = log_reg.predict_proba(X_new)



### Ft_1: Cumulative games
The cumulative total games of the playing team

In [16]:
team_total_games_df = stats_df.groupby(['gameId', 'team'])[['gameNumber']].sum().reset_index().rename(columns = { 'gameNumber' : 'teamTotalGames' })

In [17]:
home_away_df = team_total_games_df.merge(games_df[['homeTeam', 'awayTeam', 'gameId']], on = 'gameId', how = 'left')
home_away_df[['homeTotalGames','awayTotalGames']] = np.nan

In [18]:
row = home_away_df[home_away_df['gameId'] == '2012EF01']

def home_games(row):
    if row['team'] == row['homeTeam']:
        return row['teamTotalGames']
    else:
        return np.nan
        
def away_games(row):
    if row['team'] == row['awayTeam']:
        return row['teamTotalGames']
    else:
        return np.nan  
    
home_away_df['homeTotalGames'] = home_away_df.apply(lambda x: home_games(x), axis = 1)
home_away_df['awayTotalGames'] = home_away_df.apply(lambda x: away_games(x), axis = 1)

In [19]:
home_away_df = home_away_df.groupby('gameId')[['homeTotalGames', 'awayTotalGames']].sum().reset_index()

home_away_df.isna().sum()

gameId            0
homeTotalGames    0
awayTotalGames    0
dtype: int64

In [34]:
games_home_away_df = games_df[['gameId', 'date', 'startTime', 'homeTeam']].merge(home_away_df, on = 'gameId', how = 'left')

In [21]:
df_odds_1 = df_odds.merge(games_home_away_df[['date', 'homeTotalGames', 'awayTotalGames', 'homeTeam']], left_on = ['home_team', 'date' ], right_on = ['homeTeam', 'date'], how = 'left')

In [22]:
df_odds_1.drop(columns = 'homeTeam', inplace = True)

In [23]:
# Replace Nan's with mean
home_total_mean = df_odds_1['homeTotalGames'].mean()
away_total_mean = df_odds_1['awayTotalGames'].mean()


df_odds_1['homeTotalGames'] = df_odds_1['homeTotalGames'].replace(np.nan, home_total_mean)
df_odds_1['awayTotalGames'] = df_odds_1['awayTotalGames'].replace(np.nan, away_total_mean)

In [35]:
df_odds_1.isna().sum()

date                 0
home_team            0
away_team            0
venue                0
home_score           0
away_score           0
home_odds            0
away_odds            0
odds_on_home_team    0
home_team_win        0
homeTotalGames       0
awayTotalGames       0
dtype: int64

In [25]:
X_1 = df_odds_1[['home_odds', 'away_odds', 'homeTotalGames', 'awayTotalGames']]
y_1 = df_odds_1['home_team_win']

X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.3)

In [63]:
scaler_1 = StandardScaler() 
X_train_1_scaled = scaler_1.fit_transform(X_train_1.values)

In [64]:
log_reg_1 = LogisticRegression()

ft_1_scores = cross_val_score(log_reg_1, X_train_1_scaled, y_train_1, cv=10) # Cross validate

scores['ft_1'] = ft_1_scores.mean()

scores

{'base': 0.7217410848067126, 'ft_1': 0.6963340325641795}

In [66]:
# Train the model on the Training data
log_reg_1.fit(X_train_1.values, y_train_1.values)

# Score the model on the Test data
test_score = log_reg_1.score(X_test_1.values, y_test_1.values)


X_new_1 = scaler_1.transform([[2, 8, 1000, 1000]]) # (odds home, odds away, home cumulative games, away cumulative games)
y_pred_1 = log_reg_1.predict_proba(X_new_1)
y_pred_1

array([[0.34180459, 0.65819541]])

# Conclusion

1. Including cumulative games does not meaningfully impact the models score
2. The model has a 70 - 72% accuracy in predicting which team will win based on bookmakers odds.
3. Said another way, if you bet the same way as the bookmakers you will be right 70-72% of the time
4. In the current tipping environment, that gives you a 6.3 average per round, which puts you in the top 5k tippers ([Tipping rankings](https://tipping.afl.com.au/tipping/index.html#/tipping-rankings))

In [29]:
df_odds_2 = df_odds_1.copy()

In [30]:
df_odds_2[['homeTotalGames', 'awayTotalGames']] = df_odds_2[['homeTotalGames', 'awayTotalGames']].apply(lambda x: round(x, 0))

In [31]:
df_odds_2

Unnamed: 0,date,home_team,away_team,venue,home_score,away_score,home_odds,away_odds,odds_on_home_team,home_team_win,homeTotalGames,awayTotalGames
0,2021-09-25,Melbourne,Western Bulldogs,Optus Stadium,140,66,1.66,2.20,1,1,2279.0,2591.0
1,2021-09-11,Port Adelaide,Western Bulldogs,Adelaide Oval,45,116,1.38,3.01,1,0,2635.0,2522.0
2,2021-09-10,Melbourne,Geelong,Optus Stadium,125,42,1.49,2.61,1,1,2256.0,3707.0
3,2021-09-04,Brisbane,Western Bulldogs,Gabba,78,79,1.58,2.36,1,0,2121.0,2077.0
4,2021-09-03,Geelong,GWS Giants,Optus Stadium,103,68,1.33,3.25,1,1,3525.0,2175.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2019,2012-03-31,Gold Coast,Adelaide,Metricon Stadium,68,137,4.39,1.22,0,0,1009.0,1830.0
2020,2012-03-31,Melbourne,Brisbane,MCG,78,119,1.42,2.81,1,0,1641.0,1651.0
2021,2012-03-30,Hawthorn,Collingwood,MCG,137,115,1.65,2.19,1,1,2071.0,1697.0
2022,2012-03-29,Richmond,Carlton,MCG,81,125,2.48,1.52,0,0,1552.0,2062.0


In [32]:
bl_votes = stats_df.groupby(['gameId', 'team'])[['Brownlow Votes']].sum().reset_index().sort_values(ascending = False, by = 'Brownlow Votes')

In [33]:
bl_votes.groupby('team')['Brownlow Votes'].sum().sort_values()

team
Gold Coast                426
Carlton                   536
Brisbane Lions            544
St Kilda                  544
Melbourne                 563
Essendon                  586
North Melbourne           593
Greater Western Sydney    603
Western Bulldogs          637
Adelaide                  665
Fremantle                 684
Port Adelaide             703
Richmond                  711
Collingwood               719
Hawthorn                  730
West Coast                734
Geelong                   805
Sydney                    821
Name: Brownlow Votes, dtype: int64