# AFL Data Cleaning for Fantasy and Tipping Machine Learning Models

### Data found on Kaggle [here](https://www.kaggle.com/datasets/stoney71/aflstats)

In [1]:
import pandas as pd
import numpy as np

#Supresses scientific notation
pd.set_option('display.float_format', lambda x: '%.2f' % x)

import warnings
warnings.filterwarnings('ignore')

In [2]:
games_df = pd.read_csv("games.csv")
games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2024 entries, 0 to 2023
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   gameId         2024 non-null   object 
 1   year           2024 non-null   int64  
 2   round          2024 non-null   object 
 3   date           2024 non-null   object 
 4   venue          2024 non-null   object 
 5   startTime      2024 non-null   object 
 6   attendance     2024 non-null   int64  
 7   homeTeam       2024 non-null   object 
 8   homeTeamScore  2024 non-null   int64  
 9   awayTeam       2024 non-null   object 
 10  awayTeamScore  2024 non-null   int64  
 11  rainfall       1993 non-null   float64
dtypes: float64(1), int64(4), object(7)
memory usage: 189.9+ KB


In [3]:
players_df = pd.read_csv("players.csv")
players_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1495 entries, 0 to 1494
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   playerId     1495 non-null   int64 
 1   displayName  1495 non-null   object
 2   height       1495 non-null   int64 
 3   weight       1495 non-null   int64 
 4   dob          1495 non-null   object
 5   position     1495 non-null   object
 6   origin       1491 non-null   object
dtypes: int64(3), object(4)
memory usage: 81.9+ KB


In [4]:
stats_df = pd.read_csv("stats.csv")
stats_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89470 entries, 0 to 89469
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   gameId                   89470 non-null  object
 1   team                     89470 non-null  object
 2   year                     89470 non-null  int64 
 3   round                    89470 non-null  object
 4   playerId                 89470 non-null  int64 
 5   displayName              89470 non-null  object
 6   gameNumber               89470 non-null  int64 
 7   Disposals                89470 non-null  int64 
 8   Kicks                    89470 non-null  int64 
 9   Marks                    89470 non-null  int64 
 10  Handballs                89470 non-null  int64 
 11  Goals                    89470 non-null  int64 
 12  Behinds                  89470 non-null  int64 
 13  Hit Outs                 89470 non-null  int64 
 14  Tackles                  89470 non-nul

## Merging dataframes to create a dataframe broken down by player and game

In [5]:
df = stats_df.merge(games_df, how='left', on='gameId')\
             .merge(players_df, how='left', on='playerId')
df

Unnamed: 0,gameId,team,year_x,round_x,playerId,displayName_x,gameNumber,Disposals,Kicks,Marks,...,homeTeamScore,awayTeam,awayTeamScore,rainfall,displayName_y,height,weight,dob,position,origin
0,2021R104,Adelaide,2021,R1,2021661124,"Berry, Sam",1,8,6,1,...,103,Geelong,91,0.00,"Berry, Sam",181,81,12-Feb-2002,"Midfield, Forward",Gippsland Power
1,2021R104,Adelaide,2021,R1,2012662083,"Brown, Luke",168,5,2,0,...,103,Geelong,91,0.00,"Brown, Luke",181,81,22-Sep-1992,Defender,Norwood
2,2021R104,Adelaide,2021,R1,2020665315,"Butts, Jordon",3,10,5,3,...,103,Geelong,91,0.00,"Butts, Jordon",198,83,31-Dec-1999,Defender,Murray Bushrangers
3,2021R104,Adelaide,2021,R1,2018689604,"Doedee, Tom",31,13,9,4,...,103,Geelong,91,0.00,"Doedee, Tom",188,88,1-Mar-1997,Defender,Geelong Falcons
4,2021R104,Adelaide,2021,R1,2018703883,"Frampton, Billy",9,14,10,8,...,103,Geelong,91,0.00,"Frampton, Billy",200,90,20-Nov-1996,Forward,South Fremantle
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89465,2012R2309,Western Bulldogs,2012,R23,2011838967,"Skinner, Zephaniah",8,2,1,1,...,128,Western Bulldogs,61,0.00,"Skinner, Zephaniah",189,79,27-Jun-1989,Forward,-
89466,2012R2309,Western Bulldogs,2012,R23,2012833342,"Smith, Clay",16,22,11,5,...,128,Western Bulldogs,61,0.00,"Smith, Clay",181,85,11-May-1993,Forward,Gippsland Power
89467,2012R2309,Western Bulldogs,2012,R23,2012846405,"Talia, Michael",4,23,14,9,...,128,Western Bulldogs,61,0.00,"Talia, Michael",194,94,11-Feb-1993,Defender,Calder Cannons
89468,2012R2309,Western Bulldogs,2012,R23,2011872415,"Wallis, Mitch",25,19,8,2,...,128,Western Bulldogs,61,0.00,"Wallis, Mitch",186,85,24-Oct-1992,Forward,Calder Cannons


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89470 entries, 0 to 89469
Data columns (total 48 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   gameId                   89470 non-null  object 
 1   team                     89470 non-null  object 
 2   year_x                   89470 non-null  int64  
 3   round_x                  89470 non-null  object 
 4   playerId                 89470 non-null  int64  
 5   displayName_x            89470 non-null  object 
 6   gameNumber               89470 non-null  int64  
 7   Disposals                89470 non-null  int64  
 8   Kicks                    89470 non-null  int64  
 9   Marks                    89470 non-null  int64  
 10  Handballs                89470 non-null  int64  
 11  Goals                    89470 non-null  int64  
 12  Behinds                  89470 non-null  int64  
 13  Hit Outs                 89470 non-null  int64  
 14  Tackles               

### Storing data for future use

In [7]:
year_list = sorted(list(df['year_x'].unique()))
year_list

[2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021]

## Cleaning Data

In [8]:
df['rainfall'] = df['rainfall'].astype(float)
df['rainfall'].loc[df['rainfall'] > 0]

69      4.20
70      4.20
71      4.20
72      4.20
73      4.20
        ... 
89443   1.60
89444   1.60
89445   1.60
89446   1.60
89447   1.60
Name: rainfall, Length: 39430, dtype: float64

In [9]:
df['% Played'] = df['% Played'] / 100
df['% Played']

0       0.80
1       0.23
2       0.93
3       0.84
4       0.90
        ... 
89465   0.28
89466   0.79
89467   0.88
89468   0.83
89469   0.89
Name: % Played, Length: 89470, dtype: float64

### Converting date and time objects to datetime

In [10]:
df = df[['playerId', 'height', 'weight', 'dob', 'position', 'gameId', 'team',
         'year_x', 'round_x', 'Disposals', 'Kicks', 'Marks', 'Handballs', 
         'Goals', 'Behinds', 'Hit Outs', 'Tackles', 'Rebounds', 'Inside 50s', 
         'Clearances', 'Clangers', 'Frees', 'Frees Against', 'Brownlow Votes', 
         'Contested Possessions', 'Uncontested Possessions', 'Contested Marks', 
         'Marks Inside 50', 'One Percenters', 'Bounces', 'Goal Assists', '% Played',
         'Subs', 'gameNumber',  'date', 'venue', 'startTime', 'attendance',
         'homeTeam', 'homeTeamScore', 'awayTeam', 'awayTeamScore', 'rainfall']]

df.rename({'year_x': 'year', 'round_x': 'round'}, axis=1, inplace=True)

#### Changing month abbreviation to numerical value

In [11]:
date_dict = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
             'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}

In [12]:
dob = df['dob'].str.split('-', expand=True)
dob[1] = dob[1].replace(date_dict)
df['dob'] = dob.apply(lambda row: '-'.join(row.values.astype(str)), axis=1)
df['dob']

0        12-02-2002
1        22-09-1992
2        31-12-1999
3         1-03-1997
4        20-11-1996
            ...    
89465    27-06-1989
89466    11-05-1993
89467    11-02-1993
89468    24-10-1992
89469     4-09-1989
Name: dob, Length: 89470, dtype: object

In [13]:
date = df['date'].str.split('-', expand=True)
date[1] = date[1].replace(date_dict)
df['date'] = date.apply(lambda row: '-'.join(row.values.astype(str)), axis=1)
df['date']

0        20-03-2021
1        20-03-2021
2        20-03-2021
3        20-03-2021
4        20-03-2021
            ...    
89465    02-09-2012
89466    02-09-2012
89467    02-09-2012
89468    02-09-2012
89469    02-09-2012
Name: date, Length: 89470, dtype: object

In [14]:
df['dob'] = pd.to_datetime(df['dob'], format='%d'+'-'+'%m'+'-'+'%Y')
df['date'] = pd.to_datetime(df['date'], format='%d'+'-'+'%m'+'-'+'%Y')

### Converting startTime to Start Time Hour and Start Time Minute

In [15]:
df['Start Time Hour'] = pd.to_datetime(df['startTime'], format= '%I:%M %p').dt.hour
df['Start Time Minute'] = pd.to_datetime(df['startTime'], format= '%I:%M %p').dt.minute

df.drop('startTime', axis=1, inplace=True)
df[['Start Time Hour', 'Start Time Minute']]

Unnamed: 0,Start Time Hour,Start Time Minute
0,16,5
1,16,5
2,16,5
3,16,5
4,16,5
...,...,...
89465,16,40
89466,16,40
89467,16,40
89468,16,40


### Fixing 'round' column by making Finals rounds numerical

In [16]:
round_list = {'QF': '24', 'SF': '25', 'PF': '26', 'GF': '27', 'EF': '24'}
df['round'] = df['round'].replace(round_list)
df['round'] = df['round'].str.replace('R', '').astype(int)

### Adding 'year+1' column for future indexing

In [17]:
df['year+1'] = df['year'] + 1
df['year+1']

0        2022
1        2022
2        2022
3        2022
4        2022
         ... 
89465    2013
89466    2013
89467    2013
89468    2013
89469    2013
Name: year+1, Length: 89470, dtype: int64

### Constructing index

In [18]:
df = df.set_index(['year','round'], append=True)

## Feature Creation
### Age

In [19]:
df.insert(2, 'Age', (df['date'] - df['dob']) / np.timedelta64(1,'Y'))

df.drop('dob', axis=1, inplace=True)

### Total Points Scored 

In [20]:
df['Total Points Scored'] = df['Goals'] * 6 + df['Behinds']
df['Total Points Scored']

       year  round
0      2021  1         1
1      2021  1         0
2      2021  1         0
3      2021  1         0
4      2021  1        14
                      ..
89465  2012  23        0
89466  2012  23        0
89467  2012  23        0
89468  2012  23        6
89469  2012  23        0
Name: Total Points Scored, Length: 89470, dtype: int64

### Is Home Team

In [21]:
df['Is Home Team'] = np.where(df['team'] == df['homeTeam'], 1, 0)
df['Is Home Team']

       year  round
0      2021  1        1
1      2021  1        1
2      2021  1        1
3      2021  1        1
4      2021  1        1
                     ..
89465  2012  23       0
89466  2012  23       0
89467  2012  23       0
89468  2012  23       0
89469  2012  23       0
Name: Is Home Team, Length: 89470, dtype: int32

### Opponent Team

In [22]:
df['Opponent Team'] = np.where(df['team'] == df['homeTeam'], df['awayTeam'], df['homeTeam'])

df.drop('awayTeam', axis=1, inplace=True)

### Player Team Score and Opponent Team Score

In [23]:
df['Player Team Score'] = np.where(df['team'] == df['homeTeam'], df['homeTeamScore'], df['awayTeamScore'])
df['Opponent Team Score'] = np.where(df['team'] == df['homeTeam'], df['awayTeamScore'], df['homeTeamScore'])

### Score Margin (from the perspective of each player)

In [24]:
df['Margin'] = np.where(df['Is Home Team'] == 1,
                        df['homeTeamScore'] - df['awayTeamScore'],
                        df['awayTeamScore'] - df['homeTeamScore'])
df['Margin']

       year  round
0      2021  1        12
1      2021  1        12
2      2021  1        12
3      2021  1        12
4      2021  1        12
                      ..
89465  2012  23      -67
89466  2012  23      -67
89467  2012  23      -67
89468  2012  23      -67
89469  2012  23      -67
Name: Margin, Length: 89470, dtype: int64

### Game Result

In [25]:
conditions = [df['Margin'] > 0, df['Margin'] == 0, df['Margin'] < 0]
choices = [1, .5, 0]
df['Game Result'] = np.select(conditions, choices)
df['Game Result']

       year  round
0      2021  1       1.00
1      2021  1       1.00
2      2021  1       1.00
3      2021  1       1.00
4      2021  1       1.00
                     ... 
89465  2012  23      0.00
89466  2012  23      0.00
89467  2012  23      0.00
89468  2012  23      0.00
89469  2012  23      0.00
Name: Game Result, Length: 89470, dtype: float64

### Month

In [26]:
df['Month'] = df['date'].dt.month

### Day of year

In [27]:
df['Day of Year'] = df['date'].dt.dayofyear

### Weekday

In [28]:
df['Weekday'] = df['date'].dt.weekday

In [29]:
df.drop('date', axis=1, inplace=True)

### Spliting 'position' into multiple columns

In [30]:
df['position'].unique()

array(['Midfield, Forward', 'Defender', 'Forward', 'Midfield', 'Ruck',
       'Defender, Midfield', 'Forward, Ruck', 'Defender, Forward',
       'Defender, Ruck', 'Midfield, Ruck'], dtype=object)

In [31]:
position_list = ['Midfield', 'Forward', 'Defender', 'Ruck']

for pos in position_list:
    df[f'{pos}'] = np.where(df['position'].str.contains(pos), 1, 0)
    
df.drop('position', axis=1, inplace=True)

df[position_list].sum()

Midfield    31786
Forward     33602
Defender    34426
Ruck         5850
dtype: int64

### Shifting (offsetting) and aggregating features

In [32]:
df_final = df.copy()

In [33]:
shift_columns = ['Disposals', 'Kicks', 'Marks', 'Handballs', 'Goals', 'Behinds', 'Hit Outs',
                 'Tackles', 'Rebounds', 'Inside 50s', 'Clearances', 'Clangers', 'Frees',
                 'Frees Against', 'Brownlow Votes', 'Contested Possessions', 'Uncontested Possessions',
                 'Contested Marks', 'Marks Inside 50', 'One Percenters', 'Bounces', 'Goal Assists',
                 '% Played', 'Subs', 'venue', 'Player Team Score', 'Opponent Team Score', 'rainfall', 
                 'Margin', 'Total Points Scored', 'Game Result', 'gameNumber', 'Start Time Hour', 
                 'Start Time Minute', 'Month', 'Weekday', 'Day of Year', 'team', 'Opponent Team']

average_and_sum_columns = [col for col in shift_columns if col not in ['venue', 'rainfall', 'Subs', 'Opponent Team',
                                                                       'team', 'Start Time Hour', 'Start Time Minute',
                                                                       'Month', 'Weekday', 'Day of Year']]

In [34]:
def offset_function(dataframe, column_list, year_list):
    frame = None
    for col in column_list:
        for i in range(1,6): # Shifting back 1 to 5 games
            for n, year in enumerate(year_list):
                year_df = dataframe.xs(year, level=1, drop_level=False)
                year_df = year_df.sort_index(level=['year', 'round'])
                df_group = year_df.groupby('playerId')[col]
                df_shift = df_group.shift(i)
                if n != 0:
                    df = pd.concat([df, df_shift])
                else:
                    df = df_shift
                    
            df = df.rename(f'{col}_shift_{i}')
            if frame is None:
                frame = df
            else:
                frame = pd.concat((frame, df), axis=1)
    return frame

In [35]:
offset_df = offset_function(df, shift_columns, year_list)
df_final = pd.concat((df_final, offset_df), axis=1)
offset_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Disposals_shift_1,Disposals_shift_2,Disposals_shift_3,Disposals_shift_4,Disposals_shift_5,Kicks_shift_1,Kicks_shift_2,Kicks_shift_3,Kicks_shift_4,Kicks_shift_5,...,team_shift_1,team_shift_2,team_shift_3,team_shift_4,team_shift_5,Opponent Team_shift_1,Opponent Team_shift_2,Opponent Team_shift_3,Opponent Team_shift_4,Opponent Team_shift_5
Unnamed: 0_level_1,year,round,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
80362,2012,1,,,,,,,,,,,...,,,,,,,,,,
80363,2012,1,,,,,,,,,,,...,,,,,,,,,,
80364,2012,1,,,,,,,,,,,...,,,,,,,,,,
80365,2012,1,,,,,,,,,,,...,,,,,,,,,,
80366,2012,1,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9517,2021,27,23.00,10.00,28.00,20.00,15.00,13.00,2.00,13.00,10.00,2.00,...,Western Bulldogs,Western Bulldogs,Western Bulldogs,Western Bulldogs,Western Bulldogs,Port Adelaide,Brisbane Lions,Essendon,Port Adelaide,Hawthorn
9518,2021,27,6.00,10.00,10.00,14.00,9.00,2.00,5.00,6.00,8.00,4.00,...,Western Bulldogs,Western Bulldogs,Western Bulldogs,Western Bulldogs,Western Bulldogs,Port Adelaide,Brisbane Lions,Essendon,Port Adelaide,Hawthorn
9519,2021,27,4.00,12.00,10.00,7.00,14.00,3.00,9.00,6.00,5.00,11.00,...,Western Bulldogs,Western Bulldogs,Western Bulldogs,Western Bulldogs,Western Bulldogs,Brisbane Lions,Essendon,Port Adelaide,Hawthorn,Adelaide
9520,2021,27,19.00,13.00,13.00,13.00,16.00,16.00,10.00,9.00,8.00,9.00,...,Western Bulldogs,Western Bulldogs,Western Bulldogs,Western Bulldogs,Western Bulldogs,Port Adelaide,Brisbane Lions,Essendon,Port Adelaide,Hawthorn


In [36]:
def window_function(dataframe, column_list, year_list):
    frame = None
    for col in column_list:
        for window_func in ('boxcar', 'exponential', 'triang'):
            for func in ('sum', 'mean'):
                for i in (3, 5): # Rolling window size
                    for n, year in enumerate(year_list):
                        year_df = dataframe.xs(year, level=1, drop_level=False)
                        year_df = year_df.sort_index(level=['year', 'round'])
                        df_group = year_df.groupby('playerId')[col]
                        df_shift = df_group.shift()
                        df_rolling = df_shift.rolling(i, win_type=window_func, min_periods=1)
                        if func == 'sum':
                            df_rolling = df_rolling.sum()
                        elif func == 'mean':
                            df_rolling = df_rolling.mean()
                        
                        if n != 0:
                            df = pd.concat([df, df_rolling])
                        else:
                            df = df_rolling

                    df = df.rename(f'{col}_{window_func}_{func}_{i}')
                    if frame is None:
                        frame = df
                    else:
                        frame = pd.concat((frame, df), axis=1)
    #frame = frame.reset_index('playerId', drop=True)
    return frame

In [37]:
window_df = window_function(df, average_and_sum_columns, year_list)
df_final = pd.concat((df_final, window_df), axis=1)
window_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Disposals_boxcar_sum_3,Disposals_boxcar_sum_5,Disposals_boxcar_mean_3,Disposals_boxcar_mean_5,Disposals_exponential_sum_3,Disposals_exponential_sum_5,Disposals_exponential_mean_3,Disposals_exponential_mean_5,Disposals_triang_sum_3,Disposals_triang_sum_5,...,gameNumber_boxcar_mean_3,gameNumber_boxcar_mean_5,gameNumber_exponential_sum_3,gameNumber_exponential_sum_5,gameNumber_exponential_mean_3,gameNumber_exponential_mean_5,gameNumber_triang_sum_3,gameNumber_triang_sum_5,gameNumber_triang_mean_3,gameNumber_triang_mean_5
Unnamed: 0_level_1,year,round,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
80362,2012,1,,,,,,,,,,,...,,,,,,,,,,
80363,2012,1,,,,,,,,,,,...,,,,,,,,,,
80364,2012,1,,,,,,,,,,,...,,,,,,,,,,
80365,2012,1,,,,,,,,,,,...,,,,,,,,,,
80366,2012,1,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9517,2021,27,61.00,82.00,20.33,16.40,31.92,36.80,18.39,18.34,38.00,51.00,...,97.00,86.40,129.81,138.79,74.79,69.17,163.50,221.33,81.75,73.78
9518,2021,27,44.00,77.00,14.67,15.40,30.73,34.09,17.70,16.99,33.50,51.00,...,82.33,75.40,210.34,141.45,121.18,70.50,218.00,234.67,109.00,78.22
9519,2021,27,33.00,71.00,11.00,14.20,15.93,34.38,9.18,17.13,19.50,46.00,...,76.33,66.20,98.15,221.71,56.55,110.50,125.50,255.67,62.75,85.22
9520,2021,27,29.00,67.00,9.67,13.40,13.20,20.53,7.60,10.23,16.50,35.33,...,40.33,69.20,55.89,113.99,32.20,56.81,69.50,199.00,34.75,66.33


In [38]:
def current_year_cumulative_function(dataframe, column_list, year_list):
    frame = None
    for col in column_list:
        for func in ['sum', 'max', 'min']:
            for n, year in enumerate(year_list):
                year_df = dataframe.xs(year, level=1, drop_level=False)
                year_df = year_df.set_index('playerId', append=True)
                year_df = year_df.sort_index(level=['year', 'round'])
                df_group = year_df.groupby('playerId')[col]
                if func == 'sum':
                    df_cum = df_group.cumsum()
                elif func == 'max':
                    df_cum = df_group.cummax()
                elif func == 'min':
                    df_cum = df_group.cummin()
                
                df_group_cum = df_cum.groupby('playerId')
                df_shift = df_group_cum.shift()
                    
                if n != 0:
                    df = pd.concat([df, df_shift])
                else:
                    df = df_shift
            df = df.rename(f'{col}_cumulative_{func}')
            if frame is None:
                frame = df
            else:
                frame = pd.concat((frame, df), axis=1)
    frame = frame.reset_index('playerId', drop=True)
    return frame

In [39]:
cumulative_df = current_year_cumulative_function(df, average_and_sum_columns, year_list)
df_final = pd.concat((df_final, cumulative_df), axis=1)
cumulative_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Disposals_cumulative_sum,Disposals_cumulative_max,Disposals_cumulative_min,Kicks_cumulative_sum,Kicks_cumulative_max,Kicks_cumulative_min,Marks_cumulative_sum,Marks_cumulative_max,Marks_cumulative_min,Handballs_cumulative_sum,...,Margin_cumulative_min,Total Points Scored_cumulative_sum,Total Points Scored_cumulative_max,Total Points Scored_cumulative_min,Game Result_cumulative_sum,Game Result_cumulative_max,Game Result_cumulative_min,gameNumber_cumulative_sum,gameNumber_cumulative_max,gameNumber_cumulative_min
Unnamed: 0_level_1,year,round,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
80362,2012,1,,,,,,,,,,,...,,,,,,,,,,
80363,2012,1,,,,,,,,,,,...,,,,,,,,,,
80364,2012,1,,,,,,,,,,,...,,,,,,,,,,
80365,2012,1,,,,,,,,,,,...,,,,,,,,,,
80366,2012,1,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9517,2021,27,368.00,35.00,10.00,175.00,17.00,2.00,47.00,7.00,0.00,193.00,...,-27.00,69.00,19.00,0.00,12.00,1.00,0.00,2904.00,189.00,174.00
9518,2021,27,96.00,14.00,6.00,46.00,8.00,2.00,24.00,6.00,0.00,50.00,...,-27.00,44.00,12.00,0.00,8.00,1.00,0.00,175.00,22.00,13.00
9519,2021,27,153.00,14.00,4.00,117.00,12.00,3.00,47.00,7.00,0.00,36.00,...,-28.00,169.00,25.00,1.00,10.00,1.00,0.00,165.00,18.00,4.00
9520,2021,27,370.00,28.00,11.00,239.00,21.00,6.00,101.00,8.00,1.00,131.00,...,-28.00,8.00,6.00,0.00,16.00,1.00,0.00,1551.00,81.00,60.00


In [40]:
def previous_year_cumulative_function(dataframe, column_list, year_list):
    df = dataframe.reset_index(['year', 'round'])
    df = df.set_index(['playerId', 'year+1'])
    df_sum = df.groupby(['playerId', 'year+1'])[average_and_sum_columns].sum()
    df_mean = df.groupby(['playerId', 'year+1'])[average_and_sum_columns].mean()
    return df_sum, df_mean

In [41]:
df_sum, df_mean = previous_year_cumulative_function(df, average_and_sum_columns, year_list)

In [42]:
df_final = df_final.reset_index(['year', 'round'])

In [43]:
df_final = df_final.merge(df_sum, how='left', left_on=['playerId', 'year'],
                          right_on=['playerId', 'year+1'], suffixes=('', '_prevous_year_sum'))
df_sum

Unnamed: 0_level_0,Unnamed: 1_level_0,Disposals,Kicks,Marks,Handballs,Goals,Behinds,Hit Outs,Tackles,Rebounds,Inside 50s,...,One Percenters,Bounces,Goal Assists,% Played,Player Team Score,Opponent Team Score,Margin,Total Points Scored,Game Result,gameNumber
playerId,year+1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1993707514,2013,289,244,88,45,1,2,0,20,89,20,...,110,0,0,16.00,1743,1727,16,8,10.00,6417
1993707514,2014,198,151,63,47,0,0,0,14,67,8,...,65,2,1,11.55,1259,1152,107,0,8.00,4836
1993707514,2015,279,198,103,81,1,0,0,11,60,21,...,96,0,1,12.79,1277,1120,157,6,9.00,5790
1993707514,2016,116,80,35,36,1,1,0,6,36,5,...,50,0,1,5.80,573,548,25,7,3.00,2779
1996722607,2013,520,285,99,235,35,9,0,47,30,83,...,28,49,18,20.11,2425,2259,166,219,14.00,7705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20151001643,2018,265,139,60,126,14,8,1,47,10,46,...,14,4,9,11.41,1296,1285,11,92,6.50,609
20151001643,2019,338,232,92,106,48,22,0,48,9,72,...,21,9,17,17.07,1962,1531,431,310,16.00,1281
20151001643,2020,285,204,81,81,34,22,0,35,3,61,...,13,7,13,13.21,1374,1291,83,226,10.00,1360
20151001643,2021,135,86,26,49,14,12,0,12,5,28,...,11,2,4,8.70,593,553,40,96,5.50,935


In [44]:
df_final = df_final.merge(df_mean, how='left', left_on=['playerId', 'year'],
                          right_on=['playerId', 'year+1'], suffixes=('', '_prevous_year_mean'))
df_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,Disposals,Kicks,Marks,Handballs,Goals,Behinds,Hit Outs,Tackles,Rebounds,Inside 50s,...,One Percenters,Bounces,Goal Assists,% Played,Player Team Score,Opponent Team Score,Margin,Total Points Scored,Game Result,gameNumber
playerId,year+1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1993707514,2013,16.06,13.56,4.89,2.50,0.06,0.11,0.00,1.11,4.94,1.11,...,6.11,0.00,0.00,0.89,96.83,95.94,0.89,0.44,0.56,356.50
1993707514,2014,15.23,11.62,4.85,3.62,0.00,0.00,0.00,1.08,5.15,0.62,...,5.00,0.15,0.08,0.89,96.85,88.62,8.23,0.00,0.62,372.00
1993707514,2015,18.60,13.20,6.87,5.40,0.07,0.00,0.00,0.73,4.00,1.40,...,6.40,0.00,0.07,0.85,85.13,74.67,10.47,0.40,0.60,386.00
1993707514,2016,16.57,11.43,5.00,5.14,0.14,0.14,0.00,0.86,5.14,0.71,...,7.14,0.00,0.14,0.83,81.86,78.29,3.57,1.00,0.43,397.00
1996722607,2013,22.61,12.39,4.30,10.22,1.52,0.39,0.00,2.04,1.30,3.61,...,1.22,2.13,0.78,0.87,105.43,98.22,7.22,9.52,0.61,335.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20151001643,2018,18.93,9.93,4.29,9.00,1.00,0.57,0.07,3.36,0.71,3.29,...,1.00,0.29,0.64,0.82,92.57,91.79,0.79,6.57,0.46,43.50
20151001643,2019,16.10,11.05,4.38,5.05,2.29,1.05,0.00,2.29,0.43,3.43,...,1.00,0.43,0.81,0.81,93.43,72.90,20.52,14.76,0.76,61.00
20151001643,2020,16.76,12.00,4.76,4.76,2.00,1.29,0.00,2.06,0.18,3.59,...,0.76,0.41,0.76,0.78,80.82,75.94,4.88,13.29,0.59,80.00
20151001643,2021,13.50,8.60,2.60,4.90,1.40,1.20,0.00,1.20,0.50,2.80,...,1.10,0.20,0.40,0.87,59.30,55.30,4.00,9.60,0.55,93.50


## Creating a target feature from AFL Fantasy and Supercoach point values

In [45]:
afl_fantasy_points = {'Kicks': 3,
                      'Marks': 3,
                      'Handballs': 2,
                      'Goals': 6,
                      'Behinds': 1,
                      'Hit Outs': 1,
                      'Tackles': 4,
                      'Frees': 1,
                      'Frees Against': -3}

In [46]:
supercoach_points = {'Kicks': 4,
                     'Handballs': 1.5,
                     'Marks': 6,
                     'Tackles': 4, 
                     'Frees': 4, 
                     'Frees Against': -4, 
                     'Hit Outs': 5, 
                     'Goals': 8, 
                     'Behinds': 1}

In [47]:
from functools import reduce

def get_fantasy_points(dataset, points_dict, column_name):
    fantasy_list = []
    for col_name, pts in points_dict.items():
        col_vals = dataset[col_name] * pts
        fantasy_list.append(col_vals)
    dataset.insert(0, column_name, round(reduce((lambda x, y: x + y), fantasy_list)))
    return dataset

In [48]:
df_final = get_fantasy_points(df_final, afl_fantasy_points, 'AFL Fantasy Points')
df_final = get_fantasy_points(df_final, supercoach_points, 'Supercoach Points')

### Target Margin (for creating the tipping model)

In [49]:
df_final['Target Margin'] = df_final['homeTeamScore'] - df_final['awayTeamScore']

### Dropping redundant or irrelevant columns

In [50]:
df_final.drop(['homeTeamScore', 'awayTeamScore', 'playerId', 'gameId', 'year+1'], axis=1, inplace=True)

### Dropping columns of data that would be unavailable the day of the game

In [51]:
drop_column_list = ['Disposals', 'Kicks', 'Marks', 'Handballs', 'Goals', 'Behinds', 'Hit Outs', 
                   'Tackles', 'Rebounds', 'Inside 50s', 'Clearances', 'Clangers', 'Frees', 
                   'Frees Against', 'Brownlow Votes', 'Contested Possessions', 'Bounces',
                   'Uncontested Possessions', 'Contested Marks', 'Marks Inside 50', 'One Percenters',
                   '% Played', 'Subs', 'Total Points Scored',  'Goal Assists', 'Player Team Score',
                   'Opponent Team Score', 'Margin', 'Game Result', 'attendance', 'rainfall']

df_final.drop(drop_column_list, axis=1, inplace=True)

### Normalizing columns by round and by game

In [52]:
exclude_from_normalization = ['year', 'round', 'Is Home Team', 'Month', 'Day of Year', 'Weekday',
                              'Midfield', 'Forward', 'Defender', 'Ruck', 'AFL Fantasy Points', 
                              'Supercoach Points', 'Target Margin']

normalization_list = [col for col in df_final.select_dtypes(exclude=object).columns 
                      if col not in exclude_from_normalization]

In [53]:
def normalization_func(dataframe, column_list):
    for col in column_list:
        df_round = dataframe.groupby(['year', 'round'])[col]
        dataframe[f'{col}_round_norm'] = df_round.transform(lambda x: (x - min(x))/(max(x)-min(x)))

        df_game = dataframe.groupby(['year', 'round', 'homeTeam'])[col]
        dataframe[f'{col}_game_norm'] = df_game.transform(lambda x: (x - min(x))/(max(x)-min(x)))
    return dataframe

In [54]:
df_final = normalization_func(df_final, normalization_list)
df_final

Unnamed: 0,Supercoach Points,AFL Fantasy Points,year,round,height,Age,weight,team,gameNumber,venue,...,Opponent Team Score_prevous_year_mean_round_norm,Opponent Team Score_prevous_year_mean_game_norm,Margin_prevous_year_mean_round_norm,Margin_prevous_year_mean_game_norm,Total Points Scored_prevous_year_mean_round_norm,Total Points Scored_prevous_year_mean_game_norm,Game Result_prevous_year_mean_round_norm,Game Result_prevous_year_mean_game_norm,gameNumber_prevous_year_mean_round_norm,gameNumber_prevous_year_mean_game_norm
0,62.00,50,2021,1,181,19.10,81,Adelaide,1,Adelaide Oval,...,,,,,,,,,,
1,24.00,21,2021,1,181,28.49,81,Adelaide,168,Adelaide Oval,...,,,,,,,,,,
2,50.00,35,2021,1,198,21.22,83,Adelaide,3,Adelaide Oval,...,,,,,,,,,,
3,90.00,65,2021,1,188,24.05,88,Adelaide,31,Adelaide Oval,...,,,,,,,,,,
4,147.00,83,2021,1,200,24.33,90,Adelaide,9,Adelaide Oval,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89465,16.00,12,2012,23,189,23.18,79,Western Bulldogs,8,Gabba,...,,,,,,,,,,
89466,110.00,85,2012,23,181,19.31,85,Western Bulldogs,16,Gabba,...,,,,,,,,,,
89467,132.00,88,2012,23,194,19.56,94,Western Bulldogs,4,Gabba,...,,,,,,,,,,
89468,96.00,77,2012,23,186,19.86,85,Western Bulldogs,25,Gabba,...,,,,,,,,,,


In [55]:
# homeTeam is redundant but was needed for the normalization groupby
df_final.drop('homeTeam', axis=1, inplace=True)

## Exporting dataframe

In [56]:
df_final.to_csv('AFL_dataset.csv')