# Initial Modelling with XGBoost

In [35]:
import pandas as pd
import glob
import sweetviz as sv
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, plot_roc_curve, roc_auc_score, RocCurveDisplay
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score,accuracy_score

In [10]:
pd.set_option('display.max_columns', None)

In [11]:
fixtures_list = glob.glob('./Data/Fixtures/*.csv')
frame = []
for file in fixtures_list:
    frame.append(pd.read_csv(file))

fixtures_total = pd.concat(frame)

fixtures_total_cleaned = fixtures_total.dropna(subset=['winner'])

fixtures_total_cleaned = fixtures_total_cleaned.astype({'winnerteamid': int})

fixtures_usable = fixtures_total_cleaned.drop(columns=['localtime','winner', 'unixtime','hteam','ateam','timestr','roundname','updated','complete','ateam'])

fixtures_usable['date'] = pd.to_datetime(fixtures_usable['date'])

fixtures_usable['day_of_week'] = fixtures_usable['date'].dt.day_of_week
fixtures_usable['day_of_year'] = fixtures_usable['date'].dt.day_of_year
fixtures_usable['is_month_end'] = fixtures_usable['date'].dt.is_month_end
fixtures_usable['is_month_start'] = fixtures_usable['date'].dt.is_month_start
fixtures_usable['is_quarter_end'] = fixtures_usable['date'].dt.is_quarter_end
fixtures_usable['is_quarter_start'] = fixtures_usable['date'].dt.is_quarter_start
fixtures_usable['month'] = fixtures_usable['date'].dt.month
fixtures_usable['day'] = fixtures_usable['date'].dt.day
fixtures_usable['hour'] = fixtures_usable['date'].dt.hour
fixtures_usable['minute'] = fixtures_usable['date'].dt.minute

fixtures_usable.replace({False: 0, True: 1}, inplace=True)
fixtures_usable = fixtures_usable.sort_values(by=['date'], ascending=True)
fixtures_usable = fixtures_usable.drop(columns=['date'])

In [12]:
ladder_list = glob.glob('./Data/Ladder/*.csv')

frame = []
for file in ladder_list:
    frame.append(pd.read_csv(file))

ladder_total = pd.concat(frame)

In [13]:
lineup_list = glob.glob('./Data/Lineups/*.csv')

frame = []
for file in lineup_list:
    frame.append(pd.read_csv(file))

lineup_total = pd.concat(frame)

In [14]:
player_stats_list = glob.glob('./Data/Player_Stats/*.csv')

frame = []
for file in player_stats_list:
    frame.append(pd.read_csv(file, low_memory=False))

player_stats_total = pd.concat(frame)

In [15]:
results_list = glob.glob('./Data/Results/*.csv')

frame = []
for file in results_list:
    frame.append(pd.read_csv(file, low_memory=False))

results_total = pd.concat(frame)

In [16]:
team_id_to_name = dict(zip(fixtures_total_cleaned.hteam,fixtures_total_cleaned.hteamid))
team_id_to_name_sorted = sorted(team_id_to_name.items(), key=lambda x:x[1], reverse=False)

# Feature Engineering with fixtures

In [19]:
home_features = ['hgoals', 'hbehinds', 'hscore']
away_features = ['agoals', 'abehinds', 'ascore']
def create_exp_weighted_avgs(df, span):
    ema_features_home = fixtures_usable[['id','hteamid']].copy()
        # Create a copy of the df
    for feat_name in home_features:
        feature_ema = (df.groupby('hteamid')[feat_name]
                        .transform(lambda row: (row.ewm(span-span)
                                                .mean()
                                                .shift(1))))
        feat_name = 'ewm_' + str(span) + '_' + feat_name
        ema_features_home[feat_name] = feature_ema
    
    ema_features_away = fixtures_usable[['id','ateamid']].copy()
    for feat_name in away_features:
        feature_ema = (df.groupby('ateamid')[feat_name]
                        .transform(lambda row: (row.ewm(span-span)
                                                .mean()
                                                .shift(1))))
        feat_name = 'ewm_' + str(span) + '_' + feat_name
        ema_features_away[feat_name] = feature_ema
    
    return ema_features_home, ema_features_away

features_rolling_averages_home, features_rolling_averages_away = create_exp_weighted_avgs(fixtures_usable, span=10)

features_rolling_averages_home.drop(columns=['hteamid'], inplace=True)
features_rolling_averages_away.drop(columns=['ateamid'], inplace=True)

fixtures_usable_merged = pd.merge(fixtures_usable, features_rolling_averages_home, on="id", how='left')
fixtures_usable_merged = pd.merge(fixtures_usable, features_rolling_averages_away, on="id", how='left')

In [24]:
fixtures_usable_training = fixtures_usable_merged.drop(columns=['ascore','tz','id','hgoals','hbehinds','abehinds','hscore','agoals'])
fixtures_usable_training.dropna(inplace=True)
fixtures_usable_training['home_team_winner'] = fixtures_usable_training.apply(lambda x : 1 if x['winnerteamid'] == x['hteamid'] else 0, axis=1)
fixtures_usable_training = fixtures_usable_training.drop(columns=['winnerteamid'])
fixtures_usable_training = pd.get_dummies(fixtures_usable_training)

# Feature Engineering with Player Stats

In [62]:
print(player_stats_total.columns.values)

['venue_name' 'match_id' 'match_home_team' 'match_away_team' 'match_date'
 'match_local_time' 'match_attendance' 'match_round'
 'match_home_team_goals' 'match_home_team_behinds' 'match_home_team_score'
 'match_away_team_goals' 'match_away_team_behinds' 'match_away_team_score'
 'match_margin' 'match_winner' 'match_weather_temp_c' 'match_weather_type'
 'player_id' 'player_first_name' 'player_last_name' 'player_height_cm'
 'player_weight_kg' 'player_is_retired' 'player_team' 'guernsey_number'
 'kicks' 'marks' 'handballs' 'disposals' 'effective_disposals'
 'disposal_efficiency_percentage' 'goals' 'behinds' 'hitouts' 'tackles'
 'rebounds' 'inside_fifties' 'clearances' 'clangers' 'free_kicks_for'
 'free_kicks_against' 'brownlow_votes' 'contested_possessions'
 'uncontested_possessions' 'contested_marks' 'marks_inside_fifty'
 'one_percenters' 'bounces' 'goal_assists' 'time_on_ground_percentage'
 'afl_fantasy_score' 'supercoach_score' 'centre_clearances'
 'stoppage_clearances' 'score_involvemen

In [63]:
player_stats_total.groupby(['match_id', 'match_home_team'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f27baa45190>

In [None]:
for types in print

# Modelling

In [44]:
target_column = [
    'home_team_winner'
]

In [45]:
train_set = fixtures_usable_training[(fixtures_usable_training['year'] <= 2017) & (fixtures_usable_training['year'] != 2022)] # Only pas
test_set = fixtures_usable_training[fixtures_usable_training['year'] == 2022]

X_train = train_set.drop(columns=[target_column[0]])
X_test = test_set.drop(columns=[target_column[0]])

y_train = train_set[target_column]
y_test = test_set[target_column]

In [49]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

eval_set = [(X_test, y_test)]
clf = XGBClassifier(objective='binary:logistic', n_estimators=500000)

clf.fit(X_train, y_train, eval_metric="auc", eval_set=eval_set,
        early_stopping_rounds=2000,
        verbose=1000)

y_pred_train = clf.predict(X_train)

# evaluate predictions
train_accuracy = accuracy_score(y_train, y_pred_train)
print(f'Accuracy for classifier on train set: {train_accuracy}')

[0]	validation_0-auc:0.51770




[1000]	validation_0-auc:0.52242
[2000]	validation_0-auc:0.52419
[3000]	validation_0-auc:0.52508
[4000]	validation_0-auc:0.52577
[5000]	validation_0-auc:0.52567
[6000]	validation_0-auc:0.52616
[6568]	validation_0-auc:0.52616
Accuracy for classifier on train set: 1.0


In [50]:
y_pred_test = clf.predict(X_test)

# evaluate predictions
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f'Accuracy for classifier on test set: {test_accuracy}')

Accuracy for classifier on test set: 0.529126213592233


In [51]:
y_pred_test_naive = y_test.replace(0, 1)
naive_accuracy = accuracy_score(y_test, y_pred_test_naive)
print(f'Accuracy for naive test set (home only): {naive_accuracy}')

Accuracy for naive test set (home only): 0.6019417475728155
