In [1]:
pip install xgboost lightgbm

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [25]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor 
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'March-Mania-2023/Evan-Data/MNCAA_train_538.csv'
file_key_2 = 'March-Mania-2023/Evan-Data/MNCAA_test_538.csv'
file_key_3 = 'March-Mania-2023/Evan-Data/WNCAA_train_538.csv'
file_key_4 = 'March-Mania-2023/Evan-Data/WNCAA_test_538.csv'

bucket_object_1 = bucket.Object(file_key_1)
file_object_1 = bucket_object_1.get()
file_content_stream_1 = file_object_1.get('Body')

bucket_object_2 = bucket.Object(file_key_2)
file_object_2 = bucket_object_2.get()
file_content_stream_2 = file_object_2.get('Body')

bucket_object_3 = bucket.Object(file_key_3)
file_object_3 = bucket_object_3.get()
file_content_stream_3 = file_object_3.get('Body')

bucket_object_4 = bucket.Object(file_key_4)
file_object_4 = bucket_object_4.get()
file_content_stream_4 = file_object_4.get('Body')

## Reading data files
man_train = pd.read_csv(file_content_stream_1)
# man_test = pd.read_csv(file_content_stream_2)
woman_train = pd.read_csv(file_content_stream_3)
# woman_test = pd.read_csv(file_content_stream_4)

man_train['target'] = np.where(man_train['ResultDiff'] > 0, 1, 0)
man_train = man_train[man_train['Season'] >= 2016].reset_index(drop = True)

woman_train['target'] = np.where(woman_train['ResultDiff'] > 0, 1, 0)
woman_train = woman_train[woman_train['Season'] >= 2016].reset_index(drop = True)

man_test = pd.read_csv('man_test_tour_phase_1.csv')
woman_test = pd.read_csv('woman_test_tour_phase_1.csv')

In [26]:
man_train['target'].value_counts()

1    401
0    401
Name: target, dtype: int64

# Man

In [27]:
to_select = ['X1_WinRatio14d',
             'X1_PointsMean',
             'X1_PointsMedian',
             'X1_PointsDiffMean',
             'X1_FgaMean',
             'X1_FgaMedian',
             'X1_FgaMin',
             'X1_FgaMax',
             'X1_AstMean',
             'X1_BlkMean',
             'X1_OppFgaMean',
             'X1_OppFgaMin',
             'X1_EfgpMean',
             'X1_PossessionsMean',
             'X1_PpmMean',
             'X1_FtrMean',
             'X1_TopMean',
             'X1_DrebpMean',
             'X2_WinRatio14d',
             'X2_PointsMean',
             'X2_PointsMedian',
             'X2_PointsDiffMean',
             'X2_FgaMean',
             'X2_FgaMedian',
             'X2_FgaMin',
             'X2_FgaMax',
             'X2_AstMean',
             'X2_BlkMean',
             'X2_OppFgaMean',
             'X2_OppFgaMin',
             'X2_EfgpMean',
             'X2_PossessionsMean',
             'X2_PpmMean',
             'X2_FtrMean',
             'X2_TopMean',
             'X2_DrebpMean',
             'Seed1',
             'Seed2',
             'SeedDiff',
             'quality_march_T1',
             'quality_march_T2', 
             'ResultDiff',
             'X1_team_rating',
             'X1_rd1_win',
             'X1_rd2_win',
             'X1_rd3_win',
             'X1_rd4_win',
             'X1_rd5_win',
             'X1_rd6_win',
             'X1_rd7_win',
             'X2_team_rating',
             'X2_rd1_win',
             'X2_rd2_win',
             'X2_rd3_win',
             'X2_rd4_win',
             'X2_rd5_win',
             'X2_rd6_win',
             'X2_rd7_win']

X = man_train[to_select]
Y = man_train['target']

man_test_tour_ID = man_test['ID']
man_test_tour = man_test[to_select]

In [36]:
#############
## XGBoost ##
#############

xgb_params = pd.read_csv('man_XGB_Phase_2_42_Optuna_Hyperparameters.csv')

xgb_md = XGBClassifier(tree_method = 'hist', 
                       max_depth = xgb_params['max_depth'][0],
                       learning_rate = xgb_params['learning_rate'][0],
                       n_estimators = xgb_params['n_estimators'][0],
                       gamma = xgb_params['gamma'][0],
                       min_child_weight = xgb_params['min_child_weight'][0],
                       colsample_bytree = xgb_params['colsample_bytree'][0],
                       subsample = xgb_params['subsample'][0]).fit(X, Y)

xgb_pred = xgb_md.predict_proba(man_test_tour)[:, 1]

##############
## LightGBM ##
##############

lgb_params = pd.read_csv('man_LightGBM_Phase_2_42_Optuna_Hyperparameters.csv')

lgb_md = LGBMClassifier(boosting_type = 'gbdt', 
                        n_estimators = lgb_params['n_estimators'][0],
                        learning_rate = lgb_params['learning_rate'][0],
                        max_depth = lgb_params['max_depth'][0],
                        lambda_l1 = lgb_params['lambda_l1'][0],
                        lambda_l2 = lgb_params['lambda_l2'][0],
                        num_leaves = lgb_params['num_leaves'][0],
                        bagging_fraction = lgb_params['bagging_fraction'][0],
                        feature_fraction = lgb_params['feature_fraction'][0]).fit(X, Y)

lgb_pred = lgb_md.predict_proba(man_test_tour)[:, 1]

##################
## HistGradient ##
##################

hist_params = pd.read_csv('man_Hist_Phase_2_42_Optuna_Hyperparameters.csv')

hist_md = HistGradientBoostingClassifier(l2_regularization = hist_params['l2_regularization'][0],
                                         early_stopping = False,
                                         learning_rate = hist_params['learning_rate'][0],
                                         max_iter = hist_params['max_iter'][0],
                                         max_depth = hist_params['max_depth'][0],
                                         max_bins = hist_params['max_bins'][0],
                                         min_samples_leaf = hist_params['min_samples_leaf'][0],
                                         max_leaf_nodes = hist_params['max_leaf_nodes'][0]).fit(X, Y)

hist_pred = hist_md.predict_proba(man_test_tour)[:, 1]

##############
## Ensemble ##
##############

ens_pred = (xgb_pred + lgb_pred + hist_pred) / 3
man_test_tour['Pred'] = ens_pred



In [37]:
man_test_tour.head()

Unnamed: 0,X1_WinRatio14d,X1_PointsMean,X1_PointsMedian,X1_PointsDiffMean,X1_FgaMean,X1_FgaMedian,X1_FgaMin,X1_FgaMax,X1_AstMean,X1_BlkMean,X1_OppFgaMean,X1_OppFgaMin,X1_EfgpMean,X1_PossessionsMean,X1_PpmMean,X1_FtrMean,X1_TopMean,X1_DrebpMean,X2_WinRatio14d,X2_PointsMean,X2_PointsMedian,X2_PointsDiffMean,X2_FgaMean,X2_FgaMedian,X2_FgaMin,X2_FgaMax,X2_AstMean,X2_BlkMean,X2_OppFgaMean,X2_OppFgaMin,X2_EfgpMean,X2_PossessionsMean,X2_PpmMean,X2_FtrMean,X2_TopMean,X2_DrebpMean,Seed1,Seed2,SeedDiff,quality_march_T1,quality_march_T2,ResultDiff,X1_team_rating,X1_rd1_win,X1_rd2_win,X1_rd3_win,X1_rd4_win,X1_rd5_win,X1_rd6_win,X1_rd7_win,X2_team_rating,X2_rd1_win,X2_rd2_win,X2_rd3_win,X2_rd4_win,X2_rd5_win,X2_rd6_win,X2_rd7_win,Pred
0,0.8,82.176471,80.0,13.676471,62.205882,63.0,48,90,15.205882,5.058824,64.382353,53,0.52876,75.384559,1.884614,0.275133,0.17941,0.740689,0.8,82.676471,84.5,11.176471,59.264706,59.0,49,71,19.176471,3.205882,63.294118,45,0.569708,73.827206,1.84568,0.265131,0.176279,0.729807,1.0,2.0,-1.0,1.313516,1.376789,4,92.24,1.0,0.985207,0.811241,0.640182,0.436759,0.289307,0.146452,88.98,1.0,0.941026,0.676923,0.35135,0.149377,0.085218,0.036114,0.948642
1,0.8,82.176471,80.0,13.676471,62.205882,63.0,48,90,15.205882,5.058824,64.382353,53,0.52876,75.384559,1.884614,0.275133,0.17941,0.740689,0.4,70.323529,68.5,2.617647,60.470588,62.0,47,75,14.235294,4.676471,58.294118,49,0.481441,71.490441,1.787261,0.220176,0.160698,0.680526,1.0,11.0,-10.0,1.313516,0.950975,12,92.24,1.0,0.985207,0.811241,0.640182,0.436759,0.289307,0.146452,80.36,0.544272,0.183005,0.0503,0.01923,0.007441,0.001812,0.000619,0.950995
2,0.8,82.176471,80.0,13.676471,62.205882,63.0,48,90,15.205882,5.058824,64.382353,53,0.52876,75.384559,1.884614,0.275133,0.17941,0.740689,0.25,74.424242,76.0,7.030303,56.909091,56.0,48,70,13.424242,5.242424,55.757576,44,0.522086,71.236364,1.780909,0.270631,0.176877,0.736434,1.0,8.0,-7.0,1.313516,0.834293,10,92.24,1.0,0.985207,0.811241,0.640182,0.436759,0.289307,0.146452,85.12,1.0,0.516337,0.179549,0.086244,0.035891,0.013111,0.006306,0.950323
3,0.8,82.176471,80.0,13.676471,62.205882,63.0,48,90,15.205882,5.058824,64.382353,53,0.52876,75.384559,1.884614,0.275133,0.17941,0.740689,0.333333,72.71875,72.0,5.65625,58.59375,59.0,47,73,14.1875,5.09375,56.4375,47,0.495432,70.496094,1.762402,0.255005,0.167288,0.687117,1.0,9.0,-8.0,1.313516,0.937026,11,92.24,1.0,0.985207,0.811241,0.640182,0.436759,0.289307,0.146452,84.86,1.0,0.579162,0.146244,0.082779,0.037729,0.017131,0.007497,0.950384
4,0.8,82.176471,80.0,13.676471,62.205882,63.0,48,90,15.205882,5.058824,64.382353,53,0.52876,75.384559,1.884614,0.275133,0.17941,0.740689,0.333333,77.15625,77.0,6.90625,57.59375,56.5,50,72,14.5,2.375,56.1875,45,0.530645,69.980469,1.749512,0.279926,0.170526,0.667339,1.0,3.0,-2.0,1.313516,0.97997,5,92.24,1.0,0.985207,0.811241,0.640182,0.436759,0.289307,0.146452,87.12,1.0,0.889041,0.45151,0.241393,0.097918,0.057396,0.025092,0.949934


In [31]:
np.max(xgb_pred)

0.9916419

In [19]:
lgb_pred

array([0.99941535, 0.99947264, 0.99947264, ..., 0.99947264, 0.99936685,
       0.9993514 ])

In [33]:
hist_pred

array([0.87067264, 0.87067264, 0.87067264, ..., 0.87067264, 0.87067264,
       0.87067264])

In [34]:
np.min(hist_pred)

0.129327356666901

In [35]:
np.max(hist_pred)

0.870672643333099