In [None]:
pip install xgboost lightgbm

In [None]:
import boto3
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns

from scipy.stats import rankdata
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, LGBMRegressor 
from xgboost import XGBClassifier, XGBRegressor
# from catboost import CatBoostClassifier, CatBoostRegressor

# import optuna 

s3 = boto3.resource('s3')
bucket_name = 'analytics-data-science-competitions'
bucket = s3.Bucket(bucket_name)

file_key_1 = 'March-Mania-2023/Evan-Data/MNCAA_train_538.csv'
file_key_2 = 'March-Mania-2023/Evan-Data/MNCAA_test_538.csv'
file_key_3 = 'March-Mania-2023/Evan-Data/WNCAA_train_538.csv'
file_key_4 = 'March-Mania-2023/Evan-Data/WNCAA_test_538.csv'
file_key_5 = 'March-Mania-2023/Evan-Data/SampleSubmission2023.csv'


def _read_s3_csv(file_key):
    """Fetch `file_key` from `bucket` and parse the object's body as CSV.

    Replaces the Object -> get() -> 'Body' -> read_csv sequence that was
    previously written out once per file.

    Parameters
    ----------
    file_key : str
        Key of the object inside `bucket`.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the object.
    """
    return pd.read_csv(bucket.Object(file_key).get().get('Body'))


## Reading data files
man_train = _read_s3_csv(file_key_1)
man_test = _read_s3_csv(file_key_2)
woman_train = _read_s3_csv(file_key_3)
woman_test = _read_s3_csv(file_key_4)

submission = _read_s3_csv(file_key_5)

## Binary target: 1 when 'ResultDiff' is strictly positive, 0 otherwise.
## The *_2016 frames keep only rows with Season >= 2016 (index rebuilt from 0).
man_train['target'] = np.where(man_train['ResultDiff'] > 0, 1, 0)
man_train_2016 = man_train[man_train['Season'] >= 2016].reset_index(drop = True)

woman_train['target'] = np.where(woman_train['ResultDiff'] > 0, 1, 0)
woman_train_2016 = woman_train[woman_train['Season'] >= 2016].reset_index(drop = True)

# man_test = pd.read_csv('man_test_tour_phase_1.csv')
# woman_test = pd.read_csv('woman_test_tour_phase_1.csv')

# Solution 1
## Phase 1: Man

In [None]:
# Statistic suffixes shared by the 'X1_' and 'X2_' column families.
team_stats = ['WinRatio14d', 'PointsMean', 'PointsMedian', 'PointsDiffMean',
              'FgaMean', 'FgaMedian', 'FgaMin', 'FgaMax',
              'AstMean', 'BlkMean', 'OppFgaMean', 'OppFgaMin',
              'EfgpMean', 'PossessionsMean', 'PpmMean', 'FtrMean',
              'TopMean', 'DrebpMean']

# Columns that are not prefixed per side.
matchup_cols = ['Seed1', 'Seed2', 'SeedDiff', 'quality_march_T1', 'quality_march_T2']

# Column order: every X1_ column, then every X2_ column, then the unprefixed columns.
to_select = [side + '_' + stat for side in ('X1', 'X2') for stat in team_stats] + matchup_cols

# Phase 1 training matrix and target (the 'ResultDiff' column).
X, Y = man_train[to_select], man_train['ResultDiff']

# Keep only the test rows where both seeds are present, then separate the
# 'ID' column from the feature columns.
has_both_seeds = man_test['Seed1'].notna() & man_test['Seed2'].notna()
man_test_tour = man_test[has_both_seeds].reset_index(drop = True)
man_test_IDS = man_test_tour['ID']
man_test_tour = man_test_tour[to_select]

#############
## XGBoost ##
#############

# Hyperparameters are taken from the first row of the CSV.
xgb_params = pd.read_csv('man_XGB_Phase_1_42_Optuna_Hyperparameters.csv')
xgb_preds = list()

# Seed averaging: refit the same configuration under 100 different seeds and
# keep one prediction vector per seed.
for i in tqdm(range(100)):

    xgb_md = XGBRegressor(tree_method = 'hist', 
                          max_depth = xgb_params['max_depth'][0],
                          learning_rate = xgb_params['learning_rate'][0],
                          n_estimators = xgb_params['n_estimators'][0],
                          gamma = xgb_params['gamma'][0],
                          min_child_weight = xgb_params['min_child_weight'][0],
                          colsample_bytree = xgb_params['colsample_bytree'][0],
                          subsample = xgb_params['subsample'][0],
                          # Fix: the seed was hard-coded to 1, so all 100 fits shared one
                          # seed; use the loop index like the other seed-averaging loops.
                          random_state = i).fit(X, Y)

    xgb_pred = xgb_md.predict(man_test_tour)
    xgb_preds.append(xgb_pred)
    
##############
## LightGBM ##
##############

lgb_params = pd.read_csv('man_LightGBM_Phase_1_42_Optuna_Hyperparameters.csv')

# The tuned values (first CSV row) do not change between seeds, so read them once.
lgb_kwargs = {name: lgb_params.at[0, name]
              for name in ('n_estimators', 'learning_rate', 'max_depth',
                           'lambda_l1', 'lambda_l2', 'num_leaves',
                           'bagging_fraction', 'feature_fraction')}

lgb_preds = []

# One fit / predict per seed; predictions are collected for later averaging.
for seed in tqdm(range(100)):

    lgb_md = LGBMRegressor(boosting_type = 'gbdt', random_state = seed, **lgb_kwargs)
    lgb_md.fit(X, Y)

    lgb_pred = lgb_md.predict(man_test_tour)
    lgb_preds.append(lgb_pred)
    
##################
## HistGradient ##
##################

hist_params = pd.read_csv('man_Hist_Phase_1_42_Optuna_Hyperparameters.csv')

# The tuned values (first CSV row) do not change between seeds, so read them once.
hist_kwargs = {name: hist_params.at[0, name]
               for name in ('l2_regularization', 'learning_rate', 'max_iter',
                            'max_depth', 'max_bins', 'min_samples_leaf',
                            'max_leaf_nodes')}

hist_preds = []

# One fit / predict per seed; predictions are collected for later averaging.
for seed in tqdm(range(100)):

    hist_md = HistGradientBoostingRegressor(early_stopping = False,
                                            random_state = seed,
                                            **hist_kwargs)
    hist_md.fit(X, Y)

    hist_pred = hist_md.predict(man_test_tour)
    hist_preds.append(hist_pred)

##############
## Ensemble ##
##############

# Each *_preds list holds one prediction vector per seed; stacking them as rows
# and averaging down the columns gives one seed-averaged value per test row.
xgb_pred = pd.DataFrame(xgb_preds).apply(np.mean, axis = 0)
lgb_pred = pd.DataFrame(lgb_preds).apply(np.mean, axis = 0)
hist_pred = pd.DataFrame(hist_preds).apply(np.mean, axis = 0)

# Unweighted mean of the three model families.
ens_pred = (xgb_pred + lgb_pred + hist_pred) / 3

# Fix: np.round_ was removed in NumPy 2.0; np.round is the supported spelling.
man_test_tour['ResultDiff'] = np.round(ens_pred, decimals = 0).astype(int)
man_test_tour['ID'] = man_test_IDS

########################
## Appending 538 data ##
########################

# 'ID' followed, for each of the X1 / X2 prefixes, by the team rating column
# and the rd1..rd7 win columns.
data_538 = ['ID'] + [side + '_' + suffix
                     for side in ('X1', 'X2')
                     for suffix in ['team_rating'] + ['rd%d_win' % rd for rd in range(1, 8)]]

# Left-join the data_538 columns of man_test onto the tournament rows by 'ID'.
man_test_538 = man_test.loc[:, data_538]
man_test_tour = man_test_tour.merge(man_test_538, how = 'left', on = 'ID')

## Phase 2: Man

In [None]:
# Statistic suffixes shared by the 'X1_' and 'X2_' column families.
team_stats = ['WinRatio14d', 'PointsMean', 'PointsMedian', 'PointsDiffMean',
              'FgaMean', 'FgaMedian', 'FgaMin', 'FgaMax',
              'AstMean', 'BlkMean', 'OppFgaMean', 'OppFgaMin',
              'EfgpMean', 'PossessionsMean', 'PpmMean', 'FtrMean',
              'TopMean', 'DrebpMean']

# Columns that are not prefixed per side; 'ResultDiff' is included as a feature here.
matchup_cols = ['Seed1', 'Seed2', 'SeedDiff', 'quality_march_T1', 'quality_march_T2',
                'ResultDiff']

# Team rating plus the rd1..rd7 win columns, per side.
rating_suffixes = ['team_rating'] + ['rd%d_win' % rd for rd in range(1, 8)]

# Column order: X1_ stats, X2_ stats, unprefixed columns, X1_ rating block, X2_ rating block.
to_select = ([side + '_' + stat for side in ('X1', 'X2') for stat in team_stats]
             + matchup_cols
             + [side + '_' + suffix for side in ('X1', 'X2') for suffix in rating_suffixes])

# Phase 2 training matrix and binary target, both from the Season >= 2016 subset.
# NOTE(review): to_select contains 'ResultDiff', and 'target' is computed as
# ResultDiff > 0 when the data is loaded, so the training rows carry the column the
# target is derived from while the test rows carry the Phase 1 predicted value.
# This looks like target leakage -- confirm it is intended.
X, Y = man_train_2016[to_select], man_train_2016['target']

# Set the IDs aside, then reduce the test frame to the model's feature columns.
man_test_tour_ID = man_test_tour['ID']
man_test_tour = man_test_tour.loc[:, to_select]

#############
## XGBoost ##
#############

xgb_params = pd.read_csv('man_XGB_Phase_2_42_Optuna_Hyperparameters.csv')

# The tuned values (first CSV row) do not change between seeds, so read them once.
xgb_kwargs = {name: xgb_params.at[0, name]
              for name in ('max_depth', 'learning_rate', 'n_estimators', 'gamma',
                           'min_child_weight', 'colsample_bytree', 'subsample')}

xgb_preds = []

# One fit per seed; keep the predicted probability of class 1 for each test row.
for seed in tqdm(range(100)):

    xgb_md = XGBClassifier(tree_method = 'hist', random_state = seed, **xgb_kwargs)
    xgb_md.fit(X, Y)

    xgb_pred = xgb_md.predict_proba(man_test_tour)[:, 1]
    xgb_preds.append(xgb_pred)

##############
## LightGBM ##
##############

lgb_params = pd.read_csv('man_LightGBM_Phase_2_42_Optuna_Hyperparameters.csv')

# The tuned values (first CSV row) do not change between seeds, so read them once.
lgb_kwargs = {name: lgb_params.at[0, name]
              for name in ('n_estimators', 'learning_rate', 'max_depth',
                           'lambda_l1', 'lambda_l2', 'num_leaves',
                           'bagging_fraction', 'feature_fraction')}

lgb_preds = []

# One fit per seed; keep the predicted probability of class 1 for each test row.
for seed in tqdm(range(100)):

    lgb_md = LGBMClassifier(boosting_type = 'gbdt', random_state = seed, **lgb_kwargs)
    lgb_md.fit(X, Y)

    lgb_pred = lgb_md.predict_proba(man_test_tour)[:, 1]
    lgb_preds.append(lgb_pred)

##################
## HistGradient ##
##################

hist_params = pd.read_csv('man_Hist_Phase_2_42_Optuna_Hyperparameters.csv')

# The tuned values (first CSV row) do not change between seeds, so read them once.
hist_kwargs = {name: hist_params.at[0, name]
               for name in ('l2_regularization', 'learning_rate', 'max_iter',
                            'max_depth', 'max_bins', 'min_samples_leaf',
                            'max_leaf_nodes')}

hist_preds = []

# One fit per seed; keep the predicted probability of class 1 for each test row.
for seed in tqdm(range(100)):

    hist_md = HistGradientBoostingClassifier(early_stopping = False,
                                             random_state = seed,
                                             **hist_kwargs)
    hist_md.fit(X, Y)

    hist_pred = hist_md.predict_proba(man_test_tour)[:, 1]
    hist_preds.append(hist_pred)

##############
## Ensemble ##
##############

# Stack each model's per-seed probability vectors as rows and average down the
# columns, giving one seed-averaged probability per test row and model family.
xgb_pred, lgb_pred, hist_pred = (
    pd.DataFrame(seed_preds).apply(np.mean, axis = 0)
    for seed_preds in (xgb_preds, lgb_preds, hist_preds)
)

# Unweighted mean of the three model families, paired with the test IDs.
ens_pred = (xgb_pred + lgb_pred + hist_pred) / 3
man_pred = pd.DataFrame({'ID': man_test_tour_ID, 'Pred': ens_pred})
man_pred.head()

## Phase 1: Woman

In [None]:
# Statistic suffixes shared by the 'X1_' and 'X2_' column families.
team_stats = ['WinRatio14d', 'PointsMean', 'PointsMedian', 'PointsDiffMean',
              'FgaMean', 'FgaMedian', 'FgaMin', 'FgaMax',
              'AstMean', 'BlkMean', 'OppFgaMean', 'OppFgaMin',
              'EfgpMean', 'PossessionsMean', 'PpmMean', 'FtrMean',
              'TopMean', 'DrebpMean']

# Columns that are not prefixed per side.
matchup_cols = ['Seed1', 'Seed2', 'SeedDiff', 'quality_march_T1', 'quality_march_T2']

# Column order: every X1_ column, then every X2_ column, then the unprefixed columns.
to_select = [side + '_' + stat for side in ('X1', 'X2') for stat in team_stats] + matchup_cols

# Phase 1 training matrix and target (the 'ResultDiff' column).
X, Y = woman_train[to_select], woman_train['ResultDiff']

# Keep only the test rows where both seeds are present, then separate the
# 'ID' column from the feature columns.
has_both_seeds = woman_test['Seed1'].notna() & woman_test['Seed2'].notna()
woman_test_tour = woman_test[has_both_seeds].reset_index(drop = True)
woman_test_IDS = woman_test_tour['ID']
woman_test_tour = woman_test_tour[to_select]

#############
## XGBoost ##
#############

# Hyperparameters are taken from the first row of the CSV.
xgb_params = pd.read_csv('woman_XGB_Phase_1_42_Optuna_Hyperparameters.csv')
xgb_preds = list()

# Seed averaging: refit the same configuration under 100 different seeds and
# keep one prediction vector per seed.
for i in tqdm(range(100)):

    xgb_md = XGBRegressor(tree_method = 'hist', 
                          max_depth = xgb_params['max_depth'][0],
                          learning_rate = xgb_params['learning_rate'][0],
                          n_estimators = xgb_params['n_estimators'][0],
                          gamma = xgb_params['gamma'][0],
                          min_child_weight = xgb_params['min_child_weight'][0],
                          colsample_bytree = xgb_params['colsample_bytree'][0],
                          subsample = xgb_params['subsample'][0],
                          # Fix: the seed was hard-coded to 1, so all 100 fits shared one
                          # seed; use the loop index like the other seed-averaging loops.
                          random_state = i).fit(X, Y)

    xgb_pred = xgb_md.predict(woman_test_tour)
    xgb_preds.append(xgb_pred)
    
##############
## LightGBM ##
##############

lgb_params = pd.read_csv('woman_LightGBM_Phase_1_42_Optuna_Hyperparameters.csv')

# The tuned values (first CSV row) do not change between seeds, so read them once.
lgb_kwargs = {name: lgb_params.at[0, name]
              for name in ('n_estimators', 'learning_rate', 'max_depth',
                           'lambda_l1', 'lambda_l2', 'num_leaves',
                           'bagging_fraction', 'feature_fraction')}

lgb_preds = []

# One fit / predict per seed; predictions are collected for later averaging.
for seed in tqdm(range(100)):

    lgb_md = LGBMRegressor(boosting_type = 'gbdt', random_state = seed, **lgb_kwargs)
    lgb_md.fit(X, Y)

    lgb_pred = lgb_md.predict(woman_test_tour)
    lgb_preds.append(lgb_pred)
    
##################
## HistGradient ##
##################

hist_params = pd.read_csv('woman_Hist_Phase_1_42_Optuna_Hyperparameters.csv')

# The tuned values (first CSV row) do not change between seeds, so read them once.
hist_kwargs = {name: hist_params.at[0, name]
               for name in ('l2_regularization', 'learning_rate', 'max_iter',
                            'max_depth', 'max_bins', 'min_samples_leaf',
                            'max_leaf_nodes')}

hist_preds = []

# One fit / predict per seed; predictions are collected for later averaging.
for seed in tqdm(range(100)):

    hist_md = HistGradientBoostingRegressor(early_stopping = False,
                                            random_state = seed,
                                            **hist_kwargs)
    hist_md.fit(X, Y)

    hist_pred = hist_md.predict(woman_test_tour)
    hist_preds.append(hist_pred)

##############
## Ensemble ##
##############

# Each *_preds list holds one prediction vector per seed; stacking them as rows
# and averaging down the columns gives one seed-averaged value per test row.
xgb_pred = pd.DataFrame(xgb_preds).apply(np.mean, axis = 0)
lgb_pred = pd.DataFrame(lgb_preds).apply(np.mean, axis = 0)
hist_pred = pd.DataFrame(hist_preds).apply(np.mean, axis = 0)

# Unweighted mean of the three model families.
ens_pred = (xgb_pred + lgb_pred + hist_pred) / 3

# Fix: np.round_ was removed in NumPy 2.0; np.round is the supported spelling.
woman_test_tour['ResultDiff'] = np.round(ens_pred, decimals = 0).astype(int)
woman_test_tour['ID'] = woman_test_IDS

########################
## Appending 538 data ##
########################

# 'ID' followed, for each of the X1 / X2 prefixes, by the team rating column
# and the rd1..rd7 win columns.
data_538 = ['ID'] + [side + '_' + suffix
                     for side in ('X1', 'X2')
                     for suffix in ['team_rating'] + ['rd%d_win' % rd for rd in range(1, 8)]]

# Left-join the data_538 columns of woman_test onto the tournament rows by 'ID'.
woman_test_538 = woman_test.loc[:, data_538]
woman_test_tour = woman_test_tour.merge(woman_test_538, how = 'left', on = 'ID')