In [1]:
import numpy as np
import pandas as pd
import os
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from scipy.interpolate import UnivariateSpline
import statsmodels.api as sm
import matplotlib.pyplot as plt
import collections

pd.set_option("display.max_column", 999)
print(os.listdir("../input"))

['march-machine-learning-mania-2025']


In [2]:
DATA_PATH = '/kaggle/input/march-machine-learning-mania-2025/'

In [4]:
massey = pd.read_csv(DATA_PATH + "MMasseyOrdinals.csv")

tourney_results = pd.concat([
    pd.read_csv(DATA_PATH + "MNCAATourneyDetailedResults.csv"),
    pd.read_csv(DATA_PATH + "WNCAATourneyDetailedResults.csv"),
], ignore_index=True)

seeds = pd.concat([
    pd.read_csv(DATA_PATH + "MNCAATourneySeeds.csv"),
    pd.read_csv(DATA_PATH + "WNCAATourneySeeds.csv"),
], ignore_index=True)

regular_results = pd.concat([
    pd.read_csv(DATA_PATH + "MRegularSeasonDetailedResults.csv"),
    pd.read_csv(DATA_PATH + "WRegularSeasonDetailedResults.csv"),
], ignore_index=True)

In [5]:
regular_results_swap = regular_results[[
    'Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT', 
    'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 
    'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']]

regular_results_swap.loc[regular_results['WLoc'] == 'H', 'WLoc'] = 'A'
regular_results_swap.loc[regular_results['WLoc'] == 'A', 'WLoc'] = 'H'
regular_results.columns.values[6] = 'location'
regular_results_swap.columns.values[6] = 'location'

regular_results.columns = [x.replace('W','T1_').replace('L','T2_') for x in list(regular_results.columns)]
regular_results_swap.columns = [x.replace('L','T1_').replace('W','T2_') for x in list(regular_results.columns)]

regular_data = pd.concat([regular_results, regular_results_swap]).sort_index().reset_index(drop = True)

In [6]:
tourney_results = pd.concat([
    pd.read_csv(DATA_PATH + "MNCAATourneyDetailedResults.csv"),
    pd.read_csv(DATA_PATH + "WNCAATourneyDetailedResults.csv"),
], ignore_index=True)

seeds = pd.concat([
    pd.read_csv(DATA_PATH + "MNCAATourneySeeds.csv"),
    pd.read_csv(DATA_PATH + "WNCAATourneySeeds.csv"),
], ignore_index=True)

regular_results = pd.concat([
    pd.read_csv(DATA_PATH + "MRegularSeasonDetailedResults.csv"),
    pd.read_csv(DATA_PATH + "WRegularSeasonDetailedResults.csv"),
], ignore_index=True)

In [7]:
def prepare_data(df):
    dfswap = df[['Season', 'DayNum', 'LTeamID', 'LScore', 'WTeamID', 'WScore', 'WLoc', 'NumOT', 
    'LFGM', 'LFGA', 'LFGM3', 'LFGA3', 'LFTM', 'LFTA', 'LOR', 'LDR', 'LAst', 'LTO', 'LStl', 'LBlk', 'LPF', 
    'WFGM', 'WFGA', 'WFGM3', 'WFGA3', 'WFTM', 'WFTA', 'WOR', 'WDR', 'WAst', 'WTO', 'WStl', 'WBlk', 'WPF']]

    dfswap.loc[df['WLoc'] == 'H', 'WLoc'] = 'A'
    dfswap.loc[df['WLoc'] == 'A', 'WLoc'] = 'H'
    df.columns.values[6] = 'location'
    dfswap.columns.values[6] = 'location'    
      
    df.columns = [x.replace('W','T1_').replace('L','T2_') for x in list(df.columns)]
    dfswap.columns = [x.replace('L','T1_').replace('W','T2_') for x in list(dfswap.columns)]

    output = pd.concat([df, dfswap]).reset_index(drop=True)
    output.loc[output.location=='N','location'] = '0'
    output.loc[output.location=='H','location'] = '1'
    output.loc[output.location=='A','location'] = '-1'
    output.location = output.location.astype(int)
    
    output['PointDiff'] = output['T1_Score'] - output['T2_Score']
    
    return output

regular_data = prepare_data(regular_results)
tourney_data = prepare_data(tourney_results)

In [9]:
boxscore_cols = ['T1_Score', 'T2_Score', 
        'T1_FGM', 'T1_FGA', 'T1_FGM3', 'T1_FGA3', 'T1_FTM', 'T1_FTA', 'T1_OR', 'T1_DR', 'T1_Ast', 'T1_TO', 'T1_Stl', 'T1_Blk', 'T1_PF', 
        'T2_FGM', 'T2_FGA', 'T2_FGM3', 'T2_FGA3', 'T2_FTM', 'T2_FTA', 'T2_OR', 'T2_DR', 'T2_Ast', 'T2_TO', 'T2_Stl', 'T2_Blk', 'T2_PF', 
        'PointDiff']

boxscore_cols = [
        'T1_FGM', 'T1_FGA', 'T1_FGM3', 'T1_FGA3', 'T1_OR', 'T1_Ast', 'T1_TO', 'T1_Stl', 'T1_PF', 
        'T2_FGM', 'T2_FGA', 'T2_FGM3', 'T2_FGA3', 'T2_OR', 'T2_Ast', 'T2_TO', 'T2_Stl', 'T2_Blk',  
        'PointDiff']

In [10]:
funcs = [np.mean]

season_statistics = regular_data.groupby(["Season", 'T1_TeamID'])[boxscore_cols].agg(funcs)
season_statistics = regular_data.groupby(["Season", 'T1_TeamID'])[boxscore_cols].agg(funcs).reset_index()
season_statistics.columns = [''.join(col).strip() for col in season_statistics.columns.values]

season_statistics_T1 = season_statistics.copy()
season_statistics_T2 = season_statistics.copy()

season_statistics_T1.columns = ["T1_" + x.replace("T1_","").replace("T2_","opponent_") for x in list(season_statistics_T1.columns)]
season_statistics_T2.columns = ["T2_" + x.replace("T1_","").replace("T2_","opponent_") for x in list(season_statistics_T2.columns)]
season_statistics_T1.columns.values[0] = "Season"
season_statistics_T2.columns.values[0] = "Season"

tourney_data = tourney_data[['Season', 'DayNum', 'T1_TeamID', 'T1_Score', 'T2_TeamID' ,'T2_Score']]
tourney_data = pd.merge(tourney_data, season_statistics_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, season_statistics_T2, on = ['Season', 'T2_TeamID'], how = 'left')

last14days_stats_T1 = regular_data.loc[regular_data.DayNum>118].reset_index(drop=True)
last14days_stats_T1['win'] = np.where(last14days_stats_T1['PointDiff']>0,1,0)
last14days_stats_T1 = last14days_stats_T1.groupby(['Season','T1_TeamID'])['win'].mean().reset_index(name='T1_win_ratio_14d')

last14days_stats_T2 = regular_data.loc[regular_data.DayNum>118].reset_index(drop=True)
last14days_stats_T2['win'] = np.where(last14days_stats_T2['PointDiff']<0,1,0)
last14days_stats_T2 = last14days_stats_T2.groupby(['Season','T2_TeamID'])['win'].mean().reset_index(name='T2_win_ratio_14d')

tourney_data = pd.merge(tourney_data, last14days_stats_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, last14days_stats_T2, on = ['Season', 'T2_TeamID'], how = 'left')

regular_season_effects = regular_data[['Season','T1_TeamID','T2_TeamID','PointDiff']].copy()
regular_season_effects['T1_TeamID'] = regular_season_effects['T1_TeamID'].astype(str)
regular_season_effects['T2_TeamID'] = regular_season_effects['T2_TeamID'].astype(str)
regular_season_effects['win'] = np.where(regular_season_effects['PointDiff']>0,1,0)
march_madness = pd.merge(seeds[['Season','TeamID']],seeds[['Season','TeamID']],on='Season')
march_madness.columns = ['Season', 'T1_TeamID', 'T2_TeamID']
march_madness.T1_TeamID = march_madness.T1_TeamID.astype(str)
march_madness.T2_TeamID = march_madness.T2_TeamID.astype(str)
regular_season_effects = pd.merge(regular_season_effects, march_madness, on = ['Season','T1_TeamID','T2_TeamID'])

  season_statistics = regular_data.groupby(["Season", 'T1_TeamID'])[boxscore_cols].agg(funcs)
  season_statistics = regular_data.groupby(["Season", 'T1_TeamID'])[boxscore_cols].agg(funcs).reset_index()


In [11]:
def team_quality(season):
    formula = 'win~-1+T1_TeamID+T2_TeamID'
    glm = sm.GLM.from_formula(formula=formula, 
                              data=regular_season_effects.loc[regular_season_effects.Season==season,:], 
                              family=sm.families.Binomial()).fit()
    
    quality = pd.DataFrame(glm.params).reset_index()
    quality.columns = ['TeamID','quality']
    quality['Season'] = season
    #quality['quality'] = np.exp(quality['quality'])
    quality = quality.loc[quality.TeamID.str.contains('T1_')].reset_index(drop=True)
    quality['TeamID'] = quality['TeamID'].apply(lambda x: x[10:14]).astype(int)
    return quality

In [13]:
formula = 'win~-1+T1_TeamID+T2_TeamID'
glm = sm.GLM.from_formula(formula=formula, 
                          data=regular_season_effects.loc[regular_season_effects.Season==2010,:], 
                          family=sm.families.Binomial()).fit()

quality = pd.DataFrame(glm.params).reset_index()

glm_quality = pd.concat([team_quality(2010),
                         team_quality(2011),
                         team_quality(2012),
                         team_quality(2013),
                         team_quality(2014),
                         team_quality(2015),
                         team_quality(2016),
                         team_quality(2017),
                         team_quality(2018),
                         team_quality(2019),
                         ##team_quality(2020),
                         team_quality(2021),
                         team_quality(2022),
                         team_quality(2023),
                         team_quality(2024)
                         ]).reset_index(drop=True)

glm_quality_T1 = glm_quality.copy()
glm_quality_T2 = glm_quality.copy()
glm_quality_T1.columns = ['T1_TeamID','T1_quality','Season']
glm_quality_T2.columns = ['T2_TeamID','T2_quality','Season']

  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)
  t = np.exp(-z)


In [14]:
tourney_data = pd.merge(tourney_data, glm_quality_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, glm_quality_T2, on = ['Season', 'T2_TeamID'], how = 'left')

seeds['seed'] = seeds['Seed'].apply(lambda x: int(x[1:3]))
seeds_T1 = seeds[['Season','TeamID','seed']].copy()
seeds_T2 = seeds[['Season','TeamID','seed']].copy()
seeds_T1.columns = ['Season','T1_TeamID','T1_seed']
seeds_T2.columns = ['Season','T2_TeamID','T2_seed']

tourney_data = pd.merge(tourney_data, seeds_T1, on = ['Season', 'T1_TeamID'], how = 'left')
tourney_data = pd.merge(tourney_data, seeds_T2, on = ['Season', 'T2_TeamID'], how = 'left')
tourney_data["Seed_diff"] = tourney_data["T1_seed"] - tourney_data["T2_seed"]

In [15]:
tourney_data['IsMensTeam'] = (tourney_data['T1_TeamID'] < 1500).astype(int)

latest_ranking_days = massey.groupby('Season')['RankingDayNum'].apply(
    lambda x: x[x < 133].max() if any(x < 133) else x.max()
).reset_index()
latest_ranking_days.rename(columns={'RankingDayNum': 'LatestRankingDay'}, inplace=True)

massey_latest = pd.merge(massey, latest_ranking_days, on='Season')
massey_latest = massey_latest[massey_latest['RankingDayNum'] == massey_latest['LatestRankingDay']]

key_systems = ['AP', 'KEN', 'SAG', 'MOR', 'POM', 'RTH', 'DOL', 'RPI']

for system in key_systems:
    system_ranks = massey_latest[massey_latest['SystemName'] == system].copy()
    
    if len(system_ranks) > 0:
        system_ranks_T1 = system_ranks[['Season', 'TeamID', 'OrdinalRank']].copy()
        system_ranks_T2 = system_ranks[['Season', 'TeamID', 'OrdinalRank']].copy()
        
        system_ranks_T1.columns = ['Season', 'T1_TeamID', f'T1_{system}_Rank']
        system_ranks_T2.columns = ['Season', 'T2_TeamID', f'T2_{system}_Rank']
        
        tourney_data = pd.merge(tourney_data, system_ranks_T1, on=['Season', 'T1_TeamID'], how='left')
        tourney_data = pd.merge(tourney_data, system_ranks_T2, on=['Season', 'T2_TeamID'], how='left')
        
        tourney_data[f'{system}_Rank_diff'] = tourney_data[f'T1_{system}_Rank'] - tourney_data[f'T2_{system}_Rank']

avg_ranks = massey_latest.groupby(['Season', 'TeamID'])['OrdinalRank'].mean().reset_index()
avg_ranks.columns = ['Season', 'TeamID', 'AvgRank']

avg_ranks_T1 = avg_ranks.copy()
avg_ranks_T2 = avg_ranks.copy()
avg_ranks_T1.columns = ['Season', 'T1_TeamID', 'T1_AvgRank']
avg_ranks_T2.columns = ['Season', 'T2_TeamID', 'T2_AvgRank']

tourney_data = pd.merge(tourney_data, avg_ranks_T1, on=['Season', 'T1_TeamID'], how='left')
tourney_data = pd.merge(tourney_data, avg_ranks_T2, on=['Season', 'T2_TeamID'], how='left')

tourney_data['AvgRank_diff'] = tourney_data['T1_AvgRank'] - tourney_data['T2_AvgRank']

In [16]:
rank_std = massey_latest.groupby(['Season', 'TeamID'])['OrdinalRank'].std().reset_index()
rank_std.columns = ['Season', 'TeamID', 'RankStd']

rank_std_T1 = rank_std.copy()
rank_std_T2 = rank_std.copy()
rank_std_T1.columns = ['Season', 'T1_TeamID', 'T1_RankStd']
rank_std_T2.columns = ['Season', 'T2_TeamID', 'T2_RankStd']

tourney_data = pd.merge(tourney_data, rank_std_T1, on=['Season', 'T1_TeamID'], how='left')
tourney_data = pd.merge(tourney_data, rank_std_T2, on=['Season', 'T2_TeamID'], how='left')

rank_count = massey_latest.groupby(['Season', 'TeamID']).size().reset_index(name='NumRankings')
rank_count_T1 = rank_count.copy()
rank_count_T2 = rank_count.copy()
rank_count_T1.columns = ['Season', 'T1_TeamID', 'T1_NumRankings']
rank_count_T2.columns = ['Season', 'T2_TeamID', 'T2_NumRankings']

tourney_data = pd.merge(tourney_data, rank_count_T1, on=['Season', 'T1_TeamID'], how='left')
tourney_data = pd.merge(tourney_data, rank_count_T2, on=['Season', 'T2_TeamID'], how='left')

In [17]:
previous_days = massey.groupby(['Season', 'SystemName'])['RankingDayNum'].apply(
    lambda x: x.sort_values().iloc[[-2, -1]].reset_index(drop=True) if len(x) >= 2 else pd.Series([x.iloc[0], x.iloc[0]])
).reset_index()
previous_days.columns = ['Season', 'SystemName', 'idx', 'RankingDayNum']
previous_days = previous_days.pivot(index=['Season', 'SystemName'], columns='idx', values='RankingDayNum').reset_index()
previous_days.columns = ['Season', 'SystemName', 'PreviousRankingDay', 'LastRankingDay']

previous_massey = pd.merge(massey, previous_days, on=['Season', 'SystemName'])
previous_massey = previous_massey[previous_massey['RankingDayNum'] == previous_massey['PreviousRankingDay']]
previous_massey = previous_massey[['Season', 'SystemName', 'TeamID', 'OrdinalRank']]
previous_massey.columns = ['Season', 'SystemName', 'TeamID', 'PreviousRank']

last_massey = pd.merge(massey, previous_days, on=['Season', 'SystemName'])
last_massey = last_massey[last_massey['RankingDayNum'] == last_massey['LastRankingDay']]
last_massey = last_massey[['Season', 'SystemName', 'TeamID', 'OrdinalRank']]
last_massey.columns = ['Season', 'SystemName', 'TeamID', 'LastRank']

momentum = pd.merge(previous_massey, last_massey, on=['Season', 'SystemName', 'TeamID'])
momentum['RankChange'] = momentum['PreviousRank'] - momentum['LastRank']  # Positive means improvement

avg_momentum = momentum.groupby(['Season', 'TeamID'])['RankChange'].mean().reset_index()
avg_momentum.columns = ['Season', 'TeamID', 'AvgMomentum']

avg_momentum_T1 = avg_momentum.copy()
avg_momentum_T2 = avg_momentum.copy()
avg_momentum_T1.columns = ['Season', 'T1_TeamID', 'T1_AvgMomentum']
avg_momentum_T2.columns = ['Season', 'T2_TeamID', 'T2_AvgMomentum']

tourney_data = pd.merge(tourney_data, avg_momentum_T1, on=['Season', 'T1_TeamID'], how='left')
tourney_data = pd.merge(tourney_data, avg_momentum_T2, on=['Season', 'T2_TeamID'], how='left')
tourney_data['Momentum_diff'] = tourney_data['T1_AvgMomentum'] - tourney_data['T2_AvgMomentum']

In [18]:
massey_feature_cols = ['T1_AvgRank', 'T2_AvgRank', 'AvgRank_diff', 
                        'T1_RankStd', 'T2_RankStd',
                        'T1_NumRankings', 'T2_NumRankings',
                        'T1_AvgMomentum', 'T2_AvgMomentum', 'Momentum_diff']

for system in key_systems:
    if f'T1_{system}_Rank' in tourney_data.columns:
        massey_feature_cols.extend([f'T1_{system}_Rank', f'T2_{system}_Rank', f'{system}_Rank_diff'])

for col in massey_feature_cols:
    if col in tourney_data.columns:
        tourney_data[f'{col}_mens'] = tourney_data[col] * tourney_data['IsMensTeam']
    
base_features = list(season_statistics_T1.columns[2:999]) + \
    list(season_statistics_T2.columns[2:999]) + \
    list(seeds_T1.columns[2:999]) + \
    list(seeds_T2.columns[2:999]) + \
    list(last14days_stats_T1.columns[2:999]) + \
    list(last14days_stats_T2.columns[2:999]) + \
    ["Seed_diff"] + ["T1_quality", "T2_quality"]

massey_mens_features = [f'{col}_mens' for col in massey_feature_cols if f'{col}_mens' in tourney_data.columns]

In [19]:
features = base_features + massey_mens_features

y = tourney_data['T1_Score'] - tourney_data['T2_Score']

In [20]:
features = list(season_statistics_T1.columns[2:999]) + \
    list(season_statistics_T2.columns[2:999]) + \
    list(seeds_T1.columns[2:999]) + \
    list(seeds_T2.columns[2:999]) + \
    list(last14days_stats_T1.columns[2:999]) + \
    list(last14days_stats_T2.columns[2:999]) + \
    ["Seed_diff"] + ["T1_quality","T2_quality"]

In [21]:
X = tourney_data[features].values
dtrain = xgb.DMatrix(X, label = y)

In [22]:
def cauchyobj(preds, dtrain):
    labels = dtrain.get_label()
    c = 5000 
    x =  preds-labels    
    grad = x / (x**2/c**2+1)
    hess = -c**2*(x**2-c**2)/(x**2+c**2)**2
    return grad, hess

In [23]:
param = {} 
#param['objective'] = 'reg:linear'
param['eval_metric'] =  'mae'
param['booster'] = 'gbtree'
param['eta'] = 0.02 #change to ~0.02 for final run
param['subsample'] = 0.35
param['colsample_bytree'] = 0.7
param['num_parallel_tree'] = 10 #recommend 10
param['min_child_weight'] = 40
param['gamma'] = 10
param['max_depth'] =  3
param['silent'] = 1

print(param)

{'eval_metric': 'mae', 'booster': 'gbtree', 'eta': 0.02, 'subsample': 0.35, 'colsample_bytree': 0.7, 'num_parallel_tree': 10, 'min_child_weight': 40, 'gamma': 10, 'max_depth': 3, 'silent': 1}


In [24]:
xgb_cv = []
repeat_cv = 10 # recommend 10

for i in range(repeat_cv): 
    print(f"Fold repeater {i}")
    xgb_cv.append(
        xgb.cv(
          params = param,
          dtrain = dtrain,
          obj = cauchyobj,
          num_boost_round = 3000,
          folds = KFold(n_splits = 5, shuffle = True, random_state = i),
          early_stopping_rounds = 25,
          verbose_eval = 50
        )
    )

Fold repeater 0
[0]	train-mae:13.57403+0.05891	test-mae:13.57528+0.23801


Parameters: { "silent" } are not used.



[50]	train-mae:10.43116+0.04229	test-mae:10.53994+0.21149
[100]	train-mae:9.64440+0.03956	test-mae:9.85037+0.17504
[150]	train-mae:9.36710+0.03936	test-mae:9.66491+0.17098
[200]	train-mae:9.21539+0.03992	test-mae:9.59929+0.17247
[250]	train-mae:9.10076+0.03969	test-mae:9.56934+0.17291
[300]	train-mae:8.99963+0.04028	test-mae:9.55189+0.17403
[350]	train-mae:8.90909+0.04034	test-mae:9.54542+0.17252
[400]	train-mae:8.82478+0.04063	test-mae:9.53933+0.17267
[450]	train-mae:8.74394+0.04146	test-mae:9.53533+0.17438
[481]	train-mae:8.69652+0.04095	test-mae:9.53598+0.17632
Fold repeater 1
[0]	train-mae:13.57349+0.07711	test-mae:13.57489+0.30751


Parameters: { "silent" } are not used.



[50]	train-mae:10.43057+0.04889	test-mae:10.54222+0.21314
[100]	train-mae:9.64325+0.04605	test-mae:9.85107+0.19755
[150]	train-mae:9.36534+0.04582	test-mae:9.66185+0.20455
[200]	train-mae:9.21196+0.04498	test-mae:9.59720+0.21434
[250]	train-mae:9.09789+0.04652	test-mae:9.56963+0.22422
[300]	train-mae:8.99867+0.04830	test-mae:9.55792+0.23323
[350]	train-mae:8.90827+0.04944	test-mae:9.55156+0.23697
[400]	train-mae:8.82357+0.04986	test-mae:9.54671+0.23953
[450]	train-mae:8.74297+0.05080	test-mae:9.54626+0.24410
[463]	train-mae:8.72281+0.05099	test-mae:9.54701+0.24561
Fold repeater 2
[0]	train-mae:13.57356+0.05842	test-mae:13.57545+0.23750


Parameters: { "silent" } are not used.



[50]	train-mae:10.43247+0.03513	test-mae:10.54841+0.11721
[100]	train-mae:9.64531+0.04335	test-mae:9.85304+0.15427
[150]	train-mae:9.36937+0.04709	test-mae:9.66072+0.18218
[200]	train-mae:9.21809+0.05169	test-mae:9.59178+0.19506
[250]	train-mae:9.10385+0.05455	test-mae:9.56168+0.20133
[300]	train-mae:9.00556+0.05707	test-mae:9.54579+0.20641
[350]	train-mae:8.91411+0.05727	test-mae:9.53801+0.21287
[400]	train-mae:8.82732+0.05856	test-mae:9.53252+0.21739
[450]	train-mae:8.74599+0.05897	test-mae:9.52732+0.21955
[487]	train-mae:8.68844+0.05876	test-mae:9.52702+0.22316
Fold repeater 3
[0]	train-mae:13.57445+0.09770	test-mae:13.57697+0.40207


Parameters: { "silent" } are not used.

Parameters: { "silent" } are not used.



[50]	train-mae:10.42920+0.03850	test-mae:10.54556+0.15714
[100]	train-mae:9.64290+0.05230	test-mae:9.86393+0.17479
[150]	train-mae:9.36838+0.05540	test-mae:9.67203+0.19945
[200]	train-mae:9.21463+0.05343	test-mae:9.60715+0.20628
[250]	train-mae:9.09843+0.05233	test-mae:9.57867+0.21186
[300]	train-mae:8.99903+0.05238	test-mae:9.56887+0.21504
[350]	train-mae:8.90817+0.05251	test-mae:9.56288+0.21827
[400]	train-mae:8.82269+0.05095	test-mae:9.56083+0.21980
[414]	train-mae:8.79960+0.05125	test-mae:9.56136+0.22010
Fold repeater 4
[0]	train-mae:13.57413+0.06837	test-mae:13.57665+0.27629


Parameters: { "silent" } are not used.



[50]	train-mae:10.43190+0.03354	test-mae:10.54538+0.20219
[100]	train-mae:9.64521+0.01880	test-mae:9.85627+0.10953
[150]	train-mae:9.36771+0.01560	test-mae:9.66745+0.07872
[200]	train-mae:9.21219+0.01413	test-mae:9.60169+0.06594
[250]	train-mae:9.09776+0.01569	test-mae:9.57234+0.05792
[300]	train-mae:8.99912+0.01751	test-mae:9.55628+0.05818
[350]	train-mae:8.90867+0.02002	test-mae:9.54847+0.05979
[400]	train-mae:8.82322+0.02196	test-mae:9.54769+0.06379
[450]	train-mae:8.74162+0.02455	test-mae:9.54580+0.06740
[464]	train-mae:8.71881+0.02496	test-mae:9.54549+0.06923
Fold repeater 5
[0]	train-mae:13.57366+0.12302	test-mae:13.57560+0.50433


Parameters: { "silent" } are not used.



[50]	train-mae:10.43266+0.04571	test-mae:10.54274+0.28498
[100]	train-mae:9.64563+0.03962	test-mae:9.85019+0.19503
[150]	train-mae:9.37022+0.04094	test-mae:9.66020+0.17865
[200]	train-mae:9.21845+0.04146	test-mae:9.59269+0.18513
[250]	train-mae:9.10293+0.04169	test-mae:9.56459+0.19386
[300]	train-mae:9.00372+0.04054	test-mae:9.54924+0.20040
[350]	train-mae:8.91422+0.04133	test-mae:9.53956+0.20568
[400]	train-mae:8.82790+0.04241	test-mae:9.53541+0.20992
[450]	train-mae:8.74809+0.04200	test-mae:9.52825+0.21298
[500]	train-mae:8.67143+0.04090	test-mae:9.52471+0.21570
[550]	train-mae:8.59795+0.04021	test-mae:9.52087+0.22088
[600]	train-mae:8.52709+0.03963	test-mae:9.51955+0.22300
[650]	train-mae:8.45800+0.03846	test-mae:9.51938+0.22652
[662]	train-mae:8.44196+0.03828	test-mae:9.52072+0.22691
Fold repeater 6
[0]	train-mae:13.57337+0.07311	test-mae:13.57536+0.29721


Parameters: { "silent" } are not used.



[50]	train-mae:10.43330+0.02910	test-mae:10.53933+0.18200
[100]	train-mae:9.64928+0.02781	test-mae:9.85530+0.13041
[150]	train-mae:9.37108+0.03002	test-mae:9.66817+0.12373
[200]	train-mae:9.21664+0.03229	test-mae:9.60967+0.12502
[250]	train-mae:9.10109+0.03285	test-mae:9.58662+0.12798
[300]	train-mae:8.99947+0.03174	test-mae:9.57378+0.13172
[350]	train-mae:8.90792+0.03124	test-mae:9.56811+0.13549
[400]	train-mae:8.82257+0.03217	test-mae:9.56345+0.13901
[450]	train-mae:8.74192+0.03258	test-mae:9.55996+0.13912
[474]	train-mae:8.70374+0.03368	test-mae:9.56260+0.13894
Fold repeater 7
[0]	train-mae:13.57349+0.07115	test-mae:13.57488+0.29093


Parameters: { "silent" } are not used.



[50]	train-mae:10.43214+0.05813	test-mae:10.54495+0.26795
[100]	train-mae:9.64390+0.06078	test-mae:9.86023+0.27673
[150]	train-mae:9.36361+0.06178	test-mae:9.67830+0.27257
[200]	train-mae:9.21017+0.06484	test-mae:9.61453+0.27245
[250]	train-mae:9.09555+0.06528	test-mae:9.58898+0.26827
[300]	train-mae:8.99581+0.06421	test-mae:9.57788+0.26494
[350]	train-mae:8.90347+0.06420	test-mae:9.57085+0.26036
[395]	train-mae:8.82594+0.06320	test-mae:9.57010+0.25804
Fold repeater 8
[0]	train-mae:13.57350+0.10118	test-mae:13.57477+0.41032


Parameters: { "silent" } are not used.



[50]	train-mae:10.43286+0.08195	test-mae:10.53502+0.37788
[100]	train-mae:9.64626+0.08270	test-mae:9.83942+0.36618
[150]	train-mae:9.36675+0.08623	test-mae:9.66146+0.35719
[200]	train-mae:9.21237+0.08850	test-mae:9.60237+0.35247
[250]	train-mae:9.09523+0.08888	test-mae:9.58210+0.35190
[300]	train-mae:8.99499+0.09012	test-mae:9.57561+0.35039
[350]	train-mae:8.90294+0.09060	test-mae:9.56940+0.35017
[394]	train-mae:8.82633+0.09008	test-mae:9.56799+0.34749
Fold repeater 9
[0]	train-mae:13.57363+0.06382	test-mae:13.57535+0.25157


Parameters: { "silent" } are not used.



[50]	train-mae:10.43526+0.05041	test-mae:10.53367+0.21643
[100]	train-mae:9.64603+0.04569	test-mae:9.84231+0.19757
[150]	train-mae:9.36983+0.04319	test-mae:9.65425+0.19262
[200]	train-mae:9.21685+0.04097	test-mae:9.58753+0.18207
[250]	train-mae:9.10292+0.03981	test-mae:9.56074+0.17764
[300]	train-mae:9.00315+0.04097	test-mae:9.55257+0.17605
[350]	train-mae:8.91102+0.03973	test-mae:9.54727+0.17893
[400]	train-mae:8.82564+0.03955	test-mae:9.54642+0.17970
[408]	train-mae:8.81210+0.03935	test-mae:9.54673+0.17999


In [25]:
iteration_counts = [np.argmin(x['test-mae-mean'].values) for x in xgb_cv]
val_mae = [np.min(x['test-mae-mean'].values) for x in xgb_cv]
iteration_counts, val_mae

([457, 438, 462, 389, 440, 637, 450, 371, 370, 384],
 [9.5352501637851,
  9.545600728284343,
  9.525169485470645,
  9.56030076817667,
  9.544954531453138,
  9.518652758123725,
  9.559955355203034,
  9.56921682526018,
  9.567636328985348,
  9.544987312163048])

In [26]:
oof_preds = []
for i in range(repeat_cv):
    print(f"Fold repeater {i}")
    preds = y.copy()
    kfold = KFold(n_splits = 5, shuffle = True, random_state = i)    
    for train_index, val_index in kfold.split(X,y):
        dtrain_i = xgb.DMatrix(X[train_index], label = y[train_index])
        dval_i = xgb.DMatrix(X[val_index], label = y[val_index])  
        model = xgb.train(
              params = param,
              dtrain = dtrain_i,
              num_boost_round = iteration_counts[i],
              verbose_eval = 50
        )
        preds[val_index] = model.predict(dval_i)
    oof_preds.append(np.clip(preds,-30,30))

Fold repeater 0


Parameters: { "silent" } are not used.

  1.07806597e+01 -7.12072432e-01  1.04159784e+01 -8.71903801e+00
 -3.52902603e+00 -1.35922283e-01 -3.50643849e+00 -1.36244106e+00
  5.67299795e+00  2.85753822e+00  1.82981796e+01  3.93462396e+00
  2.69727492e+00  4.37359810e-01 -1.39862609e+00  7.94504976e+00
  2.03151798e+01 -7.47154474e-01  3.79337049e+00  3.03327389e+01
  9.20329475e+00 -1.76556289e+00  1.78650742e+01  6.79344416e+00
  1.11748466e+01  3.95318246e+00  1.32144547e+01 -6.64903045e-01
  4.90721750e+00  3.46282077e+00 -3.30636883e+00  2.22962093e+01
 -6.86720133e+00  1.54064922e+01  3.07419491e+00 -4.08535671e+00
  9.73347282e+00  1.58892908e+01  4.36276245e+00  2.10145493e+01
  4.58607864e+00  6.29394436e+00 -2.45464754e+00  4.21408796e+00
  2.74376297e+00  3.99851227e+00 -1.26579361e+01  4.27405739e+00
 -5.43402694e-02 -8.47968996e-01  1.20202341e+01  1.26868021e+00
  4.95606613e+00  1.27517033e+01  3.96908832e+00  1.07691231e+01
  8.35287452e-01 -2.19753385e+00  1.61576807e+00  

Fold repeater 1


Parameters: { "silent" } are not used.

 -2.13062692e+00  1.66388836e+01  2.18063984e+01  1.39392653e+01
  1.39623060e+01  2.80292249e+00  8.76191711e+00 -2.00363731e+00
 -3.87179613e+00 -6.25944185e+00 -3.32603025e+00  3.19266915e+00
  1.29153633e+01  1.98962002e+01  1.10727000e+00  1.34898543e+00
  6.84674978e+00  7.15912008e+00 -7.79842186e+00 -3.27516198e-01
  6.34845829e+00  4.92432165e+00 -6.65064096e-01  2.80116725e+00
 -1.95700958e-01 -3.44499826e+00  4.83765697e+00 -3.69480324e+00
  1.31893361e+00  2.40197525e+01  1.92824421e+01  1.54287605e+01
  4.75604439e+00  3.51030684e+00  5.34647417e+00 -1.66120899e+00
  9.47359180e+00  6.08157158e+00 -2.09517419e-01  1.19479113e+01
  2.53726816e+00  1.07162209e+01  1.50796480e+01  1.78194904e+01
 -1.12381687e+01  2.19110603e+01 -1.24304304e+01  1.95765324e+01
  4.55728769e+00  7.78823900e+00  1.40652701e-01 -3.36281013e+00
  2.53949809e+00  2.00369625e+01  1.46538389e+00  7.79570055e+00
  6.17002869e+00  6.03714561e+00  6.99928999e+00  

Fold repeater 2


Parameters: { "silent" } are not used.

 -9.35817146e+00  1.22641873e+00 -3.12094808e+00 -1.85137892e+00
  8.43353844e+00 -4.81216955e+00  4.81112719e-01  3.45534539e+00
 -3.23164725e+00 -1.43369508e+00  2.25290661e+01  7.89405918e+00
 -2.12593436e+00  1.79080510e+00  1.32346926e+01  6.84356642e+00
  4.99663115e+00  1.71394405e+01 -8.80106163e+00  3.77943420e+00
  3.85998774e+00 -7.97571611e+00 -1.76547360e+00  3.63277364e+00
 -7.12677240e-01  7.42793083e+00 -4.83371794e-01  3.63891149e+00
  1.29155416e+01  6.50680399e+00  3.02547836e+00  2.46425209e+01
 -1.72997034e+00  3.30518770e+00  2.67451916e+01  2.61925812e+01
  7.27298880e+00  6.71811914e+00  5.76284456e+00  8.30408669e+00
 -2.41843677e+00 -2.73544645e+00  1.01883574e+01  1.10150175e+01
 -5.02218485e+00  3.44609070e+00 -5.11910617e-01  1.19049301e+01
  3.24149966e+00  8.34561253e+00 -1.73279238e+00 -4.05375576e+00
  1.03170576e+01  8.55071545e+00  3.45441484e+00  2.95202851e+00
 -1.69418120e+00  1.70650196e+01  6.47965074e-01 -

Fold repeater 3


Parameters: { "silent" } are not used.

  2.32350979e+01  1.28425875e+01  5.30038404e+00  2.05700135e+00
  1.47523670e+01  1.87740765e+01  1.02702484e+01 -3.74244452e+00
 -7.58698654e+00  1.63306713e+00 -2.17132831e+00  9.30328751e+00
 -1.52692959e-01  2.20303268e+01  2.57143283e+00 -1.07457483e+00
 -4.30542135e+00  1.64456344e+00 -1.11763120e+00 -5.10026991e-01
  1.88023949e+01  4.12963343e+00  1.32170448e+01  4.08359003e+00
  3.47355223e+00  1.19039841e+01 -1.81137431e+00  2.99403739e+00
 -9.65167642e-01  2.20074635e+01  1.04883471e+01  1.13663702e+01
 -5.81902885e+00  1.65054359e+01  1.91062031e+01 -2.06032467e+00
  4.11899853e+00  2.95713544e+00 -3.91340351e+00 -4.28671885e+00
  8.96232605e+00 -1.44869685e+00  6.23525143e+00  4.75208378e+00
 -3.03631949e+00 -3.55312347e-01  5.09722424e+00 -1.17384994e+00
  2.48699970e+01  1.62780590e+01  6.41186428e+00  6.42676067e+00
  4.74243402e+00  1.53218746e+00  2.14933157e+00  2.87349739e+01
  5.87240934e+00  4.32942200e+00  2.32684708e+01  

Fold repeater 4


Parameters: { "silent" } are not used.

  2.04884243e+01  2.43919640e+01 -4.90714455e+00 -8.89120674e+00
 -1.70670724e+00  5.47604752e+00  7.06006706e-01  2.75661802e+00
 -2.19738650e+00  4.49120045e+00  2.08727932e+01  7.13031387e+00
 -4.17186499e-01  4.55452490e+00  6.85513783e+00 -1.76301646e+00
  3.40391183e+00  7.69273758e-01 -5.32747567e-01  1.91472566e+00
  2.50307026e+01 -9.14095819e-01  1.52541256e+01  1.96872559e+01
 -8.22027111e+00  3.06993675e+00  1.13372002e+01 -4.75173473e+00
 -1.84849048e+00  5.94352961e+00 -2.80823398e+00  9.15512753e+00
  1.64336224e+01 -4.25206375e+00  2.33705826e+01 -4.86989307e+00
  4.41451788e+00  4.69700050e+00  6.89690781e+00  7.56325769e+00
 -8.16645050e+00 -3.48743320e+00  4.94744396e+00 -2.71348524e+00
  3.16535544e+00  6.71141720e+00  4.49684286e+00  2.68305850e+00
  9.37221622e+00  6.56514597e+00  4.35171127e+00 -2.67515093e-01
  1.97938967e+00  6.11675644e+00  1.34120002e-01  4.11896181e+00
  5.64681435e+00  4.79586458e+00 -1.43835616e+00  

Fold repeater 5


Parameters: { "silent" } are not used.

  1.42920074e+01  4.81978893e+00  5.76848984e+00  1.08068829e+01
 -2.66889811e+00  2.15746975e+00  2.35844440e+01  4.00775290e+00
  7.99433231e+00  5.22871876e+00  2.58764434e+00 -7.81108618e+00
  6.42498446e+00 -5.64493716e-01  3.27436852e+00 -1.17987049e+00
  2.70952492e+01  1.66275845e+01  9.73274708e+00 -9.43299830e-02
  1.32574332e+00  4.89414310e+00 -2.54831100e+00 -1.97410369e+00
 -5.60212469e+00  6.39354944e+00  1.36074457e+01 -1.36824059e+00
 -5.59573650e-01 -5.93455458e+00  3.95326662e+00 -4.07758802e-01
  8.86655331e+00  2.18365211e+01  2.09654689e+00 -1.23830569e+00
  1.03292084e+01  8.46076775e+00  1.69796028e+01  6.64499378e+00
  2.00009747e+01  1.34588747e+01  1.83712692e+01 -1.54415429e+00
  3.41003346e+00 -1.85849142e+00  2.37354827e+00  2.30859547e+01
  5.77187443e+00  6.19042778e+00  1.24788752e+01  1.37221451e+01
  2.73835516e+00  3.22224350e+01  1.53754034e+01  1.17307625e+01
  9.53122807e+00  6.15423918e+00  2.34778938e+01  

Fold repeater 6


Parameters: { "silent" } are not used.

  8.25514257e-01  2.25519028e+01 -1.26477265e+00  3.20346308e+00
  8.05271626e+00  4.68823862e+00  4.84047079e+00  3.17868620e-01
  3.89124417e+00 -4.89002180e+00  2.59989053e-02  9.86184311e+00
  2.50293636e+00  1.52259989e+01  2.25196934e+00 -1.29224733e-03
 -8.17925930e+00 -8.30556297e+00 -1.14479005e+00  3.97464538e+00
  3.29658580e+00 -4.27267551e+00  2.77136064e+00  1.84357777e+01
 -5.41420341e-01 -2.02691936e+00  2.76174545e+01  5.44149685e+00
  2.55676579e+00 -5.76544189e+00 -1.20014179e+00 -1.11073625e+00
 -6.70550525e-01  6.81703269e-01 -1.99926949e+00  8.01768780e+00
  1.42717659e+00  1.46027641e+01 -5.12311077e+00  2.55496483e+01
  1.48976860e+01  3.56758428e+00  1.94236450e+01  5.71879768e+00
 -3.81454062e+00  2.04259062e+00 -4.61533308e-01 -3.19253534e-01
  9.66539443e-01  1.26108837e+01  2.29725990e+01 -3.82246971e+00
  7.54731941e+00  5.06320286e+00 -2.06685138e+00  5.10910797e+00
  1.12775478e+01 -2.50892544e+00  5.64723682e+00  

Fold repeater 7


Parameters: { "silent" } are not used.

  1.54440765e+01 -1.23407173e+00 -2.02877712e+00  8.86695385e+00
  5.19505405e+00  5.85094976e+00 -1.79809856e+00 -3.84784436e+00
  2.37699814e+01  2.81739259e+00  4.14771986e+00  7.73533535e+00
  7.84413290e+00  3.99500728e+00 -1.62867737e+00  2.22486091e+00
  3.85601044e+00  8.02293968e+00  3.81889081e+00  2.50315690e+00
  1.23838024e+01  1.42872095e+01  3.66735792e+00 -1.57022295e+01
  2.13006172e+01 -8.76636982e-01  1.93752079e+01  3.38640976e+00
  6.75912762e+00  1.33261538e+01  5.00713396e+00  1.07497206e+01
 -2.71689725e+00  1.17771463e+01  8.41227150e+00  7.85932875e+00
  2.18308296e+01 -1.06315126e+01  2.33510723e+01 -1.70159066e+00
 -1.30700045e+01  8.20723248e+00 -1.65327835e+00  6.59180880e+00
  5.37164974e+00 -3.73154432e-01  1.57559433e+01  8.69999599e+00
  1.19802260e+00  4.39477396e+00 -1.68342090e+00  8.72503471e+00
  5.72768748e-01  4.32893801e+00  2.52381730e+00  5.52279902e+00
  3.31324744e+00  2.60461655e+01  8.96126556e+00 -

Fold repeater 8


Parameters: { "silent" } are not used.

  1.85897236e+01  2.43578568e+01 -1.05565310e+00 -2.20317936e+00
  7.53826761e+00  5.33613682e+00  7.50138569e+00  4.63872862e+00
 -4.93868923e+00 -3.75713682e+00 -2.97220796e-01 -3.98798251e+00
  7.25423157e-01  1.35331011e+01  6.46521759e+00  2.28924580e+01
 -6.03503656e+00  4.72919941e+00  3.46565795e+00  4.73070717e+00
  2.75950265e+00  3.70985419e-01 -2.17707539e+00 -4.83158541e+00
  1.47060070e+01  5.79361820e+00  2.68255424e+01 -8.52707863e+00
  1.12983112e+01 -7.04530764e+00 -4.69195843e+00  4.39428329e+00
  3.65197039e+00  5.19252491e+00  8.33417511e+00 -1.07167535e-01
  9.92134857e+00 -6.90956736e+00  2.25688362e+01  2.20430889e+01
  6.19786406e+00 -8.77933121e+00 -2.76169944e+00  5.89109659e+00
  5.04378557e+00  3.53773046e+00 -1.72378385e+00 -1.92201972e+00
  3.19964743e+00  2.94631271e+01  2.06996498e+01  1.30368423e+01
  2.58920360e+00  4.10623598e+00 -2.20712471e+00 -1.56535876e+00
  1.30523300e+00  1.70402551e+00  3.21455765e+00  

Fold repeater 9


Parameters: { "silent" } are not used.

  2.23493919e+01  1.40105021e+00 -3.26974988e+00 -3.71026611e+00
  5.40158892e+00  4.89811182e+00  4.04387093e+00  2.46105499e+01
  2.33669682e+01  8.22405434e+00 -2.34824038e+00  2.59948044e+01
  3.49594045e+00  1.84731312e+01  5.82925606e+00  5.63449383e+00
 -1.32872534e+00  4.74322701e+00  3.08122396e+00  4.26985550e+00
  4.49911356e+00  2.01676536e+00  1.23375835e+01 -2.35337782e+00
  3.71199679e+00  3.42932582e+00 -1.39722252e+00  6.31314850e+00
 -7.53959894e-01 -1.51754630e+00  5.24432993e+00 -2.86455250e+00
  9.80099106e+00  8.09895039e+00  1.29346838e+01 -1.81302190e+00
  4.72832537e+00  9.00786877e+00  8.47000313e+00  2.28028679e+01
  1.21912670e+01 -3.94385529e+00  1.59724307e+00 -1.55134782e-01
  2.14546833e+01  4.11395407e+00  9.02755451e+00  8.44392490e+00
  1.68354149e+01  3.68163824e+00  4.09857178e+00  1.15637627e+01
  1.77784462e+01  1.83432853e+00  4.60981178e+00  1.76671505e+01
 -2.01168609e+00  4.35535526e+00 -1.42270756e+00 -

In [27]:
spline_model = []

for i in range(repeat_cv):
    dat = list(zip(oof_preds[i],np.where(y>0,1,0)))
    dat = sorted(dat, key = lambda x: x[0])
    datdict = {}
    for k in range(len(dat)):
        datdict[dat[k][0]]= dat[k][1]
        
    spline_model.append(UnivariateSpline(list(datdict.keys()), list(datdict.values())))
    spline_fit = spline_model[i](oof_preds[i])
    
    print(f"logloss of cvsplit {i}: {log_loss(np.where(y>0,1,0),spline_fit)}") 

logloss of cvsplit 0: 0.5125175738238397
logloss of cvsplit 1: 0.5138533626296884
logloss of cvsplit 2: 0.5125918250887339
logloss of cvsplit 3: 0.5145289029820715
logloss of cvsplit 4: 0.5118717031213915
logloss of cvsplit 5: 0.5138111768047946
logloss of cvsplit 6: 0.5140527935535993
logloss of cvsplit 7: 0.5141136492432942
logloss of cvsplit 8: 0.5138317523543188
logloss of cvsplit 9: 0.5136670724341129


In [28]:
spline_model = []

for i in range(repeat_cv):
    dat = list(zip(oof_preds[i],np.where(y>0,1,0)))
    dat = sorted(dat, key = lambda x: x[0])
    datdict = {}
    for k in range(len(dat)):
        datdict[dat[k][0]]= dat[k][1]
    spline_model.append(UnivariateSpline(list(datdict.keys()), list(datdict.values())))
    spline_fit = spline_model[i](oof_preds[i])
    spline_fit = np.clip(spline_fit,0.025,0.975)
    
    print(f"adjusted logloss of cvsplit {i}: {log_loss(np.where(y>0,1,0),spline_fit)}") 

adjusted logloss of cvsplit 0: 0.5110061966645212
adjusted logloss of cvsplit 1: 0.5124764794761477
adjusted logloss of cvsplit 2: 0.510740765242457
adjusted logloss of cvsplit 3: 0.5132593170849638
adjusted logloss of cvsplit 4: 0.5099307928143542
adjusted logloss of cvsplit 5: 0.5127938453402021
adjusted logloss of cvsplit 6: 0.5133431370796638
adjusted logloss of cvsplit 7: 0.5123192549178317
adjusted logloss of cvsplit 8: 0.5114422121955751
adjusted logloss of cvsplit 9: 0.5120801885567579


In [None]:
spline_model = []

for i in range(repeat_cv):
    dat = list(zip(oof_preds[i],np.where(y>0,1,0)))
    dat = sorted(dat, key = lambda x: x[0])
    datdict = {}
    for k in range(len(dat)):
        datdict[dat[k][0]]= dat[k][1]
    spline_model.append(UnivariateSpline(list(datdict.keys()), list(datdict.values())))
    spline_fit = spline_model[i](oof_preds[i])
    spline_fit = np.clip(spline_fit,0.025,0.975)
    spline_fit[(tourney_data.T1_seed==1) & (tourney_data.T2_seed==16)] = 1.0
    spline_fit[(tourney_data.T1_seed==2) & (tourney_data.T2_seed==15)] = 1.0
    spline_fit[(tourney_data.T1_seed==3) & (tourney_data.T2_seed==14)] = 1.0
    spline_fit[(tourney_data.T1_seed==4) & (tourney_data.T2_seed==13)] = 1.0
    spline_fit[(tourney_data.T1_seed==16) & (tourney_data.T2_seed==1)] = 0.0
    spline_fit[(tourney_data.T1_seed==15) & (tourney_data.T2_seed==2)] = 0.0
    spline_fit[(tourney_data.T1_seed==14) & (tourney_data.T2_seed==3)] = 0.0
    spline_fit[(tourney_data.T1_seed==13) & (tourney_data.T2_seed==4)] = 0.0
    
    print(f"adjusted logloss of cvsplit {i}: {log_loss(np.where(y>0,1,0),spline_fit)}") 

In [None]:
val_cv = []
spline_model = []

for i in range(repeat_cv):
    dat = list(zip(oof_preds[i],np.where(y>0,1,0)))
    dat = sorted(dat, key = lambda x: x[0])
    datdict = {}
    for k in range(len(dat)):
        datdict[dat[k][0]]= dat[k][1]
    spline_model.append(UnivariateSpline(list(datdict.keys()), list(datdict.values())))
    spline_fit = spline_model[i](oof_preds[i])
    spline_fit = np.clip(spline_fit,0.025,0.975)
    # spline_fit[(tourney_data.T1_seed==1) & (tourney_data.T2_seed==16) & (tourney_data.T1_Score > tourney_data.T2_Score)] = 1.0
    # spline_fit[(tourney_data.T1_seed==2) & (tourney_data.T2_seed==15) & (tourney_data.T1_Score > tourney_data.T2_Score)] = 1.0
    # spline_fit[(tourney_data.T1_seed==3) & (tourney_data.T2_seed==14) & (tourney_data.T1_Score > tourney_data.T2_Score)] = 1.0
    # spline_fit[(tourney_data.T1_seed==4) & (tourney_data.T2_seed==13) & (tourney_data.T1_Score > tourney_data.T2_Score)] = 1.0
    # spline_fit[(tourney_data.T1_seed==16) & (tourney_data.T2_seed==1) & (tourney_data.T1_Score < tourney_data.T2_Score)] = 0.0
    # spline_fit[(tourney_data.T1_seed==15) & (tourney_data.T2_seed==2) & (tourney_data.T1_Score < tourney_data.T2_Score)] = 0.0
    # spline_fit[(tourney_data.T1_seed==14) & (tourney_data.T2_seed==3) & (tourney_data.T1_Score < tourney_data.T2_Score)] = 0.0
    # spline_fit[(tourney_data.T1_seed==13) & (tourney_data.T2_seed==4) & (tourney_data.T1_Score < tourney_data.T2_Score)] = 0.0
    
    val_cv.append(pd.DataFrame({"y":np.where(y>0,1,0), "pred":spline_fit, "season":tourney_data.Season}))
    print(f"adjusted logloss of cvsplit {i}: {log_loss(np.where(y>0,1,0),spline_fit)}") 
    
val_cv = pd.concat(val_cv)
val_cv.groupby('season').apply(lambda x: log_loss(x.y, x.pred))

In [29]:
val_cv = []
spline_model = []

for i in range(repeat_cv):
    dat = list(zip(oof_preds[i],np.where(y>0,1,0)))
    dat = sorted(dat, key = lambda x: x[0])
    datdict = {}
    for k in range(len(dat)):
        datdict[dat[k][0]]= dat[k][1]
    spline_model.append(UnivariateSpline(list(datdict.keys()), list(datdict.values())))
    spline_fit = spline_model[i](oof_preds[i])
    spline_fit = np.clip(spline_fit,0.025,0.975)
    
    val_cv.append(pd.DataFrame({"y":np.where(y>0,1,0), "pred":spline_fit, "season":tourney_data.Season}))
    print(f"adjusted logloss of cvsplit {i}: {log_loss(np.where(y>0,1,0),spline_fit)}") 
    
val_cv = pd.concat(val_cv)
val_cv.groupby('season').apply(lambda x: log_loss(x.y, x.pred))

adjusted logloss of cvsplit 0: 0.5110061966645212
adjusted logloss of cvsplit 1: 0.5124764794761477
adjusted logloss of cvsplit 2: 0.510740765242457
adjusted logloss of cvsplit 3: 0.5132593170849638
adjusted logloss of cvsplit 4: 0.5099307928143542
adjusted logloss of cvsplit 5: 0.5127938453402021
adjusted logloss of cvsplit 6: 0.5133431370796638
adjusted logloss of cvsplit 7: 0.5123192549178317
adjusted logloss of cvsplit 8: 0.5114422121955751
adjusted logloss of cvsplit 9: 0.5120801885567579


  val_cv.groupby('season').apply(lambda x: log_loss(x.y, x.pred))


season
2003    0.535308
2004    0.513371
2005    0.512771
2006    0.573797
2007    0.451186
2008    0.480414
2009    0.484470
2010    0.492734
2011    0.506600
2012    0.508857
2013    0.528881
2014    0.507851
2015    0.449796
2016    0.557843
2017    0.493377
2018    0.554450
2019    0.454967
2021    0.541969
2022    0.559915
2023    0.541057
2024    0.483038
dtype: float64

In [30]:
sub = pd.read_csv(DATA_PATH + "SampleSubmissionStage2.csv")
sub['Season'] = sub['ID'].apply(lambda x: int(x.split('_')[0]))
sub["T1_TeamID"] = sub['ID'].apply(lambda x: int(x.split('_')[1]))
sub["T2_TeamID"] = sub['ID'].apply(lambda x: int(x.split('_')[2]))
sub.head()

Unnamed: 0,ID,Pred,Season,T1_TeamID,T2_TeamID
0,2025_1101_1102,0.5,2025,1101,1102
1,2025_1101_1103,0.5,2025,1101,1103
2,2025_1101_1104,0.5,2025,1101,1104
3,2025_1101_1105,0.5,2025,1101,1105
4,2025_1101_1106,0.5,2025,1101,1106


In [31]:
sub = pd.merge(sub, season_statistics_T1, on = ['Season', 'T1_TeamID'], how = 'left')
sub = pd.merge(sub, season_statistics_T2, on = ['Season', 'T2_TeamID'], how = 'left')

sub = pd.merge(sub, glm_quality_T1, on = ['Season', 'T1_TeamID'], how = 'left')

sub = pd.merge(sub, glm_quality_T2, on = ['Season', 'T2_TeamID'], how = 'left')

sub = pd.merge(sub, seeds_T1, on = ['Season', 'T1_TeamID'], how = 'left')
sub = pd.merge(sub, seeds_T2, on = ['Season', 'T2_TeamID'], how = 'left')
sub = pd.merge(sub, last14days_stats_T1, on = ['Season', 'T1_TeamID'], how = 'left')
sub = pd.merge(sub, last14days_stats_T2, on = ['Season', 'T2_TeamID'], how = 'left')

sub["Seed_diff"] = sub["T1_seed"] - sub["T2_seed"]

sub.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,ID,Pred,Season,T1_TeamID,T2_TeamID,T1_FGMmean,T1_FGAmean,T1_FGM3mean,T1_FGA3mean,T1_ORmean,T1_Astmean,T1_TOmean,T1_Stlmean,T1_PFmean,T1_opponent_FGMmean,T1_opponent_FGAmean,T1_opponent_FGM3mean,T1_opponent_FGA3mean,T1_opponent_ORmean,T1_opponent_Astmean,T1_opponent_TOmean,T1_opponent_Stlmean,T1_opponent_Blkmean,T1_PointDiffmean,T2_FGMmean,T2_FGAmean,T2_FGM3mean,T2_FGA3mean,T2_ORmean,T2_Astmean,T2_TOmean,T2_Stlmean,T2_PFmean,T2_opponent_FGMmean,T2_opponent_FGAmean,T2_opponent_FGM3mean,T2_opponent_FGA3mean,T2_opponent_ORmean,T2_opponent_Astmean,T2_opponent_TOmean,T2_opponent_Stlmean,T2_opponent_Blkmean,T2_PointDiffmean,T1_quality,T2_quality,T1_seed,T2_seed,T1_win_ratio_14d,T2_win_ratio_14d,Seed_diff
0,2025_1101_1102,0.5,2025,1101,1102,24.206897,56.241379,4.103448,14.206897,8.655172,12.482759,14.206897,10.034483,21.448276,23.655172,51.310345,5.448276,16.689655,8.758621,12.206897,16.034483,7.896552,4.689655,-3.448276,21.25,50.5,8.09375,24.46875,5.9375,13.375,12.1875,5.65625,17.84375,26.40625,54.9375,7.21875,19.90625,7.9375,12.84375,9.6875,6.8125,3.21875,-11.71875,,,,,0.333333,0.0,
1,2025_1101_1103,0.5,2025,1101,1103,24.206897,56.241379,4.103448,14.206897,8.655172,12.482759,14.206897,10.034483,21.448276,23.655172,51.310345,5.448276,16.689655,8.758621,12.206897,16.034483,7.896552,4.689655,-3.448276,30.21875,64.125,10.65625,29.09375,9.96875,17.6875,11.75,7.5625,18.28125,26.6875,61.375,7.4375,23.34375,8.84375,11.9375,12.0,7.1875,2.4375,8.0625,,,,13.0,0.333333,1.0,
2,2025_1101_1104,0.5,2025,1101,1104,24.206897,56.241379,4.103448,14.206897,8.655172,12.482759,14.206897,10.034483,21.448276,23.655172,51.310345,5.448276,16.689655,8.758621,12.206897,16.034483,7.896552,4.689655,-3.448276,31.090909,64.424242,10.424242,29.787879,10.969697,16.787879,12.121212,6.060606,19.030303,28.909091,68.090909,7.363636,23.909091,10.333333,12.666667,10.060606,7.818182,4.242424,9.69697,,,,2.0,0.333333,0.5,
3,2025_1101_1105,0.5,2025,1101,1105,24.206897,56.241379,4.103448,14.206897,8.655172,12.482759,14.206897,10.034483,21.448276,23.655172,51.310345,5.448276,16.689655,8.758621,12.206897,16.034483,7.896552,4.689655,-3.448276,23.137931,59.793103,7.655172,24.862069,11.344828,12.344828,15.103448,8.0,21.310345,26.551724,56.517241,8.275862,21.206897,9.724138,15.344828,14.137931,9.068966,3.034483,-10.586207,,,,,0.333333,0.0,
4,2025_1101_1106,0.5,2025,1101,1106,24.206897,56.241379,4.103448,14.206897,8.655172,12.482759,14.206897,10.034483,21.448276,23.655172,51.310345,5.448276,16.689655,8.758621,12.206897,16.034483,7.896552,4.689655,-3.448276,25.121212,62.787879,8.848485,26.909091,9.333333,11.393939,8.727273,7.636364,18.212121,24.818182,56.757576,7.484848,22.393939,9.151515,12.484848,12.060606,5.242424,3.272727,-0.30303,,,,16.0,0.333333,1.0,


In [32]:
Xsub = sub[features].values
dtest = xgb.DMatrix(Xsub)

In [33]:
sub_models = []
for i in range(repeat_cv):
    print(f"Fold repeater {i}")
    sub_models.append(
        xgb.train(
          params = param,
          dtrain = dtrain,
          num_boost_round = int(iteration_counts[i] * 1.05),
          verbose_eval = 50
        )
    )

Fold repeater 0


Parameters: { "silent" } are not used.



Fold repeater 1


Parameters: { "silent" } are not used.



Fold repeater 2


Parameters: { "silent" } are not used.



Fold repeater 3


Parameters: { "silent" } are not used.



Fold repeater 4


Parameters: { "silent" } are not used.



Fold repeater 5


Parameters: { "silent" } are not used.



Fold repeater 6


Parameters: { "silent" } are not used.



Fold repeater 7


Parameters: { "silent" } are not used.



Fold repeater 8


Parameters: { "silent" } are not used.



Fold repeater 9


Parameters: { "silent" } are not used.



In [34]:
sub_preds = []
for i in range(repeat_cv):
    sub_preds.append(np.clip(spline_model[i](np.clip(sub_models[i].predict(dtest),-30,30)),0.025,0.975))
    
sub["Pred"] = pd.DataFrame(sub_preds).mean(axis=0)
sub[['ID','Pred']].to_csv("submission.csv", index = None)

In [35]:
subb = pd.read_csv("/kaggle/working/submission.csv")
subb.tail()

Unnamed: 0,ID,Pred
131402,2025_3477_3479,0.176091
131403,2025_3477_3480,0.088492
131404,2025_3478_3479,0.155267
131405,2025_3478_3480,0.070716
131406,2025_3479_3480,0.047002
