In [37]:
import feather
import os
import re
import pickle
import time
import datetime
import random

import numpy as np
import pandas as pd

from numba import jit

from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import matthews_corrcoef

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.sparse import csr_matrix, hstack, vstack

from ml_toolbox.xgboostmonitor_utils import *
import ml_toolbox.xgboostmonitor_utils as xgbm

%matplotlib inline

import xgboost as xgb
import subprocess

# Custom modules
import const
import func

In [2]:
# Based on: https://www.kaggle.com/c/caterpillar-tube-pricing/forums/t/15748/strategies-to-encode-categorical-variables-with-many-categories/88207

## Load data

In [239]:
# Show which train data sets are configured in the custom `const` module.
# Function-call form of print gives the same output on Python 2 and also runs on Python 3.
print(const.TRAIN_FILES)

['train_numeric', 'train_categorical_to_num', 'train_date']


In [3]:
# Load look-up table
lut = pd.read_csv(const.LOOK_UP_TABLE)
lut.head(3)

Unnamed: 0,line,station,feature_nr,feat_nr_dat,name_dat,name_cat,name_num,col_dat,col_num,col_cat,station_V2,line_V2
0,0,0,0,1.0,L0_S0_D1,,L0_S0_F0,0.0,0.0,,0.0,1.0
1,0,0,2,3.0,L0_S0_D3,,L0_S0_F2,1.0,1.0,,0.0,1.0
2,0,0,4,5.0,L0_S0_D5,,L0_S0_F4,2.0,2.0,,0.0,1.0


In [87]:
# Load cluster info (one row per sample; the first CSV column becomes the index)
cluster_info = pd.read_csv(os.path.join(const.DATA_PATH, 'eda_sample_clusters.csv'), index_col=0)
# Function-call form of print: identical output on Python 2, valid on Python 3
print(cluster_info.shape)
cluster_info.head(3)

(2367495, 7)


Unnamed: 0_level_0,unique_path,cluster_n8,cluster_n15,cluster_n25,cluster_n50,cluster_n150,cluster_n500
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4,13409,1,2,3,47,36,369
6,7029,2,10,0,35,63,14
7,12763,1,2,19,3,132,477


In [12]:
# Load timestamps
date_train = func.load_data_file(const.TRAIN_FILES[2])
date_test = func.load_data_file(const.TEST_FILES[2])

Returning <open file '/Volumes/My Book/kaggle_bosch/train_date.pkl', mode 'rb' at 0x1184441e0>.pkl
Returning <open file '/Volumes/My Book/kaggle_bosch/test_date.pkl', mode 'rb' at 0x1184441e0>.pkl


In [15]:
# Stack the train features on top of the test features into one CSR matrix.
# `ids` is concatenated in the same train-then-test order, so presumably row i of
# `date_data` belongs to row i of `ids` -- TODO confirm against func.load_data_file.
date_data = vstack([date_train['data']['features'],date_test['data']['features']], format='csr')
ids = pd.concat([date_train['data']['ids'], date_test['data']['ids']])
# The response is taken from the train container only
y = date_train['data']['y']
# Drop the references to the raw containers; only the combined objects are used below
del date_train, date_test

In [16]:
# Load response
#y = func.read_last_column(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0] + '.csv'))
# Function-call form of print: identical output on Python 2, valid on Python 3
print(y.shape)
y.head(3)

(1183747, 1)


Unnamed: 0_level_0,Response
Id,Unnamed: 1_level_1
4,0
6,0
7,0


In [17]:
# Load IDs of train + test
#ids = pd.concat([func.read_first_column(os.path.join(const.BASE_PATH, const.TRAIN_FILES[0])),
#                 func.read_first_column(os.path.join(const.BASE_PATH, const.TEST_FILES[0]))],
#                axis=0)
# Function-call form of print: identical output on Python 2, valid on Python 3
print(ids.shape)
ids.head(3)

(2367495, 1)


Unnamed: 0,Id
0,4
1,6
2,7


In [427]:
# Add response to cluster info
cluster_info['R'] = y.Response

# Add sample time to cluster info
def max_element_row(X):
    """Return the maximum value of each row of a sparse matrix as a 1-D array.

    Entries that are not stored in ``X`` count as zero, so a row without any
    stored value yields 0. Missing values are assumed to be encoded as zero
    and are returned as zero (no NaN substitution is performed).

    Parameters
    ----------
    X : scipy.sparse matrix (e.g. CSR) of shape (n_rows, n_cols)

    Returns
    -------
    numpy.ndarray of shape (n_rows,)
        Row-wise maxima, in the row order of ``X``.
    """
    # max(axis=1) gives an (n_rows, 1) sparse column; densify it and flatten to 1-D.
    return X.max(axis=1).toarray().ravel()

# Row-wise maximum of the date features, scaled by 5 and truncated to an integer.
# NOTE(review): a NumPy array is assigned positionally, so this assumes the rows of
# `date_data` (train stacked on top of test) are in exactly the same order as the
# rows of `cluster_info` -- TODO confirm, or align explicitly through `ids`.
cluster_info['tmax'] = (max_element_row(date_data)*5).astype(int)

## Calculate features based on cluster 500

In [592]:
# Per-cluster response statistics. Rows without a response have R = NaN and are
# skipped by these aggregations, so 'count' is the number of labelled samples in the
# cluster, 'sum' the number of those with R == 1 and 'mean' their ratio.
cluster_mean = cluster_info.groupby(['cluster_n500'])['R'].agg(['mean','count','sum'])
# Attach the statistics of its cluster to every sample (labelled or not)
cluster_n500 = cluster_info.merge(cluster_mean, left_on='cluster_n500', right_index=True, how='left')

In [605]:
def loo_mean(mean, count, sample_val, train_mean=0.0058):
    """Leave-one-out group mean: the group mean with the sample itself removed.

    For every row the group total (``mean * count``) is reduced by the row's own
    value and divided by the remaining ``count - 1`` samples.

    Parameters
    ----------
    mean : pandas.Series
        Mean of the target within the row's group.
    count : pandas.Series
        Number of samples that went into ``mean``.
    sample_val : pandas.Series
        Target value of the row itself. Rows where this is NaN produce NaN.
    train_mean : float, optional
        Fallback used where ``count <= 1``, i.e. where no other sample is left
        to average over. Defaults to 0.0058, the value hard-coded throughout
        this notebook.

    Returns
    -------
    pandas.Series
        Leave-one-out mean per row.
    """
    output = (mean * count - sample_val) / (count - 1)

    # With at most one sample in the group the division above is x/0 (or negative);
    # fall back to the overall train mean instead.
    output[count <= 1] = train_mean

    return output

def adjust_low_count_mean(count, mean, cutoff=1000, train_mean=0.0058):
    """Shrink the mean of small groups towards the overall train mean.

    The result is the weighted average
    ``(count * mean + (cutoff - count) * train_mean) / cutoff``, so the fewer
    samples a group has, the closer its value is pulled to ``train_mean``.
    Rows with ``count > cutoff - 10 * count * mean`` keep ``mean`` unchanged.

    Note the argument order: ``count`` comes first here, unlike the other
    helpers in this notebook.

    Parameters
    ----------
    count : pandas.Series
        Number of samples behind each mean.
    mean : pandas.Series
        Group mean per row (same index as ``count``).
    cutoff : int, optional
        Pseudo sample size used for the shrinkage. Defaults to 1000.
    train_mean : float, optional
        Value small groups are shrunk towards. Defaults to 0.0058.

    Returns
    -------
    pandas.Series
        Adjusted mean per row.
    """
    output = (count * mean + (cutoff - count) * train_mean) / cutoff

    # Rows with enough evidence are not shrunk; compute the mask once and reuse it.
    keep_raw_mean = count > (cutoff - 10 * count * mean)
    output[keep_raw_mean] = mean[keep_raw_mean]

    return output

def cut_off_loo_mean(r1_count, mean):
    """Hard cut-off variant: fall back to the train mean for weakly supported rows.

    Rows whose ``r1_count`` is below 15 get the constant 0.0058; all other rows
    (including those where ``r1_count`` is NaN) keep their ``mean``. In this
    notebook the callers pass the per-group sum of ``R`` as ``r1_count``.

    Parameters
    ----------
    r1_count : pandas.Series
    mean : pandas.Series
        Left unmodified; a new Series is returned.

    Returns
    -------
    pandas.Series
    """
    train_mean = 0.0058
    too_little_support = r1_count < 15
    return mean.mask(too_little_support, train_mean)

def taper_mean_bin_prob(mean, bin_prob):
    """Blend ``mean`` with the train mean, using ``bin_prob`` as the blending weight.

    Returns ``bin_prob * 0.0058 + (1 - bin_prob) * mean``: a ``bin_prob`` of 0
    leaves ``mean`` untouched, a ``bin_prob`` of 1 replaces it by the train mean.
    Works element-wise on pandas Series as well as on plain numbers.
    """
    train_mean = 0.0058

    weight_train_mean = bin_prob
    weight_own_mean = 1 - bin_prob

    return weight_train_mean * train_mean + weight_own_mean * mean

def random_loo_mean(mean, count, sample_val):
    """Leave-one-out mean with multiplicative random noise.

    * Rows with a known ``sample_val`` get ``loo_mean(...)`` multiplied by a
      random factor between 0.9 and 1.1 (one independent draw per row, taken
      from the global ``random`` module).
    * Rows whose ``sample_val`` is NaN get the plain ``mean`` without noise.
    * Rows whose ``mean`` is NaN get the constant train mean 0.0058.

    Parameters
    ----------
    mean, count, sample_val : pandas.Series sharing one index

    Returns
    -------
    pandas.Series
    """
    train_mean = 0.0058

    # One noise factor per row
    noise = pd.Series([1 + random.random()/5 - 0.1 for _ in range(len(mean))],
                      index=mean.index)

    no_sample_value = sample_val.isnull()
    no_group_mean = mean.isnull()

    # Out-of-sample (leave-one-out) estimate, jittered
    output = noise * loo_mean(mean, count, sample_val)

    # Without an own value nothing can be left out: use the in-sample mean
    output[no_sample_value] = mean[no_sample_value]

    # Groups without any mean fall back to the mean over all train samples
    output[no_group_mean] = train_mean

    return output


def bin_prob(n, k, p):
    """Binomial probability mass: P(X = k) for X ~ Binomial(n, p).

    Parameters
    ----------
    n : number of trials
    k : number of successes
    p : success probability of a single trial

    Returns
    -------
    comb(n, k) * p**k * (1 - p)**(n - k)
    """
    # The name `scipy` is never bound in this notebook, and `scipy.misc.comb` was
    # removed from later SciPy releases; the function lives in scipy.special.
    from scipy.special import comb

    return comb(n, k) * (p**k) * ((1-p)**(n-k))

from scipy import special

# `scipy.exp` / `scipy.log` were only deprecated re-exports of the NumPy functions.
# Bind the NumPy versions directly so the module-level names stay available.
exp, log = np.exp, np.log
lgam = special.gammaln

def binomial2(n, k, p):
    """Binomial probability mass P(X = k | n, p), evaluated in log space.

    Using log-gamma for the binomial coefficient avoids the overflow that the
    direct ``comb(n, k) * p**k * (1-p)**(n-k)`` form runs into for large ``n``.
    All building blocks are ufuncs, so ``n`` and ``k`` may be scalars, NumPy
    arrays or pandas Series.

    Parameters
    ----------
    n : number of trials
    k : number of successes
    p : success probability of a single trial

    Returns
    -------
    Probability (or array of probabilities) of exactly ``k`` successes.
    """
    log_coefficient = lgam(n+1) - lgam(n-k+1) - lgam(k+1)
    # xlogy / xlog1py return 0 when the count is 0, so p == 0 or p == 1 no longer
    # turn into 0 * (-inf) = NaN as they would with k*log(p) + (n-k)*log(1-p).
    log_likelihood = special.xlogy(k, p) + special.xlog1py(n-k, -p)
    return exp(log_coefficient + log_likelihood)

In [None]:
# Binomial probability of seeing exactly `sum` positives among the `count` labelled
# samples of the cluster at a base rate of 0.0058. binomial2 is built from ufuncs, so
# it is applied to the whole columns at once instead of row by row via apply(axis=1).
cluster_n500['bin_prob'] = binomial2(cluster_n500['count'], cluster_n500['sum'], 0.0058)

In [606]:
# Noisy leave-one-out target mean per cluster.
# NOTE(review): random_loo_mean draws from the `random` module and no seed is set in
# the visible notebook, so this column differs between runs.
cluster_n500['loo_mean'] = random_loo_mean(cluster_n500['mean'],
                                                  cluster_n500['count'],
                                                  cluster_n500['R'])

# The three variants below all start from the noisy 'loo_mean' column computed above.
# Variant 1: shrink towards the train mean depending on the cluster's sample count
cluster_n500['loo_mean_tapered'] = adjust_low_count_mean(cluster_n500['count'],
                                                  cluster_n500['loo_mean'])

# Variant 2: replace by the train mean where the cluster has fewer than 15 positives ('sum')
cluster_n500['loo_mean_cutoff'] = cut_off_loo_mean(cluster_n500['sum'],
                                                  cluster_n500['loo_mean'])

# Variant 3: blend with the train mean, weighted by the 'bin_prob' column
cluster_n500['loo_mean_prob_bin'] = taper_mean_bin_prob(cluster_n500['loo_mean'],
                                                  cluster_n500['bin_prob'])

In [595]:
cluster_n500.isnull().sum()

unique_path               0
cluster_n8                0
cluster_n15               0
cluster_n25               0
cluster_n50               0
cluster_n150              0
cluster_n500              0
R                   1183748
tmax                      0
mean                      0
count                     0
sum                       0
loo_mean                  0
loo_mean_tapered          0
loo_mean_cutoff           0
dtype: int64

In [611]:
# Persist the four encoded features for cluster_n500.
# NOTE(review): the index column is written with the header 'ID', while the index
# of the frame is displayed as 'Id' -- confirm what the readers of this file expect.
cluster_n500[['loo_mean', 
              'loo_mean_tapered', 
              'loo_mean_cutoff', 
              'loo_mean_prob_bin']].to_csv(os.path.join(const.DATA_PATH, 'feat_set_cluster_n500_loo.csv'), 
                                         index_label='ID')

In [607]:
# Inspect only the rows with the highest encoded value instead of displaying the
# whole frame (same convention as the unique-path section below).
cluster_n500.sort_values('loo_mean', ascending=False).head(20)

Unnamed: 0_level_0,unique_path,cluster_n8,cluster_n15,cluster_n25,cluster_n50,cluster_n150,cluster_n500,R,tmax,mean,count,sum,loo_mean,loo_mean_tapered,loo_mean_cutoff,bin_prob,loo_mean_prob_bin
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
434784,405,0,9,20,4,12,234,0.0,5114,0.384615,26,10.0,0.436645,0.017002,0.005800,2.084954e-16,0.436645
578446,771,0,0,10,25,11,459,0.0,3693,0.375000,16,6.0,0.431751,0.012615,0.005800,2.876270e-10,0.431751
1620745,407,0,9,20,4,12,234,0.0,5034,0.384615,26,10.0,0.428182,0.016782,0.005800,2.084954e-16,0.428182
653587,650,0,0,10,25,11,459,0.0,3693,0.375000,16,6.0,0.424711,0.012503,0.005800,2.876270e-10,0.424711
436625,700,0,0,10,25,51,459,0.0,3693,0.375000,16,6.0,0.422635,0.012469,0.005800,2.876270e-10,0.422635
1525812,866,0,0,10,25,51,459,0.0,5143,0.375000,16,6.0,0.416566,0.012372,0.005800,2.876270e-10,0.416566
1680715,933,0,9,20,4,12,234,0.0,5202,0.384615,26,10.0,0.414360,0.016423,0.005800,2.084954e-16,0.414360
290451,398,0,9,20,4,12,234,0.0,5031,0.384615,26,10.0,0.410902,0.016333,0.005800,2.084954e-16,0.410902
1912444,404,0,9,20,4,12,234,0.0,5121,0.384615,26,10.0,0.409994,0.016309,0.005800,2.084954e-16,0.409994
669024,404,0,9,20,4,12,234,0.0,5226,0.384615,26,10.0,0.399589,0.016039,0.005800,2.084954e-16,0.399589


In [609]:
cluster_n500.sample(20)

Unnamed: 0_level_0,unique_path,cluster_n8,cluster_n15,cluster_n25,cluster_n50,cluster_n150,cluster_n500,R,tmax,mean,count,sum,loo_mean,loo_mean_tapered,loo_mean_cutoff,bin_prob,loo_mean_prob_bin
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
756634,4038,2,12,5,16,55,81,0.0,5619,0.00595,9411,56.0,0.005896,0.005896,0.005896,0.052421,0.005891
1252852,13051,1,2,19,3,4,429,,5558,0.00462,25974,120.0,0.00462,0.00462,0.00462,0.001252,0.004621
966057,1485,7,3,18,31,9,266,,4979,0.00351,10827,38.0,0.00351,0.00351,0.00351,0.000209,0.00351
1642451,2583,6,4,4,9,58,410,0.0,3872,0.008061,22825,184.0,0.008662,0.008662,0.008662,4e-06,0.008662
1111737,10957,3,5,15,1,44,199,0.0,4717,0.002106,1899,4.0,0.0021,0.0021,0.0058,0.009977,0.002137
2216066,13747,1,2,3,47,8,63,0.0,4401,0.005264,12727,67.0,0.005133,0.005133,0.005133,0.035203,0.005157
1641067,13387,1,2,3,47,36,369,,3044,0.004285,12602,54.0,0.004285,0.004285,0.004285,0.003442,0.00429
920530,10745,1,2,12,49,19,139,,859,0.004174,13177,55.0,0.004174,0.004174,0.004174,0.00189,0.004177
850754,5494,2,12,24,21,98,30,0.0,4931,0.005973,9375,56.0,0.006472,0.006472,0.006472,0.052116,0.006437
1792954,2580,6,4,4,9,58,97,,5444,0.007751,32642,253.0,0.007751,0.007751,0.007751,1e-06,0.007751


In [582]:
cluster_n500.groupby('R')['loo_mean'].mean()

R
0.0    0.005316
1.0    0.015070
Name: loo_mean, dtype: float64

## Calculate features based on unique path

In [612]:
# Response statistics per unique path, computed from the rows that have a response
# (NaN responses are skipped by the aggregations).
cluster_mean = cluster_info.groupby(['unique_path'])['R'].agg(['mean','count','sum'])
# Attach the path statistics to every sample. Paths without any labelled sample end up
# with a NaN 'mean'; random_loo_mean replaces those by the overall train mean.
cluster_upath = cluster_info.merge(cluster_mean, left_on='unique_path', right_index=True, how='left')

In [614]:
# Binomial probability of exactly `sum` positives among `count` labelled samples of the
# path at a base rate of 0.0058; NaN where the path has no statistics. Evaluated on
# whole columns (binomial2 consists of ufuncs) rather than row by row via apply(axis=1).
cluster_upath['bin_prob'] = binomial2(cluster_upath['count'], cluster_upath['sum'], 0.0058)

In [615]:
# Rows with a known response.
# NOTE(review): `is_train` is not used anywhere else in the visible notebook.
is_train = ~cluster_upath['R'].isnull()
# Noisy leave-one-out target mean per unique path (unseeded randomness, so it
# differs between runs)
cluster_upath['loo_mean'] = random_loo_mean(cluster_upath['mean'],
                                                  cluster_upath['count'],
                                                  cluster_upath['R'])

# Shrunk towards the train mean depending on the path's sample count
cluster_upath['loo_mean_tapered'] = adjust_low_count_mean(cluster_upath['count'],
                                                  cluster_upath['loo_mean'])

# Train mean wherever the path has fewer than 15 positives ('sum')
cluster_upath['loo_mean_cutoff'] = cut_off_loo_mean(cluster_upath['sum'],
                                                  cluster_upath['loo_mean'])

# Blend with the train mean, weighted by 'bin_prob' (stays NaN where 'bin_prob' is NaN)
cluster_upath['loo_mean_prob_bin'] = taper_mean_bin_prob(cluster_upath['loo_mean'],
                                                  cluster_upath['bin_prob'])

In [616]:
cluster_upath.isnull().sum()

unique_path                0
cluster_n8                 0
cluster_n15                0
cluster_n25                0
cluster_n50                0
cluster_n150               0
cluster_n500               0
R                    1183748
tmax                       0
mean                    4984
count                      0
sum                     4984
loo_mean                   0
loo_mean_tapered           0
loo_mean_cutoff            0
bin_prob                4984
loo_mean_prob_bin       4984
dtype: int64

In [620]:
cluster_upath.sort_values('loo_mean', ascending=False).head(20)

Unnamed: 0_level_0,unique_path,cluster_n8,cluster_n15,cluster_n25,cluster_n50,cluster_n150,cluster_n500,R,tmax,mean,count,sum,loo_mean,loo_mean_tapered,loo_mean_cutoff,bin_prob,loo_mean_prob_bin
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1402461,12614,1,2,19,3,91,417,0.0,4395,0.5,2,1.0,1.098499,0.007985,0.0058,0.01153272,1.085897
1003981,14802,1,2,1,3,4,0,0.0,3435,0.5,2,1.0,1.09445,0.007977,0.0058,0.01153272,1.081895
1717223,13113,1,2,19,0,91,296,0.0,6531,0.666667,3,2.0,1.093473,0.009063,0.0058,0.0001003347,1.093364
1912156,7078,2,10,0,6,93,373,0.0,4698,0.5,2,1.0,1.089647,0.007968,0.0058,0.01153272,1.077148
435640,1101,0,9,20,4,12,234,1.0,5052,1.0,3,3.0,1.08578,0.00904,0.0058,1.95112e-07,1.08578
291416,12158,3,5,15,22,136,269,0.0,2840,0.5,2,1.0,1.083662,0.007956,0.0058,0.01153272,1.071231
2145106,294,0,9,20,4,12,15,1.0,5886,1.0,2,2.0,1.083298,0.007955,0.0058,3.364e-05,1.083261
2120566,11099,7,3,18,18,122,25,0.0,6054,0.5,2,1.0,1.083061,0.007955,0.0058,0.01153272,1.070637
159085,1746,7,3,18,31,9,221,0.0,3715,0.5,2,1.0,1.080949,0.00795,0.0058,0.01153272,1.06855
435639,1101,0,9,20,4,12,234,1.0,5052,1.0,3,3.0,1.080126,0.009023,0.0058,1.95112e-07,1.080126


In [629]:
cluster_upath.head(20)

Unnamed: 0_level_0,unique_path,cluster_n8,cluster_n15,cluster_n25,cluster_n50,cluster_n150,cluster_n500,R,tmax,mean,count,sum,loo_mean,loo_mean_tapered,loo_mean_cutoff,bin_prob,loo_mean_prob_bin
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
4,13409,1,2,3,47,36,369,0.0,436,0.002041,490,1.0,0.002042,0.003958,0.0058,0.165307,0.002663
6,7029,2,10,0,35,63,14,0.0,6578,0.003097,3552,11.0,0.003035,0.003035,0.0058,0.007936,0.003057
7,12763,1,2,19,3,132,477,0.0,8122,0.004139,10147,42.0,0.004474,0.004474,0.004474,0.004149,0.004479
9,13658,1,2,3,19,8,232,0.0,5770,0.003854,10380,40.0,0.003703,0.003703,0.003703,0.001316,0.003705
11,9865,1,2,12,27,74,165,0.0,3030,0.00502,10159,51.0,0.005224,0.005224,0.005224,0.031896,0.005242
13,10112,1,2,12,27,134,3,0.0,6698,0.005476,10043,55.0,0.005307,0.005307,0.005307,0.049082,0.005331
14,3893,2,12,5,16,55,423,0.0,8320,0.004569,3940,18.0,0.004515,0.004515,0.004515,0.05364,0.004584
16,2415,6,4,4,43,14,231,0.0,4021,0.007089,12272,87.0,0.007029,0.007029,0.007029,0.008224,0.007019
18,13658,1,2,3,19,8,232,0.0,2590,0.003854,10380,40.0,0.003928,0.003928,0.003928,0.001316,0.00393
23,6206,2,12,5,6,135,151,0.0,789,0.004264,3752,16.0,0.004202,0.004202,0.004202,0.042706,0.00427


In [621]:
# Persist the four encoded features for unique_path (index column header is 'ID').
# NOTE(review): 'loo_mean_prob_bin' still contains NaN for paths without labelled
# samples (see the null counts above) -- confirm the consumer handles that.
cluster_upath[['loo_mean', 
              'loo_mean_tapered', 
              'loo_mean_cutoff', 
              'loo_mean_prob_bin']].to_csv(os.path.join(const.DATA_PATH, 'feat_set_cluster_upath_loo.csv'), 
                                         index_label='ID')

## Calculate features based on max date (tmax)

In [492]:
# Response statistics per 'tmax' value, computed from the rows that have a response
# (NaN responses are skipped by the aggregations).
cluster_mean = cluster_info.groupby(['tmax'])['R'].agg(['mean','count','sum'])
# Attach the statistics of its 'tmax' group to every sample
cluster_tmax = cluster_info.merge(cluster_mean, left_on='tmax', right_index=True, how='left')

In [493]:
#cluster_tmax['adj_mean'] = adjust_mean(cluster_tmax['count'], cluster_tmax['mean'])

In [622]:
# Binomial probability of exactly `sum` positives among `count` labelled samples of the
# 'tmax' group at a base rate of 0.0058. Evaluated on whole columns (binomial2 consists
# of ufuncs) rather than row by row via apply(axis=1).
cluster_tmax['bin_prob'] = binomial2(cluster_tmax['count'], cluster_tmax['sum'], 0.0058)

In [623]:
# Noisy leave-one-out target mean per 'tmax' group (unseeded randomness, so it
# differs between runs)
cluster_tmax['loo_mean'] = random_loo_mean(cluster_tmax['mean'],
                                                  cluster_tmax['count'],
                                                  cluster_tmax['R'])

# Shrunk towards the train mean depending on the group's sample count
cluster_tmax['loo_mean_tapered'] = adjust_low_count_mean(cluster_tmax['count'],
                                                  cluster_tmax['loo_mean'])

# Train mean wherever the group has fewer than 15 positives ('sum')
cluster_tmax['loo_mean_cutoff'] = cut_off_loo_mean(cluster_tmax['sum'],
                                                  cluster_tmax['loo_mean'])

# Blend with the train mean, weighted by 'bin_prob'
cluster_tmax['loo_mean_prob_bin'] = taper_mean_bin_prob(cluster_tmax['loo_mean'],
                                                  cluster_tmax['bin_prob'])

In [624]:
# Inspect only the rows with the highest encoded value instead of displaying the
# whole frame (same convention as the unique-path section above).
cluster_tmax.sort_values('loo_mean', ascending=False).head(20)

Unnamed: 0_level_0,unique_path,cluster_n8,cluster_n15,cluster_n25,cluster_n50,cluster_n150,cluster_n500,R,tmax,mean,count,sum,loo_mean,bin_prob,loo_mean_tapered,loo_mean_cutoff,loo_mean_prob_bin
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
195987,14271,1,2,1,19,59,216,1.0,4188,1.000000,2,2.0,1.001012e+00,0.000034,0.007790,0.0058,1.000979
38011,9545,1,2,19,48,112,135,1.0,4188,1.000000,2,2.0,9.402211e-01,0.000034,0.007669,0.0058,0.940190
453838,14264,1,2,1,19,59,363,0.0,1570,0.250000,4,1.0,3.408419e-01,0.022799,0.007140,0.0058,0.333203
1096292,12520,1,2,19,0,59,170,0.0,7498,0.272727,11,3.0,3.263494e-01,0.000031,0.009326,0.0058,0.326340
1253487,8416,1,2,3,27,104,80,0.0,8244,0.272727,11,3.0,3.224451e-01,0.000031,0.009283,0.0058,0.322435
909291,10399,1,2,12,27,74,294,0.0,7498,0.272727,11,3.0,3.204419e-01,0.000031,0.009261,0.0058,0.320432
1096291,12520,1,2,19,0,59,170,0.0,7498,0.272727,11,3.0,3.186799e-01,0.000031,0.009242,0.0058,0.318670
529721,14788,1,2,1,19,4,286,0.0,7498,0.272727,11,3.0,3.180068e-01,0.000031,0.009234,0.0058,0.317997
1188093,8138,1,2,3,27,123,354,0.0,8244,0.272727,11,3.0,3.160392e-01,0.000031,0.009213,0.0058,0.316030
538140,9866,1,2,12,27,74,165,0.0,1570,0.250000,4,1.0,3.154071e-01,0.022799,0.007038,0.0058,0.308349


In [625]:
# Persist the four encoded features for tmax (index column header is 'ID')
cluster_tmax[['loo_mean', 
              'loo_mean_tapered', 
              'loo_mean_cutoff', 
              'loo_mean_prob_bin']].to_csv(os.path.join(const.DATA_PATH, 'feat_set_cluster_tmax_loo.csv'), 
                                         index_label='ID')

In [626]:
# Average of each tmax-based feature per response class. Rows without a response
# (R is NaN) are not part of any group and therefore do not contribute.
cols = ['loo_mean', 'loo_mean_tapered', 'loo_mean_cutoff', 'loo_mean_prob_bin']

for col in cols:
    # feature name, the per-class means, then one empty line
    print('{}\n{}\n'.format(col, cluster_tmax.groupby('R')[col].mean()))

loo_mean
R
0.0    0.005755
1.0    0.015315
Name: loo_mean, dtype: float64

loo_mean_tapered
R
0.0    0.005775
1.0    0.008709
Name: loo_mean_tapered, dtype: float64

loo_mean_cutoff
R
0.0    0.006053
1.0    0.009864
Name: loo_mean_cutoff, dtype: float64

loo_mean_prob_bin
R
0.0    0.006216
1.0    0.015583
Name: loo_mean_prob_bin, dtype: float64



In [627]:
# Average of each cluster_n500-based feature per response class. Rows without a
# response (R is NaN) are not part of any group and therefore do not contribute.
cols = ['loo_mean', 'loo_mean_tapered', 'loo_mean_cutoff', 'loo_mean_prob_bin']

for col in cols:
    # feature name, the per-class means, then one empty line
    print('{}\n{}\n'.format(col, cluster_n500.groupby('R')[col].mean()))

loo_mean
R
0.0    0.005688
1.0    0.026806
Name: loo_mean, dtype: float64

loo_mean_tapered
R
0.0    0.005204
1.0    0.011705
Name: loo_mean_tapered, dtype: float64

loo_mean_cutoff
R
0.0    0.005627
1.0    0.023568
Name: loo_mean_cutoff, dtype: float64

loo_mean_prob_bin
R
0.0    0.005718
1.0    0.026825
Name: loo_mean_prob_bin, dtype: float64



In [628]:
# Average of each unique_path-based feature per response class. Rows without a
# response (R is NaN) are not part of any group and therefore do not contribute.
cols = ['loo_mean', 'loo_mean_tapered', 'loo_mean_cutoff', 'loo_mean_prob_bin']

for col in cols:
    # feature name, the per-class means, then one empty line
    print('{}\n{}\n'.format(col, cluster_upath.groupby('R')[col].mean()))

loo_mean
R
0.0    0.005543
1.0    0.041153
Name: loo_mean, dtype: float64

loo_mean_tapered
R
0.0    0.005071
1.0    0.006438
Name: loo_mean_tapered, dtype: float64

loo_mean_cutoff
R
0.0    0.005349
1.0    0.013677
Name: loo_mean_cutoff, dtype: float64

loo_mean_prob_bin
R
0.0    0.00578
1.0    0.04125
Name: loo_mean_prob_bin, dtype: float64

