### stage_model_classifier

base: 14./stage_model_classifier_withneighbor-newfeatures

Classifiers that predict the sign (+1 / 0 / -1) of the target (rank change) on the stage dataset.

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import xgboost as xgb


In [3]:
# Build the classification models.
# Short names understood by get_classifier(); each one becomes a result column.
# The commented-out variant below additionally contains 'lrl1'.
#classifiers = ['currank','avgrank','dice','lr','lrl1','lsvc','lsvcl2','rf','lrbias','xgb']
classifiers = ['currank','avgrank','dice','lr','lsvc','lsvcl2','rf','lrbias','xgb']
# Current train/test split, shared as module-level globals: rebound by
# test_cv()/test_20182019() and read by classifier_model()/baseline_model().
train_x, train_y, test_x, test_y = None, None, None, None

def get_classifier(classifier = 'lr'):
    """
    Create an unfitted model for a short model name.

    Supported names: 'lsvc', 'lsvcl2', 'rf', 'lr', 'lrbias', 'lrl1', 'xgb',
    'dice', 'currank', 'avgrank'.

    Returns the new model object, or None when the name is not recognised
    (callers must check before calling fit/predict).
    """
    # no class re-weighting; kept in one place so every model shares the setting
    class_weight = None
    
    if classifier == "lsvc":
        clf = LinearSVC(penalty='l1',dual=False, tol=1e-3, class_weight=class_weight )
    elif classifier == "lsvcl2":
        clf = LinearSVC(penalty='l2', tol=1e-4, class_weight=class_weight)
    elif classifier == 'rf':
        #clf = RandomForestClassifier(n_estimators=100, n_jobs=4,criterion='entropy', min_samples_split=1,class_weight = class_weight)
        clf = RandomForestClassifier(n_estimators=100, n_jobs=-1,criterion='entropy', class_weight = class_weight)
    elif classifier == 'lr':
        # logistic regression without an intercept term
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = False, verbose = 0)
    elif classifier == 'lrbias':
        # logistic regression with an intercept term (verbose fitting)
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = True, verbose = 1)
    elif classifier == 'lrl1':
        # An L1 penalty needs a solver that supports it; the default 'lbfgs'
        # solver only accepts l2/none and fails at fit time. 'saga' handles
        # L1 for multi-class targets.
        clf = LogisticRegression(class_weight = class_weight, penalty='l1', solver='saga', n_jobs=-1)
    elif classifier == 'xgb':
        clf = xgb.XGBClassifier(booster = 'gbtree', nthread = -1, subsample = 1, 
                                n_estimators = 600, colsample_bytree = 1, max_depth = 6, min_child_weight = 1)
    elif classifier == 'dice':
        clf = RandomDice('1234')
    elif classifier == 'currank':
        clf = CurRank()
    elif classifier == 'avgrank':
        clf = AverageRank()        
    else:
        # unknown name: signalled with None rather than an exception
        clf = None
        
    return clf


class CurRank:
    """
    Baseline model: the rank is assumed to stay as it is, so every sample
    is predicted as 0.
    """

    def __init__(self):
        pass

    def fit(self, x, y):
        """Nothing is learned; present only to match the fit/predict interface."""
        return None

    def predict(self, test_x):
        """Return an array holding one 0 per row of ``test_x`` (uses ``test_x.shape[0]``)."""
        n_rows = test_x.shape[0]
        return np.array([0] * n_rows)
    
class AverageRank():
    """
    Baseline model: predict the sign (+1 / 0 / -1) of one feature column.

    col_idx is the position of that column inside the feature rows passed to
    predict(). The default of 13 is the value the original code hard-coded,
    commented there as "change_in_rank_all".

    NOTE(review): with the column order printed by stagedata.info() in this
    notebook and features taken from the third CSV column onward, index 13
    looks like it selects average_rank_all instead -- confirm and pass the
    intended col_idx if so.
    """
    def __init__(self, col_idx=13):
        self.col_idx = col_idx

    def fit(self, x, y):
        """Nothing is learned; present only to match the fit/predict interface."""
        pass

    def predict(self, test_x):
        """Return an array with the sign of column ``col_idx`` for every row of ``test_x``."""
        col = self.col_idx
        return np.array([1 if row[col] > 0 else (-1 if row[col] < 0 else 0)
                         for row in test_x])

class RandomDice():
    """
    Baseline model: draw labels at random, following the label frequencies
    seen in fit().

    Constructing an instance seeds Python's module-level ``random`` generator
    with ``seed``; predict() draws from that same shared generator.
    """
    def __init__(self, seed='1234'):
        self.dist = []   # cumulative label frequencies, aligned with self.val
        self.val = []    # distinct labels seen in fit()
        random.seed(seed)
    
    def fit(self, x, y):
        """
        Record the distinct labels of ``y`` and their cumulative frequencies.

        ``x`` is ignored. ``y`` must offer ``.shape`` and elementwise ``==``
        (e.g. a numpy array).
        """
        # start from a clean state so that fitting again replaces the
        # distribution instead of appending a second copy to it
        self.dist = []
        self.val = []

        total = y.shape[0]
        
        ratio = 0.
        for val in set(y):
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)
            
    def predict(self, test_x):
        """
        Return one randomly drawn label per element of ``test_x``.

        Calling this before fit() fails with IndexError because no labels
        are known yet.
        """
        pred_y = []
        for _ in test_x:
            dice = random.random()
            # first label whose cumulative frequency covers the draw
            find_idx = -1
            for idx, ratio in enumerate(self.dist):
                if dice <= ratio:
                    find_idx = idx
                    break
            
            # find_idx stays -1 when nothing matched, which selects the last label
            pred_y.append(self.val[find_idx])
            
        return np.array(pred_y)

def evaluate(test_y, pred_y):
    """
    Print per-class precision, recall and F1 together with the overall
    accuracy of ``pred_y`` against ``test_y``; return the accuracy.
    """
    # average=None keeps one score per class instead of a single aggregate
    scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    per_class = [scorer(test_y, pred_y, average=None) for scorer in scorers]
    accuracy = metrics.accuracy_score(test_y, pred_y)
    print('precision=%s, recall=%s, f1=%s, accuracy=%.2f'%(per_class[0], per_class[1], per_class[2], accuracy))
    return accuracy
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    Hold out one event: rows whose 'eventid' equals ``eventid`` become the
    test set, all other rows the training set.

    Column index 1 of the frame is read as the target and reduced to its
    sign (+1 / 0 / -1); the columns from index 2 onward are the features.

    Returns (train, test, train_x, train_y, test_x, test_y) as numpy arrays.
    """
    def _sign(value):
        if value > 0:
            return 1
        if value < 0:
            return -1
        return 0

    in_train = stagedata['eventid'] != eventid
    in_test = stagedata['eventid'] == eventid
    train = stagedata[in_train].to_numpy()
    test = stagedata[in_test].to_numpy()

    train_x, test_x = train[:, 2:], test[:, 2:]
    train_y = np.array([_sign(v) for v in train[:, 1]])
    test_y = np.array([_sign(v) for v in test[:, 1]])

    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    Split on the stage number: rows with 'stageid' <= ``stageid`` form the
    training set, rows with a larger 'stageid' the test set.

    Column index 1 of the frame is read as the target and reduced to its
    sign (+1 / 0 / -1); the columns from index 2 onward are the features.

    Returns (train, test, train_x, train_y, test_x, test_y) as numpy arrays.
    """
    def _sign(value):
        if value > 0:
            return 1
        if value < 0:
            return -1
        return 0

    early = stagedata['stageid'] <= stageid
    late = stagedata['stageid'] > stageid
    train = stagedata[early].to_numpy()
    test = stagedata[late].to_numpy()

    train_x, test_x = train[:, 2:], test[:, 2:]
    train_y = np.array([_sign(v) for v in train[:, 1]])
    test_y = np.array([_sign(v) for v in test[:, 1]])

    return train, test, train_x, train_y, test_x, test_y


### baseline
def baseline_model():
    """
    Score two trivial baselines on the module-level test split
    (``test_x`` / ``test_y``) and return their accuracies as
    (score1, score2).
    """
    #1. predict with current rank, rankchg = 0
    print('[*] predict with current rank, rankchg = 0')
    pred_y_simple = np.zeros_like(test_y)
    score1 = evaluate(test_y, pred_y_simple)

    #2. predict with average rankchg (change_in_rank_all):idx = 15
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    # The full `test` array is only a local inside the split drivers, never a
    # module-level name, so read the column from the global test_x instead.
    # The split helpers build test_x as test[:, 2:], hence column 15 of the
    # full row is column 13 here.
    change_in_rank_all = test_x[:, 13]
    pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in change_in_rank_all])
    score2 = evaluate(test_y, pred_y_avg)
    return score1, score2

def classifier_model(name='lr'):
    """
    Build the model called ``name``, fit it on the module-level training
    split, score it on the module-level test split and return the accuracy
    reported by evaluate().
    """
    print('[*] predict with %s model'%name)
    model = get_classifier(name)
    model.fit(train_x, train_y)
    return evaluate(test_y, model.predict(test_x))

In [4]:
def test_20182019():
    """
    Evaluate every classifier twice: once with Indy500-2018 as the test
    event and once with Indy500-2019, each time with the other of the two
    events removed from the data first.

    Rebinds the module-level split variables, writes the two result rows to
    'stint_classifier_result_<suffix>.csv' and returns them as a DataFrame.
    Relies on the module-level ``stagedata``, ``suffix`` and ``classifiers``.
    """
    global train_x, train_y, test_x, test_y

    ### fix train
    header = ['runid','trainsize','testsize','testdistribution'] + list(classifiers)
    print('cols:%s'%header)
    summary = pd.DataFrame([],columns=header)

    season_names = [f'Indy500-{year}'
                    for year in ('2013','2014','2015','2016','2017','2018','2019')]
    season_ids = {label: pos for pos, label in enumerate(season_names)}

    def _score_split(run_label):
        # works on whatever split is currently installed in the module globals
        dist = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
        row = [run_label, train_x.shape[0], test_x.shape[0], dist]
        row.extend([classifier_model(model_name) for model_name in classifiers])
        print('rec:%s'%row)
        return pd.DataFrame([row],columns=header)

    # first run: test on 2018, with 2019 dropped from the data
    target_id = season_ids['Indy500-2018']
    pool = stagedata[stagedata['eventid'] != season_ids['Indy500-2019']]
    print('Testset = %s'%season_names[target_id])
    _, _, train_x, train_y, test_x, test_y = split_by_eventid(pool, target_id)
    summary = pd.concat([summary, _score_split(season_names[target_id])])

    # second run: test on 2019, with 2018 dropped from the data.
    # Only the test half is replaced; train_x/train_y from the first run stay.
    target_id = season_ids['Indy500-2019']
    pool = stagedata[stagedata['eventid'] != season_ids['Indy500-2018']]
    print('Testset = %s'%season_names[target_id])
    _, _, _, _, test_x, test_y = split_by_eventid(pool, target_id)
    summary = pd.concat([summary, _score_split(season_names[target_id])])

    summary.to_csv(f'stint_classifier_result_{suffix}.csv', float_format='%.3f')
    return summary

def test_cv():
    """
    Leave-one-event-out cross validation: each distinct 'eventid' in the
    module-level ``stagedata`` is held out once as the test set and every
    classifier in ``classifiers`` is trained on the remaining events.

    Rebinds the module-level split variables, writes one result row per
    event to 'crossvalid_stagedata_classifier_<suffix>.csv' and returns the
    rows as a DataFrame. Event ids are used as positions into the list of
    Indy500 2013-2019 names.
    """
    global train_x, train_y, test_x, test_y
    
    cols = ['runid','trainsize','testsize','testdistribution']
    cols.extend(classifiers)
    print('cols:%s'%cols)
    retdf = pd.DataFrame([],columns=cols)

    years = ['2013','2014','2015','2016','2017','2018','2019']
    eventsname = [f'Indy500-{x}' for x in years]

    # A set has no guaranteed iteration order; sorting fixes the order of
    # the folds (and of the rows in the result file).
    for eventid in sorted(set(stagedata['eventid'])):
        print('Testset = %s'%eventsname[eventid])

        _, _, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
        test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

        #record
        rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

        # one accuracy per classifier, in the order of the result columns
        acc = [classifier_model(clf) for clf in classifiers]

        rec.extend(acc)
        print('rec:%s'%rec)

        #new df
        df = pd.DataFrame([rec],columns=cols)
        retdf = pd.concat([retdf, df])        

    retdf.to_csv('crossvalid_stagedata_classifier_%s.csv'%suffix, float_format='%.3f')
    return retdf

### test oracle with stintlen

In [5]:
#load data
# Switches that select which stage CSV is loaded (they only affect the file name here).
_trim = 0
_include_final = True
_include_stintlen = True
# Encode the switches as file-name fragments: '1'/'0' and '1'/''.
include_str = '1' if _include_final else '0'
stint_str = '1' if _include_stintlen else ''
# suffix is also read by test_cv()/test_20182019() to name their result CSVs.
suffix = f'indy500-2013-2019-end{include_str}{stint_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}{stint_str}-t{_trim}.csv'
stagedata = pd.read_csv(output_file)

# Replace missing values with 0, then print the column summary.
stagedata.fillna(0, inplace=True)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522 entries, 0 to 1521
Data columns (total 37 columns):
Unnamed: 0                   1522 non-null int64
target                       1522 non-null int64
start_lap                    1522 non-null int64
stint_len                    1522 non-null int64
eventid                      1522 non-null int64
car_number                   1522 non-null int64
stageid                      1522 non-null int64
firststage                   1522 non-null int64
pit_in_caution               1522 non-null int64
start_position               1522 non-null int64
start_rank                   1522 non-null int64
start_rank_ratio             1522 non-null float64
top_pack                     1522 non-null int64
bottom_pack                  1522 non-null int64
average_rank                 1522 non-null float64
average_rank_all             1522 non-null float64
change_in_rank               1522 non-null int64
change_in_rank_all           1522 non-null float64
rat

In [6]:
# Preview the first five rows of the loaded stage data.
stagedata.head(5)

Unnamed: 0.1,Unnamed: 0,target,start_lap,stint_len,eventid,car_number,stageid,firststage,pit_in_caution,start_position,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
0,1,-12,31,27,0,1,1,1,0,7,...,22.9104,31,31,66.0815,2,8,0,14,6,3
1,2,14,58,32,0,1,2,1,1,7,...,23.822858,27,27,62.0677,-19,-21,0,-3,-21,-17
2,3,-7,90,32,0,1,3,1,0,7,...,21.857882,32,32,91.23935,11,-2,-3,7,17,19
3,4,5,122,30,0,1,4,1,0,7,...,19.394133,32,32,61.19415,-15,-9,-6,0,4,-6
4,5,-3,152,29,0,1,5,1,0,7,...,17.737505,30,30,60.8541,-3,0,0,-3,-3,0


In [7]:
# Run the leave-one-event-out evaluation on the currently loaded stagedata.
df_event = test_cv()

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2013
[*] predict with currank model
precision=[0.         0.14871795 0.        ], recall=[0. 1. 0.], f1=[0.         0.25892857 0.        ], accuracy=0.15
[*] predict with avgrank model
precision=[0.         0.         0.34871795], recall=[0. 0. 1.], f1=[0.         0.         0.51711027], accuracy=0.35
[*] predict with dice model
precision=[0.46902655 0.2173913  0.27118644], recall=[0.54081633 0.17241379 0.23529412], f1=[0.50236967 0.19230769 0.2519685 ], accuracy=0.38
[*] predict with lr model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.62015504 0.33333333 0.6031746 ], recall=[0.81632653 0.03448276 0.55882353], f1=[0.70484581 0.0625     0.58015267], accuracy=0.61
[*] predict with lsvc model




precision=[0.62307692 0.5        0.61904762], recall=[0.82653061 0.03448276 0.57352941], f1=[0.71052632 0.06451613 0.59541985], accuracy=0.62
[*] predict with lsvcl2 model




precision=[0.76623377 0.23728814 0.57627119], recall=[0.60204082 0.48275862 0.5       ], f1=[0.67428571 0.31818182 0.53543307], accuracy=0.55
[*] predict with rf model
precision=[0.62015504 1.         0.59375   ], recall=[0.81632653 0.06896552 0.55882353], f1=[0.70484581 0.12903226 0.57575758], accuracy=0.62
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.8s finished


precision=[0.62992126 0.4        0.6031746 ], recall=[0.81632653 0.06896552 0.55882353], f1=[0.71111111 0.11764706 0.58015267], accuracy=0.62
[*] predict with xgb model
precision=[0.67226891 0.5        0.61428571], recall=[0.81632653 0.10344828 0.63235294], f1=[0.73732719 0.17142857 0.62318841], accuracy=0.65
rec:['Indy500-2013', 1327, 195, '+:68,0:29,-:98', 0.14871794871794872, 0.3487179487179487, 0.37948717948717947, 0.6102564102564103, 0.6205128205128205, 0.5487179487179488, 0.6153846153846154, 0.6153846153846154, 0.6461538461538462]
Testset = Indy500-2014
[*] predict with currank model
precision=[0.         0.19111111 0.        ], recall=[0. 1. 0.], f1=[0.         0.32089552 0.        ], accuracy=0.19
[*] predict with avgrank model
precision=[0.   0.   0.28], recall=[0. 0. 1.], f1=[0.     0.     0.4375], accuracy=0.28
[*] predict with dice model
precision=[0.51908397 0.07407407 0.29850746], recall=[0.57142857 0.04651163 0.31746032], f1=[0.544      0.05714286 0.30769231], accuracy=0

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.7        0.4        0.44705882], recall=[0.76470588 0.09302326 0.6031746 ], f1=[0.73092369 0.1509434  0.51351351], accuracy=0.59
[*] predict with lsvc model




precision=[0.70542636 0.4        0.44186047], recall=[0.76470588 0.09302326 0.6031746 ], f1=[0.73387097 0.1509434  0.51006711], accuracy=0.59
[*] predict with lsvcl2 model




precision=[0.71774194 1.         0.42424242], recall=[0.74789916 0.04651163 0.66666667], f1=[0.73251029 0.08888889 0.51851852], accuracy=0.59
[*] predict with rf model
precision=[0.68461538 0.8        0.37777778], recall=[0.74789916 0.09302326 0.53968254], f1=[0.71485944 0.16666667 0.44444444], accuracy=0.56
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.6s finished


precision=[0.69924812 0.4        0.46341463], recall=[0.78151261 0.09302326 0.6031746 ], f1=[0.73809524 0.1509434  0.52413793], accuracy=0.60
[*] predict with xgb model
precision=[0.72321429 0.46666667 0.44897959], recall=[0.68067227 0.1627907  0.6984127 ], f1=[0.7012987  0.24137931 0.54658385], accuracy=0.59
rec:['Indy500-2014', 1297, 225, '+:63,0:43,-:119', 0.19111111111111112, 0.28, 0.4, 0.5911111111111111, 0.5911111111111111, 0.5911111111111111, 0.5644444444444444, 0.6, 0.5866666666666667]
Testset = Indy500-2015
[*] predict with currank model
precision=[0.         0.13333333 0.        ], recall=[0. 1. 0.], f1=[0.         0.23529412 0.        ], accuracy=0.13
[*] predict with avgrank model
precision=[0.         0.         0.37948718], recall=[0. 0. 1.], f1=[0.         0.         0.55018587], accuracy=0.38
[*] predict with dice model
precision=[0.53508772 0.125      0.40350877], recall=[0.64210526 0.11538462 0.31081081], f1=[0.58373206 0.12       0.35114504], accuracy=0.45
[*] predic

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.67567568 1.         0.62195122], recall=[0.78947368 0.07692308 0.68918919], f1=[0.72815534 0.14285714 0.65384615], accuracy=0.66
[*] predict with lsvc model




precision=[0.67592593 1.         0.61627907], recall=[0.76842105 0.03846154 0.71621622], f1=[0.71921182 0.07407407 0.6625    ], accuracy=0.65
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.73584906 0.         0.47183099], recall=[0.41052632 0.         0.90540541], f1=[0.52702703 0.         0.62037037], accuracy=0.54
[*] predict with rf model
precision=[0.66393443 0.75       0.71014493], recall=[0.85263158 0.11538462 0.66216216], f1=[0.74654378 0.2        0.68531469], accuracy=0.68
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.6s finished


precision=[0.67567568 1.         0.62195122], recall=[0.78947368 0.07692308 0.68918919], f1=[0.72815534 0.14285714 0.65384615], accuracy=0.66
[*] predict with xgb model
precision=[0.65322581 0.42857143 0.703125  ], recall=[0.85263158 0.11538462 0.60810811], f1=[0.73972603 0.18181818 0.65217391], accuracy=0.66
rec:['Indy500-2015', 1327, 195, '+:74,0:26,-:95', 0.13333333333333333, 0.37948717948717947, 0.4461538461538462, 0.6564102564102564, 0.6512820512820513, 0.5435897435897435, 0.6820512820512821, 0.6564102564102564, 0.6615384615384615]
Testset = Indy500-2016
[*] predict with currank model
precision=[0.         0.12598425 0.        ], recall=[0. 1. 0.], f1=[0.         0.22377622 0.        ], accuracy=0.13
[*] predict with avgrank model
precision=[0.         0.         0.38582677], recall=[0. 0. 1.], f1=[0.         0.         0.55681818], accuracy=0.39
[*] predict with dice model
precision=[0.49342105 0.05714286 0.47761194], recall=[0.60483871 0.0625     0.32653061], f1=[0.54347826 0.05

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.63414634 0.33333333 0.66666667], recall=[0.83870968 0.09375    0.55102041], f1=[0.72222222 0.14634146 0.60335196], accuracy=0.63
[*] predict with lsvc model




precision=[0.62068966 0.5        0.72222222], recall=[0.87096774 0.125      0.53061224], f1=[0.72483221 0.2        0.61176471], accuracy=0.65
[*] predict with lsvcl2 model




precision=[0.63125    0.08333333 0.63414634], recall=[0.81451613 0.03125    0.53061224], f1=[0.71126761 0.04545455 0.57777778], accuracy=0.61
[*] predict with rf model
precision=[0.64327485 0.75       0.72151899], recall=[0.88709677 0.09375    0.58163265], f1=[0.74576271 0.16666667 0.6440678 ], accuracy=0.67
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.63030303 0.4        0.67088608], recall=[0.83870968 0.125      0.54081633], f1=[0.71972318 0.19047619 0.59887006], accuracy=0.63
[*] predict with xgb model
precision=[0.6796875  0.41176471 0.62385321], recall=[0.7016129  0.21875    0.69387755], f1=[0.69047619 0.28571429 0.65700483], accuracy=0.64
rec:['Indy500-2016', 1268, 254, '+:98,0:32,-:124', 0.12598425196850394, 0.3858267716535433, 0.42913385826771655, 0.6338582677165354, 0.6456692913385826, 0.6062992125984252, 0.6692913385826772, 0.6338582677165354, 0.6377952755905512]
Testset = Indy500-2017
[*] predict with currank model
precision=[0.         0.19758065 0.        ], recall=[0. 1. 0.], f1=[0.         0.32996633 0.        ], accuracy=0.20
[*] predict with avgrank model
precision=[0.         0.         0.29032258], recall=[0. 0. 1.], f1=[0.   0.   0.45], accuracy=0.29
[*] predict with dice model
precision=[0.46575342 0.16666667 0.23611111], recall=[0.53543307 0.10204082 0.23611111], f1=[0.4981685  0.12658228 0.23611111

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.76       0.19886364 0.34042553], recall=[0.1496063  0.71428571 0.22222222], f1=[0.25       0.31111111 0.26890756], accuracy=0.28
[*] predict with lsvc model




precision=[0.75862069 0.20606061 0.42592593], recall=[0.17322835 0.69387755 0.31944444], f1=[0.28205128 0.31775701 0.36507937], accuracy=0.32
[*] predict with lsvcl2 model




precision=[0.84210526 0.18888889 0.30612245], recall=[0.12598425 0.69387755 0.20833333], f1=[0.21917808 0.29694323 0.24793388], accuracy=0.26
[*] predict with rf model
precision=[0.63157895 0.51724138 0.53731343], recall=[0.75590551 0.30612245 0.5       ], f1=[0.68817204 0.38461538 0.51798561], accuracy=0.59
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.6s finished


precision=[0.76       0.2        0.35416667], recall=[0.1496063  0.71428571 0.23611111], f1=[0.25       0.3125     0.28333333], accuracy=0.29
[*] predict with xgb model
precision=[0.73636364 0.43859649 0.54320988], recall=[0.63779528 0.51020408 0.61111111], f1=[0.6835443  0.47169811 0.5751634 ], accuracy=0.60
rec:['Indy500-2017', 1274, 248, '+:72,0:49,-:127', 0.1975806451612903, 0.2903225806451613, 0.3629032258064516, 0.28225806451612906, 0.3185483870967742, 0.2620967741935484, 0.592741935483871, 0.2862903225806452, 0.6048387096774194]
Testset = Indy500-2018
[*] predict with currank model
precision=[0.        0.1005291 0.       ], recall=[0. 1. 0.], f1=[0.         0.18269231 0.        ], accuracy=0.10
[*] predict with avgrank model
precision=[0.         0.         0.38095238], recall=[0. 0. 1.], f1=[0.         0.         0.55172414], accuracy=0.38
[*] predict with dice model
precision=[0.47706422 0.12       0.32727273], recall=[0.53061224 0.15789474 0.25      ], f1=[0.50241546 0.136363

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.64661654 0.125      0.72916667], recall=[0.87755102 0.05263158 0.48611111], f1=[0.74458874 0.07407407 0.58333333], accuracy=0.65
[*] predict with lsvc model




precision=[0.63888889 0.33333333 0.78571429], recall=[0.93877551 0.05263158 0.45833333], f1=[0.76033058 0.09090909 0.57894737], accuracy=0.67
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.69387755 0.11428571 0.        ], recall=[0.34693878 0.84210526 0.        ], f1=[0.46258503 0.20125786 0.        ], accuracy=0.26
[*] predict with rf model
precision=[0.63970588 1.         0.70588235], recall=[0.8877551  0.10526316 0.5       ], f1=[0.74358974 0.19047619 0.58536585], accuracy=0.66
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.65185185 0.125      0.76086957], recall=[0.89795918 0.05263158 0.48611111], f1=[0.75536481 0.07407407 0.59322034], accuracy=0.66
[*] predict with xgb model
precision=[0.66972477 0.4        0.67142857], recall=[0.74489796 0.21052632 0.65277778], f1=[0.70531401 0.27586207 0.66197183], accuracy=0.66
rec:['Indy500-2018', 1333, 189, '+:72,0:19,-:98', 0.10052910052910052, 0.38095238095238093, 0.3862433862433862, 0.6455026455026455, 0.6666666666666666, 0.26455026455026454, 0.6613756613756614, 0.656084656084656, 0.656084656084656]
Testset = Indy500-2019
[*] predict with currank model
precision=[0.         0.15277778 0.        ], recall=[0. 1. 0.], f1=[0.         0.26506024 0.        ], accuracy=0.15
[*] predict with avgrank model
precision=[0.         0.         0.30555556], recall=[0. 0. 1.], f1=[0.         0.         0.46808511], accuracy=0.31
[*] predict with dice model
precision=[0.59349593 0.33333333 0.34848485], recall=[0.62393162 0.27272727 0.34848485], f1=[0.60833333 0.3  

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.66911765 0.5        0.55555556], recall=[0.77777778 0.12121212 0.60606061], f1=[0.71936759 0.19512195 0.57971014], accuracy=0.62
[*] predict with lsvc model




precision=[0.64179104 0.5        0.525     ], recall=[0.73504274 0.03030303 0.63636364], f1=[0.68525896 0.05714286 0.57534247], accuracy=0.60
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.81578947 0.         0.34831461], recall=[0.26495726 0.         0.93939394], f1=[0.4        0.         0.50819672], accuracy=0.43
[*] predict with rf model
precision=[0.63076923 0.66666667 0.525     ], recall=[0.7008547  0.12121212 0.63636364], f1=[0.66396761 0.20512821 0.57534247], accuracy=0.59
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.65441176 0.42857143 0.53424658], recall=[0.76068376 0.09090909 0.59090909], f1=[0.70355731 0.15       0.56115108], accuracy=0.61
[*] predict with xgb model
precision=[0.65       0.46666667 0.59016393], recall=[0.77777778 0.21212121 0.54545455], f1=[0.70817121 0.29166667 0.56692913], accuracy=0.62
rec:['Indy500-2019', 1306, 216, '+:66,0:33,-:117', 0.1527777777777778, 0.3055555555555556, 0.4861111111111111, 0.625, 0.5972222222222222, 0.4305555555555556, 0.5925925925925926, 0.6064814814814815, 0.6203703703703703]


In [8]:
# Evaluate with Indy500-2018 and Indy500-2019 as the test events.
retdf_oracle = test_20182019()

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
precision=[0.        0.1005291 0.       ], recall=[0. 1. 0.], f1=[0.         0.18269231 0.        ], accuracy=0.10
[*] predict with avgrank model
precision=[0.         0.         0.38095238], recall=[0. 0. 1.], f1=[0.         0.         0.55172414], accuracy=0.38
[*] predict with dice model
precision=[0.47222222 0.12       0.32142857], recall=[0.52040816 0.15789474 0.25      ], f1=[0.49514563 0.13636364 0.28125   ], accuracy=0.38
[*] predict with lr model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.67164179 0.14285714 0.79166667], recall=[0.91836735 0.05263158 0.52777778], f1=[0.77586207 0.07692308 0.63333333], accuracy=0.68
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.62745098 0.         0.86111111], recall=[0.97959184 0.         0.43055556], f1=[0.76494024 0.         0.57407407], accuracy=0.67
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.69791667 0.         0.53763441], recall=[0.68367347 0.         0.69444444], f1=[0.69072165 0.         0.60606061], accuracy=0.62
[*] predict with rf model
precision=[0.66129032 1.         0.66666667], recall=[0.83673469 0.10526316 0.58333333], f1=[0.73873874 0.19047619 0.62222222], accuracy=0.67
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.67910448 0.14285714 0.8125    ], recall=[0.92857143 0.05263158 0.54166667], f1=[0.78448276 0.07692308 0.65      ], accuracy=0.69
[*] predict with xgb model
precision=[0.67889908 0.375      0.671875  ], recall=[0.75510204 0.31578947 0.59722222], f1=[0.71497585 0.34285714 0.63235294], accuracy=0.65
rec:['Indy500-2018', 1117, 189, '+:72,0:19,-:98', 0.10052910052910052, 0.38095238095238093, 0.38095238095238093, 0.6825396825396826, 0.671957671957672, 0.6190476190476191, 0.6666666666666666, 0.6931216931216931, 0.6507936507936508]
Testset = Indy500-2019
[*] predict with currank model
precision=[0.         0.15277778 0.        ], recall=[0. 1. 0.], f1=[0.         0.26506024 0.        ], accuracy=0.15
[*] predict with avgrank model
precision=[0.         0.         0.30555556], recall=[0. 0. 1.], f1=[0.         0.         0.46808511], accuracy=0.31
[*] predict with dice model
precision=[0.59349593 0.3125     0.3442623 ], recall=[0.62393162 0.3030303  0.31818182], f1=[0.60833333 0.30

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.65957447 0.375      0.56716418], recall=[0.79487179 0.09090909 0.57575758], f1=[0.72093023 0.14634146 0.57142857], accuracy=0.62
[*] predict with lsvc model




precision=[0.63414634 0.5        0.66      ], recall=[0.88888889 0.03030303 0.5       ], f1=[0.74021352 0.05714286 0.56896552], accuracy=0.64
[*] predict with lsvcl2 model




precision=[0.55897436 0.27777778 0.66666667], recall=[0.93162393 0.15151515 0.03030303], f1=[0.69871795 0.19607843 0.05797101], accuracy=0.54
[*] predict with rf model
precision=[0.624      0.83333333 0.49411765], recall=[0.66666667 0.15151515 0.63636364], f1=[0.6446281  0.25641026 0.55629139], accuracy=0.58
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.66906475 0.5        0.56521739], recall=[0.79487179 0.12121212 0.59090909], f1=[0.7265625  0.19512195 0.57777778], accuracy=0.63
[*] predict with xgb model
precision=[0.65034965 0.40909091 0.56862745], recall=[0.79487179 0.27272727 0.43939394], f1=[0.71538462 0.32727273 0.4957265 ], accuracy=0.61
rec:['Indy500-2019', 1117, 216, '+:66,0:33,-:117', 0.1527777777777778, 0.3055555555555556, 0.48148148148148145, 0.6203703703703703, 0.6388888888888888, 0.5370370370370371, 0.5787037037037037, 0.6296296296296297, 0.6064814814814815]


In [9]:
# Display the 2018/2019 result table.
retdf_oracle

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lsvc,lsvcl2,rf,lrbias,xgb
0,Indy500-2018,1117,189,"+:72,0:19,-:98",0.100529,0.380952,0.380952,0.68254,0.671958,0.619048,0.666667,0.693122,0.650794
0,Indy500-2019,1117,216,"+:66,0:33,-:117",0.152778,0.305556,0.481481,0.62037,0.638889,0.537037,0.578704,0.62963,0.606481


### test normal without stint_len

In [10]:
# Pick the stage dataset file via the flags below, then load it.
_trim, _include_final, _include_stintlen = 0, True, False

# Each flag contributes one fragment to the file name:
# '1'/'0' for _include_final, '1' or nothing for _include_stintlen.
stint_str = '' if not _include_stintlen else '1'
include_str = '0' if not _include_final else '1'

output_file = f'stage-indy500-2013-2019-end{include_str}{stint_str}-t{_trim}.csv'
suffix = f'indy500-2013-2019-end{include_str}{stint_str}-t{_trim}'

# Read the CSV, replace every missing value with 0, and print the column summary.
stagedata = pd.read_csv(output_file)
stagedata = stagedata.fillna(0)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522 entries, 0 to 1521
Data columns (total 37 columns):
Unnamed: 0                   1522 non-null int64
target                       1522 non-null int64
start_lap                    1522 non-null int64
stint_len                    1522 non-null int64
eventid                      1522 non-null int64
car_number                   1522 non-null int64
stageid                      1522 non-null int64
firststage                   1522 non-null int64
pit_in_caution               1522 non-null int64
start_position               1522 non-null int64
start_rank                   1522 non-null int64
start_rank_ratio             1522 non-null float64
top_pack                     1522 non-null int64
bottom_pack                  1522 non-null int64
average_rank                 1522 non-null float64
average_rank_all             1522 non-null float64
change_in_rank               1522 non-null int64
change_in_rank_all           1522 non-null float64
rat

In [11]:
# Run test_cv() (defined outside this excerpt) and keep the returned table.
# The log printed below shows one run per Indy500 event from 2013 to 2019,
# each event serving once as the test set.
# NOTE(review): presumably reads the global stagedata loaded above - confirm.
df_event = test_cv()

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2013
[*] predict with currank model
precision=[0.         0.14871795 0.        ], recall=[0. 1. 0.], f1=[0.         0.25892857 0.        ], accuracy=0.15
[*] predict with avgrank model
precision=[0.         0.         0.34871795], recall=[0. 0. 1.], f1=[0.         0.         0.51711027], accuracy=0.35
[*] predict with dice model
precision=[0.46902655 0.2173913  0.27118644], recall=[0.54081633 0.17241379 0.23529412], f1=[0.50236967 0.19230769 0.2519685 ], accuracy=0.38
[*] predict with lr model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.62015504 0.33333333 0.6031746 ], recall=[0.81632653 0.03448276 0.55882353], f1=[0.70484581 0.0625     0.58015267], accuracy=0.61
[*] predict with lsvc model




precision=[0.62307692 0.5        0.61904762], recall=[0.82653061 0.03448276 0.57352941], f1=[0.71052632 0.06451613 0.59541985], accuracy=0.62
[*] predict with lsvcl2 model




precision=[0.59398496 0.27272727 0.60784314], recall=[0.80612245 0.10344828 0.45588235], f1=[0.68398268 0.15       0.5210084 ], accuracy=0.58
[*] predict with rf model
precision=[0.62406015 1.         0.61666667], recall=[0.84693878 0.06896552 0.54411765], f1=[0.71861472 0.12903226 0.578125  ], accuracy=0.63
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.62992126 0.4        0.6031746 ], recall=[0.81632653 0.06896552 0.55882353], f1=[0.71111111 0.11764706 0.58015267], accuracy=0.62
[*] predict with xgb model
precision=[0.67226891 0.5        0.61428571], recall=[0.81632653 0.10344828 0.63235294], f1=[0.73732719 0.17142857 0.62318841], accuracy=0.65
rec:['Indy500-2013', 1327, 195, '+:68,0:29,-:98', 0.14871794871794872, 0.3487179487179487, 0.37948717948717947, 0.6102564102564103, 0.6205128205128205, 0.5794871794871795, 0.6256410256410256, 0.6153846153846154, 0.6461538461538462]
Testset = Indy500-2014
[*] predict with currank model
precision=[0.         0.19111111 0.        ], recall=[0. 1. 0.], f1=[0.         0.32089552 0.        ], accuracy=0.19
[*] predict with avgrank model
precision=[0.   0.   0.28], recall=[0. 0. 1.], f1=[0.     0.     0.4375], accuracy=0.28
[*] predict with dice model
precision=[0.51908397 0.07407407 0.29850746], recall=[0.57142857 0.04651163 0.31746032], f1=[0.544      0.05714286 0.30769231], accuracy=0

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.70542636 0.4        0.44186047], recall=[0.76470588 0.09302326 0.6031746 ], f1=[0.73387097 0.1509434  0.51006711], accuracy=0.59
[*] predict with lsvcl2 model




precision=[0.73809524 0.33333333 0.36296296], recall=[0.5210084  0.04651163 0.77777778], f1=[0.61083744 0.08163265 0.49494949], accuracy=0.50
[*] predict with rf model
precision=[0.70542636 0.375      0.43181818], recall=[0.76470588 0.06976744 0.6031746 ], f1=[0.73387097 0.11764706 0.50331126], accuracy=0.59
[*] predict with lrbias model
precision=[0.69924812 0.4        0.46341463], recall=[0.78151261 0.09302326 0.6031746 ], f1=[0.73809524 0.1509434  0.52413793], accuracy=0.60
[*] predict with xgb model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


precision=[0.72321429 0.46666667 0.44897959], recall=[0.68067227 0.1627907  0.6984127 ], f1=[0.7012987  0.24137931 0.54658385], accuracy=0.59
rec:['Indy500-2014', 1297, 225, '+:63,0:43,-:119', 0.19111111111111112, 0.28, 0.4, 0.5911111111111111, 0.5911111111111111, 0.5022222222222222, 0.5866666666666667, 0.6, 0.5866666666666667]
Testset = Indy500-2015
[*] predict with currank model
precision=[0.         0.13333333 0.        ], recall=[0. 1. 0.], f1=[0.         0.23529412 0.        ], accuracy=0.13
[*] predict with avgrank model
precision=[0.         0.         0.37948718], recall=[0. 0. 1.], f1=[0.         0.         0.55018587], accuracy=0.38
[*] predict with dice model
precision=[0.53508772 0.125      0.40350877], recall=[0.64210526 0.11538462 0.31081081], f1=[0.58373206 0.12       0.35114504], accuracy=0.45
[*] predict with lr model
precision=[0.67567568 1.         0.62195122], recall=[0.78947368 0.07692308 0.68918919], f1=[0.72815534 0.14285714 0.65384615], accuracy=0.66
[*] predict

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.67592593 1.         0.61627907], recall=[0.76842105 0.03846154 0.71621622], f1=[0.71921182 0.07407407 0.6625    ], accuracy=0.65
[*] predict with lsvcl2 model




precision=[0.65217391 0.         0.40350877], recall=[0.15789474 0.         0.93243243], f1=[0.25423729 0.         0.56326531], accuracy=0.43
[*] predict with rf model
precision=[0.640625  1.        0.6984127], recall=[0.86315789 0.15384615 0.59459459], f1=[0.73542601 0.26666667 0.64233577], accuracy=0.67
[*] predict with lrbias model
precision=[0.67567568 1.         0.62195122], recall=[0.78947368 0.07692308 0.68918919], f1=[0.72815534 0.14285714 0.65384615], accuracy=0.66
[*] predict with xgb model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


precision=[0.65322581 0.42857143 0.703125  ], recall=[0.85263158 0.11538462 0.60810811], f1=[0.73972603 0.18181818 0.65217391], accuracy=0.66
rec:['Indy500-2015', 1327, 195, '+:74,0:26,-:95', 0.13333333333333333, 0.37948717948717947, 0.4461538461538462, 0.6564102564102564, 0.6512820512820513, 0.4307692307692308, 0.6666666666666666, 0.6564102564102564, 0.6615384615384615]
Testset = Indy500-2016
[*] predict with currank model
precision=[0.         0.12598425 0.        ], recall=[0. 1. 0.], f1=[0.         0.22377622 0.        ], accuracy=0.13
[*] predict with avgrank model
precision=[0.         0.         0.38582677], recall=[0. 0. 1.], f1=[0.         0.         0.55681818], accuracy=0.39
[*] predict with dice model
precision=[0.49342105 0.05714286 0.47761194], recall=[0.60483871 0.0625     0.32653061], f1=[0.54347826 0.05970149 0.38787879], accuracy=0.43
[*] predict with lr model
precision=[0.63414634 0.33333333 0.66666667], recall=[0.83870968 0.09375    0.55102041], f1=[0.72222222 0.146

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.62068966 0.5        0.72222222], recall=[0.87096774 0.125      0.53061224], f1=[0.72483221 0.2        0.61176471], accuracy=0.65
[*] predict with lsvcl2 model




precision=[0.50617284 0.4        0.83333333], recall=[0.99193548 0.0625     0.05102041], f1=[0.67029973 0.10810811 0.09615385], accuracy=0.51
[*] predict with rf model
precision=[0.64071856 0.5        0.7037037 ], recall=[0.86290323 0.09375    0.58163265], f1=[0.73539519 0.15789474 0.63687151], accuracy=0.66
[*] predict with lrbias model
precision=[0.63030303 0.4        0.67088608], recall=[0.83870968 0.125      0.54081633], f1=[0.71972318 0.19047619 0.59887006], accuracy=0.63
[*] predict with xgb model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


precision=[0.6796875  0.41176471 0.62385321], recall=[0.7016129  0.21875    0.69387755], f1=[0.69047619 0.28571429 0.65700483], accuracy=0.64
rec:['Indy500-2016', 1268, 254, '+:98,0:32,-:124', 0.12598425196850394, 0.3858267716535433, 0.42913385826771655, 0.6338582677165354, 0.6456692913385826, 0.5118110236220472, 0.65748031496063, 0.6338582677165354, 0.6377952755905512]
Testset = Indy500-2017
[*] predict with currank model
precision=[0.         0.19758065 0.        ], recall=[0. 1. 0.], f1=[0.         0.32996633 0.        ], accuracy=0.20
[*] predict with avgrank model
precision=[0.         0.         0.29032258], recall=[0. 0. 1.], f1=[0.   0.   0.45], accuracy=0.29
[*] predict with dice model
precision=[0.46575342 0.16666667 0.23611111], recall=[0.53543307 0.10204082 0.23611111], f1=[0.4981685  0.12658228 0.23611111], accuracy=0.36
[*] predict with lr model
precision=[0.76       0.19886364 0.34042553], recall=[0.1496063  0.71428571 0.22222222], f1=[0.25       0.31111111 0.26890756], 

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.75862069 0.20606061 0.42592593], recall=[0.17322835 0.69387755 0.31944444], f1=[0.28205128 0.31775701 0.36507937], accuracy=0.32
[*] predict with lsvcl2 model




precision=[1.         0.22047244 0.42105263], recall=[0.05511811 0.57142857 0.66666667], f1=[0.10447761 0.31818182 0.51612903], accuracy=0.33
[*] predict with rf model
precision=[0.62745098 0.48148148 0.57352941], recall=[0.75590551 0.26530612 0.54166667], f1=[0.68571429 0.34210526 0.55714286], accuracy=0.60
[*] predict with lrbias model
precision=[0.76       0.2        0.35416667], recall=[0.1496063  0.71428571 0.23611111], f1=[0.25       0.3125     0.28333333], accuracy=0.29
[*] predict with xgb model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


precision=[0.73636364 0.43859649 0.54320988], recall=[0.63779528 0.51020408 0.61111111], f1=[0.6835443  0.47169811 0.5751634 ], accuracy=0.60
rec:['Indy500-2017', 1274, 248, '+:72,0:49,-:127', 0.1975806451612903, 0.2903225806451613, 0.3629032258064516, 0.28225806451612906, 0.3185483870967742, 0.3346774193548387, 0.5967741935483871, 0.2862903225806452, 0.6048387096774194]
Testset = Indy500-2018
[*] predict with currank model
precision=[0.        0.1005291 0.       ], recall=[0. 1. 0.], f1=[0.         0.18269231 0.        ], accuracy=0.10
[*] predict with avgrank model
precision=[0.         0.         0.38095238], recall=[0. 0. 1.], f1=[0.         0.         0.55172414], accuracy=0.38
[*] predict with dice model
precision=[0.47706422 0.12       0.32727273], recall=[0.53061224 0.15789474 0.25      ], f1=[0.50241546 0.13636364 0.28346457], accuracy=0.39
[*] predict with lr model
precision=[0.64661654 0.125      0.72916667], recall=[0.87755102 0.05263158 0.48611111], f1=[0.74458874 0.074074

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.63888889 0.33333333 0.78571429], recall=[0.93877551 0.05263158 0.45833333], f1=[0.76033058 0.09090909 0.57894737], accuracy=0.67
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.68686869 0.         0.57777778], recall=[0.69387755 0.         0.72222222], f1=[0.69035533 0.         0.64197531], accuracy=0.63
[*] predict with rf model
precision=[0.64492754 0.66666667 0.72916667], recall=[0.90816327 0.10526316 0.48611111], f1=[0.75423729 0.18181818 0.58333333], accuracy=0.67
[*] predict with lrbias model
precision=[0.65185185 0.125      0.76086957], recall=[0.89795918 0.05263158 0.48611111], f1=[0.75536481 0.07407407 0.59322034], accuracy=0.66
[*] predict with xgb model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


precision=[0.66972477 0.4        0.67142857], recall=[0.74489796 0.21052632 0.65277778], f1=[0.70531401 0.27586207 0.66197183], accuracy=0.66
rec:['Indy500-2018', 1333, 189, '+:72,0:19,-:98', 0.10052910052910052, 0.38095238095238093, 0.3862433862433862, 0.6455026455026455, 0.6666666666666666, 0.6349206349206349, 0.6666666666666666, 0.656084656084656, 0.656084656084656]
Testset = Indy500-2019
[*] predict with currank model
precision=[0.         0.15277778 0.        ], recall=[0. 1. 0.], f1=[0.         0.26506024 0.        ], accuracy=0.15
[*] predict with avgrank model
precision=[0.         0.         0.30555556], recall=[0. 0. 1.], f1=[0.         0.         0.46808511], accuracy=0.31
[*] predict with dice model
precision=[0.59349593 0.33333333 0.34848485], recall=[0.62393162 0.27272727 0.34848485], f1=[0.60833333 0.3        0.34848485], accuracy=0.49
[*] predict with lr model
precision=[0.66911765 0.5        0.55555556], recall=[0.77777778 0.12121212 0.60606061], f1=[0.71936759 0.19512

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.64179104 0.5        0.525     ], recall=[0.73504274 0.03030303 0.63636364], f1=[0.68525896 0.05714286 0.57534247], accuracy=0.60
[*] predict with lsvcl2 model




precision=[0.68831169 0.08333333 0.41732283], recall=[0.45299145 0.03030303 0.8030303 ], f1=[0.54639175 0.04444444 0.5492228 ], accuracy=0.50
[*] predict with rf model
precision=[0.64705882 0.66666667 0.50549451], recall=[0.65811966 0.12121212 0.6969697 ], f1=[0.65254237 0.20512821 0.58598726], accuracy=0.59
[*] predict with lrbias model
precision=[0.65441176 0.42857143 0.53424658], recall=[0.76068376 0.09090909 0.59090909], f1=[0.70355731 0.15       0.56115108], accuracy=0.61
[*] predict with xgb model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


precision=[0.65       0.46666667 0.59016393], recall=[0.77777778 0.21212121 0.54545455], f1=[0.70817121 0.29166667 0.56692913], accuracy=0.62
rec:['Indy500-2019', 1306, 216, '+:66,0:33,-:117', 0.1527777777777778, 0.3055555555555556, 0.4861111111111111, 0.625, 0.5972222222222222, 0.49537037037037035, 0.5879629629629629, 0.6064814814814815, 0.6203703703703703]


In [12]:
# Display the per-event results table returned by test_cv().
df_event

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lsvc,lsvcl2,rf,lrbias,xgb
0,Indy500-2013,1327,195,"+:68,0:29,-:98",0.148718,0.348718,0.379487,0.610256,0.620513,0.579487,0.625641,0.615385,0.646154
0,Indy500-2014,1297,225,"+:63,0:43,-:119",0.191111,0.28,0.4,0.591111,0.591111,0.502222,0.586667,0.6,0.586667
0,Indy500-2015,1327,195,"+:74,0:26,-:95",0.133333,0.379487,0.446154,0.65641,0.651282,0.430769,0.666667,0.65641,0.661538
0,Indy500-2016,1268,254,"+:98,0:32,-:124",0.125984,0.385827,0.429134,0.633858,0.645669,0.511811,0.65748,0.633858,0.637795
0,Indy500-2017,1274,248,"+:72,0:49,-:127",0.197581,0.290323,0.362903,0.282258,0.318548,0.334677,0.596774,0.28629,0.604839
0,Indy500-2018,1333,189,"+:72,0:19,-:98",0.100529,0.380952,0.386243,0.645503,0.666667,0.634921,0.666667,0.656085,0.656085
0,Indy500-2019,1306,216,"+:66,0:33,-:117",0.152778,0.305556,0.486111,0.625,0.597222,0.49537,0.587963,0.606481,0.62037


In [13]:
# Run test_20182019() (defined outside this excerpt) and keep the returned table.
# The log printed below reports two test sets, Indy500-2018 and Indy500-2019,
# with the same training size (1117) for both.
retdf = test_20182019()

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
precision=[0.        0.1005291 0.       ], recall=[0. 1. 0.], f1=[0.         0.18269231 0.        ], accuracy=0.10
[*] predict with avgrank model
precision=[0.         0.         0.38095238], recall=[0. 0. 1.], f1=[0.         0.         0.55172414], accuracy=0.38
[*] predict with dice model
precision=[0.47222222 0.12       0.32142857], recall=[0.52040816 0.15789474 0.25      ], f1=[0.49514563 0.13636364 0.28125   ], accuracy=0.38
[*] predict with lr model
precision=[0.67164179 0.14285714 0.79166667], recall=[0.91836735 0.05263158 0.52777778], f1=[0.77586207 0.07692308 0.63333333], accuracy=0.68
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.62745098 0.         0.86111111], recall=[0.97959184 0.         0.43055556], f1=[0.76494024 0.         0.57407407], accuracy=0.67
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.73529412 0.         0.41935484], recall=[0.25510204 0.         0.90277778], f1=[0.37878788 0.         0.57268722], accuracy=0.48
[*] predict with rf model
precision=[0.64122137 1.         0.67857143], recall=[0.85714286 0.10526316 0.52777778], f1=[0.73362445 0.19047619 0.59375   ], accuracy=0.66
[*] predict with lrbias model
precision=[0.67910448 0.14285714 0.8125    ], recall=[0.92857143 0.05263158 0.54166667], f1=[0.78448276 0.07692308 0.65      ], accuracy=0.69
[*] predict with xgb model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


precision=[0.67889908 0.375      0.671875  ], recall=[0.75510204 0.31578947 0.59722222], f1=[0.71497585 0.34285714 0.63235294], accuracy=0.65
rec:['Indy500-2018', 1117, 189, '+:72,0:19,-:98', 0.10052910052910052, 0.38095238095238093, 0.38095238095238093, 0.6825396825396826, 0.671957671957672, 0.47619047619047616, 0.656084656084656, 0.6931216931216931, 0.6507936507936508]
Testset = Indy500-2019
[*] predict with currank model
precision=[0.         0.15277778 0.        ], recall=[0. 1. 0.], f1=[0.         0.26506024 0.        ], accuracy=0.15
[*] predict with avgrank model
precision=[0.         0.         0.30555556], recall=[0. 0. 1.], f1=[0.         0.         0.46808511], accuracy=0.31
[*] predict with dice model
precision=[0.59349593 0.3125     0.3442623 ], recall=[0.62393162 0.3030303  0.31818182], f1=[0.60833333 0.30769231 0.33070866], accuracy=0.48
[*] predict with lr model
precision=[0.65957447 0.375      0.56716418], recall=[0.79487179 0.09090909 0.57575758], f1=[0.72093023 0.146

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.63414634 0.5        0.66      ], recall=[0.88888889 0.03030303 0.5       ], f1=[0.74021352 0.05714286 0.56896552], accuracy=0.64
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.73529412 0.         0.39864865], recall=[0.42735043 0.         0.89393939], f1=[0.54054054 0.         0.55140187], accuracy=0.50
[*] predict with rf model
precision=[0.632      0.71428571 0.51190476], recall=[0.67521368 0.15151515 0.65151515], f1=[0.65289256 0.25       0.57333333], accuracy=0.59
[*] predict with lrbias model
precision=[0.66906475 0.5        0.56521739], recall=[0.79487179 0.12121212 0.59090909], f1=[0.7265625  0.19512195 0.57777778], accuracy=0.63
[*] predict with xgb model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


precision=[0.65034965 0.40909091 0.56862745], recall=[0.79487179 0.27272727 0.43939394], f1=[0.71538462 0.32727273 0.4957265 ], accuracy=0.61
rec:['Indy500-2019', 1117, 216, '+:66,0:33,-:117', 0.1527777777777778, 0.3055555555555556, 0.48148148148148145, 0.6203703703703703, 0.6388888888888888, 0.5046296296296297, 0.5879629629629629, 0.6296296296296297, 0.6064814814814815]


In [14]:
# Display the 2018/2019 results table returned by test_20182019().
retdf

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lsvc,lsvcl2,rf,lrbias,xgb
0,Indy500-2018,1117,189,"+:72,0:19,-:98",0.100529,0.380952,0.380952,0.68254,0.671958,0.47619,0.656085,0.693122,0.650794
0,Indy500-2019,1117,216,"+:66,0:33,-:117",0.152778,0.305556,0.481481,0.62037,0.638889,0.50463,0.587963,0.62963,0.606481


In [15]:
# Display retdf_oracle (assigned in a cell outside this excerpt) right after retdf,
# so the two tables can be read side by side.
retdf_oracle

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lsvc,lsvcl2,rf,lrbias,xgb
0,Indy500-2018,1117,189,"+:72,0:19,-:98",0.100529,0.380952,0.380952,0.68254,0.671958,0.619048,0.666667,0.693122,0.650794
0,Indy500-2019,1117,216,"+:66,0:33,-:117",0.152778,0.305556,0.481481,0.62037,0.638889,0.537037,0.578704,0.62963,0.606481
