### stage_model_classifier

base: 14./stage_model_classifier_withneighbor-newfeatures

Prediction models: classifiers for the sign of the target (rank change) on the stage dataset.

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import xgboost as xgb


In [3]:
# build the classification models
# Short names of every model that the test_* drivers evaluate; each name is
# resolved to an estimator by get_classifier().
#classifiers = ['currank','avgrank','dice','lr','lrl1','lsvc','lsvcl2','rf','lrbias','xgb']
classifiers = ['currank','avgrank','dice','lr','lsvc','lsvcl2','rf','lrbias','xgb']
# Current train/test split, shared through module globals: the test_* drivers
# assign these and classifier_model()/baseline_model() read them.
train_x, train_y, test_x, test_y = None, None, None, None

def get_classifier(classifier = 'lr'):
    """
    Build an unfitted model for the given short name.

    Recognised names:
        'lsvc'    LinearSVC with L1 penalty
        'lsvcl2'  LinearSVC with L2 penalty
        'rf'      RandomForestClassifier (100 trees, entropy criterion)
        'lr'      LogisticRegression without intercept
        'lrbias'  LogisticRegression with intercept (verbose)
        'lrl1'    LogisticRegression with L1 penalty
        'xgb'     XGBClassifier (gbtree, 600 estimators, depth 6)
        'dice'    RandomDice baseline
        'currank' CurRank baseline
        'avgrank' AverageRank baseline

    Returns the model object, or None when the name is not recognised.
    """
    # every sklearn model below shares the same (currently disabled) class weighting
    class_weight = None

    if classifier == "lsvc":
        # the L1 penalty requires the primal formulation, hence dual=False
        clf = LinearSVC(penalty='l1',dual=False, tol=1e-3, class_weight=class_weight )
    elif classifier == "lsvcl2":
        clf = LinearSVC(penalty='l2', tol=1e-4, class_weight=class_weight)
    elif classifier == 'rf':
        #clf = RandomForestClassifier(n_estimators=100, n_jobs=4,criterion='entropy', min_samples_split=1,class_weight = class_weight)
        clf = RandomForestClassifier(n_estimators=100, n_jobs=-1,criterion='entropy', class_weight = class_weight)
    elif classifier == 'lr':
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = False, verbose = 0)
    elif classifier == 'lrbias':
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = True, verbose = 1)
    elif classifier == 'lrl1':
        # The default 'lbfgs' solver only accepts an L2 (or no) penalty and
        # fails at fit time with penalty='l1'; 'saga' supports L1.
        clf = LogisticRegression(class_weight = class_weight, penalty='l1', solver='saga', n_jobs=-1)
    elif classifier == 'xgb':
        clf = xgb.XGBClassifier(booster = 'gbtree', nthread = -1, subsample = 1,
                                n_estimators = 600, colsample_bytree = 1, max_depth = 6, min_child_weight = 1)
    elif classifier == 'dice':
        clf = RandomDice('1234')
    elif classifier == 'currank':
        clf = CurRank()
    elif classifier == 'avgrank':
        clf = AverageRank()
    else:
        # unknown name: callers receive None
        clf = None

    return clf


class CurRank():
    """
    Baseline model: always predicts label 0 for every sample.
    """
    def __init__(self):
        pass

    def fit(self, x, y):
        # nothing to learn; present only so the class matches the fit/predict protocol
        return None

    def predict(self, test_x):
        # one zero per row of the input
        n_rows = test_x.shape[0]
        return np.array([0] * n_rows)
    
class AverageRank():
    """
    Baseline model: predicts the sign (-1, 0 or 1) of a single feature column.

    col_idx is the position of that column inside each row passed to
    predict(). The default, 13, is the position the original code labels
    'change_in_rank_all' (index 15 of a full data row, i.e. 13 once the first
    two columns are stripped to build test_x).
    NOTE(review): that position depends on the column layout of the loaded
    data -- confirm it, and pass col_idx explicitly if the layout differs.
    """
    def __init__(self, col_idx=13):
        self.col_idx = col_idx

    def fit(self, x, y):
        # nothing to learn; present only so the class matches the fit/predict protocol
        pass

    def predict(self, test_x):
        col = self.col_idx
        # positive -> 1, negative -> -1, anything else -> 0
        signs = [1 if row[col] > 0 else (-1 if row[col] < 0 else 0) for row in test_x]
        return np.array(signs)

class RandomDice():
    """
    Baseline model: draws labels at random, following the label frequencies
    seen in the targets passed to fit().

    self.val holds the distinct labels and self.dist the matching cumulative
    frequencies, so a uniform draw in [0, 1) maps to the first label whose
    cumulative frequency covers it.
    """
    def __init__(self, seed='1234'):
        self.dist = []
        self.val = []
        # seeds the module-level `random` generator used by predict()
        random.seed(seed)

    def fit(self, x, y):
        # Start from a clean slate: without this, a second fit() would leave the
        # previous labels/ratios in front of the new ones and skew predictions.
        self.dist = []
        self.val = []

        total = y.shape[0]
        ratio = 0.
        for val in set(y):
            self.val.append(val)
            # running (cumulative) share of samples carrying the labels seen so far
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)

    def predict(self, test_x):
        pred_y = []
        for _ in test_x:
            dice = random.random()
            # First bucket whose cumulative ratio covers the draw; -1 (the last
            # label) is the fallback when rounding leaves the final ratio below it.
            find_idx = next((idx for idx, ratio in enumerate(self.dist) if dice <= ratio), -1)
            pred_y.append(self.val[find_idx])

        return np.array(pred_y)

def evaluate(test_y, pred_y):
    """
    Print per-class precision, recall and F1 together with the overall
    accuracy of pred_y against test_y, and return the accuracy.
    """
    # average=None keeps one score per class instead of a single aggregate
    per_class_scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    per_class = [scorer(test_y, pred_y, average=None) for scorer in per_class_scorers]
    accuracy = metrics.accuracy_score(test_y, pred_y)
    print('precision=%s, recall=%s, f1=%s, accuracy=%.2f'%(per_class[0], per_class[1], per_class[2], accuracy))
    return accuracy
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    Hold out one event: rows whose 'eventid' equals `eventid` become the test
    set, every other row goes to the training set.

    Returns (train, test, train_x, train_y, test_x, test_y). train/test are the
    full rows as numpy arrays; the *_x arrays are those rows from column 2
    onwards; the *_y arrays hold the sign (-1, 0 or 1) of column 1.
    """
    def to_sign(value):
        if value > 0:
            return 1
        if value < 0:
            return -1
        return 0

    in_train = stagedata['eventid'] != eventid
    in_test = stagedata['eventid'] == eventid
    train = stagedata[in_train].to_numpy()
    test = stagedata[in_test].to_numpy()

    # column 1 carries the raw target; columns 2.. are used as features
    train_x, test_x = train[:, 2:], test[:, 2:]
    train_y = np.array([to_sign(v) for v in train[:, 1]])
    test_y = np.array([to_sign(v) for v in test[:, 1]])

    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    Split on the 'stageid' column: rows with stageid <= `stageid` form the
    training set, rows with a larger stageid form the test set.

    Returns (train, test, train_x, train_y, test_x, test_y). train/test are the
    full rows as numpy arrays; the *_x arrays are those rows from column 2
    onwards; the *_y arrays hold the sign (-1, 0 or 1) of column 1.
    """
    def to_sign(value):
        if value > 0:
            return 1
        if value < 0:
            return -1
        return 0

    early = stagedata['stageid'] <= stageid
    late = stagedata['stageid'] > stageid
    train = stagedata[early].to_numpy()
    test = stagedata[late].to_numpy()

    # column 1 carries the raw target; columns 2.. are used as features
    train_x, test_x = train[:, 2:], test[:, 2:]
    train_y = np.array([to_sign(v) for v in train[:, 1]])
    test_y = np.array([to_sign(v) for v in test[:, 1]])

    return train, test, train_x, train_y, test_x, test_y


### baseline
def baseline_model():
    """
    Score two naive baselines on the current global test split.

    Reads the module globals test_x and test_y. Returns (score1, score2):
    the accuracy of always predicting 0, and the accuracy of predicting the
    sign of one feature column.
    """
    #1. predict with current rank, rankchg = 0
    print('[*] predict with current rank, rankchg = 0')
    pred_y_simple = np.zeros_like(test_y)
    score1 = evaluate(test_y, pred_y_simple)

    #2. predict with average rankchg (change_in_rank_all):idx = 15
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    # The full row matrix `test` is never published as a module global (only
    # train_x/train_y/test_x/test_y are), so reading it here raised NameError.
    # test_x is test[:, 2:], hence full-row index 15 is index 13 in test_x.
    change_in_rank_all = test_x[:,13]
    pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in change_in_rank_all])
    score2 = evaluate(test_y, pred_y_avg)
    return score1, score2

def classifier_model(name='lr'):
    """
    Fit the model registered under `name` on the global training split
    (train_x, train_y) and return its accuracy on the global test split
    (test_x, test_y).
    """
    print('[*] predict with %s model'%name)
    model = get_classifier(name)
    model.fit(train_x, train_y)

    # evaluate() prints the per-class metrics and hands back the accuracy
    return evaluate(test_y, model.predict(test_x))

In [4]:
def test_20182019():
    """
    Evaluate every model in `classifiers` with Indy500-2018 and Indy500-2019
    as test events.

    For each of the two events, the other one is dropped from the data, the
    event itself becomes the test set and all remaining rows the training set.
    One result row per test event is collected (run id, train/test sizes, test
    label distribution, accuracy per classifier).

    Reads the module globals stagedata, classifiers and suffix; rebinds the
    globals train_x, train_y, test_x, test_y. Writes the table to
    'stint_classifier_result_<suffix>.csv' and returns it as a DataFrame.

    Assumes the 'eventid' column numbers the events by their position in
    `years` (0 for 2013 ... 6 for 2019) -- verify against the data.
    """
    global train_x, train_y, test_x, test_y

    ### fix train
    cols = ['runid','trainsize','testsize','testdistribution']
    cols.extend(classifiers)
    print('cols:%s'%cols)
    retdf = pd.DataFrame([],columns=cols)

    years = ['2013','2014','2015','2016','2017','2018','2019']
    eventsname = [f'Indy500-{x}' for x in years]
    events_id={key:idx for idx, key in enumerate(eventsname)}

    # (event used as test set, event excluded from the data entirely)
    folds = [('Indy500-2018', 'Indy500-2019'),
             ('Indy500-2019', 'Indy500-2018')]

    for test_name, ignore_name in folds:
        eventid = events_id[test_name]
        ignore_eventid = events_id[ignore_name]
        fold_data = stagedata[stagedata['eventid']!=ignore_eventid]

        print('Testset = %s'%eventsname[eventid])

        # Refresh all four globals on every fold: classifier_model() reads them,
        # so the training data must always belong to the fold being evaluated.
        _train, _test, train_x, train_y, test_x, test_y = split_by_eventid(fold_data, eventid)
        test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

        #record
        rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

        # one accuracy per classifier, in the order of `classifiers`
        acc = [classifier_model(clf) for clf in classifiers]

        rec.extend(acc)
        print('rec:%s'%rec)

        #new df
        df = pd.DataFrame([rec],columns=cols)
        retdf = pd.concat([retdf, df])

    retdf.to_csv(f'stint_classifier_result_{suffix}.csv', float_format='%.3f')
    return retdf

def test_cv():
    """
    Leave-one-event-out evaluation of every model in `classifiers`.

    Each distinct value of stagedata['eventid'] is used once as the test set
    while all other events form the training set. One result row per event is
    collected (run id, train/test sizes, test label distribution, accuracy per
    classifier).

    Reads the module globals stagedata, classifiers and suffix; rebinds the
    globals train_x, train_y, test_x, test_y. Writes the table to
    'crossvalid_stagedata_classifier_<suffix>.csv' and returns it as a
    DataFrame.

    Each eventid is used as an index into the list of event names, so the ids
    are assumed to be 0..6 in the order of `years` -- verify against the data.
    """
    global train_x, train_y, test_x, test_y

    cols = ['runid','trainsize','testsize','testdistribution']
    cols.extend(classifiers)
    print('cols:%s'%cols)
    retdf = pd.DataFrame([],columns=cols)

    years = ['2013','2014','2015','2016','2017','2018','2019']
    eventsname = [f'Indy500-{x}' for x in years]

    # Sorted so the folds (and the rows of the result table) come out in a
    # guaranteed order; raw set iteration order is an implementation detail.
    events = sorted(set(stagedata['eventid']))

    for eventid in events:
        print('Testset = %s'%eventsname[eventid])

        _train, _test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
        test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

        #record
        rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

        # one accuracy per classifier, in the order of `classifiers`
        acc = [classifier_model(clf) for clf in classifiers]

        rec.extend(acc)
        print('rec:%s'%rec)

        #new df
        df = pd.DataFrame([rec],columns=cols)
        retdf = pd.concat([retdf, df])

    retdf.to_csv('crossvalid_stagedata_classifier_%s.csv'%suffix, float_format='%.3f')
    #df_event = retdf
    return retdf

### test oracle with stintlen

In [5]:
# load data
# The three settings below are only used here to select which pre-generated
# CSV variant is read; they are encoded into the file name.
# NOTE(review): their exact meaning comes from the script that produced the
# CSV files -- confirm there.
_trim = 0
_include_final = True
_include_stintlen = True
include_str = '1' if _include_final else '0'
stint_str = '1' if _include_stintlen else ''
# suffix is reused by test_cv()/test_20182019() to name their result files
suffix = f'indy500-2013-2019-end{include_str}{stint_str}-t{_trim}'
output_file = f'shortterm-indy500-2013-2019-end{include_str}{stint_str}-t{_trim}.csv'
stagedata = pd.read_csv(output_file)

# missing values are replaced with 0 before any model sees the data
stagedata.fillna(0, inplace=True)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31487 entries, 0 to 31486
Data columns (total 37 columns):
Unnamed: 0                   31487 non-null int64
target                       31487 non-null int64
start_lap                    31487 non-null int64
stint_len                    31487 non-null int64
eventid                      31487 non-null int64
car_number                   31487 non-null int64
stageid                      31487 non-null int64
firststage                   31487 non-null int64
pit_in_caution               31487 non-null int64
start_position               31487 non-null int64
start_rank                   31487 non-null int64
start_rank_ratio             31487 non-null float64
top_pack                     31487 non-null int64
bottom_pack                  31487 non-null int64
average_rank                 31487 non-null float64
average_rank_all             31487 non-null float64
change_in_rank               31487 non-null int64
change_in_rank_all           31487 

In [6]:
# peek at the first five rows of the loaded data
stagedata.head(5)

Unnamed: 0.1,Unnamed: 0,target,start_lap,stint_len,eventid,car_number,stageid,firststage,pit_in_caution,start_position,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
0,0,0,0,42,0,1,0,0,0,3,...,0.0,0,0,0.0,0,0,0,0,0,0
1,1,0,1,42,0,1,1,1,0,3,...,25.236722,40,10,62.0677,0,0,0,0,0,0
2,2,0,2,42,0,1,2,1,0,3,...,25.771546,40,11,62.0677,0,0,0,0,0,0
3,3,0,3,42,0,1,3,1,0,3,...,26.200225,40,12,62.0677,0,0,0,0,0,0
4,4,0,4,42,0,1,4,1,0,3,...,26.110915,40,13,62.0677,0,0,0,0,0,0


In [7]:
# leave-one-event-out run over every event; test_cv() also writes the table to CSV
df_event = test_cv()

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2013
[*] predict with currank model
precision=[0.         0.14871795 0.        ], recall=[0. 1. 0.], f1=[0.         0.25892857 0.        ], accuracy=0.15
[*] predict with avgrank model
precision=[0.40425532 0.33333333 0.22972973], recall=[0.3877551  0.31034483 0.25      ], f1=[0.39583333 0.32142857 0.23943662], accuracy=0.33
[*] predict with dice model
precision=[0.46902655 0.2173913  0.27118644], recall=[0.54081633 0.17241379 0.23529412], f1=[0.50236967 0.19230769 0.2519685 ], accuracy=0.38
[*] predict with lr model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.61538462 0.5        0.6031746 ], recall=[0.81632653 0.03448276 0.55882353], f1=[0.70175439 0.06451613 0.58015267], accuracy=0.61
[*] predict with lsvc model




precision=[0.625 0.5   0.6  ], recall=[0.81632653 0.03448276 0.57352941], f1=[0.7079646  0.06451613 0.58646617], accuracy=0.62
[*] predict with lsvcl2 model




precision=[0.56441718 0.25       0.57142857], recall=[0.93877551 0.03448276 0.23529412], f1=[0.70498084 0.06060606 0.33333333], accuracy=0.56
[*] predict with rf model
precision=[0.6259542  0.66666667 0.60655738], recall=[0.83673469 0.06896552 0.54411765], f1=[0.71615721 0.125      0.57364341], accuracy=0.62
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.62015504 0.5        0.609375  ], recall=[0.81632653 0.03448276 0.57352941], f1=[0.70484581 0.06451613 0.59090909], accuracy=0.62
[*] predict with xgb model
precision=[0.66393443 0.5        0.60869565], recall=[0.82653061 0.06896552 0.61764706], f1=[0.73636364 0.12121212 0.61313869], accuracy=0.64
rec:['Indy500-2013', 1327, 195, '+:68,0:29,-:98', 0.14871794871794872, 0.3282051282051282, 0.37948717948717947, 0.6102564102564103, 0.6153846153846154, 0.558974358974359, 0.6205128205128205, 0.6153846153846154, 0.6410256410256411]
Testset = Indy500-2014
[*] predict with currank model
precision=[0.         0.19111111 0.        ], recall=[0. 1. 0.], f1=[0.         0.32089552 0.        ], accuracy=0.19
[*] predict with avgrank model
precision=[0.47058824 0.24242424 0.2739726 ], recall=[0.47058824 0.18604651 0.31746032], f1=[0.47058824 0.21052632 0.29411765], accuracy=0.37
[*] predict with dice model
precision=[0.51908397 0.07407407 0.29850746], recall=[0.57142857 0.04651163 0.3174603

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.70967742 0.41666667 0.4494382 ], recall=[0.7394958  0.11627907 0.63492063], f1=[0.72427984 0.18181818 0.52631579], accuracy=0.59
[*] predict with lsvc model




precision=[0.7109375  0.4        0.44827586], recall=[0.76470588 0.09302326 0.61904762], f1=[0.73684211 0.1509434  0.52      ], accuracy=0.60
[*] predict with lsvcl2 model




precision=[0.86666667 0.2038835  0.42056075], recall=[0.1092437  0.48837209 0.71428571], f1=[0.19402985 0.28767123 0.52941176], accuracy=0.35
[*] predict with rf model
precision=[0.70769231 0.8        0.4       ], recall=[0.77310924 0.09302326 0.57142857], f1=[0.73895582 0.16666667 0.47058824], accuracy=0.59
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.712      0.41666667 0.45454545], recall=[0.74789916 0.11627907 0.63492063], f1=[0.7295082  0.18181818 0.52980132], accuracy=0.60
[*] predict with xgb model
precision=[0.66129032 0.26666667 0.40697674], recall=[0.68907563 0.09302326 0.55555556], f1=[0.67489712 0.13793103 0.46979866], accuracy=0.54
rec:['Indy500-2014', 1297, 225, '+:63,0:43,-:119', 0.19111111111111112, 0.37333333333333335, 0.4, 0.5911111111111111, 0.5955555555555555, 0.3511111111111111, 0.5866666666666667, 0.5955555555555555, 0.5377777777777778]
Testset = Indy500-2015
[*] predict with currank model
precision=[0.         0.13333333 0.        ], recall=[0. 1. 0.], f1=[0.         0.23529412 0.        ], accuracy=0.13
[*] predict with avgrank model
precision=[0.27472527 0.2173913  0.24691358], recall=[0.26315789 0.19230769 0.27027027], f1=[0.2688172  0.20408163 0.25806452], accuracy=0.26
[*] predict with dice model
precision=[0.53508772 0.125      0.40350877], recall=[0.64210526 0.11538462 0.31081081], f1=[0.583

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.66371681 1.         0.625     ], recall=[0.78947368 0.07692308 0.67567568], f1=[0.72115385 0.14285714 0.64935065], accuracy=0.65
[*] predict with lsvc model




precision=[0.67924528 1.         0.60227273], recall=[0.75789474 0.03846154 0.71621622], f1=[0.71641791 0.07407407 0.65432099], accuracy=0.65
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.60447761 0.         0.60655738], recall=[0.85263158 0.         0.5       ], f1=[0.70742358 0.         0.54814815], accuracy=0.61
[*] predict with rf model
precision=[0.65517241 0.66666667 0.67105263], recall=[0.8        0.07692308 0.68918919], f1=[0.72037915 0.13793103 0.68      ], accuracy=0.66
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.66371681 1.         0.625     ], recall=[0.78947368 0.07692308 0.67567568], f1=[0.72115385 0.14285714 0.64935065], accuracy=0.65
[*] predict with xgb model
precision=[0.68032787 0.5        0.71641791], recall=[0.87368421 0.11538462 0.64864865], f1=[0.76497696 0.1875     0.68085106], accuracy=0.69
rec:['Indy500-2015', 1327, 195, '+:74,0:26,-:95', 0.13333333333333333, 0.2564102564102564, 0.4461538461538462, 0.6512820512820513, 0.6461538461538462, 0.6051282051282051, 0.6615384615384615, 0.6512820512820513, 0.6871794871794872]
Testset = Indy500-2016
[*] predict with currank model
precision=[0.         0.12598425 0.        ], recall=[0. 1. 0.], f1=[0.         0.22377622 0.        ], accuracy=0.13
[*] predict with avgrank model
precision=[0.42519685 0.25       0.31313131], recall=[0.43548387 0.21875    0.31632653], f1=[0.43027888 0.23333333 0.31472081], accuracy=0.36
[*] predict with dice model
precision=[0.49342105 0.05714286 0.47761194], recall=[0.60483871 0.0625     0.3265306

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.625      0.38461538 0.69863014], recall=[0.84677419 0.15625    0.52040816], f1=[0.71917808 0.22222222 0.59649123], accuracy=0.63
[*] predict with lsvc model




precision=[0.61235955 0.66666667 0.71232877], recall=[0.87903226 0.0625     0.53061224], f1=[0.7218543  0.11428571 0.60818713], accuracy=0.64
[*] predict with lsvcl2 model




precision=[1.    0.124 0.5  ], recall=[0.01612903 0.96875    0.01020408], f1=[0.03174603 0.21985816 0.02      ], accuracy=0.13
[*] predict with rf model
precision=[0.65853659 0.42857143 0.71084337], recall=[0.87096774 0.09375    0.60204082], f1=[0.75       0.15384615 0.6519337 ], accuracy=0.67
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.62650602 0.42857143 0.68918919], recall=[0.83870968 0.1875     0.52040816], f1=[0.71724138 0.26086957 0.59302326], accuracy=0.63
[*] predict with xgb model
precision=[0.69444444 0.4375     0.69148936], recall=[0.80645161 0.21875    0.66326531], f1=[0.74626866 0.29166667 0.67708333], accuracy=0.68
rec:['Indy500-2016', 1268, 254, '+:98,0:32,-:124', 0.12598425196850394, 0.36220472440944884, 0.42913385826771655, 0.6338582677165354, 0.6417322834645669, 0.13385826771653545, 0.6692913385826772, 0.6338582677165354, 0.6771653543307087]
Testset = Indy500-2017
[*] predict with currank model
precision=[0.         0.19758065 0.        ], recall=[0. 1. 0.], f1=[0.         0.32996633 0.        ], accuracy=0.20
[*] predict with avgrank model
precision=[0.37931034 0.23404255 0.18823529], recall=[0.34645669 0.2244898  0.22222222], f1=[0.36213992 0.22916667 0.20382166], accuracy=0.29
[*] predict with dice model
precision=[0.46575342 0.16666667 0.23611111], recall=[0.53543307 0.10204082 0.236

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.74074074 0.19774011 0.34090909], recall=[0.15748031 0.71428571 0.20833333], f1=[0.25974026 0.30973451 0.25862069], accuracy=0.28
[*] predict with lsvc model




precision=[0.74074074 0.20118343 0.40384615], recall=[0.15748031 0.69387755 0.29166667], f1=[0.25974026 0.31192661 0.33870968], accuracy=0.30
[*] predict with lsvcl2 model




precision=[0.54347826 0.4        0.76923077], recall=[0.98425197 0.04081633 0.13888889], f1=[0.70028011 0.07407407 0.23529412], accuracy=0.55
[*] predict with rf model
precision=[0.62758621 0.47222222 0.53731343], recall=[0.71653543 0.34693878 0.5       ], f1=[0.66911765 0.4        0.51798561], accuracy=0.58
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.77777778 0.19883041 0.34      ], recall=[0.16535433 0.69387755 0.23611111], f1=[0.27272727 0.30909091 0.27868852], accuracy=0.29
[*] predict with xgb model
precision=[0.73636364 0.47272727 0.55421687], recall=[0.63779528 0.53061224 0.63888889], f1=[0.6835443  0.5        0.59354839], accuracy=0.62
rec:['Indy500-2017', 1274, 248, '+:72,0:49,-:127', 0.1975806451612903, 0.2862903225806452, 0.3629032258064516, 0.28225806451612906, 0.3024193548387097, 0.5524193548387096, 0.5806451612903226, 0.2903225806451613, 0.6169354838709677]
Testset = Indy500-2018
[*] predict with currank model
precision=[0.        0.1005291 0.       ], recall=[0. 1. 0.], f1=[0.         0.18269231 0.        ], accuracy=0.10
[*] predict with avgrank model
precision=[0.38947368 0.29411765 0.23376623], recall=[0.37755102 0.26315789 0.25      ], f1=[0.38341969 0.27777778 0.24161074], accuracy=0.32
[*] predict with dice model
precision=[0.47706422 0.12       0.32727273], recall=[0.53061224 0.15789474 0.25      ]

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.65648855 0.22222222 0.73469388], recall=[0.87755102 0.10526316 0.5       ], f1=[0.7510917  0.14285714 0.59504132], accuracy=0.66
[*] predict with lsvc model




precision=[0.64137931 0.33333333 0.80487805], recall=[0.94897959 0.05263158 0.45833333], f1=[0.7654321  0.09090909 0.5840708 ], accuracy=0.67
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.53551913 0.         1.        ], recall=[1.         0.         0.08333333], f1=[0.6975089  0.         0.15384615], accuracy=0.55
[*] predict with rf model
precision=[0.65116279 0.25       0.69642857], recall=[0.85714286 0.05263158 0.54166667], f1=[0.74008811 0.08695652 0.609375  ], accuracy=0.66
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.65151515 0.2        0.76595745], recall=[0.87755102 0.10526316 0.5       ], f1=[0.74782609 0.13793103 0.60504202], accuracy=0.66
[*] predict with xgb model
precision=[0.71134021 0.42857143 0.65384615], recall=[0.70408163 0.31578947 0.70833333], f1=[0.70769231 0.36363636 0.68      ], accuracy=0.67
rec:['Indy500-2018', 1333, 189, '+:72,0:19,-:98', 0.10052910052910052, 0.31746031746031744, 0.3862433862433862, 0.656084656084656, 0.671957671957672, 0.5502645502645502, 0.656084656084656, 0.656084656084656, 0.6666666666666666]
Testset = Indy500-2019
[*] predict with currank model
precision=[0.         0.15277778 0.        ], recall=[0. 1. 0.], f1=[0.         0.26506024 0.        ], accuracy=0.15
[*] predict with avgrank model
precision=[0.46428571 0.34482759 0.24      ], recall=[0.44444444 0.3030303  0.27272727], f1=[0.45414847 0.32258065 0.25531915], accuracy=0.37
[*] predict with dice model
precision=[0.59349593 0.33333333 0.34848485], recall=[0.62393162 0.27272727 0.34848485],

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.66666667 0.44444444 0.54666667], recall=[0.75213675 0.12121212 0.62121212], f1=[0.70682731 0.19047619 0.58156028], accuracy=0.62
[*] predict with lsvc model




precision=[0.65384615 0.66666667 0.51807229], recall=[0.72649573 0.06060606 0.65151515], f1=[0.68825911 0.11111111 0.57718121], accuracy=0.60
[*] predict with lsvcl2 model




precision=[0.64705882 0.33333333 0.625     ], recall=[0.84615385 0.15151515 0.45454545], f1=[0.73333333 0.20833333 0.52631579], accuracy=0.62
[*] predict with rf model
precision=[0.63414634 0.66666667 0.51724138], recall=[0.66666667 0.12121212 0.68181818], f1=[0.65       0.20512821 0.58823529], accuracy=0.59
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.66412214 0.5        0.53246753], recall=[0.74358974 0.12121212 0.62121212], f1=[0.7016129  0.19512195 0.57342657], accuracy=0.61
[*] predict with xgb model
precision=[0.63909774 0.5        0.53623188], recall=[0.72649573 0.21212121 0.56060606], f1=[0.68       0.29787234 0.54814815], accuracy=0.60
rec:['Indy500-2019', 1306, 216, '+:66,0:33,-:117', 0.1527777777777778, 0.37037037037037035, 0.4861111111111111, 0.6157407407407407, 0.6018518518518519, 0.6203703703703703, 0.5879629629629629, 0.6111111111111112, 0.5972222222222222]


In [7]:
# evaluate 2018 and 2019 as test events, each with the other excluded from the data
retdf_oracle = test_20182019()

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
precision=[0.         0.54255076 0.        ], recall=[0. 1. 0.], f1=[0.         0.70344624 0.        ], accuracy=0.54
[*] predict with avgrank model
precision=[0.         0.         0.11498973], recall=[0. 0. 1.], f1=[0.         0.         0.20626151], accuracy=0.11
[*] predict with dice model
precision=[0.33885667 0.54286843 0.1255814 ], recall=[0.27248501 0.57779647 0.16071429], f1=[0.30206795 0.55978814 0.14099217], accuracy=0.43
[*] predict with lr model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.40909091 0.5677675  0.38023952], recall=[0.07195203 0.90370059 0.25198413], f1=[0.1223796  0.69738764 0.30310263], accuracy=0.54
[*] predict with lsvc model




precision=[0.408      0.55455868 0.66666667], recall=[0.0679547  0.96173255 0.01190476], f1=[0.11650485 0.70347585 0.02339181], accuracy=0.55
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.35080548 0.77232143 0.        ], recall=[0.97201865 0.07275021 0.        ], f1=[0.5155477  0.13297463 0.        ], accuracy=0.37
[*] predict with rf model
precision=[0.4743083  0.59348255 0.33633634], recall=[0.31978681 0.75820017 0.22222222], f1=[0.38201353 0.66580502 0.26762246], accuracy=0.55
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.1s finished


precision=[0.41637011 0.5670266  0.39344262], recall=[0.07794803 0.90538267 0.23809524], f1=[0.13131313 0.69732794 0.29666255], accuracy=0.55
[*] predict with xgb model
precision=[0.44160178 0.59444048 0.25368732], recall=[0.26449034 0.70142977 0.34126984], f1=[0.33083333 0.64351852 0.29103215], accuracy=0.51
rec:['Indy500-2018', 22403, 4383, '+:504,0:2378,-:1501', 0.542550764316678, 0.11498973305954825, 0.4252794889345197, 0.5439196897102441, 0.5464293862651152, 0.3723477070499658, 0.5464293862651152, 0.5452886151038102, 0.5103810175678759]
Testset = Indy500-2019
[*] predict with currank model
precision=[0.         0.63667305 0.        ], recall=[0. 1. 0.], f1=[0.         0.77800884 0.        ], accuracy=0.64
[*] predict with avgrank model
precision=[0.         0.         0.10082961], recall=[0. 0. 1.], f1=[0.         0.         0.18318841], accuracy=0.10
[*] predict with dice model
precision=[0.26214341 0.63874539 0.11527378], recall=[0.27552674 0.57834948 0.16877637], f1=[0.26866851

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.41635688 0.65001141 0.63265306], recall=[0.09076175 0.95188774 0.06540084], f1=[0.14903526 0.77250542 0.11854685], accuracy=0.64
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.45454545 0.65085432 0.        ], recall=[0.09319287 0.96725693 0.        ], f1=[0.15467384 0.77812122 0.        ], accuracy=0.64
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.         0.63667305 0.        ], recall=[0. 1. 0.], f1=[0.         0.77800884 0.        ], accuracy=0.64
[*] predict with rf model
precision=[0.35765896 0.72582287 0.41621622], recall=[0.40113452 0.71466756 0.32489451], f1=[0.37815126 0.72020202 0.36492891], accuracy=0.59
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    1.1s finished


precision=[0.41366906 0.64961169 0.62222222], recall=[0.09319287 0.95021717 0.05907173], f1=[0.1521164  0.77167277 0.10789981], accuracy=0.64
[*] predict with xgb model
precision=[0.32323997 0.70531915 0.325     ], recall=[0.34602917 0.66455062 0.38396624], f1=[0.33424658 0.68432823 0.35203095], accuracy=0.55
rec:['Indy500-2019', 22403, 4701, '+:474,0:2993,-:1234', 0.6366730482875984, 0.10082961072112316, 0.45756222080408426, 0.6364603275898745, 0.6402893001489045, 0.6366730482875984, 0.5930653052542012, 0.635396724101255, 0.5526483726866624]


In [8]:
# display the result table returned by test_20182019()
retdf_oracle

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lsvc,lsvcl2,rf,lrbias,xgb
0,Indy500-2018,22403,4383,"+:504,0:2378,-:1501",0.542551,0.11499,0.425279,0.54392,0.546429,0.372348,0.546429,0.545289,0.510381
0,Indy500-2019,22403,4701,"+:474,0:2993,-:1234",0.636673,0.10083,0.457562,0.63646,0.640289,0.636673,0.593065,0.635397,0.552648


In [11]:
# Rebuild the event-name -> event-id mapping at module level (the same
# construction used inside the test_* functions) to inspect the 2018 split.
events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(eventsname)}
eventid = events_id['Indy500-2018']
ignore_eventid = events_id['Indy500-2019']
# every row except those of the 2019 event
stdata_2018 = stagedata[stagedata['eventid']!=ignore_eventid]
# rows of the 2018 event only
test_2018 =   stagedata[stagedata['eventid']==events_id['Indy500-2018']]

In [12]:
# display the rows of the 2018 event
test_2018

Unnamed: 0.1,Unnamed: 0,target,start_lap,stint_len,eventid,car_number,stageid,firststage,pit_in_caution,start_position,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
22403,22403,0,0,42,5,1,0,0,0,3,...,0.000000,0,0,0.00000,0,0,0,0,0,0
22404,22404,0,1,42,5,1,1,1,0,3,...,8.567336,40,9,61.12735,0,0,0,0,0,0
22405,22405,0,2,42,5,1,2,1,0,3,...,8.473138,40,10,61.12735,0,0,0,0,0,0
22406,22406,0,3,42,5,1,3,1,0,3,...,8.380925,40,11,61.12735,0,0,0,0,0,0
22407,22407,0,4,42,5,1,4,1,0,3,...,8.291206,40,12,61.12735,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26781,26781,-2,154,42,5,98,154,1,0,10,...,22.591369,40,24,59.99830,0,0,0,0,0,0
26782,26782,-4,155,42,5,98,155,1,0,10,...,22.545040,40,25,59.99830,0,0,0,0,0,0
26783,26783,-2,156,42,5,98,156,1,0,10,...,22.501341,40,26,59.99830,0,0,0,0,0,0
26784,26784,0,157,42,5,98,157,1,0,10,...,22.460143,40,27,59.99830,0,0,0,0,0,0


In [13]:
# number of 2018 rows whose target is exactly 0
len(test_2018[test_2018['target']==0])

2378

In [14]:
# 2018 rows belonging to car number 12
test_2018[test_2018['car_number']==12]

Unnamed: 0.1,Unnamed: 0,target,start_lap,stint_len,eventid,car_number,stageid,firststage,pit_in_caution,start_position,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
23312,23312,0,0,42,5,12,0,0,0,6,...,0.000000,0,0,0.00000,0,0,0,0,0,0
23313,23313,0,1,42,5,12,1,1,0,6,...,8.716711,40,9,61.85245,0,0,0,0,0,0
23314,23314,0,2,42,5,12,2,1,0,6,...,8.619699,40,10,61.85245,0,0,0,0,0,0
23315,23315,0,3,42,5,12,3,1,0,6,...,8.527553,40,11,61.85245,0,0,0,0,0,0
23316,23316,0,4,42,5,12,4,1,0,6,...,8.437718,40,12,61.85245,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23466,23466,-2,154,42,5,12,154,1,0,6,...,23.131084,40,23,58.78955,0,0,0,0,0,0
23467,23467,-2,155,42,5,12,155,1,0,6,...,23.087297,40,24,58.78955,0,0,0,0,0,0
23468,23468,0,156,42,5,12,156,1,0,6,...,23.049108,40,25,58.78955,0,0,0,0,0,0
23469,23469,0,157,42,5,12,157,1,0,6,...,23.008583,40,26,58.78955,0,0,0,0,0,0
