### stage_model_classifier

base: 14./stage_model_classifier_withneighbor-newfeatures

Prediction models: classifiers that predict the sign of the target on the stage dataset.

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import xgboost as xgb


In [3]:
# build classification models
# Short names of the models evaluated below; each name is resolved by get_classifier().
# Variant of the list that also includes 'lrl1' (kept for reference, not active):
#classifiers = ['currank','avgrank','dice','lr','lrl1','lsvc','lsvcl2','rf','lrbias','xgb']
classifiers = ['currank','avgrank','dice','lr','lsvc','lsvcl2','rf','lrbias','xgb']
def get_classifier(classifier = 'lr'):
    """
    Build an unfitted model for a short model name.

    Supported names:
        'lsvc'    LinearSVC with L1 penalty
        'lsvcl2'  LinearSVC with L2 penalty
        'rf'      RandomForestClassifier (100 trees, entropy criterion)
        'lr'      LogisticRegression without intercept
        'lrbias'  LogisticRegression with intercept (verbose)
        'lrl1'    LogisticRegression with L1 penalty
        'xgb'     XGBClassifier (gbtree, 600 estimators, depth 6)
        'dice'    RandomDice baseline
        'currank' CurRank baseline
        'avgrank' AverageRank baseline

    Returns the model object (it exposes fit/predict), or None when the
    name is not recognised.
    """
    # No class re-weighting is applied; kept as a single knob shared by all sklearn models.
    class_weight = None

    if classifier == "lsvc":
        # the L1 penalty requires the primal formulation, hence dual=False
        clf = LinearSVC(penalty='l1',dual=False, tol=1e-3, class_weight=class_weight )
    elif classifier == "lsvcl2":
        clf = LinearSVC(penalty='l2', tol=1e-4, class_weight=class_weight)
    elif classifier == 'rf':
        clf = RandomForestClassifier(n_estimators=100, n_jobs=-1,criterion='entropy', class_weight = class_weight)
    elif classifier == 'lr':
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = False, verbose = 0)
    elif classifier == 'lrbias':
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = True, verbose = 1)
    elif classifier == 'lrl1':
        # The default solver does not support the L1 penalty, so pick one that does.
        # n_jobs is dropped here because liblinear does not use it.
        clf = LogisticRegression(class_weight = class_weight, penalty='l1', solver='liblinear')
    elif classifier == 'xgb':
        clf = xgb.XGBClassifier(booster = 'gbtree', nthread = -1, subsample = 1,
                                n_estimators = 600, colsample_bytree = 1, max_depth = 6, min_child_weight = 1)
    elif classifier == 'dice':
        clf = RandomDice('1234')
    elif classifier == 'currank':
        clf = CurRank()
    elif classifier == 'avgrank':
        clf = AverageRank()
    else:
        # unknown name: callers get None back
        clf = None

    return clf


class CurRank():
    """
    Baseline that always predicts label 0, i.e. the current rank is kept.
    """
    def __init__(self):
        pass

    def fit(self, x, y):
        # nothing to learn for this baseline
        pass

    def predict(self, test_x):
        # one zero per row of test_x
        n_rows = test_x.shape[0]
        return np.array([0] * n_rows)
    
class AverageRank():
    """
    Baseline that predicts the sign of feature column 13 of every row.

    The inline note in the original names that column change_in_rank_all.
    A positive value maps to 1, a negative value to -1, anything else to 0.
    """
    def __init__(self):
        pass

    def fit(self, x, y):
        # nothing to learn for this baseline
        pass

    def predict(self, test_x):
        signs = []
        for row in test_x:
            #13, change_in_rank_all
            value = row[13]
            if value > 0:
                signs.append(1)
            elif value < 0:
                signs.append(-1)
            else:
                signs.append(0)
        return np.array(signs)

class RandomDice():
    """
    Random baseline: draws labels with the label frequencies observed in fit().

    Attributes:
        val:  the distinct labels seen in the last fit() call
        dist: cumulative label frequencies, aligned with val
    """
    def __init__(self, seed='1234'):
        self.dist = []
        self.val = []
        # Private generator: same draw sequence as seeding the global `random`
        # module with this seed, but without touching the global state.
        self._rng = random.Random(seed)

    def fit(self, x, y):
        """
        Build the cumulative distribution of the labels in y.

        x is ignored. y must expose .shape and element-wise ==, as a numpy array does.
        """
        total = y.shape[0]

        # Start from scratch so that a second fit() replaces the table
        # instead of appending to it.
        self.val = []
        self.dist = []

        ratio = 0.
        for val in set(y):
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)

    def predict(self, test_x):
        """
        Return one randomly drawn label per row of test_x.
        """
        pred_y = []
        for _ in test_x:
            dice = self._rng.random()
            # First label whose cumulative frequency covers the draw. The -1
            # default selects the last label when no entry matches.
            find_idx = -1
            for idx, ratio in enumerate(self.dist):
                if dice <= ratio:
                    find_idx = idx
                    break

            pred_y.append(self.val[find_idx])

        return np.array(pred_y)

def evaluate(test_y, pred_y):
    """
    Print per-class precision, recall and F1 plus the overall accuracy.

    Returns the accuracy only.
    """
    # average=None keeps one score per class instead of a single averaged value
    per_class = [
        score_fn(test_y, pred_y, average=None)
        for score_fn in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    ]
    accuracy = metrics.accuracy_score(test_y, pred_y)
    print('precision=%s, recall=%s, f1=%s, accuracy=%.2f' % (*per_class, accuracy))
    return accuracy
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    Hold out one event: rows whose 'eventid' equals eventid form the test set,
    all remaining rows form the training set.

    Column 1 of each row is the raw target and is reduced to its sign
    (1, -1 or 0); columns from index 2 onward are used as features.

    Returns (train, test, train_x, train_y, test_x, test_y) as numpy arrays.
    """
    def _sign_labels(values):
        # map each raw target to 1 / -1 / 0
        return np.array([1 if v > 0 else (-1 if v < 0 else 0) for v in values])

    event_col = stagedata['eventid']
    train = stagedata[event_col != eventid].to_numpy()
    test = stagedata[event_col == eventid].to_numpy()

    train_x, test_x = train[:, 2:], test[:, 2:]
    train_y = _sign_labels(train[:, 1])
    test_y = _sign_labels(test[:, 1])

    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    Split on a stage threshold: rows with 'stageid' <= stageid form the
    training set, rows with a larger 'stageid' form the test set.

    Column 1 of each row is the raw target and is reduced to its sign
    (1, -1 or 0); columns from index 2 onward are used as features.

    Returns (train, test, train_x, train_y, test_x, test_y) as numpy arrays.
    """
    def _sign_labels(values):
        # map each raw target to 1 / -1 / 0
        return np.array([1 if v > 0 else (-1 if v < 0 else 0) for v in values])

    stage_col = stagedata['stageid']
    train = stagedata[stage_col <= stageid].to_numpy()
    test = stagedata[stage_col > stageid].to_numpy()

    train_x, test_x = train[:, 2:], test[:, 2:]
    train_y = _sign_labels(train[:, 1])
    test_y = _sign_labels(test[:, 1])

    return train, test, train_x, train_y, test_x, test_y


### baseline
def baseline_model():
    """
    Score two rule-based baselines on the module-level test / test_y arrays.

    Returns (score1, score2): the accuracy of the all-zero prediction and of
    the sign of column 15 of test.
    """
    # baseline 1: keep the current rank, i.e. predict 0 everywhere
    print('[*] predict with current rank, rankchg = 0')
    score1 = evaluate(test_y, np.zeros_like(test_y))

    # baseline 2: sign of the average rank change stored in column 15
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    signs = []
    for value in test[:,15]:
        if value > 0:
            signs.append(1)
        elif value < 0:
            signs.append(-1)
        else:
            signs.append(0)
    score2 = evaluate(test_y, np.array(signs))

    return score1, score2

def classifier_model(name='lr'):
    """
    Fit the model called name on the module-level train_x / train_y and
    score it on the module-level test_x / test_y.

    Returns the accuracy reported by evaluate().
    """
    print('[*] predict with %s model'%name)
    model = get_classifier(name)
    model.fit(train_x, train_y)

    # evaluate() prints the per-class metrics and hands back the accuracy
    return evaluate(test_y, model.predict(test_x))

In [4]:
# Load the stage dataset; the file name is built from _trim and _include_final.
_trim = 0
_include_final = False
if _include_final:
    include_str = '1'
else:
    include_str = '0'
suffix = f'indy500-2013-2019-end{include_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}.csv'

# missing values are replaced by 0 before any model sees the data
stagedata = pd.read_csv(output_file).fillna(0)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 35 columns):
Unnamed: 0                   1313 non-null int64
target                       1313 non-null int64
eventid                      1313 non-null int64
car_number                   1313 non-null int64
stageid                      1313 non-null int64
firststage                   1313 non-null int64
pit_in_caution               1313 non-null int64
start_position               1313 non-null int64
start_rank                   1313 non-null int64
start_rank_ratio             1313 non-null float64
top_pack                     1313 non-null int64
bottom_pack                  1313 non-null int64
average_rank                 1313 non-null float64
average_rank_all             1313 non-null float64
change_in_rank               1313 non-null int64
change_in_rank_all           1313 non-null float64
rate_of_change               1313 non-null int64
rate_of_change_all           1313 non-null float64
l

In [5]:
# preview the first five rows of the loaded dataset
stagedata.head(5)

Unnamed: 0.1,Unnamed: 0,target,eventid,car_number,stageid,firststage,pit_in_caution,start_position,start_rank,start_rank_ratio,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
0,1,-12,0,1,1,1,0,7,15,0.454545,...,22.9104,31,31,66.0815,2,8,0,14,6,3
1,2,14,0,1,2,1,1,7,3,0.090909,...,23.822858,27,27,62.0677,-19,-21,0,-3,-21,-17
2,3,-7,0,1,3,1,0,7,17,0.515152,...,21.857882,32,32,91.23935,11,-2,-3,7,17,19
3,4,5,0,1,4,1,0,7,10,0.30303,...,19.394133,32,32,61.19415,-15,-9,-6,0,4,-6
4,5,-3,0,1,5,1,0,7,15,0.454545,...,17.737505,30,30,60.8541,-3,0,0,-3,-3,0


In [6]:
# Leave-one-event-out evaluation: each event is used once as the test set
# while all other events form the training set.
cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(classifiers)
print('cols:%s'%cols)

events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(eventsname)}

# One record per held-out event. The table is built once after the loop,
# which avoids concatenating onto an empty frame and gives every row its own index.
records = []

# sorted(): a set has no guaranteed iteration order
for eventid in sorted(events):
    print('Testset = %s'%eventsname[eventid])

    # classifier_model() reads these module-level arrays
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

    #record
    rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

    # accuracy of every model, in the order of `classifiers`
    acc = [classifier_model(clf) for clf in classifiers]

    rec.extend(acc)
    print('rec:%s'%rec)
    records.append(rec)

retdf = pd.DataFrame(records, columns=cols)

retdf.to_csv('crossvalid_stagedata_splitbyevent%s.csv'%suffix)
df_event = retdf

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2013
[*] predict with currank model
precision=[0.        0.1626506 0.       ], recall=[0. 1. 0.], f1=[0.         0.27979275 0.        ], accuracy=0.16
[*] predict with avgrank model
precision=[0.37777778 0.         0.25      ], recall=[0.425      0.         0.30508475], f1=[0.4        0.         0.27480916], accuracy=0.31
[*] predict with dice model
precision=[0.43617021 0.15789474 0.30188679], recall=[0.5125     0.11111111 0.27118644], f1=[0.47126437 0.13043478 0.28571429], accuracy=0.36
[*] predict with lr model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.58095238 0.         0.53333333], recall=[0.7625     0.         0.54237288], f1=[0.65945946 0.         0.53781513], accuracy=0.56
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.58181818 0.         0.53571429], recall=[0.8        0.         0.50847458], f1=[0.67368421 0.         0.52173913], accuracy=0.57
[*] predict with lsvcl2 model




precision=[0.54961832 0.25       0.59259259], recall=[0.9        0.07407407 0.27118644], f1=[0.68246445 0.11428571 0.37209302], accuracy=0.54
[*] predict with rf model
precision=[0.57272727 1.         0.52727273], recall=[0.7875     0.03703704 0.49152542], f1=[0.66315789 0.07142857 0.50877193], accuracy=0.56
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.57009346 0.         0.51724138], recall=[0.7625     0.         0.50847458], f1=[0.65240642 0.         0.51282051], accuracy=0.55
[*] predict with xgb model
precision=[0.55855856 0.4        0.52      ], recall=[0.775      0.07407407 0.44067797], f1=[0.64921466 0.125      0.47706422], accuracy=0.54
rec:['Indy500-2013', 1147, 166, '+:59,0:27,-:80', 0.16265060240963855, 0.3132530120481928, 0.3614457831325301, 0.5602409638554217, 0.5662650602409639, 0.5421686746987951, 0.5602409638554217, 0.5481927710843374, 0.5421686746987951]
Testset = Indy500-2014
[*] predict with currank model
precision=[0.         0.16923077 0.        ], recall=[0. 1. 0.], f1=[0.         0.28947368 0.        ], accuracy=0.17
[*] predict with avgrank model
precision=[0.42391304 0.1        0.22580645], recall=[0.37142857 0.03030303 0.36842105], f1=[0.39593909 0.04651163 0.28      ], accuracy=0.31
[*] predict with dice model
precision=[0.5625     0.         0.37096774], recall=[0.6        0.         0.4035087

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.70175439 0.75       0.48051948], recall=[0.76190476 0.09090909 0.64912281], f1=[0.73059361 0.16216216 0.55223881], accuracy=0.62
[*] predict with lsvc model




precision=[0.69565217 0.4        0.48      ], recall=[0.76190476 0.06060606 0.63157895], f1=[0.72727273 0.10526316 0.54545455], accuracy=0.61
[*] predict with lsvcl2 model




precision=[0.84615385 0.35714286 0.36619718], recall=[0.31428571 0.15151515 0.9122807 ], f1=[0.45833333 0.21276596 0.52261307], accuracy=0.46
[*] predict with rf model
precision=[0.6952381  0.6        0.44705882], recall=[0.6952381  0.09090909 0.66666667], f1=[0.6952381  0.15789474 0.53521127], accuracy=0.58
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.69565217 0.75       0.47368421], recall=[0.76190476 0.09090909 0.63157895], f1=[0.72727273 0.16216216 0.54135338], accuracy=0.61
[*] predict with xgb model
precision=[0.70833333 0.3125     0.44578313], recall=[0.64761905 0.15151515 0.64912281], f1=[0.67661692 0.20408163 0.52857143], accuracy=0.56
rec:['Indy500-2014', 1118, 195, '+:57,0:33,-:105', 0.16923076923076924, 0.3128205128205128, 0.441025641025641, 0.6153846153846154, 0.6051282051282051, 0.46153846153846156, 0.5846153846153846, 0.6102564102564103, 0.5641025641025641]
Testset = Indy500-2015
[*] predict with currank model
precision=[0.        0.1257485 0.       ], recall=[0. 1. 0.], f1=[0.         0.22340426 0.        ], accuracy=0.13
[*] predict with avgrank model
precision=[0.34666667 0.15384615 0.24050633], recall=[0.32911392 0.0952381  0.28358209], f1=[0.33766234 0.11764706 0.26027397], accuracy=0.28
[*] predict with dice model
precision=[0.5        0.13636364 0.45098039], recall=[0.59493671 0.14285714 0.34328358]

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.61627907 0.         0.57142857], recall=[0.67088608 0.         0.65671642], f1=[0.64242424 0.         0.61111111], accuracy=0.58
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.59574468 0.         0.57534247], recall=[0.70886076 0.         0.62686567], f1=[0.64739884 0.         0.6       ], accuracy=0.59
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.84615385 0.         0.46099291], recall=[0.27848101 0.         0.97014925], f1=[0.41904762 0.         0.625     ], accuracy=0.52
[*] predict with rf model


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


precision=[0.65263158 0.         0.68055556], recall=[0.78481013 0.         0.73134328], f1=[0.71264368 0.         0.70503597], accuracy=0.66
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.61363636 0.         0.57142857], recall=[0.6835443  0.         0.65671642], f1=[0.64670659 0.         0.61111111], accuracy=0.59
[*] predict with xgb model
precision=[0.60638298 0.         0.67692308], recall=[0.72151899 0.         0.65671642], f1=[0.65895954 0.         0.66666667], accuracy=0.60
rec:['Indy500-2015', 1146, 167, '+:67,0:21,-:79', 0.12574850299401197, 0.281437125748503, 0.437125748502994, 0.5808383233532934, 0.5868263473053892, 0.5209580838323353, 0.6646706586826348, 0.5868263473053892, 0.6047904191616766]
Testset = Indy500-2016
[*] predict with currank model
precision=[0.         0.11946903 0.        ], recall=[0. 1. 0.], f1=[0.         0.21343874 0.        ], accuracy=0.12
[*] predict with avgrank model
precision=[0.37614679 0.06666667 0.28431373], recall=[0.36607143 0.03703704 0.33333333], f1=[0.37104072 0.04761905 0.30687831], accuracy=0.31
[*] predict with dice model
precision=[0.51145038 0.15625    0.41269841], recall=[0.59821429 0.18518519 0.29885057]

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.63636364 0.2        0.66176471], recall=[0.8125     0.11111111 0.51724138], f1=[0.71372549 0.14285714 0.58064516], accuracy=0.62
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.58709677 0.         0.6056338 ], recall=[0.8125     0.         0.49425287], f1=[0.68164794 0.         0.5443038 ], accuracy=0.59
[*] predict with lsvcl2 model




precision=[0.52358491 0.         0.84615385], recall=[0.99107143 0.         0.12643678], f1=[0.68518519 0.         0.22      ], accuracy=0.54
[*] predict with rf model
precision=[0.62686567 0.         0.59550562], recall=[0.75      0.        0.6091954], f1=[0.68292683 0.         0.60227273], accuracy=0.61
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.61333333 0.2        0.68852459], recall=[0.82142857 0.11111111 0.48275862], f1=[0.70229008 0.14285714 0.56756757], accuracy=0.61
[*] predict with xgb model
precision=[0.61344538 0.16666667 0.54736842], recall=[0.65178571 0.07407407 0.59770115], f1=[0.63203463 0.1025641  0.57142857], accuracy=0.56
rec:['Indy500-2016', 1087, 226, '+:87,0:27,-:112', 0.11946902654867257, 0.3141592920353982, 0.4336283185840708, 0.6150442477876106, 0.5929203539823009, 0.5398230088495575, 0.6061946902654868, 0.6061946902654868, 0.5619469026548672]
Testset = Indy500-2017
[*] predict with currank model
precision=[0.         0.20465116 0.        ], recall=[0. 1. 0.], f1=[0.         0.33976834 0.        ], accuracy=0.20
[*] predict with avgrank model
precision=[0.42857143 0.33333333 0.25641026], recall=[0.48571429 0.13636364 0.3030303 ], f1=[0.45535714 0.19354839 0.27777778], accuracy=0.36
[*] predict with dice model
precision=[0.5        0.16666667 0.28985507], recall=[0.58095238 0.09090909 0.303030

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.75609756 0.20134228 0.44      ], recall=[0.2952381  0.68181818 0.16666667], f1=[0.42465753 0.31088083 0.24175824], accuracy=0.33
[*] predict with lsvc model




precision=[0.76190476 0.20805369 0.5       ], recall=[0.3047619  0.70454545 0.18181818], f1=[0.43537415 0.32124352 0.26666667], accuracy=0.35
[*] predict with lsvcl2 model




precision=[0.88888889 0.19672131 0.43478261], recall=[0.07619048 0.81818182 0.15151515], f1=[0.14035088 0.31718062 0.2247191 ], accuracy=0.25
[*] predict with rf model
precision=[0.57758621 0.37209302 0.60714286], recall=[0.63809524 0.36363636 0.51515152], f1=[0.60633484 0.36781609 0.55737705], accuracy=0.54
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.72093023 0.20134228 0.43478261], recall=[0.2952381  0.68181818 0.15151515], f1=[0.41891892 0.31088083 0.2247191 ], accuracy=0.33
[*] predict with xgb model
precision=[0.65277778 0.29347826 0.56862745], recall=[0.44761905 0.61363636 0.43939394], f1=[0.53107345 0.39705882 0.4957265 ], accuracy=0.48
rec:['Indy500-2017', 1098, 215, '+:66,0:44,-:105', 0.20465116279069767, 0.3581395348837209, 0.3953488372093023, 0.33488372093023255, 0.3488372093023256, 0.25116279069767444, 0.5441860465116279, 0.3302325581395349, 0.4790697674418605]
Testset = Indy500-2018
[*] predict with currank model
precision=[0.         0.10062893 0.        ], recall=[0. 1. 0.], f1=[0.         0.18285714 0.        ], accuracy=0.10
[*] predict with avgrank model
precision=[0.34615385 0.         0.22222222], recall=[0.34177215 0.         0.25      ], f1=[0.34394904 0.         0.23529412], accuracy=0.27
[*] predict with dice model
precision=[0.53333333 0.19047619 0.41666667], recall=[0.60759494 0.25       0.3125

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.62280702 0.         0.72727273], recall=[0.89873418 0.         0.5       ], f1=[0.7357513  0.         0.59259259], accuracy=0.65
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.60683761 0.         0.71428571], recall=[0.89873418 0.         0.46875   ], f1=[0.7244898  0.         0.56603774], accuracy=0.64
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.70833333 0.         0.51351351], recall=[0.43037975 0.         0.890625  ], f1=[0.53543307 0.         0.65142857], accuracy=0.57
[*] predict with rf model


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


precision=[0.63106796 0.         0.67857143], recall=[0.82278481 0.         0.59375   ], f1=[0.71428571 0.         0.63333333], accuracy=0.65
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished
  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.6173913  0.         0.72727273], recall=[0.89873418 0.         0.5       ], f1=[0.73195876 0.         0.59259259], accuracy=0.65
[*] predict with xgb model
precision=[0.5974026  0.4        0.55844156], recall=[0.58227848 0.125      0.671875  ], f1=[0.58974359 0.19047619 0.60992908], accuracy=0.57
rec:['Indy500-2018', 1154, 159, '+:64,0:16,-:79', 0.10062893081761007, 0.27044025157232704, 0.4528301886792453, 0.6477987421383647, 0.6352201257861635, 0.5723270440251572, 0.6477987421383647, 0.6477987421383647, 0.5723270440251572]
Testset = Indy500-2019
[*] predict with currank model
precision=[0.         0.15135135 0.        ], recall=[0. 1. 0.], f1=[0.        0.2629108 0.       ], accuracy=0.15
[*] predict with avgrank model
precision=[0.45783133 0.14285714 0.21052632], recall=[0.38383838 0.03571429 0.34482759], f1=[0.41758242 0.05714286 0.26143791], accuracy=0.32
[*] predict with dice model
precision=[0.53333333 0.15       0.26666667], recall=[0.56565657 0.10714286 0.27586207]



precision=[0.67647059 1.         0.56790123], recall=[0.6969697  0.07142857 0.79310345], f1=[0.68656716 0.13333333 0.6618705 ], accuracy=0.63
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.63793103 0.         0.56521739], recall=[0.74747475 0.         0.67241379], f1=[0.68837209 0.         0.61417323], accuracy=0.61
[*] predict with rf model


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


precision=[0.61607143 0.         0.50684932], recall=[0.6969697  0.         0.63793103], f1=[0.65402844 0.         0.5648855 ], accuracy=0.57
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.6509434  0.         0.55128205], recall=[0.6969697  0.         0.74137931], f1=[0.67317073 0.         0.63235294], accuracy=0.61
[*] predict with xgb model
precision=[0.64150943 0.2        0.54054054], recall=[0.68686869 0.03571429 0.68965517], f1=[0.66341463 0.06060606 0.60606061], accuracy=0.59
rec:['Indy500-2019', 1128, 185, '+:58,0:28,-:99', 0.15135135135135136, 0.31891891891891894, 0.40540540540540543, 0.6054054054054054, 0.6324324324324324, 0.6108108108108108, 0.572972972972973, 0.6054054054054054, 0.5891891891891892]


In [7]:
# display the per-event accuracy table produced by the cross-validation cell
df_event

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lsvc,lsvcl2,rf,lrbias,xgb
0,Indy500-2013,1147,166,"+:59,0:27,-:80",0.162651,0.313253,0.361446,0.560241,0.566265,0.542169,0.560241,0.548193,0.542169
0,Indy500-2014,1118,195,"+:57,0:33,-:105",0.169231,0.312821,0.441026,0.615385,0.605128,0.461538,0.584615,0.610256,0.564103
0,Indy500-2015,1146,167,"+:67,0:21,-:79",0.125749,0.281437,0.437126,0.580838,0.586826,0.520958,0.664671,0.586826,0.60479
0,Indy500-2016,1087,226,"+:87,0:27,-:112",0.119469,0.314159,0.433628,0.615044,0.59292,0.539823,0.606195,0.606195,0.561947
0,Indy500-2017,1098,215,"+:66,0:44,-:105",0.204651,0.35814,0.395349,0.334884,0.348837,0.251163,0.544186,0.330233,0.47907
0,Indy500-2018,1154,159,"+:64,0:16,-:79",0.100629,0.27044,0.45283,0.647799,0.63522,0.572327,0.647799,0.647799,0.572327
0,Indy500-2019,1128,185,"+:58,0:28,-:99",0.151351,0.318919,0.405405,0.605405,0.632432,0.610811,0.572973,0.605405,0.589189


In [8]:
# inspect the rows of a single car (car_number 12) in a single event (eventid 5)
stagedata[(stagedata['eventid']==5) & (stagedata['car_number']==12)]

Unnamed: 0.1,Unnamed: 0,target,eventid,car_number,stageid,firststage,pit_in_caution,start_position,start_rank,start_rank_ratio,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
1002,1172,2,5,12,1,1,0,3,4,0.121212,...,8.527139,32,32,66.10705,-1,0,0,0,-3,-5
1003,1173,-5,5,12,2,1,1,3,6,0.181818,...,11.242669,18,18,61.85245,-6,-5,0,6,3,3
1004,1174,1,5,12,3,1,0,3,1,0.030303,...,23.296174,44,44,117.01525,0,0,0,-3,3,3
1005,1175,6,5,12,4,1,0,3,2,0.060606,...,20.554273,35,35,59.24225,-3,0,0,2,1,0


In [9]:
### fixed training set: train on every event except Indy500-2018 and Indy500-2019,
### then test on each of those two events in turn
#load data
_trim = 0
_include_final = False
include_str = '1' if _include_final else '0'
suffix = f'indy500-2013-2019-end{include_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}.csv'
stagedata = pd.read_csv(output_file)
stagedata.fillna(0, inplace=True)
#stagedata.info()

# result table: one row per test event, one accuracy column per model name
cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(classifiers)
print('cols:%s'%cols)
retdf = pd.DataFrame([],columns=cols)


events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
# event name -> numeric event id (position in eventsname)
events_id={key:idx for idx, key in enumerate(eventsname)}

#first 
# test on 2018; the 2019 rows are removed beforehand so they never reach the training set
eventid = events_id['Indy500-2018']
ignore_eventid = events_id['Indy500-2019']
stdata_2018 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

# these module-level arrays are what classifier_model() reads
train, test, train_x, train_y, test_x, test_y = split_by_eventid(stdata_2018, eventid)
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
#print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
#      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

#record
rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

# accuracy of every model, in the order of `classifiers`
acc = [0 for x in range(len(classifiers))]
for idx, clf in enumerate(classifiers):
    acc[idx] = classifier_model(clf)

rec.extend(acc)
print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec],columns=cols)
retdf = pd.concat([retdf, df])        


# second run: test on 2019, with the 2018 rows removed beforehand
eventid = events_id['Indy500-2019']
ignore_eventid = events_id['Indy500-2018']
stdata_2019 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

# NOTE: the training outputs are bound to train2 / train_x2 / train_y2, so the
# module-level train_x / train_y used by classifier_model() stay those of the
# first split; only test_x / test_y are replaced here.
train2, test2, train_x2, train_y2, test_x, test_y = split_by_eventid(stdata_2019, eventid)
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

#record
# train_x here is still the training matrix of the first split
rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

acc = [0 for x in range(len(classifiers))]
for idx, clf in enumerate(classifiers):
    acc[idx] = classifier_model(clf)

rec.extend(acc)
print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec],columns=cols)
retdf = pd.concat([retdf, df]) 
# display the two-row result table
retdf

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
precision=[0.         0.10062893 0.        ], recall=[0. 1. 0.], f1=[0.         0.18285714 0.        ], accuracy=0.10
[*] predict with avgrank model
precision=[0.34615385 0.         0.22222222], recall=[0.34177215 0.         0.25      ], f1=[0.34394904 0.         0.23529412], accuracy=0.27
[*] predict with dice model
precision=[0.53333333 0.19047619 0.41666667], recall=[0.60759494 0.25       0.3125    ], f1=[0.56804734 0.21621622 0.35714286], accuracy=0.45
[*] predict with lr model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.64150943 0.         0.71153846], recall=[0.86075949 0.         0.578125  ], f1=[0.73513514 0.         0.63793103], accuracy=0.66
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.6509434  0.         0.73584906], recall=[0.87341772 0.         0.609375  ], f1=[0.74594595 0.         0.66666667], accuracy=0.68
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.66304348 0.         0.64179104], recall=[0.7721519 0.        0.671875 ], f1=[0.71345029 0.         0.65648855], accuracy=0.65
[*] predict with rf model


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


precision=[0.63265306 0.         0.6557377 ], recall=[0.78481013 0.         0.625     ], f1=[0.70056497 0.         0.64      ], accuracy=0.64
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.63551402 0.         0.70588235], recall=[0.86075949 0.         0.5625    ], f1=[0.7311828  0.         0.62608696], accuracy=0.65
[*] predict with xgb model
precision=[0.65217391 0.125      0.53658537], recall=[0.56962025 0.0625     0.6875    ], f1=[0.60810811 0.08333333 0.60273973], accuracy=0.57
rec:['Indy500-2018', 969, 159, '+:64,0:16,-:79', 0.10062893081761007, 0.27044025157232704, 0.4528301886792453, 0.660377358490566, 0.6792452830188679, 0.6540880503144654, 0.6415094339622641, 0.6540880503144654, 0.5660377358490566]
Testset = Indy500-2019
[*] predict with currank model
precision=[0.         0.15135135 0.        ], recall=[0. 1. 0.], f1=[0.        0.2629108 0.       ], accuracy=0.15
[*] predict with avgrank model
precision=[0.45783133 0.14285714 0.21052632], recall=[0.38383838 0.03571429 0.34482759], f1=[0.41758242 0.05714286 0.26143791], accuracy=0.32
[*] predict with dice model
precision=[0.53333333 0.125      0.26785714], recall=[0.56565657 0.10714286 0.25862069], 

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.65454545 0.5        0.57534247], recall=[0.72727273 0.03571429 0.72413793], f1=[0.68899522 0.06666667 0.64122137], accuracy=0.62
[*] predict with lsvc model




precision=[0.65454545 1.         0.58108108], recall=[0.72727273 0.03571429 0.74137931], f1=[0.68899522 0.06896552 0.65151515], accuracy=0.63
[*] predict with lsvcl2 model




precision=[0.672      0.25       0.61538462], recall=[0.84848485 0.07142857 0.55172414], f1=[0.75       0.11111111 0.58181818], accuracy=0.64
[*] predict with rf model


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


precision=[0.64761905 0.         0.55      ], recall=[0.68686869 0.         0.75862069], f1=[0.66666667 0.         0.63768116], accuracy=0.61
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.65178571 0.5        0.57746479], recall=[0.73737374 0.03571429 0.70689655], f1=[0.69194313 0.06666667 0.63565891], accuracy=0.62
[*] predict with xgb model
precision=[0.67346939 0.15789474 0.52941176], recall=[0.66666667 0.10714286 0.62068966], f1=[0.67005076 0.12765957 0.57142857], accuracy=0.57
rec:['Indy500-2019', 969, 185, '+:58,0:28,-:99', 0.15135135135135136, 0.31891891891891894, 0.4, 0.6216216216216216, 0.6270270270270271, 0.6378378378378379, 0.6054054054054054, 0.6216216216216216, 0.5675675675675675]


Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lsvc,lsvcl2,rf,lrbias,xgb
0,Indy500-2018,969,159,"+:64,0:16,-:79",0.100629,0.27044,0.45283,0.660377,0.679245,0.654088,0.641509,0.654088,0.566038
0,Indy500-2019,969,185,"+:58,0:28,-:99",0.151351,0.318919,0.4,0.621622,0.627027,0.637838,0.605405,0.621622,0.567568


In [10]:
# save the current result table (retdf); floats are written with three decimals
retdf.to_csv(f'stint_classifier_result_t2013-2017_t{_trim}.csv', float_format='%.3f')