### stage_model_classifier

base: 14./stage_model_classifier_withneighbor-newfeatures

Prediction models that classify the sign of the rank change (the `target` column: up, unchanged, or down) on the stage dataset.

data format:
    index, target, eventid, car_number, stageid, features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import xgboost as xgb


In [3]:
# build classifier models
# `classifiers` lists the model names that the evaluation cells run, in the
# order their accuracies are recorded. The commented-out variant below also
# included 'lrl1'.
#classifiers = ['currank','avgrank','dice','lr','lrl1','lsvc','lsvcl2','rf','lrbias','xgb']
classifiers = ['currank','avgrank','dice','lr','lsvc','lsvcl2','rf','lrbias','xgb']
def get_classifier(classifier = 'lr'):
    """
    Create a new, unfitted model for the given classifier name.

    Supported names:
        'lsvc'     LinearSVC with an L1 penalty
        'lsvcl2'   LinearSVC with an L2 penalty
        'rf'       RandomForestClassifier (100 trees, entropy criterion)
        'lr'       LogisticRegression without intercept
        'lrbias'   LogisticRegression with intercept (verbose output)
        'lrl1'     LogisticRegression with an L1 penalty
        'xgb'      XGBClassifier (gbtree booster, 600 estimators, depth 6)
        'dice'     RandomDice baseline, seeded with '1234'
        'currank'  CurRank baseline
        'avgrank'  AverageRank baseline

    Returns:
        The model object, or None when the name is not recognised.
    """
    # No class re-weighting is applied; kept as a single switch for all
    # scikit-learn models below.
    class_weight = None

    if classifier == "lsvc":
        clf = LinearSVC(penalty='l1',dual=False, tol=1e-3, class_weight=class_weight )
    elif classifier == "lsvcl2":
        clf = LinearSVC(penalty='l2', tol=1e-4, class_weight=class_weight)
    elif classifier == 'rf':
        clf = RandomForestClassifier(n_estimators=100, n_jobs=-1,criterion='entropy', class_weight = class_weight)
    elif classifier == 'lr':
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = False, verbose = 0)
    elif classifier == 'lrbias':
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = True, verbose = 1)
    elif classifier == 'lrl1':
        # The default 'lbfgs' solver does not support an L1 penalty, so the
        # solver is pinned to 'liblinear'. liblinear ignores n_jobs, hence it
        # is not passed here.
        clf = LogisticRegression(class_weight = class_weight, penalty='l1', solver='liblinear')
    elif classifier == 'xgb':
        clf = xgb.XGBClassifier(booster = 'gbtree', nthread = -1, subsample = 1,
                                n_estimators = 600, colsample_bytree = 1, max_depth = 6, min_child_weight = 1)
    elif classifier == 'dice':
        clf = RandomDice('1234')
    elif classifier == 'currank':
        clf = CurRank()
    elif classifier == 'avgrank':
        clf = AverageRank()
    else:
        # Unknown name: callers receive None rather than an exception.
        clf = None

    return clf


class CurRank():
    """
    Baseline that predicts class 0 (rank unchanged) for every sample.
    """
    def __init__(self):
        pass

    def fit(self, x, y):
        # Nothing is learned; the method exists so the class can be used
        # through the same fit/predict calls as the other models.
        return None

    def predict(self, test_x):
        # One zero per row of test_x.
        n_samples = test_x.shape[0]
        return np.array([0] * n_samples)
    
class AverageRank():
    """
    Baseline that predicts the sign of feature column 13 of each sample
    (labelled change_in_rank_all in the original code).

    On the full, unsliced data matrix the same prediction was written as:
        change_in_rank_all = test[:,15]
        pred_y_avg = sign(change_in_rank_all)   # 1, -1 or 0
    """
    def __init__(self):
        pass

    def fit(self, x, y):
        # Nothing is learned; present only for interface compatibility.
        pass

    def predict(self, test_x):
        signs = []
        for row in test_x:
            #13, change_in_rank_all
            value = row[13]
            if value > 0:
                signs.append(1)
            elif value < 0:
                signs.append(-1)
            else:
                signs.append(0)
        return np.array(signs)

class RandomDice():
    """
    Random baseline that draws labels according to the label frequencies
    seen in fit().

    Note: the constructor seeds the module-level ``random`` generator, and
    predict() draws from that same global generator.
    """
    def __init__(self, seed='1234'):
        # Cumulative label frequencies and the label belonging to each entry.
        self.dist = []
        self.val = []
        random.seed(seed)

    def fit(self, x, y):
        """
        Record the cumulative frequency of every distinct label in y.

        x is unused. y must support ``.shape`` and elementwise ``==``
        (e.g. a numpy array).
        """
        # Start from a clean state so that calling fit() again replaces the
        # previous distribution instead of appending to it.
        self.val = []
        self.dist = []

        total = y.shape[0]
        ratio = 0.
        for val in set(y):
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)

    def predict(self, test_x):
        """
        Draw one label per element of test_x; the feature values are ignored.
        """
        pred_y = []
        for _ in test_x:
            dice = random.random()
            # First cumulative bucket that covers the draw. If rounding left
            # the last bucket slightly below the draw, fall back to index -1,
            # i.e. the last label.
            find_idx = next(
                (idx for idx, ratio in enumerate(self.dist) if dice <= ratio),
                -1,
            )
            pred_y.append(self.val[find_idx])

        return np.array(pred_y)

def evaluate(test_y, pred_y):
    """
    Print per-class precision, recall and F1 plus the overall accuracy of
    pred_y against test_y, and return the accuracy.
    """
    # average=None makes each scorer return one value per class.
    scorers = (metrics.precision_score, metrics.recall_score, metrics.f1_score)
    precision, recall, f1 = [scorer(test_y, pred_y, average=None) for scorer in scorers]
    accuracy = metrics.accuracy_score(test_y, pred_y)

    print('precision=%s, recall=%s, f1=%s, accuracy=%.2f'%(precision,recall, f1, accuracy))
    return accuracy
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    Leave-one-event-out split.

    Rows whose 'eventid' equals ``eventid`` form the test set; all other rows
    form the training set.

    Returns:
        (train, test, train_x, train_y, test_x, test_y) where train/test are
        the full numpy matrices, *_x are columns 2 onward (column 2 is
        eventid in the documented column layout) and *_y is the sign
        (1, 0 or -1) of column 1, the target.
    """
    def to_sign(values):
        # Map every value to 1 / -1 / 0 depending on its sign.
        return np.array([1 if v > 0 else (-1 if v < 0 else 0) for v in values])

    event_col = stagedata['eventid']
    train = stagedata[event_col != eventid].to_numpy()
    test = stagedata[event_col == eventid].to_numpy()

    train_x, test_x = train[:,2:], test[:,2:]
    train_y, test_y = to_sign(train[:,1]), to_sign(test[:,1])

    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    Split on the stage number.

    Rows with 'stageid' <= ``stageid`` form the training set; rows with a
    larger 'stageid' form the test set.

    Returns:
        (train, test, train_x, train_y, test_x, test_y) where train/test are
        the full numpy matrices, *_x are columns 2 onward and *_y is the sign
        (1, 0 or -1) of column 1, the target.
    """
    def to_sign(values):
        # Map every value to 1 / -1 / 0 depending on its sign.
        return np.array([1 if v > 0 else (-1 if v < 0 else 0) for v in values])

    stage_col = stagedata['stageid']
    train = stagedata[stage_col <= stageid].to_numpy()
    test = stagedata[stage_col > stageid].to_numpy()

    train_x, test_x = train[:,2:], test[:,2:]
    train_y, test_y = to_sign(train[:,1]), to_sign(test[:,1])

    return train, test, train_x, train_y, test_x, test_y


### baseline
def baseline_model():
    """
    Score two rule-based baselines against the current global test split.

    Reads the module-level ``test`` matrix and ``test_y`` labels.

    Returns:
        (score1, score2): accuracy of "always predict 0" and accuracy of
        "predict the sign of column 15 of ``test``".
    """
    # Baseline 1: rank stays the same, so the predicted class is always 0.
    print('[*] predict with current rank, rankchg = 0')
    score1 = evaluate(test_y, np.zeros_like(test_y))

    # Baseline 2: sign of the average rank change stored in column 15.
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    signs = [1 if v > 0 else (-1 if v < 0 else 0) for v in test[:,15]]
    score2 = evaluate(test_y, np.array(signs))

    return score1, score2

def classifier_model(name='lr'):
    """
    Fit the named classifier on the current global training split and return
    its accuracy on the current global test split.

    Reads the module-level train_x, train_y, test_x and test_y, so the
    caller must set those before calling.
    """
    print('[*] predict with %s model'%name)

    model = get_classifier(name)
    model.fit(train_x, train_y)

    return evaluate(test_y, model.predict(test_x))

In [4]:
# Load the stage dataset; the settings below select which CSV file is read.
_trim = 0
_predictlen = 2
_include_final = False
include_str = '0' if not _include_final else '1'
suffix = f'indy500-2013-2019-end{include_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}-l{_predictlen}.csv'

# Missing values are replaced by 0 before any split is made.
stagedata = pd.read_csv(output_file).fillna(0)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1312 entries, 0 to 1311
Data columns (total 35 columns):
Unnamed: 0                   1312 non-null int64
target                       1312 non-null int64
eventid                      1312 non-null int64
car_number                   1312 non-null int64
stageid                      1312 non-null int64
firststage                   1312 non-null int64
pit_in_caution               1312 non-null int64
start_position               1312 non-null int64
start_rank                   1312 non-null int64
start_rank_ratio             1312 non-null float64
top_pack                     1312 non-null int64
bottom_pack                  1312 non-null int64
average_rank                 1312 non-null float64
average_rank_all             1312 non-null float64
change_in_rank               1312 non-null int64
change_in_rank_all           1312 non-null float64
rate_of_change               1312 non-null int64
rate_of_change_all           1312 non-null float64
l

In [5]:
# Preview the first five rows of the loaded stage data.
stagedata.head(5)

Unnamed: 0.1,Unnamed: 0,target,eventid,car_number,stageid,firststage,pit_in_caution,start_position,start_rank,start_rank_ratio,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
0,1,-5,0,1,1,1,0,7,15,0.454545,...,22.9104,2,31,66.0815,0,0,0,2,0,0
1,2,-2,0,1,2,1,1,7,3,0.090909,...,23.822858,2,27,62.0677,-2,-4,0,2,-1,0
2,3,-5,0,1,3,1,0,7,17,0.515152,...,21.857882,2,32,91.23935,-1,0,0,-2,0,2
3,4,-4,0,1,4,1,0,7,10,0.30303,...,19.394133,2,32,61.19415,0,0,0,-8,7,-4
4,5,-5,0,1,5,1,0,7,15,0.454545,...,17.737505,2,30,60.8541,0,0,0,0,2,0


In [6]:
# Leave-one-event-out cross validation: each event is used once as the test
# set and one result row (sizes, label distribution, accuracy per classifier)
# is collected for it.
cols = ['runid','trainsize','testsize','testdistribution'] + classifiers
print('cols:%s'%cols)
retdf = pd.DataFrame([],columns=cols)

events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{year}' for year in years]
events_id = {name: pos for pos, name in enumerate(eventsname)}

for eventid in events:
    # eventid is used directly as an index into eventsname.
    print('Testset = %s'%eventsname[eventid])

    # These names are globals read by classifier_model().
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

    # One accuracy per classifier, in the order of `classifiers`.
    acc = [classifier_model(clf) for clf in classifiers]

    rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution] + acc
    print('rec:%s'%rec)

    # Append this event's row to the result table.
    df = pd.DataFrame([rec],columns=cols)
    retdf = pd.concat([retdf, df])

retdf.to_csv('crossvalid_stagedata_splitbyevent%s.csv'%suffix)
df_event = retdf

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2013
[*] predict with currank model
precision=[0.         0.20481928 0.        ], recall=[0. 1. 0.], f1=[0.   0.34 0.  ], accuracy=0.20
[*] predict with avgrank model
precision=[0.3625     0.25       0.62162162], recall=[0.51785714 0.08823529 0.60526316], f1=[0.42647059 0.13043478 0.61333333], accuracy=0.47
[*] predict with dice model
precision=[0.31481481 0.11111111 0.48684211], recall=[0.30357143 0.11764706 0.48684211], f1=[0.30909091 0.11428571 0.48684211], accuracy=0.35
[*] predict with lr model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.59615385 0.8        0.70786517], recall=[0.55357143 0.58823529 0.82894737], f1=[0.57407407 0.6779661  0.76363636], accuracy=0.69
[*] predict with lsvc model




precision=[0.63636364 0.7826087  0.70707071], recall=[0.5        0.52941176 0.92105263], f1=[0.56       0.63157895 0.8       ], accuracy=0.70
[*] predict with lsvcl2 model




precision=[0.64285714 0.66666667 0.66981132], recall=[0.48214286 0.35294118 0.93421053], f1=[0.55102041 0.46153846 0.78021978], accuracy=0.66
[*] predict with rf model
precision=[0.70731707 0.65625    0.72043011], recall=[0.51785714 0.61764706 0.88157895], f1=[0.59793814 0.63636364 0.79289941], accuracy=0.70
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.56862745 0.79166667 0.69230769], recall=[0.51785714 0.55882353 0.82894737], f1=[0.54205607 0.65517241 0.75449102], accuracy=0.67
[*] predict with xgb model
precision=[0.55319149 0.625      0.72413793], recall=[0.46428571 0.58823529 0.82894737], f1=[0.50485437 0.60606061 0.77300613], accuracy=0.66
rec:['Indy500-2013', 1146, 166, '+:76,0:34,-:56', 0.20481927710843373, 0.46987951807228917, 0.3493975903614458, 0.6867469879518072, 0.6987951807228916, 0.6626506024096386, 0.7048192771084337, 0.6686746987951807, 0.6566265060240963]
Testset = Indy500-2014
[*] predict with currank model
precision=[0.         0.22051282 0.        ], recall=[0. 1. 0.], f1=[0.         0.36134454 0.        ], accuracy=0.22
[*] predict with avgrank model
precision=[0.31147541 0.1        0.55645161], recall=[0.43181818 0.02325581 0.63888889], f1=[0.36190476 0.03773585 0.59482759], accuracy=0.46
[*] predict with dice model
precision=[0.21428571 0.17073171 0.63095238], recall=[0.34090909 0.1627907  0.490740

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.38983051 0.63636364 0.77192982], recall=[0.52272727 0.3255814  0.81481481], f1=[0.44660194 0.43076923 0.79279279], accuracy=0.64
[*] predict with lsvc model




precision=[0.5        0.64       0.78448276], recall=[0.61363636 0.37209302 0.84259259], f1=[0.55102041 0.47058824 0.8125    ], accuracy=0.69
[*] predict with lsvcl2 model




precision=[1.         0.45238095 0.66447368], recall=[0.02272727 0.44186047 0.93518519], f1=[0.04444444 0.44705882 0.77692308], accuracy=0.62
[*] predict with rf model
precision=[0.44444444 0.55882353 0.76      ], recall=[0.36363636 0.44186047 0.87962963], f1=[0.4        0.49350649 0.81545064], accuracy=0.67
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.40625    0.6        0.79245283], recall=[0.59090909 0.34883721 0.77777778], f1=[0.48148148 0.44117647 0.78504673], accuracy=0.64
[*] predict with xgb model
precision=[0.45652174 0.39393939 0.69827586], recall=[0.47727273 0.30232558 0.75      ], f1=[0.46666667 0.34210526 0.72321429], accuracy=0.59
rec:['Indy500-2014', 1117, 195, '+:108,0:43,-:44', 0.2205128205128205, 0.4564102564102564, 0.38461538461538464, 0.6410256410256411, 0.6871794871794872, 0.6205128205128205, 0.6666666666666666, 0.6410256410256411, 0.5897435897435898]
Testset = Indy500-2015
[*] predict with currank model
precision=[0.         0.24550898 0.        ], recall=[0. 1. 0.], f1=[0.         0.39423077 0.        ], accuracy=0.25
[*] predict with avgrank model
precision=[0.35416667 0.46153846 0.45283019], recall=[0.32692308 0.14634146 0.64864865], f1=[0.34       0.22222222 0.53333333], accuracy=0.43
[*] predict with dice model
precision=[0.23636364 0.13888889 0.35526316], recall=[0.25       0.12195122 0.364864

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.4057971  0.5        0.69230769], recall=[0.53846154 0.24390244 0.72972973], f1=[0.46280992 0.32786885 0.71052632], accuracy=0.55
[*] predict with lsvc model




precision=[0.46938776 0.63157895 0.60606061], recall=[0.44230769 0.29268293 0.81081081], f1=[0.45544554 0.4        0.69364162], accuracy=0.57
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.41322314 0.         0.89130435], recall=[0.96153846 0.         0.55405405], f1=[0.57803468 0.         0.68333333], accuracy=0.54
[*] predict with rf model
precision=[0.47222222 0.64516129 0.64      ], recall=[0.32692308 0.48780488 0.86486486], f1=[0.38636364 0.55555556 0.73563218], accuracy=0.60
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.42028986 0.54545455 0.69736842], recall=[0.55769231 0.29268293 0.71621622], f1=[0.47933884 0.38095238 0.70666667], accuracy=0.56
[*] predict with xgb model
precision=[0.5        0.52777778 0.69662921], recall=[0.40384615 0.46341463 0.83783784], f1=[0.44680851 0.49350649 0.7607362 ], accuracy=0.61
rec:['Indy500-2015', 1145, 167, '+:74,0:41,-:52', 0.24550898203592814, 0.4251497005988024, 0.2694610778443114, 0.5508982035928144, 0.5688622754491018, 0.5449101796407185, 0.6047904191616766, 0.562874251497006, 0.6107784431137725]
Testset = Indy500-2016
[*] predict with currank model
precision=[0.         0.19469027 0.        ], recall=[0. 1. 0.], f1=[0.         0.32592593 0.        ], accuracy=0.19
[*] predict with avgrank model
precision=[0.45744681 0.06666667 0.4017094 ], recall=[0.43877551 0.02272727 0.55952381], f1=[0.44791667 0.03389831 0.46766169], accuracy=0.40
[*] predict with dice model
precision=[0.45454545 0.11764706 0.3853211 ], recall=[0.30612245 0.13636364 0.5       

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.69811321 0.29230769 0.60185185], recall=[0.37755102 0.43181818 0.77380952], f1=[0.49006623 0.34862385 0.67708333], accuracy=0.54
[*] predict with lsvc model




precision=[0.62765957 0.33928571 0.72368421], recall=[0.60204082 0.43181818 0.6547619 ], f1=[0.61458333 0.38       0.6875    ], accuracy=0.59
[*] predict with lsvcl2 model




precision=[0.5257732  0.25301205 0.69565217], recall=[0.52040816 0.47727273 0.38095238], f1=[0.52307692 0.33070866 0.49230769], accuracy=0.46
[*] predict with rf model
precision=[0.75806452 0.32786885 0.60194175], recall=[0.47959184 0.45454545 0.73809524], f1=[0.5875     0.38095238 0.6631016 ], accuracy=0.57
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.68421053 0.3015873  0.61320755], recall=[0.39795918 0.43181818 0.77380952], f1=[0.50322581 0.35514019 0.68421053], accuracy=0.54
[*] predict with xgb model
precision=[0.76811594 0.375      0.63366337], recall=[0.54081633 0.47727273 0.76190476], f1=[0.63473054 0.42       0.69189189], accuracy=0.61
rec:['Indy500-2016', 1086, 226, '+:84,0:44,-:98', 0.19469026548672566, 0.4026548672566372, 0.34513274336283184, 0.5353982300884956, 0.588495575221239, 0.46017699115044247, 0.5707964601769911, 0.5442477876106194, 0.6106194690265486]
Testset = Indy500-2017
[*] predict with currank model
precision=[0.         0.35348837 0.        ], recall=[0. 1. 0.], f1=[0.         0.52233677 0.        ], accuracy=0.35
[*] predict with avgrank model
precision=[0.38947368 0.57142857 0.34343434], recall=[0.49333333 0.15789474 0.53125   ], f1=[0.43529412 0.24742268 0.41717791], accuracy=0.39
[*] predict with dice model
precision=[0.40909091 0.31818182 0.2952381 ], recall=[0.36       0.18421053 0.484375

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.5        0.37974684 0.34146341], recall=[0.10666667 0.78947368 0.21875   ], f1=[0.17582418 0.51282051 0.26666667], accuracy=0.38
[*] predict with lsvc model




precision=[0.55555556 0.38853503 0.48387097], recall=[0.2        0.80263158 0.234375  ], f1=[0.29411765 0.52360515 0.31578947], accuracy=0.42
[*] predict with lsvcl2 model




precision=[0.43548387 0.54237288 0.65625   ], recall=[0.72       0.42105263 0.328125  ], f1=[0.54271357 0.47407407 0.4375    ], accuracy=0.50
[*] predict with rf model
precision=[0.61702128 0.51648352 0.53246753], recall=[0.38666667 0.61842105 0.640625  ], f1=[0.47540984 0.56287425 0.58156028], accuracy=0.54
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.5        0.37974684 0.34146341], recall=[0.10666667 0.78947368 0.21875   ], f1=[0.17582418 0.51282051 0.26666667], accuracy=0.38
[*] predict with xgb model
precision=[0.58139535 0.46391753 0.52      ], recall=[0.33333333 0.59210526 0.609375  ], f1=[0.42372881 0.52023121 0.56115108], accuracy=0.51
rec:['Indy500-2017', 1097, 215, '+:64,0:76,-:75', 0.35348837209302325, 0.386046511627907, 0.33488372093023255, 0.3813953488372093, 0.4232558139534884, 0.49767441860465117, 0.5441860465116279, 0.3813953488372093, 0.5069767441860465]
Testset = Indy500-2018
[*] predict with currank model
precision=[0.         0.27672956 0.        ], recall=[0. 1. 0.], f1=[0.         0.43349754 0.        ], accuracy=0.28
[*] predict with avgrank model
precision=[0.22857143 0.1        0.56140351], recall=[0.34782609 0.02272727 0.69565217], f1=[0.27586207 0.03703704 0.62135922], accuracy=0.46
[*] predict with dice model
precision=[0.19298246 0.28125    0.61428571], recall=[0.47826087 0.20454545 0.467391

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.37837838 0.68       0.83505155], recall=[0.60869565 0.38636364 0.88043478], f1=[0.46666667 0.49275362 0.85714286], accuracy=0.70
[*] predict with lsvc model




precision=[0.41860465 0.76       0.87912088], recall=[0.7826087  0.43181818 0.86956522], f1=[0.54545455 0.55072464 0.87431694], accuracy=0.74
[*] predict with lsvcl2 model




precision=[0.38095238 0.55769231 0.87209302], recall=[0.34782609 0.65909091 0.81521739], f1=[0.36363636 0.60416667 0.84269663], accuracy=0.70
[*] predict with rf model
precision=[0.36170213 0.68181818 0.84444444], recall=[0.73913043 0.34090909 0.82608696], f1=[0.48571429 0.45454545 0.83516484], accuracy=0.68
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.9s finished


precision=[0.35897436 0.70833333 0.82291667], recall=[0.60869565 0.38636364 0.85869565], f1=[0.4516129  0.5        0.84042553], accuracy=0.69
[*] predict with xgb model
precision=[0.37777778 0.65       0.81914894], recall=[0.73913043 0.29545455 0.83695652], f1=[0.5        0.40625    0.82795699], accuracy=0.67
rec:['Indy500-2018', 1153, 159, '+:92,0:44,-:23', 0.27672955974842767, 0.4591194968553459, 0.39622641509433965, 0.7044025157232704, 0.7358490566037735, 0.7044025157232704, 0.6792452830188679, 0.6918238993710691, 0.6729559748427673]
Testset = Indy500-2019
[*] predict with currank model
precision=[0.         0.22826087 0.        ], recall=[0. 1. 0.], f1=[0.         0.37168142 0.        ], accuracy=0.23
[*] predict with avgrank model
precision=[0.40909091 0.14285714 0.52380952], recall=[0.39130435 0.04761905 0.6875    ], f1=[0.4        0.07142857 0.59459459], accuracy=0.47
[*] predict with dice model
precision=[0.18461538 0.2        0.46835443], recall=[0.26086957 0.19047619 0.385416

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.8        0.52380952 0.71311475], recall=[0.34782609 0.52380952 0.90625   ], f1=[0.48484848 0.52380952 0.79816514], accuracy=0.68
[*] predict with lsvc model




precision=[0.6        0.41666667 0.68421053], recall=[0.19565217 0.35714286 0.94791667], f1=[0.29508197 0.38461538 0.79475983], accuracy=0.62
[*] predict with lsvcl2 model




precision=[0.5        0.25949367 1.        ], recall=[0.04347826 0.97619048 0.22916667], f1=[0.08       0.41       0.37288136], accuracy=0.35
[*] predict with rf model
precision=[0.66666667 0.53658537 0.71311475], recall=[0.30434783 0.52380952 0.90625   ], f1=[0.41791045 0.53012048 0.79816514], accuracy=0.67
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished


precision=[0.76190476 0.55       0.70731707], recall=[0.34782609 0.52380952 0.90625   ], f1=[0.47761194 0.53658537 0.79452055], accuracy=0.68
[*] predict with xgb model
precision=[0.64285714 0.46808511 0.69105691], recall=[0.19565217 0.52380952 0.88541667], f1=[0.3        0.49438202 0.77625571], accuracy=0.63
rec:['Indy500-2019', 1128, 184, '+:96,0:42,-:46', 0.22826086956521738, 0.4673913043478261, 0.30978260869565216, 0.6793478260869565, 0.625, 0.3532608695652174, 0.6684782608695652, 0.6793478260869565, 0.6304347826086957]


In [7]:
# Display the per-event results table.
df_event

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lsvc,lsvcl2,rf,lrbias,xgb
0,Indy500-2013,1146,166,"+:76,0:34,-:56",0.204819,0.46988,0.349398,0.686747,0.698795,0.662651,0.704819,0.668675,0.656627
0,Indy500-2014,1117,195,"+:108,0:43,-:44",0.220513,0.45641,0.384615,0.641026,0.687179,0.620513,0.666667,0.641026,0.589744
0,Indy500-2015,1145,167,"+:74,0:41,-:52",0.245509,0.42515,0.269461,0.550898,0.568862,0.54491,0.60479,0.562874,0.610778
0,Indy500-2016,1086,226,"+:84,0:44,-:98",0.19469,0.402655,0.345133,0.535398,0.588496,0.460177,0.570796,0.544248,0.610619
0,Indy500-2017,1097,215,"+:64,0:76,-:75",0.353488,0.386047,0.334884,0.381395,0.423256,0.497674,0.544186,0.381395,0.506977
0,Indy500-2018,1153,159,"+:92,0:44,-:23",0.27673,0.459119,0.396226,0.704403,0.735849,0.704403,0.679245,0.691824,0.672956
0,Indy500-2019,1128,184,"+:96,0:42,-:46",0.228261,0.467391,0.309783,0.679348,0.625,0.353261,0.668478,0.679348,0.630435


In [8]:
# Inspect the rows of car number 12 in the event with id 5.
stagedata[(stagedata['eventid']==5) & (stagedata['car_number']==12)]

Unnamed: 0.1,Unnamed: 0,target,eventid,car_number,stageid,firststage,pit_in_caution,start_position,start_rank,start_rank_ratio,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
1002,1172,6,5,12,1,1,0,3,4,0.121212,...,8.527139,2,32,66.10705,0,0,0,0,0,0
1003,1173,-2,5,12,2,1,1,3,6,0.181818,...,11.242669,2,18,61.85245,0,0,0,0,0,0
1004,1174,8,5,12,3,1,0,3,1,0.030303,...,23.296174,2,44,117.01525,0,0,0,0,0,0
1005,1175,5,5,12,4,1,0,3,2,0.060606,...,20.554273,2,35,59.24225,0,0,0,0,0,0


In [9]:
### fix train
# Fixed-training-set experiment: the models are fitted on the events other
# than Indy500-2018 and Indy500-2019, and tested on 2018 and on 2019
# separately.
#load data
# NOTE: _predictlen is not set in this cell; the value defined by the earlier
# load cell is reused when building the file name.
_trim = 0
_include_final = False
include_str = '1' if _include_final else '0'
suffix = f'indy500-2013-2019-end{include_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}-l{_predictlen}.csv'
stagedata = pd.read_csv(output_file)
stagedata.fillna(0, inplace=True)
#stagedata.info()

# Result table: run description columns followed by one accuracy column per
# classifier.
cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(classifiers)
print('cols:%s'%cols)
retdf = pd.DataFrame([],columns=cols)


events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
# Map event name -> position in eventsname, used below as the event id.
events_id={key:idx for idx, key in enumerate(eventsname)}

#first
# Test on 2018: the 2019 rows are removed up front, so the training part of
# the split holds every event except 2018 and 2019.
eventid = events_id['Indy500-2018']
ignore_eventid = events_id['Indy500-2019']
stdata_2018 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

# These names are the globals that classifier_model() reads.
train, test, train_x, train_y, test_x, test_y = split_by_eventid(stdata_2018, eventid)
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
#print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
#      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

#record
rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

# One accuracy per classifier, in the order of `classifiers`.
acc = [0 for x in range(len(classifiers))]
for idx, clf in enumerate(classifiers):
    acc[idx] = classifier_model(clf)

rec.extend(acc)
print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec],columns=cols)
retdf = pd.concat([retdf, df])


# Second run: test on 2019, with the 2018 rows removed.
eventid = events_id['Indy500-2019']
ignore_eventid = events_id['Indy500-2018']
stdata_2019 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

# Only test_x/test_y are rebound here. The training part of this split goes
# to train2/train_x2/train_y2, which nothing below uses, so classifier_model()
# still fits on train_x/train_y from the first split. This appears to be the
# intent of the "fix train" title: same training set, different test event.
train2, test2, train_x2, train_y2, test_x, test_y = split_by_eventid(stdata_2019, eventid)
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

#record
# trainsize is taken from train_x, i.e. the training set of the first split.
rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

acc = [0 for x in range(len(classifiers))]
for idx, clf in enumerate(classifiers):
    acc[idx] = classifier_model(clf)

rec.extend(acc)
print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec],columns=cols)
retdf = pd.concat([retdf, df])
# Display the two-row result table.
retdf

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
precision=[0.         0.27672956 0.        ], recall=[0. 1. 0.], f1=[0.         0.43349754 0.        ], accuracy=0.28
[*] predict with avgrank model
precision=[0.22857143 0.1        0.56140351], recall=[0.34782609 0.02272727 0.69565217], f1=[0.27586207 0.03703704 0.62135922], accuracy=0.46
[*] predict with dice model
precision=[0.18965517 0.28125    0.60869565], recall=[0.47826087 0.20454545 0.45652174], f1=[0.27160494 0.23684211 0.52173913], accuracy=0.39
[*] predict with lr model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.30612245 0.73333333 0.84210526], recall=[0.65217391 0.25       0.86956522], f1=[0.41666667 0.37288136 0.85561497], accuracy=0.67
[*] predict with lsvc model
precision=[0.40540541 0.73076923 0.875     ], recall=[0.65217391 0.43181818 0.91304348], f1=[0.5        0.54285714 0.89361702], accuracy=0.74
[*] predict with lsvcl2 model




precision=[0.66666667 0.61111111 0.83333333], recall=[0.08695652 0.75       0.92391304], f1=[0.15384615 0.67346939 0.87628866], accuracy=0.75
[*] predict with rf model
precision=[0.34       0.66666667 0.84615385], recall=[0.73913043 0.27272727 0.83695652], f1=[0.46575342 0.38709677 0.84153005], accuracy=0.67
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.7s finished


precision=[0.33333333 0.76190476 0.83870968], recall=[0.65217391 0.36363636 0.84782609], f1=[0.44117647 0.49230769 0.84324324], accuracy=0.69
[*] predict with xgb model
precision=[0.34       0.52380952 0.84090909], recall=[0.73913043 0.25       0.80434783], f1=[0.46575342 0.33846154 0.82222222], accuracy=0.64
rec:['Indy500-2018', 969, 159, '+:92,0:44,-:23', 0.27672955974842767, 0.4591194968553459, 0.389937106918239, 0.6666666666666666, 0.7421383647798742, 0.7547169811320755, 0.6666666666666666, 0.6855345911949685, 0.6415094339622641]
Testset = Indy500-2019
[*] predict with currank model
precision=[0.         0.22826087 0.        ], recall=[0. 1. 0.], f1=[0.         0.37168142 0.        ], accuracy=0.23
[*] predict with avgrank model
precision=[0.40909091 0.14285714 0.52380952], recall=[0.39130435 0.04761905 0.6875    ], f1=[0.4        0.07142857 0.59459459], accuracy=0.47
[*] predict with dice model
precision=[0.18181818 0.20512821 0.46835443], recall=[0.26086957 0.19047619 0.38541667]

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.54237288 0.56521739 0.78431373], recall=[0.69565217 0.30952381 0.83333333], f1=[0.60952381 0.4        0.80808081], accuracy=0.68
[*] predict with lsvc model




precision=[0.6        0.51724138 0.704     ], recall=[0.39130435 0.35714286 0.91666667], f1=[0.47368421 0.42253521 0.79638009], accuracy=0.66
[*] predict with lsvcl2 model




precision=[1.         1.         0.53038674], recall=[0.04347826 0.02380952 1.        ], f1=[0.08333333 0.04651163 0.69314079], accuracy=0.54
[*] predict with rf model
precision=[0.56756757 0.51282051 0.76851852], recall=[0.45652174 0.47619048 0.86458333], f1=[0.5060241  0.49382716 0.81372549], accuracy=0.67
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.7s finished


precision=[0.6122449  0.51851852 0.78703704], recall=[0.65217391 0.33333333 0.88541667], f1=[0.63157895 0.4057971  0.83333333], accuracy=0.70
[*] predict with xgb model
precision=[0.5        0.51428571 0.68376068], recall=[0.34782609 0.42857143 0.83333333], f1=[0.41025641 0.46753247 0.75117371], accuracy=0.62
rec:['Indy500-2019', 969, 184, '+:96,0:42,-:46', 0.22826086956521738, 0.4673913043478261, 0.30978260869565216, 0.6793478260869565, 0.657608695652174, 0.5380434782608695, 0.6739130434782609, 0.7010869565217391, 0.6195652173913043]


Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lsvc,lsvcl2,rf,lrbias,xgb
0,Indy500-2018,969,159,"+:92,0:44,-:23",0.27673,0.459119,0.389937,0.666667,0.742138,0.754717,0.666667,0.685535,0.641509
0,Indy500-2019,969,184,"+:96,0:42,-:46",0.228261,0.467391,0.309783,0.679348,0.657609,0.538043,0.673913,0.701087,0.619565


In [10]:
# Write the current result table to CSV, formatting floats with three decimals.
retdf.to_csv(f'stint_classifier_result_t2013-2017_t{_trim}-l{_predictlen}.csv', float_format='%.3f')