### stage_model_classifier

prediction models of sign classifiers on stage dataset

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import xgboost as xgb


In [3]:
# bulid regression model
#classifiers = ['currank','avgrank','dice','lr','lrl1','lsvc','lsvcl2','rf','lrbias','xgb']
classifiers = ['currank','avgrank','dice','lr','lsvc','lsvcl2','rf','lrbias','xgb']
def get_classifier(classifier = 'lr'):
    
    class_weight = None
    
    if classifier == "lsvc":
        clf = LinearSVC(penalty='l1',dual=False, tol=1e-3, class_weight=class_weight )
    elif classifier == "lsvcl2":
        clf = LinearSVC(penalty='l2', tol=1e-4, class_weight=class_weight)
    elif classifier == 'rf':
        #clf = RandomForestClassifier(n_estimators=100, n_jobs=4,criterion='entropy', min_samples_split=1,class_weight = class_weight)
        clf = RandomForestClassifier(n_estimators=100, n_jobs=-1,criterion='entropy', class_weight = class_weight)
    elif classifier == 'lr':
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = False, verbose = 0)
    elif classifier == 'lrbias':
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = True, verbose = 1)
    elif classifier == 'lrl1':
        clf = LogisticRegression(class_weight = class_weight, penalty='l1',n_jobs=-1)
    elif classifier == 'xgb':
        clf = xgb.XGBClassifier(booster = 'gbtree', nthread = -1, subsample = 1, 
                                n_estimators = 600, colsample_bytree = 1, max_depth = 6, min_child_weight = 1)
    elif classifier == 'dice':
        clf = RandomDice('1234')
    elif classifier == 'currank':
        clf = CurRank()
    elif classifier == 'avgrank':
        clf = AverageRank()        
    else:
        clf = None
        
    return clf


class CurRank():
    """
    predict with current rank
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = [0 for x in range(test_x.shape[0])]
        return np.array(pred_y)
    
class AverageRank():
    """
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    change_in_rank_all = test[:,15]
    pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in change_in_rank_all])
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            #13, change_in_rank_all
            pred_y.append(x[13])
        pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in pred_y])
        return np.array(pred_y_avg)   

class RandomDice():
    """
    a random dice model
    """
    def __init__(self, seed='1234'):
        self.dist = []
        self.val = []
        random.seed(seed)
    
    def fit(self, x, y):
        total = y.shape[0]
        yval = set(y)
        
        ratio = 0.
        for val in yval:
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)
            
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            dice = random.random()
            #search in self.dist
            find_idx = -1
            for idx, ratio in enumerate(self.dist):
                if dice <= ratio:
                    find_idx = idx
                    break
            
            #or the last one match
            pred_y.append(self.val[find_idx])
            
        return np.array(pred_y)

def evaluate(test_y, pred_y):
    precision = metrics.precision_score(test_y, pred_y, average=None) 
    recall = metrics.recall_score(test_y, pred_y, average=None)
    f1 = metrics.f1_score(test_y, pred_y, average=None)
    accuracy = metrics.accuracy_score(test_y, pred_y)
    print('precision=%s, recall=%s, f1=%s, accuracy=%.2f'%(precision,recall, f1, accuracy))
    return accuracy
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    split by eventid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['eventid'] != eventid].to_numpy()
    test  = stagedata[stagedata['eventid'] == eventid].to_numpy()

    #2:car_number
    train_x = train[:,2:]
    train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    test_x = test[:,2:]
    test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    
    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    split by stageid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['stageid'] <= stageid].to_numpy()
    test  = stagedata[stagedata['stageid'] > stageid].to_numpy()

    train_x = train[:,2:]
    train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    test_x = test[:,2:]
    test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    
    return train, test, train_x, train_y, test_x, test_y


### baseline
def baseline_model():
    #1. predict with current rank, rankchg = 0
    print('[*] predict with current rank, rankchg = 0')
    pred_y_simple = np.zeros_like(test_y)
    score1 = evaluate(test_y, pred_y_simple)

    #2. predict with average rankchg (change_in_rank_all):idx = 15
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    change_in_rank_all = test[:,15]
    pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in change_in_rank_all])
    score2 = evaluate(test_y, pred_y_avg)
    return score1, score2

def classifier_model(name='lr'):
    ### test learning models
    print('[*] predict with %s model'%name)
    clf = get_classifier(name)
    clf.fit(train_x, train_y)

    pred_y = clf.predict(test_x)
    score = evaluate(test_y, pred_y)
    return score

In [4]:
#load data
_trim = 2
suffix='withneighbor-newfeatures'
stagedata = pd.read_csv('stage-%s-%s-t%s.csv'%('indy500-2013-2018',suffix, _trim))

stagedata.fillna(0, inplace=True)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1521 entries, 0 to 1520
Data columns (total 35 columns):
Unnamed: 0                   1521 non-null int64
target                       1521 non-null int64
eventid                      1521 non-null int64
car_number                   1521 non-null int64
stageid                      1521 non-null int64
firststage                   1521 non-null int64
pit_in_caution               1521 non-null int64
start_position               1521 non-null int64
start_rank                   1521 non-null int64
start_rank_ratio             1521 non-null float64
top_pack                     1521 non-null int64
bottom_pack                  1521 non-null int64
average_rank                 1521 non-null float64
average_rank_all             1521 non-null float64
change_in_rank               1521 non-null int64
change_in_rank_all           1521 non-null float64
rate_of_change               1521 non-null int64
rate_of_change_all           1521 non-null float64
l

In [5]:
stagedata.head(5)

Unnamed: 0.1,Unnamed: 0,target,eventid,car_number,stageid,firststage,pit_in_caution,start_position,start_rank,start_rank_ratio,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
0,0,-4,0,1,0,0,0,7,7,0.212121,...,0.0,0,0,0.0,0,0,0,0,0,0
1,1,0,0,1,1,1,0,7,3,0.090909,...,23.559273,29,29,66.0815,2,-1,0,2,2,0
2,2,0,0,1,2,1,1,7,3,0.090909,...,24.168072,27,25,62.0677,0,-2,0,0,4,0
3,3,0,0,1,3,1,0,7,3,0.090909,...,22.044163,32,30,91.23935,-3,-8,0,-8,3,0
4,4,-1,0,1,4,1,0,7,3,0.090909,...,19.526487,32,30,61.19415,-2,-1,0,0,-5,-3


In [6]:
cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(classifiers)
print('cols:%s'%cols)
retdf = pd.DataFrame([],columns=cols)


events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(eventsname)}


for eventid in events:
    print('Testset = %s'%eventsname[eventid])
    
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
    #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
    #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))
    
    #record
    rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    
    acc = [0 for x in range(len(classifiers))]
    for idx, clf in enumerate(classifiers):
        acc[idx] = classifier_model(clf)

    rec.extend(acc)
    print('rec:%s'%rec)
    
    #new df
    df = pd.DataFrame([rec],columns=cols)
    retdf = pd.concat([retdf, df])        
    
retdf.to_csv('crossvalid_stagedata_splitbyevent%s.csv'%suffix)
df_event = retdf

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2013
[*] predict with currank model
precision=[0.         0.14718615 0.        ], recall=[0. 1. 0.], f1=[0.         0.25660377 0.        ], accuracy=0.15
[*] predict with avgrank model
precision=[0.35820896 0.07317073 0.26785714], recall=[0.46153846 0.08823529 0.16129032], f1=[0.40336134 0.08       0.20134228], accuracy=0.29
[*] predict with dice model
precision=[0.43846154 0.175      0.44262295], recall=[0.54807692 0.20588235 0.29032258], f1=[0.48717949 0.18918919 0.35064935], accuracy=0.39
[*] predict with lr model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.56451613 0.         0.53398058], recall=[0.67307692 0.         0.59139785], f1=[0.61403509 0.         0.56122449], accuracy=0.54
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.59848485 0.         0.56565657], recall=[0.75961538 0.         0.60215054], f1=[0.66949153 0.         0.58333333], accuracy=0.58
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.45021645 0.         0.        ], recall=[1. 0. 0.], f1=[0.62089552 0.         0.        ], accuracy=0.45
[*] predict with rf model
precision=[0.58715596 0.28571429 0.50434783], recall=[0.61538462 0.05882353 0.62365591], f1=[0.60093897 0.09756098 0.55769231], accuracy=0.54
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.6s finished


precision=[0.56097561 0.         0.53398058], recall=[0.66346154 0.         0.59139785], f1=[0.60792952 0.         0.56122449], accuracy=0.54
[*] predict with xgb model
precision=[0.53982301 0.16666667 0.48113208], recall=[0.58653846 0.05882353 0.5483871 ], f1=[0.56221198 0.08695652 0.51256281], accuracy=0.49
rec:['Indy500-2013', 1290, 231, '+:93,0:34,-:104', 0.1471861471861472, 0.2857142857142857, 0.3939393939393939, 0.5411255411255411, 0.5844155844155844, 0.45021645021645024, 0.5367965367965368, 0.5367965367965368, 0.4935064935064935]
Testset = Indy500-2014
[*] predict with currank model
precision=[0.         0.16858238 0.        ], recall=[0. 1. 0.], f1=[0.         0.28852459 0.        ], accuracy=0.17
[*] predict with avgrank model
precision=[0.50359712 0.12765957 0.22666667], recall=[0.4964539  0.13636364 0.22368421], f1=[0.5        0.13186813 0.22516556], accuracy=0.36
[*] predict with dice model
precision=[0.57042254 0.14634146 0.32051282], recall=[0.57446809 0.13636364 0.328947

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.59854015 0.2        0.34453782], recall=[0.58156028 0.02272727 0.53947368], f1=[0.58992806 0.04081633 0.42051282], accuracy=0.48
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.60416667 0.         0.33333333], recall=[0.61702128 0.         0.51315789], f1=[0.61052632 0.         0.40414508], accuracy=0.48
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.55555556 0.         0.29444444], recall=[0.31914894 0.         0.69736842], f1=[0.40540541 0.         0.4140625 ], accuracy=0.38
[*] predict with rf model
precision=[0.60833333 0.17777778 0.3125    ], recall=[0.5177305  0.18181818 0.39473684], f1=[0.55938697 0.17977528 0.34883721], accuracy=0.43
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.6s finished


precision=[0.59854015 0.2        0.33613445], recall=[0.58156028 0.02272727 0.52631579], f1=[0.58992806 0.04081633 0.41025641], accuracy=0.47
[*] predict with xgb model
precision=[0.64754098 0.16666667 0.35820896], recall=[0.56028369 0.27272727 0.31578947], f1=[0.60076046 0.20689655 0.33566434], accuracy=0.44
rec:['Indy500-2014', 1260, 261, '+:76,0:44,-:141', 0.1685823754789272, 0.3563218390804598, 0.42911877394636017, 0.47509578544061304, 0.4827586206896552, 0.37547892720306514, 0.42528735632183906, 0.47126436781609193, 0.44061302681992337]
Testset = Indy500-2015
[*] predict with currank model
precision=[0.         0.16017316 0.        ], recall=[0. 1. 0.], f1=[0.        0.2761194 0.       ], accuracy=0.16
[*] predict with avgrank model
precision=[0.43925234 0.23529412 0.26027397], recall=[0.4122807  0.32432432 0.2375    ], f1=[0.42533937 0.27272727 0.24836601], accuracy=0.34
[*] predict with dice model
precision=[0.51162791 0.17948718 0.41269841], recall=[0.57894737 0.18918919 0.325 

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.61417323 0.1        0.4893617 ], recall=[0.68421053 0.02702703 0.575     ], f1=[0.6473029  0.04255319 0.52873563], accuracy=0.54
[*] predict with lsvc model




precision=[0.63087248 0.         0.58024691], recall=[0.8245614 0.        0.5875   ], f1=[0.7148289  0.         0.58385093], accuracy=0.61
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.75       0.         0.38190955], recall=[0.21052632 0.         0.95      ], f1=[0.32876712 0.         0.54480287], accuracy=0.43
[*] predict with rf model
precision=[0.55244755 0.22222222 0.46835443], recall=[0.69298246 0.05405405 0.4625    ], f1=[0.61478599 0.08695652 0.46540881], accuracy=0.51
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.6s finished


precision=[0.60483871 0.08333333 0.45263158], recall=[0.65789474 0.02702703 0.5375    ], f1=[0.6302521  0.04081633 0.49142857], accuracy=0.52
[*] predict with xgb model
precision=[0.5511811  0.06666667 0.48648649], recall=[0.61403509 0.05405405 0.45      ], f1=[0.58091286 0.05970149 0.46753247], accuracy=0.47
rec:['Indy500-2015', 1290, 231, '+:80,0:37,-:114', 0.16017316017316016, 0.33766233766233766, 0.42857142857142855, 0.5411255411255411, 0.6103896103896104, 0.4329004329004329, 0.5108225108225108, 0.5151515151515151, 0.4675324675324675]
Testset = Indy500-2016
[*] predict with currank model
precision=[0.         0.15753425 0.        ], recall=[0. 1. 0.], f1=[0.         0.27218935 0.        ], accuracy=0.16
[*] predict with avgrank model
precision=[0.39072848 0.10204082 0.2826087 ], recall=[0.42753623 0.10869565 0.24074074], f1=[0.4083045  0.10526316 0.26      ], accuracy=0.31
[*] predict with dice model
precision=[0.51219512 0.15555556 0.42168675], recall=[0.60869565 0.15217391 0.3240

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.58247423 0.24       0.65753425], recall=[0.81884058 0.13043478 0.44444444], f1=[0.68072289 0.16901408 0.53038674], accuracy=0.57
[*] predict with lsvc model




precision=[0.58851675 0.33333333 0.78873239], recall=[0.89130435 0.08695652 0.51851852], f1=[0.70893372 0.13793103 0.62569832], accuracy=0.63
[*] predict with lsvcl2 model




precision=[0.51383399 0.19354839 0.625     ], recall=[0.94202899 0.13043478 0.0462963 ], f1=[0.66496164 0.15584416 0.0862069 ], accuracy=0.48
[*] predict with rf model
precision=[0.58653846 0.16666667 0.68055556], recall=[0.88405797 0.04347826 0.4537037 ], f1=[0.70520231 0.06896552 0.54444444], accuracy=0.59
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.6s finished


precision=[0.5786802  0.22222222 0.64705882], recall=[0.82608696 0.13043478 0.40740741], f1=[0.68059701 0.16438356 0.5       ], accuracy=0.56
[*] predict with xgb model
precision=[0.56774194 0.1875     0.50413223], recall=[0.63768116 0.06521739 0.56481481], f1=[0.60068259 0.09677419 0.53275109], accuracy=0.52
rec:['Indy500-2016', 1229, 292, '+:108,0:46,-:138', 0.15753424657534246, 0.3082191780821918, 0.4315068493150685, 0.571917808219178, 0.6267123287671232, 0.4828767123287671, 0.5924657534246576, 0.5616438356164384, 0.5205479452054794]
Testset = Indy500-2017
[*] predict with currank model
precision=[0.         0.23843416 0.        ], recall=[0. 1. 0.], f1=[0.         0.38505747 0.        ], accuracy=0.24
[*] predict with avgrank model
precision=[0.39506173 0.18       0.27536232], recall=[0.5        0.13432836 0.22093023], f1=[0.44137931 0.15384615 0.24516129], accuracy=0.33
[*] predict with dice model
precision=[0.41772152 0.17948718 0.35714286], recall=[0.515625   0.10447761 0.348837

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.44155844 0.19333333 0.46296296], recall=[0.265625   0.43283582 0.29069767], f1=[0.33170732 0.26728111 0.35714286], accuracy=0.31
[*] predict with lsvc model




precision=[0.4556962  0.2        0.49019608], recall=[0.28125    0.29850746 0.58139535], f1=[0.34782609 0.23952096 0.53191489], accuracy=0.38
[*] predict with lsvcl2 model




precision=[0.6        0.17391304 0.30128205], recall=[0.046875   0.29850746 0.54651163], f1=[0.08695652 0.21978022 0.38842975], accuracy=0.26
[*] predict with rf model
precision=[0.55555556 0.33333333 0.5       ], recall=[0.7421875  0.11940299 0.5       ], f1=[0.63545151 0.17582418 0.5       ], accuracy=0.52
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.44444444 0.1986755  0.44827586], recall=[0.25       0.44776119 0.30232558], f1=[0.32       0.27522936 0.36111111], accuracy=0.31
[*] predict with xgb model
precision=[0.4969697  0.22641509 0.46031746], recall=[0.640625   0.17910448 0.3372093 ], f1=[0.55972696 0.2        0.38926174], accuracy=0.44
rec:['Indy500-2017', 1240, 281, '+:86,0:67,-:128', 0.23843416370106763, 0.3274021352313167, 0.3665480427046263, 0.31316725978647686, 0.37722419928825623, 0.2597864768683274, 0.5195729537366548, 0.31316725978647686, 0.4377224199288256]
Testset = Indy500-2018
[*] predict with currank model
precision=[0.         0.15111111 0.        ], recall=[0. 1. 0.], f1=[0.         0.26254826 0.        ], accuracy=0.15
[*] predict with avgrank model
precision=[0.3515625 0.1509434 0.25     ], recall=[0.45454545 0.23529412 0.11956522], f1=[0.39647577 0.18390805 0.16176471], accuracy=0.28
[*] predict with dice model
precision=[0.40944882 0.13157895 0.33333333], recall=[0.52525253 0.14705882 0.217391

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.60683761 0.         0.58333333], recall=[0.71717172 0.         0.68478261], f1=[0.65740741 0.         0.63      ], accuracy=0.60
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.6728972  0.         0.59322034], recall=[0.72727273 0.         0.76086957], f1=[0.69902913 0.         0.66666667], accuracy=0.63
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.45 0.   1.  ], recall=[1.         0.         0.05434783], f1=[0.62068966 0.         0.10309278], accuracy=0.46
[*] predict with rf model
precision=[0.58119658 0.         0.58490566], recall=[0.68686869 0.         0.67391304], f1=[0.62962963 0.         0.62626263], accuracy=0.58
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.6s finished
  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.62280702 0.         0.57657658], recall=[0.71717172 0.         0.69565217], f1=[0.66666667 0.         0.63054187], accuracy=0.60
[*] predict with xgb model
precision=[0.55670103 0.11764706 0.55855856], recall=[0.54545455 0.05882353 0.67391304], f1=[0.55102041 0.07843137 0.61083744], accuracy=0.52
rec:['Indy500-2018', 1296, 225, '+:92,0:34,-:99', 0.1511111111111111, 0.28444444444444444, 0.3422222222222222, 0.5955555555555555, 0.6311111111111111, 0.4622222222222222, 0.5777777777777777, 0.6, 0.5244444444444445]


In [7]:
df_event

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lsvc,lsvcl2,rf,lrbias,xgb
0,Indy500-2013,1290,231,"+:93,0:34,-:104",0.147186,0.285714,0.393939,0.541126,0.584416,0.450216,0.536797,0.536797,0.493506
0,Indy500-2014,1260,261,"+:76,0:44,-:141",0.168582,0.356322,0.429119,0.475096,0.482759,0.375479,0.425287,0.471264,0.440613
0,Indy500-2015,1290,231,"+:80,0:37,-:114",0.160173,0.337662,0.428571,0.541126,0.61039,0.4329,0.510823,0.515152,0.467532
0,Indy500-2016,1229,292,"+:108,0:46,-:138",0.157534,0.308219,0.431507,0.571918,0.626712,0.482877,0.592466,0.561644,0.520548
0,Indy500-2017,1240,281,"+:86,0:67,-:128",0.238434,0.327402,0.366548,0.313167,0.377224,0.259786,0.519573,0.313167,0.437722
0,Indy500-2018,1296,225,"+:92,0:34,-:99",0.151111,0.284444,0.342222,0.595556,0.631111,0.462222,0.577778,0.6,0.524444


In [8]:
stagedata[(stagedata['eventid']==5) & (stagedata['car_number']==12)]

Unnamed: 0.1,Unnamed: 0,target,eventid,car_number,stageid,firststage,pit_in_caution,start_position,start_rank,start_rank_ratio,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
1343,1343,1,5,12,0,0,0,3,3,0.090909,...,0.0,0,0,0.0,0,0,0,0,0,0
1344,1344,2,5,12,1,1,0,3,4,0.121212,...,8.791527,30,30,66.10705,0,-2,0,0,0,0
1345,1345,-5,5,12,2,1,1,3,6,0.181818,...,8.097341,18,16,61.85245,-3,2,1,6,3,1
1346,1346,0,5,12,3,1,0,3,1,0.030303,...,23.476106,44,42,117.01525,0,0,0,-1,-2,-7
1347,1347,0,5,12,4,1,0,3,1,0.030303,...,20.684387,35,33,59.24225,0,0,0,0,-7,2
1348,1348,0,5,12,5,1,0,3,1,0.030303,...,23.159675,42,40,59.5001,0,0,0,1,-1,0


In [12]:
### fix train
#load data
_trim = 2
suffix='withneighbor-newfeatures'
stagedata = pd.read_csv('stage-%s-%s-t%s.csv'%('indy500-2013-2019',suffix, _trim))

stagedata.fillna(0, inplace=True)
#stagedata.info()

cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(classifiers)
print('cols:%s'%cols)
retdf = pd.DataFrame([],columns=cols)


events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(eventsname)}

#first 
eventid = events_id['Indy500-2018']
ignore_eventid = events_id['Indy500-2019']
stdata_2018 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

train, test, train_x, train_y, test_x, test_y = split_by_eventid(stdata_2018, eventid)
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
#print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
#      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

#record
rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

acc = [0 for x in range(len(classifiers))]
for idx, clf in enumerate(classifiers):
    acc[idx] = classifier_model(clf)

rec.extend(acc)
print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec],columns=cols)
retdf = pd.concat([retdf, df])        


eventid = events_id['Indy500-2019']
ignore_eventid = events_id['Indy500-2018']
stdata_2019 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

train2, test2, train_x2, train_y2, test_x, test_y = split_by_eventid(stdata_2019, eventid)
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

#record
rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

acc = [0 for x in range(len(classifiers))]
for idx, clf in enumerate(classifiers):
    acc[idx] = classifier_model(clf)

rec.extend(acc)
print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec],columns=cols)
retdf = pd.concat([retdf, df]) 
retdf

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
precision=[0.         0.15111111 0.        ], recall=[0. 1. 0.], f1=[0.         0.26254826 0.        ], accuracy=0.15
[*] predict with avgrank model
precision=[0.3515625 0.1509434 0.25     ], recall=[0.45454545 0.23529412 0.11956522], f1=[0.39647577 0.18390805 0.16176471], accuracy=0.28
[*] predict with dice model
precision=[0.40944882 0.13157895 0.33333333], recall=[0.52525253 0.14705882 0.2173913 ], f1=[0.46017699 0.13888889 0.26315789], accuracy=0.34
[*] predict with lr model


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.60683761 0.         0.58333333], recall=[0.71717172 0.         0.68478261], f1=[0.65740741 0.         0.63      ], accuracy=0.60
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.6728972  0.         0.59322034], recall=[0.72727273 0.         0.76086957], f1=[0.69902913 0.         0.66666667], accuracy=0.63
[*] predict with lsvcl2 model




precision=[0.45714286 0.5        0.61538462], recall=[0.96969697 0.02941176 0.08695652], f1=[0.62135922 0.05555556 0.15238095], accuracy=0.47
[*] predict with rf model
precision=[0.59130435 0.         0.57943925], recall=[0.68686869 0.         0.67391304], f1=[0.63551402 0.         0.62311558], accuracy=0.58
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished
  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.62280702 0.         0.57657658], recall=[0.71717172 0.         0.69565217], f1=[0.66666667 0.         0.63054187], accuracy=0.60
[*] predict with xgb model
precision=[0.55670103 0.11764706 0.55855856], recall=[0.54545455 0.05882353 0.67391304], f1=[0.55102041 0.07843137 0.61083744], accuracy=0.52
rec:['Indy500-2018', 1296, 225, '+:92,0:34,-:99', 0.1511111111111111, 0.28444444444444444, 0.3422222222222222, 0.5955555555555555, 0.6311111111111111, 0.4666666666666667, 0.5777777777777777, 0.6, 0.5244444444444445]
Testset = Indy500-2019
[*] predict with currank model
precision=[0.  0.2 0. ], recall=[0. 1. 0.], f1=[0.         0.33333333 0.        ], accuracy=0.20
[*] predict with avgrank model
precision=[0.44444444 0.19148936 0.23529412], recall=[0.5  0.18 0.2 ], f1=[0.47058824 0.18556701 0.21621622], accuracy=0.34
[*] predict with dice model
precision=[0.4964539  0.19512195 0.35294118], recall=[0.58333333 0.16       0.3       ], f1=[0.53639847 0.17582418 0.32432432], accuracy=0.



precision=[0.61320755 0.75       0.42857143], recall=[0.54166667 0.06       0.75      ], f1=[0.57522124 0.11111111 0.54545455], accuracy=0.51
[*] predict with lsvcl2 model




precision=[0.76666667 0.16071429 0.44444444], recall=[0.19166667 0.36       0.6       ], f1=[0.30666667 0.22222222 0.5106383 ], accuracy=0.36
[*] predict with rf model
precision=[0.54032258 0.         0.416     ], recall=[0.55833333 0.         0.65      ], f1=[0.54918033 0.         0.50731707], accuracy=0.48
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.60483871 0.         0.4516129 ], recall=[0.625 0.    0.7  ], f1=[0.6147541  0.         0.54901961], accuracy=0.52
[*] predict with xgb model
precision=[0.536      0.25       0.42735043], recall=[0.55833333 0.04       0.625     ], f1=[0.54693878 0.06896552 0.50761421], accuracy=0.48
rec:['Indy500-2019', 1296, 250, '+:80,0:50,-:120', 0.2, 0.34, 0.408, 0.52, 0.512, 0.356, 0.476, 0.524, 0.476]


Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lsvc,lsvcl2,rf,lrbias,xgb
0,Indy500-2018,1296,225,"+:92,0:34,-:99",0.151111,0.284444,0.342222,0.595556,0.631111,0.466667,0.577778,0.6,0.524444
0,Indy500-2019,1296,250,"+:80,0:50,-:120",0.2,0.34,0.408,0.52,0.512,0.356,0.476,0.524,0.476


In [14]:
retdf.to_csv('stint_classifier_result_t2013-2017.csv', float_format='%.3f')