### stage_model_classifier

base: 14./stage_model_classifier_withneighbor-newfeatures

prediction models of sign classifiers on stage dataset

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import xgboost as xgb


In [3]:
# bulid regression model
#classifiers = ['currank','avgrank','dice','lr','lrl1','lsvc','lsvcl2','rf','lrbias','xgb']
classifiers = ['currank','avgrank','dice','lr','lsvc','lsvcl2','rf','lrbias','xgb']
def get_classifier(classifier = 'lr'):
    
    class_weight = None
    
    if classifier == "lsvc":
        clf = LinearSVC(penalty='l1',dual=False, tol=1e-3, class_weight=class_weight )
    elif classifier == "lsvcl2":
        clf = LinearSVC(penalty='l2', tol=1e-4, class_weight=class_weight)
    elif classifier == 'rf':
        #clf = RandomForestClassifier(n_estimators=100, n_jobs=4,criterion='entropy', min_samples_split=1,class_weight = class_weight)
        clf = RandomForestClassifier(n_estimators=100, n_jobs=-1,criterion='entropy', class_weight = class_weight)
    elif classifier == 'lr':
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = False, verbose = 0)
    elif classifier == 'lrbias':
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = True, verbose = 1)
    elif classifier == 'lrl1':
        clf = LogisticRegression(class_weight = class_weight, penalty='l1',n_jobs=-1)
    elif classifier == 'xgb':
        clf = xgb.XGBClassifier(booster = 'gbtree', nthread = -1, subsample = 1, 
                                n_estimators = 600, colsample_bytree = 1, max_depth = 6, min_child_weight = 1)
    elif classifier == 'dice':
        clf = RandomDice('1234')
    elif classifier == 'currank':
        clf = CurRank()
    elif classifier == 'avgrank':
        clf = AverageRank()        
    else:
        clf = None
        
    return clf


class CurRank():
    """
    predict with current rank
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = [0 for x in range(test_x.shape[0])]
        return np.array(pred_y)
    
class AverageRank():
    """
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    change_in_rank_all = test[:,15]
    pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in change_in_rank_all])
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            #13, change_in_rank_all
            pred_y.append(x[13])
        pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in pred_y])
        return np.array(pred_y_avg)   

class RandomDice():
    """
    a random dice model
    """
    def __init__(self, seed='1234'):
        self.dist = []
        self.val = []
        random.seed(seed)
    
    def fit(self, x, y):
        total = y.shape[0]
        yval = set(y)
        
        ratio = 0.
        for val in yval:
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)
            
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            dice = random.random()
            #search in self.dist
            find_idx = -1
            for idx, ratio in enumerate(self.dist):
                if dice <= ratio:
                    find_idx = idx
                    break
            
            #or the last one match
            pred_y.append(self.val[find_idx])
            
        return np.array(pred_y)

def evaluate(test_y, pred_y):
    precision = metrics.precision_score(test_y, pred_y, average=None) 
    recall = metrics.recall_score(test_y, pred_y, average=None)
    f1 = metrics.f1_score(test_y, pred_y, average=None)
    accuracy = metrics.accuracy_score(test_y, pred_y)
    print('precision=%s, recall=%s, f1=%s, accuracy=%.2f'%(precision,recall, f1, accuracy))
    return accuracy
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    split by eventid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['eventid'] != eventid].to_numpy()
    test  = stagedata[stagedata['eventid'] == eventid].to_numpy()

    #2:car_number
    train_x = train[:,2:]
    train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    test_x = test[:,2:]
    test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    
    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    split by stageid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['stageid'] <= stageid].to_numpy()
    test  = stagedata[stagedata['stageid'] > stageid].to_numpy()

    train_x = train[:,2:]
    train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    test_x = test[:,2:]
    test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    
    return train, test, train_x, train_y, test_x, test_y


### baseline
def baseline_model():
    #1. predict with current rank, rankchg = 0
    print('[*] predict with current rank, rankchg = 0')
    pred_y_simple = np.zeros_like(test_y)
    score1 = evaluate(test_y, pred_y_simple)

    #2. predict with average rankchg (change_in_rank_all):idx = 15
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    change_in_rank_all = test[:,15]
    pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in change_in_rank_all])
    score2 = evaluate(test_y, pred_y_avg)
    return score1, score2

def classifier_model(name='lr'):
    ### test learning models
    print('[*] predict with %s model'%name)
    clf = get_classifier(name)
    clf.fit(train_x, train_y)

    pred_y = clf.predict(test_x)
    score = evaluate(test_y, pred_y)
    return score

In [4]:
#load data
_trim = 2
_include_final = False
include_str = '1' if _include_final else '0'
suffix = f'indy500-2013-2019-end{include_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}.csv'
stagedata = pd.read_csv(output_file)


stagedata.fillna(0, inplace=True)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 35 columns):
Unnamed: 0                   1313 non-null int64
target                       1313 non-null int64
eventid                      1313 non-null int64
car_number                   1313 non-null int64
stageid                      1313 non-null int64
firststage                   1313 non-null int64
pit_in_caution               1313 non-null int64
start_position               1313 non-null int64
start_rank                   1313 non-null int64
start_rank_ratio             1313 non-null float64
top_pack                     1313 non-null int64
bottom_pack                  1313 non-null int64
average_rank                 1313 non-null float64
average_rank_all             1313 non-null float64
change_in_rank               1313 non-null int64
change_in_rank_all           1313 non-null float64
rate_of_change               1313 non-null int64
rate_of_change_all           1313 non-null float64
l

In [5]:
stagedata.head(5)

Unnamed: 0.1,Unnamed: 0,target,eventid,car_number,stageid,firststage,pit_in_caution,start_position,start_rank,start_rank_ratio,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
0,1,0,0,1,1,1,0,7,3,0.090909,...,23.559273,29,29,66.0815,2,-1,0,2,2,0
1,2,0,0,1,2,1,1,7,3,0.090909,...,24.168072,27,25,62.0677,0,-2,0,0,4,0
2,3,0,0,1,3,1,0,7,3,0.090909,...,22.044163,32,30,91.23935,-3,-8,0,-8,3,0
3,4,-1,0,1,4,1,0,7,3,0.090909,...,19.526487,32,30,61.19415,-2,-1,0,0,-5,-3
4,5,1,0,1,5,1,0,7,2,0.060606,...,17.837355,30,28,60.8541,-2,0,0,1,0,-3


In [6]:
cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(classifiers)
print('cols:%s'%cols)
retdf = pd.DataFrame([],columns=cols)


events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(eventsname)}


for eventid in events:
    print('Testset = %s'%eventsname[eventid])
    
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
    #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
    #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))
    
    #record
    rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    
    acc = [0 for x in range(len(classifiers))]
    for idx, clf in enumerate(classifiers):
        acc[idx] = classifier_model(clf)

    rec.extend(acc)
    print('rec:%s'%rec)
    
    #new df
    df = pd.DataFrame([rec],columns=cols)
    retdf = pd.concat([retdf, df])        
    
retdf.to_csv('crossvalid_stagedata_splitbyevent%s.csv'%suffix)
df_event = retdf

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2013
[*] predict with currank model
precision=[0.        0.1746988 0.       ], recall=[0. 1. 0.], f1=[0.        0.2974359 0.       ], accuracy=0.17
[*] predict with avgrank model
precision=[0.42201835 0.25       0.20408163], recall=[0.56790123 0.06896552 0.17857143], f1=[0.48421053 0.10810811 0.19047619], accuracy=0.35
[*] predict with dice model
precision=[0.52173913 0.26666667 0.36363636], recall=[0.59259259 0.27586207 0.28571429], f1=[0.55491329 0.27118644 0.32      ], accuracy=0.43
[*] predict with lr model


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.625      0.         0.44871795], recall=[0.67901235 0.         0.625     ], f1=[0.65088757 0.         0.52238806], accuracy=0.54
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.63333333 0.         0.46052632], recall=[0.7037037 0.        0.625    ], f1=[0.66666667 0.         0.53030303], accuracy=0.55
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[1.         0.         0.33939394], recall=[0.01234568 0.         1.        ], f1=[0.02439024 0.         0.50678733], accuracy=0.34
[*] predict with rf model
precision=[0.62857143 1.         0.55      ], recall=[0.81481481 0.03448276 0.58928571], f1=[0.70967742 0.06666667 0.56896552], accuracy=0.60
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.5s finished
  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.62921348 0.         0.45454545], recall=[0.69135802 0.         0.625     ], f1=[0.65882353 0.         0.52631579], accuracy=0.55
[*] predict with xgb model
precision=[0.65116279 0.33333333 0.51470588], recall=[0.69135802 0.13793103 0.625     ], f1=[0.67065868 0.19512195 0.56451613], accuracy=0.57
rec:['Indy500-2013', 1147, 166, '+:56,0:29,-:81', 0.1746987951807229, 0.3493975903614458, 0.43373493975903615, 0.5421686746987951, 0.5542168674698795, 0.3433734939759036, 0.6024096385542169, 0.5481927710843374, 0.572289156626506]
Testset = Indy500-2014
[*] predict with currank model
precision=[0.         0.14871795 0.        ], recall=[0. 1. 0.], f1=[0.         0.25892857 0.        ], accuracy=0.15
[*] predict with avgrank model
precision=[0.51694915 0.         0.25396825], recall=[0.56481481 0.         0.27586207], f1=[0.53982301 0.         0.26446281], accuracy=0.39
[*] predict with dice model
precision=[0.60952381 0.11764706 0.33928571], recall=[0.59259259 0.13793103 0.32758621

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.66141732 0.         0.44117647], recall=[0.77777778 0.         0.51724138], f1=[0.71489362 0.         0.47619048], accuracy=0.58
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.61016949 0.         0.37662338], recall=[0.66666667 0.         0.5       ], f1=[0.63716814 0.         0.42962963], accuracy=0.52
[*] predict with lsvcl2 model




precision=[0.6875     0.34782609 0.48333333], recall=[0.71296296 0.27586207 0.5       ], f1=[0.7        0.30769231 0.49152542], accuracy=0.58
[*] predict with rf model
precision=[0.61038961 0.15384615 0.36363636], recall=[0.43518519 0.27586207 0.4137931 ], f1=[0.50810811 0.19753086 0.38709677], accuracy=0.41
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished
  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.65648855 0.         0.453125  ], recall=[0.7962963 0.        0.5      ], f1=[0.71966527 0.         0.47540984], accuracy=0.59
[*] predict with xgb model
precision=[0.71111111 0.1686747  0.3880597 ], recall=[0.2962963  0.48275862 0.44827586], f1=[0.41830065 0.25       0.416     ], accuracy=0.37
rec:['Indy500-2014', 1118, 195, '+:58,0:29,-:108', 0.14871794871794872, 0.39487179487179486, 0.4461538461538462, 0.5846153846153846, 0.517948717948718, 0.5846153846153846, 0.40512820512820513, 0.5897435897435898, 0.36923076923076925]
Testset = Indy500-2015
[*] predict with currank model
precision=[0.         0.15568862 0.        ], recall=[0. 1. 0.], f1=[0.         0.26943005 0.        ], accuracy=0.16
[*] predict with avgrank model
precision=[0.47191011 0.33333333 0.26984127], recall=[0.53846154 0.19230769 0.26984127], f1=[0.50299401 0.24390244 0.26984127], accuracy=0.38
[*] predict with dice model
precision=[0.45652174 0.06666667 0.4       ], recall=[0.53846154 0.07692308 0.2857142

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.52238806 0.         0.63636364], recall=[0.8974359  0.         0.33333333], f1=[0.66037736 0.         0.4375    ], accuracy=0.54
[*] predict with lsvc model




precision=[0.5530303 0.        0.6969697], recall=[0.93589744 0.         0.36507937], f1=[0.6952381  0.         0.47916667], accuracy=0.57
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.47852761 0.         1.        ], recall=[1.         0.         0.06349206], f1=[0.6473029  0.         0.11940299], accuracy=0.49
[*] predict with rf model
precision=[0.55645161 0.14285714 0.63888889], recall=[0.88461538 0.03846154 0.36507937], f1=[0.68316832 0.06060606 0.46464646], accuracy=0.56
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished
  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.51851852 0.         0.625     ], recall=[0.8974359  0.         0.31746032], f1=[0.657277   0.         0.42105263], accuracy=0.54
[*] predict with xgb model
precision=[0.57731959 0.17391304 0.59574468], recall=[0.71794872 0.15384615 0.44444444], f1=[0.64       0.16326531 0.50909091], accuracy=0.53
rec:['Indy500-2015', 1146, 167, '+:63,0:26,-:78', 0.15568862275449102, 0.38323353293413176, 0.3712574850299401, 0.5449101796407185, 0.5748502994011976, 0.49101796407185627, 0.5568862275449101, 0.5389221556886228, 0.5269461077844312]
Testset = Indy500-2016
[*] predict with currank model
precision=[0.         0.15929204 0.        ], recall=[0. 1. 0.], f1=[0.         0.27480916 0.        ], accuracy=0.16
[*] predict with avgrank model
precision=[0.42519685 0.13333333 0.28571429], recall=[0.48648649 0.05555556 0.30379747], f1=[0.45378151 0.07843137 0.29447853], accuracy=0.35
[*] predict with dice model
precision=[0.504      0.19512195 0.4       ], recall=[0.56756757 0.22222222 0.30379



precision=[0.58682635 0.17647059 0.78571429], recall=[0.88288288 0.08333333 0.41772152], f1=[0.70503597 0.11320755 0.54545455], accuracy=0.59
[*] predict with lsvcl2 model




precision=[0.         0.1978022  0.44360902], recall=[0.         0.5        0.74683544], f1=[0.         0.28346457 0.55660377], accuracy=0.34
[*] predict with rf model
precision=[0.58227848 0.1        0.53448276], recall=[0.82882883 0.02777778 0.39240506], f1=[0.68401487 0.04347826 0.45255474], accuracy=0.55
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.56875    0.33333333 0.6875    ], recall=[0.81981982 0.16666667 0.41772152], f1=[0.67158672 0.22222222 0.51968504], accuracy=0.58
[*] predict with xgb model
precision=[0.536      0.125      0.44155844], recall=[0.6036036  0.08333333 0.43037975], f1=[0.56779661 0.1        0.43589744], accuracy=0.46
rec:['Indy500-2016', 1087, 226, '+:79,0:36,-:111', 0.1592920353982301, 0.35398230088495575, 0.42035398230088494, 0.5663716814159292, 0.5929203539823009, 0.3407079646017699, 0.5486725663716814, 0.5752212389380531, 0.46017699115044247]
Testset = Indy500-2017
[*] predict with currank model
precision=[0.        0.2744186 0.       ], recall=[0. 1. 0.], f1=[0.         0.43065693 0.        ], accuracy=0.27
[*] predict with avgrank model
precision=[0.39007092 0.3125     0.27586207], recall=[0.59782609 0.08474576 0.25      ], f1=[0.472103   0.13333333 0.26229508], accuracy=0.35
[*] predict with dice model
precision=[0.43697479 0.29411765 0.24193548], recall=[0.56521739 0.16949153 0.234375 

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.3968254  0.21052632 0.47368421], recall=[0.27173913 0.40677966 0.28125   ], f1=[0.32258065 0.27745665 0.35294118], accuracy=0.31
[*] predict with lsvc model




precision=[0.38961039 0.20588235 0.5       ], recall=[0.32608696 0.3559322  0.28125   ], f1=[0.35502959 0.26086957 0.36      ], accuracy=0.32
[*] predict with lsvcl2 model




precision=[0.45454545 0.26612903 0.33333333], recall=[0.10869565 0.55932203 0.359375  ], f1=[0.1754386  0.36065574 0.34586466], accuracy=0.31
[*] predict with rf model
precision=[0.48       0.23076923 0.51923077], recall=[0.7826087  0.05084746 0.421875  ], f1=[0.59504132 0.08333333 0.46551724], accuracy=0.47
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.38596491 0.20338983 0.475     ], recall=[0.23913043 0.40677966 0.296875  ], f1=[0.29530201 0.27118644 0.36538462], accuracy=0.30
[*] predict with xgb model
precision=[0.5        0.33333333 0.51785714], recall=[0.68478261 0.18644068 0.453125  ], f1=[0.57798165 0.23913043 0.48333333], accuracy=0.48
rec:['Indy500-2017', 1098, 215, '+:64,0:59,-:92', 0.2744186046511628, 0.35348837209302325, 0.3581395348837209, 0.3116279069767442, 0.3209302325581395, 0.30697674418604654, 0.4744186046511628, 0.3023255813953488, 0.4790697674418605]
Testset = Indy500-2018
[*] predict with currank model
precision=[0.        0.1572327 0.       ], recall=[0. 1. 0.], f1=[0.         0.27173913 0.        ], accuracy=0.16
[*] predict with avgrank model
precision=[0.39047619 0.25       0.21052632], recall=[0.58571429 0.16       0.125     ], f1=[0.46857143 0.19512195 0.15686275], accuracy=0.33
[*] predict with dice model
precision=[0.50561798 0.17857143 0.5       ], recall=[0.64285714 0.2        0.328125  ]

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.5505618  1.         0.56521739], recall=[0.7      0.04     0.609375], f1=[0.6163522  0.07692308 0.58646617], accuracy=0.56
[*] predict with lsvc model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.57608696 0.         0.56716418], recall=[0.75714286 0.         0.59375   ], f1=[0.65432099 0.         0.58015267], accuracy=0.57
[*] predict with lsvcl2 model


  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.         0.38461538 0.41780822], recall=[0.       0.2      0.953125], f1=[0.         0.26315789 0.58095238], accuracy=0.42
[*] predict with rf model
precision=[0.56989247 0.2        0.57377049], recall=[0.75714286 0.04       0.546875  ], f1=[0.65030675 0.06666667 0.56      ], accuracy=0.56
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished
  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.54545455 0.         0.54929577], recall=[0.68571429 0.         0.609375  ], f1=[0.60759494 0.         0.57777778], accuracy=0.55
[*] predict with xgb model
precision=[0.56       0.26315789 0.6       ], recall=[0.6      0.2      0.609375], f1=[0.57931034 0.22727273 0.60465116], accuracy=0.54
rec:['Indy500-2018', 1154, 159, '+:64,0:25,-:70', 0.15723270440251572, 0.3333333333333333, 0.44654088050314467, 0.559748427672956, 0.5723270440251572, 0.41509433962264153, 0.559748427672956, 0.5471698113207547, 0.5408805031446541]


IndexError: list index out of range

In [None]:
df_event

In [None]:
stagedata[(stagedata['eventid']==5) & (stagedata['car_number']==12)]

In [7]:
### fix train
#load data
_trim = 2
_include_final = False
include_str = '1' if _include_final else '0'
suffix = f'indy500-2013-2019-end{include_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}.csv'
stagedata = pd.read_csv(output_file)
stagedata.fillna(0, inplace=True)
#stagedata.info()

cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(classifiers)
print('cols:%s'%cols)
retdf = pd.DataFrame([],columns=cols)


events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(eventsname)}

#first 
eventid = events_id['Indy500-2018']
ignore_eventid = events_id['Indy500-2019']
stdata_2018 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

train, test, train_x, train_y, test_x, test_y = split_by_eventid(stdata_2018, eventid)
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
#print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
#      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

#record
rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

acc = [0 for x in range(len(classifiers))]
for idx, clf in enumerate(classifiers):
    acc[idx] = classifier_model(clf)

rec.extend(acc)
print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec],columns=cols)
retdf = pd.concat([retdf, df])        


eventid = events_id['Indy500-2019']
ignore_eventid = events_id['Indy500-2018']
stdata_2019 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

train2, test2, train_x2, train_y2, test_x, test_y = split_by_eventid(stdata_2019, eventid)
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

#record
rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

acc = [0 for x in range(len(classifiers))]
for idx, clf in enumerate(classifiers):
    acc[idx] = classifier_model(clf)

rec.extend(acc)
print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec],columns=cols)
retdf = pd.concat([retdf, df]) 
retdf

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
precision=[0.        0.1572327 0.       ], recall=[0. 1. 0.], f1=[0.         0.27173913 0.        ], accuracy=0.16
[*] predict with avgrank model
precision=[0.39047619 0.25       0.21052632], recall=[0.58571429 0.16       0.125     ], f1=[0.46857143 0.19512195 0.15686275], accuracy=0.33
[*] predict with dice model
precision=[0.50561798 0.17857143 0.5       ], recall=[0.64285714 0.2        0.328125  ], f1=[0.56603774 0.18867925 0.39622642], accuracy=0.45
[*] predict with lr model
precision=[0.58666667 1.         0.55421687], recall=[0.62857143 0.04       0.71875   ], f1=[0.60689655 0.07692308 0.62585034], accuracy=0.57
[*] predict with lsvc model




precision=[0.61971831 1.         0.55172414], recall=[0.62857143 0.04       0.75      ], f1=[0.62411348 0.07692308 0.63576159], accuracy=0.58
[*] predict with lsvcl2 model




precision=[0.46808511 0.3125     0.5       ], recall=[0.94285714 0.2        0.015625  ], f1=[0.62559242 0.24390244 0.03030303], accuracy=0.45
[*] predict with rf model


  _warn_prf(average, modifier, msg_start, len(result))
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


precision=[0.58823529 0.         0.58108108], recall=[0.71428571 0.         0.671875  ], f1=[0.64516129 0.         0.62318841], accuracy=0.58
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.59722222 0.5        0.55294118], recall=[0.61428571 0.04       0.734375  ], f1=[0.6056338  0.07407407 0.63087248], accuracy=0.57
[*] predict with xgb model
precision=[0.56060606 0.17647059 0.56578947], recall=[0.52857143 0.12       0.671875  ], f1=[0.54411765 0.14285714 0.61428571], accuracy=0.52
rec:['Indy500-2018', 969, 159, '+:64,0:25,-:70', 0.15723270440251572, 0.3333333333333333, 0.44654088050314467, 0.5723270440251572, 0.5849056603773585, 0.4528301886792453, 0.5849056603773585, 0.5723270440251572, 0.5220125786163522]
Testset = Indy500-2019
[*] predict with currank model
precision=[0.         0.21621622 0.        ], recall=[0. 1. 0.], f1=[0.         0.35555556 0.        ], accuracy=0.22
[*] predict with avgrank model
precision=[0.46956522 0.41666667 0.22413793], recall=[0.63529412 0.125      0.21666667], f1=[0.54       0.19230769 0.22033898], accuracy=0.39
[*] predict with dice model
precision=[0.47572816 0.21875    0.36      ], recall=[0.57647059 0.175      0.3      

  _warn_prf(average, modifier, msg_start, len(result))


precision=[0.57575758 0.25       0.42608696], recall=[0.44705882 0.025      0.81666667], f1=[0.50331126 0.04545455 0.56      ], accuracy=0.48
[*] predict with lsvc model




precision=[0.52542373 0.44444444 0.41880342], recall=[0.36470588 0.1        0.81666667], f1=[0.43055556 0.16326531 0.55367232], accuracy=0.45
[*] predict with lsvcl2 model




precision=[0.59701493 0.28888889 0.47945205], recall=[0.47058824 0.325      0.58333333], f1=[0.52631579 0.30588235 0.52631579], accuracy=0.48
[*] predict with rf model
precision=[0.57281553 0.         0.49382716], recall=[0.69411765 0.         0.66666667], f1=[0.62765957 0.         0.56737589], accuracy=0.54
[*] predict with lrbias model


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.4s finished


precision=[0.57575758 0.16666667 0.42477876], recall=[0.44705882 0.025      0.8       ], f1=[0.50331126 0.04347826 0.55491329], accuracy=0.47
[*] predict with xgb model
precision=[0.53191489 0.33333333 0.45882353], recall=[0.58823529 0.05       0.65      ], f1=[0.55865922 0.08695652 0.53793103], accuracy=0.49
rec:['Indy500-2019', 969, 185, '+:60,0:40,-:85', 0.21621621621621623, 0.3891891891891892, 0.4, 0.4756756756756757, 0.4540540540540541, 0.4756756756756757, 0.5351351351351351, 0.4702702702702703, 0.4918918918918919]


Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lsvc,lsvcl2,rf,lrbias,xgb
0,Indy500-2018,969,159,"+:64,0:25,-:70",0.157233,0.333333,0.446541,0.572327,0.584906,0.45283,0.584906,0.572327,0.522013
0,Indy500-2019,969,185,"+:60,0:40,-:85",0.216216,0.389189,0.4,0.475676,0.454054,0.475676,0.535135,0.47027,0.491892


In [8]:
retdf.to_csv('stint_classifier_result_t2013-2017.csv', float_format='%.3f')