### stage_model_regressor

predicting target: end_rank = start_rank + change (change is the old target)

base: 14./stage_model_regressor_withneighbor-newfeatures

prediction models of chg_of_rank_in_stage on stage dataset

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math
import pickle

In [2]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.linear_model.ridge import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.svm.classes import SVR
from sklearn.utils import shuffle
from sklearn import metrics
import xgboost as xgb



In [3]:
# Build regression models.
# Names of every model compared below: three naive baselines first, then the
# learned regressors. Each name is one that get_regressor recognises.
regressors = [
    'currank', 'avgrank', 'dice',
    'lasso', 'ridge', 'rf', 'svr', 'xgb',
]
# Module-level train/test arrays: rebound through `global` by the test
# drivers and read by regressor_model.
train_x = train_y = test_x = test_y = None

def get_regressor(regressor = 'lr'):
    """
    Build a fresh, unfitted model for the given name.

    regressor: one of 'lasso', 'ridge', 'rf', 'svr', 'xgb', 'dice',
               'currank' or 'avgrank'.

    Returns the model object, or None when the name is not recognised
    (that includes the default 'lr', which has no entry).
    """
    # Constructors are wrapped in lambdas so that only the requested model is
    # actually instantiated.
    factories = (
        ("lasso", lambda: LassoCV(cv=5, random_state=0)),
        ("ridge", lambda: RidgeCV(alphas=np.logspace(-6, 6, 13))),
        ("rf", lambda: RandomForestRegressor(n_estimators=100)),
        ('svr', lambda: SVR(kernel='rbf')),
        ('xgb', lambda: xgb.XGBRegressor(objective="reg:linear", random_state=42, max_depth=3)),
        ('dice', lambda: RandomDice('1234')),
        ('currank', lambda: CurRank()),
        ('avgrank', lambda: AverageRank()),
    )
    for name, build in factories:
        if regressor == name:
            return build()

    return None


class CurRank():
    """
    Naive baseline: predicts 0 for every row, i.e. the rank does not change.
    """
    def __init__(self):
        # Stateless model.
        pass

    def fit(self, x, y):
        # Nothing to learn; present only so the class matches the fit/predict
        # interface of the other models.
        pass

    def predict(self, test_x):
        # One zero per row of test_x.
        n_rows = test_x.shape[0]
        return np.array([0] * n_rows)
    
class AverageRank():
    """
    Baseline that returns one feature column unchanged as its prediction.

    col: index (within each feature row) of the column to return.
         Defaults to 13, which the original code labelled
         'change_in_rank_all'.

    NOTE(review): with the column order printed by stagedata.info() in this
    notebook and features starting at column 2, index 13 appears to land on
    'average_rank_all' rather than 'change_in_rank_all' -- confirm the index
    against the dataset actually loaded before trusting this baseline.
    """
    def __init__(self, col=13):
        self.col = col

    def fit(self, x, y):
        # Nothing to learn; present only for the fit/predict interface.
        pass

    def predict(self, test_x):
        # Row-by-row lookup, so any sequence of indexable rows is accepted.
        return np.array([row[self.col] for row in test_x])

class RandomDice():
    """
    Random baseline: draws each prediction from the empirical distribution of
    the training targets.

    seed: value passed to random.seed(); note this seeds Python's global
          `random` generator, not a private one.
    """
    def __init__(self, seed='1234'):
        self.dist = []   # cumulative probabilities, parallel to self.val
        self.val = []    # distinct target values seen in fit
        random.seed(seed)

    def fit(self, x, y):
        """
        Build the cumulative distribution of the distinct values in y.

        x is ignored. y must be a NumPy array (its shape and elementwise
        comparison are used).
        """
        # Start from empty tables so that refitting replaces the previous
        # distribution instead of appending to it (the old entries would
        # otherwise shadow the new ones in predict).
        self.val = []
        self.dist = []

        total = y.shape[0]
        ratio = 0.
        for val in set(y):
            ratio += np.sum(y == val) * 1.0 / total
            self.val.append(val)
            self.dist.append(ratio)

    def predict(self, test_x):
        """
        Return one sampled training value per row of test_x.
        """
        pred_y = []
        for _ in test_x:
            dice = random.random()
            # First bucket whose cumulative probability covers the draw; -1
            # selects the last value when rounding leaves the final cumulative
            # probability slightly below the draw.
            find_idx = next(
                (idx for idx, ratio in enumerate(self.dist) if dice <= ratio),
                -1,
            )
            pred_y.append(self.val[find_idx])

        return np.array(pred_y)

def evaluate(test_y, pred_y):
    """
    Score predictions against the ground truth and print a one-line summary.

    test_y: true target values.
    pred_y: predicted values.

    Returns the tuple (mae, rmse, r2) -- note that MAE comes first.
    """
    mae = metrics.mean_absolute_error(test_y, pred_y)
    mse = metrics.mean_squared_error(test_y, pred_y)
    rmse = math.sqrt(mse)
    r2 = metrics.r2_score(test_y, pred_y)

    summary = 'rmse=%.2f, mae=%.2f, r2=%.2f'%(rmse, mae, r2)
    print(summary)
    return mae,rmse, r2
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventidx(stagedata, eventid):
    """
    Hold out a single event as the test set and train on all other events.

    stagedata: DataFrame with an 'eventid' column; column 1 holds the target
               and the features start at column 2.
    eventid:   id of the event used as the test set.

    Returns (train, test, train_x, train_y, test_x, test_y): train/test are
    the filtered DataFrames, the other four are NumPy arrays taken from them.

    NOTE(review): the previous body ignored `eventid` and read the undefined
    names train_events/test_events, so it raised NameError. Leave-one-event-out
    is the reading suggested by the signature -- confirm this is the intent.
    """
    is_test = stagedata['eventid'] == eventid
    train = stagedata[~is_test]
    test = stagedata[is_test]

    train_values = train.to_numpy()
    test_values = test.to_numpy()

    # Column 1 is the target; everything from column 2 onward is a feature.
    train_x = train_values[:, 2:]
    train_y = train_values[:, 1]
    test_x = test_values[:, 2:]
    test_y = test_values[:, 1]

    return train, test, train_x, train_y, test_x, test_y

def split_by_eventid(stagedata, train_events, test_events, startid=2):
    """
    Partition the stage data into train and test sets by event id.

    stagedata:    DataFrame with an 'eventid' column; column 1 holds the target.
    train_events: event ids whose rows form the training set.
    test_events:  event ids whose rows form the test set.
    startid:      index of the first column used as a feature (default 2).

    Returns (train, test, train_x, train_y, test_x, test_y): train/test are
    the filtered DataFrames, the other four are NumPy arrays taken from them.
    The target is returned as stored (no conversion to end rank).
    """
    event_col = stagedata['eventid']
    train = stagedata[event_col.isin(train_events)]
    test = stagedata[event_col.isin(test_events)]

    train_values = train.to_numpy()
    test_values = test.to_numpy()

    # Features run from column `startid` to the end; column 1 is the target.
    train_x, train_y = train_values[:, startid:], train_values[:, 1]
    test_x, test_y = test_values[:, startid:], test_values[:, 1]

    return train, test, train_x, train_y, test_x, test_y

def split_by_stageid(stagedata, stageid):
    """
    Partition the stage data by stage: rows with stageid <= `stageid` are the
    training set, rows with a larger stageid are the test set.

    Column 1 holds the target and the features start at column 2.

    Returns (train, test, train_x, train_y, test_x, test_y). Unlike the
    event-based splits, train and test are NumPy arrays here, not DataFrames.
    """
    stage_col = stagedata['stageid']
    train = stagedata[stage_col <= stageid].to_numpy()
    test = stagedata[stage_col > stageid].to_numpy()

    return train, test, train[:, 2:], train[:, 1], test[:, 2:], test[:, 1]


def regressor_model(name='svr'):
    """
    Fit the named model on the module-level train_x/train_y and score it on
    the module-level test_x/test_y.

    name: model name understood by get_regressor.

    Returns (score, pred_y): score is the (mae, rmse, r2) tuple produced by
    evaluate, pred_y the raw (unrounded) predictions for test_x.
    """
    print('[*] predict with %s model'%name)

    model = get_regressor(name)
    model.fit(train_x, train_y)
    predictions = model.predict(test_x)

    return evaluate(test_y, predictions), predictions

In [9]:
def do_rerank(dfout, col=4):
    """
    Replace the values in column `col` by their 0-based rank among the rows
    that share the same start lap.

    Predicted end ranks can be arbitrary floats; ranking them per lap turns
    them into a proper ordering 0..k-1.

    dfout: DataFrame with the five columns
           ['carno', 'startlap', 'startrank', 'endrank', 'pred_endrank'].
    col:   positional index of the column to re-rank (default 4).

    Returns a new DataFrame with those five column names. Rows are grouped by
    start lap (in set-iteration order of the laps), so the row order and index
    generally differ from dfout. dfout itself is not modified.
    """
    print('rerank...')
    laps = set(dfout.startlap.values)

    pieces = []
    for lap in laps:
        # copy=True: the block is edited in place below, so it must be a
        # private, writable array that never aliases dfout.
        block = dfout[dfout['startlap']==lap].to_numpy(copy=True)

        # argsort of the argsort gives each row's position in sorted order.
        order = np.argsort(block[:,col], axis=0)
        block[:,col] = np.argsort(order, axis=0)

        pieces.append(block)

    # Stack once at the end instead of re-copying a growing array every lap.
    data = np.vstack(pieces) if pieces else []
    dfret = pd.DataFrame(data, columns =['carno','startlap', 'startrank','endrank','pred_endrank'])

    return dfret

def build_df(testdf, pred_y, dorerank=True):
    """
    Build the standard stint prediction table:
    carno, startlap, startrank, endrank, pred_endrank, diff, sign,
    pred_diff, pred_sign

    testdf:   DataFrame with columns 'car_number', 'start_lap', 'start_rank'
              and 'target' (the rank change). It is not modified.
    pred_y:   predicted rank change, one value per row of testdf.
    dorerank: when True, pred_endrank is re-ranked per start lap by do_rerank
              (which also regroups the rows by lap).

    startlap and startrank are stored minus 1; endrank is the shifted start
    rank plus the target, pred_endrank the shifted start rank plus pred_y.
    sign/pred_sign are -1, 0 or +1 according to diff/pred_diff.
    """
    def _signs(values):
        # -1 / 0 / +1 per element, kept in the dtype of `values`.
        return (values > 0).astype(values.dtype) - (values < 0).astype(values.dtype)

    print('build_df: len testdf=%d, len of pred_y=%d'%(len(testdf), len(pred_y)))

    # Explicit copy: the array is edited in place below, so it must be a
    # private, writable buffer that never aliases testdf.
    test = testdf[['car_number','start_lap','start_rank','target']].to_numpy(copy=True)

    # todo: fix
    # start_lap already been 0-index number
    # current code in Model_compare_ranknet_vs_others-v4-all.ipynb::long_predict_bymloutput
    # follows this 'wrong' index
    test[:,1] = test[:,1]-1

    test[:,2] = test[:,2]-1
    test[:,3] = test[:,2] + test[:,3]
    dfout = pd.DataFrame(test, columns =['carno','startlap', 'startrank','endrank'])

    # add predictions
    dfout['pred_endrank'] = pred_y +  dfout['startrank']

    if dorerank:
        dfout = do_rerank(dfout,col=4)

    dfout['diff'] = dfout['endrank'] - dfout['startrank']
    dfout['sign'] = _signs(dfout['diff'].values)

    # Derived from pred_endrank rather than pred_y so that it reflects the
    # re-ranked value when dorerank is on.
    dfout['pred_diff'] = dfout['pred_endrank'] - dfout['startrank']
    dfout['pred_sign'] = _signs(dfout['pred_diff'].values)
    return dfout
    

def test_cv():
    """
    Leave-one-event-out cross validation over every event id found in the
    module-level `stagedata`.

    For each event, all other events form the training set; every model in
    `regressors` is fitted and scored through regressor_model. The global
    train_x/train_y/test_x/test_y are rebound for each split.

    Writes 'crossvalid_stagedata_regressor_<suffix>.csv' (`suffix` is read
    from module scope) and returns the same table: one row per held-out
    event with run id, set sizes, target-sign distribution and one score per
    model. The score stored is the first element returned by evaluate, i.e.
    MAE. A second table holding r2 is built but not returned or saved.
    """
    global train_x, train_y, test_x, test_y

    cols = ['runid','trainsize','testsize','testdistribution']
    cols.extend(regressors)
    print('cols:%s'%cols)
    retdf0 = pd.DataFrame([],columns=cols)
    retdf1 = pd.DataFrame([],columns=cols)

    events = set(stagedata['eventid'])

    years = ['2013','2014','2015','2016','2017','2018','2019']
    eventsname = [f'Indy500-{x}' for x in years]
    for eventid in events:
        # eventsname only lists the Indy500 years; fall back to the raw id for
        # any other event instead of failing with IndexError.
        if 0 <= eventid < len(eventsname):
            runid = eventsname[eventid]
        else:
            runid = 'event-%s'%eventid
        print('Testset = %s'%runid)

        # split_by_eventid needs explicit train and test id lists: hold out
        # this event and train on all the others.
        other_events = [e for e in events if e != eventid]
        train, test, train_x, train_y, test_x, test_y = split_by_eventid(
            stagedata, other_events, [eventid])
        test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

        #record
        rec0 = [runid,train_x.shape[0],test_x.shape[0],test_distribution]
        rec1 = [runid,train_x.shape[0],test_x.shape[0],test_distribution]

        for clf in regressors:
            acc, _ = regressor_model(clf)
            rec0.append(acc[0])   # mae
            rec1.append(acc[2])   # r2

        #new df
        retdf0 = pd.concat([retdf0, pd.DataFrame([rec0],columns=cols)])
        retdf1 = pd.concat([retdf1, pd.DataFrame([rec1],columns=cols)])

    retdf0.to_csv('crossvalid_stagedata_regressor_%s.csv'%suffix, float_format='%.3f')

    return retdf0
    
def test_20182019(dorerank=True, startid = 2):
    """
    Train every model in `regressors` on `_train_events` and evaluate it on
    each event of `_test_events` separately.

    Reads the module-level stagedata, events, events_id, _train_events and
    _test_events, and rebinds the global train_x/train_y/test_x/test_y for
    each test event.

    dorerank: forwarded to build_df.
    startid:  forwarded to split_by_eventid (first feature column).

    Returns (retdf0, pred_df):
      retdf0  - one row per test event: run id, set sizes, target-sign
                distribution and, per model, the first score returned by
                evaluate (MAE).
      pred_df - pred_df[event][model_name] is the table built by build_df.
    """
    global train_x, train_y, test_x, test_y

    header = ['runid','trainsize','testsize','testdistribution']
    header.extend(regressors)
    print('cols:%s'%header)

    retdf0 = pd.DataFrame([],columns=header)
    pred_df = {}

    for event in events:
        test_eventid = events_id[event]

        # Evaluate only events on the test list that were not used for training.
        if test_eventid in _train_events or test_eventid not in _test_events:
            continue

        print('Testset = %s'% event, test_eventid)

        traindf, testdf, train_x, train_y, test_x, test_y = split_by_eventid(
            stagedata, _train_events, [test_eventid], startid=startid)
        test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

        row = [event,train_x.shape[0],test_x.shape[0],test_distribution]
        scores = []
        for model_name in regressors:
            score, predictions = regressor_model(model_name)
            scores.append(score[0])

            # Keep the per-model prediction table for this event.
            pred_df.setdefault(event, {})[model_name] = build_df(testdf, predictions,dorerank=dorerank)

        row.extend(scores)
        retdf0 = pd.concat([retdf0, pd.DataFrame([row],columns=header)])

    return retdf0, pred_df

In [10]:
import pickle
def save_result(dfs, datafile):
    """
    Pickle `dfs` to the file `datafile`.

    The object is stored wrapped in a one-element list, so a reader has to
    take element 0 of the unpickled value. The highest pickle protocol
    available is used.
    """
    payload = [dfs]
    with open(datafile, 'wb') as handle:
        pickle.dump(payload, handle, pickle.HIGHEST_PROTOCOL)

### test oracle with stint_len

In [11]:
events_info = {
    'Phoenix':(256, 1.022, 250),'Indy500':(500,2.5,200),'Texas':(372,1.5,248), 
    'Iowa':(268,0.894,300),'Pocono':(500,2.5,200),'Gateway':(310,1.25,248)
}

years = ['2013','2014','2015','2016','2017','2018','2019']
events = [f'Indy500-{x}' for x in years]

events.extend(['Phoenix-2018','Texas-2018','Texas-2019','Pocono-2018','Pocono-2019','Iowa-2018','Iowa-2019',
              'Gateway-2018','Gateway-2019'])

events_id={key:idx for idx, key in enumerate(events)}

dataroot = 'test/'

_train_events = [events_id[x] for x in [f'Indy500-{x}' for x in ['2013','2014','2015','2016','2017']]]
_test_events = [events_id[x] for x in [f'Indy500-{x}' for x in ['2018','2019']]]

In [12]:
#load data
_trim = 0
_include_final = True
_include_stintlen = True
#_include_stintlen = False
include_str = '1' if _include_final else '0'
stint_str = '1' if _include_stintlen else ''


_context_len = 60

In [13]:
# Tag embedded in the name of the saved prediction pickle.
version = f'IndyCar-d{len(events)}-debug'

# plen selects which dataset file is loaded (the "-t{plen}" part of its name).
# Other values used previously:
#for plen in [2,4,6,8]:
for plen in [2]:

    # Older dataset name, kept for reference:
    #output_file = f'shortterm-indy500-2013-2019-end{include_str}{stint_str}-t{plen}-c{_context_len}.csv'
    # NOTE: dataroot already ends with '/', so this path contains '//'.
    output_file = f'{dataroot}/shortterm-IndyCar-d{len(events)}-end{include_str}{stint_str}-t{plen}-c{_context_len}.csv'
    # stagedata must be a module-level name: test_20182019 reads it as a global.
    stagedata = pd.read_csv(output_file)
    # Missing values are replaced by 0 before modelling.
    stagedata.fillna(0, inplace=True)
    #stagedata.info()
    # Train on _train_events, test on _test_events, with per-lap re-ranking.
    retdf_rerank, preddf_rerank = test_20182019(dorerank=True)
    # Variant without re-ranking (disabled):
    #retdf_norank, preddf_norank = test_20182019(dorerank=False)

    # Older output names, kept for reference:
    #output_file = f'stage-indy500-2013-2019-end{include_str}{stint_str}-t{_trim}.csv'
    #outfile=f'shortterm-dfout-mlmodels-indy500-tr2013_2017-te2018_2019-end{include_str}-normal-t{_trim}.pickle'
    #save_result(preddf, outfile)
    # Persist the per-event, per-model prediction tables.
    outfile=f'{dataroot}/shortterm-dfout-mlmodels-{version}-end{include_str}-rerank-t{plen}-c{_context_len}.pickle'
    save_result(preddf_rerank, outfile)
    #outfile=f'{dataroot}/shortterm-dfout-mlmodels-{version}-end{include_str}-norerank-t{plen}-c{_context_len}.pickle'
    #save_result(preddf_norank, outfile)


cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2018 5
[*] predict with currank model
rmse=3.25, mae=1.36, r2=-0.00
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with avgrank model
rmse=17.57, mae=15.58, r2=-28.14
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with dice model
rmse=4.37, mae=2.25, r2=-0.81
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with lasso model


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  positive)


rmse=3.10, mae=1.61, r2=0.09
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with ridge model
rmse=3.10, mae=1.61, r2=0.09
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with rf model
rmse=3.43, mae=1.94, r2=-0.11
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with svr model
rmse=3.25, mae=1.39, r2=0.00
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with xgb model
rmse=3.62, mae=1.94, r2=-0.23
build_df: len testdf=5340, len of pred_y=5340
rerank...
Testset = Indy500-2019 6
[*] predict with currank model
rmse=3.07, mae=1.18, r2=-0.00
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with avgrank model
rmse=18.16, mae=16.11, r2=-33.95
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with dice model
rmse=4.15, mae=2.08, r2=-0.82
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with lasso model


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  positive)


rmse=2.93, mae=1.45, r2=0.09
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with ridge model
rmse=2.92, mae=1.43, r2=0.10
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with rf model
rmse=2.97, mae=1.60, r2=0.07
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with svr model
rmse=3.07, mae=1.23, r2=0.00
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with xgb model
rmse=3.04, mae=1.51, r2=0.02
build_df: len testdf=5629, len of pred_y=5629
rerank...


### test

In [2]:
def loaddata(datafile):
    """
    Unpickle and return the object stored in `datafile`.

    The pickle protocol is detected automatically; encoding='latin1' is
    passed to pickle.load. Unpickling can execute arbitrary code, so only
    load files you produced yourself.
    """
    with open(datafile, 'rb') as handle:
        loaded = pickle.load(handle, encoding='latin1')
    return loaded

In [10]:
# Reload previously saved predictions for inspection.
# The switches below repeat the ones defined earlier in the notebook.
_trim = 0
_include_final = True
_include_stintlen = True
#_include_stintlen = False
include_str = '1' if _include_final else '0'
stint_str = '1' if _include_stintlen else ''
# suffix is the module-level name read by test_cv when naming its CSV output.
suffix = f'indy500-2013-2019-end{include_str}{stint_str}-t{_trim}'
plen=2
# NOTE(review): this file name differs from the one written by the training
# cell ('{dataroot}/shortterm-dfout-mlmodels-{version}-...-c{_context_len}.pickle');
# it presumably refers to a pickle from an earlier run -- confirm.
outfile=f'shortterm-dfout-mlmodels-indy500-tr2013_2017-te2018_2019-end{include_str}-rerank-t{plen}.pickle'
# save_result stores the object inside a one-element list, hence the [0].
preddf_rerank = loaddata(outfile)[0]

In [15]:
# Reload the stage dataset for inspection; rebinding stagedata here also
# changes the data the global-reading test functions would use.
# NOTE(review): the name ends in '-t{plen}-.csv' (dash before the extension),
# unlike the '-c{_context_len}.csv' pattern used by the training cell -- confirm.
output_file = f'shortterm-indy500-2013-2019-end{include_str}{stint_str}-t{plen}-.csv'
stagedata = pd.read_csv(output_file)
# Missing values are replaced by 0.
stagedata.fillna(0, inplace=True)


In [16]:
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38063 entries, 0 to 38062
Data columns (total 37 columns):
Unnamed: 0                   38063 non-null int64
target                       38063 non-null int64
start_lap                    38063 non-null int64
stint_len                    38063 non-null int64
eventid                      38063 non-null int64
car_number                   38063 non-null int64
stageid                      38063 non-null int64
firststage                   38063 non-null int64
pit_in_caution               38063 non-null int64
start_position               38063 non-null int64
start_rank                   38063 non-null int64
start_rank_ratio             38063 non-null float64
top_pack                     38063 non-null int64
bottom_pack                  38063 non-null int64
average_rank                 38063 non-null float64
average_rank_all             38063 non-null float64
change_in_rank               38063 non-null int64
change_in_rank_all           38063 

In [11]:
# xgb predictions for the 2018 test event.
# NOTE(review): keyed by '2018' here, while test_20182019 keys its result by
# full event name (e.g. 'Indy500-2018') -- this presumably matches the older
# pickle loaded above; confirm.
df = preddf_rerank['2018']['xgb']

In [12]:
df[df['carno']==12]

Unnamed: 0,carno,startlap,startrank,endrank,pred_endrank,diff,sign,pred_diff,pred_sign
7,12.0,9.0,2.0,2.0,1.0,0.0,0.0,-1.0,-1.0
40,12.0,10.0,2.0,2.0,1.0,0.0,0.0,-1.0,-1.0
73,12.0,11.0,2.0,2.0,0.0,0.0,0.0,-2.0,-1.0
106,12.0,12.0,2.0,2.0,0.0,0.0,0.0,-2.0,-1.0
139,12.0,13.0,2.0,2.0,0.0,0.0,0.0,-2.0,-1.0
...,...,...,...,...,...,...,...,...,...
5236,12.0,192.0,2.0,0.0,1.0,-2.0,-1.0,-1.0,-1.0
5259,12.0,193.0,2.0,0.0,0.0,-2.0,-1.0,-2.0,-1.0
5282,12.0,194.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5305,12.0,195.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# svr predictions for the 2018 test event.
# NOTE(review): keyed by '2018' here, while test_20182019 keys its result by
# full event name (e.g. 'Indy500-2018') -- confirm which pickle is loaded.
df = preddf_rerank['2018']['svr']

In [18]:
dfx = df[df['carno']==12]

In [19]:
dfx[dfx['startlap']>190]

Unnamed: 0,carno,startlap,startrank,endrank,pred_endrank,diff,sign,pred_diff,pred_sign
5213,12.0,191.0,3.0,2.0,3.0,-1.0,-1.0,0.0,0.0
5236,12.0,192.0,2.0,0.0,2.0,-2.0,-1.0,0.0,0.0
5259,12.0,193.0,2.0,0.0,2.0,-2.0,-1.0,0.0,0.0
5282,12.0,194.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5305,12.0,195.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5326,12.0,196.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
### test blackhorse car=27
# rf predictions for car 27 in the 2018 event.
# NOTE(review): preddf_oracle is not defined anywhere in the visible notebook
# (the recorded run of this cell ends in NameError) -- it presumably comes
# from a separate "oracle" experiment; load or define it first.
df2018 = preddf_oracle['2018']['rf']
car27 = df2018[df2018['carno']==27]

NameError: name 'preddf_oracle' is not defined

In [None]:
car27

In [None]:
# Ground truth goes first: evaluate(test_y, pred_y) hands its arguments to
# the sklearn metrics as (y_true, y_pred), and r2 is not symmetric in them.
evaluate(car27['diff'].values, car27['pred_diff'].values)

In [None]:
### xgb predictions for car 12 (the variable keeps the name car27 from the
### earlier cell, but the filter below selects carno 12)
# NOTE(review): preddf_oracle is not defined in the visible notebook -- load
# or define it before running this cell.
df2018 = preddf_oracle['2018']['xgb']
car27 = df2018[df2018['carno']==12]
car27

In [None]:
# Ground truth goes first: evaluate(test_y, pred_y) hands its arguments to
# the sklearn metrics as (y_true, y_pred), and r2 is not symmetric in them.
evaluate(car27['diff'].values, car27['pred_diff'].values)

In [None]:
# Score the model over every row of df2018 (no car filter).
car27 = df2018
# Ground truth first: evaluate expects (test_y, pred_y); r2 depends on the order.
evaluate(car27['diff'].values, car27['pred_diff'].values)

In [None]:
len(df2018)

In [None]:
retdf_oracle

In [None]:
# Score only a selected group of cars (the list holds nine car numbers).
top10 = [12, 20, 9, 27, 28, 22, 29, 1, 6]
car27 = df2018[df2018['carno'].isin(top10)]
# Ground truth first: evaluate expects (test_y, pred_y); r2 depends on the order.
evaluate(car27['diff'].values, car27['pred_diff'].values)

In [None]:
# Score only a smaller group of five cars.
top10 = [12, 20, 9, 27, 28]
car27 = df2018[df2018['carno'].isin(top10)]
# Ground truth first: evaluate expects (test_y, pred_y); r2 depends on the order.
evaluate(car27['diff'].values, car27['pred_diff'].values)

In [None]:
# Same car selection (top10 from the previous cell), scored on another set
# of predictions.
# NOTE(review): preddf is not defined in the visible notebook -- confirm
# where it is meant to come from.
df2018 = preddf['2018']['xgb']
car27 = df2018[df2018['carno'].isin(top10)]
# Ground truth first: evaluate expects (test_y, pred_y); r2 depends on the order.
evaluate(car27['diff'].values, car27['pred_diff'].values)

In [12]:
preddf_rerank['Indy500-2018']

{'currank':       carno  startlap  startrank  endrank  pred_endrank  diff  sign  \
 0         1         9          3        3             3     0     0   
 1         3         9          7        7             7     0     0   
 2         4         9         12       12            12     0     0   
 3         6         9         15       15            15     0     0   
 4         7         9         28       28            28     0     0   
 ...     ...       ...        ...      ...           ...   ...   ...   
 5335     60       196         15       15            15     0     0   
 5336     64       196         16       16            16     0     0   
 5337     66       196         10       10            10     0     0   
 5338     88       196         13       13            13     0     0   
 5339     98       196         11       11            11     0     0   
 
       pred_diff  pred_sign  
 0             0          0  
 1             0          0  
 2             0          0  
 3 