### stage_model_regressor

Predicting target: end_rank = start_rank + change (change was the old target)

base: 14./stage_model_regressor_withneighbor-newfeatures

Prediction models for chg_of_rank_in_stage on the stage dataset.

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math


In [2]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.linear_model.ridge import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.svm.classes import SVR
from sklearn.utils import shuffle
from sklearn import metrics
import xgboost as xgb



In [3]:
# build regression model
# Model names understood by get_regressor(); the test_* drivers also use them
# as the per-model column names of their result tables.
regressors = ['currank','avgrank','dice','lasso','ridge','rf','svr','xgb']
# Current train/test split, shared through module globals: the test_* drivers
# assign these (via `global`) and regressor_model() reads them.
train_x, train_y, test_x, test_y = None, None, None, None

def get_regressor(regressor = 'lr'):
    """
    Create an unfitted model for the given regressor name.

    Recognised names are 'lasso', 'ridge', 'rf', 'svr', 'xgb', 'dice',
    'currank' and 'avgrank'. Any other name, including the default 'lr',
    yields None.
    """
    # each constructor is wrapped in a lambda so that only the requested
    # model is actually instantiated
    builders = {
        'lasso': lambda: LassoCV(cv=5, random_state=0),
        'ridge': lambda: RidgeCV(alphas=np.logspace(-6, 6, 13)),
        'rf': lambda: RandomForestRegressor(n_estimators=100),
        'svr': lambda: SVR(kernel='rbf'),
        'xgb': lambda: xgb.XGBRegressor(objective="reg:linear", random_state=42, max_depth=3),
        'dice': lambda: RandomDice('1234'),
        'currank': lambda: CurRank(),
        'avgrank': lambda: AverageRank(),
    }

    build = builders.get(regressor)
    if build is None:
        return None
    return build()


class CurRank():
    """
    Baseline model: predicts 0 for every sample.

    Exposes the same fit/predict interface as the other regressors.
    """
    def __init__(self):
        pass

    def fit(self, x, y):
        # nothing to learn
        pass

    def predict(self, test_x):
        # one zero per row of test_x
        n_rows = test_x.shape[0]
        return np.array([0] * n_rows)
    
class AverageRank():
    """
    Baseline model: returns one column of the feature matrix as the prediction.

    Parameters
    ----------
    feature_idx : int, default 13
        Position, within each row passed to predict(), of the feature that is
        returned unchanged. The original code hard-coded 13 and labelled it
        'change_in_rank_all'.

    NOTE(review): the split_by_* helpers build the feature matrix by dropping
    the first two CSV columns, so the correct index depends on the column
    layout of the loaded dataset. Verify that 13 really points at
    change_in_rank_all for the file in use, or pass feature_idx explicitly.
    """
    def __init__(self, feature_idx=13):
        self.feature_idx = feature_idx

    def fit(self, x, y):
        # nothing to learn
        pass

    def predict(self, test_x):
        # copy the selected feature of every row
        return np.array([row[self.feature_idx] for row in test_x])

class RandomDice():
    """
    Random baseline: draws predictions from the empirical distribution of the
    training targets.

    fit() stores each distinct target value together with the cumulative
    frequency up to that value; predict() draws a uniform number per sample
    and returns the first value whose cumulative frequency covers it.

    Parameters
    ----------
    seed : seed for the private random generator, default '1234'
    """
    def __init__(self, seed='1234'):
        self.dist = []
        self.val = []
        # private generator: seeding here must not disturb the global
        # `random` module, and other users of it must not disturb our draws
        self._rng = random.Random(seed)

    def fit(self, x, y):
        # start from scratch, otherwise a second fit() would append a second
        # cumulative distribution behind the first one
        self.val = []
        self.dist = []

        total = y.shape[0]
        ratio = 0.
        for val in set(y):
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)

    def predict(self, test_x):
        pred_y = []
        for _ in test_x:
            dice = self._rng.random()
            # search in self.dist; -1 falls back to the last value, which
            # covers rounding leaving the final cumulative ratio below 1
            find_idx = -1
            for idx, ratio in enumerate(self.dist):
                if dice <= ratio:
                    find_idx = idx
                    break

            pred_y.append(self.val[find_idx])

        return np.array(pred_y)

def evaluate(test_y, pred_y):
    """
    Score predictions against the ground truth.

    Prints rmse, mae and r2 (two decimals) and returns them as the tuple
    (mae, rmse, r2) -- note that the return order differs from the print order.
    """
    r2 = metrics.r2_score(test_y, pred_y)
    mse = metrics.mean_squared_error(test_y, pred_y)
    rmse = math.sqrt(mse)
    mae = metrics.mean_absolute_error(test_y, pred_y)

    print('rmse=%.2f, mae=%.2f, r2=%.2f'%(rmse, mae, r2))
    return mae, rmse, r2
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    Hold out one event as the test set; every other event is training data.

    Returns (train, test, train_x, train_y, test_x, test_y) where train/test
    are the filtered DataFrames, the *_x arrays hold column 2 onwards and the
    *_y arrays hold column 1 of the corresponding rows.
    """
    is_test = stagedata['eventid'] == eventid

    train = stagedata[~is_test]
    test = stagedata[is_test]

    train_arr = train.to_numpy()
    test_arr = test.to_numpy()

    # column 1 is the target, columns 2.. are used as features
    train_x, train_y = train_arr[:, 2:], train_arr[:, 1]
    test_x, test_y = test_arr[:, 2:], test_arr[:, 1]

    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    Split on the stage number: rows with stageid <= the given value are
    training data, rows with a larger stageid are test data.

    Returns (train, test, train_x, train_y, test_x, test_y); here train and
    test are numpy arrays, the *_x arrays hold column 2 onwards and the *_y
    arrays hold column 1.
    """
    stage_col = stagedata['stageid']

    train = stagedata[stage_col <= stageid].to_numpy()
    test = stagedata[stage_col > stageid].to_numpy()

    # column 1 is the target, columns 2.. are used as features
    train_x, train_y = train[:, 2:], train[:, 1]
    test_x, test_y = test[:, 2:], test[:, 1]

    return train, test, train_x, train_y, test_x, test_y


def regressor_model(name='svr'):
    """
    Fit the named model on the global train_x/train_y and score it on the
    global test_x/test_y.

    Returns (score, pred_y) where score is the tuple produced by evaluate()
    and pred_y are the raw (possibly non-integer) predictions.
    """
    print('[*] predict with %s model'%name)

    model = get_regressor(name)
    model.fit(train_x, train_y)
    predictions = model.predict(test_x)

    return evaluate(test_y, predictions), predictions

In [53]:
def do_rerank(dfout, col=4):
    """
    Replace one column by its 0-based rank within each start lap.

    Predicted ranks can be arbitrary floats; this turns them into a proper
    ordering 0..k-1 among the rows that share the same 'startlap'.

    Parameters
    ----------
    dfout : DataFrame with a 'startlap' column, e.g.
            ['carno','startlap','startrank','endrank','pred_endrank']
    col   : positional index of the column to re-rank (default 4)

    Returns
    -------
    A new DataFrame with the same column names as dfout. Rows are grouped by
    ascending startlap and the index is reset. Values pass through a numpy
    array, so columns of mixed dtype come back in their common dtype.
    Ties are ranked in row order.
    """
    print('rerank...')
    # keep the caller's column names instead of assuming a fixed 5-column layout
    columns = list(dfout.columns)

    ranked = []
    # sorted(): iterate laps in a deterministic order
    for lap in sorted(set(dfout['startlap'].values)):
        # explicit copy: the array is modified in place below
        block = dfout[dfout['startlap']==lap].to_numpy(copy=True)

        # order[i] is the row holding the i-th smallest value, so writing
        # 0..k-1 at those positions yields each row's rank
        order = np.argsort(block[:,col], kind='stable')
        block[order, col] = np.arange(len(order))

        ranked.append(block)

    if not ranked:
        return pd.DataFrame([], columns=columns)

    # single vstack at the end instead of re-stacking on every lap
    return pd.DataFrame(np.vstack(ranked), columns=columns)

def _sign_of(values):
    """Return an array of the same dtype holding 1 / -1 / 0 for positive /
    negative / all other entries of `values`."""
    signs = np.zeros(len(values), dtype=values.dtype)
    signs[values > 0] = 1
    signs[values < 0] = -1
    return signs


def build_df(testdf, pred_y, dorerank=True):
    """
    build a standard stint prediction result:
    carno	startlap	startrank	endrank	pred_endrank	diff	sign	pred_diff	pred_sign

    Parameters
    ----------
    testdf   : DataFrame with columns 'car_number', 'start_lap', 'start_rank'
               and 'target'
    pred_y   : predictions, one per row of testdf; added to startrank to get
               pred_endrank
    dorerank : when True, pred_endrank is converted to a per-lap ranking with
               do_rerank() (which also regroups the rows by lap)

    Returns
    -------
    DataFrame with the columns listed above; start_lap and start_rank are
    shifted by -1 and endrank = startrank + target.
    """

    print('build_df: len testdf=%d, len of pred_y=%d'%(len(testdf), len(pred_y)))

    # explicit copy: the array is edited in place below, and the array handed
    # out by pandas may be a read-only view of the frame
    test = testdf[['car_number','start_lap','start_rank','target']].to_numpy(copy=True)
    test[:,1] = test[:,1]-1
    test[:,2] = test[:,2]-1
    test[:,3] = test[:,2] + test[:,3]
    dfout = pd.DataFrame(test, columns =['carno','startlap', 'startrank','endrank'])

    # add predictions
    dfout['pred_endrank'] = pred_y +  dfout['startrank']

    if dorerank:
        dfout = do_rerank(dfout,col=4)

    # actual rank change and its direction
    dfout['diff'] = dfout['endrank'] - dfout['startrank']
    dfout['sign'] = _sign_of(dfout['diff'].values)

    # predicted rank change, derived from pred_endrank so that it reflects
    # the re-ranking when it was applied
    dfout['pred_diff'] = dfout['pred_endrank'] - dfout['startrank']
    dfout['pred_sign'] = _sign_of(dfout['pred_diff'].values)
    return dfout
    

def test_cv():
    """
    Leave-one-event-out evaluation of every model listed in `regressors`.

    For each distinct eventid in the global `stagedata`, that event is held
    out as the test set, every model is trained on the remaining events and
    scored on it. One row per held-out event is collected with the columns
    runid, trainsize, testsize, testdistribution and one column per model.

    The per-model value is the first element of the score returned by
    evaluate(), i.e. the MAE (evaluate() returns (mae, rmse, r2)).

    The table is written to 'crossvalid_stagedata_regressor_<suffix>.csv' and
    returned.
    """
    # regressor_model() reads the current split from these module globals
    global train_x, train_y, test_x, test_y

    cols = ['runid','trainsize','testsize','testdistribution']
    cols.extend(regressors)
    print('cols:%s'%cols)
    retdf0 = pd.DataFrame([],columns=cols)

    years = ['2013','2014','2015','2016','2017','2018','2019']
    # eventid is used as a position into this list
    eventsname = [f'Indy500-{x}' for x in years]

    # sorted(): visit the events in a deterministic order
    for eventid in sorted(set(stagedata['eventid'])):
        print('Testset = %s'%eventsname[eventid])

        _, _, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
        test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

        #record
        rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
        for name in regressors:
            score, _ = regressor_model(name)
            # score is (mae, rmse, r2); the table keeps the MAE
            rec0.append(score[0])

        retdf0 = pd.concat([retdf0, pd.DataFrame([rec0],columns=cols)])

    retdf0.to_csv('crossvalid_stagedata_regressor_%s.csv'%suffix, float_format='%.3f')

    return retdf0
    
def _run_holdout_event(test_name, ignore_name, events_id, eventsname, dorerank):
    """
    Evaluate every model with event `test_name` as test set, training on all
    other events except `ignore_name`.

    Returns (rec, preds): rec is the result-table row
    [runid, trainsize, testsize, testdistribution, <MAE per model>] and preds
    maps each model name to the DataFrame produced by build_df().
    """
    # regressor_model() reads the current split from these module globals
    global train_x, train_y, test_x, test_y

    eventid = events_id[test_name]
    ignore_eventid = events_id[ignore_name]
    data = stagedata[stagedata['eventid']!=ignore_eventid]

    print('Testset = %s'%eventsname[eventid])

    _, testdf, train_x, train_y, test_x, test_y = split_by_eventid(data, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

    rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    preds = {}
    for name in regressors:
        score, pred_y = regressor_model(name)
        # score is (mae, rmse, r2); the table keeps the MAE
        rec.append(score[0])
        preds[name] = build_df(testdf, pred_y, dorerank=dorerank)

    return rec, preds


def test_20182019(dorerank=True):
    """
    Train on the 2013-2017 events and test on 2018 and on 2019 separately.

    For each of the two test years the other test year is excluded from the
    data, so both runs train on the same events. Every model in `regressors`
    is fitted and scored.

    Parameters
    ----------
    dorerank : forwarded to build_df(); when True the predicted end ranks are
               re-ranked per lap.

    Returns
    -------
    (retdf0, pred_df): retdf0 has one row per test year with the columns
    runid, trainsize, testsize, testdistribution and the MAE of each model;
    it is also written to 'stint_regressor_result_<suffix>.csv'.
    pred_df maps '2018'/'2019' to {model name: prediction DataFrame}.
    """
    cols = ['runid','trainsize','testsize','testdistribution']
    cols.extend(regressors)
    print('cols:%s'%cols)
    retdf0 = pd.DataFrame([],columns=cols)

    years = ['2013','2014','2015','2016','2017','2018','2019']
    eventsname = [f'Indy500-{x}' for x in years]
    events_id={key:idx for idx, key in enumerate(eventsname)}

    pred_df = {}
    # (test year, year left out of the training data)
    for test_year, ignore_year in (('2018', '2019'), ('2019', '2018')):
        rec, pred_df[test_year] = _run_holdout_event(
            f'Indy500-{test_year}', f'Indy500-{ignore_year}',
            events_id, eventsname, dorerank)
        retdf0 = pd.concat([retdf0, pd.DataFrame([rec],columns=cols)])

    retdf0.to_csv(f'stint_regressor_result_{suffix}.csv', float_format='%.3f')

    return retdf0, pred_df

### test oracle with stint_len

In [5]:
#load data
# The switches below only select, through the file name, which pre-built
# dataset CSV is read.
_trim = 0
_include_final = True
_include_stintlen = True
#_include_stintlen = False
include_str = '1' if _include_final else '0'
stint_str = '1' if _include_stintlen else ''
# suffix ends up in the names of the result CSVs written by test_cv() and
# test_20182019()
suffix = f'indy500-2013-2019-end{include_str}{stint_str}-t{_trim}'
# NOTE: despite its name, output_file is the *input* CSV read just below
output_file = f'shortterm-indy500-2013-2019-end{include_str}{stint_str}-t{_trim}-v5430.csv'


stagedata = pd.read_csv(output_file)
# missing values are replaced by 0 in place
stagedata.fillna(0, inplace=True)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38063 entries, 0 to 38062
Data columns (total 37 columns):
Unnamed: 0                   38063 non-null int64
target                       38063 non-null int64
start_lap                    38063 non-null int64
stint_len                    38063 non-null int64
eventid                      38063 non-null int64
car_number                   38063 non-null int64
stageid                      38063 non-null int64
firststage                   38063 non-null int64
pit_in_caution               38063 non-null int64
start_position               38063 non-null int64
start_rank                   38063 non-null int64
start_rank_ratio             38063 non-null float64
top_pack                     38063 non-null int64
bottom_pack                  38063 non-null int64
average_rank                 38063 non-null float64
average_rank_all             38063 non-null float64
change_in_rank               38063 non-null int64
change_in_rank_all           38063 

In [6]:
#df_event_oracle = test_cv()

In [7]:
#df_event_oracle

In [54]:
retdf_rerank, preddf_rerank = test_20182019(dorerank=True)

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
rmse=3.25, mae=1.36, r2=-0.00
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with avgrank model
rmse=17.57, mae=15.58, r2=-28.14
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with dice model
rmse=4.37, mae=2.25, r2=-0.81
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with lasso model


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  positive)


rmse=3.10, mae=1.61, r2=0.09
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with ridge model
rmse=3.10, mae=1.61, r2=0.09
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with rf model
rmse=3.44, mae=1.96, r2=-0.12
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with svr model
rmse=3.25, mae=1.39, r2=0.00
build_df: len testdf=5340, len of pred_y=5340
rerank...
[*] predict with xgb model
rmse=3.62, mae=1.94, r2=-0.23
build_df: len testdf=5340, len of pred_y=5340
rerank...
Testset = Indy500-2019
[*] predict with currank model
rmse=3.07, mae=1.18, r2=-0.00
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with avgrank model
rmse=18.16, mae=16.11, r2=-33.95
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with dice model
rmse=4.15, mae=2.08, r2=-0.82
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with lasso model


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  positive)


rmse=2.93, mae=1.45, r2=0.09
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with ridge model
rmse=2.92, mae=1.43, r2=0.10
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with rf model
rmse=2.95, mae=1.60, r2=0.08
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with svr model
rmse=3.07, mae=1.23, r2=0.00
build_df: len testdf=5629, len of pred_y=5629
rerank...
[*] predict with xgb model
rmse=3.04, mae=1.51, r2=0.02
build_df: len testdf=5629, len of pred_y=5629
rerank...


In [12]:
retdf_norank, preddf_norank = test_20182019(dorerank=False)

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
rmse=3.25, mae=1.36, r2=-0.00
build_df: len testdf=5340, len of pred_y=5340
[*] predict with avgrank model
rmse=17.57, mae=15.58, r2=-28.14
build_df: len testdf=5340, len of pred_y=5340
[*] predict with dice model
rmse=4.37, mae=2.25, r2=-0.81
build_df: len testdf=5340, len of pred_y=5340
[*] predict with lasso model


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  positive)


rmse=3.10, mae=1.61, r2=0.09
build_df: len testdf=5340, len of pred_y=5340
[*] predict with ridge model
rmse=3.10, mae=1.61, r2=0.09
build_df: len testdf=5340, len of pred_y=5340
[*] predict with rf model
rmse=3.43, mae=1.97, r2=-0.11
build_df: len testdf=5340, len of pred_y=5340
[*] predict with svr model
rmse=3.25, mae=1.39, r2=0.00
build_df: len testdf=5340, len of pred_y=5340
[*] predict with xgb model
rmse=3.62, mae=1.94, r2=-0.23
build_df: len testdf=5340, len of pred_y=5340
Testset = Indy500-2019
[*] predict with currank model
rmse=3.07, mae=1.18, r2=-0.00
build_df: len testdf=5629, len of pred_y=5629
[*] predict with avgrank model
rmse=18.16, mae=16.11, r2=-33.95
build_df: len testdf=5629, len of pred_y=5629
[*] predict with dice model
rmse=4.15, mae=2.08, r2=-0.82
build_df: len testdf=5629, len of pred_y=5629
[*] predict with lasso model


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  positive)


rmse=2.93, mae=1.45, r2=0.09
build_df: len testdf=5629, len of pred_y=5629
[*] predict with ridge model
rmse=2.92, mae=1.43, r2=0.10
build_df: len testdf=5629, len of pred_y=5629
[*] predict with rf model
rmse=2.96, mae=1.62, r2=0.07
build_df: len testdf=5629, len of pred_y=5629
[*] predict with svr model
rmse=3.07, mae=1.23, r2=0.00
build_df: len testdf=5629, len of pred_y=5629
[*] predict with xgb model
rmse=3.04, mae=1.51, r2=0.02
build_df: len testdf=5629, len of pred_y=5629


In [13]:
import pickle
def save_result(dfs, datafile):
    """
    Pickle `dfs` into the file `datafile`.

    The object is stored wrapped in a one-element list, so a reader gets
    [dfs] back. The highest available pickle protocol is used.
    """
    payload = [dfs]
    with open(datafile, 'wb') as out:
        pickle.dump(payload, out, protocol=pickle.HIGHEST_PROTOCOL)

In [59]:
# Save the re-ranked and the non-re-ranked prediction sets to separate pickles.
_trim = 0
_include_final = True
_include_stintlen = False
include_str = '1' if _include_final else '0'
# NOTE: stint_str is computed but not used in the active file names below
stint_str = '1' if _include_stintlen else ''
#output_file = f'stage-indy500-2013-2019-end{include_str}{stint_str}-t{_trim}.csv'
#outfile=f'shortterm-dfout-mlmodels-indy500-tr2013_2017-te2018_2019-end{include_str}-normal-t{_trim}.pickle'
#save_result(preddf, outfile)
# predictions built with dorerank=True
outfile=f'shortterm-dfout-mlmodels-indy500-tr2013_2017-te2018_2019-end{include_str}-rerank-t{_trim}.pickle'
save_result(preddf_rerank, outfile)
# predictions built with dorerank=False
outfile=f'shortterm-dfout-mlmodels-indy500-tr2013_2017-te2018_2019-end{include_str}-norerank-t{_trim}.pickle'
save_result(preddf_norank, outfile)

In [60]:
df = preddf_rerank['2018']['xgb']

In [61]:
df.pred_endrank

0        3.0
1        7.0
2       12.0
3       15.0
4       28.0
        ... 
5335    14.0
5336    15.0
5337    13.0
5338    11.0
5339     9.0
Name: pred_endrank, Length: 5340, dtype: float64

In [18]:
### test blackhorse car=27
df2018 = preddf_oracle['2018']['rf']
car27 = df2018[df2018['carno']==27]

In [19]:
car27

Unnamed: 0,carno,startlap,startrank,endrank,diff,sign,pred_endrank,pred_diff,pred_sign
3484,27,9,25,25,0,0,25,0,0
3485,27,10,25,25,0,0,25,0,0
3486,27,11,25,25,0,0,25,0,0
3487,27,12,25,25,0,0,25,0,0
3488,27,13,25,25,0,0,25,0,0
...,...,...,...,...,...,...,...,...,...
3667,27,192,6,4,-2,-1,8,2,1
3668,27,193,6,3,-3,-1,8,2,1
3669,27,194,4,3,-1,-1,5,1,1
3670,27,195,3,3,0,0,5,2,1


In [20]:
# NOTE(review): evaluate() is declared as evaluate(test_y, pred_y) but is
# called here with the prediction first. MAE and RMSE are symmetric in their
# arguments, r2 is not -- confirm the intended order before trusting r2.
evaluate(car27['pred_diff'].values,car27['diff'].values)

rmse=3.29, mae=1.72, r2=-0.44


(1.7180851063829787, 3.291672277578484, -0.4443157946346745)

In [31]:
### inspect the xgb predictions of a single car
# NOTE: the filter below selects car 12; the variable is still called car27
df2018 = preddf_oracle['2018']['xgb']
car27 = df2018[df2018['carno']==12]
car27

Unnamed: 0,carno,startlap,startrank,endrank,diff,sign,pred_endrank,pred_diff,pred_sign
7,12,9,2,2,0,0,-0.598732,-2.598732,-1.0
40,12,10,2,2,0,0,1.198137,-0.801863,-1.0
73,12,11,2,2,0,0,0.704956,-1.295044,-1.0
106,12,12,2,2,0,0,1.415470,-0.584530,-1.0
139,12,13,2,2,0,0,1.511625,-0.488375,-1.0
...,...,...,...,...,...,...,...,...,...
5236,12,192,2,0,-2,-1,0.841341,-1.158659,-1.0
5259,12,193,2,0,-2,-1,2.206739,0.206739,1.0
5282,12,194,0,0,0,0,-0.014519,-0.014519,-1.0
5305,12,195,0,0,0,0,8.409620,8.409620,1.0


In [22]:
evaluate(car27['pred_diff'].values,car27['diff'].values)

rmse=4.31, mae=2.28, r2=0.04


(2.276595744680851, 4.314744636313456, 0.036400060920603994)

In [30]:
car27 = df2018
evaluate(car27['pred_diff'].values,car27['diff'].values)

NameError: name 'df2018' is not defined

In [24]:
len(df2018)

5340

In [25]:
retdf_oracle

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Indy500-2018,27094,5340,"+:606,0:3040,-:1694",1.364232,15.139513,2.251498,1.409551,1.422472,1.751873,1.364232,1.762921
0,Indy500-2019,27094,5629,"+:574,0:3583,-:1472",1.184758,15.646829,2.077456,1.214781,1.221531,1.402558,1.184758,1.318174


In [37]:
top10 = [12, 20, 9, 27, 28, 22, 29, 1, 6]
car27 = df2018[df2018['carno'].isin(top10)]
evaluate(car27['pred_diff'].values,car27['diff'].values)

rmse=4.79, mae=2.65, r2=-1.01


(2.6498176575446806, 4.792228578780256, -1.005963099525081)

In [27]:
top10 = [12, 20, 9, 27, 28]
car27 = df2018[df2018['carno'].isin(top10)]
evaluate(car27['pred_diff'].values,car27['diff'].values)

rmse=4.25, mae=2.26, r2=-0.04


(2.25531914893617, 4.2453980090586505, -0.04400240824593493)

In [28]:
df2018 = preddf['2018']['xgb']
car27 = df2018[df2018['carno'].isin(top10)]
evaluate(car27['pred_diff'].values,car27['diff'].values)

rmse=4.62, mae=3.48, r2=-0.53


(3.48, 4.6173585522460785, -0.525646897183421)

### test rerank

In [55]:
def rerank0(dfout):
    """
    Replace 'endrank' by its 0-based rank within each start lap
    (DataFrame-based variant).

    Parameters
    ----------
    dfout : DataFrame with at least the columns 'startlap' and 'endrank';
            'endrank' may hold arbitrary floats.

    Returns
    -------
    A new DataFrame with the rows grouped by ascending startlap, each row
    keeping its original index label. The input is not modified. Ties are
    ranked in row order. An empty input gives an empty copy.
    """
    laps = sorted(set(dfout.startlap.values))
    if not laps:
        # pd.concat() refuses an empty list
        return dfout.copy()

    parts = []
    for lap in laps:
        # work on a copy so the assignment below never targets a slice of dfout
        part = dfout[dfout['startlap']==lap].copy()

        # argsort of the argsort gives the rank of every row
        order = np.argsort(part['endrank'].to_numpy(), kind='stable')
        part['endrank'] = np.argsort(order, kind='stable')

        parts.append(part)

    return pd.concat(parts)


def rerank(dfout):
    """
    Replace the fourth column (endrank) by its 0-based rank within each start
    lap (numpy-based variant).

    Parameters
    ----------
    dfout : DataFrame with exactly four columns in the order
            ['carno','startlap','startrank','endrank']; endrank may hold
            arbitrary floats.

    Returns
    -------
    A new DataFrame with those four column names, rows grouped by ascending
    startlap and the index reset. Values pass through a numpy array, so
    columns of mixed dtype come back in their common dtype. Ties are ranked
    in row order.
    """
    columns = ['carno','startlap', 'startrank','endrank']

    ranked = []
    # sorted(): iterate laps in a deterministic order
    for lap in sorted(set(dfout.startlap.values)):
        # explicit copy: the array is modified in place below
        block = dfout[dfout['startlap']==lap].to_numpy(copy=True)

        # order[i] is the row holding the i-th smallest endrank, so writing
        # 0..k-1 at those positions yields each row's rank
        order = np.argsort(block[:,3], kind='stable')
        block[order, 3] = np.arange(len(order))

        ranked.append(block)

    if not ranked:
        return pd.DataFrame([], columns=columns)

    # single vstack at the end instead of re-stacking on every lap
    return pd.DataFrame(np.vstack(ranked), columns=columns)

In [38]:
dfin = car27.copy()
dftest = car27[car27['startlap']==31][['carno','startlap', 'startrank','endrank']]

In [39]:
dfout = rerank(dftest)

In [40]:
dfout

Unnamed: 0,carno,startlap,startrank,endrank
0,1,31,12,1
1,6,31,24,6
2,9,31,19,4
3,12,31,17,3
4,20,31,9,0
5,22,31,13,2
6,27,31,7,8
7,28,31,23,5
8,29,31,3,7


In [48]:
data = dfin[['carno','startlap', 'startrank','endrank']]
dfout = data
dfout = do_rerank(dfout)

rerank...


In [49]:
dfout[dfout['startlap']==31]

Unnamed: 0,carno,startlap,startrank,endrank
198,1,31,12,1
199,6,31,24,6
200,9,31,19,4
201,12,31,17,3
202,20,31,9,0
203,22,31,13,2
204,27,31,7,8
205,28,31,23,5
206,29,31,3,7


In [67]:
car27

Unnamed: 0,carno,startlap,startrank,endrank,diff,sign,pred_endrank,pred_diff,pred_sign
32,9,31,5,8,3,1,8,3,1
33,9,49,8,10,2,1,12,4,1
34,9,94,10,8,-2,-1,7,-3,-1
35,9,129,8,10,2,1,8,0,0
36,9,159,10,2,-8,-1,11,1,1
39,12,31,3,5,2,1,8,5,1
40,12,49,5,0,-5,-1,8,3,1
41,12,93,0,1,1,1,2,2,1
42,12,128,1,7,6,1,5,4,1
43,12,170,7,0,-7,-1,2,-5,-1
