### stage_model_regressor

predicting target: end_rank = start_rank + change(old target)

base: 14./stage_model_regressor_withneighbor-newfeatures

prediction models of chg_of_rank_in_stage on stage dataset

data format:
    target , eventid ,    car_number,    stageid,     features...

In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math


In [3]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.linear_model.ridge import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.svm.classes import SVR
from sklearn.utils import shuffle
from sklearn import metrics
import xgboost as xgb



In [4]:
# bulid regression model
regressors = ['currank','avgrank','dice','lasso','ridge','rf',
              'svr_rbf','svr_lin','xgb']
              #'svr_rbf','svr_lin','svr_poly','xgb']
train_x, train_y, test_x, test_y = None, None, None, None

def get_regressor(regressor = 'lr'):
    if regressor == "lasso":
        clf = LassoCV(cv=5, random_state=0)
    elif regressor == "ridge":
        clf = RidgeCV(alphas=np.logspace(-6, 6, 13))
    elif regressor == "rf":
        clf = RandomForestRegressor(n_estimators=100)
    elif regressor == 'svr_rbf':
        clf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
    elif regressor == 'svr_lin':
        clf = SVR(kernel='linear', C=100, gamma='auto')
    elif regressor == 'svr_poly':
        clf = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1,
               coef0=1)
    elif regressor == 'xgb':
        clf = xgb.XGBRegressor(objective="reg:linear", random_state=42, max_depth=3)
    elif regressor == 'dice':
        clf = RandomDice('1234')
    elif regressor == 'currank':
        clf = CurRank()
    elif regressor == 'avgrank':
        clf = AverageRank()        
    else:
        clf = None
        
    return clf


class CurRank():
    """
    predict with current rank
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = [0 for x in range(test_x.shape[0])]
        return np.array(pred_y)
    
class AverageRank():
    """
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    change_in_rank_all = test[:,15]
    pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in change_in_rank_all])
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            #13, change_in_rank_all
            pred_y.append(x[13])
        #pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in pred_y])
        pred_y_avg = pred_y
        return np.array(pred_y_avg)   

class RandomDice():
    """
    a random dice model
    """
    def __init__(self, seed='1234'):
        self.dist = []
        self.val = []
        random.seed(seed)
    
    def fit(self, x, y):
        total = y.shape[0]
        yval = set(y)
        
        ratio = 0.
        for val in yval:
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)
            
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            dice = random.random()
            #search in self.dist
            find_idx = -1
            for idx, ratio in enumerate(self.dist):
                if dice <= ratio:
                    find_idx = idx
                    break
            
            #or the last one match
            pred_y.append(self.val[find_idx])
            
        return np.array(pred_y)

def evaluate(test_y, pred_y):
    mae = metrics.mean_absolute_error(test_y, pred_y) 
    rmse = math.sqrt(metrics.mean_squared_error(test_y, pred_y))
    r2 = metrics.r2_score(test_y, pred_y)
    print('rmse=%.2f, mae=%.2f, r2=%.2f'%(rmse, mae, r2))
    return mae,rmse, r2
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    split by eventid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['eventid'] != eventid].to_numpy()
    test  = stagedata[stagedata['eventid'] == eventid].to_numpy()

    #2:car_number
    train_x = train[:,2:]
    #train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    train_y = train[:,1]
    test_x = test[:,2:]
    #test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    test_y = test[:,1]
    
    #change target to endrank
    #train_y = train_y + train[:,8] 
    #test_y = test_y + test[:,8]
    
    train = stagedata[stagedata['eventid'] != eventid]
    test  = stagedata[stagedata['eventid'] == eventid]
    
    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    split by stageid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['stageid'] <= stageid].to_numpy()
    test  = stagedata[stagedata['stageid'] > stageid].to_numpy()

    train_x = train[:,2:]
    #train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    train_y = train[:,1]
    test_x = test[:,2:]
    #test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    test_y = test[:,1]
    
    return train, test, train_x, train_y, test_x, test_y


def regressor_model(name='svr'):
    ### test learning models
    print('[*] predict with %s model'%name)
    clf = get_regressor(name)
    clf.fit(train_x, train_y)

    pred_y = clf.predict(test_x)
    
    #int only
    pred_y = pred_y.astype(int)

    
    score = evaluate(test_y, pred_y)
    return score, pred_y

In [5]:
def build_df(testdf, pred_y):
    """
    build a standard stint prediction result:
    carno	startlap	startrank	endrank	diff	sign	pred_endrank	pred_diff	pred_sign
    endlap pred_endlap
    """
    
    print('build_df: len testdf=%d, len of pred_y=%d'%(len(testdf), len(pred_y)))
    
    test = testdf[['car_number','start_lap','start_rank','target']].values
    test[:,1] = test[:,1]-1
    test[:,2] = test[:,2]-1
    test[:,3] = test[:,2] + test[:,3]
    dfout = pd.DataFrame(test, columns =['carno','startlap', 'startrank','endrank'])
    
    dfout['diff'] = dfout['endrank'] - dfout['startrank']
    signVec = dfout['diff'].values.copy()
    for idx in range(len(signVec)):
        sign = 0
        if signVec[idx] > 0:
            sign = 1
        elif signVec[idx] < 0:
            sign = -1
        signVec[idx] = sign
    dfout['sign'] = signVec

    # add predictions
    dfout['pred_endrank'] = pred_y +  dfout['startrank']
    dfout['pred_diff'] = pred_y 
    signVec = dfout['pred_diff'].values.copy()
    for idx in range(len(signVec)):
        sign = 0
        if signVec[idx] > 0:
            sign = 1
        elif signVec[idx] < 0:
            sign = -1
        signVec[idx] = sign
    dfout['pred_sign'] = signVec
    return dfout
    

def test_cv():
    global train_x, train_y, test_x, test_y
    
    cols = ['runid','trainsize','testsize','testdistribution']
    cols.extend(regressors)
    print('cols:%s'%cols)
    retdf0 = pd.DataFrame([],columns=cols)
    retdf1 = pd.DataFrame([],columns=cols)

    events = set(stagedata['eventid'])

    years = ['2013','2014','2015','2016','2017','2018','2019']
    #events = ['Indy500']
    eventsname = [f'Indy500-{x}' for x in years]
    events_id={key:idx for idx, key in enumerate(eventsname)}
    for eventid in events:
        print('Testset = %s'%eventsname[eventid])

        train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
        test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
        #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
        #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

        #record
        rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
        rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

        pred_y = [0 for x in range(len(regressors))]
        acc0 = [0 for x in range(len(regressors))]
        acc1 = [0 for x in range(len(regressors))]
        for idx, clf in enumerate(regressors):
            acc, pred_y[idx] = regressor_model(clf)
            acc0[idx] = acc[0]
            acc1[idx] = acc[2]

        rec0.extend(acc0)
        rec1.extend(acc1)
        #print('rec:%s'%rec)

        #new df
        df = pd.DataFrame([rec0],columns=cols)
        retdf0 = pd.concat([retdf0, df])        

        df = pd.DataFrame([rec1],columns=cols)
        retdf1 = pd.concat([retdf1, df])        


    #retdf0.to_csv('regressors_stagedata_splitbyevent%s_rmse.csv'%suffix)
    #retdf1.to_csv('regressors_stagedata_splitbyevent%s_r2.csv'%suffix)
    retdf0.to_csv('crossvalid_stagedata_regressor_%s.csv'%suffix, float_format='%.3f')

    df_event_rmse = retdf0
    df_event_r2 = retdf1
    return df_event_rmse
    
def test_20182019():
    global train_x, train_y, test_x, test_y

    pred_df = {'2018':{}, '2019':{}}
    
    ### train 2013-2017
    #load data
    cols = ['runid','trainsize','testsize','testdistribution']
    cols.extend(regressors)
    print('cols:%s'%cols)
    retdf0 = pd.DataFrame([],columns=cols)
    retdf1 = pd.DataFrame([],columns=cols)

    events = set(stagedata['eventid'])

    years = ['2013','2014','2015','2016','2017','2018','2019']
    #events = ['Indy500']
    eventsname = [f'Indy500-{x}' for x in years]
    events_id={key:idx for idx, key in enumerate(eventsname)}

    #first 
    eventid = events_id['Indy500-2018']
    ignore_eventid = events_id['Indy500-2019']
    stdata_2018 = stagedata[stagedata['eventid']!=ignore_eventid]

    print('Testset = %s'%eventsname[eventid])

    traindf, testdf, train_x, train_y, test_x, test_y = split_by_eventid(stdata_2018, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
    #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
    #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

    #record
    rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

    pred_y = [0 for x in range(len(regressors))]
    acc0 = [0 for x in range(len(regressors))]
    acc1 = [0 for x in range(len(regressors))]
    for idx, clf in enumerate(regressors):
        acc, pred_y[idx] = regressor_model(clf)
        acc0[idx] = acc[0]
        acc1[idx] = acc[2]
        
        #build pred df
        pred_df['2018'][clf] = build_df(testdf, pred_y[idx])
        

    rec0.extend(acc0)
    rec1.extend(acc1)
    #print('rec:%s'%rec)

    #new df
    df = pd.DataFrame([rec0],columns=cols)
    retdf0 = pd.concat([retdf0, df])        

    #second 
    eventid = events_id['Indy500-2019']
    ignore_eventid = events_id['Indy500-2018']
    stdata_2019 = stagedata[stagedata['eventid']!=ignore_eventid]

    print('Testset = %s'%eventsname[eventid])

    traindf, testdf, train_x, train_y, test_x, test_y = split_by_eventid(stdata_2019, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
    #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
    #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

    #record
    rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

    acc0 = [0 for x in range(len(regressors))]
    acc1 = [0 for x in range(len(regressors))]
    for idx, clf in enumerate(regressors):
        acc, pred_y[idx] = regressor_model(clf)
        acc0[idx] = acc[0]
        acc1[idx] = acc[2]
        
        #build pred df
        pred_df['2019'][clf] = build_df(testdf, pred_y[idx])


    rec0.extend(acc0)
    rec1.extend(acc1)
    #print('rec:%s'%rec)

    #new df
    df = pd.DataFrame([rec0],columns=cols)
    retdf0 = pd.concat([retdf0, df])    

    retdf0.to_csv(f'stint_regressor_result_{suffix}.csv', float_format='%.3f')
    
    return retdf0, pred_df   

### test oracle with stint_len

In [9]:
#load data
_trim = 0
_include_final = True
_include_stintlen = True
#_include_stintlen = False
include_str = '1' if _include_final else '0'
stint_str = '1' if _include_stintlen else ''
suffix = f'indy500-2013-2019-end{include_str}{stint_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}{stint_str}-t{_trim}.csv'


stagedata = pd.read_csv(output_file)
stagedata.fillna(0, inplace=True)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522 entries, 0 to 1521
Data columns (total 37 columns):
Unnamed: 0                   1522 non-null int64
target                       1522 non-null int64
start_lap                    1522 non-null int64
stint_len                    1522 non-null int64
eventid                      1522 non-null int64
car_number                   1522 non-null int64
stageid                      1522 non-null int64
firststage                   1522 non-null int64
pit_in_caution               1522 non-null int64
start_position               1522 non-null int64
start_rank                   1522 non-null int64
start_rank_ratio             1522 non-null float64
top_pack                     1522 non-null int64
bottom_pack                  1522 non-null int64
average_rank                 1522 non-null float64
average_rank_all             1522 non-null float64
change_in_rank               1522 non-null int64
change_in_rank_all           1522 non-null float64
rat

In [11]:
stagedf = stagedata
stagedf[(stagedf['car_number']==12) & (stagedf['eventid']==5)]

Unnamed: 0.1,Unnamed: 0,target,start_lap,stint_len,eventid,car_number,stageid,firststage,pit_in_caution,start_position,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
1156,1327,2,32,18,5,12,1,1,0,3,...,8.527139,32,32,66.10705,-1,0,0,0,-3,-5
1157,1328,-5,50,44,5,12,2,1,1,3,...,11.242669,18,18,61.85245,-6,-5,0,6,3,3
1158,1329,1,94,35,5,12,3,1,0,3,...,23.296174,44,44,117.01525,0,0,0,-3,3,3
1159,1330,6,129,42,5,12,4,1,0,3,...,20.554273,35,35,59.24225,-3,0,0,2,1,0
1160,1331,-7,171,29,5,12,5,1,0,3,...,23.071877,42,42,59.5001,5,2,6,4,-6,5


In [6]:
#df_event_oracle = test_cv()

In [7]:
#df_event_oracle

In [8]:
retdf_oracle, preddf_oracle = test_20182019()

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr_rbf', 'svr_lin', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
rmse=5.50, mae=4.10, r2=-0.01
build_df: len testdf=189, len of pred_y=189
[*] predict with avgrank model
rmse=19.94, mae=17.43, r2=-12.24
build_df: len testdf=189, len of pred_y=189
[*] predict with dice model
rmse=8.57, mae=6.56, r2=-1.44
build_df: len testdf=189, len of pred_y=189
[*] predict with lasso model
rmse=4.51, mae=3.29, r2=0.32
build_df: len testdf=189, len of pred_y=189
[*] predict with ridge model
rmse=4.87, mae=3.64, r2=0.21
build_df: len testdf=189, len of pred_y=189
[*] predict with rf model
rmse=4.53, mae=3.28, r2=0.32
build_df: len testdf=189, len of pred_y=189
[*] predict with svr_rbf model
rmse=5.50, mae=4.10, r2=-0.01
build_df: len testdf=189, len of pred_y=189
[*] predict with svr_lin model
rmse=4.78, mae=3.42, r2=0.24
build_df: len testdf=189, len of pred_y=189
[*] pre

### test without stint_len

In [7]:
### test without stint_len
#load data
_trim = 0
_include_final = True
_include_stintlen = False
include_str = '1' if _include_final else '0'
stint_str = '1' if _include_stintlen else ''
suffix = f'indy500-2013-2019-end{include_str}{stint_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}{stint_str}-t{_trim}.csv'


stagedata = pd.read_csv(output_file)
stagedata.fillna(0, inplace=True)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522 entries, 0 to 1521
Data columns (total 37 columns):
Unnamed: 0                   1522 non-null int64
target                       1522 non-null int64
start_lap                    1522 non-null int64
stint_len                    1522 non-null int64
eventid                      1522 non-null int64
car_number                   1522 non-null int64
stageid                      1522 non-null int64
firststage                   1522 non-null int64
pit_in_caution               1522 non-null int64
start_position               1522 non-null int64
start_rank                   1522 non-null int64
start_rank_ratio             1522 non-null float64
top_pack                     1522 non-null int64
bottom_pack                  1522 non-null int64
average_rank                 1522 non-null float64
average_rank_all             1522 non-null float64
change_in_rank               1522 non-null int64
change_in_rank_all           1522 non-null float64
rat

In [12]:
stagedf = stagedata
stagedf[(stagedf['car_number']==12) & (stagedf['eventid']==5)]

Unnamed: 0.1,Unnamed: 0,target,start_lap,stint_len,eventid,car_number,stageid,firststage,pit_in_caution,start_position,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
1156,1327,2,32,18,5,12,1,1,0,3,...,8.527139,32,32,66.10705,-1,0,0,0,-3,-5
1157,1328,-5,50,44,5,12,2,1,1,3,...,11.242669,18,18,61.85245,-6,-5,0,6,3,3
1158,1329,1,94,35,5,12,3,1,0,3,...,23.296174,44,44,117.01525,0,0,0,-3,3,3
1159,1330,6,129,42,5,12,4,1,0,3,...,20.554273,35,35,59.24225,-3,0,0,2,1,0
1160,1331,-7,171,29,5,12,5,1,0,3,...,23.071877,42,42,59.5001,5,2,6,4,-6,5


In [10]:
#df_event = test_cv()

In [11]:
#df_event

In [12]:
retdf, preddf = test_20182019()

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr_rbf', 'svr_lin', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
rmse=5.50, mae=4.10, r2=-0.01
build_df: len testdf=189, len of pred_y=189
[*] predict with avgrank model
rmse=19.94, mae=17.43, r2=-12.24
build_df: len testdf=189, len of pred_y=189
[*] predict with dice model
rmse=8.57, mae=6.56, r2=-1.44
build_df: len testdf=189, len of pred_y=189
[*] predict with lasso model
rmse=4.51, mae=3.29, r2=0.32
build_df: len testdf=189, len of pred_y=189
[*] predict with ridge model
rmse=4.87, mae=3.64, r2=0.21
build_df: len testdf=189, len of pred_y=189
[*] predict with rf model
rmse=4.53, mae=3.35, r2=0.32
build_df: len testdf=189, len of pred_y=189
[*] predict with svr_rbf model
rmse=5.50, mae=4.10, r2=-0.01
build_df: len testdf=189, len of pred_y=189
[*] predict with svr_lin model
rmse=4.78, mae=3.42, r2=0.24
build_df: len testdf=189, len of pred_y=189
[*] pre

In [13]:
retdf

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr_rbf,svr_lin,xgb
0,Indy500-2018,1117,189,"+:72,0:19,-:98",4.100529,17.428571,6.560847,3.285714,3.640212,3.349206,4.100529,3.42328,3.571429
0,Indy500-2019,1117,216,"+:66,0:33,-:117",4.291667,17.486111,6.199074,4.027778,4.287037,4.055556,4.291667,4.722222,4.425926


In [14]:
retdf_oracle

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr_rbf,svr_lin,xgb
0,Indy500-2018,1117,189,"+:72,0:19,-:98",4.100529,17.428571,6.560847,3.285714,3.640212,3.275132,4.100529,3.42328,3.571429
0,Indy500-2019,1117,216,"+:66,0:33,-:117",4.291667,17.486111,6.199074,4.027778,4.287037,4.023148,4.291667,4.722222,4.425926


### free test

In [15]:
### test blackhorse car=27
df2018 = preddf_oracle['2018']['rf']
car27 = df2018[df2018['carno']==27]

In [16]:
car27

Unnamed: 0,carno,startlap,startrank,endrank,diff,sign,pred_endrank,pred_diff,pred_sign
125,27,33,19,21,2,1,22,3,1
126,27,49,21,21,0,0,20,-1,-1
127,27,94,21,17,-4,-1,20,-1,-1
128,27,129,17,8,-9,-1,14,-3,-1
129,27,173,8,3,-5,-1,9,1,1


In [17]:
evaluate(car27['pred_diff'].values,car27['diff'].values)

rmse=4.07, mae=3.40, r2=-2.99


(3.4, 4.0743097574926725, -2.9903846153846145)

In [18]:
### test blackhorse car=27
df2018 = preddf_oracle['2018']['xgb']
car27 = df2018[df2018['carno']==12]
car27

Unnamed: 0,carno,startlap,startrank,endrank,diff,sign,pred_endrank,pred_diff,pred_sign
39,12,31,3,5,2,1,8,5,1
40,12,49,5,0,-5,-1,8,3,1
41,12,93,0,1,1,1,2,2,1
42,12,128,1,7,6,1,5,4,1
43,12,170,7,0,-7,-1,2,-5,-1


In [19]:
evaluate(car27['pred_diff'].values,car27['diff'].values)

rmse=4.05, mae=3.20, r2=-0.31


(3.2, 4.049691346263317, -0.3057324840764333)

In [20]:
car27 = df2018
evaluate(car27['pred_diff'].values,car27['diff'].values)

rmse=4.76, mae=3.57, r2=-0.48


(3.5714285714285716, 4.761507919972167, -0.48108828757653566)

In [21]:
len(df2018)

189

In [22]:
retdf_oracle

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr_rbf,svr_lin,xgb
0,Indy500-2018,1117,189,"+:72,0:19,-:98",4.100529,17.428571,6.560847,3.285714,3.640212,3.275132,4.100529,3.42328,3.571429
0,Indy500-2019,1117,216,"+:66,0:33,-:117",4.291667,17.486111,6.199074,4.027778,4.287037,4.023148,4.291667,4.722222,4.425926


In [23]:
top10 = [12, 20, 9, 27, 28, 22, 29, 1, 6]
car27 = df2018[df2018['carno'].isin(top10)]
evaluate(car27['pred_diff'].values,car27['diff'].values)

rmse=4.97, mae=4.00, r2=-1.05


(4.0, 4.965184913423648, -1.0535664723841243)

In [24]:
top10 = [12, 20, 9, 27, 28]
car27 = df2018[df2018['carno'].isin(top10)]
evaluate(car27['pred_diff'].values,car27['diff'].values)

rmse=4.62, mae=3.48, r2=-0.53


(3.48, 4.6173585522460785, -0.525646897183421)

In [25]:
df2018 = preddf['2018']['xgb']
car27 = df2018[df2018['carno'].isin(top10)]
evaluate(car27['pred_diff'].values,car27['diff'].values)

rmse=4.62, mae=3.48, r2=-0.53


(3.48, 4.6173585522460785, -0.525646897183421)

### save result df

In [26]:
import pickle
def save_result(dfs, datafile):
    with open(datafile, 'wb') as f:
        #pack [global_carids, laptime_data]
        savedata = [dfs]
        #savedata = [freq, train_set, test_set]
        # Pickle the 'data' dictionary using the highest protocol available.
        pickle.dump(savedata, f, pickle.HIGHEST_PROTOCOL)

In [27]:
#output_file = f'stage-indy500-2013-2019-end{include_str}{stint_str}-t{_trim}.csv'
outfile=f'stint-dfout-mlmodels-indy500-tr2013_2017-te2018_2019-end{include_str}-normal-t{_trim}.pickle'
save_result(preddf, outfile)
outfile=f'stint-dfout-mlmodels-indy500-tr2013_2017-te2018_2019-end{include_str}-oracle-t{_trim}.pickle'
save_result(preddf_oracle, outfile)
