### stage_model_regressor

predicting target: end_rank = start_rank + change(old target)

base: 14./stage_model_regressor_withneighbor-newfeatures

prediction models of chg_of_rank_in_stage on stage dataset

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math


In [2]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.linear_model.ridge import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.svm.classes import SVR
from sklearn.svm import LinearSVR
from sklearn.utils import shuffle
from sklearn import metrics
import xgboost as xgb



In [3]:
# bulid regression model
regressors = ['currank','avgrank','dice','lasso','ridge','rf',
              'svr_rbf','svr_lin','xgb']
              #'svr_rbf','svr_lin','svr_poly','xgb']
    
train_x, train_y, test_x, test_y = None, None, None, None

def get_regressor(regressor = 'lr'):
    if regressor == "lasso":
        clf = LassoCV(cv=5, random_state=0)
    elif regressor == "ridge":
        clf = RidgeCV(alphas=np.logspace(-6, 6, 13))
    elif regressor == "rf":
        #clf = RandomForestRegressor(n_estimators=100)
        clf = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=5,
                      min_samples_split=12, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
    elif regressor == 'svr_rbf':
        clf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
    elif regressor == 'svr_lin':
        #clf = SVR(kernel='linear', C=100, gamma='auto')
        clf = SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
            kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
        #clf = SVR(kernel='linear', C=1, gamma='auto')
        #clf = LinearSVR(random_state=0, tol=1e-5)
    elif regressor == 'svr_poly':
        clf = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1,
               coef0=1)
    elif regressor == 'xgb':
        #clf = xgb.XGBRegressor(objective="reg:linear", random_state=42, max_depth=3)
        clf = xgb.XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0.3, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=7,
             min_child_weight=4, monotone_constraints=None,
             n_estimators=100, n_jobs=1, nthread=1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.8, tree_method=None,
             validate_parameters=False, verbosity=None)
    elif regressor == 'dice':
        clf = RandomDice('1234')
    elif regressor == 'currank':
        clf = CurRank()
    elif regressor == 'avgrank':
        clf = AverageRank()        
    else:
        clf = None
        
    return clf


class CurRank():
    """
    predict with current rank
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = [0 for x in range(test_x.shape[0])]
        return np.array(pred_y)
    
class AverageRank():
    """
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    change_in_rank_all = test[:,15]
    pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in change_in_rank_all])
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            #13, change_in_rank_all
            pred_y.append(x[13])
        #pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in pred_y])
        pred_y_avg = pred_y
        return np.array(pred_y_avg)   

class RandomDice():
    """
    a random dice model
    """
    def __init__(self, seed='1234'):
        self.dist = []
        self.val = []
        random.seed(seed)
    
    def fit(self, x, y):
        total = y.shape[0]
        yval = set(y)
        
        ratio = 0.
        for val in yval:
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)
            
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            dice = random.random()
            #search in self.dist
            find_idx = -1
            for idx, ratio in enumerate(self.dist):
                if dice <= ratio:
                    find_idx = idx
                    break
            
            #or the last one match
            pred_y.append(self.val[find_idx])
            
        return np.array(pred_y)

def evaluate(test_y, pred_y):
    mae = metrics.mean_absolute_error(test_y, pred_y) 
    rmse = math.sqrt(metrics.mean_squared_error(test_y, pred_y))
    r2 = metrics.r2_score(test_y, pred_y)
    print('rmse=%.2f, mae=%.2f, r2=%.2f'%(rmse, mae, r2))
    return mae,rmse, r2
    
#
#features
#    cols=[Myidx, 'target','start_lap', eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
#
#  target id = 1
#  start id = 2 by default

def split_by_eventid(stagedata, train_events, test_events, startid=2):
    """
    split by eventid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['eventid'].isin(train_events)].to_numpy()
    test  = stagedata[stagedata['eventid'].isin(test_events)].to_numpy()

    #2:car_number
    train_x = train[:,startid:]
    #train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    train_y = train[:,1]
    test_x = test[:,startid:]
    #test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    test_y = test[:,1]
    
    #change target to endrank
    #train_y = train_y + train[:,8] 
    #test_y = test_y + test[:,8]
    
    train = stagedata[stagedata['eventid'].isin(train_events)]
    test  = stagedata[stagedata['eventid'].isin(test_events)]
    
    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    split by stageid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['stageid'] <= stageid].to_numpy()
    test  = stagedata[stagedata['stageid'] > stageid].to_numpy()

    train_x = train[:,2:]
    #train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    train_y = train[:,1]
    test_x = test[:,2:]
    #test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    test_y = test[:,1]
    
    return train, test, train_x, train_y, test_x, test_y


def regressor_model(name='svr'):
    ### test learning models
    print('[*] predict with %s model'%name)
    clf = get_regressor(name)
    clf.fit(train_x, train_y)

    pred_y = clf.predict(test_x)
    
    #int only
    pred_y = pred_y.astype(int)

    
    score = evaluate(test_y, pred_y)
    return score, pred_y

In [4]:
def build_df(testdf, pred_y):
    """
    build a standard stint prediction result:
    carno	startlap	startrank	endrank	diff	sign	pred_endrank	pred_diff	pred_sign
    endlap pred_endlap
    """
    
    print('build_df: len testdf=%d, len of pred_y=%d'%(len(testdf), len(pred_y)))
    
    test = testdf[['car_number','start_lap','start_rank','target']].values
    test[:,1] = test[:,1]-1
    test[:,2] = test[:,2]-1
    test[:,3] = test[:,2] + test[:,3]
    dfout = pd.DataFrame(test, columns =['carno','startlap', 'startrank','endrank'])
    
    dfout['diff'] = dfout['endrank'] - dfout['startrank']
    signVec = dfout['diff'].values.copy()
    for idx in range(len(signVec)):
        sign = 0
        if signVec[idx] > 0:
            sign = 1
        elif signVec[idx] < 0:
            sign = -1
        signVec[idx] = sign
    dfout['sign'] = signVec

    # add predictions
    dfout['pred_endrank'] = pred_y +  dfout['startrank']
    dfout['pred_diff'] = pred_y 
    signVec = dfout['pred_diff'].values.copy()
    for idx in range(len(signVec)):
        sign = 0
        if signVec[idx] > 0:
            sign = 1
        elif signVec[idx] < 0:
            sign = -1
        signVec[idx] = sign
    dfout['pred_sign'] = signVec
    return dfout
    

def test_cv():
    global train_x, train_y, test_x, test_y
    
    cols = ['runid','trainsize','testsize','testdistribution']
    cols.extend(regressors)
    print('cols:%s'%cols)
    retdf0 = pd.DataFrame([],columns=cols)
    retdf1 = pd.DataFrame([],columns=cols)

    events = set(stagedata['eventid'])

    years = ['2013','2014','2015','2016','2017','2018','2019']
    #events = ['Indy500']
    eventsname = [f'Indy500-{x}' for x in years]
    events_id={key:idx for idx, key in enumerate(eventsname)}
    for eventid in events:
        print('Testset = %s'%eventsname[eventid])

        train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
        test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
        #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
        #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

        #record
        rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
        rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

        pred_y = [0 for x in range(len(regressors))]
        acc0 = [0 for x in range(len(regressors))]
        acc1 = [0 for x in range(len(regressors))]
        for idx, clf in enumerate(regressors):
            acc, pred_y[idx] = regressor_model(clf)
            acc0[idx] = acc[0]
            acc1[idx] = acc[2]

        rec0.extend(acc0)
        rec1.extend(acc1)
        #print('rec:%s'%rec)

        #new df
        df = pd.DataFrame([rec0],columns=cols)
        retdf0 = pd.concat([retdf0, df])        

        df = pd.DataFrame([rec1],columns=cols)
        retdf1 = pd.concat([retdf1, df])        


    #retdf0.to_csv('regressors_stagedata_splitbyevent%s_rmse.csv'%suffix)
    #retdf1.to_csv('regressors_stagedata_splitbyevent%s_r2.csv'%suffix)
    #retdf0.to_csv('crossvalid_stagedata_regressor_%s.csv'%suffix, float_format='%.3f')

    df_event_rmse = retdf0
    df_event_r2 = retdf1
    return df_event_rmse
    
def test_20182019(startid = 2):
    global train_x, train_y, test_x, test_y

    pred_df = {}
    
    ### train 2013-2017
    #load data
    cols = ['runid','trainsize','testsize','testdistribution']
    cols.extend(regressors)
    print('cols:%s'%cols)
    retdf0 = pd.DataFrame([],columns=cols)
    retdf1 = pd.DataFrame([],columns=cols)

    eventsids = set(stagedata['eventid'])

    for event in events:
        test_eventid = events_id[event]
        
        if test_eventid in _train_events:
            continue
        
        print('Testset = %s'% event, test_eventid)

        traindf, testdf, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, 
                                                            _train_events, [test_eventid], startid=startid)
        test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
        #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
        #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

        #record
        rec0 = [event,train_x.shape[0],test_x.shape[0],test_distribution]
        rec1 = [event,train_x.shape[0],test_x.shape[0],test_distribution]

        pred_y = [0 for x in range(len(regressors))]
        acc0 = [0 for x in range(len(regressors))]
        acc1 = [0 for x in range(len(regressors))]
        for idx, clf in enumerate(regressors):
            acc, pred_y[idx] = regressor_model(clf)
            acc0[idx] = acc[0]
            acc1[idx] = acc[2]

            #build pred df
            if not event in pred_df:
                pred_df[event] = {}
            pred_df[event][clf] = build_df(testdf, pred_y[idx])


        rec0.extend(acc0)
        rec1.extend(acc1)
        #print('rec:%s'%rec)

        #new df
        df = pd.DataFrame([rec0],columns=cols)
        retdf0 = pd.concat([retdf0, df])        

    return retdf0, pred_df   

### test oracle with stint_len

In [5]:
events_info = {
    'Phoenix':(256, 1.022),'Indy500':(500,2.5),'Texas':(372,1.5),
    'Iowa':(268,0.894),'Pocono':(500,2.5),'Gateway':(310,1.25)
}

years = ['2013','2014','2015','2016','2017','2018','2019']
events = [f'Indy500-{x}' for x in years]

events.extend(['Phoenix-2018','Texas-2018','Texas-2019','Pocono-2018','Pocono-2019','Iowa-2018','Iowa-2019',
              'Gateway-2018','Gateway-2019'])

events_id={key:idx for idx, key in enumerate(events)}

dataroot = 'test/'

_train_events = [events_id[x] for x in [f'Indy500-{x}' for x in ['2013','2014','2015','2016','2017']]]


In [6]:
#load data
_trim = 0
_include_final = True
_include_stintlen = True
#_include_stintlen = False
include_str = '1' if _include_final else '0'
stint_str = '1' if _include_stintlen else ''
#output_file = f'stage-indy500-2013-2019-end{include_str}{stint_str}-t{_trim}-newtry.csv'
output_file = f'{dataroot}/stage-IndyCar-d{len(events)}-end{include_str}{stint_str}.csv'

stagedata = pd.read_csv(output_file)
stagedata.fillna(0, inplace=True)
stagedata.info()
stagedata_raw = stagedata.copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2265 entries, 0 to 2264
Data columns (total 39 columns):
Unnamed: 0                   2265 non-null int64
target                       2265 non-null int64
start_lap                    2265 non-null int64
eventid                      2265 non-null int64
car_number                   2265 non-null int64
stageid                      2265 non-null int64
firststage                   2265 non-null int64
pit_in_caution               2265 non-null int64
start_position               2265 non-null int64
start_rank                   2265 non-null int64
start_rank_ratio             2265 non-null float64
top_pack                     2265 non-null int64
bottom_pack                  2265 non-null int64
average_rank                 2265 non-null float64
average_rank_all             2265 non-null float64
change_in_rank               2265 non-null int64
change_in_rank_all           2265 non-null float64
rate_of_change               2265 non-null int64
rat

In [7]:
stagedf = stagedata
stagedf[(stagedf['car_number']==12) & (stagedf['eventid']==5)]

Unnamed: 0.1,Unnamed: 0,target,start_lap,eventid,car_number,stageid,firststage,pit_in_caution,start_position,start_rank,...,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank,cautionlaps_all,cautionlaps_cur,stint_len
1156,1327,2,32,5,12,1,1,0,3,4,...,66.10705,-1,0,0,0,-3,-5,0,0,18
1157,1328,-5,50,5,12,2,1,1,3,6,...,61.85245,-6,-5,0,6,3,3,2,2,44
1158,1329,1,94,5,12,3,1,0,3,1,...,117.01525,0,0,0,-3,3,3,15,0,35
1159,1330,6,129,5,12,4,1,0,3,2,...,59.24225,-3,0,0,2,1,0,0,0,42
1160,1331,-7,171,5,12,5,1,0,3,8,...,59.5001,5,2,6,4,-6,5,20,0,29


In [8]:
#df_event_oracle = test_cv()

In [9]:
retdf, preddf = {}, {}

In [10]:
retdf['oracle'], preddf['oracle'] = test_20182019()
retdf['oracle-start5'], preddf['oracle-start5'] = test_20182019(startid=5)

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr_rbf', 'svr_lin', 'xgb']
Testset = Indy500-2018 5
[*] predict with currank model
rmse=5.50, mae=4.10, r2=-0.01
build_df: len testdf=189, len of pred_y=189
[*] predict with avgrank model
rmse=9.38, mae=7.21, r2=-1.93
build_df: len testdf=189, len of pred_y=189
[*] predict with dice model
rmse=8.57, mae=6.56, r2=-1.44
build_df: len testdf=189, len of pred_y=189
[*] predict with lasso model
rmse=4.52, mae=3.28, r2=0.32
build_df: len testdf=189, len of pred_y=189
[*] predict with ridge model
rmse=4.68, mae=3.48, r2=0.27
build_df: len testdf=189, len of pred_y=189
[*] predict with rf model
rmse=4.51, mae=3.25, r2=0.32
build_df: len testdf=189, len of pred_y=189
[*] predict with svr_rbf model
rmse=5.50, mae=4.10, r2=-0.01
build_df: len testdf=189, len of pred_y=189
[*] predict with svr_lin model
rmse=4.74, mae=3.39, r2=0.25
build_df: len testdf=189, len of pred_y=189
[*] pred

rmse=2.83, mae=2.31, r2=-0.16
build_df: len testdf=81, len of pred_y=81
[*] predict with svr_rbf model
rmse=2.64, mae=1.85, r2=-0.01
build_df: len testdf=81, len of pred_y=81
[*] predict with svr_lin model
rmse=8.28, mae=7.57, r2=-8.90
build_df: len testdf=81, len of pred_y=81
[*] predict with xgb model
rmse=3.47, mae=2.84, r2=-0.74
build_df: len testdf=81, len of pred_y=81
Testset = Gateway-2018 14
[*] predict with currank model
rmse=5.30, mae=3.82, r2=-0.01
build_df: len testdf=87, len of pred_y=87
[*] predict with avgrank model
rmse=8.24, mae=6.45, r2=-1.43
build_df: len testdf=87, len of pred_y=87
[*] predict with dice model
rmse=8.97, mae=6.49, r2=-1.88
build_df: len testdf=87, len of pred_y=87
[*] predict with lasso model
rmse=5.14, mae=4.15, r2=0.06
build_df: len testdf=87, len of pred_y=87
[*] predict with ridge model
rmse=7.32, mae=6.22, r2=-0.92
build_df: len testdf=87, len of pred_y=87
[*] predict with rf model
rmse=4.29, mae=3.22, r2=0.34
build_df: len testdf=87, len of pre

ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [None]:
stagedata = stagedata.drop(['cautionlaps_all','cautionlaps_cur'],axis=1)
retdf['oracle-start5-nonewtry'], preddf['oracle-start5-nonewtry'] = test_20182019(startid=5)
retdf['oracle-start2-nonewtry'], preddf['oracle-start2-nonewtry'] = test_20182019(startid=2)

### test without stint_len

In [None]:
stagedata = stagedata_raw.copy()    

In [None]:
stagedata = stagedata.drop(['stint_len'],axis=1)

In [None]:
stagedf = stagedata
stagedf[(stagedf['car_number']==12) & (stagedf['eventid']==5)]

In [None]:
#df_event = test_cv()

In [None]:
#df_event

In [None]:
retdf['normal'], preddf['normal'] = test_20182019()
retdf['normal-start5'], preddf['normal-start5'] = test_20182019(startid=5)

In [None]:
stagedata = stagedata.drop(['cautionlaps_all','cautionlaps_cur'],axis=1)
retdf['normal-start5-nonewtry'], preddf['normal-start5-nonewtry'] = test_20182019(startid=5)
retdf['normal-start2-nonewtry'], preddf['normal-start2-nonewtry'] = test_20182019(startid=2)

In [None]:
for key in retdf:
    print(key)
    print(retdf[key])

### save result df

In [None]:
import pickle
def save_result(dfs, datafile):
    with open(datafile, 'wb') as f:
        #pack [global_carids, laptime_data]
        savedata = [dfs]
        #savedata = [freq, train_set, test_set]
        # Pickle the 'data' dictionary using the highest protocol available.
        pickle.dump(savedata, f, pickle.HIGHEST_PROTOCOL)

In [None]:

version = f'IndyCar-d{len(events)}'

outfile=f'{dataroot}/stint-dfout-mlmodels-{version}-end{include_str}-normal-t{_trim}-alldata.pickle'
save_result(preddf, outfile)


In [None]:
#output_file = f'stage-indy500-2013-2019-end{include_str}{stint_str}-t{_trim}.csv'
outfile=f'{dataroot}/stint-dfout-mlmodels-{version}-end{include_str}-normal-t{_trim}-newtry-tuned.pickle'
save_result(preddf['normal'], outfile)
outfile=f'{dataroot}/stint-dfout-mlmodels-{version}-end{include_str}-oracle-t{_trim}-newtry-tuned.pickle'
save_result(preddf['oracle'], outfile)

outfile=f'{dataroot}/stint-dfout-mlmodels-{version}-end{include_str}-normal-t{_trim}-tuned.pickle'
save_result(preddf['normal-start2-nonewtry'], outfile)
outfile=f'{dataroot}/stint-dfout-mlmodels-{version}-end{include_str}-oracle-t{_trim}-tuned.pickle'
save_result(preddf['oracle-start2-nonewtry'], outfile)
