### stage_model_regressor

Regression models that predict `lap_cnt` from the pit-stop dataset.

data format:
     'eventid', 'carno','pit_id', 'lap_number', 
     'lap_time',
     'lap_cnt','cautionlap_cnt','pit_oncaution_len','pit_oncaution'

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math


In [2]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.linear_model.ridge import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.svm.classes import SVR
from sklearn.utils import shuffle
from sklearn import metrics
import xgboost as xgb



In [3]:
# build regression model
# Names of the learned models evaluated in this notebook, in result-column order.
# get_regressor() also accepts the baselines 'currank', 'avgrank' and 'dice',
# which are not evaluated here.
regressors = [
    'lasso',
    'ridge',
    'rf',
    'svr',
    'xgb',
    'xgb-square',
]
def get_regressor(regressor = 'lr'):
    """
    Build an unfitted estimator for the given model name.

    Recognized names:
        'lasso'      -> LassoCV(cv=5, random_state=0)
        'ridge'      -> RidgeCV over alphas 1e-6 .. 1e6 (13 log-spaced values)
        'rf'         -> RandomForestRegressor with 100 trees
        'svr'        -> SVR with an RBF kernel
        'xgb'        -> XGBRegressor, squared-error objective, max_depth=2
        'xgb-square' -> XGBRegressor, squared-error objective, max_depth=3
        'dice'       -> RandomDice baseline seeded with '1234'
        'currank'    -> CurRank baseline
        'avgrank'    -> AverageRank baseline

    Returns None for any other name. Note that the default 'lr' is not a
    recognized name, so calling without arguments returns None.
    """
    # Each entry is a zero-argument factory so that only the requested
    # estimator is constructed.
    factories = {
        'lasso': lambda: LassoCV(cv=5, random_state=0),
        'ridge': lambda: RidgeCV(alphas=np.logspace(-6, 6, 13)),
        'rf': lambda: RandomForestRegressor(n_estimators=100),
        'svr': lambda: SVR(kernel='rbf'),
        # "reg:linear" is the deprecated spelling of "reg:squarederror";
        # use the current name so newer xgboost releases accept it.
        'xgb': lambda: xgb.XGBRegressor(objective="reg:squarederror", random_state=42, max_depth=2),
        'xgb-square': lambda: xgb.XGBRegressor(objective="reg:squarederror", random_state=42, max_depth=3),
        'dice': lambda: RandomDice('1234'),
        'currank': lambda: CurRank(),
        'avgrank': lambda: AverageRank(),
    }

    factory = factories.get(regressor)
    if factory is None:
        return None
    return factory()


class CurRank():
    """
    Constant baseline: predicts 0 for every sample.

    NOTE(review): the name suggests "keep the current rank", i.e. a predicted
    change of zero -- confirm against the notebook this class came from.
    """
    def __init__(self):
        pass

    def fit(self, x, y):
        # Nothing is learned; present only so the class matches the
        # fit/predict interface of the other models.
        return None

    def predict(self, test_x):
        # One zero per row of test_x.
        n_samples = test_x.shape[0]
        return np.array([0] * n_samples)
    
class AverageRank():
    """
    Baseline that returns feature column 13 of each sample as the prediction.

    The original comments label that column 'change_in_rank_all' (one of
    them gives the index as 15 instead) -- TODO confirm which is correct.
    NOTE(review): the feature matrices built in this notebook report only
    4 features, so indexing column 13 would raise IndexError on them.
    """
    def __init__(self):
        pass

    def fit(self, x, y):
        # Nothing is learned; present only for interface compatibility.
        return None

    def predict(self, test_x):
        # Read column 13 of every row, unchanged (no sign/threshold mapping).
        return np.array([row[13] for row in test_x])

class RandomDice():
    """
    Random baseline: draws each prediction from the empirical distribution
    of the target values seen in fit().

    Seeding uses the module-level `random` generator, so constructing an
    instance reseeds that shared generator.
    """
    def __init__(self, seed='1234'):
        # self.val[i] is a distinct target value; self.dist[i] is the
        # cumulative frequency of self.val[0..i] in the training targets.
        self.dist = []
        self.val = []
        random.seed(seed)

    def fit(self, x, y):
        """
        Record the distinct values of `y` and their cumulative frequencies.
        `x` is ignored.
        """
        # Start from a clean state: without this, a second fit() would append
        # to the previous distribution and predict() would keep sampling from
        # the stale one.
        self.val = []
        self.dist = []

        total = y.shape[0]
        cumulative = 0.
        for val in set(y):
            self.val.append(val)
            cumulative += np.sum(y==val)*1.0 / total
            self.dist.append(cumulative)

    def predict(self, test_x):
        """
        Return one randomly drawn training value per row of `test_x`.
        """
        pred_y = []
        for _ in test_x:
            dice = random.random()
            # First bucket whose cumulative frequency covers the draw; fall
            # back to the last value (-1) if rounding left the final
            # cumulative frequency slightly below the draw.
            find_idx = next(
                (idx for idx, ratio in enumerate(self.dist) if dice <= ratio),
                -1,
            )
            pred_y.append(self.val[find_idx])

        return np.array(pred_y)

def evaluate(test_y, pred_y):
    """
    Score predictions against the true targets.

    Prints the three metrics rounded to two decimals and returns them
    unrounded as the tuple (rmse, mae, r2).
    """
    rmse = math.sqrt(metrics.mean_squared_error(test_y, pred_y))
    mae = metrics.mean_absolute_error(test_y, pred_y)
    r2 = metrics.r2_score(test_y, pred_y)

    scores = (rmse, mae, r2)
    print('rmse=%.2f, mae=%.2f, r2=%.2f'%scores)
    return scores
    
#
def split_by_eventid(stagedata, eventid):
    """
    Hold out one event as the test set and use every other event for training.

    The frame is read positionally after conversion to a NumPy array:
    column 1 is the target and columns 2 onward are the features. Rows are
    assigned to train/test by the 'eventid' column.

    Returns (train, test, train_x, train_y, test_x, test_y), where `train`
    and `test` are the full row arrays including the first two columns.
    """
    event_col = stagedata['eventid']
    train = stagedata[event_col.ne(eventid)].to_numpy()
    test = stagedata[event_col.eq(eventid)].to_numpy()

    def _features_and_target(rows):
        # columns 2.. are the features, column 1 is the target
        return rows[:, 2:], rows[:, 1]

    train_x, train_y = _features_and_target(train)
    test_x, test_y = _features_and_target(test)

    return train, test, train_x, train_y, test_x, test_y


def regressor_model(name,train_x, train_y, test_x, test_y):
    """
    Fit the model called `name` on the training split and score it on the
    test split.

    Returns (score, test_y_int, pred_y) where `score` is the
    (rmse, mae, r2) tuple from evaluate() and both target arrays are cast
    to integers.

    Raises ValueError if `name` is not a model get_regressor() knows.
    """
    ### test learning models
    print(f'[*] predict with {name} model, features# {train_x.shape[1]}')
    clf = get_regressor(name)
    if clf is None:
        # get_regressor() signals an unknown name with None; fail here with a
        # clear message instead of an AttributeError on None.fit.
        raise ValueError(f'unknown regressor name: {name!r}')
    clf.fit(train_x, train_y)

    # `np.int` was only an alias of the builtin `int` and has been removed
    # from NumPy; the builtin gives the same dtype. The cast truncates the
    # fractional part of each prediction rather than rounding it.
    pred_y = clf.predict(test_x).astype(int)
    score = evaluate(test_y, pred_y)
    return score, test_y.astype(int), pred_y

In [4]:
# Result-table layout: four descriptive columns, then one score column per model.
cols = ['runid','trainsize','testsize','testdistribution'] + list(regressors)
print('cols:%s'%cols)

# Accumulators filled by test_model(): retdf0 collects RMSE rows, retdf1 collects R2 rows.
retdf0 = pd.DataFrame([],columns=cols)
retdf1 = pd.DataFrame([],columns=cols)

def test_model(test_eventid):
    """
    Evaluate every model in `regressors` with one event held out as the test set.

    Reads the notebook globals `stagedata`, `eventsname`, `regressors` and
    `cols`, and appends one row to each of the global result tables
    `retdf0` (RMSE per model) and `retdf1` (R2 per model).

    Returns (rmse_table, r2_table, ydata): the two accumulated tables and,
    per model, a [test_y, pred_y] pair in the same order as `regressors`.
    """
    global retdf0, retdf1

    event_name = eventsname[test_eventid]
    print('Testset = %s'%event_name)

    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, test_eventid)

    # Count test targets above / equal to / below 30.
    # NOTE(review): 30 is hard-coded; presumably a reference lap count -- confirm.
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>30),np.sum(test_y==30),np.sum(test_y<30))

    # Descriptive columns shared by the RMSE row and the R2 row.
    row_head = [event_name, train_x.shape[0], test_x.shape[0], test_distribution]

    # Each result is ((rmse, mae, r2), test_y, pred_y).
    results = [regressor_model(model_name, train_x, train_y, test_x, test_y)
               for model_name in regressors]
    rmse_scores = [score[0] for score, _, _ in results]
    r2_scores = [score[2] for score, _, _ in results]
    ydata = [[testy, predy] for _, testy, predy in results]

    retdf0 = pd.concat([retdf0, pd.DataFrame([row_head + rmse_scores], columns=cols)])
    retdf1 = pd.concat([retdf1, pd.DataFrame([row_head + r2_scores], columns=cols)])

    return retdf0, retdf1, ydata

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'lasso', 'ridge', 'rf', 'svr', 'xgb', 'xgb-square']


In [6]:
# Load the pit-stop records and print a column/dtype summary.
# Missing values are not filled here.
pitdata = pd.read_csv(filepath_or_buffer='pitstop-indy2013-2019.csv')
pitdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1540 entries, 0 to 1539
Data columns (total 10 columns):
Unnamed: 0           1540 non-null int64
eventid              1540 non-null int64
carno                1540 non-null int64
pit_id               1540 non-null int64
lap_number           1540 non-null int64
lap_time             1540 non-null float64
lap_cnt              1540 non-null int64
cautionlap_cnt       1540 non-null int64
pit_oncaution_len    1540 non-null int64
pit_oncaution        1540 non-null int64
dtypes: float64(1), int64(9)
memory usage: 120.4 KB


### model on data split by event

In [7]:
events = set(pitdata['eventid'])
years = ['2013','2014','2015','2016','2017','2018','2019']
eventsname = [f'Indy500-{x}' for x in years]
# Event name -> position in `eventsname`.
# NOTE(review): assumes the 'eventid' values in the data follow the same
# 0-based year order -- confirm against how the CSV was built.
events_id={key:idx for idx, key in enumerate(eventsname)}

# Hold out the most recent event as the test set.
test_event = eventsname[-1]
test_eventid = events_id[test_event]

# Column order matters: split_by_eventid() reads column 0 as the event id,
# column 1 as the target (lap_cnt) and the remaining columns as features.
stage_cols = ['eventid','lap_cnt','lap_time','cautionlap_cnt','pit_oncaution_len','pit_oncaution']

### scenario 1: all pit stops, raw lap_time
stagedata = pitdata[stage_cols]

rmse, r2, ydata1 = test_model(test_eventid)

### scenario 2: all pit stops, lap_time replaced by a 0/1 "long lap" flag (> 80)
stagedata = pitdata[stage_cols]

laptime = stagedata['lap_time'].values
longlap = (laptime > 80).astype(float)
# .assign() builds a new frame instead of writing into a slice of pitdata,
# which is what triggered SettingWithCopyWarning.
stagedata = stagedata.assign(lap_time=longlap)

rmse, r2, ydata2 = test_model(test_eventid)

### scenario 3: only stops with no caution laps, lap_cnt > 20, lap_time < 80, not pitted under caution
df = pitdata[stage_cols]

stagedata = df[(df['cautionlap_cnt']==0) & (df['lap_cnt']>20) & (df['lap_time']<80) & (df['pit_oncaution']==0)]

rmse, r2, ydata3 = test_model(test_eventid)

### scenario 4: same filter as scenario 3, lap_time replaced by a 0/1 flag (> 60)
df = pitdata[stage_cols]

stagedata = df[(df['cautionlap_cnt']==0) & (df['lap_cnt']>20) & (df['lap_time']<80) & (df['pit_oncaution']==0)]

laptime = stagedata['lap_time'].values
longlap = (laptime > 60).astype(float)
stagedata = stagedata.assign(lap_time=longlap)

rmse, r2, ydata4 = test_model(test_eventid)

Testset = Indy500-2019
[*] predict with lasso model, features# 4
rmse=9.80, mae=7.66, r2=0.48
[*] predict with ridge model, features# 4
rmse=9.62, mae=7.33, r2=0.50
[*] predict with rf model, features# 4
rmse=9.89, mae=6.93, r2=0.47
[*] predict with svr model, features# 4
rmse=10.72, mae=8.58, r2=0.38
[*] predict with xgb model, features# 4
rmse=9.68, mae=7.45, r2=0.49
[*] predict with xgb-square model, features# 4
rmse=9.80, mae=7.30, r2=0.48


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Testset = Indy500-2019
[*] predict with lasso model, features# 4
rmse=9.60, mae=7.21, r2=0.50
[*] predict with ridge model, features# 4
rmse=9.62, mae=7.25, r2=0.50
[*] predict with rf model, features# 4
rmse=10.28, mae=7.40, r2=0.43
[*] predict with svr model, features# 4
rmse=9.65, mae=7.08, r2=0.49
[*] predict with xgb model, features# 4
rmse=9.68, mae=7.43, r2=0.49
[*] predict with xgb-square model, features# 4
rmse=10.11, mae=7.48, r2=0.44
Testset = Indy500-2019
[*] predict with lasso model, features# 4
rmse=2.86, mae=2.53, r2=-0.37
[*] predict with ridge model, features# 4
rmse=2.86, mae=2.51, r2=-0.37
[*] predict with rf model, features# 4
rmse=2.85, mae=2.28, r2=-0.36
[*] predict with svr model, features# 4
rmse=2.70, mae=2.23, r2=-0.22
[*] predict with xgb model, features# 4
rmse=2.77, mae=2.16, r2=-0.28
[*] predict with xgb-square model, features# 4
rmse=2.80, mae=2.21, r2=-0.31
Testset = Indy500-2019
[*] predict with lasso model, features# 4
rmse=2.86, mae=2.51, r2=-0.37
[*]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


rmse=2.86, mae=2.51, r2=-0.37
[*] predict with svr model, features# 4
rmse=2.70, mae=2.23, r2=-0.22
[*] predict with xgb model, features# 4
rmse=2.86, mae=2.51, r2=-0.37
[*] predict with xgb-square model, features# 4
rmse=2.86, mae=2.51, r2=-0.37


In [8]:
# Show the accumulated RMSE table returned by the most recent test_model() call.
rmse

Unnamed: 0,runid,trainsize,testsize,testdistribution,lasso,ridge,rf,svr,xgb,xgb-square
0,Indy500-2019,1323,217,"+:128,0:7,-:82",9.802896,9.624757,9.887151,10.724665,9.6799,9.799605
0,Indy500-2019,1323,217,"+:128,0:7,-:82",9.599827,9.624757,10.279952,9.651293,9.677995,10.109994
0,Indy500-2019,337,43,"+:30,0:5,-:8",2.85706,2.861127,2.852987,2.702282,2.766074,2.799502
0,Indy500-2019,337,43,"+:30,0:5,-:8",2.861127,2.861127,2.861127,2.702282,2.861127,2.861127


In [8]:
#ydata

In [9]:
# Hold out the second most recent event as the test set.
test_event = eventsname[-2]
test_eventid = events_id[test_event]


# Column order matters: split_by_eventid() reads column 0 as the event id,
# column 1 as the target (lap_cnt) and the remaining columns as features.
stage_cols = ['eventid','lap_cnt','lap_time','cautionlap_cnt','pit_oncaution_len','pit_oncaution']

### scenario 1: all pit stops, raw lap_time
stagedata = pitdata[stage_cols]

rmse, r2, ydata21 = test_model(test_eventid)

### scenario 2: all pit stops, lap_time replaced by a 0/1 "long lap" flag (> 80)
stagedata = pitdata[stage_cols]

laptime = stagedata['lap_time'].values
longlap = (laptime > 80).astype(float)
# .assign() builds a new frame instead of writing into a slice of pitdata,
# which is what triggered SettingWithCopyWarning.
stagedata = stagedata.assign(lap_time=longlap)

rmse, r2, ydata22 = test_model(test_eventid)

### scenario 3: only stops with no caution laps, lap_cnt > 20, lap_time < 80, not pitted under caution
df = pitdata[stage_cols]

stagedata = df[(df['cautionlap_cnt']==0) & (df['lap_cnt']>20) & (df['lap_time']<80) & (df['pit_oncaution']==0)]

rmse, r2, ydata23 = test_model(test_eventid)

### scenario 4: same filter as scenario 3, lap_time replaced by a 0/1 flag (> 60)
df = pitdata[stage_cols]

stagedata = df[(df['cautionlap_cnt']==0) & (df['lap_cnt']>20) & (df['lap_time']<80) & (df['pit_oncaution']==0)]

laptime = stagedata['lap_time'].values
longlap = (laptime > 60).astype(float)
stagedata = stagedata.assign(lap_time=longlap)

rmse, r2, ydata24 = test_model(test_eventid)

Testset = Indy500-2018
[*] predict with lasso model, features# 4
rmse=7.74, mae=6.30, r2=0.59
[*] predict with ridge model, features# 4
rmse=7.76, mae=6.17, r2=0.58
[*] predict with rf model, features# 4
rmse=8.93, mae=6.54, r2=0.45
[*] predict with svr model, features# 4
rmse=9.04, mae=6.78, r2=0.43
[*] predict with xgb model, features# 4
rmse=8.47, mae=6.67, r2=0.50
[*] predict with xgb-square model, features# 4
rmse=8.90, mae=6.82, r2=0.45
Testset = Indy500-2018
[*] predict with lasso model, features# 4
rmse=7.56, mae=6.01, r2=0.60
[*] predict with ridge model, features# 4
rmse=7.50, mae=5.94, r2=0.61
[*] predict with rf model, features# 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


rmse=7.41, mae=5.72, r2=0.62
[*] predict with svr model, features# 4
rmse=8.83, mae=6.61, r2=0.46
[*] predict with xgb model, features# 4
rmse=7.82, mae=6.09, r2=0.58
[*] predict with xgb-square model, features# 4
rmse=7.79, mae=5.97, r2=0.58
Testset = Indy500-2018
[*] predict with lasso model, features# 4
rmse=4.04, mae=3.65, r2=-5.08
[*] predict with ridge model, features# 4
rmse=4.04, mae=3.65, r2=-5.08
[*] predict with rf model, features# 4
rmse=4.38, mae=3.93, r2=-6.13
[*] predict with svr model, features# 4
rmse=3.70, mae=3.33, r2=-4.08
[*] predict with xgb model, features# 4
rmse=4.43, mae=3.96, r2=-6.30
[*] predict with xgb-square model, features# 4
rmse=4.46, mae=3.96, r2=-6.40
Testset = Indy500-2018
[*] predict with lasso model, features# 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


rmse=3.86, mae=3.49, r2=-4.54
[*] predict with ridge model, features# 4
rmse=3.86, mae=3.49, r2=-4.54
[*] predict with rf model, features# 4
rmse=3.86, mae=3.49, r2=-4.54
[*] predict with svr model, features# 4
rmse=3.49, mae=3.05, r2=-3.52
[*] predict with xgb model, features# 4
rmse=3.86, mae=3.49, r2=-4.54
[*] predict with xgb-square model, features# 4
rmse=3.86, mae=3.49, r2=-4.54


In [10]:
# Show the accumulated RMSE table returned by the most recent test_model() call.
rmse

Unnamed: 0,runid,trainsize,testsize,testdistribution,lasso,ridge,rf,svr,xgb,xgb-square
0,Indy500-2019,1323,217,"+:128,0:7,-:82",9.802896,9.624757,9.947095,10.724665,9.6799,9.799605
0,Indy500-2019,1323,217,"+:128,0:7,-:82",9.599827,9.624757,10.332268,9.651293,9.677995,10.109994
0,Indy500-2019,337,43,"+:30,0:5,-:8",2.85706,2.861127,2.93733,2.702282,2.766074,2.799502
0,Indy500-2019,337,43,"+:30,0:5,-:8",2.861127,2.861127,2.861127,2.702282,2.861127,2.861127
0,Indy500-2018,1348,192,"+:101,0:7,-:84",7.741931,7.757725,8.932618,9.043875,8.474533,8.898736
0,Indy500-2018,1348,192,"+:101,0:7,-:84",7.559142,7.496527,7.40636,8.831761,7.819247,7.793895
0,Indy500-2018,325,55,"+:48,0:5,-:2",4.042951,4.042951,4.377629,3.695206,4.427189,4.459923
0,Indy500-2018,325,55,"+:48,0:5,-:2",3.856518,3.856518,3.856518,3.485033,3.856518,3.856518


### test

In [11]:
# Inspect the pit-stop rows whose lap_cnt is zero.
pitdata.loc[pitdata['lap_cnt'] == 0]

Unnamed: 0.1,Unnamed: 0,eventid,carno,pit_id,lap_number,lap_time,lap_cnt,cautionlap_cnt,pit_oncaution_len,pit_oncaution
277,277,1,14,7,193,164.0781,0,0,3,1
383,383,1,63,2,62,344.7278,0,0,0,0
433,433,2,2,1,8,140.1497,0,0,7,1
558,558,2,26,5,151,80.8967,0,0,0,0
574,574,2,29,1,8,110.6246,0,0,7,1
604,604,2,63,6,115,154.1135,0,0,4,1
633,633,3,3,7,165,114.9221,0,0,3,1
667,667,3,8,6,153,109.5656,0,0,4,1
717,717,3,15,6,153,110.0731,0,0,3,1
748,748,3,20,4,68,383.5855,0,0,0,1


In [9]:
# Inspect every pit-stop row recorded for car number 12.
pitdata.loc[pitdata['carno'] == 12]

Unnamed: 0.1,Unnamed: 0,eventid,carno,pit_id,lap_number,lap_time,lap_cnt,cautionlap_cnt,pit_oncaution_len,pit_oncaution
56,56,0,12,0,33,54.996,33,3,0,0
57,57,0,12,1,58,91.9951,24,9,1,1
58,58,0,12,2,89,53.169,30,2,0,0
59,59,0,12,3,121,53.6528,31,0,0,0
60,60,0,12,4,154,52.706,32,0,0,0
61,61,0,12,5,185,52.5213,30,0,0,0
62,62,0,12,6,196,130.494,10,2,2,1
262,262,1,12,0,31,51.9776,31,0,0,0
263,263,1,12,1,62,53.5795,30,0,0,0
264,264,1,12,2,94,51.9107,31,0,0,0
