### stage_model_regressor

predicting target: end_rank = start_rank + change (old target)

base: 14./stage_model_regressor_withneighbor-newfeatures

prediction models for chg_of_rank_in_stage on the stage dataset

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math


In [2]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
# estimators are imported from their public packages: the private submodule
# paths used before (sklearn.ensemble.forest, sklearn.linear_model.ridge,
# sklearn.linear_model.stochastic_gradient, sklearn.svm.classes) were
# deprecated in scikit-learn 0.22 and removed in later releases
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.utils import shuffle
from sklearn import metrics
import xgboost as xgb



In [3]:
# build regression model
regressors = ['currank','avgrank','dice','lasso','ridge','rf','svr','xgb']
def get_regressor(regressor = 'lr'):
    """
    create a fresh, unfitted model for the given regressor name

    the recognised names are the entries of `regressors`; any other
    name (including the default 'lr') gives None
    """
    # each entry is a zero-argument factory, so a model is only
    # constructed for the name that was actually requested
    factories = {
        "lasso": lambda: LassoCV(cv=5, random_state=0),
        "ridge": lambda: RidgeCV(alphas=np.logspace(-6, 6, 13)),
        "rf": lambda: RandomForestRegressor(n_estimators=100),
        'svr': lambda: SVR(kernel='rbf'),
        'xgb': lambda: xgb.XGBRegressor(objective="reg:linear", random_state=42, max_depth=3),
        'dice': lambda: RandomDice('1234'),
        'currank': lambda: CurRank(),
        'avgrank': lambda: AverageRank(),
    }

    make = factories.get(regressor)
    if make is None:
        return None
    return make()


class CurRank():
    """
    baseline that predicts 0 for every row
    (i.e. no change from the current rank)
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        # nothing is learned from the training data
        pass
    def predict(self, test_x):
        # one zero per row of test_x
        n_rows = test_x.shape[0]
        return np.array([0] * n_rows)
    
class AverageRank():
    """
    baseline that returns feature column 13 of every test row as its prediction

    the original author labels that column change_in_rank_all
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        # nothing is learned from the training data
        pass
    def predict(self, test_x):
        #13, change_in_rank_all
        picked = [row[13] for row in test_x]
        return np.array(picked)

class RandomDice():
    """
    a random dice model

    fit() records the distinct target values and their cumulative
    frequencies; predict() draws one uniform random number per test row
    and returns the value whose cumulative bucket contains the draw
    """
    def __init__(self, seed='1234'):
        # cumulative frequencies, aligned with self.val
        self.dist = []
        # distinct target values seen by fit()
        self.val = []
        # private generator: yields the same sequence as seeding the global
        # `random` module with this seed, but leaves the global state alone
        self._rng = random.Random(seed)

    def fit(self, x, y):
        """
        learn the empirical distribution of y (x is ignored)
        """
        # start from scratch, otherwise a second fit() would append a
        # second distribution behind the first one
        self.dist = []
        self.val = []

        total = y.shape[0]
        ratio = 0.
        for val in set(y):
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)

    def predict(self, test_x):
        """
        return one sampled target value per row of test_x
        """
        # one draw per row, in row order
        draws = [self._rng.random() for _ in test_x]

        # index of the first cumulative frequency that is >= the draw
        idx = np.searchsorted(self.dist, draws)
        # float rounding can leave the last cumulative value slightly below
        # 1.0; such draws fall back to the last value
        idx = np.minimum(idx, len(self.val) - 1)

        return np.array(self.val)[idx]

def evaluate(test_y, pred_y):
    """
    score predictions against the true targets

    prints a one-line summary and returns the tuple (mae, rmse, r2);
    note that the printed line starts with rmse while the returned
    tuple starts with mae
    """
    mse = metrics.mean_squared_error(test_y, pred_y)
    rmse = math.sqrt(mse)
    mae = metrics.mean_absolute_error(test_y, pred_y)
    r2 = metrics.r2_score(test_y, pred_y)

    summary = 'rmse=%.2f, mae=%.2f, r2=%.2f'%(rmse, mae, r2)
    print(summary)
    return mae, rmse, r2
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    hold out one event as the test set

    rows whose 'eventid' equals eventid form the test set, all remaining
    rows the training set.

    returns (train, test, train_x, train_y, test_x, test_y): train/test are
    the full numpy matrices, *_y is column 1 and *_x is every column from
    index 2 onwards
    """
    held_out = stagedata['eventid'] == eventid
    train = stagedata[~held_out].to_numpy()
    test = stagedata[held_out].to_numpy()

    # column 1 is the target, the features start at column 2
    # NOTE(review): the original comment labels column 2 as car_number, but
    # the column list in the comment above this function puts 'eventid' at
    # index 2 -- confirm which columns are meant to be features
    train_y, train_x = train[:,1], train[:,2:]
    test_y, test_x = test[:,1], test[:,2:]

    # a variant that predicts the end rank (target + column 8) is disabled

    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    split on the stage number

    rows with 'stageid' <= stageid form the training set, rows with a
    larger 'stageid' the test set.

    returns (train, test, train_x, train_y, test_x, test_y): train/test are
    the full numpy matrices, *_y is column 1 and *_x is every column from
    index 2 onwards
    """
    early = stagedata['stageid'] <= stageid
    late = stagedata['stageid'] > stageid
    train = stagedata[early].to_numpy()
    test = stagedata[late].to_numpy()

    # column 1 is the target, the features start at column 2
    train_y, train_x = train[:,1], train[:,2:]
    test_y, test_x = test[:,1], test[:,2:]

    return train, test, train_x, train_y, test_x, test_y


def regressor_model(name='svr'):
    """
    fit the named model and score it on the current split

    train_x, train_y, test_x and test_y are read from the enclosing
    (notebook) namespace, so a split has to be made before calling this.
    returns whatever evaluate() returns for the predictions
    """
    print('[*] predict with %s model'%name)
    model = get_regressor(name)
    model.fit(train_x, train_y)

    #int only: astype(int) drops the fractional part of each prediction
    predictions = model.predict(test_x).astype(int)

    return evaluate(test_y, predictions)

In [4]:
#load data
# settings that select which pre-generated stage dataset file is read
_trim = 0
_predictlen = 2
_include_final = False
if _include_final:
    include_str = '1'
else:
    include_str = '0'
suffix = f'indy500-2013-2019-end{include_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}-l{_predictlen}.csv'

# missing values are replaced by 0 before any model sees the data
stagedata = pd.read_csv(output_file)
stagedata = stagedata.fillna(0)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1312 entries, 0 to 1311
Data columns (total 35 columns):
Unnamed: 0                   1312 non-null int64
target                       1312 non-null int64
eventid                      1312 non-null int64
car_number                   1312 non-null int64
stageid                      1312 non-null int64
firststage                   1312 non-null int64
pit_in_caution               1312 non-null int64
start_position               1312 non-null int64
start_rank                   1312 non-null int64
start_rank_ratio             1312 non-null float64
top_pack                     1312 non-null int64
bottom_pack                  1312 non-null int64
average_rank                 1312 non-null float64
average_rank_all             1312 non-null float64
change_in_rank               1312 non-null int64
change_in_rank_all           1312 non-null float64
rate_of_change               1312 non-null int64
rate_of_change_all           1312 non-null float64
l

### model on data split by event

In [5]:
# leave-one-event-out evaluation: every event in turn is the test set and
# every regressor in `regressors` is trained on the remaining events
cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(regressors)
print('cols:%s'%cols)

events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(eventsname)}

# one result row per held-out event; the rows are collected in plain lists
# and turned into DataFrames once after the loop, instead of concatenating
# onto an empty frame on every iteration
rows_mae = []
rows_r2 = []
# sorted() makes the order of the runs independent of set iteration order
for eventid in sorted(events):
    print('Testset = %s'%eventsname[eventid])

    # regressor_model() reads train_x/train_y/test_x/test_y from the notebook
    # namespace, so they have to be rebound here for every split
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

    #record
    header = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

    # evaluate() returns (mae, rmse, r2): index 0 is MAE, index 2 is R2
    scores = [regressor_model(clf) for clf in regressors]
    rows_mae.append(header + [score[0] for score in scores])
    rows_r2.append(header + [score[2] for score in scores])

retdf0 = pd.DataFrame(rows_mae, columns=cols)
retdf1 = pd.DataFrame(rows_r2, columns=cols)

#retdf0.to_csv('regressors_stagedata_splitbyevent%s_mae.csv'%suffix)
#retdf1.to_csv('regressors_stagedata_splitbyevent%s_r2.csv'%suffix)

# retdf0 holds MAE values, not RMSE
df_event_mae = retdf0
# misleading legacy name, kept so that cells referring to it keep working
df_event_rmse = retdf0
df_event_r2 = retdf1

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2013
[*] predict with currank model
rmse=5.89, mae=3.87, r2=-0.16
[*] predict with avgrank model
rmse=5.48, mae=3.92, r2=0.00
[*] predict with dice model
rmse=7.66, mae=5.58, r2=-0.95
[*] predict with lasso model
rmse=3.82, mae=2.80, r2=0.51
[*] predict with ridge model
rmse=3.77, mae=2.79, r2=0.53
[*] predict with rf model
rmse=4.07, mae=2.84, r2=0.45
[*] predict with svr model
rmse=5.49, mae=3.66, r2=-0.00
[*] predict with xgb model
rmse=3.96, mae=2.76, r2=0.48
Testset = Indy500-2014
[*] predict with currank model
rmse=4.47, mae=3.09, r2=-0.17
[*] predict with avgrank model
rmse=4.49, mae=3.29, r2=-0.18
[*] predict with dice model
rmse=5.93, mae=4.45, r2=-1.05
[*] predict with lasso model
rmse=3.19, mae=2.25, r2=0.41
[*] predict with ridge model
rmse=3.24, mae=2.33, r2=0.39
[*] predict with rf model
rmse=3.74, mae=2.62, r2=0.18
[*] predict 

In [6]:
# show the per-event score table built in the previous cell; despite its name
# it holds MAE (the first element of evaluate()'s return value), not RMSE
df_event_rmse

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Indy500-2013,1146,166,"+:76,0:34,-:56",3.873494,3.915663,5.584337,2.801205,2.789157,2.843373,3.662651,2.759036
0,Indy500-2014,1117,195,"+:108,0:43,-:44",3.092308,3.287179,4.446154,2.246154,2.333333,2.620513,2.866667,3.210256
0,Indy500-2015,1145,167,"+:74,0:41,-:52",2.790419,3.42515,4.964072,2.54491,2.634731,2.383234,2.646707,2.502994
0,Indy500-2016,1086,226,"+:84,0:44,-:98",2.530973,3.022124,4.650442,2.384956,2.362832,2.261062,2.411504,2.274336
0,Indy500-2017,1097,215,"+:64,0:76,-:75",2.525581,3.102326,4.990698,2.511628,2.423256,2.344186,2.539535,2.586047
0,Indy500-2018,1153,159,"+:92,0:44,-:23",3.704403,3.672956,4.779874,2.371069,2.333333,2.27673,3.433962,2.213836
0,Indy500-2019,1128,184,"+:96,0:42,-:46",3.951087,4.494565,5.603261,2.543478,2.559783,2.38587,3.684783,2.728261


In [7]:
### train 2013-2017
# 2018 and 2019 are each used as the test set once; the other of the two is
# removed from the data first, so training only sees 2013-2017
#load data
_trim = 0
_include_final = False
include_str = '1' if _include_final else '0'
suffix = f'indy500-2013-2019-end{include_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}-l{_predictlen}.csv'
stagedata = pd.read_csv(output_file)

stagedata.fillna(0, inplace=True)

cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(regressors)
print('cols:%s'%cols)

events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(eventsname)}

# (event used as test set, event removed from the data entirely)
holdout_plan = [('Indy500-2018', 'Indy500-2019'),
                ('Indy500-2019', 'Indy500-2018')]

# data actually used for each test event, keyed by the test event's name
holdout_data = {}
rows_mae = []
rows_r2 = []
for test_name, ignore_name in holdout_plan:
    eventid = events_id[test_name]
    ignore_eventid = events_id[ignore_name]
    holdout_data[test_name] = stagedata[stagedata['eventid']!=ignore_eventid]

    print('Testset = %s'%eventsname[eventid])

    # regressor_model() reads train_x/train_y/test_x/test_y from the notebook
    # namespace, so they have to be rebound here for every split
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(holdout_data[test_name], eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

    #record
    header = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

    # evaluate() returns (mae, rmse, r2): index 0 is MAE, index 2 is R2
    scores = [regressor_model(clf) for clf in regressors]
    rows_mae.append(header + [score[0] for score in scores])
    rows_r2.append(header + [score[2] for score in scores])

# names the original cell bound for the two filtered data sets
stdata_2018 = holdout_data['Indy500-2018']
stdata_2019 = holdout_data['Indy500-2019']

# retdf0: MAE per regressor, retdf1: R2 per regressor (one row per test event)
retdf0 = pd.DataFrame(rows_mae, columns=cols)
retdf1 = pd.DataFrame(rows_r2, columns=cols)

retdf0

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
rmse=5.64, mae=3.70, r2=-0.46
[*] predict with avgrank model
rmse=5.00, mae=3.67, r2=-0.15
[*] predict with dice model
rmse=7.32, mae=5.23, r2=-1.45
[*] predict with lasso model
rmse=3.79, mae=2.43, r2=0.34
[*] predict with ridge model
rmse=3.71, mae=2.40, r2=0.37
[*] predict with rf model
rmse=3.42, mae=2.18, r2=0.46
[*] predict with svr model
rmse=5.38, mae=3.52, r2=-0.32
[*] predict with xgb model
rmse=3.70, mae=2.37, r2=0.37
Testset = Indy500-2019
[*] predict with currank model
rmse=5.97, mae=3.95, r2=-0.25
[*] predict with avgrank model
rmse=6.02, mae=4.49, r2=-0.27
[*] predict with dice model
rmse=8.02, mae=5.91, r2=-1.26
[*] predict with lasso model
rmse=3.91, mae=2.45, r2=0.46
[*] predict with ridge model
rmse=3.88, mae=2.44, r2=0.47
[*] predict with rf model
rmse=3.81, mae=2.46, r2=0.49
[*] predict

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Indy500-2018,969,159,"+:92,0:44,-:23",3.704403,3.672956,5.226415,2.433962,2.396226,2.176101,3.522013,2.371069
0,Indy500-2019,969,184,"+:96,0:42,-:46",3.951087,4.494565,5.913043,2.451087,2.440217,2.461957,3.755435,2.717391


In [9]:
# the file name follows the _predictlen that selected the stage data file,
# instead of a hard-coded 2
retdf0.to_csv(f'stint_regressor_result_t2013-2017_t{_trim}_predictlen{_predictlen}.csv', float_format='%.3f')

### test

In [12]:
# reload the stage data and pick the Indy500-2018 rows as `testdf`
_trim = 0
_include_final = False
if _include_final:
    include_str = '1'
else:
    include_str = '0'
suffix = f'indy500-2013-2019-end{include_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}-l{_predictlen}.csv'

stagedata = pd.read_csv(output_file)
stagedata = stagedata.fillna(0)

cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(regressors)
print('cols:%s'%cols)
# NOTE: this replaces retdf0/retdf1 from earlier cells with empty frames
retdf0 = pd.DataFrame([],columns=cols)
retdf1 = pd.DataFrame([],columns=cols)

events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id = {name: position for position, name in enumerate(eventsname)}

#first
eventid = events_id['Indy500-2018']
ignore_eventid = events_id['Indy500-2019']
# drop the ignored event, then keep only the rows of the test event
keep_rows = stagedata['eventid']!=ignore_eventid
stdata_2018 = stagedata[keep_rows]

print('Testset = %s'%eventsname[eventid])

test_rows = stdata_2018['eventid'] == eventid
testdf = stdata_2018[test_rows]



cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2018


In [13]:
# show the Indy500-2018 rows selected in the previous cell
testdf

Unnamed: 0.1,Unnamed: 0,target,eventid,car_number,stageid,firststage,pit_in_caution,start_position,start_rank,start_rank_ratio,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
969,1132,2,5,1,1,1,0,4,3,0.090909,...,8.421594,2,32,66.06825,0,0,0,0,0,0
970,1133,2,5,1,2,1,1,4,3,0.090909,...,11.374793,2,18,61.12735,0,0,0,0,0,0
971,1134,11,5,1,3,1,1,4,9,0.272727,...,23.928703,2,20,119.20330,0,0,0,0,0,0
972,1135,12,5,1,4,1,0,4,9,0.272727,...,23.131872,2,31,117.22760,0,0,0,0,0,0
973,1136,11,5,1,5,1,0,4,8,0.242424,...,20.608892,2,34,59.83245,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1123,1317,2,5,88,7,1,0,22,17,0.515152,...,21.858578,2,30,104.37145,0,0,0,0,0,0
1124,1319,6,5,98,1,1,0,12,7,0.212121,...,7.174138,2,32,63.28770,0,0,0,0,0,0
1125,1320,3,5,98,2,1,1,12,10,0.303030,...,9.690737,2,18,61.57250,0,0,0,0,0,0
1126,1321,8,5,98,3,1,0,12,15,0.454545,...,22.939522,2,42,116.28375,0,0,0,0,0,0


In [14]:
# export car number, start lap, start rank and end rank of the 2018 test rows
df = testdf

# columns needed for the export, in output order
needed = ['car_number','start_lap','start_rank','target']
missing = [c for c in needed if c not in df.columns]
if missing:
    # without this check the column selection below fails with the bare
    # pandas KeyError "[...] not in index"
    raise KeyError('stage data is missing column(s) %s; re-generate %s with them before exporting'
                   % (missing, output_file))

test = df.loc[df['eventid'] == events_id['Indy500-2018'], needed].to_numpy()
# start_lap and start_rank are shifted down by one for the export
test[:,1] = test[:,1]-1
test[:,2] = test[:,2]-1
# end rank = shifted start rank + target (presumably the rank change -- confirm)
test[:,3] = test[:,2] + test[:,3]
dfout = pd.DataFrame(test, columns =['carno','startlap', 'startrank','endrank'])
dfout.to_csv('test_db_ml_2018.csv')

KeyError: "['start_lap'] not in index"