### stage_model_regressor

predicting target: end_rank = start_rank + change(old target)

base: 14./stage_model_regressor_withneighbor-newfeatures

Regression models that predict chg_of_rank_in_stage on the stage dataset.

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math


In [2]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.linear_model.ridge import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.svm.classes import SVR
from sklearn.utils import shuffle
from sklearn import metrics
import xgboost as xgb



In [3]:
# build regression model
# Short names of every model evaluated per run; get_regressor() maps each
# name to a model instance.
regressors = ['currank','avgrank','dice','lasso','ridge','rf','svr','xgb']
# Current train/test split, kept as module-level globals: the test_* driver
# functions assign them and regressor_model() reads them.
train_x, train_y, test_x, test_y = None, None, None, None

def get_regressor(regressor = 'lr'):
    """
    Create a fresh, unfitted model for a short model name.

    Recognised names are 'lasso', 'ridge', 'rf', 'svr', 'xgb', 'dice',
    'currank' and 'avgrank'. Any other name -- including the default
    'lr' -- yields None.
    """
    # Each entry is a zero-argument factory, so only the requested model is
    # ever constructed.
    factories = {
        'lasso': lambda: LassoCV(cv=5, random_state=0),
        'ridge': lambda: RidgeCV(alphas=np.logspace(-6, 6, 13)),
        'rf': lambda: RandomForestRegressor(n_estimators=100),
        'svr': lambda: SVR(kernel='rbf'),
        'xgb': lambda: xgb.XGBRegressor(objective="reg:linear", random_state=42, max_depth=3),
        'dice': lambda: RandomDice('1234'),
        'currank': lambda: CurRank(),
        'avgrank': lambda: AverageRank(),
    }

    make_model = factories.get(regressor)
    if make_model is None:
        return None
    return make_model()


class CurRank():
    """
    Baseline that always predicts 0 for every row
    (original note: "predict with current rank").
    """
    def __init__(self):
        # stateless: nothing to configure
        pass

    def fit(self, x, y):
        # nothing is learned from the training data
        pass

    def predict(self, test_x):
        # one zero per row of test_x
        n_rows = test_x.shape[0]
        return np.array([0] * n_rows)
    
class AverageRank():
    """
    Baseline that predicts the value already stored in one feature column.

    Parameters
    ----------
    column : int, default 13
        Position, within each row of the feature matrix passed to
        ``predict``, of the value to return as the prediction. The original
        code labels position 13 as ``change_in_rank_all``.

    NOTE(review): the position depends on the column layout of the feature
    matrix; confirm 13 still points at change_in_rank_all when the input file
    carries extra columns (e.g. stint_len).
    """
    def __init__(self, column=13):
        self.column = column

    def fit(self, x, y):
        # nothing is learned from the training data
        pass

    def predict(self, test_x):
        """Return, as a numpy array, the configured column of every row."""
        return np.array([row[self.column] for row in test_x])

class RandomDice():
    """
    Random baseline: predictions are sampled from the empirical distribution
    of the target values seen in ``fit``.

    Parameters
    ----------
    seed : default '1234'
        Passed to ``random.seed``. Note that this reseeds the module-level
        ``random`` generator, which ``predict`` then draws from.
    """
    def __init__(self, seed='1234'):
        self.dist = []   # cumulative probabilities, parallel to self.val
        self.val = []    # distinct target values
        random.seed(seed)

    def fit(self, x, y):
        """
        Build the cumulative distribution of the distinct values in ``y``.

        ``x`` is ignored. ``y`` must expose ``shape[0]`` and support
        element-wise ``==`` (e.g. a numpy array).
        """
        # Start from a clean table on every call. Without this a second fit
        # would only append after the first table, whose last entry is
        # already ~1.0, so the new data would never be sampled.
        self.dist = []
        self.val = []

        total = y.shape[0]
        ratio = 0.
        for val in set(y):
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)

    def predict(self, test_x):
        """
        Draw one sampled target value per row of ``test_x``.

        Raises IndexError if ``test_x`` is non-empty and ``fit`` has not
        recorded any values.
        """
        pred_y = []
        for _ in test_x:
            dice = random.random()
            # First bucket whose cumulative probability covers the draw.
            # Falling back to -1 (the last value) covers the case where
            # rounding leaves the final cumulative sum slightly below the draw.
            find_idx = next(
                (idx for idx, ratio in enumerate(self.dist) if dice <= ratio),
                -1,
            )
            pred_y.append(self.val[find_idx])

        return np.array(pred_y)

def evaluate(test_y, pred_y):
    """
    Print and return the error metrics of ``pred_y`` against ``test_y``.

    Returns the tuple (mae, rmse, r2). Note the printed line lists rmse
    first, while the returned tuple starts with mae.
    """
    mae = metrics.mean_absolute_error(test_y, pred_y)
    mse = metrics.mean_squared_error(test_y, pred_y)
    rmse = math.sqrt(mse)
    r2 = metrics.r2_score(test_y, pred_y)

    report = 'rmse=%.2f, mae=%.2f, r2=%.2f'%(rmse, mae, r2)
    print(report)
    return mae, rmse, r2
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    Hold out one event as the test set and train on every other event.

    Rows whose 'eventid' equals ``eventid`` form the test set; all rows with
    a different 'eventid' form the training set. After conversion to numpy,
    column 1 is taken as the target and columns 2 onward as the features.
    The target is used as stored (no conversion to end rank is applied).

    Returns (train, test, train_x, train_y, test_x, test_y).
    """
    in_train = stagedata['eventid'] != eventid
    in_test = stagedata['eventid'] == eventid

    train = stagedata.loc[in_train].to_numpy()
    test = stagedata.loc[in_test].to_numpy()

    target_col = 1
    feature_cols = slice(2, None)

    return (
        train,
        test,
        train[:, feature_cols],
        train[:, target_col],
        test[:, feature_cols],
        test[:, target_col],
    )


def split_by_stageid(stagedata, stageid):
    """
    Train on the early stages and test on the later ones.

    Rows with 'stageid' <= ``stageid`` form the training set; rows with
    'stageid' > ``stageid`` form the test set. After conversion to numpy,
    column 1 is taken as the target and columns 2 onward as the features.

    Returns (train, test, train_x, train_y, test_x, test_y).
    """
    in_train = stagedata['stageid'] <= stageid
    in_test = stagedata['stageid'] > stageid

    train = stagedata.loc[in_train].to_numpy()
    test = stagedata.loc[in_test].to_numpy()

    target_col = 1
    feature_cols = slice(2, None)

    return (
        train,
        test,
        train[:, feature_cols],
        train[:, target_col],
        test[:, feature_cols],
        test[:, target_col],
    )


def regressor_model(name='svr'):
    """
    Fit the named model on the current global training split and score it
    on the current global test split.

    Reads the module-level train_x, train_y, test_x and test_y. Returns
    whatever evaluate() returns for the integer-cast predictions.
    """
    print('[*] predict with %s model'%name)

    model = get_regressor(name)
    model.fit(train_x, train_y)

    # Predictions are cast to int before scoring; astype(int) truncates
    # toward zero rather than rounding.
    predictions = model.predict(test_x).astype(int)

    return evaluate(test_y, predictions)

In [4]:
def test_cv():
    """
    Leave-one-event-out evaluation of every model in ``regressors``.

    For each distinct 'eventid' in the global ``stagedata`` (in ascending
    order), that event is held out as the test set, all other events are used
    for training, and each model is fitted and scored via regressor_model().

    Side effects: rebinds the module-level train_x/train_y/test_x/test_y to
    the current split, prints progress, and writes
    'crossvalid_stagedata_regressor_<suffix>.csv'.

    Returns a DataFrame with one row per held-out event: run id, train/test
    sizes, the sign distribution of the test target, and one column per
    model. The per-model value is the first element returned by
    regressor_model(), i.e. the MAE (not the RMSE).
    """
    global train_x, train_y, test_x, test_y

    cols = ['runid','trainsize','testsize','testdistribution']
    cols.extend(regressors)
    print('cols:%s'%cols)

    years = ['2013','2014','2015','2016','2017','2018','2019']
    # eventid is used directly as an index into this list.
    # NOTE(review): assumes eventid values 0..6 map to 2013..2019 -- confirm.
    eventsname = [f'Indy500-{x}' for x in years]

    # One single-row frame per held-out event. Concatenating only these
    # (instead of seeding the result with an empty frame) keeps the result
    # dtypes independent of how pandas treats empty frames in concat.
    frames = []
    # sorted() makes the fold order explicit instead of relying on set order.
    for eventid in sorted(set(stagedata['eventid'])):
        print('Testset = %s'%eventsname[eventid])

        _, _, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
        test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

        rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
        # regressor_model() returns (mae, rmse, r2); keep the MAE.
        rec.extend([regressor_model(name)[0] for name in regressors])

        frames.append(pd.DataFrame([rec],columns=cols))

    # Every row keeps index 0, as before.
    retdf = pd.concat(frames) if frames else pd.DataFrame([],columns=cols)

    retdf.to_csv('crossvalid_stagedata_regressor_%s.csv'%suffix, float_format='%.3f')

    return retdf
    
def test_20182019():
    """
    Evaluate every model in ``regressors`` on Indy500-2018 and Indy500-2019
    as two separate held-out test events.

    For each of the two runs, the other of the two events is removed from the
    global ``stagedata`` first, so training uses only the remaining events.
    Each model is fitted and scored via regressor_model().

    Side effects: rebinds the module-level train_x/train_y/test_x/test_y to
    the current split, prints progress, and writes
    'stint_regressor_result_<suffix>.csv'.

    Returns a DataFrame with one row per test event: run id, train/test
    sizes, the sign distribution of the test target, and one column per
    model. The per-model value is the first element returned by
    regressor_model(), i.e. the MAE.
    """
    global train_x, train_y, test_x, test_y

    cols = ['runid','trainsize','testsize','testdistribution']
    cols.extend(regressors)
    print('cols:%s'%cols)

    years = ['2013','2014','2015','2016','2017','2018','2019']
    # NOTE(review): assumes eventid values 0..6 map to 2013..2019 -- confirm.
    eventsname = [f'Indy500-{x}' for x in years]
    events_id={key:idx for idx, key in enumerate(eventsname)}

    # (event to test on, event to drop from the data entirely)
    holdouts = [
        ('Indy500-2018', 'Indy500-2019'),
        ('Indy500-2019', 'Indy500-2018'),
    ]

    frames = []
    for test_event, ignored_event in holdouts:
        eventid = events_id[test_event]
        ignore_eventid = events_id[ignored_event]
        subset = stagedata[stagedata['eventid']!=ignore_eventid]

        print('Testset = %s'%eventsname[eventid])

        _, _, train_x, train_y, test_x, test_y = split_by_eventid(subset, eventid)
        test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

        rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
        # regressor_model() returns (mae, rmse, r2); keep the MAE.
        rec.extend([regressor_model(name)[0] for name in regressors])

        frames.append(pd.DataFrame([rec],columns=cols))

    # Concatenate only the populated single-row frames (every row keeps
    # index 0, as before); no empty seed frame is involved.
    retdf0 = pd.concat(frames)

    retdf0.to_csv(f'stint_regressor_result_{suffix}.csv', float_format='%.3f')

    return retdf0

### test oracle with stint_len

In [5]:
#load data
# The three settings below only select which pre-built stage CSV is read:
# they are encoded into the file name.
_trim = 0
_include_final = True
_include_stintlen = True
include_str = '1' if _include_final else '0'
stint_str = '1' if _include_stintlen else ''
# suffix is also embedded in the result-CSV names written by test_cv() and
# test_20182019().
suffix = f'indy500-2013-2019-end{include_str}{stint_str}-t{_trim}'
# NOTE: despite its name, output_file is the path this cell READS from.
output_file = f'stage-indy500-2013-2019-end{include_str}{stint_str}-t{_trim}.csv'


stagedata = pd.read_csv(output_file)
# replace missing values with 0 before modelling
stagedata.fillna(0, inplace=True)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522 entries, 0 to 1521
Data columns (total 36 columns):
Unnamed: 0                   1522 non-null int64
target                       1522 non-null int64
stint_len                    1522 non-null int64
eventid                      1522 non-null int64
car_number                   1522 non-null int64
stageid                      1522 non-null int64
firststage                   1522 non-null int64
pit_in_caution               1522 non-null int64
start_position               1522 non-null int64
start_rank                   1522 non-null int64
start_rank_ratio             1522 non-null float64
top_pack                     1522 non-null int64
bottom_pack                  1522 non-null int64
average_rank                 1522 non-null float64
average_rank_all             1522 non-null float64
change_in_rank               1522 non-null int64
change_in_rank_all           1522 non-null float64
rate_of_change               1522 non-null int64
rat

In [6]:
# Leave-one-event-out run on the currently loaded stagedata. The per-model
# columns of the returned table hold MAE (the first value evaluate() returns).
df_event_oracle = test_cv()

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2013
[*] predict with currank model
rmse=7.57, mae=5.44, r2=-0.01
[*] predict with avgrank model
rmse=13.41, mae=9.40, r2=-2.16
[*] predict with dice model
rmse=9.82, mae=7.40, r2=-0.70
[*] predict with lasso model
rmse=5.99, mae=4.62, r2=0.37
[*] predict with ridge model
rmse=5.98, mae=4.63, r2=0.37
[*] predict with rf model
rmse=5.94, mae=4.41, r2=0.38
[*] predict with svr model
rmse=7.20, mae=5.17, r2=0.09
[*] predict with xgb model
rmse=6.26, mae=4.76, r2=0.31
Testset = Indy500-2014
[*] predict with currank model
rmse=5.50, mae=3.49, r2=-0.00
[*] predict with avgrank model
rmse=8.70, mae=5.92, r2=-1.51
[*] predict with dice model
rmse=8.88, mae=6.23, r2=-1.62
[*] predict with lasso model
rmse=5.01, mae=3.61, r2=0.17
[*] predict with ridge model
rmse=4.84, mae=3.45, r2=0.22
[*] predict with rf model
rmse=5.22, mae=3.73, r2=0.10
[*] predict

In [7]:
df_event_oracle

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Indy500-2013,1327,195,"+:68,0:29,-:98",5.441026,9.4,7.4,4.620513,4.630769,4.405128,5.169231,4.758974
0,Indy500-2014,1297,225,"+:63,0:43,-:119",3.488889,5.92,6.231111,3.613333,3.453333,3.733333,3.315556,3.64
0,Indy500-2015,1327,195,"+:74,0:26,-:95",4.907692,8.641026,6.502564,4.030769,4.046154,4.107692,4.605128,4.276923
0,Indy500-2016,1268,254,"+:98,0:32,-:124",4.011811,6.933071,5.885827,3.476378,3.507874,3.409449,3.846457,3.480315
0,Indy500-2017,1274,248,"+:72,0:49,-:127",4.024194,7.052419,6.294355,3.512097,4.165323,3.71371,3.846774,3.741935
0,Indy500-2018,1333,189,"+:72,0:19,-:98",4.100529,7.206349,6.338624,3.391534,3.275132,3.227513,3.767196,3.402116
0,Indy500-2019,1306,216,"+:66,0:33,-:117",4.291667,7.930556,6.175926,4.175926,4.12963,4.180556,4.013889,4.111111


In [8]:
# 2018 and 2019 evaluated as separate held-out events on the currently loaded
# stagedata. The per-model columns of the returned table hold MAE.
retdf_oracle = test_20182019()

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
rmse=5.50, mae=4.10, r2=-0.01
[*] predict with avgrank model
rmse=9.38, mae=7.21, r2=-1.93
[*] predict with dice model
rmse=8.57, mae=6.56, r2=-1.44
[*] predict with lasso model
rmse=4.52, mae=3.31, r2=0.32
[*] predict with ridge model
rmse=4.75, mae=3.54, r2=0.25
[*] predict with rf model
rmse=4.55, mae=3.31, r2=0.31
[*] predict with svr model
rmse=5.19, mae=3.80, r2=0.10
[*] predict with xgb model
rmse=4.57, mae=3.31, r2=0.30
Testset = Indy500-2019
[*] predict with currank model
rmse=6.46, mae=4.29, r2=-0.01
[*] predict with avgrank model
rmse=11.07, mae=7.93, r2=-1.96
[*] predict with dice model
rmse=8.64, mae=6.20, r2=-0.80
[*] predict with lasso model
rmse=5.58, mae=4.20, r2=0.25
[*] predict with ridge model
rmse=5.72, mae=4.23, r2=0.21
[*] predict with rf model
rmse=5.66, mae=4.03, r2=0.23
[*] predict

In [9]:
### test without stint_len
#load data
# Same loading steps as the earlier cell, but with _include_stintlen = False,
# which changes both the file that is read and the suffix used for the
# result-CSV names.
_trim = 0
_include_final = True
_include_stintlen = False
include_str = '1' if _include_final else '0'
stint_str = '1' if _include_stintlen else ''
suffix = f'indy500-2013-2019-end{include_str}{stint_str}-t{_trim}'
# NOTE: despite its name, output_file is the path this cell READS from.
output_file = f'stage-indy500-2013-2019-end{include_str}{stint_str}-t{_trim}.csv'


# This rebinds the global stagedata used by test_cv() and test_20182019().
stagedata = pd.read_csv(output_file)
# replace missing values with 0 before modelling
stagedata.fillna(0, inplace=True)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522 entries, 0 to 1521
Data columns (total 36 columns):
Unnamed: 0                   1522 non-null int64
target                       1522 non-null int64
stint_len                    1522 non-null int64
eventid                      1522 non-null int64
car_number                   1522 non-null int64
stageid                      1522 non-null int64
firststage                   1522 non-null int64
pit_in_caution               1522 non-null int64
start_position               1522 non-null int64
start_rank                   1522 non-null int64
start_rank_ratio             1522 non-null float64
top_pack                     1522 non-null int64
bottom_pack                  1522 non-null int64
average_rank                 1522 non-null float64
average_rank_all             1522 non-null float64
change_in_rank               1522 non-null int64
change_in_rank_all           1522 non-null float64
rate_of_change               1522 non-null int64
rat

In [10]:
# Leave-one-event-out run on the currently loaded stagedata. The per-model
# columns of the returned table hold MAE (the first value evaluate() returns).
df_event = test_cv()

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2013
[*] predict with currank model
rmse=7.57, mae=5.44, r2=-0.01
[*] predict with avgrank model
rmse=13.41, mae=9.40, r2=-2.16
[*] predict with dice model
rmse=9.82, mae=7.40, r2=-0.70
[*] predict with lasso model
rmse=5.99, mae=4.62, r2=0.37
[*] predict with ridge model
rmse=5.98, mae=4.63, r2=0.37
[*] predict with rf model
rmse=5.98, mae=4.52, r2=0.37
[*] predict with svr model
rmse=7.20, mae=5.17, r2=0.09
[*] predict with xgb model
rmse=6.26, mae=4.76, r2=0.31
Testset = Indy500-2014
[*] predict with currank model
rmse=5.50, mae=3.49, r2=-0.00
[*] predict with avgrank model
rmse=8.70, mae=5.92, r2=-1.51
[*] predict with dice model
rmse=8.88, mae=6.23, r2=-1.62
[*] predict with lasso model
rmse=5.01, mae=3.61, r2=0.17
[*] predict with ridge model
rmse=4.84, mae=3.45, r2=0.22
[*] predict with rf model
rmse=5.21, mae=3.73, r2=0.10
[*] predict

In [11]:
df_event

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Indy500-2013,1327,195,"+:68,0:29,-:98",5.441026,9.4,7.4,4.620513,4.630769,4.523077,5.169231,4.758974
0,Indy500-2014,1297,225,"+:63,0:43,-:119",3.488889,5.92,6.231111,3.613333,3.453333,3.733333,3.315556,3.64
0,Indy500-2015,1327,195,"+:74,0:26,-:95",4.907692,8.641026,6.502564,4.030769,4.046154,3.969231,4.605128,4.276923
0,Indy500-2016,1268,254,"+:98,0:32,-:124",4.011811,6.933071,5.885827,3.476378,3.507874,3.374016,3.846457,3.480315
0,Indy500-2017,1274,248,"+:72,0:49,-:127",4.024194,7.052419,6.294355,3.512097,4.165323,3.794355,3.846774,3.741935
0,Indy500-2018,1333,189,"+:72,0:19,-:98",4.100529,7.206349,6.338624,3.391534,3.275132,3.21164,3.767196,3.402116
0,Indy500-2019,1306,216,"+:66,0:33,-:117",4.291667,7.930556,6.175926,4.175926,4.12963,4.032407,4.013889,4.111111


In [12]:
# 2018 and 2019 evaluated as separate held-out events on the currently loaded
# stagedata. The per-model columns of the returned table hold MAE.
retdf = test_20182019()

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
rmse=5.50, mae=4.10, r2=-0.01
[*] predict with avgrank model
rmse=9.38, mae=7.21, r2=-1.93
[*] predict with dice model
rmse=8.57, mae=6.56, r2=-1.44
[*] predict with lasso model
rmse=4.52, mae=3.31, r2=0.32
[*] predict with ridge model
rmse=4.75, mae=3.54, r2=0.25
[*] predict with rf model
rmse=4.53, mae=3.25, r2=0.32
[*] predict with svr model
rmse=5.19, mae=3.80, r2=0.10
[*] predict with xgb model
rmse=4.57, mae=3.31, r2=0.30
Testset = Indy500-2019
[*] predict with currank model
rmse=6.46, mae=4.29, r2=-0.01
[*] predict with avgrank model
rmse=11.07, mae=7.93, r2=-1.96
[*] predict with dice model
rmse=8.64, mae=6.20, r2=-0.80
[*] predict with lasso model
rmse=5.58, mae=4.20, r2=0.25
[*] predict with ridge model
rmse=5.72, mae=4.23, r2=0.21
[*] predict with rf model
rmse=5.63, mae=4.03, r2=0.24
[*] predict

In [13]:
retdf

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Indy500-2018,1117,189,"+:72,0:19,-:98",4.100529,7.206349,6.560847,3.306878,3.539683,3.253968,3.798942,3.306878
0,Indy500-2019,1117,216,"+:66,0:33,-:117",4.291667,7.930556,6.199074,4.199074,4.231481,4.027778,4.064815,4.324074


In [14]:
retdf_oracle

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Indy500-2018,1117,189,"+:72,0:19,-:98",4.100529,7.206349,6.560847,3.306878,3.539683,3.312169,3.798942,3.306878
0,Indy500-2019,1117,216,"+:66,0:33,-:117",4.291667,7.930556,6.199074,4.199074,4.231481,4.032407,4.064815,4.324074
