### stage_model_regressor

base: 14./stage_model_regressor_withneighbor-newfeatures

prediction models of chg_of_rank_in_stage on stage dataset

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math


In [2]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.linear_model.ridge import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.svm.classes import SVR
from sklearn.utils import shuffle
from sklearn import metrics
import xgboost as xgb



In [3]:
# bulid regression model
regressors = ['currank','avgrank','dice','lasso','ridge','rf','svr','xgb']
def get_regressor(regressor = 'lr'):
    if regressor == "lasso":
        clf = LassoCV(cv=5, random_state=0)
    elif regressor == "ridge":
        clf = RidgeCV(alphas=np.logspace(-6, 6, 13))
    elif regressor == "rf":
        clf = RandomForestRegressor(n_estimators=100)
    elif regressor == 'svr':
        clf = SVR(kernel='rbf')
    elif regressor == 'xgb':
        clf = xgb.XGBRegressor(objective="reg:linear", random_state=42, max_depth=3)
    elif regressor == 'dice':
        clf = RandomDice('1234')
    elif regressor == 'currank':
        clf = CurRank()
    elif regressor == 'avgrank':
        clf = AverageRank()        
    else:
        clf = None
        
    return clf


class CurRank():
    """
    predict with current rank
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = [0 for x in range(test_x.shape[0])]
        return np.array(pred_y)
    
class AverageRank():
    """
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    change_in_rank_all = test[:,15]
    pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in change_in_rank_all])
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            #13, change_in_rank_all
            pred_y.append(x[13])
        #pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in pred_y])
        pred_y_avg = pred_y
        return np.array(pred_y_avg)   

class RandomDice():
    """
    a random dice model
    """
    def __init__(self, seed='1234'):
        self.dist = []
        self.val = []
        random.seed(seed)
    
    def fit(self, x, y):
        total = y.shape[0]
        yval = set(y)
        
        ratio = 0.
        for val in yval:
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)
            
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            dice = random.random()
            #search in self.dist
            find_idx = -1
            for idx, ratio in enumerate(self.dist):
                if dice <= ratio:
                    find_idx = idx
                    break
            
            #or the last one match
            pred_y.append(self.val[find_idx])
            
        return np.array(pred_y)

def evaluate(test_y, pred_y):
    mae = metrics.mean_absolute_error(test_y, pred_y) 
    rmse = math.sqrt(metrics.mean_squared_error(test_y, pred_y))
    r2 = metrics.r2_score(test_y, pred_y)
    print('rmse=%.2f, mae=%.2f, r2=%.2f'%(rmse, mae, r2))
    return mae,rmse, r2
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    split by eventid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['eventid'] != eventid].to_numpy()
    test  = stagedata[stagedata['eventid'] == eventid].to_numpy()

    #2:car_number
    train_x = train[:,2:]
    #train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    train_y = train[:,1]
    test_x = test[:,2:]
    #test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    test_y = test[:,1]
    
    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    split by stageid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['stageid'] <= stageid].to_numpy()
    test  = stagedata[stagedata['stageid'] > stageid].to_numpy()

    train_x = train[:,2:]
    #train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    train_y = train[:,1]
    test_x = test[:,2:]
    #test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    test_y = test[:,1]
    
    return train, test, train_x, train_y, test_x, test_y


def regressor_model(name='svr'):
    ### test learning models
    print('[*] predict with %s model'%name)
    clf = get_regressor(name)
    clf.fit(train_x, train_y)

    pred_y = clf.predict(test_x)
    score = evaluate(test_y, pred_y)
    return score

In [7]:
#load data
_trim = 2
_include_final = False
include_str = '1' if _include_final else '0'
suffix = f'indy500-2013-2019-end{include_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}.csv'
stagedata = pd.read_csv(output_file)
stagedata.fillna(0, inplace=True)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 35 columns):
Unnamed: 0                   1313 non-null int64
target                       1313 non-null int64
eventid                      1313 non-null int64
car_number                   1313 non-null int64
stageid                      1313 non-null int64
firststage                   1313 non-null int64
pit_in_caution               1313 non-null int64
start_position               1313 non-null int64
start_rank                   1313 non-null int64
start_rank_ratio             1313 non-null float64
top_pack                     1313 non-null int64
bottom_pack                  1313 non-null int64
average_rank                 1313 non-null float64
average_rank_all             1313 non-null float64
change_in_rank               1313 non-null int64
change_in_rank_all           1313 non-null float64
rate_of_change               1313 non-null int64
rate_of_change_all           1313 non-null float64
l

### model on data split by event

In [8]:
cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(regressors)
print('cols:%s'%cols)
retdf0 = pd.DataFrame([],columns=cols)
retdf1 = pd.DataFrame([],columns=cols)

events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(eventsname)}
for eventid in events:
    print('Testset = %s'%eventsname[eventid])
    
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
    #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
    #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))
    
    #record
    rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    
    acc0 = [0 for x in range(len(regressors))]
    acc1 = [0 for x in range(len(regressors))]
    for idx, clf in enumerate(regressors):
        acc = regressor_model(clf)
        acc0[idx] = acc[0]
        acc1[idx] = acc[2]

    rec0.extend(acc0)
    rec1.extend(acc1)
    #print('rec:%s'%rec)
    
    #new df
    df = pd.DataFrame([rec0],columns=cols)
    retdf0 = pd.concat([retdf0, df])        
    
    df = pd.DataFrame([rec1],columns=cols)
    retdf1 = pd.concat([retdf1, df])        

    
retdf0.to_csv('regressors_stagedata_splitbyevent%s_rmse.csv'%suffix)
retdf1.to_csv('regressors_stagedata_splitbyevent%s_r2.csv'%suffix)

df_event_rmse = retdf0
df_event_r2 = retdf1

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2013
[*] predict with currank model
rmse=7.49, mae=5.02, r2=-0.00
[*] predict with avgrank model
rmse=10.03, mae=6.59, r2=-0.80
[*] predict with dice model
rmse=10.11, mae=6.84, r2=-0.83
[*] predict with lasso model
rmse=6.31, mae=4.81, r2=0.29
[*] predict with ridge model
rmse=6.66, mae=5.05, r2=0.21
[*] predict with rf model
rmse=6.28, mae=4.82, r2=0.29
[*] predict with svr model
rmse=7.08, mae=4.85, r2=0.10
[*] predict with xgb model
rmse=6.54, mae=4.92, r2=0.23
Testset = Indy500-2014
[*] predict with currank model
rmse=4.41, mae=2.81, r2=-0.01
[*] predict with avgrank model
rmse=5.55, mae=3.82, r2=-0.59
[*] predict with dice model
rmse=8.28, mae=5.45, r2=-2.54
[*] predict with lasso model
rmse=4.29, mae=3.08, r2=0.05
[*] predict with ridge model
rmse=4.38, mae=3.17, r2=0.01
[*] predict with rf model
rmse=4.92, mae=3.69, r2=-0.25
[*] predi

In [9]:
df_event_rmse

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Indy500-2013,1147,166,"+:56,0:29,-:81",5.018072,6.585822,6.837349,4.812384,5.051718,4.818554,4.852957,4.915279
0,Indy500-2014,1118,195,"+:58,0:29,-:108",2.805128,3.815659,5.451282,3.075516,3.173665,3.687744,2.73908,4.098577
0,Indy500-2015,1146,167,"+:63,0:26,-:78",3.862275,5.005168,6.161677,3.671969,3.791131,3.838743,3.691637,4.157183
0,Indy500-2016,1087,226,"+:79,0:36,-:111",4.438053,5.405334,6.535398,4.161169,4.353652,4.073938,4.248965,4.18107
0,Indy500-2017,1098,215,"+:64,0:59,-:92",3.265116,4.078729,5.874419,3.780368,3.767061,3.65586,3.387893,3.741042
0,Indy500-2018,1154,159,"+:64,0:25,-:70",4.477987,6.134471,6.163522,4.660953,4.595188,5.243145,4.44905,4.968542
0,Indy500-2019,1128,185,"+:60,0:40,-:85",4.362162,5.786136,6.556757,4.908849,4.896917,4.780054,4.355242,5.260124


In [12]:
### train 2013-2017
#load data
_trim = 2
_include_final = False
include_str = '1' if _include_final else '0'
suffix = f'indy500-2013-2019-end{include_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}.csv'
stagedata = pd.read_csv(output_file)

stagedata.fillna(0, inplace=True)

cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(regressors)
print('cols:%s'%cols)
retdf0 = pd.DataFrame([],columns=cols)
retdf1 = pd.DataFrame([],columns=cols)

events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(eventsname)}

#first 
eventid = events_id['Indy500-2018']
ignore_eventid = events_id['Indy500-2019']
stdata_2018 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

train, test, train_x, train_y, test_x, test_y = split_by_eventid(stdata_2018, eventid)
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
#print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
#      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

#record
rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

acc0 = [0 for x in range(len(regressors))]
acc1 = [0 for x in range(len(regressors))]
for idx, clf in enumerate(regressors):
    acc = regressor_model(clf)
    acc0[idx] = acc[0]
    acc1[idx] = acc[2]

rec0.extend(acc0)
rec1.extend(acc1)
#print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec0],columns=cols)
retdf0 = pd.concat([retdf0, df])        


#second 
eventid = events_id['Indy500-2019']
ignore_eventid = events_id['Indy500-2018']
stdata_2019 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

train, test, train_x, train_y, test_x, test_y = split_by_eventid(stdata_2019, eventid)
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
#print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
#      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

#record
rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

acc0 = [0 for x in range(len(regressors))]
acc1 = [0 for x in range(len(regressors))]
for idx, clf in enumerate(regressors):
    acc = regressor_model(clf)
    acc0[idx] = acc[0]
    acc1[idx] = acc[2]

rec0.extend(acc0)
rec1.extend(acc1)
#print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec0],columns=cols)
retdf0 = pd.concat([retdf0, df])    

retdf0

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
rmse=6.84, mae=4.48, r2=-0.00
[*] predict with avgrank model
rmse=8.93, mae=6.13, r2=-0.71
[*] predict with dice model
rmse=8.46, mae=5.98, r2=-0.54
[*] predict with lasso model
rmse=6.00, mae=4.58, r2=0.23
[*] predict with ridge model
rmse=6.28, mae=4.80, r2=0.15
[*] predict with rf model
rmse=7.40, mae=5.76, r2=-0.18
[*] predict with svr model
rmse=6.51, mae=4.42, r2=0.09
[*] predict with xgb model
rmse=6.89, mae=5.29, r2=-0.02
Testset = Indy500-2019
[*] predict with currank model
rmse=7.02, mae=4.36, r2=-0.00
[*] predict with avgrank model
rmse=8.72, mae=5.79, r2=-0.54
[*] predict with dice model
rmse=8.84, mae=6.19, r2=-0.59
[*] predict with lasso model
rmse=6.60, mae=4.86, r2=0.12
[*] predict with ridge model
rmse=7.20, mae=5.43, r2=-0.05
[*] predict with rf model
rmse=6.85, mae=5.08, r2=0.05
[*] predi

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Indy500-2018,969,159,"+:64,0:25,-:70",4.477987,6.134471,5.981132,4.581084,4.800066,5.758113,4.416857,5.292516
0,Indy500-2019,969,185,"+:60,0:40,-:85",4.362162,5.786136,6.194595,4.864314,5.433956,5.083568,4.31833,5.517444


In [13]:
retdf0.to_csv('stint_regressor_result_t2013-2017.csv', float_format='%.3f')