### stage_model_regressor

predicting target: end_rank = start_rank + change(old target)

base: 14./stage_model_regressor_withneighbor-newfeatures

Regression models that predict chg_of_rank_in_stage (the change of rank within a stage) on the stage dataset.

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math


In [2]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.linear_model.ridge import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.svm.classes import SVR
from sklearn.utils import shuffle
from sklearn import metrics
import xgboost as xgb



In [3]:
# Build regression models.
# `regressors` lists every model name evaluated by the cells below; each entry
# is a key understood by get_regressor().
regressors = ['currank','avgrank','dice','lasso','ridge','rf','svr','xgb']
def get_regressor(regressor = 'lr'):
    """
    Create a fresh, unfitted model for the given name.

    Known names are 'lasso', 'ridge', 'rf', 'svr', 'xgb', 'dice', 'currank'
    and 'avgrank'.  Any other name -- including the default 'lr' -- yields
    None, so callers must be prepared for a missing model.
    """
    # Ordered (name, factory) pairs.  Factories are lambdas so that a model is
    # only constructed for the entry that actually matches.
    factories = (
        ("lasso", lambda: LassoCV(cv=5, random_state=0)),
        ("ridge", lambda: RidgeCV(alphas=np.logspace(-6, 6, 13))),
        ("rf", lambda: RandomForestRegressor(n_estimators=100)),
        ('svr', lambda: SVR(kernel='rbf')),
        ('xgb', lambda: xgb.XGBRegressor(objective="reg:linear", random_state=42, max_depth=3)),
        ('dice', lambda: RandomDice('1234')),
        ('currank', lambda: CurRank()),
        ('avgrank', lambda: AverageRank()),
    )
    for name, build in factories:
        if regressor == name:
            return build()

    # unknown model name
    return None


class CurRank():
    """
    Baseline that always predicts 0, i.e. "the rank does not change".
    """
    def __init__(self):
        """Nothing to configure."""
        return None
    def fit(self, x, y):
        """No-op: this baseline learns nothing from the training data."""
        return None
    def predict(self, test_x):
        """Return an array holding one 0 per row of test_x (uses test_x.shape[0])."""
        n_rows = test_x.shape[0]
        zeros = [0] * n_rows
        return np.array(zeros)
    
class AverageRank():
    """
    Baseline that predicts, for every row, the value stored at position 13
    of that row.

    The original inline note labels position 13 as change_in_rank_all
    (index 15 of the full record, shifted by the two leading columns that the
    split helpers drop).  The raw value is returned as-is; it is not reduced
    to its sign.
    """
    def __init__(self):
        """Nothing to configure."""
        return None
    def fit(self, x, y):
        """No-op: this baseline learns nothing from the training data."""
        return None
    def predict(self, test_x):
        """Return an array with element 13 of each row of test_x."""
        # 13: change_in_rank_all
        return np.array([row[13] for row in test_x])

class RandomDice():
    """
    Random baseline that draws each prediction from the empirical
    distribution of the training targets.

    fit() records every distinct target value together with the cumulative
    share of samples seen so far; predict() throws a uniform "dice" in [0, 1)
    for each test row and returns the first value whose cumulative share
    covers the throw.
    """
    def __init__(self, seed='1234'):
        """
        seed: value handed to random.seed().  Note that this seeds the
        module-level `random` generator, which is shared process-wide.
        """
        # cumulative probabilities, aligned index-for-index with self.val
        self.dist = []
        # distinct target values seen by fit()
        self.val = []
        random.seed(seed)

    def fit(self, x, y):
        """
        Learn the target distribution from y (x is ignored).

        y must expose .shape[0] and support elementwise `y == value`
        (e.g. a 1-d numpy array).
        """
        # Start from a clean slate: without this reset a second fit() would
        # append to the old tables, and because the old cumulative list
        # already ends at 1.0 the new data would never be sampled.
        self.val = []
        self.dist = []

        total = y.shape[0]
        ratio = 0.
        for val in set(y):
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)

    def predict(self, test_x):
        """
        Return one sampled target value per row of test_x.

        Raises IndexError if called with rows before fit() has seen any data.
        """
        pred_y = []
        for _ in test_x:
            dice = random.random()
            # First bucket whose cumulative share covers the throw; fall back
            # to the last value (-1) when rounding leaves the final cumulative
            # share slightly below the throw.
            find_idx = next(
                (idx for idx, ratio in enumerate(self.dist) if dice <= ratio),
                -1,
            )
            pred_y.append(self.val[find_idx])

        return np.array(pred_y)

def evaluate(test_y, pred_y):
    """
    Score predictions against the ground truth.

    Prints RMSE, MAE and R2 with two decimals and returns them as the tuple
    (mae, rmse, r2) -- note that the tuple order differs from the print order.
    """
    mae = metrics.mean_absolute_error(test_y, pred_y)
    mse = metrics.mean_squared_error(test_y, pred_y)
    rmse = math.sqrt(mse)
    r2 = metrics.r2_score(test_y, pred_y)
    scores = (mae, rmse, r2)
    print('rmse=%.2f, mae=%.2f, r2=%.2f'%(rmse, mae, r2))
    return scores
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    Hold out one event: rows whose 'eventid' equals `eventid` form the test
    set, all other rows form the training set.

    Returns (train, test, train_x, train_y, test_x, test_y) as numpy arrays,
    where *_y is column 1 of the record and *_x is every column from index 2
    onwards.
    """
    event_col = stagedata['eventid']
    train = stagedata[event_col != eventid].to_numpy()
    test = stagedata[event_col == eventid].to_numpy()

    # Column 1 is the target; the feature matrix starts at column 2.
    # NOTE(review): the original comment called column 2 car_number, but with
    # the leading index column listed in the header comment it would be
    # eventid -- confirm against the CSV.
    train_y, train_x = train[:, 1], train[:, 2:]
    test_y, test_x = test[:, 1], test[:, 2:]

    # The target is left as the rank change; it is not converted to an end
    # rank (start_rank, column 8, is not added).
    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    Split on the stage number: rows with 'stageid' <= `stageid` form the
    training set, rows with a larger 'stageid' form the test set.

    Returns (train, test, train_x, train_y, test_x, test_y) as numpy arrays,
    where *_y is column 1 of the record and *_x is every column from index 2
    onwards.
    """
    stage_col = stagedata['stageid']
    train = stagedata[stage_col <= stageid].to_numpy()
    test = stagedata[stage_col > stageid].to_numpy()

    # Column 1 is the target; the feature matrix starts at column 2.
    train_y, train_x = train[:, 1], train[:, 2:]
    test_y, test_x = test[:, 1], test[:, 2:]

    return train, test, train_x, train_y, test_x, test_y


def regressor_model(name='svr'):
    """
    Fit the model called `name` and score it on the current split.

    Reads train_x, train_y, test_x and test_y from the enclosing (notebook
    global) scope, so a split must have been assigned to those names first.
    Returns the (mae, rmse, r2) tuple produced by evaluate().
    """
    print('[*] predict with %s model'%name)
    model = get_regressor(name)
    model.fit(train_x, train_y)

    # Predictions are cast to int before scoring; astype(int) truncates
    # toward zero rather than rounding.
    predictions = model.predict(test_x).astype(int)

    return evaluate(test_y, predictions)

In [4]:
# Load the stage dataset into the notebook-global `stagedata` frame.
# _trim and _include_final only select which CSV file name is built below.
_trim = 0
_include_final = False
include_str = '1' if _include_final else '0'
# `suffix` is only referenced by the commented-out to_csv calls further down.
suffix = f'indy500-2013-2019-end{include_str}-t{_trim}'
# Despite its name, `output_file` is the *input* CSV read by this cell
# (presumably the output of an earlier feature-extraction step).
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}-noneighbor.csv'
stagedata = pd.read_csv(output_file)
# Missing values are replaced by 0 in place before any model sees the data.
stagedata.fillna(0, inplace=True)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 35 columns):
Unnamed: 0                   1313 non-null int64
target                       1313 non-null int64
eventid                      1313 non-null int64
car_number                   1313 non-null int64
stageid                      1313 non-null int64
firststage                   1313 non-null int64
pit_in_caution               1313 non-null int64
start_position               1313 non-null int64
start_rank                   1313 non-null int64
start_rank_ratio             1313 non-null float64
top_pack                     1313 non-null int64
bottom_pack                  1313 non-null int64
average_rank                 1313 non-null float64
average_rank_all             1313 non-null float64
change_in_rank               1313 non-null int64
change_in_rank_all           1313 non-null float64
rate_of_change               1313 non-null int64
rate_of_change_all           1313 non-null float64
l

### model on data split by event

In [5]:
# Leave-one-event-out evaluation: every event id found in stagedata is held
# out once as the test set while each model in `regressors` is fitted on the
# remaining events.
cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(regressors)
print('cols:%s'%cols)
# evaluate() returns (mae, rmse, r2): retdf0 collects acc[0] (MAE) and
# retdf1 collects acc[2] (R2), one row per held-out event.
retdf0 = pd.DataFrame([],columns=cols)
retdf1 = pd.DataFrame([],columns=cols)

events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(eventsname)}
for eventid in events:
    # eventid is used directly as an index into eventsname, so the ids in the
    # data are assumed to be 0..6 in year order -- TODO confirm.
    print('Testset = %s'%eventsname[eventid])
    
    # These names are notebook globals on purpose: regressor_model() reads
    # train_x, train_y, test_x and test_y from the global scope.
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
    # Counts of positive / zero / negative targets in the test set.
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
    #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
    #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))
    
    #record: the two rows share the same leading metadata columns
    rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    
    # One score slot per regressor, in the same order as `regressors`.
    acc0 = [0 for x in range(len(regressors))]
    acc1 = [0 for x in range(len(regressors))]
    for idx, clf in enumerate(regressors):
        acc = regressor_model(clf)
        acc0[idx] = acc[0]  # mae
        acc1[idx] = acc[2]  # r2

    rec0.extend(acc0)
    rec1.extend(acc1)
    #print('rec:%s'%rec)
    
    #new df: append this event's row to each result table
    df = pd.DataFrame([rec0],columns=cols)
    retdf0 = pd.concat([retdf0, df])
    
    df = pd.DataFrame([rec1],columns=cols)
    retdf1 = pd.concat([retdf1, df])

    
#retdf0.to_csv('regressors_stagedata_splitbyevent%s_rmse.csv'%suffix)
#retdf1.to_csv('regressors_stagedata_splitbyevent%s_r2.csv'%suffix)

# NOTE(review): retdf0 is filled from acc[0], which is the MAE, so the name
# df_event_rmse (and the '_rmse' file name above) is misleading; the RMSE
# (acc[1]) is never stored.
df_event_rmse = retdf0
df_event_r2 = retdf1

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2013
[*] predict with currank model
rmse=7.83, mae=5.60, r2=-0.01
[*] predict with avgrank model
rmse=10.80, mae=7.33, r2=-0.91
[*] predict with dice model
rmse=10.41, mae=7.63, r2=-0.77
[*] predict with lasso model
rmse=6.56, mae=5.07, r2=0.30
[*] predict with ridge model
rmse=6.57, mae=5.05, r2=0.29
[*] predict with rf model
rmse=6.55, mae=5.10, r2=0.30
[*] predict with svr model
rmse=7.55, mae=5.42, r2=0.07
[*] predict with xgb model
rmse=6.88, mae=5.27, r2=0.23
Testset = Indy500-2014
[*] predict with currank model
rmse=5.43, mae=3.55, r2=-0.01
[*] predict with avgrank model
rmse=7.72, mae=5.10, r2=-1.03
[*] predict with dice model
rmse=8.91, mae=6.12, r2=-1.71
[*] predict with lasso model
rmse=4.88, mae=3.59, r2=0.19
[*] predict with ridge model
rmse=4.96, mae=3.67, r2=0.16
[*] predict with rf model
rmse=5.23, mae=3.89, r2=0.07
[*] predic

In [6]:
# Display the per-event result table.
# NOTE(review): despite the name, this frame is filled from acc[0] of
# evaluate()'s (mae, rmse, r2) tuple, i.e. the values shown are MAE.
df_event_rmse

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Indy500-2013,1147,166,"+:59,0:27,-:80",5.596386,7.331325,7.626506,5.072289,5.054217,5.096386,5.421687,5.26506
0,Indy500-2014,1118,195,"+:57,0:33,-:105",3.553846,5.102564,6.117949,3.594872,3.671795,3.887179,3.435897,4.015385
0,Indy500-2015,1146,167,"+:67,0:21,-:79",5.149701,6.461078,7.035928,4.628743,4.628743,4.622754,5.011976,4.760479
0,Indy500-2016,1087,226,"+:87,0:27,-:112",4.070796,5.336283,6.358407,3.867257,3.862832,3.809735,3.982301,3.858407
0,Indy500-2017,1098,215,"+:66,0:44,-:105",3.846512,4.995349,6.237209,3.376744,3.548837,3.734884,3.613953,3.827907
0,Indy500-2018,1154,159,"+:64,0:16,-:79",4.163522,5.72956,6.056604,3.647799,3.522013,3.761006,3.987421,3.993711
0,Indy500-2019,1128,185,"+:58,0:28,-:99",4.643243,6.816216,7.005405,4.394595,4.405405,4.681081,4.513514,4.810811


In [9]:
### train 2013-2017
# Two hold-out runs that each drop one of 2018/2019 entirely and test on the
# other, so that (given the events_id mapping below) only 2013-2017 remain
# for training.
#load data (same file and settings as the earlier load cell)
_trim = 0
_include_final = False
include_str = '1' if _include_final else '0'
suffix = f'indy500-2013-2019-end{include_str}-t{_trim}'
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}-noneighbor.csv'
stagedata = pd.read_csv(output_file)

stagedata.fillna(0, inplace=True)

cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(regressors)
print('cols:%s'%cols)
# retdf0 receives one row of acc[0] (MAE) per run.
# NOTE(review): retdf1 is created but never appended to in this cell.
retdf0 = pd.DataFrame([],columns=cols)
retdf1 = pd.DataFrame([],columns=cols)

events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
# Maps an event name to its position in eventsname; assumed to match the
# eventid values stored in the data -- TODO confirm.
events_id={key:idx for idx, key in enumerate(eventsname)}

#first run: test on 2018, with 2019 removed from the data altogether
eventid = events_id['Indy500-2018']
ignore_eventid = events_id['Indy500-2019']
stdata_2018 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

# Notebook globals on purpose: regressor_model() reads train_x, train_y,
# test_x and test_y from the global scope.
train, test, train_x, train_y, test_x, test_y = split_by_eventid(stdata_2018, eventid)
# Counts of positive / zero / negative targets in the test set.
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
#print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
#      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

#record
rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

# One score slot per regressor, in the same order as `regressors`.
acc0 = [0 for x in range(len(regressors))]
acc1 = [0 for x in range(len(regressors))]
for idx, clf in enumerate(regressors):
    acc = regressor_model(clf)
    acc0[idx] = acc[0]  # mae
    acc1[idx] = acc[2]  # r2

rec0.extend(acc0)
# rec1 (the R2 row) is built but not stored anywhere in this cell.
rec1.extend(acc1)
#print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec0],columns=cols)
retdf0 = pd.concat([retdf0, df])


#second run: test on 2019, with 2018 removed from the data altogether
eventid = events_id['Indy500-2019']
ignore_eventid = events_id['Indy500-2018']
stdata_2019 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

train, test, train_x, train_y, test_x, test_y = split_by_eventid(stdata_2019, eventid)
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
#print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
#      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

#record
rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

acc0 = [0 for x in range(len(regressors))]
acc1 = [0 for x in range(len(regressors))]
for idx, clf in enumerate(regressors):
    acc = regressor_model(clf)
    acc0[idx] = acc[0]  # mae
    acc1[idx] = acc[2]  # r2

rec0.extend(acc0)
rec1.extend(acc1)
#print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec0],columns=cols)
retdf0 = pd.concat([retdf0, df])

# Display the two-row MAE table.
retdf0

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
rmse=5.60, mae=4.16, r2=-0.00
[*] predict with avgrank model
rmse=7.64, mae=5.73, r2=-0.86
[*] predict with dice model
rmse=8.18, mae=6.14, r2=-1.13
[*] predict with lasso model
rmse=4.93, mae=3.61, r2=0.23
[*] predict with ridge model
rmse=4.76, mae=3.48, r2=0.28
[*] predict with rf model
rmse=5.03, mae=3.79, r2=0.19
[*] predict with svr model
rmse=5.45, mae=4.03, r2=0.05
[*] predict with xgb model
rmse=5.27, mae=3.91, r2=0.11
Testset = Indy500-2019
[*] predict with currank model
rmse=6.88, mae=4.64, r2=-0.01
[*] predict with avgrank model
rmse=9.56, mae=6.82, r2=-0.94
[*] predict with dice model
rmse=9.52, mae=7.07, r2=-0.93
[*] predict with lasso model
rmse=6.08, mae=4.37, r2=0.22
[*] predict with ridge model
rmse=6.07, mae=4.45, r2=0.22
[*] predict with rf model
rmse=6.32, mae=4.56, r2=0.15
[*] predict 

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Indy500-2018,969,159,"+:64,0:16,-:79",4.163522,5.72956,6.144654,3.610063,3.484277,3.786164,4.025157,3.91195
0,Indy500-2019,969,185,"+:58,0:28,-:99",4.643243,6.816216,7.07027,4.372973,4.454054,4.556757,4.502703,4.908108


In [8]:
# Write the current retdf0 table to CSV with floats formatted to 3 decimals.
# NOTE(review): retdf0 is whatever the most recently executed cell left in it
# (presumably the 2018/2019 hold-out table); its score columns come from
# acc[0] of evaluate(), i.e. MAE.
retdf0.to_csv(f'stint_regressor_result_t2013-2017_t{_trim}_intchange.csv', float_format='%.3f')