### stage_model_regressor

prediction models of chg_of_rank_in_stage on stage dataset

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math

# to use only one GPU.
# use this on r-001
# otherwise comment
import os
os.environ["CUDA_VISIBLE_DEVICES"]="7"

In [2]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.linear_model.ridge import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.svm.classes import SVR
from sklearn.utils import shuffle
from sklearn import metrics
import xgboost as xgb

In [3]:
# bulid regression model
regressors = ['currank','avgrank','dice','lasso','ridge','rf','svr','xgb']
def get_regressor(regressor = 'lr'):
    if regressor == "lasso":
        clf = LassoCV(cv=5, random_state=0)
    elif regressor == "ridge":
        clf = RidgeCV(alphas=np.logspace(-6, 6, 13))
    elif regressor == "rf":
        clf = RandomForestRegressor(n_estimators=100)
    elif regressor == 'svr':
        clf = SVR(kernel='rbf')
    elif regressor == 'xgb':
        clf = xgb.XGBRegressor(objective="reg:linear", random_state=42, max_depth=3)
    elif regressor == 'dice':
        clf = RandomDice('1234')
    elif regressor == 'currank':
        clf = CurRank()
    elif regressor == 'avgrank':
        clf = AverageRank()        
    else:
        clf = None
        
    return clf


class CurRank():
    """
    predict with current rank
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = [0 for x in range(test_x.shape[0])]
        return np.array(pred_y)
    
class AverageRank():
    """
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    change_in_rank_all = test[:,15]
    pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in change_in_rank_all])
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            #13, change_in_rank_all
            pred_y.append(x[13])
        #pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in pred_y])
        pred_y_avg = pred_y
        return np.array(pred_y_avg)   

class RandomDice():
    """
    a random dice model
    """
    def __init__(self, seed='1234'):
        self.dist = []
        self.val = []
        random.seed(seed)
    
    def fit(self, x, y):
        total = y.shape[0]
        yval = set(y)
        
        ratio = 0.
        for val in yval:
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)
            
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            dice = random.random()
            #search in self.dist
            find_idx = -1
            for idx, ratio in enumerate(self.dist):
                if dice <= ratio:
                    find_idx = idx
                    break
            
            #or the last one match
            pred_y.append(self.val[find_idx])
            
        return np.array(pred_y)

def evaluate(test_y, pred_y):
    mae = metrics.mean_absolute_error(test_y, pred_y) 
    rmse = math.sqrt(metrics.mean_squared_error(test_y, pred_y))
    r2 = metrics.r2_score(test_y, pred_y)
    print('rmse=%.2f, mae=%.2f, r2=%.2f'%(rmse, mae, r2))
    return rmse, mae, r2
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    split by eventid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['eventid'] != eventid].to_numpy()
    test  = stagedata[stagedata['eventid'] == eventid].to_numpy()

    #2:car_number
    train_x = train[:,2:]
    #train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    train_y = train[:,1]
    test_x = test[:,2:]
    #test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    test_y = test[:,1]
    
    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    split by stageid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['stageid'] <= stageid].to_numpy()
    test  = stagedata[stagedata['stageid'] > stageid].to_numpy()

    train_x = train[:,2:]
    #train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    train_y = train[:,1]
    test_x = test[:,2:]
    #test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    test_y = test[:,1]
    
    return train, test, train_x, train_y, test_x, test_y


def regressor_model(name='svr'):
    ### test learning models
    print('[*] predict with %s model'%name)
    clf = get_regressor(name)
    clf.fit(train_x, train_y)

    pred_y = clf.predict(test_x)
    score = evaluate(test_y, pred_y)
    return score

In [4]:
#load data
stagedata = pd.read_csv('stage-2018.csv')
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 805 entries, 0 to 804
Data columns (total 18 columns):
Unnamed: 0            805 non-null int64
target                805 non-null int64
eventid               805 non-null int64
car_number            805 non-null int64
stageid               805 non-null int64
firststage            805 non-null int64
pit_in_caution        805 non-null int64
start_position        805 non-null int64
start_rank            805 non-null int64
start_rank_ratio      805 non-null float64
top_pack              805 non-null int64
bottom_pack           805 non-null int64
average_rank          805 non-null float64
average_rank_all      805 non-null float64
change_in_rank        805 non-null int64
change_in_rank_all    805 non-null float64
rate_of_change        805 non-null int64
rate_of_change_all    805 non-null float64
dtypes: float64(5), int64(13)
memory usage: 113.3 KB


### model on data split by event

In [5]:
cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(regressors)
print('cols:%s'%cols)
retdf0 = pd.DataFrame([],columns=cols)
retdf1 = pd.DataFrame([],columns=cols)

eventsname = ['Phoenix','Indy500','Texas','Iowa','Pocono','Gateway']
events = set(stagedata['eventid'])
for eventid in events:
    print('Testset = %s'%eventsname[eventid])
    
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
    #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
    #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))
    
    #record
    rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    
    acc0 = [0 for x in range(len(regressors))]
    acc1 = [0 for x in range(len(regressors))]
    for idx, clf in enumerate(regressors):
        acc0[idx] = regressor_model(clf)[0]
        acc1[idx] = regressor_model(clf)[2]

    rec0.extend(acc0)
    rec1.extend(acc1)
    #print('rec:%s'%rec)
    
    #new df
    df = pd.DataFrame([rec0],columns=cols)
    retdf0 = pd.concat([retdf0, df])        
    
    df = pd.DataFrame([rec1],columns=cols)
    retdf1 = pd.concat([retdf1, df])        

    
retdf0.to_csv('regressors_stagedata_splitbyevent_rmse.csv')
retdf1.to_csv('regressors_stagedata_splitbyevent_r2.csv')

df_event_rmse = retdf0
df_event_r2 = retdf1

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Phoenix
[*] predict with currank model
rmse=4.73, mae=3.20, r2=-0.00
[*] predict with currank model
rmse=4.73, mae=3.20, r2=-0.00
[*] predict with avgrank model
rmse=5.61, mae=3.83, r2=-0.40
[*] predict with avgrank model
rmse=5.61, mae=3.83, r2=-0.40
[*] predict with dice model
rmse=7.01, mae=4.88, r2=-1.19
[*] predict with dice model
rmse=7.01, mae=4.88, r2=-1.19
[*] predict with lasso model
rmse=4.45, mae=2.89, r2=0.12
[*] predict with lasso model
rmse=4.45, mae=2.89, r2=0.12
[*] predict with ridge model
rmse=4.42, mae=2.86, r2=0.13
[*] predict with ridge model
rmse=4.42, mae=2.86, r2=0.13
[*] predict with rf model
rmse=4.54, mae=3.20, r2=0.08
[*] predict with rf model
rmse=4.61, mae=3.20, r2=0.05
[*] predict with svr model
rmse=4.67, mae=3.12, r2=0.03
[*] predict with svr model
rmse=4.67, mae=3.12, r2=0.03
[*] predict with xgb model
rmse=4.29, ma



rmse=4.29, mae=2.99, r2=0.18
Testset = Indy500
[*] predict with currank model
rmse=6.17, mae=3.90, r2=-0.00
[*] predict with currank model
rmse=6.17, mae=3.90, r2=-0.00
[*] predict with avgrank model
rmse=6.94, mae=4.75, r2=-0.27
[*] predict with avgrank model
rmse=6.94, mae=4.75, r2=-0.27
[*] predict with dice model
rmse=7.55, mae=5.28, r2=-0.50
[*] predict with dice model
rmse=7.55, mae=5.28, r2=-0.50
[*] predict with lasso model
rmse=5.65, mae=3.97, r2=0.16
[*] predict with lasso model
rmse=5.65, mae=3.97, r2=0.16
[*] predict with ridge model
rmse=5.49, mae=3.99, r2=0.20
[*] predict with ridge model
rmse=5.49, mae=3.99, r2=0.20
[*] predict with rf model
rmse=5.74, mae=4.27, r2=0.13
[*] predict with rf model
rmse=5.74, mae=4.32, r2=0.13
[*] predict with svr model
rmse=6.14, mae=3.90, r2=0.01
[*] predict with svr model
rmse=6.14, mae=3.90, r2=0.01
[*] predict with xgb model
rmse=5.75, mae=4.00, r2=0.13
[*] predict with xgb model
rmse=5.75, mae=4.00, r2=0.13




Testset = Texas
[*] predict with currank model
rmse=4.26, mae=2.60, r2=-0.01
[*] predict with currank model
rmse=4.26, mae=2.60, r2=-0.01
[*] predict with avgrank model
rmse=5.48, mae=3.73, r2=-0.67
[*] predict with avgrank model
rmse=5.48, mae=3.73, r2=-0.67
[*] predict with dice model
rmse=6.73, mae=4.68, r2=-1.52
[*] predict with dice model
rmse=6.73, mae=4.68, r2=-1.52
[*] predict with lasso model
rmse=3.79, mae=2.51, r2=0.20
[*] predict with lasso model
rmse=3.79, mae=2.51, r2=0.20
[*] predict with ridge model
rmse=3.77, mae=2.47, r2=0.21
[*] predict with ridge model
rmse=3.77, mae=2.47, r2=0.21
[*] predict with rf model
rmse=4.01, mae=2.84, r2=0.10
[*] predict with rf model
rmse=4.04, mae=2.88, r2=0.09
[*] predict with svr model
rmse=4.12, mae=2.57, r2=0.06
[*] predict with svr model
rmse=4.12, mae=2.57, r2=0.06
[*] predict with xgb model
rmse=4.08, mae=2.77, r2=0.08
[*] predict with xgb model




rmse=4.08, mae=2.77, r2=0.08
Testset = Iowa
[*] predict with currank model
rmse=3.98, mae=2.57, r2=0.00
[*] predict with currank model
rmse=3.98, mae=2.57, r2=0.00
[*] predict with avgrank model
rmse=5.80, mae=3.85, r2=-1.12
[*] predict with avgrank model
rmse=5.80, mae=3.85, r2=-1.12
[*] predict with dice model
rmse=6.45, mae=4.61, r2=-1.63
[*] predict with dice model
rmse=6.45, mae=4.61, r2=-1.63
[*] predict with lasso model
rmse=3.74, mae=2.75, r2=0.12
[*] predict with lasso model
rmse=3.74, mae=2.75, r2=0.12
[*] predict with ridge model
rmse=3.83, mae=2.78, r2=0.07
[*] predict with ridge model
rmse=3.83, mae=2.78, r2=0.07
[*] predict with rf model
rmse=4.07, mae=2.89, r2=-0.04
[*] predict with rf model
rmse=4.17, mae=3.00, r2=-0.10
[*] predict with svr model
rmse=3.92, mae=2.61, r2=0.03
[*] predict with svr model
rmse=3.92, mae=2.61, r2=0.03
[*] predict with xgb model
rmse=4.12, mae=3.06, r2=-0.07
[*] predict with xgb model




rmse=4.12, mae=3.06, r2=-0.07
Testset = Pocono
[*] predict with currank model
rmse=2.48, mae=1.29, r2=-0.04
[*] predict with currank model
rmse=2.48, mae=1.29, r2=-0.04
[*] predict with avgrank model
rmse=2.93, mae=1.92, r2=-0.45
[*] predict with avgrank model
rmse=2.93, mae=1.92, r2=-0.45
[*] predict with dice model
rmse=5.61, mae=3.88, r2=-4.34
[*] predict with dice model
rmse=5.61, mae=3.88, r2=-4.34
[*] predict with lasso model
rmse=2.50, mae=1.99, r2=-0.06
[*] predict with lasso model
rmse=2.50, mae=1.99, r2=-0.06
[*] predict with ridge model
rmse=2.57, mae=2.06, r2=-0.12
[*] predict with ridge model
rmse=2.57, mae=2.06, r2=-0.12
[*] predict with rf model
rmse=3.25, mae=2.43, r2=-0.79
[*] predict with rf model
rmse=3.22, mae=2.46, r2=-0.76
[*] predict with svr model
rmse=2.34, mae=1.41, r2=0.07
[*] predict with svr model
rmse=2.34, mae=1.41, r2=0.07
[*] predict with xgb model
rmse=3.02, mae=2.37, r2=-0.55
[*] predict with xgb model




rmse=3.02, mae=2.37, r2=-0.55
Testset = Gateway
[*] predict with currank model
rmse=3.41, mae=2.16, r2=-0.00
[*] predict with currank model
rmse=3.41, mae=2.16, r2=-0.00
[*] predict with avgrank model
rmse=4.30, mae=2.81, r2=-0.59
[*] predict with avgrank model
rmse=4.30, mae=2.81, r2=-0.59
[*] predict with dice model
rmse=5.89, mae=4.16, r2=-1.98
[*] predict with dice model
rmse=5.89, mae=4.16, r2=-1.98
[*] predict with lasso model
rmse=3.09, mae=2.11, r2=0.18
[*] predict with lasso model
rmse=3.09, mae=2.11, r2=0.18
[*] predict with ridge model
rmse=3.08, mae=2.07, r2=0.19
[*] predict with ridge model
rmse=3.08, mae=2.07, r2=0.19
[*] predict with rf model
rmse=3.49, mae=2.47, r2=-0.05
[*] predict with rf model
rmse=3.45, mae=2.42, r2=-0.03
[*] predict with svr model
rmse=3.35, mae=2.21, r2=0.03
[*] predict with svr model
rmse=3.35, mae=2.21, r2=0.03
[*] predict with xgb model
rmse=3.31, mae=2.32, r2=0.06
[*] predict with xgb model




rmse=3.31, mae=2.32, r2=0.06


### model on data split by stage

In [14]:
retdf0 = pd.DataFrame([],columns=cols)
retdf1 = pd.DataFrame([],columns=cols)

for stageid in range(8):
    train, test, train_x, train_y, test_x, test_y =split_by_stageid(stagedata, stageid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
    #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
    #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))
    
    #record
    rec0 = ['stage%d'%stageid,train_x.shape[0],test_x.shape[0],test_distribution]
    rec1 = ['stage%d'%stageid,train_x.shape[0],test_x.shape[0],test_distribution]
    
    acc0 = [0 for x in range(len(regressors))]
    acc1 = [0 for x in range(len(regressors))]
    for idx, clf in enumerate(regressors):
        acc0[idx] = regressor_model(clf)[0]
        acc1[idx] = regressor_model(clf)[2]

    rec0.extend(acc0)
    rec1.extend(acc1)
    #print('rec:%s'%rec)
    
    #new df
    df = pd.DataFrame([rec0],columns=cols)
    retdf0 = pd.concat([retdf0, df])  
    
    df = pd.DataFrame([rec1],columns=cols)
    retdf1 = pd.concat([retdf1, df])  

retdf0.to_csv('regressor_stagedata_splitbystage_rmse.csv')
retdf1.to_csv('regressor_stagedata_splitbystage_r2.csv')

df_stage_rmse = retdf0
df_stage_r2 = retdf1

[*] predict with currank model
rmse=4.75, mae=2.85, r2=-0.00
[*] predict with currank model
rmse=4.75, mae=2.85, r2=-0.00
[*] predict with avgrank model
rmse=5.87, mae=3.91, r2=-0.53
[*] predict with avgrank model
rmse=5.87, mae=3.91, r2=-0.53
[*] predict with dice model
rmse=6.16, mae=4.36, r2=-0.68
[*] predict with dice model
rmse=6.16, mae=4.36, r2=-0.68
[*] predict with lasso model
rmse=4.85, mae=3.19, r2=-0.05
[*] predict with lasso model
rmse=4.85, mae=3.19, r2=-0.05
[*] predict with ridge model
rmse=4.96, mae=3.25, r2=-0.09
[*] predict with ridge model
rmse=4.96, mae=3.25, r2=-0.09
[*] predict with rf model
rmse=5.03, mae=3.36, r2=-0.12
[*] predict with rf model
rmse=5.00, mae=3.35, r2=-0.11
[*] predict with svr model
rmse=4.76, mae=2.92, r2=-0.00
[*] predict with svr model
rmse=4.76, mae=2.92, r2=-0.00
[*] predict with xgb model
rmse=5.13, mae=3.46, r2=-0.17
[*] predict with xgb model
rmse=5.13, mae=3.46, r2=-0.17
[*] predict with currank model
rmse=4.68, mae=2.72, r2=-0.00
[*]



rmse=4.27, mae=2.93, r2=0.17
[*] predict with lasso model
rmse=4.27, mae=2.93, r2=0.17
[*] predict with ridge model
rmse=4.51, mae=3.19, r2=0.07
[*] predict with ridge model
rmse=4.51, mae=3.19, r2=0.07
[*] predict with rf model
rmse=4.60, mae=3.42, r2=0.04
[*] predict with rf model
rmse=4.53, mae=3.34, r2=0.06
[*] predict with svr model
rmse=4.74, mae=2.86, r2=-0.03
[*] predict with svr model




rmse=4.74, mae=2.86, r2=-0.03
[*] predict with xgb model
rmse=4.93, mae=3.49, r2=-0.11
[*] predict with xgb model
rmse=4.93, mae=3.49, r2=-0.11
[*] predict with currank model
rmse=4.90, mae=2.74, r2=-0.01
[*] predict with currank model
rmse=4.90, mae=2.74, r2=-0.01
[*] predict with avgrank model
rmse=5.68, mae=3.66, r2=-0.35
[*] predict with avgrank model
rmse=5.68, mae=3.66, r2=-0.35
[*] predict with dice model
rmse=7.05, mae=5.03, r2=-1.08
[*] predict with dice model
rmse=7.05, mae=5.03, r2=-1.08
[*] predict with lasso model
rmse=4.54, mae=2.95, r2=0.14
[*] predict with lasso model
rmse=4.54, mae=2.95, r2=0.14
[*] predict with ridge model
rmse=4.62, mae=3.01, r2=0.11
[*] predict with ridge model
rmse=4.62, mae=3.01, r2=0.11
[*] predict with rf model
rmse=4.60, mae=3.06, r2=0.12
[*] predict with rf model
rmse=4.62, mae=3.07, r2=0.11
[*] predict with svr model
rmse=4.98, mae=2.91, r2=-0.04
[*] predict with svr model
rmse=4.98, mae=2.91, r2=-0.04
[*] predict with xgb model
rmse=4.76, ma



rmse=4.70, mae=3.02, r2=-0.00
[*] predict with lasso model
rmse=4.70, mae=3.02, r2=-0.00
[*] predict with ridge model
rmse=4.76, mae=3.09, r2=-0.03
[*] predict with ridge model
rmse=4.76, mae=3.09, r2=-0.03
[*] predict with rf model
rmse=4.67, mae=3.09, r2=0.01
[*] predict with rf model
rmse=4.67, mae=3.12, r2=0.01
[*] predict with svr model
rmse=4.81, mae=2.69, r2=-0.05
[*] predict with svr model
rmse=4.81, mae=2.69, r2=-0.05
[*] predict with xgb model
rmse=5.18, mae=3.35, r2=-0.22
[*] predict with xgb model
rmse=5.18, mae=3.35, r2=-0.22
[*] predict with currank model
rmse=4.91, mae=2.63, r2=-0.01
[*] predict with currank model
rmse=4.91, mae=2.63, r2=-0.01
[*] predict with avgrank model
rmse=5.65, mae=3.43, r2=-0.33
[*] predict with avgrank model
rmse=5.65, mae=3.43, r2=-0.33
[*] predict with dice model
rmse=6.45, mae=4.41, r2=-0.74
[*] predict with dice model
rmse=6.45, mae=4.41, r2=-0.74
[*] predict with lasso model




rmse=4.49, mae=2.85, r2=0.16
[*] predict with lasso model
rmse=4.49, mae=2.85, r2=0.16
[*] predict with ridge model
rmse=4.52, mae=2.89, r2=0.15
[*] predict with ridge model
rmse=4.52, mae=2.89, r2=0.15
[*] predict with rf model
rmse=4.69, mae=3.01, r2=0.08
[*] predict with rf model
rmse=4.71, mae=3.10, r2=0.07
[*] predict with svr model
rmse=4.95, mae=2.73, r2=-0.02
[*] predict with svr model
rmse=4.95, mae=2.73, r2=-0.02
[*] predict with xgb model
rmse=4.66, mae=3.05, r2=0.09
[*] predict with xgb model




rmse=4.66, mae=3.05, r2=0.09
[*] predict with currank model
rmse=5.05, mae=2.70, r2=-0.00
[*] predict with currank model
rmse=5.05, mae=2.70, r2=-0.00
[*] predict with avgrank model
rmse=5.85, mae=3.41, r2=-0.34
[*] predict with avgrank model
rmse=5.85, mae=3.41, r2=-0.34
[*] predict with dice model
rmse=6.65, mae=4.58, r2=-0.74
[*] predict with dice model
rmse=6.65, mae=4.58, r2=-0.74
[*] predict with lasso model
rmse=4.37, mae=2.77, r2=0.25
[*] predict with lasso model
rmse=4.37, mae=2.77, r2=0.25
[*] predict with ridge model
rmse=4.33, mae=2.70, r2=0.26
[*] predict with ridge model
rmse=4.33, mae=2.70, r2=0.26
[*] predict with rf model
rmse=4.99, mae=3.33, r2=0.02
[*] predict with rf model
rmse=4.82, mae=3.16, r2=0.09
[*] predict with svr model
rmse=5.08, mae=2.78, r2=-0.01
[*] predict with svr model
rmse=5.08, mae=2.78, r2=-0.01
[*] predict with xgb model
rmse=4.49, mae=2.86, r2=0.21
[*] predict with xgb model




rmse=4.49, mae=2.86, r2=0.21
[*] predict with currank model
rmse=3.85, mae=2.26, r2=-0.00
[*] predict with currank model
rmse=3.85, mae=2.26, r2=-0.00
[*] predict with avgrank model
rmse=4.36, mae=2.79, r2=-0.28
[*] predict with avgrank model
rmse=4.36, mae=2.79, r2=-0.28
[*] predict with dice model
rmse=7.17, mae=4.53, r2=-2.47
[*] predict with dice model
rmse=7.17, mae=4.53, r2=-2.47
[*] predict with lasso model
rmse=2.73, mae=1.94, r2=0.50
[*] predict with lasso model
rmse=2.73, mae=1.94, r2=0.50
[*] predict with ridge model
rmse=2.62, mae=1.82, r2=0.54
[*] predict with ridge model
rmse=2.62, mae=1.82, r2=0.54
[*] predict with rf model
rmse=4.21, mae=2.94, r2=-0.19
[*] predict with rf model
rmse=4.21, mae=3.03, r2=-0.20
[*] predict with svr model
rmse=3.85, mae=2.35, r2=0.00
[*] predict with svr model
rmse=3.85, mae=2.35, r2=0.00
[*] predict with xgb model
rmse=3.18, mae=2.42, r2=0.32
[*] predict with xgb model




rmse=3.18, mae=2.42, r2=0.32
[*] predict with currank model
rmse=2.68, mae=1.81, r2=-0.01
[*] predict with currank model
rmse=2.68, mae=1.81, r2=-0.01
[*] predict with avgrank model
rmse=3.05, mae=2.28, r2=-0.31
[*] predict with avgrank model
rmse=3.05, mae=2.28, r2=-0.31
[*] predict with dice model
rmse=5.99, mae=4.19, r2=-4.07
[*] predict with dice model
rmse=5.99, mae=4.19, r2=-4.07
[*] predict with lasso model
rmse=1.73, mae=1.27, r2=0.58
[*] predict with lasso model
rmse=1.73, mae=1.27, r2=0.58
[*] predict with ridge model
rmse=1.67, mae=1.16, r2=0.61
[*] predict with ridge model
rmse=1.67, mae=1.16, r2=0.61
[*] predict with rf model
rmse=3.41, mae=2.25, r2=-0.64
[*] predict with rf model
rmse=3.02, mae=1.83, r2=-0.28
[*] predict with svr model
rmse=2.69, mae=1.86, r2=-0.02
[*] predict with svr model
rmse=2.69, mae=1.86, r2=-0.02
[*] predict with xgb model
rmse=2.71, mae=2.21, r2=-0.03
[*] predict with xgb model




rmse=2.71, mae=2.21, r2=-0.03


In [7]:
df_event_rmse

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Phoenix,691,114,"+:38,0:16,-:60",4.734161,5.606406,7.008766,4.447984,4.415062,4.544965,4.67283,4.293633
0,Indy500,580,225,"+:82,0:47,-:96",6.165135,6.936325,7.545124,5.654568,5.492028,5.739337,6.135708,5.750406
0,Texas,678,127,"+:39,0:34,-:54",4.257462,5.4781,6.726373,3.786871,3.771194,4.013697,4.117686,4.076657
0,Iowa,696,109,"+:42,0:28,-:39",3.981609,5.800698,6.453077,3.738325,3.830665,4.066765,3.915588,4.118872
0,Pocono,679,126,"+:29,0:61,-:36",2.475275,2.928532,5.611072,2.498792,2.572223,3.249308,2.340347,3.019592
0,Gateway,701,104,"+:34,0:28,-:42",3.412364,4.30133,5.885935,3.086153,3.075081,3.48632,3.352309,3.314166


In [8]:
df_event_r2

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Phoenix,691,114,"+:38,0:16,-:60",-0.000416,-0.403019,-1.192692,0.116878,0.129902,0.053207,0.025337,0.177105
0,Indy500,580,225,"+:82,0:47,-:96",-0.003011,-0.269636,-0.502288,0.156239,0.20405,0.130859,0.006541,0.127395
0,Texas,678,127,"+:39,0:34,-:54",-0.007943,-0.668763,-1.515918,0.202564,0.209153,0.090855,0.057154,0.075849
0,Iowa,696,109,"+:42,0:28,-:39",0.0,-1.122479,-1.626736,0.11847,0.074384,-0.096279,0.032888,-0.070137
0,Pocono,679,126,"+:29,0:61,-:36",-0.038432,-0.453555,-4.336088,-0.058258,-0.121369,-0.758839,0.071693,-0.545354
0,Gateway,701,104,"+:34,0:28,-:42",-0.001344,-0.591028,-1.979225,0.180956,0.186822,-0.02542,0.033592,0.055459


In [15]:
df_stage_rmse

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,stage0,153,652,"+:221,0:167,-:264",4.74778,5.870429,6.158315,4.852986,4.960882,5.027814,4.755545,5.130541
0,stage1,288,517,"+:186,0:136,-:195",4.682987,5.526302,6.856642,4.270439,4.510827,4.595698,4.744955,4.928826
0,stage2,421,384,"+:140,0:112,-:132",4.903761,5.681358,7.051706,4.543633,4.617479,4.597718,4.980098,4.75549
0,stage3,547,258,"+:91,0:89,-:78",4.73442,5.541856,7.011065,4.699386,4.758406,4.672375,4.810762,5.175445
0,stage4,657,148,"+:48,0:53,-:47",4.914815,5.64699,6.450435,4.491933,4.519653,4.692315,4.951282,4.664548
0,stage5,725,80,"+:26,0:29,-:25",5.054701,5.848562,6.650188,4.374176,4.331226,4.990362,5.076617,4.48723
0,stage6,767,38,"+:11,0:13,-:14",3.852545,4.358174,7.174516,2.726765,2.624649,4.206698,3.850762,3.181817
0,stage7,789,16,"+:4,0:4,-:8",2.680951,3.045832,5.994789,1.72824,1.670028,3.411449,2.6863,2.706842


In [16]:
df_stage_r2

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,stage0,153,652,"+:221,0:167,-:264",-0.000316,-0.529311,-0.682983,-0.045139,-0.092129,-0.1077,-0.003591,-0.168106
0,stage1,288,517,"+:186,0:136,-:195",-0.000986,-0.393963,-1.145879,0.167609,0.071259,0.064858,-0.027653,-0.108841
0,stage2,421,384,"+:140,0:112,-:132",-0.005719,-0.349964,-1.079728,0.136575,0.108281,0.10908,-0.037275,0.05418
0,stage3,547,258,"+:91,0:89,-:78",-0.019516,-0.396919,-1.235778,-0.004484,-0.029873,0.007399,-0.052661,-0.218305
0,stage4,657,148,"+:48,0:53,-:47",-0.008557,-0.331435,-0.737257,0.157534,0.147104,0.071921,-0.023579,0.091541
0,stage5,725,80,"+:26,0:29,-:25",-0.004151,-0.344332,-0.738105,0.24803,0.262724,0.085499,-0.012878,0.208657
0,stage6,767,38,"+:11,0:13,-:14",-0.000747,-0.280672,-2.470676,0.498669,0.535515,-0.196821,0.000179,0.31738
0,stage7,789,16,"+:4,0:4,-:8",-0.013774,-0.308505,-4.068871,0.57872,0.606621,-0.283273,-0.017823,-0.03345
