### stage_model_regressor_featureselection

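Prediction models for chg_of_rank_in_stage on the stage dataset, using recursive feature elimination (RFE) to select the features each regressor is trained on.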

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math
from sklearn.feature_selection import RFE
# Expose only one GPU (device index 7) to CUDA-based libraries.
# Keep this when running on r-001 (presumably a multi-GPU host -- TODO confirm);
# otherwise comment out the assignment below.
import os
os.environ["CUDA_VISIBLE_DEVICES"]="7"

In [2]:
import matplotlib.pyplot as plt

# Estimators are imported from scikit-learn's public packages. The private
# module paths used before (sklearn.ensemble.forest, sklearn.linear_model.ridge,
# sklearn.linear_model.stochastic_gradient, sklearn.svm.classes) were deprecated
# in scikit-learn 0.22 and later removed; the public paths below bind the same
# names and work on both old and new releases.
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.utils import shuffle
from sklearn import metrics
import xgboost as xgb

In [3]:
# build regression models
# Names of the regressors evaluated in this notebook; each name is resolved by
# get_regressor. The commented-out list holds every name get_regressor recognizes.
#regressors = ['currank','avgrank','dice','lasso','ridge','rf','svr','xgb']
regressors = ['lasso','ridge','rf','xgb']

def get_regressor(regressor = 'lr'):
    """
    Create a fresh, unfitted model for the given name.

    Recognized names are 'lasso', 'ridge', 'rf', 'svr', 'xgb', 'dice',
    'currank' and 'avgrank'. Any other name -- including the default
    'lr' -- yields None.
    """
    # linear models with built-in cross-validated regularization
    if regressor == "lasso":
        return LassoCV(cv=5, random_state=0)
    if regressor == "ridge":
        return RidgeCV(alphas=np.logspace(-6, 6, 13))

    # non-linear models
    if regressor == "rf":
        return RandomForestRegressor(n_estimators=100)
    if regressor == 'svr':
        return SVR(kernel='rbf')
    if regressor == 'xgb':
        return xgb.XGBRegressor(objective="reg:linear", random_state=42, max_depth=3)

    # simple baselines defined in this file
    if regressor == 'dice':
        return RandomDice('1234')
    if regressor == 'currank':
        return CurRank()
    if regressor == 'avgrank':
        return AverageRank()

    # unknown name
    return None


class CurRank():
    """
    Baseline that predicts 0 for every sample, whatever the features are
    (presumably meaning "the rank stays as it currently is").
    """
    def __init__(self):
        pass

    def fit(self, x, y):
        # nothing to learn; the training data is ignored
        return None

    def predict(self, test_x):
        # one zero per row of test_x
        n_samples = test_x.shape[0]
        return np.array([0] * n_samples)
    
class AverageRank():
    """
    Baseline that predicts, for every sample, the value found at position 13
    of its feature row (noted in the code as change_in_rank_all).

    The training data is ignored.
    """
    def __init__(self):
        pass

    def fit(self, x, y):
        # nothing to learn
        return None

    def predict(self, test_x):
        # 13, change_in_rank_all
        return np.array([row[13] for row in test_x])

class RandomDice():
    """
    A random dice model.

    fit records the distinct target values and their cumulative frequencies;
    predict then draws one of those values at random for every test row, each
    value being picked with a probability equal to its frequency in the
    training targets.

    Attributes:
        val:  distinct target values seen by the last fit
        dist: cumulative frequencies, dist[i] being the share of training
              targets equal to one of val[0..i]
    """
    def __init__(self, seed='1234'):
        self.dist = []
        self.val = []
        # NOTE: this seeds Python's global `random` generator, which predict draws from
        random.seed(seed)

    def fit(self, x, y):
        """
        Learn the empirical distribution of the targets y (x is ignored).

        y is expected to be a numpy array (its shape and element-wise
        comparison are used).
        """
        # Start from a clean state: without this reset a second fit would
        # append to the old tables and the new distribution would never be used.
        self.val = []
        self.dist = []

        total = y.shape[0]

        ratio = 0.
        for val in set(y):
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)

    def predict(self, test_x):
        """
        Return a numpy array with one randomly drawn target value per row of test_x.
        """
        pred_y = []
        for _ in test_x:
            dice = random.random()
            # find the first cumulative bucket that covers the dice value
            find_idx = -1
            for idx, ratio in enumerate(self.dist):
                if dice <= ratio:
                    find_idx = idx
                    break

            # find_idx stays -1 when no bucket matched (rounding of the
            # cumulative sum), which selects the last value
            pred_y.append(self.val[find_idx])

        return np.array(pred_y)

def evaluate(test_y, pred_y):
    """
    Print and return the regression scores of a prediction.

    Returns the tuple (rmse, mae, r2) computed from the true targets test_y
    and the predictions pred_y.
    """
    mse = metrics.mean_squared_error(test_y, pred_y)
    scores = (
        math.sqrt(mse),
        metrics.mean_absolute_error(test_y, pred_y),
        metrics.r2_score(test_y, pred_y),
    )
    print('rmse=%.2f, mae=%.2f, r2=%.2f'%scores)
    return scores
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    Hold out one event: rows whose 'eventid' equals ``eventid`` become the
    test set, all remaining rows the training set.

    Columns are addressed by position after conversion to numpy: column 1 is
    the target and columns 2 onward (starting with eventid) are the features;
    column 0 is not used.

    Returns (train, test, train_x, train_y, test_x, test_y) as numpy arrays.
    """
    held_out = stagedata['eventid'] == eventid
    kept = stagedata['eventid'] != eventid

    train = stagedata.loc[kept].to_numpy()
    test = stagedata.loc[held_out].to_numpy()

    def _features_and_target(block):
        # features: columns 2.., target: column 1
        return block[:, 2:], block[:, 1]

    train_x, train_y = _features_and_target(train)
    test_x, test_y = _features_and_target(test)

    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    Split at a stage boundary: rows with 'stageid' <= ``stageid`` form the
    training set, rows with a larger 'stageid' the test set.

    Columns are addressed by position after conversion to numpy: column 1 is
    the target and columns 2 onward are the features; column 0 is not used.

    Returns (train, test, train_x, train_y, test_x, test_y) as numpy arrays.
    """
    up_to_stage = stagedata['stageid'] <= stageid
    after_stage = stagedata['stageid'] > stageid

    train = stagedata.loc[up_to_stage].to_numpy()
    test = stagedata.loc[after_stage].to_numpy()

    def _features_and_target(block):
        # features: columns 2.., target: column 1
        return block[:, 2:], block[:, 1]

    train_x, train_y = _features_and_target(train)
    test_x, test_y = _features_and_target(test)

    return train, test, train_x, train_y, test_x, test_y


def regressor_model(name='svr', feature_num = 10):
    """
    Fit the named regressor inside recursive feature elimination (RFE) and
    score it on the current test split.

    NOTE: the data is not passed in. This function reads the module-level
    variables train_x, train_y, test_x and test_y, so one of the split
    functions must have been called (and its results bound to those names)
    beforehand.

    Args:
        name: regressor name understood by get_regressor.
        feature_num: number of features RFE keeps.

    Returns:
        (score, support): score is the (rmse, mae, r2) tuple returned by
        evaluate; support is the selector's ``support_`` mask marking the
        selected features.

    Raises:
        ValueError: if get_regressor does not know ``name``.
    """
    ### test learning models
    print('[*] predict with %s model'%name)
    clf = get_regressor(name)
    if clf is None:
        # fail here with a clear message instead of deep inside RFE
        raise ValueError('unknown regressor: %s'%name)

    # n_features_to_select is given by keyword: newer scikit-learn releases
    # no longer accept it positionally, and the keyword works on old ones too.
    selector = RFE(clf, n_features_to_select=feature_num, step=1)
    selector.fit(train_x, train_y)

    pred_y = selector.predict(test_x)
    score = evaluate(test_y, pred_y)
    return score, selector.support_
    


In [4]:
# load the stage dataset; missing values are replaced by 0
suffix='-withneighbor-newfeatures-timediff'
stagedata = pd.read_csv('stage-2018%s.csv'%suffix).fillna(0)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 805 entries, 0 to 804
Data columns (total 35 columns):
Unnamed: 0                   805 non-null int64
target                       805 non-null int64
eventid                      805 non-null int64
car_number                   805 non-null int64
stageid                      805 non-null int64
firststage                   805 non-null int64
pit_in_caution               805 non-null int64
start_position               805 non-null int64
start_rank                   805 non-null int64
start_rank_ratio             805 non-null float64
top_pack                     805 non-null int64
bottom_pack                  805 non-null int64
average_rank                 805 non-null float64
average_rank_all             805 non-null float64
change_in_rank               805 non-null int64
change_in_rank_all           805 non-null float64
rate_of_change               805 non-null int64
rate_of_change_all           805 non-null float64
laptime_green_mean_pr

### model on data split by event

In [5]:
# Leave-one-event-out experiment: every event is held out once as the test
# set, each regressor is fitted with RFE on the remaining events, and the
# RMSE / R2 scores are collected into one table row per held-out event.
cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(regressors)
print('cols:%s'%cols)

selFeatureNumber = 10
# names of the feature columns, in the order they appear in train_x / test_x
featureNames=['eventid','car_number','stageid',
             'firststage','pit_in_caution','start_position',
             'start_rank','start_rank_ratio','top_pack','bottom_pack',
             'average_rank','average_rank_all',
             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all',
             'laptime_green_mean_prev','laptime_green_std_prev','laptime_green_mean_all','laptime_green_std_all', 
             'laptime_mean_prev','laptime_std_prev','laptime_mean_all','laptime_std_all', 
             'laps_prev','laps_after_last_pitstop','pittime_prev',     
             'prev_nb0_change_in_rank','prev_nb1_change_in_rank','prev_nb2_change_in_rank',
             'follow_nb0_change_in_rank','follow_nb1_change_in_rank','follow_nb2_change_in_rank']
# weights[i] counts how many times feature i was selected by RFE
weights = np.zeros((len(featureNames)))

eventsname = ['Phoenix','Indy500','Texas','Iowa','Pocono','Gateway']
events = set(stagedata['eventid'])

# Rows are collected in plain lists and the result frames are built once after
# the loop (instead of pd.concat per iteration), which also gives the rows a
# proper 0..n-1 index instead of a repeated 0.
rmse_records = []
r2_records = []

# sorted() makes the iteration order independent of set ordering
for eventid in sorted(events):
    print('Testset = %s'%eventsname[eventid])

    # regressor_model reads train_x, train_y, test_x, test_y as globals,
    # so these exact names must be (re)bound here for every split
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

    #record
    rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    rec1 = list(rec0)

    acc0 = [0] * len(regressors)
    acc1 = [0] * len(regressors)
    features = [0] * len(regressors)

    for idx, clf in enumerate(regressors):
        score, features[idx] = regressor_model(clf, feature_num= selFeatureNumber)

        # score is (rmse, mae, r2)
        acc0[idx] = score[0]
        acc1[idx] = score[2]

        fnames = [featureNames[fid] if selected else '' for fid, selected in enumerate(features[idx])]
        # the format string must be applied with %, otherwise print shows it literally
        print('clf:%d :%s' % (idx, ','.join(fnames)))

        #add to feature weights
        for fid, selected in enumerate(features[idx]):
            if selected:
                weights[fid] += 1

    rec0.extend(acc0)
    rec1.extend(acc1)
    rmse_records.append(rec0)
    r2_records.append(rec1)

retdf0 = pd.DataFrame(rmse_records, columns=cols)
retdf1 = pd.DataFrame(r2_records, columns=cols)

retdf0.to_csv('regressors_stagedata_splitbyevent%s_rmse.csv'%suffix)
retdf1.to_csv('regressors_stagedata_splitbyevent%s_r2.csv'%suffix)

df_event_rmse = retdf0
df_event_r2 = retdf1

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'lasso', 'ridge', 'rf', 'xgb']
Testset = Phoenix
[*] predict with lasso model
rmse=4.40, mae=2.96, r2=0.14
clf:%d :%s 0 ,,stageid,,pit_in_caution,start_position,start_rank,,,,average_rank,average_rank_all,change_in_rank,,,,,,,,,,,,laps_prev,laps_after_last_pitstop,,,,,,,follow_nb2_change_in_rank
[*] predict with ridge model
rmse=4.38, mae=2.91, r2=0.14
clf:%d :%s 1 ,,stageid,,pit_in_caution,start_position,start_rank,start_rank_ratio,top_pack,,average_rank,average_rank_all,,,,,,,,,,,,,laps_prev,laps_after_last_pitstop,,,,,,,
[*] predict with rf model
rmse=4.46, mae=3.13, r2=0.11
clf:%d :%s 2 ,car_number,,,,,start_rank,start_rank_ratio,,,average_rank,average_rank_all,,,,,,laptime_green_std_prev,,,laptime_mean_prev,laptime_std_prev,,,,laps_after_last_pitstop,,,,,,,follow_nb2_change_in_rank
[*] predict with xgb model
rmse=4.22, mae=2.92, r2=0.21
clf:%d :%s 3 ,,,,pit_in_caution,,start_rank,start_rank_ratio,,,,average_rank_all,,,,,,

rmse=5.85, mae=4.12, r2=0.10
clf:%d :%s 3 ,,,,pit_in_caution,,start_rank,start_rank_ratio,,,average_rank,,,,,,,laptime_green_std_prev,,,,laptime_std_prev,laptime_mean_all,laptime_std_all,,laps_after_last_pitstop,pittime_prev,,,,,,
Testset = Texas
[*] predict with lasso model
rmse=3.84, mae=2.61, r2=0.18
clf:%d :%s 0 ,,stageid,,,start_position,start_rank,,,,average_rank,average_rank_all,,,,,,,,,,,,,laps_prev,laps_after_last_pitstop,,prev_nb0_change_in_rank,,prev_nb2_change_in_rank,,,follow_nb2_change_in_rank
[*] predict with ridge model
rmse=3.71, mae=2.45, r2=0.24
clf:%d :%s 1 eventid,,stageid,,pit_in_caution,,start_rank,start_rank_ratio,top_pack,,average_rank,,,,,rate_of_change_all,,,,,,,,,laps_prev,laps_after_last_pitstop,,,,,,,
[*] predict with rf model
rmse=4.21, mae=2.83, r2=0.01
clf:%d :%s 2 ,,,,,,start_rank,start_rank_ratio,,,average_rank,average_rank_all,,,,,laptime_green_mean_prev,,,,laptime_mean_prev,laptime_std_prev,,,laps_prev,,,,,,,follow_nb1_change_in_rank,follow_nb2_chan

rmse=4.89, mae=3.89, r2=-0.51
clf:%d :%s 2 ,,,,,,start_rank,start_rank_ratio,,,average_rank,average_rank_all,,,,,,laptime_green_std_prev,laptime_green_mean_all,,laptime_mean_prev,laptime_std_prev,,,laps_prev,,,,,,,,follow_nb2_change_in_rank
[*] predict with xgb model
rmse=4.77, mae=3.71, r2=-0.43
clf:%d :%s 3 ,,,,pit_in_caution,,start_rank,start_rank_ratio,,,average_rank,,,,rate_of_change,,,laptime_green_std_prev,laptime_green_mean_all,,,laptime_std_prev,,,,,,,,,follow_nb0_change_in_rank,,follow_nb2_change_in_rank
Testset = Pocono
[*] predict with lasso model












rmse=8.00, mae=3.85, r2=-9.84
clf:%d :%s 0 eventid,,stageid,,pit_in_caution,,start_rank,,,,average_rank,,,,,,,,,laptime_green_std_all,,,,laptime_std_all,laps_prev,laps_after_last_pitstop,,,,,,,follow_nb2_change_in_rank
[*] predict with ridge model
rmse=9.19, mae=4.45, r2=-13.31
clf:%d :%s 1 eventid,,stageid,firststage,pit_in_caution,,start_rank,,top_pack,bottom_pack,average_rank,,,,,,,,,laptime_green_std_all,,,,laptime_std_all,,,,,,,,,
[*] predict with rf model
rmse=4.06, mae=2.80, r2=-1.79
clf:%d :%s 2 ,car_number,,,,,start_rank,start_rank_ratio,,,average_rank,average_rank_all,,,,,,laptime_green_std_prev,,,laptime_mean_prev,laptime_std_prev,,,,laps_after_last_pitstop,,,,,,,follow_nb2_change_in_rank
[*] predict with xgb model
rmse=4.05, mae=3.24, r2=-1.79
clf:%d :%s 3 ,,stageid,,pit_in_caution,,start_rank,start_rank_ratio,,bottom_pack,,,,,,,,laptime_green_std_prev,,,,laptime_std_prev,,,laps_prev,laps_after_last_pitstop,,,,,,,follow_nb2_change_in_rank
Testset = Gateway
[*] predict with 











rmse=2.84, mae=2.03, r2=0.31
clf:%d :%s 0 ,,stageid,,,,start_rank,,,,average_rank,average_rank_all,change_in_rank,,,,,,,,laptime_mean_prev,,,,laps_prev,laps_after_last_pitstop,,prev_nb0_change_in_rank,,,,,follow_nb2_change_in_rank
[*] predict with ridge model
rmse=2.73, mae=2.02, r2=0.36
clf:%d :%s 1 eventid,,stageid,,pit_in_caution,,start_rank,start_rank_ratio,top_pack,bottom_pack,average_rank,,,,,,,,,,,,,,laps_prev,laps_after_last_pitstop,,,,,,,
[*] predict with rf model
rmse=3.47, mae=2.46, r2=-0.04
clf:%d :%s 2 ,car_number,,,,,start_rank,start_rank_ratio,,,average_rank,average_rank_all,,,,,,laptime_green_std_prev,,,laptime_mean_prev,,,,laps_prev,,,,,,,follow_nb1_change_in_rank,follow_nb2_change_in_rank
[*] predict with xgb model
rmse=3.16, mae=2.26, r2=0.14
clf:%d :%s 3 ,,,,pit_in_caution,,start_rank,start_rank_ratio,,,,average_rank_all,,,,,,laptime_green_std_prev,,,,laptime_std_prev,,,,,pittime_prev,,,,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank


In [6]:
# weights analysis: list the feature names from most to least often selected
idx = np.argsort(-weights)
fnames = [featureNames[feature_pos] for feature_pos in idx]
ranking = ','.join(fnames)
print('feature weights:',ranking)

feature weights: start_rank,average_rank,follow_nb2_change_in_rank,start_rank_ratio,laps_after_last_pitstop,laps_prev,pit_in_caution,stageid,average_rank_all,laptime_std_prev,laptime_green_std_prev,laptime_mean_prev,eventid,bottom_pack,top_pack,follow_nb0_change_in_rank,pittime_prev,car_number,laptime_std_all,change_in_rank_all,prev_nb0_change_in_rank,change_in_rank,laptime_mean_all,laptime_green_mean_all,follow_nb1_change_in_rank,start_position,laptime_green_std_all,prev_nb2_change_in_rank,rate_of_change,laptime_green_mean_prev,firststage,rate_of_change_all,prev_nb1_change_in_rank


### model on data split by stage

In [7]:
# Split-by-stage experiment: for every boundary k, stages <= k are used for
# training and stages > k for testing; each regressor is fitted with RFE and
# its RMSE / R2 scores are collected into one table row per boundary.

# weights[i] counts how many times feature i was selected by RFE in this experiment
weights = np.zeros((len(featureNames)))

# Rows are collected in plain lists and the result frames are built once after
# the loop (instead of pd.concat per iteration), which also gives the rows a
# proper 0..n-1 index instead of a repeated 0.
rmse_records = []
r2_records = []

for stageid in range(8):
    # regressor_model reads train_x, train_y, test_x, test_y as globals,
    # so these exact names must be (re)bound here for every split
    train, test, train_x, train_y, test_x, test_y =split_by_stageid(stagedata, stageid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

    #record
    rec0 = ['stage%d'%stageid,train_x.shape[0],test_x.shape[0],test_distribution]
    rec1 = list(rec0)

    acc0 = [0] * len(regressors)
    acc1 = [0] * len(regressors)
    # defined here so the cell does not depend on a list left over from another cell
    features = [0] * len(regressors)

    for idx, clf in enumerate(regressors):
        score, features[idx] = regressor_model(clf, feature_num= selFeatureNumber)

        # score is (rmse, mae, r2)
        acc0[idx] = score[0]
        acc1[idx] = score[2]

        fnames = [featureNames[fid] if selected else '' for fid, selected in enumerate(features[idx])]
        # the format string must be applied with %, otherwise print shows it literally
        print('clf:%d :%s' % (idx, ','.join(fnames)))

        #add to feature weights
        for fid, selected in enumerate(features[idx]):
            if selected:
                weights[fid] += 1

    rec0.extend(acc0)
    rec1.extend(acc1)
    rmse_records.append(rec0)
    r2_records.append(rec1)

retdf0 = pd.DataFrame(rmse_records, columns=cols)
retdf1 = pd.DataFrame(r2_records, columns=cols)

retdf0.to_csv('regressor_stagedata_splitbystage%s_rmse.csv'%suffix)
retdf1.to_csv('regressor_stagedata_splitbystage%s_r2.csv'%suffix)

df_stage_rmse = retdf0
df_stage_r2 = retdf1

[*] predict with lasso model
rmse=4.85, mae=3.19, r2=-0.05
clf:%d :%s 0 ,car_number,,,,start_position,,,,,,,,,,,,,,,,,,,,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
[*] predict with ridge model
rmse=4.96, mae=3.25, r2=-0.09
clf:%d :%s 1 eventid,car_number,,,,start_position,start_rank,start_rank_ratio,top_pack,bottom_pack,average_rank,average_rank_all,,,,,,,,,,,,,laps_prev,,,,,,,,
[*] predict with rf model
rmse=5.02, mae=3.36, r2=-0.12
clf:%d :%s 2 eventid,car_number,,,,start_position,start_rank,start_rank_ratio,top_pack,bottom_pack,average_rank,average_rank_all,,,,,,,,,,,,,laps_prev,,,,,,,,
[*] predict with xgb model
rmse=5.13, mae=3.46, r2=-0.17
clf:%d :%s 3 eventid,car_number,,,,start_position,,start_rank_ratio,,bottom_pack,,,,,,,,,,,,,,,,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_ra



rmse=4.68, mae=3.33, r2=-0.00
clf:%d :%s 0 ,,,,,,start_rank,,,bottom_pack,average_rank,,,,,,,laptime_green_std_prev,,,,laptime_std_prev,,,,,pittime_prev,,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,
[*] predict with ridge model
rmse=4.68, mae=3.26, r2=0.00
clf:%d :%s 1 eventid,,,,pit_in_caution,,start_rank,,,,,,change_in_rank,change_in_rank_all,,,,,,laptime_green_std_all,,laptime_std_prev,,laptime_std_all,,,pittime_prev,,,,follow_nb0_change_in_rank,,
[*] predict with rf model
rmse=5.19, mae=3.83, r2=-0.23
clf:%d :%s 2 eventid,car_number,,,,,start_rank,start_rank_ratio,,,average_rank,average_rank_all,,,,,,laptime_green_std_prev,,laptime_green_std_all,,,,,laps_prev,,,,,,,,follow_nb2_change_in_rank
[*] predict with xgb model
rmse=5.53, mae=4.19, r2=-0.40
clf:%d :%s 3 eventid,,,,pit_in_caution,,,start_rank_ratio,,bottom_pack,,,,,,,,laptime_green_std_prev,,,,laptime_std_prev,,,laps_prev,,pittime_prev,,,,,follow_nb1_change_in_rank,follo



















rmse=4.70, mae=2.87, r2=0.08
clf:%d :%s 0 ,,,,,,start_rank,,,,,,,change_in_rank_all,,,,,,,laptime_mean_prev,laptime_std_prev,laptime_mean_all,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,,,,,
[*] predict with ridge model
rmse=4.63, mae=3.06, r2=0.10
clf:%d :%s 1 ,,stageid,,pit_in_caution,start_position,start_rank,,top_pack,bottom_pack,average_rank,,change_in_rank,change_in_rank_all,,,,,,,,,,,laps_prev,,,,,,,,
[*] predict with rf model
rmse=5.04, mae=3.47, r2=-0.06
clf:%d :%s 2 ,car_number,,,,,start_rank,start_rank_ratio,,,average_rank,average_rank_all,,,,,,laptime_green_std_prev,,laptime_green_std_all,laptime_mean_prev,,,,,laps_after_last_pitstop,,,,,,follow_nb1_change_in_rank,
[*] predict with xgb model
rmse=5.15, mae=3.57, r2=-0.11
clf:%d :%s 3 ,,,,pit_in_caution,,,start_rank_ratio,,bottom_pack,,,,,,,,laptime_green_std_prev,,laptime_green_std_all,,,,,laps_prev,laps_after_last_pitstop,pittime_prev,,,,follow_nb0_change_in_rank,follow_nb1_change

rmse=4.87, mae=3.15, r2=-0.08
clf:%d :%s 3 ,,,,pit_in_caution,,start_rank,start_rank_ratio,,bottom_pack,,,,,,,,laptime_green_std_prev,,,laptime_mean_prev,,,,laps_prev,laps_after_last_pitstop,,,,,follow_nb0_change_in_rank,follow_nb1_change_in_rank,
[*] predict with lasso model
rmse=5.01, mae=3.31, r2=-0.05
clf:%d :%s 0 ,,,,,,start_rank,,,,average_rank,,,,rate_of_change,,,laptime_green_std_prev,laptime_green_mean_all,,,,,,laps_prev,laps_after_last_pitstop,,,,,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
[*] predict with ridge model
rmse=4.44, mae=2.82, r2=0.18
clf:%d :%s 1 ,,stageid,firststage,pit_in_caution,start_position,start_rank,,top_pack,,,average_rank_all,change_in_rank,change_in_rank_all,,,,,,,,,,,,laps_after_last_pitstop,,,,,,,
[*] predict with rf model
rmse=4.72, mae=3.21, r2=0.07
clf:%d :%s 2 ,car_number,,,,,start_rank,start_rank_ratio,,,average_rank,average_rank_all,,,,,,laptime_green_std_prev,,,laptime_mean_prev,laptime_std_prev,,,,laps_after

rmse=4.61, mae=2.97, r2=0.16
clf:%d :%s 0 ,,stageid,,,start_position,start_rank,,,,average_rank,,,,rate_of_change,,,,,,laptime_mean_prev,,laptime_mean_all,,laps_prev,laps_after_last_pitstop,,,,,,,follow_nb2_change_in_rank
[*] predict with ridge model
rmse=4.08, mae=2.70, r2=0.34
clf:%d :%s 1 eventid,,stageid,,pit_in_caution,,start_rank,start_rank_ratio,top_pack,,average_rank,,change_in_rank,,,,,,,,,,,,laps_prev,laps_after_last_pitstop,,,,,,,
[*] predict with rf model
rmse=5.15, mae=3.46, r2=-0.04
clf:%d :%s 2 ,car_number,,,,,start_rank,start_rank_ratio,,,average_rank,average_rank_all,,,,,,laptime_green_std_prev,,,laptime_mean_prev,laptime_std_prev,,,laps_prev,,,,,,,,follow_nb2_change_in_rank
[*] predict with xgb model
rmse=4.75, mae=3.01, r2=0.11
clf:%d :%s 3 ,,,,pit_in_caution,,start_rank,start_rank_ratio,,bottom_pack,average_rank,,,,,,,laptime_green_std_prev,,,laptime_mean_prev,laptime_std_prev,,,laps_prev,,,,,,,,follow_nb2_change_in_rank
[*] predict with lasso model
rmse=3.34, mae=2

rmse=5.13, mae=3.56, r2=-0.78
clf:%d :%s 3 ,,,,pit_in_caution,,start_rank,start_rank_ratio,,,average_rank,average_rank_all,,,,,,laptime_green_std_prev,,,,laptime_std_prev,laptime_mean_all,,,,pittime_prev,,,,,,follow_nb2_change_in_rank
[*] predict with lasso model
rmse=2.82, mae=2.22, r2=-0.12
clf:%d :%s 0 ,,stageid,,,,start_rank,,,,average_rank,average_rank_all,,,,,,,,,laptime_mean_prev,,laptime_mean_all,,laps_prev,laps_after_last_pitstop,,prev_nb0_change_in_rank,,,,,follow_nb2_change_in_rank
[*] predict with ridge model
rmse=2.15, mae=1.58, r2=0.35
clf:%d :%s 1 ,,stageid,firststage,pit_in_caution,,start_rank,start_rank_ratio,top_pack,bottom_pack,average_rank,,,,,,,,,,,,,,laps_prev,laps_after_last_pitstop,,,,,,,
[*] predict with rf model
rmse=2.91, mae=2.17, r2=-0.20
clf:%d :%s 2 ,car_number,,,,,start_rank,start_rank_ratio,,,average_rank,average_rank_all,,,,,,laptime_green_std_prev,,,laptime_mean_prev,laptime_std_prev,,,,laps_after_last_pitstop,,,,,,,follow_nb2_change_in_rank
[*] predi

In [8]:
# weights analysis: list the feature names from most to least often selected
idx = np.argsort(-weights)
fnames = [featureNames[feature_pos] for feature_pos in idx]
ranking = ','.join(fnames)
print('feature weights:',ranking)

feature weights: start_rank,laps_after_last_pitstop,average_rank,laps_prev,start_rank_ratio,follow_nb2_change_in_rank,laptime_green_std_prev,pit_in_caution,laptime_std_prev,average_rank_all,laptime_mean_prev,bottom_pack,car_number,pittime_prev,eventid,follow_nb1_change_in_rank,top_pack,stageid,follow_nb0_change_in_rank,start_position,laptime_mean_all,prev_nb0_change_in_rank,change_in_rank_all,laptime_green_std_all,change_in_rank,laptime_std_all,rate_of_change,prev_nb1_change_in_rank,prev_nb2_change_in_rank,firststage,laptime_green_mean_all,rate_of_change_all,laptime_green_mean_prev


In [9]:
# RMSE of each regressor, one row per held-out event
df_event_rmse

Unnamed: 0,runid,trainsize,testsize,testdistribution,lasso,ridge,rf,xgb
0,Phoenix,691,114,"+:38,0:16,-:60",4.398765,4.380087,4.458643,4.219574
0,Indy500,580,225,"+:82,0:47,-:96",5.630328,5.351745,5.781628,5.850635
0,Texas,678,127,"+:39,0:34,-:54",3.840843,3.708709,4.213285,4.066607
0,Iowa,696,109,"+:42,0:28,-:39",3.739747,3.882063,4.885989,4.767701
0,Pocono,679,126,"+:29,0:61,-:36",7.996732,9.188669,4.05712,4.054176
0,Gateway,701,104,"+:34,0:28,-:42",2.837333,2.733555,3.473191,3.158113


In [10]:
# R2 score of each regressor, one row per held-out event
df_event_r2

Unnamed: 0,runid,trainsize,testsize,testdistribution,lasso,ridge,rf,xgb
0,Phoenix,691,114,"+:38,0:16,-:60",0.136314,0.143633,0.11264,0.205248
0,Indy500,580,225,"+:82,0:47,-:96",0.163458,0.244192,0.117894,0.096711
0,Texas,678,127,"+:39,0:34,-:54",0.179672,0.235143,0.012866,0.0804
0,Iowa,696,109,"+:42,0:28,-:39",0.1178,0.049378,-0.505871,-0.43384
0,Pocono,679,126,"+:29,0:61,-:36",-9.83818,-13.309897,-1.789761,-1.785714
0,Gateway,701,104,"+:34,0:28,-:42",0.307702,0.357419,-0.03736,0.142315


In [11]:
# RMSE of each regressor, one row per stage split (train: stageid <= k, test: stageid > k)
df_stage_rmse

Unnamed: 0,runid,trainsize,testsize,testdistribution,lasso,ridge,rf,xgb
0,stage0,153,652,"+:221,0:167,-:264",4.852986,4.960882,5.019123,5.130541
0,stage1,288,517,"+:186,0:136,-:195",4.68336,4.675763,5.190685,5.534386
0,stage2,421,384,"+:140,0:112,-:132",4.702512,4.629679,5.037207,5.152965
0,stage3,547,258,"+:91,0:89,-:78",4.65981,4.736646,4.670284,4.869058
0,stage4,657,148,"+:48,0:53,-:47",5.008392,4.4395,4.716051,4.751165
0,stage5,725,80,"+:26,0:29,-:25",4.612373,4.083631,5.149763,4.754257
0,stage6,767,38,"+:11,0:13,-:14",3.336107,2.912477,4.114751,5.132883
0,stage7,789,16,"+:4,0:4,-:8",2.819208,2.153756,2.91243,3.024083


In [12]:
# R2 score of each regressor, one row per stage split (train: stageid <= k, test: stageid > k)
df_stage_r2

Unnamed: 0,runid,trainsize,testsize,testdistribution,lasso,ridge,rf,xgb
0,stage0,153,652,"+:221,0:167,-:264",-0.045139,-0.092129,-0.117923,-0.168106
0,stage1,288,517,"+:186,0:136,-:195",-0.001146,0.002099,-0.229792,-0.398045
0,stage2,421,384,"+:140,0:112,-:132",0.075136,0.103563,-0.061201,-0.110535
0,stage3,547,258,"+:91,0:89,-:78",0.012364,-0.020475,0.007919,-0.078327
0,stage4,657,148,"+:48,0:53,-:47",-0.047328,0.177086,0.071369,0.05749
0,stage5,725,80,"+:26,0:29,-:25",0.163902,0.344608,-0.042276,0.111672
0,stage6,767,38,"+:11,0:13,-:14",0.249573,0.428055,-0.141605,-0.776444
0,stage7,789,16,"+:4,0:4,-:8",-0.121031,0.345731,-0.196395,-0.289884
