### stage_model_regressor

prediction models of chg_of_rank_in_stage on stage dataset

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math


In [2]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.linear_model.ridge import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.svm.classes import SVR
from sklearn.utils import shuffle
from sklearn import metrics
import xgboost as xgb



In [9]:
# bulid regression model
regressors = ['currank','avgrank','dice','lasso','ridge','rf','svr','xgb']
def get_regressor(regressor = 'lr'):
    if regressor == "lasso":
        clf = LassoCV(cv=5, random_state=0)
    elif regressor == "ridge":
        clf = RidgeCV(alphas=np.logspace(-6, 6, 13))
    elif regressor == "rf":
        clf = RandomForestRegressor(n_estimators=100)
    elif regressor == 'svr':
        clf = SVR(kernel='rbf')
    elif regressor == 'xgb':
        clf = xgb.XGBRegressor(objective="reg:linear", random_state=42, max_depth=3)
    elif regressor == 'dice':
        clf = RandomDice('1234')
    elif regressor == 'currank':
        clf = CurRank()
    elif regressor == 'avgrank':
        clf = AverageRank()        
    else:
        clf = None
        
    return clf


class CurRank():
    """
    predict with current rank
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = [0 for x in range(test_x.shape[0])]
        return np.array(pred_y)
    
class AverageRank():
    """
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    change_in_rank_all = test[:,15]
    pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in change_in_rank_all])
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            #13, change_in_rank_all
            pred_y.append(x[13])
        #pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in pred_y])
        pred_y_avg = pred_y
        return np.array(pred_y_avg)   

class RandomDice():
    """
    a random dice model
    """
    def __init__(self, seed='1234'):
        self.dist = []
        self.val = []
        random.seed(seed)
    
    def fit(self, x, y):
        total = y.shape[0]
        yval = set(y)
        
        ratio = 0.
        for val in yval:
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)
            
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            dice = random.random()
            #search in self.dist
            find_idx = -1
            for idx, ratio in enumerate(self.dist):
                if dice <= ratio:
                    find_idx = idx
                    break
            
            #or the last one match
            pred_y.append(self.val[find_idx])
            
        return np.array(pred_y)

def evaluate(test_y, pred_y):
    mae = metrics.mean_absolute_error(test_y, pred_y) 
    rmse = math.sqrt(metrics.mean_squared_error(test_y, pred_y))
    r2 = metrics.r2_score(test_y, pred_y)
    print('rmse=%.2f, mae=%.2f, r2=%.2f'%(rmse, mae, r2))
    return mae,rmse, r2
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    split by eventid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['eventid'] != eventid].to_numpy()
    test  = stagedata[stagedata['eventid'] == eventid].to_numpy()

    #2:car_number
    train_x = train[:,2:]
    #train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    train_y = train[:,1]
    test_x = test[:,2:]
    #test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    test_y = test[:,1]
    
    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    split by stageid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['stageid'] <= stageid].to_numpy()
    test  = stagedata[stagedata['stageid'] > stageid].to_numpy()

    train_x = train[:,2:]
    #train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    train_y = train[:,1]
    test_x = test[:,2:]
    #test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    test_y = test[:,1]
    
    return train, test, train_x, train_y, test_x, test_y


def regressor_model(name='svr'):
    ### test learning models
    print('[*] predict with %s model'%name)
    clf = get_regressor(name)
    clf.fit(train_x, train_y)

    pred_y = clf.predict(test_x)
    score = evaluate(test_y, pred_y)
    return score

In [4]:
#load data
_trim = 2
suffix='withneighbor-newfeatures'
stagedata = pd.read_csv('stage-%s-%s-t%s.csv'%('indy500-2013-2018',suffix, _trim))
stagedata.fillna(0, inplace=True)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1521 entries, 0 to 1520
Data columns (total 35 columns):
Unnamed: 0                   1521 non-null int64
target                       1521 non-null int64
eventid                      1521 non-null int64
car_number                   1521 non-null int64
stageid                      1521 non-null int64
firststage                   1521 non-null int64
pit_in_caution               1521 non-null int64
start_position               1521 non-null int64
start_rank                   1521 non-null int64
start_rank_ratio             1521 non-null float64
top_pack                     1521 non-null int64
bottom_pack                  1521 non-null int64
average_rank                 1521 non-null float64
average_rank_all             1521 non-null float64
change_in_rank               1521 non-null int64
change_in_rank_all           1521 non-null float64
rate_of_change               1521 non-null int64
rate_of_change_all           1521 non-null float64
l

### model on data split by event

In [10]:
cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(regressors)
print('cols:%s'%cols)
retdf0 = pd.DataFrame([],columns=cols)
retdf1 = pd.DataFrame([],columns=cols)

events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(eventsname)}
for eventid in events:
    print('Testset = %s'%eventsname[eventid])
    
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
    #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
    #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))
    
    #record
    rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    
    acc0 = [0 for x in range(len(regressors))]
    acc1 = [0 for x in range(len(regressors))]
    for idx, clf in enumerate(regressors):
        acc = regressor_model(clf)
        acc0[idx] = acc[0]
        acc1[idx] = acc[2]

    rec0.extend(acc0)
    rec1.extend(acc1)
    #print('rec:%s'%rec)
    
    #new df
    df = pd.DataFrame([rec0],columns=cols)
    retdf0 = pd.concat([retdf0, df])        
    
    df = pd.DataFrame([rec1],columns=cols)
    retdf1 = pd.concat([retdf1, df])        

    
retdf0.to_csv('regressors_stagedata_splitbyevent%s_rmse.csv'%suffix)
retdf1.to_csv('regressors_stagedata_splitbyevent%s_r2.csv'%suffix)

df_event_rmse = retdf0
df_event_r2 = retdf1

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2013
[*] predict with currank model
rmse=7.17, mae=4.96, r2=-0.00
[*] predict with avgrank model
rmse=9.22, mae=6.21, r2=-0.66
[*] predict with dice model
rmse=9.57, mae=6.90, r2=-0.78
[*] predict with lasso model
rmse=6.24, mae=4.76, r2=0.24
[*] predict with ridge model
rmse=6.29, mae=4.81, r2=0.23
[*] predict with rf model
rmse=6.47, mae=4.85, r2=0.18
[*] predict with svr model
rmse=6.80, mae=4.82, r2=0.10
[*] predict with xgb model
rmse=6.29, mae=4.62, r2=0.23
Testset = Indy500-2014
[*] predict with currank model
rmse=4.90, mae=3.09, r2=-0.00
[*] predict with avgrank model
rmse=5.77, mae=3.92, r2=-0.39
[*] predict with dice model
rmse=8.20, mae=5.63, r2=-1.81
[*] predict with lasso model
rmse=4.84, mae=3.51, r2=0.02
[*] predict with ridge model
rmse=4.80, mae=3.46, r2=0.04
[*] predict with rf model
rmse=4.89, mae=3.69, r2=0.00
[*] predict 

In [11]:
df_event_rmse

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Indy500-2013,1290,231,"+:93,0:34,-:104",4.95671,6.206918,6.904762,4.756246,4.813451,4.846883,4.816488,4.61578
0,Indy500-2014,1260,261,"+:76,0:44,-:141",3.088123,3.918671,5.628352,3.508939,3.461968,3.689962,3.101193,3.573788
0,Indy500-2015,1290,231,"+:80,0:37,-:114",3.688312,4.608115,5.91342,3.366289,3.443093,3.888095,3.460813,4.047645
0,Indy500-2016,1229,292,"+:108,0:46,-:138",4.342466,5.189677,6.657534,4.14726,4.128352,3.963151,4.148108,4.170461
0,Indy500-2017,1240,281,"+:86,0:67,-:128",3.459075,4.149606,5.868327,3.963616,3.93353,3.571103,3.479797,4.143002
0,Indy500-2018,1296,225,"+:92,0:34,-:99",4.666667,6.006571,6.711111,4.542709,4.543093,5.14,4.481758,4.862362


In [12]:
### train 2013-2017
#load data
_trim = 2
suffix='withneighbor-newfeatures'
stagedata = pd.read_csv('stage-%s-%s-t%s.csv'%('indy500-2013-2019',suffix, _trim))

stagedata.fillna(0, inplace=True)

cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(regressors)
print('cols:%s'%cols)
retdf0 = pd.DataFrame([],columns=cols)
retdf1 = pd.DataFrame([],columns=cols)

events = set(stagedata['eventid'])

years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
eventsname = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(eventsname)}

#first 
eventid = events_id['Indy500-2018']
ignore_eventid = events_id['Indy500-2019']
stdata_2018 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

train, test, train_x, train_y, test_x, test_y = split_by_eventid(stdata_2018, eventid)
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
#print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
#      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

#record
rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

acc0 = [0 for x in range(len(regressors))]
acc1 = [0 for x in range(len(regressors))]
for idx, clf in enumerate(regressors):
    acc = regressor_model(clf)
    acc0[idx] = acc[0]
    acc1[idx] = acc[2]

rec0.extend(acc0)
rec1.extend(acc1)
#print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec0],columns=cols)
retdf0 = pd.concat([retdf0, df])        


#second 
eventid = events_id['Indy500-2019']
ignore_eventid = events_id['Indy500-2018']
stdata_2019 = stagedata[stagedata['eventid']!=ignore_eventid]

print('Testset = %s'%eventsname[eventid])

train, test, train_x, train_y, test_x, test_y = split_by_eventid(stdata_2019, eventid)
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
#print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
#      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))

#record
rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

acc0 = [0 for x in range(len(regressors))]
acc1 = [0 for x in range(len(regressors))]
for idx, clf in enumerate(regressors):
    acc = regressor_model(clf)
    acc0[idx] = acc[0]
    acc1[idx] = acc[2]

rec0.extend(acc0)
rec1.extend(acc1)
#print('rec:%s'%rec)

#new df
df = pd.DataFrame([rec0],columns=cols)
retdf0 = pd.concat([retdf0, df])    

retdf0

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Indy500-2018
[*] predict with currank model
rmse=7.01, mae=4.67, r2=-0.00
[*] predict with avgrank model
rmse=8.69, mae=6.01, r2=-0.54
[*] predict with dice model
rmse=8.85, mae=6.71, r2=-0.59
[*] predict with lasso model
rmse=6.11, mae=4.54, r2=0.24
[*] predict with ridge model
rmse=6.12, mae=4.54, r2=0.24
[*] predict with rf model
rmse=6.91, mae=5.08, r2=0.03
[*] predict with svr model
rmse=6.55, mae=4.48, r2=0.13
[*] predict with xgb model
rmse=6.89, mae=4.86, r2=0.03
Testset = Indy500-2019
[*] predict with currank model
rmse=6.77, mae=4.27, r2=-0.00
[*] predict with avgrank model
rmse=8.15, mae=5.42, r2=-0.45
[*] predict with dice model
rmse=9.33, mae=6.37, r2=-0.90
[*] predict with lasso model
rmse=6.63, mae=4.90, r2=0.04
[*] predict with ridge model
rmse=6.66, mae=4.93, r2=0.03
[*] predict with rf model
rmse=6.71, mae=4.93, r2=0.02
[*] predict 

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Indy500-2018,1296,225,"+:92,0:34,-:99",4.666667,6.006571,6.711111,4.542709,4.543093,5.078533,4.481758,4.862362
0,Indy500-2019,1296,250,"+:80,0:50,-:120",4.272,5.424656,6.372,4.904459,4.932485,4.92892,4.153423,5.141489


In [13]:
retdf0.to_csv('stint_regressor_result_t2013-2017.csv', float_format='%.3f')