### stage_model_regressor

prediction models of chg_of_rank_in_stage on stage dataset

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math

# to use only one GPU.
# use this on r-001
# otherwise comment
import os
#os.environ["CUDA_VISIBLE_DEVICES"]="7"

In [2]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.linear_model.ridge import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.svm.classes import SVR
from sklearn.utils import shuffle
from sklearn import metrics
import xgboost as xgb

In [3]:
# bulid regression model
regressors = ['currank','avgrank','dice','lasso','ridge','rf','svr','xgb']
def get_regressor(regressor = 'lr'):
    if regressor == "lasso":
        clf = LassoCV(cv=5, random_state=0)
    elif regressor == "ridge":
        clf = RidgeCV(alphas=np.logspace(-6, 6, 13))
    elif regressor == "rf":
        clf = RandomForestRegressor(n_estimators=100)
    elif regressor == 'svr':
        clf = SVR(kernel='rbf')
    elif regressor == 'xgb':
        clf = xgb.XGBRegressor(objective="reg:linear", random_state=42, max_depth=3)
    elif regressor == 'dice':
        clf = RandomDice('1234')
    elif regressor == 'currank':
        clf = CurRank()
    elif regressor == 'avgrank':
        clf = AverageRank()        
    else:
        clf = None
        
    return clf


class CurRank():
    """
    predict with current rank
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = [0 for x in range(test_x.shape[0])]
        return np.array(pred_y)
    
class AverageRank():
    """
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    change_in_rank_all = test[:,15]
    pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in change_in_rank_all])
    """
    def __init__(self):
        pass
    def fit(self, x, y):
        pass
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            #13, change_in_rank_all
            pred_y.append(x[13])
        #pred_y_avg = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in pred_y])
        pred_y_avg = pred_y
        return np.array(pred_y_avg)   

class RandomDice():
    """
    a random dice model
    """
    def __init__(self, seed='1234'):
        self.dist = []
        self.val = []
        random.seed(seed)
    
    def fit(self, x, y):
        total = y.shape[0]
        yval = set(y)
        
        ratio = 0.
        for val in yval:
            self.val.append(val)
            ratio += np.sum(y==val)*1.0 / total
            self.dist.append(ratio)
            
    def predict(self, test_x):
        pred_y = []
        for x in test_x:
            dice = random.random()
            #search in self.dist
            find_idx = -1
            for idx, ratio in enumerate(self.dist):
                if dice <= ratio:
                    find_idx = idx
                    break
            
            #or the last one match
            pred_y.append(self.val[find_idx])
            
        return np.array(pred_y)

def evaluate(test_y, pred_y):
    mae = metrics.mean_absolute_error(test_y, pred_y) 
    rmse = math.sqrt(metrics.mean_squared_error(test_y, pred_y))
    r2 = metrics.r2_score(test_y, pred_y)
    print('rmse=%.2f, mae=%.2f, r2=%.2f'%(rmse, mae, r2))
    return rmse, mae, r2
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    split by eventid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['eventid'] != eventid].to_numpy()
    test  = stagedata[stagedata['eventid'] == eventid].to_numpy()

    #2:car_number
    train_x = train[:,2:]
    #train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    train_y = train[:,1]
    test_x = test[:,2:]
    #test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    test_y = test[:,1]
    
    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    split by stageid
    """
    #if not eventid in stagedata:
    #    print('error, %d not found in stagedata'%eventid)
    #    return
    
    train = stagedata[stagedata['stageid'] <= stageid].to_numpy()
    test  = stagedata[stagedata['stageid'] > stageid].to_numpy()

    train_x = train[:,2:]
    #train_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in train[:,1]])
    train_y = train[:,1]
    test_x = test[:,2:]
    #test_y = np.array([1 if x > 0 else (-1 if x < 0 else 0) for x in test[:,1]])
    test_y = test[:,1]
    
    return train, test, train_x, train_y, test_x, test_y


def regressor_model(name='svr'):
    ### test learning models
    print('[*] predict with %s model'%name)
    clf = get_regressor(name)
    clf.fit(train_x, train_y)

    pred_y = clf.predict(test_x)
    score = evaluate(test_y, pred_y)
    return score

In [4]:
#load data
suffix='-withneighbor-newfeatures-timediff'
stagedata = pd.read_csv('stage-2018%s.csv'%suffix)
stagedata.fillna(0, inplace=True)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 805 entries, 0 to 804
Data columns (total 35 columns):
Unnamed: 0                   805 non-null int64
target                       805 non-null int64
eventid                      805 non-null int64
car_number                   805 non-null int64
stageid                      805 non-null int64
firststage                   805 non-null int64
pit_in_caution               805 non-null int64
start_position               805 non-null int64
start_rank                   805 non-null int64
start_rank_ratio             805 non-null float64
top_pack                     805 non-null int64
bottom_pack                  805 non-null int64
average_rank                 805 non-null float64
average_rank_all             805 non-null float64
change_in_rank               805 non-null int64
change_in_rank_all           805 non-null float64
rate_of_change               805 non-null int64
rate_of_change_all           805 non-null float64
laptime_green_mean_pr

### model on data split by event

In [5]:
cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(regressors)
print('cols:%s'%cols)
retdf0 = pd.DataFrame([],columns=cols)
retdf1 = pd.DataFrame([],columns=cols)

eventsname = ['Phoenix','Indy500','Texas','Iowa','Pocono','Gateway']
events = set(stagedata['eventid'])
for eventid in events:
    print('Testset = %s'%eventsname[eventid])
    
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
    #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
    #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))
    
    #record
    rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
    
    acc0 = [0 for x in range(len(regressors))]
    acc1 = [0 for x in range(len(regressors))]
    for idx, clf in enumerate(regressors):
        acc0[idx] = regressor_model(clf)[0]
        acc1[idx] = regressor_model(clf)[2]

    rec0.extend(acc0)
    rec1.extend(acc1)
    #print('rec:%s'%rec)
    
    #new df
    df = pd.DataFrame([rec0],columns=cols)
    retdf0 = pd.concat([retdf0, df])        
    
    df = pd.DataFrame([rec1],columns=cols)
    retdf1 = pd.concat([retdf1, df])        

    
retdf0.to_csv('regressors_stagedata_splitbyevent%s_rmse.csv'%suffix)
retdf1.to_csv('regressors_stagedata_splitbyevent%s_r2.csv'%suffix)

df_event_rmse = retdf0
df_event_r2 = retdf1

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lasso', 'ridge', 'rf', 'svr', 'xgb']
Testset = Phoenix
[*] predict with currank model
rmse=5.43, mae=3.66, r2=-0.00
[*] predict with currank model
rmse=5.43, mae=3.66, r2=-0.00
[*] predict with avgrank model
rmse=6.77, mae=4.63, r2=-0.56
[*] predict with avgrank model
rmse=6.77, mae=4.63, r2=-0.56
[*] predict with dice model
rmse=7.16, mae=5.38, r2=-0.74
[*] predict with dice model
rmse=7.16, mae=5.38, r2=-0.74
[*] predict with lasso model
rmse=4.98, mae=3.51, r2=0.16
[*] predict with lasso model
rmse=4.98, mae=3.51, r2=0.16
[*] predict with ridge model
rmse=5.03, mae=3.57, r2=0.14
[*] predict with ridge model
rmse=5.03, mae=3.57, r2=0.14
[*] predict with rf model
rmse=5.09, mae=3.77, r2=0.12
[*] predict with rf model
rmse=4.99, mae=3.59, r2=0.16
[*] predict with svr model
rmse=5.42, mae=3.66, r2=0.00
[*] predict with svr model
rmse=5.42, mae=3.66, r2=0.00
[*] predict with xgb model




rmse=5.11, mae=3.65, r2=0.11
[*] predict with xgb model
rmse=5.11, mae=3.65, r2=0.11
Testset = Indy500
[*] predict with currank model
rmse=5.75, mae=4.32, r2=-0.00
[*] predict with currank model
rmse=5.75, mae=4.32, r2=-0.00
[*] predict with avgrank model
rmse=7.33, mae=5.59, r2=-0.63
[*] predict with avgrank model
rmse=7.33, mae=5.59, r2=-0.63
[*] predict with dice model
rmse=6.83, mae=5.26, r2=-0.42
[*] predict with dice model
rmse=6.83, mae=5.26, r2=-0.42
[*] predict with lasso model
rmse=5.48, mae=4.36, r2=0.09
[*] predict with lasso model
rmse=5.48, mae=4.36, r2=0.09
[*] predict with ridge model
rmse=5.48, mae=4.28, r2=0.09
[*] predict with ridge model
rmse=5.48, mae=4.28, r2=0.09
[*] predict with rf model
rmse=5.59, mae=4.25, r2=0.05
[*] predict with rf model
rmse=5.51, mae=4.17, r2=0.08
[*] predict with svr model
rmse=5.75, mae=4.32, r2=-0.00
[*] predict with svr model
rmse=5.75, mae=4.32, r2=-0.00
[*] predict with xgb model
rmse=5.60, mae=4.27, r2=0.05
[*] predict with xgb mode



rmse=3.00, mae=2.30, r2=-0.18
[*] predict with lasso model
rmse=3.00, mae=2.30, r2=-0.18
[*] predict with ridge model
rmse=3.36, mae=2.45, r2=-0.47
[*] predict with ridge model
rmse=3.36, mae=2.45, r2=-0.47
[*] predict with rf model
rmse=3.00, mae=2.29, r2=-0.17
[*] predict with rf model
rmse=3.28, mae=2.58, r2=-0.40
[*] predict with svr model
rmse=2.81, mae=1.88, r2=-0.03
[*] predict with svr model
rmse=2.81, mae=1.88, r2=-0.03
[*] predict with xgb model
rmse=2.83, mae=2.12, r2=-0.04
[*] predict with xgb model
rmse=2.83, mae=2.12, r2=-0.04
Testset = Gateway
[*] predict with currank model
rmse=3.12, mae=2.14, r2=-0.00
[*] predict with currank model
rmse=3.12, mae=2.14, r2=-0.00
[*] predict with avgrank model
rmse=4.34, mae=3.10, r2=-0.93
[*] predict with avgrank model
rmse=4.34, mae=3.10, r2=-0.93
[*] predict with dice model
rmse=5.69, mae=3.93, r2=-2.32
[*] predict with dice model
rmse=5.69, mae=3.93, r2=-2.32
[*] predict with lasso model
rmse=2.98, mae=2.22, r2=0.09
[*] predict with 

### model on data split by stage

In [6]:
retdf0 = pd.DataFrame([],columns=cols)
retdf1 = pd.DataFrame([],columns=cols)

for stageid in range(8):
    train, test, train_x, train_y, test_x, test_y =split_by_stageid(stagedata, stageid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
    #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
    #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))
    
    #record
    rec0 = ['stage%d'%stageid,train_x.shape[0],test_x.shape[0],test_distribution]
    rec1 = ['stage%d'%stageid,train_x.shape[0],test_x.shape[0],test_distribution]
    
    acc0 = [0 for x in range(len(regressors))]
    acc1 = [0 for x in range(len(regressors))]
    for idx, clf in enumerate(regressors):
        acc0[idx] = regressor_model(clf)[0]
        acc1[idx] = regressor_model(clf)[2]

    rec0.extend(acc0)
    rec1.extend(acc1)
    #print('rec:%s'%rec)
    
    #new df
    df = pd.DataFrame([rec0],columns=cols)
    retdf0 = pd.concat([retdf0, df])  
    
    df = pd.DataFrame([rec1],columns=cols)
    retdf1 = pd.concat([retdf1, df])  

retdf0.to_csv('regressor_stagedata_splitbystage%s_rmse.csv'%suffix)
retdf1.to_csv('regressor_stagedata_splitbystage%s_r2.csv'%suffix)

df_stage_rmse = retdf0
df_stage_r2 = retdf1

[*] predict with currank model
rmse=4.36, mae=2.83, r2=-0.01
[*] predict with currank model
rmse=4.36, mae=2.83, r2=-0.01
[*] predict with avgrank model
rmse=6.11, mae=4.21, r2=-0.98
[*] predict with avgrank model
rmse=6.11, mae=4.21, r2=-0.98
[*] predict with dice model
rmse=6.93, mae=5.36, r2=-1.55
[*] predict with dice model
rmse=6.93, mae=5.36, r2=-1.55
[*] predict with lasso model
rmse=4.41, mae=3.08, r2=-0.03
[*] predict with lasso model
rmse=4.41, mae=3.08, r2=-0.03
[*] predict with ridge model
rmse=4.23, mae=3.05, r2=0.05
[*] predict with ridge model
rmse=4.23, mae=3.05, r2=0.05
[*] predict with rf model
rmse=4.81, mae=3.50, r2=-0.22
[*] predict with rf model
rmse=4.78, mae=3.53, r2=-0.21
[*] predict with svr model
rmse=4.38, mae=2.90, r2=-0.01
[*] predict with svr model
rmse=4.38, mae=2.90, r2=-0.01
[*] predict with xgb model
rmse=5.23, mae=3.83, r2=-0.45
[*] predict with xgb model
rmse=5.23, mae=3.83, r2=-0.45
[*] predict with currank model
rmse=3.98, mae=2.57, r2=-0.00
[*] p

In [7]:
df_event_rmse

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Phoenix,691,114,"+:39,0:20,-:55",5.428159,6.770844,7.162892,4.976622,5.025101,5.091888,5.418394,5.105846
0,Indy500,580,225,"+:88,0:23,-:114",5.748816,7.326274,6.832276,5.481805,5.48323,5.594891,5.752005,5.602743
0,Texas,678,127,"+:35,0:30,-:62",4.627885,6.539167,6.457005,4.474083,4.756342,5.143519,4.528277,4.910475
0,Iowa,696,109,"+:39,0:25,-:45",3.604279,4.912422,5.908321,3.41208,3.494903,4.134058,3.602442,3.706305
0,Pocono,679,126,"+:38,0:39,-:49",2.808717,3.517821,5.133148,3.004095,3.358958,2.997389,2.806152,2.827545
0,Gateway,701,104,"+:32,0:27,-:45",3.124038,4.335754,5.689903,2.982362,3.058584,3.210711,3.095421,3.096799


In [8]:
df_event_r2

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,Phoenix,691,114,"+:39,0:20,-:55",-0.000316,-0.556387,-0.741842,0.159183,0.142722,0.155055,0.00328,0.11495
0,Indy500,580,225,"+:88,0:23,-:114",-0.003464,-0.629714,-0.417346,0.087585,0.087111,0.077459,-0.004578,0.046883
0,Texas,678,127,"+:35,0:30,-:62",-0.006715,-1.009953,-0.959762,0.059088,-0.063377,-0.237311,0.036155,-0.133413
0,Iowa,696,109,"+:39,0:25,-:45",0.0,-0.85761,-1.687147,0.103807,0.059771,-0.258684,0.001019,-0.057415
0,Pocono,679,126,"+:38,0:39,-:49",-0.029595,-0.615094,-2.438887,-0.177817,-0.472514,-0.402576,-0.027715,-0.043445
0,Gateway,701,104,"+:32,0:27,-:45",-0.001604,-0.929267,-2.322561,0.087183,0.039928,-0.03378,0.016662,0.015787


In [9]:
df_stage_rmse

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,stage0,153,652,"+:213,0:140,-:299",4.356963,6.10683,6.932961,4.411232,4.230496,4.806442,4.37609,5.225326
0,stage1,288,517,"+:164,0:123,-:230",3.97891,4.957904,6.721742,3.778881,3.741922,4.185232,4.00956,4.116786
0,stage2,421,384,"+:113,0:103,-:168",3.849378,4.546786,6.792881,3.560794,3.523728,3.788095,3.851122,3.884518
0,stage3,547,258,"+:70,0:87,-:101",3.538821,4.1186,6.068408,3.334479,3.297435,3.459451,3.558998,3.669002
0,stage4,657,148,"+:31,0:52,-:65",3.093148,3.390338,5.32206,2.922048,2.908027,2.915017,3.029977,2.991442
0,stage5,725,80,"+:18,0:29,-:33",2.865746,3.247557,6.034277,2.608161,2.684805,2.853844,2.810544,3.087544
0,stage6,767,38,"+:6,0:16,-:16",2.299886,2.505592,6.278619,2.159716,2.212388,2.416168,2.212289,2.687286
0,stage7,789,16,"+:2,0:6,-:8",1.75,2.049955,5.695393,1.837558,1.683205,1.823945,1.607084,2.526081


In [10]:
df_stage_r2

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lasso,ridge,rf,svr,xgb
0,stage0,153,652,"+:213,0:140,-:299",-0.005979,-0.976299,-1.547174,-0.031195,0.051574,-0.211707,-0.014831,-0.446931
0,stage1,288,517,"+:164,0:123,-:230",-0.001961,-0.555675,-1.859475,0.096249,0.11384,-0.07942,-0.017457,-0.072603
0,stage2,421,384,"+:113,0:103,-:168",-0.003889,-0.400598,-2.12617,0.14099,0.15878,0.044056,-0.004799,-0.022301
0,stage3,547,258,"+:70,0:87,-:101",-0.002021,-0.357246,-1.946518,0.110358,0.130015,0.031028,-0.013479,-0.077098
0,stage4,657,148,"+:31,0:52,-:65",-0.052238,-0.26415,-2.115101,0.060953,0.069943,0.074812,-0.009697,0.015822
0,stage5,725,80,"+:18,0:29,-:33",-0.043872,-0.340558,-3.628309,0.135349,0.083785,-0.070166,-0.004044,-0.211709
0,stage6,767,38,"+:6,0:16,-:16",-0.105515,-0.312117,-7.239108,0.025133,-0.022997,-0.022425,-0.022906,-0.509315
0,stage7,789,16,"+:2,0:6,-:8",-0.274797,-0.749257,-12.502439,-0.405552,-0.17934,-0.577795,-0.075083,-1.656193


### save the model

In [11]:
import pickle 
eventsname = ['Phoenix','Indy500','Texas','Iowa','Pocono','Gateway']
events = set(stagedata['eventid'])
#for eventid in events:
eventid = 1
name = 'lasso'
valuemodel = 'valuemodel-' + eventsname[eventid] + '-lasso' + '.pkl'
if True:
    print('Testset = %s'%eventsname[eventid])
    
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))
    #print('Testset by stageid= %s, trainsize=%d, testsize=%d, dist=%s'%
    #      (stageid, train_x.shape[0], test_x.shape[0], test_distribution))
    
    print('[*] predict with %s model'%name)
    clf = get_regressor(name)
    clf.fit(train_x, train_y)

    pred_y = clf.predict(test_x).astype(int)
    score = evaluate(test_y, pred_y)
    
    print('rec:', score)
    
    #save the model
    with open(valuemodel, 'wb') as fout:   
        pickle.dump((clf, test_x, test_y), fout)

Testset = Indy500
[*] predict with lasso model
rmse=5.44, mae=4.26, r2=0.10
('rec:', (5.4393627077688675, 4.262222222222222, 0.10165930556988323))
