## Evalute a lstmts model on rank prediction by lap time 

### The Problem

Rank is calculated by the elapsed time when car crosses the start-finish line. The order of the cars for the same lap number is its rank.

This experiment evaluates the performance of a model which predicts the lap time of the next lap, compared with a navie prediction.





In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from keras.models import load_model
from indycar.notebook import *

os.environ["CUDA_VISIBLE_DEVICES"]="7"


Using TensorFlow backend.


## Load Data

In [2]:
import os
os.getcwd()


'/scratch/hpda/indycar/predictor/notebook/CalculateRank'

In [3]:
#
# model trained in 'lstmts_20172018-completedcars' 
#
D = 5
datalist = ['2017-Indy500-completed_laps_diff.csv','2018-Indy500-completed_laps_diff.csv' ]
model_name = 'lstmts'
output_prefix = 'indy500-train2017-test2018-completed_laps-M%s-D%d'%(model_name, D)
pred_outputfile = output_prefix + '-pred.csv'
model_outputfile = output_prefix + '-model.h5'
trainhist_outputfile = output_prefix + '-trainhist.jpg'
eval_outputfile = output_prefix + '-eval.csv'

lstmts_model = load_model(model_outputfile)

scaler, dataset, dblens = load_data(datalist)

dataset.info(verbose=True)
print('dataset shape', dataset.shape)



load 2017-Indy500-completed_laps_diff.csv, len=3216
load 2018-Indy500-completed_laps_diff.csv, len=3618
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6834 entries, 0 to 3617
Data columns (total 10 columns):
MyIdx             6834 non-null int64
car_number        6834 non-null int64
completed_laps    6834 non-null int64
rank              6834 non-null int64
elapsed_time      6834 non-null float64
rank_diff         6834 non-null float64
time_diff         6834 non-null float64
dbid              6834 non-null int64
rank_diff_raw     6834 non-null float64
time_diff_raw     6834 non-null float64
dtypes: float64(5), int64(5)
memory usage: 587.3 KB
dataset shape (6834, 10)


In [4]:
# generate train/test dataset

# post-padding with 0
X, y, w = generate_data(dataset, D=D, target='time')

total = np.sum(dblens)
ntrain = np.sum(dblens[:-1])
#nval = int(dblens[-1] / 2)
nval = 0
ntest = total - ntrain - nval


print('train=%d, val=%d, test=%d'%(ntrain, nval, total-ntrain-nval))

X_train, X_val, X_test = X[:ntrain], X[ntrain:ntrain + nval], X[ntrain + nval:]
y_train, y_val, y_test = y[:ntrain], y[ntrain:ntrain + nval], y[ntrain + nval:]
#weights
w_train, w_val, w_test = w[:ntrain], w[ntrain:ntrain + nval], w[ntrain+nval:]
print('train shape:', X_train.shape)

carNumber = 34, max T =201
train=16, val=0, test=18
train shape: (16, 196, 1)


### predict

In [5]:
#
# parameters
#
def load_pred_fromfile():
    #year = '2017'
    year = '2018'
    #event = 'Toronto'
    event = 'indy500'
    #'indy500-2018-completed_laps'
    inputfile = event +'-' + year + '-completed_laps-pred.csv'
    outputprefix = year +'-' + event + '-eval-'
    pred = pd.read_csv(inputfile)
    pred_timediff = pred['pred']
    return pred_timediff

In [6]:
# X_test, y_test, w_test
#
#pred_timediff = load_pred_fromfile()
lstmts_result = predict('lstmts', lstmts_model, X_test, y_test, scaler)
pred_timediff = lstmts_result[1][:,1]

lstmts model mae=0.065154, raw mae=6.575450, raw mape=9.081444


In [7]:
print('pred shape',pred_timediff.shape)
pred_timediff[:4]

pred shape (3528,)


array([27.20769278, 35.47653671, 39.83164303, 41.75833875])

In [8]:
# get true/predicted elapsedtime
def get_elapsedtime(dataset, pred_timediff, testsize, D):
    """
    dataset ; raw data frame
    pred_timediff;  prediction result of lap time
    testsize ; size of testset
    """
    n = pred_timediff.shape[0]

    cars = []
    groups = []
    for car, group in dataset.groupby('car_number'):
        cars.append(car)
        groups.append(groups)

    testset = cars[-testsize:]
    rankdata = dataset[dataset['car_number'].isin(testset)]
    carnum = len(testset)
    print('testset car number', carnum)

    print('rankdata shape', rankdata.shape)
    rankdata[:4]

    # reshape to <carno, data>
    true_elapsedtime = np.array(rankdata['elapsed_time']).reshape((carnum, -1))
    pred_laptime = np.array(pred_timediff).reshape((carnum, -1))

    pred_elapsedtime = np.zeros_like(true_elapsedtime)
    pred_elapsedtime[:,D:] = true_elapsedtime[:,0:-D]
    pred_elapsedtime[:,D:] += pred_laptime
    
    return true_elapsedtime,pred_elapsedtime

In [9]:
true_elapsedtime, pred_elapsedtime = get_elapsedtime(dataset, pred_timediff, ntest, D)
print(true_elapsedtime[0:2,:10])
print(pred_elapsedtime[0:2,:10])

testset car number 18
rankdata shape (3618, 10)
[[  0.415   42.7829  83.8813 124.9756 166.0936 207.1653 248.3523 289.6172
  330.9584 372.3228]
 [  1.4313  46.9758  88.5802 130.104  171.3932 212.7971 254.3784 295.9577
  337.8812 379.6526]]
[[  0.           0.           0.           0.           0.
   27.62269278  78.25943671 123.71294303 166.73393875 208.34503361]
 [  0.           0.           0.           0.           0.
   28.63899278  82.64872649 128.54215698 171.9548852  213.70780401]]


In [10]:
#np.array(rankdata[rankdata['completed_laps']<7]['rank']).reshape((ntest,-1))

In [11]:
# rank caculation
# evalutate rank after 5 laps
idx = np.argsort(true_elapsedtime, axis=0)
true_rank = np.argsort(idx, axis=0)
idx = np.argsort(pred_elapsedtime, axis=0)
pred_rank = np.argsort(idx, axis=0)

In [12]:
# simple model
simple_rank = np.zeros_like(true_rank)
simple_rank[:,D:] = true_rank[:,0:-D] 
simple_rank[:,28:35]

array([[ 3,  3,  3,  3,  3,  3,  3],
       [ 7,  7,  7,  7,  7,  7,  7],
       [ 9,  9,  9,  9,  9,  9,  9],
       [ 4,  4,  4,  4,  4,  4,  4],
       [ 2,  2,  2,  2,  2,  2,  2],
       [13, 13, 13, 12, 12, 12, 12],
       [ 0,  0,  0,  0,  0,  0,  0],
       [ 1,  1,  1,  1,  1,  1,  1],
       [ 8,  8,  8,  8,  8,  8,  8],
       [11, 11, 11, 11, 11, 11, 11],
       [15, 15, 15, 15, 14, 14, 14],
       [ 6,  6,  6,  6,  6,  6,  6],
       [10, 10, 10, 10, 10, 10, 10],
       [17, 17, 17, 17, 17, 17, 17],
       [14, 14, 14, 14, 15, 15, 15],
       [16, 16, 16, 16, 16, 16, 16],
       [12, 12, 12, 13, 13, 13, 13],
       [ 5,  5,  5,  5,  5,  5,  5]])

In [13]:
true_rank[:,28:35]

array([[ 3,  3,  1,  0,  1,  7,  2],
       [ 7,  7,  7,  7,  0,  0,  9],
       [ 9,  9,  9,  8,  5, 14, 14],
       [ 4,  4,  4,  2,  3, 11,  5],
       [ 2,  2,  3,  1,  2, 10,  4],
       [12, 12, 12, 11,  8,  3,  0],
       [ 0,  0,  0,  5, 12,  6,  1],
       [ 1,  1,  2,  6, 13,  8,  3],
       [ 8,  8,  8, 12, 16, 15,  8],
       [11, 11, 11, 10,  7,  2, 11],
       [14, 14, 14, 14,  9,  4, 12],
       [ 6,  6,  6,  4, 10, 13,  7],
       [10, 10, 10,  9,  6,  1, 10],
       [17, 17, 16, 16, 14,  9, 15],
       [15, 15, 15, 15, 11,  5, 13],
       [16, 16, 17, 17, 15, 17, 17],
       [13, 13, 13, 13, 17, 16, 16],
       [ 5,  5,  5,  3,  4, 12,  6]])

In [14]:
pred_rank[:,28:35]

array([[ 3,  3,  3,  3,  3,  3,  3],
       [ 7,  7,  7,  7,  7,  7,  7],
       [ 9,  9,  9,  9,  9,  9,  9],
       [ 4,  4,  4,  4,  4,  4,  4],
       [ 2,  2,  2,  2,  2,  2,  2],
       [13, 13, 13, 12, 12, 12, 12],
       [ 0,  0,  0,  0,  0,  0,  0],
       [ 1,  1,  1,  1,  1,  1,  1],
       [ 8,  8,  8,  8,  8,  8,  8],
       [11, 11, 11, 11, 11, 11, 11],
       [15, 15, 15, 15, 14, 14, 14],
       [ 6,  6,  6,  6,  6,  6,  6],
       [10, 10, 10, 10, 10, 10, 10],
       [17, 17, 17, 17, 17, 17, 16],
       [14, 14, 14, 14, 15, 15, 15],
       [16, 16, 16, 16, 16, 16, 17],
       [12, 12, 12, 13, 13, 13, 13],
       [ 5,  5,  5,  5,  5,  5,  5]])

### evluate the rank

In [15]:
# 
def eval_model(name, trueth, pred, startlap = 5):
    match_mask = (trueth == pred)
    print('eval on model: ', name)
    accuracy = np.sum(match_mask)/(match_mask.shape[1]*match_mask.shape[0])
    print('accuracy:', accuracy)

    top1 = np.logical_and((trueth==0),match_mask)[:,startlap:]
    top1_accuracy = np.sum(top1)/top1.shape[1]
    print('top1 accuracy:', top1_accuracy)
    
    top5 = np.logical_and((trueth < 5),match_mask)[:,startlap:]
    top5_accuracy = np.sum(top5)/top5.shape[1]/5
    print('top5 accuracy:', top5_accuracy)
    # precision is more useful
    top5 = np.logical_and((trueth < 5),(pred < 5))[:,startlap:]
    top5_precision = np.sum(top5)/top5.shape[1]/5
    print('top5 precision:', top5_precision)
    
    return accuracy,top1_accuracy,top5_accuracy,top5_precision
    
models ={'lstmts':pred_rank, 'simple':simple_rank}
ret = []
for m in models:
    ret.append(eval_model(m, true_rank, models[m]))
    
result = np.array(ret)
#ret1 = eval_model('lstmts', true_rank, pred_rank)
#ret2 = eval_model('simple', true_rank, simple_rank)

df = pd.DataFrame({'model':list(models.keys()), 'accuracy':result[:,0],
                   'top1_accuracy':result[:,1],'top5_accuracy':result[:,2],
                   'top5_precision':result[:,3]})
df
df.to_csv(eval_outputfile)


eval on model:  lstmts
accuracy: 0.37258153676064126
top1 accuracy: 0.6020408163265306
top5 accuracy: 0.45510204081632655
top5 precision: 0.7183673469387755
eval on model:  simple
accuracy: 0.4508015478164732
top1 accuracy: 0.6989795918367347
top5 accuracy: 0.5714285714285714
top5 precision: 0.7887755102040817


In [16]:
df

Unnamed: 0,model,accuracy,top1_accuracy,top5_accuracy,top5_precision
0,lstmts,0.372582,0.602041,0.455102,0.718367
1,simple,0.450802,0.69898,0.571429,0.788776
