# Imports

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random


## Load Data

In [2]:
import os
os.getcwd()


'/scratch/hpda/indycar/predictor/notebook/6.SectionRank'

In [3]:
#
# parameters
#
#year = '2017'
year = '2018'
#event = 'Toronto'
event = 'Indy500'

inputfile = '../data/final/C_'+ event +'-' + year + '-final.csv'
outputprefix = year +'-' + event + '-'
dataset = pd.read_csv(inputfile)
dataset.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18500 entries, 0 to 18499
Data columns (total 21 columns):
rank                    18500 non-null int64
car_number              18500 non-null int64
unique_id               18500 non-null object
completed_laps          18500 non-null int64
elapsed_time            18500 non-null float64
last_laptime            18500 non-null float64
lap_status              18500 non-null object
best_laptime            18500 non-null float64
best_lap                18500 non-null object
time_behind_leader      18500 non-null float64
laps_behind_leade       18500 non-null object
time_behind_prec        18500 non-null float64
laps_behind_prec        18500 non-null object
overall_rank            18500 non-null object
overall_best_laptime    18500 non-null float64
current_status          18500 non-null object
track_status            18500 non-null object
pit_stop_count          18500 non-null object
last_pitted_lap         18500 non-null object
start_position

### The Simulator

simple model without DNF

1. laptime, modeled by average lap time on green laps
2. pitstop, uniform distributed in pit window(10 laps)
3. pitime, modeled by inlap, outlap time


In [4]:
#green laps
alldata = dataset.copy()
carnos = np.sort(list(set(alldata.car_number.values)))
rankdata = alldata.rename_axis('MyIdx').sort_values(by=['elapsed_time','MyIdx'], ascending=True)

# since the flag changes in the middle of a lap, != 'Y' does not work here
#greendata = rankdata[rankdata['track_status']!='Y']
yellow_laps = rankdata[rankdata['track_status']=='Y'].completed_laps.values
green_laps = set(rankdata.completed_laps.values) - set(yellow_laps)
greendata = rankdata[rankdata['completed_laps'].isin(green_laps)]

# car_number, startpos, norm_lap, in_lap, out_lap
statdata = np.zeros((len(carnos), 8))
for idx, car in enumerate(carnos):
        thiscar = greendata[greendata['car_number']==car]
        
        pit_laps = thiscar[thiscar['lap_status']=='P'].completed_laps.values
        in_lap = thiscar[thiscar['completed_laps'].isin(pit_laps)].last_laptime.values
        out_laps = [x+1 for x in pit_laps]
        out_lap = thiscar[thiscar['completed_laps'].isin(out_laps)].last_laptime.values
        
        normal_laps = set(thiscar.completed_laps.values) - set(pit_laps) -set(out_laps)
        _laps = [x if x-1 in normal_laps else -1 for x in normal_laps]
        _laps=np.array(_laps)
        normal_laps = _laps[_laps>0]
        norm_lap = thiscar[thiscar['completed_laps'].isin(normal_laps)].last_laptime.values
        
        #save statistics
        statdata[idx, 0] = car
        startPos = thiscar[thiscar['completed_laps']==1].start_position.values[0]
        statdata[idx, 1] = int(startPos, 16)
        statdata[idx, 2] = np.mean(norm_lap)
        statdata[idx, 3] = np.std(norm_lap)
        statdata[idx, 4] = np.mean(in_lap)
        statdata[idx, 5] = np.std(in_lap)
        statdata[idx, 6] = np.mean(out_lap)
        statdata[idx, 7] = np.std(out_lap)
        
df = pd.DataFrame({'car_number':statdata[:,0].astype(int),'start_position':statdata[:,1].astype(int),'norm_lap_mean':statdata[:,2],'norm_lap_std':statdata[:,3],
                   'in_lap_mean':statdata[:,4],'in_lap_std':statdata[:,5],'out_lap_mean':statdata[:,6],'out_lap_std':statdata[:,7]})        
df.to_csv(outputprefix + 'simulator.csv')
simdf = df.copy()

In [5]:
df.head(10)

Unnamed: 0,car_number,in_lap_mean,in_lap_std,norm_lap_mean,norm_lap_std,out_lap_mean,out_lap_std,start_position
0,1,54.339015,0.8424845,41.743706,0.766367,66.104833,0.73025,4
1,3,54.28161,0.7251726,41.611673,0.676782,66.911113,0.401542,8
2,4,54.295602,0.319057,41.946153,0.770234,68.036082,0.729633,11
3,6,64.02726,0.891327,42.018802,0.778819,55.48844,0.238874,18
4,7,76.500685,5.074079,43.346298,1.081907,58.039793,0.983255,28
5,9,54.458102,1.106836,42.022496,0.70125,66.476525,0.545905,9
6,10,67.2981,1.421085e-14,42.409261,1.136934,55.7272,0.0,29
7,12,53.857717,1.671692,41.485985,0.776325,65.957933,0.901091,3
8,13,55.3417,7.105427e-15,41.927896,0.640302,67.6166,0.0,7
9,14,55.449998,1.114515,41.540961,0.629124,66.8767,0.556106,10


In [6]:
# pit window
# the first pit is more reasonable, the pit window should at least to be 7 laps
#
maxpitcnt = max([int(x,16) for x in rankdata.pit_stop_count.values])

for pit in range(1,maxpitcnt):
    pit_laps= np.sort(list(rankdata[(rankdata['pit_stop_count']==('%x'%(pit))) 
                                & (rankdata['lap_status']=='P')].completed_laps.values))
    print('%d:%d, %d'%(pit, min(pit_laps), max(pit_laps)))
    
    

1:29, 36
2:35, 70
3:52, 108
4:60, 141
5:96, 175
6:107, 197
7:120, 196
8:138, 195
9:180, 191


#### Simulator

In [13]:
def run_simulator(rankdata, simdf, savemodel=''):
    """
    input: simdf, rankdata
    
    simulator output the same data format as rankdata(C_xxx)
        #rank, car_number, completed_laps,elapsed_time,last_laptime
        #lap_status,track_status, pit_stop_count, last_pitted_lap
    """
    
    #init
    random.seed()

    maxlaps = max(set(rankdata.completed_laps.values))
    cols=['rank', 'car_number', 'completed_laps','elapsed_time','last_laptime',
          'lap_status','track_status', 'pit_stop_count', 'last_pitted_lap']
    colid={key:idx for idx, key in enumerate(cols)}

    # fixed pit strategy
    # max laps = 38
    # pit window = 8
    # uniform distribution in [last_pit+38-8, last_pit+38]
    pit_maxlaps = 38
    pit_window = 8
    carnos = np.sort(list(set(simdf.car_number.values)))
    #carnos = simdf.car_number.values
    carid = {key:idx for idx, key in enumerate(carnos)}

    data = np.zeros((len(carnos)*maxlaps, len(cols)))
    print('maxlaps=%d, data shape=%s'%(maxlaps, data.shape))    
    
    # fixed pit strategy
    # max laps = 38
    # pit window = 8
    # uniform distribution in [last_pit+38-8, last_pit+38]
    for car in carnos:
        curlap = 0
        pit_cnt = 0
        while curlap < maxlaps:
            #set the next pit lap
            #uniform in [curlap + ]
            right = curlap + pit_maxlaps
            if right > maxlaps:
                # no need to pitstop
                break
            left = curlap + pit_maxlaps - pit_window
            pit_lap = int(random.uniform(left, right))
            #set it
            data[carid[car] * maxlaps + pit_lap, colid['lap_status']] = 1
            data[carid[car] * maxlaps + pit_lap, colid['pit_stop_count']] = pit_cnt
            data[carid[car] * maxlaps + pit_lap, colid['last_pitted_lap']] = pit_lap

            pit_cnt += 1
            curlap = pit_lap
            
    # simulate the lap time
    # startPenalty = startPosition * 0.11(s)

    for car in carnos:
        last_ispit = 0
        param = simdf[simdf['car_number']==car]
        elapsed_time = param.start_position * 0.11
        for lap in range(maxlaps):
            cur_ispit = data[carid[car] * maxlaps + lap, colid['lap_status']]
            if last_ispit:
                #use out_lap
                laptime = random.gauss(param['out_lap_mean'],param['out_lap_std'])
            elif cur_ispit:
                #use in_lap
                laptime = random.gauss(param['in_lap_mean'],param['in_lap_std'])
            else:
                #use norm_lap
                laptime = random.gauss(param['norm_lap_mean'],param['norm_lap_std'])

            data[carid[car] * maxlaps + lap, colid['last_laptime']] = laptime
            elapsed_time += laptime
            data[carid[car] * maxlaps + lap, colid['elapsed_time']] = elapsed_time

            data[carid[car] * maxlaps + lap, colid['car_number']] = car
            #start from lap 1
            data[carid[car] * maxlaps + lap, colid['completed_laps']] = lap + 1

            #update and goto next lap
            last_ispit = cur_ispit

    # update the rank
    # carnumber = len(carnos)
    for lap in range(maxlaps):
        elapsed_time = [data[carid[car] * maxlaps + lap, colid['elapsed_time']] for car in carnos]
        indice = np.argsort(elapsed_time)
        rank = np.arange(len(carnos))
        out = np.arange(len(carnos))
        out[indice] = rank + 1
        for car in carnos:
            data[carid[car] * maxlaps + lap, colid['rank']] = int(out[carid[car]])

    #save data
    #rank, car_number, completed_laps,elapsed_time,last_laptime
    #lap_status,track_status, pit_stop_count, last_pitted_lap
    df = pd.DataFrame({'rank': data[:, 0].astype(int), 'car_number': data[:, 1].astype(int),
                       'completed_laps': data[:, 2].astype(int),
                       'elapsed_time': data[:, 3], 'last_laptime': data[:, 4], 
                       'lap_status': [ 'P' if x==1 else 'T' for x in data[:, 5]],
                       'track_status': [ 'G' for x in data[:, 6]],
                       'pit_stop_count': data[:, 7], 'last_pitted_lap': data[:, 8]})
    if savemodel:
        df.to_csv(savemodel)
    
    return df

In [14]:
df = run_simulator(rankdata, simdf, outputprefix + 'simulator_completedlaps.csv')

maxlaps=200, data shape=(6600, 9)


In [10]:
df[df['completed_laps']==200]

Unnamed: 0,car_number,completed_laps,elapsed_time,lap_status,last_laptime,last_pitted_lap,pit_stop_count,rank,track_status
199,1,200,8539.857106,T,40.464601,0.0,0.0,5,G
399,3,200,8549.712194,T,42.694062,0.0,0.0,7,G
599,4,200,8585.344486,T,40.948796,0.0,0.0,13,G
799,6,200,8600.533776,T,43.622196,0.0,0.0,17,G
999,7,200,8904.755519,T,44.310942,0.0,0.0,33,G
1199,9,200,8603.851604,T,41.643237,0.0,0.0,18,G
1399,10,200,8727.540495,T,42.832758,0.0,0.0,29,G
1599,12,200,8503.756168,T,40.852722,0.0,0.0,1,G
1799,13,200,8628.920096,T,41.737044,0.0,0.0,22,G
1999,14,200,8546.069349,T,42.5399,0.0,0.0,6,G


### run simulation

In [16]:
runs=10
maxlaps = max(set(rankdata.completed_laps.values))
carnos = np.sort(list(set(simdf.car_number.values)))
carid = {key:idx for idx, key in enumerate(carnos)}

laps = [100,200]
#contigency matrix
cmat = np.zeros((len(laps), len(carnos), len(carnos)))

for run in range(runs):
    df = run_simulator(rankdata, simdf)
    #save rank@lap100 and rank@lap200
    for idx, lap in enumerate(laps):
        data = df[df['completed_laps']==lap][['car_number','rank']].to_numpy()
        for pt in data:
            cmat[idx, carid[pt[0]], pt[1]-1] += 1
    
#check rank 1@lap100, @lap200
print('rank1@lap100:',cmat[0,:,0])
print('rank1@lap200:',cmat[1,:,0])


maxlaps=200, data shape=(6600, 9)
maxlaps=200, data shape=(6600, 9)
maxlaps=200, data shape=(6600, 9)
maxlaps=200, data shape=(6600, 9)
maxlaps=200, data shape=(6600, 9)
maxlaps=200, data shape=(6600, 9)
maxlaps=200, data shape=(6600, 9)
maxlaps=200, data shape=(6600, 9)
maxlaps=200, data shape=(6600, 9)
maxlaps=200, data shape=(6600, 9)
('rank1@lap100:', array([0., 0., 0., 0., 0., 0., 0., 4., 0., 2., 0., 0., 0., 0., 4., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))
('rank1@lap200:', array([0., 0., 0., 0., 0., 0., 0., 7., 0., 2., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]))


### conclusion

even such a simple simulator demonstrates the uncertainty of the long term rank distribution.