# Build a synthetic dataset with simulation

Use a simulated dataset to verify the capability of these models on:
+ long time series
+ large dataset
+ fixed category feature(item_id)

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import random

## Load Data

In [2]:
import os
os.getcwd()

'/scratch/hpda/indycar/notebook/9.DeepModels'

In [3]:
#
# parameters: which race to load and how the output files are named
#
#year = '2017'
year = '2018'
#event = 'Toronto'
event = 'Indy500'

# identifiers and paths derived from event/year
runid = '%s-laptimedataset' % event
inputfile = '../data/final/C_%s-%s-final.csv' % (event, year)
outputprefix = '%s-%s-' % (year, event)

# raw lap-by-lap records of the selected race
dataset = pd.read_csv(inputfile)
#dataset.info(verbose=True)

### The Simulator

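A simple model without DNF (did-not-finish) events:

1. lap time: modeled by the average lap time on green laps
2. pit stop: uniformly distributed within the pit window (10 laps; note that `run_simulator` below uses `pit_window = 8`)
3. pit time: modeled by the in-lap and out-lap times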


In [4]:
# Per-car lap-time statistics measured on green laps only.
alldata = dataset.copy()
carnos = np.sort(list(set(alldata.car_number.values)))
rankdata = alldata.rename_axis('MyIdx').sort_values(by=['elapsed_time', 'MyIdx'], ascending=True)

# The flag can change in the middle of a lap, so filtering rows on
# track_status != 'Y' is not enough: drop every lap number on which any
# record shows a yellow flag.
#greendata = rankdata[rankdata['track_status']!='Y']
yellow_laps = rankdata[rankdata['track_status'] == 'Y'].completed_laps.values
green_laps = set(rankdata.completed_laps.values) - set(yellow_laps)
greendata = rankdata[rankdata['completed_laps'].isin(green_laps)]

# columns: car_number, start_position, then mean/std of normal, in and out laps
stat_names = ['car_number', 'start_position', 'norm_lap_mean', 'norm_lap_std',
              'in_lap_mean', 'in_lap_std', 'out_lap_mean', 'out_lap_std']
statdata = np.zeros((len(carnos), 8))

for idx, car in enumerate(carnos):
    thiscar = greendata[greendata['car_number'] == car]
    car_laps = thiscar['completed_laps']

    # in lap = the lap flagged 'P'; out lap = the lap right after it
    pit_laps = thiscar[thiscar['lap_status'] == 'P'].completed_laps.values
    out_laps = [lap + 1 for lap in pit_laps]
    in_lap = thiscar[car_laps.isin(pit_laps)].last_laptime.values
    out_lap = thiscar[car_laps.isin(out_laps)].last_laptime.values

    # normal lap = neither in lap nor out lap, and preceded by another such lap
    candidates = set(car_laps.values) - set(pit_laps) - set(out_laps)
    marked = np.array([lap if lap - 1 in candidates else -1 for lap in candidates])
    normal_laps = marked[marked > 0]
    norm_lap = thiscar[car_laps.isin(normal_laps)].last_laptime.values

    # start_position is stored as a hexadecimal string
    startPos = thiscar[car_laps == 1].start_position.values[0]

    # save statistics
    statdata[idx, 0] = car
    statdata[idx, 1] = int(startPos, 16)
    statdata[idx, 2:] = [np.mean(norm_lap), np.std(norm_lap),
                         np.mean(in_lap), np.std(in_lap),
                         np.mean(out_lap), np.std(out_lap)]

# the two identifier columns are integers, the statistics stay floating point
df = pd.DataFrame({name: (statdata[:, col].astype(int) if col < 2 else statdata[:, col])
                   for col, name in enumerate(stat_names)})
df.to_csv(outputprefix + 'simulator.csv')
simdf = df.copy()

In [5]:
df.head(10)

Unnamed: 0,car_number,start_position,norm_lap_mean,norm_lap_std,in_lap_mean,in_lap_std,out_lap_mean,out_lap_std
0,1,4,41.743706,0.766367,54.339015,0.8424845,66.104833,0.73025
1,3,8,41.611673,0.676782,54.28161,0.7251726,66.911113,0.401542
2,4,11,41.946153,0.770234,54.295602,0.319057,68.036082,0.729633
3,6,18,42.018802,0.778819,64.02726,0.891327,55.48844,0.238874
4,7,28,43.346298,1.081907,76.500685,5.074079,58.039793,0.983255
5,9,9,42.022496,0.70125,54.458102,1.106836,66.476525,0.545905
6,10,29,42.409261,1.136934,67.2981,1.421085e-14,55.7272,0.0
7,12,3,41.485985,0.776325,53.857717,1.671692,65.957933,0.901091
8,13,7,41.927896,0.640302,55.3417,7.105427e-15,67.6166,0.0
9,14,10,41.540961,0.629124,55.449998,1.114515,66.8767,0.556106


In [6]:
# pit window
# For each pit-stop number, print the earliest and latest lap on which it was
# taken. The first stop is the most reasonable one; the pit window should be
# at least 7 laps.
#
# pit_stop_count is stored as a hexadecimal string
maxpitcnt = max(int(count, 16) for count in rankdata.pit_stop_count.values)

is_pit_lap = rankdata['lap_status'] == 'P'
# NOTE(review): range() stops before maxpitcnt itself - confirm that is intended
for pit in range(1, maxpitcnt):
    is_nth_stop = rankdata['pit_stop_count'] == format(pit, 'x')
    pit_laps = np.sort(list(rankdata[is_nth_stop & is_pit_lap].completed_laps.values))
    print('%d:%d, %d'%(pit, min(pit_laps), max(pit_laps)))
    
    

1:29, 36
2:35, 70
3:52, 108
4:60, 141
5:96, 175
6:107, 197
7:120, 196
8:138, 195
9:180, 191


#### Simulator

In [7]:
#def run_simulator(rankdata, simdf, savemodel='', baselap=0, basemodel=None):
def run_simulator(maxlaps, simdf, savemodel='', baselap=0, basemodel=None, seed=None):
    """
    Simulate one race and return it in the same layout as rankdata (C_xxx).

    Model (all laps green):
      * pit stops - starting from lap 0, the next stop is drawn uniformly from
        the `pit_window` (8) laps that end `pit_maxlaps` (38) laps after the
        previous stop; scheduling ends once that window would pass maxlaps.
      * lap times - one gaussian draw per lap from the car's out-lap
        statistics (lap after a stop), in-lap statistics (lap of the stop) or
        normal-lap statistics (everything else).
      * elapsed time - starts at start_position * 0.11 seconds and accumulates
        the lap times.
      * rank - position of the car when sorting elapsed_time at each lap.

    Parameters
    ----------
    maxlaps : int
        Number of laps to simulate for every car.
    simdf : pandas.DataFrame
        One row per car with columns car_number, start_position,
        norm_lap_mean/std, in_lap_mean/std, out_lap_mean/std.
    savemodel : str
        When non-empty, the resulting dataframe is also written to this path
        as CSV.
    baselap : int
        When > 0, every lap with 0-based index < baselap (pit schedule, lap
        time and elapsed time) is copied from `basemodel`; laps from index
        `baselap` onward are simulated afresh.
    basemodel : numpy.ndarray
        The `data` array returned by a previous call made with the same
        maxlaps and the same cars. Required when baselap > 0.
    seed : optional
        Passed to random.seed(). The default None seeds from system entropy
        (the historical behaviour); pass a value to get a reproducible run.

    Returns
    -------
    (df, data)
        data is a float array of shape (n_cars * maxlaps, 9); the row of car
        index c, lap index l is c * maxlaps + l and the columns are
        rank, car_number, completed_laps, elapsed_time, last_laptime,
        lap_status (1 on a pit lap), track_status (always 0),
        pit_stop_count, last_pitted_lap.
        pit_stop_count and last_pitted_lap are filled on pit-lap rows only:
        the number of stops made before this one, and the 1-based lap number
        of the stop. df holds the same rows with lap_status as 'P'/'T' and
        track_status as 'G'.

    Raises
    ------
    ValueError
        If baselap > 0 and basemodel is missing, has the wrong shape, or
        leaves no room for a pit stop at or after baselap.
    """
    maxlaps = int(maxlaps)
    baselap = int(baselap)

    cols = ['rank', 'car_number', 'completed_laps', 'elapsed_time', 'last_laptime',
            'lap_status', 'track_status', 'pit_stop_count', 'last_pitted_lap']
    colid = {key: idx for idx, key in enumerate(cols)}

    # fixed pit strategy: a stint is at most pit_maxlaps long and the stop
    # falls in the last pit_window laps of it
    pit_maxlaps = 38
    pit_window = 8

    carnos = np.sort(list(set(simdf.car_number.values)))
    carid = {key: idx for idx, key in enumerate(carnos)}
    ncars = len(carnos)

    use_base = baselap > 0
    if use_base:
        if basemodel is None:
            raise ValueError('basemodel is required when baselap > 0')
        basemodel = np.asarray(basemodel)
        if basemodel.shape != (ncars * maxlaps, len(cols)):
            raise ValueError('basemodel shape %s does not match (%d, %d)'
                             % (basemodel.shape, ncars * maxlaps, len(cols)))

    # seed=None reproduces the previous unseeded behaviour
    random.seed(seed)

    data = np.zeros((ncars * maxlaps, len(cols)))

    #
    # 1. pit schedule
    #
    for car in carnos:
        offset = carid[car] * maxlaps
        curlap = 0
        pit_cnt = 0

        if use_base:
            # flat integer lap indices of the base model's stops for this car
            base_status = basemodel[offset:offset + maxlaps, colid['lap_status']]
            base_pits = np.flatnonzero(base_status == 1)
            base_idx = 0

        while curlap < maxlaps:
            right = curlap + pit_maxlaps
            if right > maxlaps:
                # the stint reaches the end of the race, no more stops
                break
            left = right - pit_window

            if use_base and base_idx < len(base_pits) and base_pits[base_idx] < baselap:
                # stop happened before baselap: keep it as in the base model
                pit_lap = int(base_pits[base_idx])
                base_idx += 1
            else:
                # draw a new stop; with a base model it must not fall into the
                # frozen laps, so the window is clipped at baselap
                low = max(left, baselap) if use_base else left
                if low >= right:
                    raise ValueError('basemodel has no pit stop for car %s in laps [%d, %d) '
                                     'before baselap %d' % (car, left, right, baselap))
                # randrange never returns `right`, so pit_lap stays < maxlaps
                pit_lap = random.randrange(low, right)

            row = offset + pit_lap
            data[row, colid['lap_status']] = 1
            data[row, colid['pit_stop_count']] = pit_cnt
            data[row, colid['last_pitted_lap']] = pit_lap + 1

            pit_cnt += 1
            curlap = pit_lap

    #
    # 2. lap times and elapsed time
    #    startPenalty = startPosition * 0.11(s)
    #
    for car in carnos:
        offset = carid[car] * maxlaps
        param = simdf[simdf['car_number'] == car]

        # pull plain floats out of the one-row frame once per car
        norm_mu, norm_sigma = float(param['norm_lap_mean'].iloc[0]), float(param['norm_lap_std'].iloc[0])
        in_mu, in_sigma = float(param['in_lap_mean'].iloc[0]), float(param['in_lap_std'].iloc[0])
        out_mu, out_sigma = float(param['out_lap_mean'].iloc[0]), float(param['out_lap_std'].iloc[0])
        elapsed_time = float(param['start_position'].iloc[0]) * 0.11

        if use_base:
            thiscar_model = basemodel[offset:offset + maxlaps, :]

        last_ispit = 0
        for lap in range(maxlaps):
            row = offset + lap
            cur_ispit = data[row, colid['lap_status']]

            if use_base and lap < baselap:
                # frozen lap: same boundary as the pit schedule above
                laptime = thiscar_model[lap, colid['last_laptime']]
                elapsed_time = thiscar_model[lap, colid['elapsed_time']]
            else:
                if last_ispit:
                    laptime = random.gauss(out_mu, out_sigma)
                elif cur_ispit:
                    laptime = random.gauss(in_mu, in_sigma)
                else:
                    laptime = random.gauss(norm_mu, norm_sigma)
                elapsed_time += laptime

            data[row, colid['last_laptime']] = laptime
            data[row, colid['elapsed_time']] = elapsed_time
            data[row, colid['car_number']] = car
            # laps are reported 1-based
            data[row, colid['completed_laps']] = lap + 1

            last_ispit = cur_ispit

    #
    # 3. rank of every car at every lap (1 = smallest elapsed time)
    #
    car_offsets = np.arange(ncars) * maxlaps
    positions = np.arange(1, ncars + 1)
    for lap in range(maxlaps):
        rows = car_offsets + lap
        order = np.argsort(data[rows, colid['elapsed_time']])
        data[rows[order], colid['rank']] = positions

    #
    # 4. dataframe in rankdata layout
    #
    df = pd.DataFrame({'rank': data[:, colid['rank']].astype(int),
                       'car_number': data[:, colid['car_number']].astype(int),
                       'completed_laps': data[:, colid['completed_laps']].astype(int),
                       'elapsed_time': data[:, colid['elapsed_time']],
                       'last_laptime': data[:, colid['last_laptime']],
                       'lap_status': ['P' if x == 1 else 'T' for x in data[:, colid['lap_status']]],
                       'track_status': ['G'] * len(data),
                       'pit_stop_count': data[:, colid['pit_stop_count']],
                       'last_pitted_lap': data[:, colid['last_pitted_lap']]})
    if savemodel:
        df.to_csv(savemodel)

    return df, data

In [8]:
# race length taken from the real data
maxlaps = rankdata.completed_laps.max()

# df0/data0: free simulation; df1/data1: re-simulation that reuses data0 up to lap 100
df0, data0 = run_simulator(maxlaps, simdf)
df1, data1 = run_simulator(maxlaps, simdf, savemodel='', baselap=100, basemodel=data0)

In [9]:
np.mean(data0[:100,:]==data1[:100,:])

1.0

In [10]:
np.mean(data0[100:,:]==data1[100:,:])

0.8383589743589743

In [11]:
df0[(df0['car_number']==88) & (df0['lap_status']=='P')]

Unnamed: 0,rank,car_number,completed_laps,elapsed_time,last_laptime,lap_status,track_status,pit_stop_count,last_pitted_lap
6236,13,88,37,1575.079307,64.954464,P,G,0.0,37.0
6267,18,88,68,2911.30079,67.138167,P,G,1.0,68.0
6301,18,88,102,4387.874709,65.963708,P,G,2.0,102.0
6331,22,88,132,5692.351163,68.130232,P,G,3.0,132.0
6367,22,88,168,7248.432155,62.879773,P,G,4.0,168.0


In [12]:
df0[df0['completed_laps']==100]

Unnamed: 0,rank,car_number,completed_laps,elapsed_time,last_laptime,lap_status,track_status,pit_stop_count,last_pitted_lap
99,16,1,100,4287.918097,42.912402,T,G,0.0,0.0
299,3,3,100,4254.797787,41.427417,T,G,0.0,0.0
499,9,4,100,4275.357873,42.581006,T,G,0.0,0.0
699,8,6,100,4271.165993,41.103697,T,G,0.0,0.0
899,33,7,100,4481.600787,43.833725,T,G,0.0,0.0
1099,17,9,100,4289.757742,40.68425,T,G,0.0,0.0
1299,26,10,100,4321.517185,41.679562,T,G,0.0,0.0
1499,1,12,100,4211.013192,41.118107,T,G,0.0,0.0
1699,22,13,100,4306.563091,40.943164,T,G,0.0,0.0
1899,5,14,100,4259.736807,66.94556,T,G,0.0,0.0


In [None]:
df1[df1['completed_laps']==200]

In [14]:
def get_laptime_dataset(simretdf):
    """
    Convert simulation results into per-event lap-time matrices.

    input: simretdf, an iterable of dataframes that each have the columns
           car_number, completed_laps and last_laptime
    output: a list with one entry per dataframe,

        [eventid, carids, laptime]

        eventid : position of the dataframe in simretdf
        carids  : dict car_number -> row index, car numbers in ascending order
        laptime : array (#cars x #laps); laptime[carids[car], lap - 1] is the
                  last_laptime of that car on that lap, zero where no record
                  exists. Records with completed_laps < 1 are skipped.
    """
    laptime_data = []
    for eventid, rankdata in enumerate(simretdf):
        # sorted so that the car -> row mapping is deterministic
        carlist = sorted(set(rankdata['car_number']))
        carids = {key: idx for idx, key in enumerate(carlist)}

        lap_numbers = [int(lap) for lap in rankdata['completed_laps']]
        # wide enough for the largest lap number even when laps are not
        # contiguous; never narrower than the number of distinct laps
        totallaps = max(len(set(lap_numbers)), max(lap_numbers, default=0))

        # array: car_number x lap
        laptime = np.zeros((len(carlist), totallaps))

        records = zip(rankdata['car_number'], lap_numbers, rankdata['last_laptime'])
        for car, lap, last_laptime in records:
            if lap < 1:
                continue
            laptime[carids[car], lap - 1] = last_laptime

        #add one record
        laptime_data.append([eventid, carids, laptime])

    return laptime_data

### run simulation

In [15]:
# number of independent races to simulate
runs = 360
#runs=120
# NOTE(review): modelruns is not used in this cell - confirm where it is consumed
modelruns = 100

#maxlaps = max(set(rankdata.completed_laps.values))
carnos = np.sort(list(set(simdf.car_number.values)))
carid = dict(zip(carnos, range(len(carnos))))

#laps = [100,200]
#maxlaps = 500
maxlaps = 200

# one result dataframe per simulated race
simretdf = []

# every run is a fresh simulation (no base model)
for run in range(runs):
    df, _ = run_simulator(maxlaps, simdf)
    simretdf.append(df)

print('simulation finished!')

simulation finished!


In [16]:
import pickle

# lap-time matrices, one [eventid, carids, laptime] entry per simulated race
laptime_data = get_laptime_dataset(simretdf)

#stintdf.to_csv('laptime-%s.csv'%year)
laptime_file = 'sim-indy500-laptime-%s-%dlaps-%druns.pickle'%(year, maxlaps,runs)
with open(laptime_file, 'wb') as f:
    # Pickle the laptime_data list using the highest protocol available.
    pickle.dump(laptime_data, f, protocol=pickle.HIGHEST_PROTOCOL)


In [17]:
simdf_file = 'sim-indy500-df-%s-%dlaps-%druns.pickle'%(year, maxlaps,runs)
with open(simdf_file, 'wb') as f:
    # Pickle the list of simulation dataframes using the highest protocol available.
    pickle.dump(simretdf, f, protocol=pickle.HIGHEST_PROTOCOL)

### check the result

In [18]:
laptime_data[0][2].shape

(33, 200)

In [19]:
laptime_data[0][1]

{1: 0,
 3: 1,
 4: 2,
 6: 3,
 7: 4,
 9: 5,
 10: 6,
 12: 7,
 13: 8,
 14: 9,
 15: 10,
 17: 11,
 18: 12,
 19: 13,
 20: 14,
 21: 15,
 22: 16,
 23: 17,
 24: 18,
 25: 19,
 26: 20,
 27: 21,
 28: 22,
 29: 23,
 30: 24,
 32: 25,
 33: 26,
 59: 27,
 60: 28,
 64: 29,
 66: 30,
 88: 31,
 98: 32}