### telemetry ts dataset

build a time series dataset across all the oval races, including telemetry

distance, vspeed(vehicle speed), espeed(engine speed), brake, throttle, etc.

when this dataset aims to be used in forecasting, covariates of the racing status can not be included, such as track_status and current_status

+  [(eventid,carids: carno -> rowid, vspeed : #car_number x #totallaps (padded by zeros))]

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
os.environ["CUDA_VISIBLE_DEVICES"]="7"

## Load Data

In [2]:
import os
os.getcwd()

'/scratch/hpda/indycar/notebook/9.DeepModels'

In [3]:
#
# parameters
#
#year = '2017'
year = '2018'
#event = 'Toronto'
#https://www.racing-reference.info/season-stats/2018/O/#
events_totalmiles=[256,500,372,268,500,310]
events_laplen = [1.022,2.5,1.5,0.894,2.5,1.25]
events = ['Phoenix','Indy500','Texas','Iowa','Pocono','Gateway']
#events = ['Gateway']
events_id={key:idx for idx, key in enumerate(events)}
#events = ['Indy500']
#events = ['Phoenix']

In [4]:
# make indy car completed_laps dataset
# car_number, completed_laps, rank, elapsed_time, rank_diff, elapsed_time_diff 
def make_cl_data(dataset):

    # pick up data with valid rank
    rankdata = dataset.rename_axis('MyIdx').sort_values(by=['elapsed_time','MyIdx'], ascending=True)
    rankdata = rankdata.drop_duplicates(subset=['car_number', 'completed_laps'], keep='first')

    # resort by car_number, lap
    uni_ds = rankdata.sort_values(by=['car_number', 'completed_laps', 'elapsed_time'], ascending=True)    
    uni_ds = uni_ds.drop(["unique_id", "best_lap", "current_status", "track_status", "lap_status",
                      "laps_behind_leade","laps_behind_prec","overall_rank","pit_stop_count",
                      "last_pitted_lap","start_position","laps_led"], axis=1)
    
    carnumber = set(uni_ds['car_number'])
    print('cars:', carnumber)
    print('#cars=', len(carnumber))
   
    # faster solution , uni_ds already sorted by car_number and lap
    uni_ds['rank_diff'] = uni_ds['rank'].diff()
    mask = uni_ds.car_number != uni_ds.car_number.shift(1)
    uni_ds['rank_diff'][mask] = 0
    
    uni_ds['time_diff'] = uni_ds['elapsed_time'].diff()
    mask = uni_ds.car_number != uni_ds.car_number.shift(1)
    uni_ds['time_diff'][mask] = 0
    
    df = uni_ds[['car_number','completed_laps','rank','elapsed_time','rank_diff','time_diff']]
    
    return df

def make_lapstatus_data(dataset):
    final_lap = max(dataset.completed_laps)
    total_laps = final_lap + 1

    # get records for the cars that finish the race
    completed_car_numbers= dataset[dataset.completed_laps == final_lap].car_number.values
    completed_car_count = len(completed_car_numbers)

    print('count of completed cars:', completed_car_count)
    print('completed cars:', completed_car_numbers)
    
    #pick up one of them
    onecar = dataset[dataset['car_number']==completed_car_numbers[0]]
    onecar = onecar.drop_duplicates(subset=['car_number', 'completed_laps'], keep='first')
    return onecar[['completed_laps','track_status']]
    

In [5]:
def load_data(event, year):
    inputfile = '../data/final/C_'+ event +'-' + year + '-final.csv'
    outputprefix = year +'-' + event + '-'
    dataset = pd.read_csv(inputfile)
    #dataset.info(verbose=True)    
    
    final_lap = max(dataset.completed_laps)
    total_laps = final_lap + 1

    # get records for the cars that finish the race
    completed_car_numbers= dataset[dataset.completed_laps == final_lap].car_number.values
    completed_car_count = len(completed_car_numbers)

    print('count of completed cars:', completed_car_count)
    print('completed cars:', completed_car_numbers)

    #make a copy
    alldata = dataset.copy()
    dataset = dataset[dataset['car_number'].isin(completed_car_numbers)]
    rankdata = alldata.rename_axis('MyIdx').sort_values(by=['elapsed_time','MyIdx'], ascending=True)
    rankdata = rankdata.drop_duplicates(subset=['car_number', 'completed_laps'], keep='first')
    
    cldata = make_cl_data(dataset)
    flagdata = make_lapstatus_data(dataset)
    acldata = make_cl_data(alldata)

    return alldata, rankdata, acldata, flagdata

### overall view of laptime scatter plots



In [6]:
def get_cardata(curcarno, ycol='time_diff'):
    car = acldata[acldata['car_number']==curcarno]
    #print(car['time_diff'].describe())
    
    cols=['completed_laps','rank','car_number','lap_status','track_status',
    'pit_stop_count','current_status','start_position']
    colid={key:idx for idx, key in enumerate(cols)}

    cardata = rankdata[rankdata['car_number'] == curcarno]

    carstatus = [[row[0], row[1],row[2],row[3],row[4],row[5],row[6],row[7]] for row in cardata[
        ['completed_laps','rank','car_number','lap_status','track_status',
        'pit_stop_count','current_status','start_position']].values]
    
    x = car['completed_laps'][1:].values
    y = car[ycol][1:].values

    pits=[]
    yellowflags=[]
    lastflag = 'x'
    for row in carstatus:
        lap = int(row[colid['completed_laps']])

        if row[colid['lap_status']]=='P':
            pits.append(lap)

        if row[colid['track_status']]=='Y':
            if lastflag != 'Y':       
                #start
                yellowflags.append(lap)
        else:
            if lastflag == 'Y':       
                #end
                yellowflags.append(lap)        
        lastflag = row[colid['track_status']]

    #pit lap
    pits = np.array(pits)
    #start, end lap
    #
    yellowflags = np.array(yellowflags)
    if (yellowflags.shape[0] % 2)==1:
        print('crash?:carno=', curcarno)
        yellowflags = []
    else:
        yellowflags = np.array(yellowflags).reshape((-1,2))    
    
    return car, x, y, pits, yellowflags

### bulid the dataset

In [7]:
def get_laptime_dataset(stagedata):
    """
    input: (alldata, rankdata, acldata, flagdata)
    output: laptime & rank data
    
    [(
    eventid,
    carids : rowid -> carno,
    laptime : #car_number x #totallaps (padded by Nan)),
    rank : #car_number x #totallaps (padded by Nan)
    )]
    """
    laptime_data = []
    for event in stagedata.keys():
        
        laptime_rec = []
        eventid = events_id[event]
        
        alldata, rankdata, acldata, flagdata = stagedata[event]
        carlist = set(acldata['car_number'])
        laplist = set(acldata['completed_laps'])
        totalcars = len(carlist)
        totallaps = len(laplist)

        #carnumber -> carid
        carids={key:idx for idx, key in enumerate(carlist)}
        decode_carids={idx:key for idx, key in enumerate(carlist)}
        
        #array: car_number x lap
        #laptime = np.zeros((totalcars, totallaps-1))
        #rank = np.zeros((totalcars, totallaps-1))
        laptime = np.empty((totalcars, totallaps-1))
        rank = np.empty((totalcars, totallaps-1))
        laptime[:] = np.NaN
        rank[:] = np.NaN
        
        
        lapdata = acldata[['car_number','completed_laps','time_diff','rank']].to_numpy()
        
        for row in lapdata:
            #completed_laps
            if int(row[1]) == 0:
                continue
                
            #add to laptime array
            # array[car_number, completed_laps] = time_diff
            laptime[carids[row[0]], int(row[1])-1] = row[2]
            rank[carids[row[0]], int(row[1])-1] = row[3]

        #add one record
        laptime_data.append([eventid, decode_carids, laptime,rank])
        # push this event into stage dataframe
        print('event=%s, records=%s'%(event, laptime.shape))
        
        
    return laptime_data

### load data

In [8]:
stagedata = {}
global_carids = {}
traindata = None
cur_carid = 0
for event in events:
    #alldata, rankdata, acldata, flagdata
    stagedata[event] = load_data(event, year)
    
    alldata, rankdata, acldata, flagdata = stagedata[event]
    carlist = set(acldata['car_number'])
    laplist = set(acldata['completed_laps'])
    print('%s: carno=%d, lapnum=%d'%(event, len(carlist), len(laplist)))

    #build the carid map
    for car in carlist:
        if car not in global_carids:
            global_carids[car] = cur_carid
            cur_carid += 1
    

count of completed cars: 11
completed cars: [ 1  6 27  9 28  5 20 14 15 22 30]
cars: {1, 5, 6, 9, 14, 15, 20, 22, 27, 28, 30}
#cars= 11
count of completed cars: 11
completed cars: [ 1  6 27  9 28  5 20 14 15 22 30]
cars: {1, 4, 5, 6, 9, 10, 12, 14, 15, 18, 19, 20, 21, 22, 23, 26, 27, 28, 30, 32, 59, 88, 98}
#cars= 23


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Phoenix: carno=23, lapnum=251
count of completed cars: 18
completed cars: [12 20  9 27 28 22 29  1  6 15 66 98  4 88 25 60 64 23]
cars: {64, 1, 66, 98, 4, 6, 9, 12, 60, 15, 20, 22, 23, 88, 25, 27, 28, 29}
#cars= 18
count of completed cars: 18
completed cars: [12 20  9 27 28 22 29  1  6 15 66 98  4 88 25 60 64 23]
cars: {1, 3, 4, 6, 7, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 59, 60, 64, 66, 88, 98}
#cars= 33
Indy500: carno=33, lapnum=201
count of completed cars: 9
completed cars: [ 9 22 27  5 28 15 30 18 10]
cars: {5, 9, 10, 15, 18, 22, 27, 28, 30}
#cars= 9
count of completed cars: 9
completed cars: [ 9 22 27  5 28 15 30 18 10]
cars: {1, 3, 4, 5, 6, 7, 9, 10, 12, 14, 15, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 30, 47, 55, 57, 59, 60, 68, 73, 83, 88, 98}
#cars= 32
Texas: carno=32, lapnum=249
count of completed cars: 5
completed cars: [ 5 21 30  1  6]
cars: {1, 5, 6, 21, 30}
#cars= 5
count of completed cars: 5
completed cars: [ 5 21 30  1  6]

In [9]:
#alldata, rankdata, acldata, flagdata = stagedata['Indy500']
#acldata[acldata['car_number']==12].head(10)

In [10]:
def get_telemetry_dataset(stagedata, frequency = 1):
    """
    input: 
        stagedata: (alldata, rankdata, acldata, flagdata)
        frequency: 1s by default
    
    output: vspeed & distance data
    
    [(
    eventid,
    carids : rowid -> carno,
    vspeed : #car_number x variable length
    distance : #car_number x variable length
    )]
    """
    telemetry_data = []
    for event in stagedata.keys():
        eventid = events_id[event]
        
        alldata, rankdata, acldata, flagdata = stagedata[event]
        #carnumber -> carid
        carlist = set(acldata['car_number'])
        carids={key:idx for idx, key in enumerate(carlist)}
        decode_carids={idx:key for idx, key in enumerate(carlist)}
        
        # multiple ts for this event
        ts_event = []
        for carno in carlist:
            #get completed_laps
            elapsed_time = np.max(acldata[acldata['car_number'] == carno][['elapsed_time']].values)
        
            # load data for this car
            #  timestamp\t distance \t vspeed
            # 16:23:00.588    3705.16 153.780 10416   3       0       109.5   0.04
            inputfile = f'../data/telemetry/{event}-{year}-{carno}.csv'
            try:
                _data = pd.read_csv(inputfile,delimiter='\t', header=None)
            except:
                #Texas-car-3
                print('failed to read telemetry:', inputfile)
                ts_event.append(np.array([0,np.nan, np.nan]))
                continue
            
            # calc the time differences(in seconds)
            _data[0]=pd.to_datetime(_data[0])
            _data[8] = (_data[0] - _data[0].iloc[0]).dt.total_seconds()
            _data[9] = _data[8].astype(int)

            ts_length = int(elapsed_time)
            #ts = []
            ts = np.zeros((ts_length,3))
            ts[:] = np.nan
            
            cur_id = 0
            _data_array = _data[[9,1,2]].to_numpy()
            last_id = _data_array.shape[0]
            for id in range(ts_length):
                while((cur_id < last_id) and (_data_array[cur_id][0] < id)):
                    cur_id += 1

                if cur_id == last_id:
                    break
                    
                if _data_array[cur_id][0] > id:
                    #not found, missing data
                    #ts.append([id, np.nan, np.nan])
                    ts[id,:] = [id, np.nan, np.nan]
                else:
                    #ts.append([id, _data_array[cur_id][1], _data_array[cur_id][2]])            
                    ts[id,:] = [id, _data_array[cur_id][1], _data_array[cur_id][2]]            
                    
            
            # deal with this ts
            # ts_event.append(np.array(ts))
            ts_event.append(ts)

        #add one record
        telemetry_data.append([eventid, decode_carids, ts_event])
        # push this event into stage dataframe
        print('event=%s, records=%s'%(event, len(telemetry_data[-1][2])))
        
        
    return telemetry_data

In [11]:
telemetry_data = get_telemetry_dataset(stagedata)

event=Phoenix, records=23
event=Indy500, records=33
failed to read telemetry: ../data/telemetry/Texas-2018-3.csv
failed to read telemetry: ../data/telemetry/Texas-2018-7.csv
failed to read telemetry: ../data/telemetry/Texas-2018-25.csv
failed to read telemetry: ../data/telemetry/Texas-2018-47.csv
failed to read telemetry: ../data/telemetry/Texas-2018-55.csv
failed to read telemetry: ../data/telemetry/Texas-2018-57.csv
failed to read telemetry: ../data/telemetry/Texas-2018-60.csv
failed to read telemetry: ../data/telemetry/Texas-2018-68.csv
failed to read telemetry: ../data/telemetry/Texas-2018-73.csv
failed to read telemetry: ../data/telemetry/Texas-2018-83.csv
event=Texas, records=32
event=Iowa, records=22
failed to read telemetry: ../data/telemetry/Pocono-2018-23.csv
event=Pocono, records=22
event=Gateway, records=21


In [12]:
%debug

ERROR:root:No traceback has been produced, nothing to debug.


In [13]:
import pickle
#stintdf.to_csv('laptime-%s.csv'%year)
with open('telemetry-%s.pickle'%year, 'wb') as f:
    #pack [global_carids, laptime_data]
    savedata = [global_carids, telemetry_data]
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(savedata, f, pickle.HIGHEST_PROTOCOL)

In [14]:
telemetry_data

[[0,
  {0: 1,
   1: 4,
   2: 5,
   3: 6,
   4: 9,
   5: 10,
   6: 12,
   7: 14,
   8: 15,
   9: 18,
   10: 19,
   11: 20,
   12: 21,
   13: 22,
   14: 23,
   15: 26,
   16: 27,
   17: 28,
   18: 30,
   19: 32,
   20: 59,
   21: 88,
   22: 98},
  [array([[0.00000e+00, 1.47129e+03, 1.31180e+02],
          [1.00000e+00, 1.53264e+03, 1.44230e+02],
          [2.00000e+00, 1.59916e+03, 1.54260e+02],
          ...,
          [6.23700e+03, 1.25131e+03, 1.57990e+02],
          [6.23800e+03, 1.31525e+03, 1.60730e+02],
          [6.23900e+03, 1.38781e+03, 1.64250e+02]]),
   array([[0.00000e+00, 1.43082e+03, 1.24800e+02],
          [1.00000e+00, 1.49510e+03, 1.36690e+02],
          [2.00000e+00, 1.55894e+03, 1.48700e+02],
          ...,
          [6.25300e+03,         nan,         nan],
          [6.25400e+03,         nan,         nan],
          [6.25500e+03,         nan,         nan]]),
   array([[  0.  , 353.28,  87.84],
          [  1.  ,    nan,    nan],
          [  2.  ,    nan,    nan],
  

In [15]:
telemetry_data[id][2][6][-1]

TypeError: list indices must be integers or slices, not builtin_function_or_method

In [16]:
id = 0
[x[-1] for x in telemetry_data[id][2]]

[array([6239.  , 1387.81,  164.25]),
 array([6255.,   nan,   nan]),
 array([nan, nan, nan]),
 array([6242.  , 1404.09,  160.01]),
 array([6243.  , 1412.88,  165.42]),
 array([nan, nan, nan]),
 array([nan, nan, nan]),
 array([6249.  , 1397.38,  160.75]),
 array([6249.  , 1369.53,  159.59]),
 array([6250.  , 1350.97,  158.67]),
 array([ 891.  , 1509.9 ,  103.03]),
 array([6248.  , 1338.09,  159.72]),
 array([6251.  , 1339.94,  153.17]),
 array([6249.  , 1358.13,  157.59]),
 array([6252.  , 1334.94,  156.66]),
 array([6254.  , 1412.75,  152.74]),
 array([6242.  , 1383.06,  161.42]),
 array([6243.  , 1389.  ,  165.15]),
 array([6250.  , 1370.34,  158.55]),
 array([4428.  ,  149.88,    0.  ]),
 array([6254.  , 1377.44,  151.88]),
 array([6253.  , 1348.31,  154.4 ]),
 array([6249.  , 1343.91,  160.62])]

In [17]:
np.sum(np.isnan(telemetry_data[id][2][6][:,1]))


34

In [None]:
telemetry_data[id][2][6][-5810:-5800]

### test 

In [None]:
#events = ['Phoenix','Indy500','Texas','Iowa','Pocono','Gateway']
alldata, rankdata, acldata, flagdata = stagedata['Gateway']

In [None]:
acldata[acldata['car_number']==14]

In [None]:
event='Indy500'
carno=12
year=2018
inputfile = f'../data/telemetry/{event}-{year}-{carno}.csv'
_data = pd.read_csv(inputfile,delimiter='\t', header=None)

In [None]:
a =_data.to_numpy()

In [None]:
a

In [None]:
_data.head(10)

In [None]:
_data[0]=pd.to_datetime(_data[0])
_data[8] = (_data[0] - _data[0].iloc[0]).dt.total_seconds()
_data[9] = _data[8].astype(int)

ts_length = 10
ts = []
cur_id = 0
_data_array = _data[[9,1,2]].to_numpy()
last_id = _data_array.shape[0]
for id in range(ts_length):
    while((cur_id < last_id) and (_data_array[cur_id][0] < id)):
        cur_id += 1
        
    if _data_array[cur_id][0] > id:
        #not found, missing data
        ts.append([id, np.Nan, np.Nan])
    else:
        ts.append([id, _data_array[cur_id][1], _data_array[cur_id][2]])
        

In [None]:
ts

In [None]:
_data[0]=pd.to_datetime(_data[0])

In [None]:
_data_array = _data[[9,1,2]].to_numpy()
_data_array[:10]

In [None]:
_data[8] = (_data[0] - _data[0].iloc[0]).dt.total_seconds()

In [None]:
_data.head(20)

In [None]:
_data[9] = _data[8].astype(int)

In [None]:
_data.info()

In [None]:
##debug data format
def _timestr(timestamp, scale=10000):
    s, ms = divmod(timestamp, scale)
    hours, remainder = divmod(s, 3600)
    minutes, seconds = divmod(remainder, 60)
    timestr = '{:02}:{:02}:{:02}.{:04}'.format(int(hours), int(minutes), int(seconds), int(ms))
    return timestr

def _gettime(timestr, scale=1000.):
    tmms = 0
    tmall = timestr.split('.')
    tms = [int(x) for x in tmall[0].split(':')]
    if len(tms) == 3 and len(tmall) == 2:
        tmms = (tms[0] * 3600 + tms[1] * 60 + tms[2])*scale + float(tmall[1])
    return tmms

def decode_cmd(cmd, idx, deli=chr(0xa6)):
    items = cmd.split(deli)
    print(str(_hex2int(items[idx])*1.0/10000))
  
def _hex2int(hexstr): 
    return int(hexstr, 16)
    

In [None]:
cmd="$C?U?200F7?R.I?7?1?2?F7?4438D18?4039D?T?3DC05?D9?0?1?B6?0?7?3DC05?Active?K?4?EE?3?0"
decode_cmd(cmd,8,'?')

In [None]:
cmd='$C?U?20015?R.I?3?1?2?15?748B24?3EE16?T?3E8ED?10?AE15?0?7D0A?0?3?3E8ED?Active?G?0?0?3?0'
decode_cmd(cmd,8,'?')