### laptime dataset

build a laptime dataset across all the oval races

when this dataset aims to be used in forecasting, covariates of the racing status can not be included, such as track_status and current_status

+ schema: eventid, car_number, completed_lap, laptime, lap_from_lastpit, lap_to_end, 

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
os.environ["CUDA_VISIBLE_DEVICES"]="7"

## Load Data

In [2]:
import os
os.getcwd()

'/scratch/hpda/indycar/predictor/notebook/9.DeepModels'

In [3]:
#
# parameters
#
#year = '2017'
year = '2018'
#event = 'Toronto'
#https://www.racing-reference.info/season-stats/2018/O/#
events_totalmiles=[256,500,372,268,500,310]
events_laplen = [1.022,2.5,1.5,0.894,2.5,1.25]
events = ['Phoenix','Indy500','Texas','Iowa','Pocono','Gateway']
events_id={key:idx for idx, key in enumerate(events)}
#events = ['Indy500']
#events = ['Phoenix']

In [4]:
# make indy car completed_laps dataset
# car_number, completed_laps, rank, elapsed_time, rank_diff, elapsed_time_diff 
def make_cl_data(dataset):

    # pick up data with valid rank
    rankdata = dataset.rename_axis('MyIdx').sort_values(by=['elapsed_time','MyIdx'], ascending=True)
    rankdata = rankdata.drop_duplicates(subset=['car_number', 'completed_laps'], keep='first')

    # resort by car_number, lap
    uni_ds = rankdata.sort_values(by=['car_number', 'completed_laps', 'elapsed_time'], ascending=True)    
    uni_ds = uni_ds.drop(["unique_id", "best_lap", "current_status", "track_status", "lap_status",
                      "laps_behind_leade","laps_behind_prec","overall_rank","pit_stop_count",
                      "last_pitted_lap","start_position","laps_led"], axis=1)
    
    carnumber = set(uni_ds['car_number'])
    print('cars:', carnumber)
    print('#cars=', len(carnumber))
   
    # faster solution , uni_ds already sorted by car_number and lap
    uni_ds['rank_diff'] = uni_ds['rank'].diff()
    mask = uni_ds.car_number != uni_ds.car_number.shift(1)
    uni_ds['rank_diff'][mask] = 0
    
    uni_ds['time_diff'] = uni_ds['elapsed_time'].diff()
    mask = uni_ds.car_number != uni_ds.car_number.shift(1)
    uni_ds['time_diff'][mask] = 0
    
    df = uni_ds[['car_number','completed_laps','rank','elapsed_time','rank_diff','time_diff']]
    
    return df

def make_lapstatus_data(dataset):
    final_lap = max(dataset.completed_laps)
    total_laps = final_lap + 1

    # get records for the cars that finish the race
    completed_car_numbers= dataset[dataset.completed_laps == final_lap].car_number.values
    completed_car_count = len(completed_car_numbers)

    print('count of completed cars:', completed_car_count)
    print('completed cars:', completed_car_numbers)
    
    #pick up one of them
    onecar = dataset[dataset['car_number']==completed_car_numbers[0]]
    onecar = onecar.drop_duplicates(subset=['car_number', 'completed_laps'], keep='first')
    return onecar[['completed_laps','track_status']]
    

In [5]:
def load_data(event, year):
    inputfile = '../data/final/C_'+ event +'-' + year + '-final.csv'
    outputprefix = year +'-' + event + '-'
    dataset = pd.read_csv(inputfile)
    #dataset.info(verbose=True)    
    
    final_lap = max(dataset.completed_laps)
    total_laps = final_lap + 1

    # get records for the cars that finish the race
    completed_car_numbers= dataset[dataset.completed_laps == final_lap].car_number.values
    completed_car_count = len(completed_car_numbers)

    print('count of completed cars:', completed_car_count)
    print('completed cars:', completed_car_numbers)

    #make a copy
    alldata = dataset.copy()
    dataset = dataset[dataset['car_number'].isin(completed_car_numbers)]
    rankdata = alldata.rename_axis('MyIdx').sort_values(by=['elapsed_time','MyIdx'], ascending=True)
    rankdata = rankdata.drop_duplicates(subset=['car_number', 'completed_laps'], keep='first')
    
    cldata = make_cl_data(dataset)
    flagdata = make_lapstatus_data(dataset)
    acldata = make_cl_data(alldata)

    return alldata, rankdata, acldata, flagdata

### overall view of laptime scatter plots



In [6]:
def get_cardata(curcarno, ycol='time_diff'):
    car = acldata[acldata['car_number']==curcarno]
    #print(car['time_diff'].describe())
    
    cols=['completed_laps','rank','car_number','lap_status','track_status',
    'pit_stop_count','current_status','start_position']
    colid={key:idx for idx, key in enumerate(cols)}

    cardata = rankdata[rankdata['car_number'] == curcarno]

    carstatus = [[row[0], row[1],row[2],row[3],row[4],row[5],row[6],row[7]] for row in cardata[
        ['completed_laps','rank','car_number','lap_status','track_status',
        'pit_stop_count','current_status','start_position']].values]
    
    x = car['completed_laps'][1:].values
    y = car[ycol][1:].values

    pits=[]
    yellowflags=[]
    lastflag = 'x'
    for row in carstatus:
        lap = int(row[colid['completed_laps']])

        if row[colid['lap_status']]=='P':
            pits.append(lap)

        if row[colid['track_status']]=='Y':
            if lastflag != 'Y':       
                #start
                yellowflags.append(lap)
        else:
            if lastflag == 'Y':       
                #end
                yellowflags.append(lap)        
        lastflag = row[colid['track_status']]

    #pit lap
    pits = np.array(pits)
    #start, end lap
    #
    yellowflags = np.array(yellowflags)
    if (yellowflags.shape[0] % 2)==1:
        print('crash?:carno=', curcarno)
        yellowflags = []
    else:
        yellowflags = np.array(yellowflags).reshape((-1,2))    
    
    return car, x, y, pits, yellowflags

### bulid the dataset

In [7]:
def get_laptime_dataset(stagedata):
    """
    input: (alldata, rankdata, acldata, flagdata)
    output: 
    
    [(eventid,car_number,laptime : #car_number x #totallaps (padded by zeros))]
    
    """
    laptime_data = []
    for event in stagedata.keys():
        
        laptime_rec = []
        eventid = events_id[event]
        
        alldata, rankdata, acldata, flagdata = stagedata[event]
        carlist = set(acldata['car_number'])
        laplist = set(acldata['completed_laps'])
        totalcars = len(carlist)
        totallaps = len(laplist)

        #carnumber -> carid
        carids={key:idx for idx, key in enumerate(carlist)}
        
        #array: car_number x lap
        laptime = np.zeros((totalcars, totallaps-1))
        
        lapdata = acldata[['car_number','completed_laps','time_diff']].to_numpy()
        
        for row in lapdata:
            #completed_laps
            if int(row[1]) == 0:
                continue
                
            #add to laptime array
            # array[car_number, completed_laps] = time_diff
            laptime[carids[row[0]], int(row[1])-1] = row[2]

        #add one record
        laptime_data.append([eventid, carids, laptime])
        # push this event into stage dataframe
        print('event=%s, records=%s'%(event, laptime.shape))
        
        
    return laptime_data

### load data

In [8]:
stagedata = {}
carids = {}
traindata = None
cur_carid = 0
for event in events:
    #alldata, rankdata, acldata, flagdata
    stagedata[event] = load_data(event, year)
    
    alldata, rankdata, acldata, flagdata = stagedata[event]
    carlist = set(acldata['car_number'])
    laplist = set(acldata['completed_laps'])
    print('%s: carno=%d, lapnum=%d'%(event, len(carlist), len(laplist)))

    #build the carid map
    for car in carlist:
        if car not in carids:
            carids[car] = cur_carid
            cur_carid += 1
    

('count of completed cars:', 11)
('completed cars:', array([ 1,  6, 27,  9, 28,  5, 20, 14, 15, 22, 30]))
('cars:', set([1, 5, 6, 9, 14, 15, 20, 22, 27, 28, 30]))
('#cars=', 11)
('count of completed cars:', 11)
('completed cars:', array([ 1,  6, 27,  9, 28,  5, 20, 14, 15, 22, 30]))


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


('cars:', set([1, 4, 5, 6, 9, 10, 12, 14, 15, 18, 19, 20, 21, 22, 23, 26, 27, 28, 30, 32, 59, 88, 98]))
('#cars=', 23)
Phoenix: carno=23, lapnum=251
('count of completed cars:', 18)
('completed cars:', array([12, 20,  9, 27, 28, 22, 29,  1,  6, 15, 66, 98,  4, 88, 25, 60, 64,
       23]))
('cars:', set([64, 1, 66, 4, 6, 9, 12, 98, 15, 60, 20, 22, 23, 88, 25, 27, 28, 29]))
('#cars=', 18)
('count of completed cars:', 18)
('completed cars:', array([12, 20,  9, 27, 28, 22, 29,  1,  6, 15, 66, 98,  4, 88, 25, 60, 64,
       23]))
('cars:', set([1, 3, 4, 6, 7, 9, 10, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33, 59, 60, 64, 66, 88, 98]))
('#cars=', 33)
Indy500: carno=33, lapnum=201
('count of completed cars:', 9)
('completed cars:', array([ 9, 22, 27,  5, 28, 15, 30, 18, 10]))
('cars:', set([5, 9, 10, 15, 18, 22, 27, 28, 30]))
('#cars=', 9)
('count of completed cars:', 9)
('completed cars:', array([ 9, 22, 27,  5, 28, 15, 30, 18, 10]))
('cars:', set([1, 3, 4

In [9]:
laptime_data = get_laptime_dataset(stagedata)

event=Iowa, records=(22, 300)
event=Phoenix, records=(23, 250)
event=Indy500, records=(33, 200)
event=Pocono, records=(22, 200)
event=Gateway, records=(21, 248)
event=Texas, records=(32, 248)


In [17]:
#check Car12 Indy500
laptime_data[2][2][7,:]

array([ 41.9238,  41.036 ,  41.3339,  41.0918,  40.7903,  41.1153,
        41.2998,  41.1854,  41.0853,  41.0486,  41.2261,  41.3214,
        41.2019,  41.1745,  41.3155,  41.2843,  41.4283,  41.407 ,
        41.4171,  41.4934,  41.3869,  41.2333,  41.2477,  41.3136,
        41.1053,  41.398 ,  41.9177,  41.9383,  42.1843,  44.0387,
        45.6879,  56.1652,  67.5397,  41.6366,  40.9116,  40.359 ,
        40.5654,  40.9596,  41.2025,  41.2059,  41.3058,  41.0387,
        41.124 ,  41.4863,  41.3339,  41.3737,  41.7481,  62.8487,
        97.1485, 116.1604, 117.8701, 105.3076, 109.6285, 104.8311,
        73.6674,  44.3144,  42.3946,  47.4349,  94.0684, 104.9096,
       106.4687,  93.924 ,  75.1453,  41.9144,  41.2125,  41.1279,
        41.2738,  46.8344,  94.5176, 111.4085, 106.6239, 101.914 ,
        80.7889,  42.0244,  41.0676,  41.1153,  41.2297,  41.2734,
        41.4444,  41.3528,  41.5088,  41.3364,  41.3633,  41.3129,
        41.3298,  41.564 ,  41.3727,  41.5422,  42.5959,  41.6

In [19]:
alldata, rankdata, acldata, flagdata = stagedata['Indy500']
acldata[acldata['car_number']==12]

Unnamed: 0_level_0,car_number,completed_laps,rank,elapsed_time,rank_diff,time_diff
MyIdx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2,12,0,3,0.2247,0.0,0.0000
37,12,1,2,42.1485,-1.0,41.9238
148,12,2,2,83.1845,0.0,41.0360
231,12,3,3,124.5184,1.0,41.3339
311,12,4,3,165.6102,0.0,41.0918
393,12,5,3,206.4005,0.0,40.7903
474,12,6,3,247.5158,0.0,41.1153
558,12,7,3,288.8156,0.0,41.2998
644,12,8,3,330.0010,0.0,41.1854
729,12,9,3,371.0863,0.0,41.0853


In [12]:
import pickle
#stintdf.to_csv('laptime-%s.csv'%year)
with open('laptime-%s.pickle'%year, 'wb') as f:
    # Pickle the 'data' dictionary using the highest protocol available.
    pickle.dump(laptime_data, f, pickle.HIGHEST_PROTOCOL)

In [13]:
with open('laptime-2018.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    data = pickle.load(f)

In [14]:
pickle.__version__

'$Revision: 72223 $'