### stage dataset

base: 14./stage_dataset_withneighbor-newfeatures

build a stage dataset with feature extraction

+ remove first and final 'stint'
+ add neighbors information as features


# Imports

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


## Load Data

In [2]:
import os
os.getcwd()


'/scratch_hdd/hpda/indycar/notebook/18.FinalTest'

In [3]:
# make indy car completed_laps dataset
# car_number, completed_laps, rank, elapsed_time, rank_diff, elapsed_time_diff 
def make_cl_data(dataset):

    # pick up data with valid rank
    rankdata = dataset.rename_axis('MyIdx').sort_values(by=['elapsed_time','MyIdx'], ascending=True)
    rankdata = rankdata.drop_duplicates(subset=['car_number', 'completed_laps'], keep='first')

    # resort by car_number, lap
    uni_ds = rankdata.sort_values(by=['car_number', 'completed_laps', 'elapsed_time'], ascending=True)    
    uni_ds = uni_ds.drop(["unique_id", "best_lap", "current_status", "track_status", "lap_status",
                      "laps_behind_leade","laps_behind_prec","overall_rank","pit_stop_count",
                      "last_pitted_lap","start_position","laps_led"], axis=1)
    
    carnumber = set(uni_ds['car_number'])
    print('cars:', carnumber)
    print('#cars=', len(carnumber))
   
    # faster solution , uni_ds already sorted by car_number and lap
    uni_ds['rank_diff'] = uni_ds['rank'].diff()
    mask = uni_ds.car_number != uni_ds.car_number.shift(1)
    uni_ds['rank_diff'][mask] = 0
    
    uni_ds['time_diff'] = uni_ds['elapsed_time'].diff()
    mask = uni_ds.car_number != uni_ds.car_number.shift(1)
    uni_ds['time_diff'][mask] = 0
    
    df = uni_ds[['car_number','completed_laps','rank','elapsed_time','rank_diff','time_diff']]
    
    return df

def make_lapstatus_data(dataset):
    final_lap = max(dataset.completed_laps)
    total_laps = final_lap + 1

    # get records for the cars that finish the race
    completed_car_numbers= dataset[dataset.completed_laps == final_lap].car_number.values
    completed_car_count = len(completed_car_numbers)

    print('count of completed cars:', completed_car_count)
    print('completed cars:', completed_car_numbers)
    
    #pick up one of them
    onecar = dataset[dataset['car_number']==completed_car_numbers[0]]
    onecar = onecar.drop_duplicates(subset=['car_number', 'completed_laps'], keep='first')
    return onecar[['completed_laps','track_status']]
    

In [4]:
def load_data(event, year=0):
    #inputfile = '../data/final/C_'+ event +'-' + year + '-final.csv'
    if year>0:
        inputfile = '../data/final/C_'+ event +'-' + year + '.csv'
    else:
        inputfile = '../data/final/C_'+ event +'.csv'
    
    #outputprefix = year +'-' + event + '-'
    dataset = pd.read_csv(inputfile)
    #dataset.info(verbose=True)    
    
    final_lap = max(dataset.completed_laps)
    total_laps = final_lap + 1

    # get records for the cars that finish the race
    completed_car_numbers= dataset[dataset.completed_laps == final_lap].car_number.values
    completed_car_count = len(completed_car_numbers)

    print('count of completed cars:', completed_car_count)
    print('completed cars:', completed_car_numbers)

    #make a copy
    alldata = dataset.copy()
    dataset = dataset[dataset['car_number'].isin(completed_car_numbers)]
    rankdata = alldata.rename_axis('MyIdx').sort_values(by=['elapsed_time','MyIdx'], ascending=True)
    rankdata = rankdata.drop_duplicates(subset=['car_number', 'completed_laps'], keep='first')
    
    cldata = make_cl_data(dataset)
    flagdata = make_lapstatus_data(dataset)
    acldata = make_cl_data(alldata)

    return alldata, rankdata, acldata, flagdata

### overall view of laptime scatter plots



In [5]:
def get_cardata(curcarno, ycol='time_diff'):
    car = acldata[acldata['car_number']==curcarno]
    #print(car['time_diff'].describe())
    
    cols=['completed_laps','rank','car_number','lap_status','track_status',
    'pit_stop_count','current_status','start_position']
    colid={key:idx for idx, key in enumerate(cols)}

    cardata = rankdata[rankdata['car_number'] == curcarno]

    carstatus = [[row[0], row[1],row[2],row[3],row[4],row[5],row[6],row[7]] for row in cardata[
        ['completed_laps','rank','car_number','lap_status','track_status',
        'pit_stop_count','current_status','start_position']].values]
    
    x = car['completed_laps'][1:].values
    y = car[ycol][1:].values

    pits=[]
    yellowflags=[]
    lastflag = 'x'
    for row in carstatus:
        lap = int(row[colid['completed_laps']])

        if row[colid['lap_status']]=='P':
            pits.append(lap)

        if row[colid['track_status']]=='Y':
            if lastflag != 'Y':       
                #start
                yellowflags.append(lap)
        else:
            if lastflag == 'Y':       
                #end
                yellowflags.append(lap)        
        lastflag = row[colid['track_status']]

    #pit lap
    pits = np.array(pits)
    #start, end lap
    #
    yellowflags = np.array(yellowflags)
    if (yellowflags.shape[0] % 2)==1:
        print('crash?:carno=', curcarno)
        yellowflags = []
    else:
        yellowflags = np.array(yellowflags).reshape((-1,2))    
    
    return car, x, y, pits, yellowflags

In [6]:
def draw_laptime(ax, ycol='LapTime'):
    
    ymax = np.max(y)
    
    #yellow flags
    for yflag in yellowflags:
        ax.bar(yflag[0],ymax, width = yflag[1] - yflag[0], align='edge', color = 'y', zorder=-100)

    #plt.plot(np.arange(laptime.shape[0]), laptime, marker='.')
    ax.scatter(x, y, marker='.')    

    #pit stops    
    ax.scatter(pits, y[pits-1], marker='x', color='r')
    for pit in pits:
        ax.bar(pit,height = y[pits-1], width = .8, align='center', color = 'r', zorder=-100)
    
    #ax.set_xlim((0, 33))
    #ax.set_ylim((0, 201))
    ax.set_xlabel('Lap')
    ax.set_ylabel(ycol)
    
def draw_pitstop(ax, pits):
    #pit stops    
    ax.scatter(pits, y[pits-1], marker='x', color='r')
    for pit in pits:
        ax.bar(pit,height = y[pits-1], width = .8, align='center', color = 'r', zorder=-100)

    
def draw_yellowflag(ax,yellowflags):
    ymax = np.max(y)
    #yellow flags
    for yflag in yellowflags:
        ax.bar(yflag[0],ymax, width = yflag[1] - yflag[0], align='edge', color = 'y', zorder=-100)

### bulid the dataset

In [7]:
def get_stagedata(TRIM=2, include_final=False):
    """
    input: acldata, rankdata, the whole dataset
    output: stagedata['car_number','stage','start_lap','end_lap','start_rank','end_rank']
    """
    carlist = set(acldata['car_number'])
    data = []
    for curcarno in carlist:
        car = acldata[acldata['car_number']==curcarno]
        #print(car['time_diff'].describe())

        cols=['completed_laps','rank','car_number','lap_status','track_status',
        'pit_stop_count','current_status','start_position']
        colid={key:idx for idx, key in enumerate(cols)}

        cardata = rankdata[rankdata['car_number'] == curcarno]

        carstatus = [[row[0], row[1],row[2],row[3],row[4],row[5],row[6],row[7]] for row in cardata[
            ['completed_laps','rank','car_number','lap_status','track_status',
            'pit_stop_count','current_status','start_position']].values]

        pits=[]
        yellowflags=[]
        lastflag = 'x'
        
        stage = 0
        start_lap = 0
        pit_lap = 0
        pit_in_caution = 0
        for row in carstatus:
            lap = int(row[colid['completed_laps']])
            
            if lap==0:
                #hex2int
                start_rank = int(row[colid['start_position']], 16)     
            
            if row[colid['lap_status']]=='P':
                pits.append(lap)
                end_lap = lap - TRIM
                end_rank = carstatus[end_lap][colid['rank']]
                #add one record
                #stagedata['car_number','stage','start_lap','end_lap','start_rank','end_rank']
                data.append([curcarno,stage,start_lap, end_lap, pit_lap, pit_in_caution, start_rank, end_rank])
                # prepare for the next
                stage += 1
                start_lap = end_lap  # end_lap + 1?
                start_rank = end_rank
                pit_lap = lap  #current is pit lap
                pit_in_caution = 1 if row[colid['track_status']]=='Y' else 0
                

            if row[colid['track_status']]=='Y':
                if lastflag != 'Y':       
                    #start
                    yellowflags.append(lap)
            else:
                if lastflag == 'Y':       
                    #end
                    yellowflags.append(lap)        
            lastflag = row[colid['track_status']]

        if include_final:
            #final
            end_lap = carstatus[-1][colid['completed_laps']]
            end_rank = carstatus[-1][colid['rank']]
            #add one record
            #stagedata['car_number','stage','start_lap','end_lap',pit_lap, pit_in_caution,'start_rank','end_rank']
            data.append([curcarno,stage,start_lap, end_lap, pit_lap, pit_in_caution, start_rank, end_rank])
        
        #pit lap
        pits = np.array(pits)
        #start, end lap
        #
        yellowflags = np.array(yellowflags)
        if (yellowflags.shape[0] % 2)==1:
            print('crash?:carno=', curcarno)
            yellowflags = []
        else:
            yellowflags = np.array(yellowflags).reshape((-1,2))  
            
    data = np.array(data)
    #['car_number','stage','start_lap','end_lap',pit_lap, pit_in_caution,'start_rank','end_rank']
    df = pd.DataFrame({'car_number': data[:, 0], 'stage': data[:, 1],'start_lap': data[:, 2],
                      'end_lap': data[:, 3], 'pit_lap': data[:, 4], 'pit_in_caution': data[:, 5],
                       'start_rank': data[:, 6],'end_rank': data[:, 7]})
    return df, data

### events

In [8]:
def extract_features(stagedata):
    """
    input: (alldata, rankdata, acldata, flagdata, stage, data)
    
    target
    eventid
    car_number
    stageid
    
    #0, gobal info 
    firststage  ;  1/0    
    pit_in_caution; 1/0
    start_position  ; #
    
    # 0 order of #rank
    start_rank  ;  #rank
    start_rank_ratio; ; #rank/carnum 
    top_pack    ;  top5 1/0 
    bottom_pack  ; bottom5 1/0 
    
    average_rank  ; previous stage
    average_rank_all  ; all previous stages
    
    # 1 order of #rank
    change_in_rank; previous stage
    change_in_rank_all;  all previous stages
    # 2 order of #rank
    rate_of_change; previous stage
    rate_of_change_all;  all previous stages
    
    #neighbors;   
    prev_nb0_change_in_rank
    prev_nb1_change_in_rank
    prev_nb2_change_in_rank
    follow_nb0_change_in_rank
    follow_nb1_change_in_rank
    follow_nb2_change_in_rank
    
    # more features
    laptime_green_mean_prev  ; mean and std of the laptime in green laps of previous stage
    laptime_green_std_prev
    laptime_green_mean_all  ; mean and std of the laptime in green laps before
    laptime_green_std_all
    
    laptime_mean_prev  ; mean and std of the laptime in all laps of previous stage
    laptime_std_prev
    laptime_mean_all  ; mean and std of the laptime in all laps before
    laptime_std_all
    
    laps_prev   ; lap number of the previous stage
    laps_after_last_pitstop  ; lap number after the last pitstop
    
    pittime_prev   ; pittime of previous pitstop
    
    
    """
    eventid = 0
    cols=['target','start_lap',
             'eventid','car_number','stageid',
             'firststage','pit_in_caution','start_position',
             'start_rank','start_rank_ratio','top_pack','bottom_pack',
             'average_rank','average_rank_all',
             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all',
             'laptime_green_mean_prev','laptime_green_std_prev','laptime_green_mean_all','laptime_green_std_all', 
             'laptime_mean_prev','laptime_std_prev','laptime_mean_all','laptime_std_all', 
             'laps_prev','laps_after_last_pitstop','pittime_prev',     
             'prev_nb0_change_in_rank','prev_nb1_change_in_rank','prev_nb2_change_in_rank',
             'follow_nb0_change_in_rank','follow_nb1_change_in_rank','follow_nb2_change_in_rank']
    colid={key:idx for idx, key in enumerate(cols)}
    
    datacols = ['car_number','stage','start_lap','end_lap','pit_lap', 'pit_in_caution', 'start_rank','end_rank']
    datacolid={key:idx for idx, key in enumerate(datacols)}
    
    #maintain a <car,lap, (rank, stage)> index 
    idxcols = ['car_number','completed_laps','rank','stageid']
    carindex = {}
    
    stagedf = pd.DataFrame([],columns=cols)
    for event in stagedata.keys():
        alldata, rankdata, acldata, flagdata, stage, data = stagedata[event]
        carlist = set(acldata['car_number'])
        laplist = set(acldata['completed_laps'])
        totalcars = len(carlist)
        totallaps = len(laplist)

        # init the index
        #carindex[eventid] = np.array((totalcars, totallaps, 2))
        carindex[eventid] = pd.DataFrame([],columns=idxcols)
        
        records = []
        
        laphist = []
        rankhist = []
        rankchghist = []        
        #['car_number','stage','start_lap','end_lap',pit_lap, pit_in_caution, 'start_rank','end_rank']
        for row in data:
            # a new record
            rec={}
            #status, 'start_lap', start a new car
            if row[datacolid['start_lap']] == 0:
                start_position = row[datacolid['start_rank']]
                laphist = []
                rankhist = []
                rankchghist = []
            
            #target 
            rec['target'] = row[datacolid['end_rank']] - row[datacolid['start_rank']]
            rec['start_lap'] = row[datacolid['start_lap']]
            
            #features
            rec['eventid'] = eventid
            rec['car_number'] = row[datacolid['car_number']]
            rec['stageid'] = row[datacolid['stage']]
            rec['firststage'] = 0 if row[datacolid['stage']]==0 else 1
            rec['pit_in_caution'] = row[datacolid['pit_in_caution']]
            rec['start_position'] = start_position
            rec['start_rank'] = row[datacolid['start_rank']]
            rec['start_rank_ratio'] = rec['start_rank'] / totalcars
            rec['top_pack'] = 1 if rec['start_rank']<=5 else 0
            #todo, not accurate here
            rec['bottom_pack'] = 1 if rec['start_rank']>=totalcars-5 else 0
            #get from laphist
            if laphist:
                #previous stage
                prev_stage = laphist[-1]
                rec['average_rank'] = np.mean(list(prev_stage['rank']))
                rec['average_rank_all'] = np.mean(rankhist)
            else:
                rec['average_rank'] = rec['start_rank']
                rec['average_rank_all'] = rec['start_rank']
                
            #change in rank
            if rankchghist:
                rec['change_in_rank'] = rankchghist[-1]
                rec['change_in_rank_all'] = np.mean(rankchghist)
            else:
                rec['change_in_rank'] = 0
                rec['change_in_rank_all'] = 0
                
            #rate of change in rank
            if len(rankchghist)>2:
                rec['rate_of_change'] = rankchghist[-1] - rankchghist[-2]
                rec['rate_of_change_all'] = (rankchghist[-1] - rankchghist[0])/(len(rankchghist)-1)
            else:
                rec['rate_of_change'] = 0
                rec['rate_of_change_all'] = 0            
                
            # init neighbor info as 0
            rec['prev_nb0_change_in_rank'] = 0
            rec['prev_nb1_change_in_rank'] = 0
            rec['prev_nb2_change_in_rank'] = 0
            rec['follow_nb0_change_in_rank'] = 0
            rec['follow_nb1_change_in_rank'] = 0
            rec['follow_nb2_change_in_rank'] = 0
            
            # laptime
            if not laphist:
                rec['laptime_green_mean_prev'] = 0
                rec['laptime_green_std_prev'] = 0
                rec['laptime_green_mean_all'] = 0
                rec['laptime_green_std_all'] = 0
                rec['laptime_mean_prev'] = 0
                rec['laptime_std_prev'] = 0
                rec['laptime_mean_all'] = 0
                rec['laptime_std_all'] = 0
                rec['laps_prev'] = 0
                rec['laps_after_last_pitstop'] = 0
                rec['pittime_prev'] = 0
            else:
                #previous stage greeds
                prev_stage = laphist[-1]
                green_laps = list(prev_stage[prev_stage['track_status']=='G']['last_laptime'])
                rec['laptime_green_mean_prev'] = np.mean(green_laps)
                rec['laptime_green_std_prev'] =  np.std(green_laps)
                all_laps = list(prev_stage['last_laptime'])
                rec['laptime_mean_prev'] = np.mean(all_laps)
                rec['laptime_std_prev'] =  np.std(all_laps)
                rec['laps_prev'] = len(all_laps)
                
                last_pit_lap = int(prev_stage['last_pitted_lap'].values[-1], 16) 
                rec['laps_after_last_pitstop'] = rec['start_lap'] - last_pit_lap
                
                #get two laps
                pit_data = rankdata[(rankdata['car_number']== rec['car_number']) & 
                               (rankdata['completed_laps']>=last_pit_lap) &
                               (rankdata['completed_laps']<last_pit_lap+2)
                              ]
                rec['pittime_prev'] = np.mean(list(pit_data['last_laptime']))
                
                # all previous laps laptime
                all_data = rankdata[(rankdata['car_number']== rec['car_number']) & 
                               (rankdata['completed_laps']<rec['start_lap'])
                              ]
                green_laps = list(all_data[all_data['track_status']=='G']['last_laptime'])
                rec['laptime_green_mean_all'] = np.mean(green_laps)
                rec['laptime_green_std_all'] =  np.std(green_laps)
                all_laps = list(all_data['last_laptime'])
                rec['laptime_mean_all'] = np.mean(all_laps)
                rec['laptime_std_all'] =  np.std(all_laps)                
            
            #add one record
            records.append([rec[col] for col in cols])
            
            #update for the new stage
            lapdata = rankdata[(rankdata['car_number']== rec['car_number']) & 
                               (rankdata['completed_laps']>=row[datacolid['start_lap']]) &
                               (rankdata['completed_laps']<row[datacolid['end_lap']])
                              ]
            
            # some pit stops are gapless
            if len(lapdata)==0:
                print('Caution:zero pits at carno:', rec['car_number'],
                     'start_lap:',row[datacolid['start_lap']],
                     'end_lap:',row[datacolid['end_lap']])
                continue
                
            laphist.append(lapdata)
            rankhist.extend(list(lapdata['rank']))
            rankchghist.append(row[datacolid['end_rank']] - row[datacolid['start_rank']])
            
            # add to index, go through the lapdata
            #or _lap in range(row[datacolid['start_lap']],row[datacolid['end_lap']]):
            for index, row in lapdata.iterrows():
                #carindex[eventid][rec['car_number'], row['completed_laps'], 0] = row['rank']
                #carindex[eventid][rec['car_number'], row['completed_laps'], 1 = rec['stageid']
                laprec = [rec['car_number'], row['completed_laps'], row['rank'],rec['stageid']]
                df = pd.DataFrame([laprec],columns=idxcols)
                carindex[eventid] = pd.concat([carindex[eventid], df], ignore_index=True)
            
          
        # push this event into stage dataframe
        print('eventid=%d, records=%d'%(eventid, len(records)))
        df = pd.DataFrame(records,columns=cols)
        stagedf = pd.concat([stagedf, df], ignore_index=True)
        
        #for the next new event
        eventid += 1
        
    return stagedf, carindex

def update_neighbor(stagedf, carindex):
    # go through stage data to add neighbor info
    # carindex[eventid] <car, lap, (rank, stage)
    ndf = stagedf.copy()
    
    #stagedf.index = pd.RangeIndex(len(stagedf.index))
        
    for index, row in stagedf.iterrows():
        
        eventid = row['eventid']
        
        currank = row['start_rank']
        curlap = row['start_lap']
        if curlap == 0:
            continue
        
        df = carindex[row['eventid']]
        df_lap = df[df['completed_laps']==curlap]
        prev0 = df_lap[df_lap['rank'] == currank - 1]
        prev1 = df_lap[df_lap['rank'] == currank - 2]
        prev2 = df_lap[df_lap['rank'] == currank - 3]
        follow0 = df_lap[df_lap['rank'] == currank + 1]
        follow1 = df_lap[df_lap['rank'] == currank + 2]
        follow2 = df_lap[df_lap['rank'] == currank + 3]
        
        recs = [prev0, prev1, prev2, follow0, follow1,follow2]
        recloc = ['prev_nb0_change_in_rank','prev_nb1_change_in_rank','prev_nb2_change_in_rank',
                 'follow_nb0_change_in_rank','follow_nb1_change_in_rank','follow_nb2_change_in_rank']
        for recidx, rec in enumerate(recs):
            if rec.empty:
                continue
                
            car_number = rec['car_number']
            stageid = rec['stageid']
            #print('event:%d, car:%d, stage:%d, index:%d'%(eventid, car_number, stageid, index))
            #find the record of prev0 by eventid, carno, stageid
            #neighbor = stagedf[(stagedf['eventid'] == eventid) & (stagedf['car_number'] == car_number) & (stagedf['stageid'] == stageid)]
            #neighbor = ndf[(ndf['eventid'] == eventid) & (ndf['car_number'] == car_number) & (ndf['stageid'] == stageid)]
            neighbor = ndf.query('eventid == %d & car_number==%d & stageid==%d'%(eventid,car_number,stageid))
            
            if neighbor.empty:
                print('error: empty neighbor at: '%(eventid,car_number,stageid))
                continue
            
            # update current row
            #print('index=%d, col=%s'%(index, recloc[recidx]))
            stagedf.loc[index,recloc[recidx]] = neighbor['change_in_rank'].values[0]
            #stagedf.loc[index,recloc[recidx]] = 1
                                
    return stagedf

### load data

In [9]:
stagedata = {}
global_carids = {}
traindata = None
cur_carid = 0
years = ['2013','2014','2015','2016','2017','2018','2019']
#events = ['Indy500']
events = [f'Indy500-{x}' for x in years]
events_id={key:idx for idx, key in enumerate(events)}
dbid = f'Indy500_{years[0]}_{years[-1]}'

In [10]:
traindata = None
_trim = 0
_include_final = False
include_str = '1' if _include_final else '0'
output_file = f'stage-indy500-2013-2019-end{include_str}-t{_trim}.csv'

for event in events:
    #alldata, rankdata, acldata, flagdata
    stagedata[event] = load_data(event)
    
    alldata, rankdata, acldata, flagdata = stagedata[event]
    carlist = set(acldata['car_number'])
    laplist = set(acldata['completed_laps'])
    print('%s: carno=%d, lapnum=%d'%(event, len(carlist), len(laplist)))
    
    stage, data = get_stagedata(TRIM=_trim, include_final = _include_final)
    stagedata[event] = (alldata, rankdata, acldata, flagdata, stage, data)
    
    if traindata is None:
        traindata = data
    else:
        traindata = np.vstack((traindata, data))

count of completed cars: 19
completed cars: [11 26  1 25 19  3  2 77 83 20 22  8 14  9 18 55 78  5 12]
cars: {1, 2, 3, 5, 8, 9, 11, 12, 77, 14, 78, 18, 19, 20, 83, 22, 55, 25, 26}
#cars= 19
count of completed cars: 19
completed cars: [11 26  1 25 19  3  2 77 83 20 22  8 14  9 18 55 78  5 12]
cars: {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 18, 19, 20, 21, 22, 25, 26, 27, 41, 55, 60, 63, 77, 78, 81, 83, 91, 98}
#cars= 33
Indy500-2013: carno=33, lapnum=201
crash?:carno= 6


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


count of completed cars: 20
completed cars: [28  3 25 34  2 26 11 12 22 21 16 77 68  5 17 33 18  8 14 98]
cars: {33, 2, 3, 34, 5, 68, 98, 8, 11, 12, 77, 14, 16, 17, 18, 21, 22, 25, 26, 28}
#cars= 20
count of completed cars: 20
completed cars: [28  3 25 34  2 26 11 12 22 21 16 77 68  5 17 33 18  8 14 98]
cars: {2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 26, 27, 28, 33, 34, 41, 63, 67, 68, 77, 83, 91, 98}
#cars= 33
Indy500-2014: carno=33, lapnum=201
crash?:carno= 9
crash?:carno= 10
crash?:carno= 67
crash?:carno= 83
count of completed cars: 20
completed cars: [ 2  1 83  9 15 27  3  6 21 22 11  5 14 24 28 98 48  7 29 26]
cars: {1, 2, 3, 98, 5, 6, 7, 9, 11, 14, 15, 48, 83, 21, 22, 24, 26, 27, 28, 29}
#cars= 20
count of completed cars: 20
completed cars: [ 2  1 83  9 15 27  3  6 21 22 11  5 14 24 28 98 48  7 29 26]
cars: {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 32, 41, 43, 48, 63, 83, 88, 98}
#cars= 33
Indy500-2

In [11]:
stagedf0, carindex = extract_features(stagedata)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


eventid=0, records=198
eventid=1, records=228
eventid=2, records=198
eventid=3, records=259
eventid=4, records=248
eventid=5, records=192
eventid=6, records=217


In [12]:
%debug

ERROR:root:No traceback has been produced, nothing to debug.


In [13]:
stagedf0[(stagedf0['car_number']==12) & (stagedf0['eventid']==5)]

Unnamed: 0,target,start_lap,eventid,car_number,stageid,firststage,pit_in_caution,start_position,start_rank,start_rank_ratio,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
1171,1,0,5,12,0,0,0,3,3,0.090909,...,0.0,0,0,0.0,0,0,0,0,0,0
1172,2,32,5,12,1,1,0,3,4,0.121212,...,8.527139,32,32,66.10705,0,0,0,0,0,0
1173,-5,50,5,12,2,1,1,3,6,0.181818,...,11.242669,18,18,61.85245,0,0,0,0,0,0
1174,1,94,5,12,3,1,0,3,1,0.030303,...,23.296174,44,44,117.01525,0,0,0,0,0,0
1175,6,129,5,12,4,1,0,3,2,0.060606,...,20.554273,35,35,59.24225,0,0,0,0,0,0


In [14]:
#stagedf0.reset_index(inplace=True)
stagedf = update_neighbor(stagedf0, carindex) 

In [15]:
stagedf = stagedf.drop(columns=['start_lap'])

In [16]:
stagedf = stagedf[stagedf['firststage']!=0]
#suffix='withneighbor-newfeatures'
#stagedf.to_csv('stage-%s-%s-t%s.csv'%('indy500-2013-2019',suffix, _trim))
stagedf.to_csv(output_file )

In [22]:
stagedf[(stagedf['car_number']==12) & (stagedf['eventid']==5)]

Unnamed: 0,target,eventid,car_number,stageid,firststage,pit_in_caution,start_position,start_rank,start_rank_ratio,top_pack,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
1172,2,5,12,1,1,0,3,4,0.121212,1,...,8.527139,32,32,66.10705,-1,0,0,0,-3,-5
1173,-5,5,12,2,1,1,3,6,0.181818,0,...,11.242669,18,18,61.85245,-6,-5,0,6,3,3
1174,1,5,12,3,1,0,3,1,0.030303,1,...,23.296174,44,44,117.01525,0,0,0,-3,3,3
1175,6,5,12,4,1,0,3,2,0.060606,1,...,20.554273,35,35,59.24225,-3,0,0,2,1,0


In [18]:
newfeatures=['laptime_green_mean_prev','laptime_green_std_prev','laptime_green_mean_all','laptime_green_std_all', 
 'laptime_mean_prev','laptime_std_prev','laptime_mean_all','laptime_std_all', 
 'laps_prev','laps_after_last_pitstop','pittime_prev'] 
            
stagedf[(stagedf['eventid']==1)][newfeatures].head(10)

Unnamed: 0,laptime_green_mean_prev,laptime_green_std_prev,laptime_green_mean_all,laptime_green_std_all,laptime_mean_prev,laptime_std_prev,laptime_mean_all,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev
199,42.799075,6.887996,42.799075,6.887996,42.799075,6.887996,42.799075,6.887996,32,32,62.1056
200,42.6569,4.608618,42.725833,5.826656,42.6569,4.608618,42.725833,5.826656,34,34,60.258
201,42.123312,5.005922,42.524993,5.573782,42.123312,5.005922,42.524993,5.573782,33,33,61.043
202,42.042779,4.816449,42.404439,5.398465,42.042779,4.816449,42.404439,5.398465,33,33,60.3402
203,59.82255,4.67295,42.664411,5.787487,59.82255,4.67295,42.664411,5.787487,2,2,59.82255
204,42.562587,4.567612,42.65416,5.676642,48.205742,14.22479,43.35255,7.602824,19,19,54.07815
205,44.613955,10.457252,42.788896,6.145962,56.382071,20.81261,44.655502,10.517522,17,17,90.7861
207,42.923377,7.904836,42.923377,7.904836,42.923377,7.904836,42.923377,7.904836,31,31,64.50285
208,42.825834,5.030119,42.873832,6.603145,42.825834,5.030119,42.873832,6.603145,32,32,60.5877
209,42.312683,5.351519,42.696948,6.241232,42.312683,5.351519,42.696948,6.241232,29,29,60.42045


In [19]:
np.max(stagedf)

target                         30.000000
eventid                         6.000000
car_number                     98.000000
stageid                        13.000000
firststage                      1.000000
pit_in_caution                  1.000000
start_position                 33.000000
start_rank                     33.000000
start_rank_ratio                1.000000
top_pack                        1.000000
bottom_pack                     1.000000
average_rank                   33.000000
average_rank_all               33.000000
change_in_rank                 30.000000
change_in_rank_all             22.000000
rate_of_change                 45.000000
rate_of_change_all             16.000000
laptime_green_mean_prev       126.274750
laptime_green_std_prev        135.762342
laptime_green_mean_all         79.312200
laptime_green_std_all          77.265386
laptime_mean_prev            1311.406600
laptime_std_prev              589.369900
laptime_mean_all              168.974933
laptime_std_all 

In [20]:
stagedf[stagedf['pittime_prev']>=3644][newfeatures]

Unnamed: 0,laptime_green_mean_prev,laptime_green_std_prev,laptime_green_mean_all,laptime_green_std_all,laptime_mean_prev,laptime_std_prev,laptime_mean_all,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev


In [21]:
chkdata = stagedata['Pocono'][1]

KeyError: 'Pocono'

In [None]:
chkdata[(chkdata['car_number']==27) &(chkdata['lap_status']=='P')]

In [None]:
chkdata[(chkdata['car_number']==27) &(chkdata['completed_laps']<15)]

In [None]:
#check the integrity of the data(laptime)
#for event in stagedata.keys():
    alldata, rankdata, acldata, flagdata, stage, data = stagedata[event]
    
chkdata = stagedata['Pocono'][2]
chkdata.head(10)

### verify pitstop dataset

In [24]:
alldata, rankdata, acldata, flagdata,_,_ = stagedata['Indy500-2018']
rankdata[(rankdata['car_number']==12) & (rankdata['lap_status']=='P')]

Unnamed: 0_level_0,rank,car_number,unique_id,completed_laps,elapsed_time,last_laptime,lap_status,best_laptime,best_lap,time_behind_leader,...,time_behind_prec,laps_behind_prec,overall_rank,overall_best_laptime,current_status,track_status,pit_stop_count,last_pitted_lap,start_position,laps_led
MyIdx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2836,4,12,2,32,1345.0213,56.1652,P,40.7903,5,12.3354,...,2.4475,0,4,40.7903,Active,G,1,20,3,0
4801,6,12,2,50,2264.9697,116.1604,P,40.359,24,2.6656,...,0.3967,0,6,40.359,Active,Y,2,32,3,0
8594,1,12,2,94,4973.1373,52.4746,P,40.359,24,0.0,...,0.0,0,1,40.359,Active,G,3,5E,3,3
11961,2,12,2,129,6472.9332,53.4746,P,40.359,24,6.7402,...,6.7402,0,2,40.359,Active,G,4,81,3,18
15912,8,12,2,171,9297.102,52.4221,P,40.359,24,10.4272,...,2.7997,0,8,40.359,Active,G,5,AB,3,36


In [28]:
pitlaps = [31,49,93,128]
rankdata[(rankdata['car_number']==12) & (rankdata['completed_laps'].isin(pitlaps))]

Unnamed: 0_level_0,rank,car_number,unique_id,completed_laps,elapsed_time,last_laptime,lap_status,best_laptime,best_lap,time_behind_leader,...,time_behind_prec,laps_behind_prec,overall_rank,overall_best_laptime,current_status,track_status,pit_stop_count,last_pitted_lap,start_position,laps_led
MyIdx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2700,2,12,2,31,1288.8561,45.6879,T,40.7903,5,1.6026,...,1.6026,0,2,40.7903,Active,G,0,0,3,0
4708,6,12,2,49,2148.8093,97.1485,T,40.359,24,6.5879,...,2.0162,0,6,40.359,Active,Y,1,20,3,0
8460,1,12,2,93,4920.6627,40.7131,T,40.359,24,0.0,...,0.0,0,1,40.359,Active,G,2,32,3,2
11817,1,12,2,128,6419.4586,41.4755,T,40.359,24,0.0,...,0.0,0,1,40.359,Active,G,3,5E,3,18


In [29]:
rankdata[(rankdata['completed_laps']==31)]

Unnamed: 0_level_0,rank,car_number,unique_id,completed_laps,elapsed_time,last_laptime,lap_status,best_laptime,best_lap,time_behind_leader,...,time_behind_prec,laps_behind_prec,overall_rank,overall_best_laptime,current_status,track_status,pit_stop_count,last_pitted_lap,start_position,laps_led
MyIdx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2673,1,1,3,31,1287.2535,44.6091,T,41.0717,5,0.0,...,0.0,0,1,41.0717,Active,G,0,0,4,1
2700,2,12,2,31,1288.8561,45.6879,T,40.7903,5,1.6026,...,1.6026,0,2,40.7903,Active,G,0,0,3,0
2709,3,21,5,31,1289.8932,43.3229,T,41.0124,3,2.6397,...,1.0371,0,3,41.0124,Active,G,0,0,6,0
2710,4,18,4,31,1290.0753,44.7068,T,40.9646,2,2.8218,...,0.1821,0,4,40.9646,Active,G,0,0,5,0
2715,5,9,8,31,1292.1429,42.2613,T,41.1498,4,4.8894,...,2.0676,0,5,41.1498,Active,G,0,0,9,0
2720,6,98,B,31,1293.2371,41.5717,T,41.0183,4,5.9836,...,1.0942,0,6,41.0183,Active,G,0,0,C,0
2725,7,28,D,31,1294.3617,41.5736,T,41.1959,2,7.1082,...,1.1246,0,7,41.1959,Active,G,0,0,E,0
2732,8,20,0,31,1299.0977,57.0559,P,40.7746,2,11.8442,...,4.736,0,8,40.7746,Active,G,1,1F,1,1E
2734,9,22,1,31,1300.225,57.5418,P,40.6872,3,12.9715,...,1.1273,0,9,40.6872,Active,G,1,1F,2,0
2735,10,4,A,31,1300.8662,41.2958,T,41.2892,4,13.6127,...,0.6412,0,A,41.2892,Active,G,0,0,B,0
