## Add new features to rankdb

base:   
  22.PaperFinal/RankNet-QuickTest.ipynb  
  22.PaperFinal/RankNet-makedb-rankmodel.ipynb
    
new features:
    
    1. LeaderPitCnt   ;  how many cars ranked ahead of a car make a pit stop on the same lap
    2. shifted LeaderPitCnt   ; compute LeaderPitCnt from the rank order observed prediction_len laps earlier
    [todo] 3. dynamic LeaderPitCnt   ; update it for the next prediction_len laps using the current rank order

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os,sys
import random
import mxnet as mx
from mxnet import gluon
import pickle
import json
from gluonts.dataset.common import ListDataset
from gluonts.dataset.util import to_pandas
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

import ipdb; 

In [9]:
def nan_helper(y):
    """Locate the NaN entries of a 1-d array.

    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - a boolean mask that is True where ``y`` is NaN
        - a function that turns such a boolean mask into the integer
          positions of its True entries
    Example:
        >>> # linear interpolation of NaNs
        >>> nans, x = nan_helper(y)
        >>> y[nans] = np.interp(x(nans), x(~nans), y[~nans])
    """

    def _true_positions(mask):
        # nonzero() returns one index array per axis; keep the first axis
        return mask.nonzero()[0]

    nan_mask = np.isnan(y)
    return nan_mask, _true_positions

def test_flag(a, bitflag):
    return (a & bitflag) ==  bitflag

#
# remove NaN at the tail
# there should be no nans in the middle of the ts
#
# Row indices into a per-car feature matrix rec[feature, lap]
# (equivalently axis 1 of the per-event array [car, feature, lap]).
COL_LAPTIME=0
COL_RANK=1
COL_TRACKSTATUS=2
COL_LAPSTATUS=3
COL_TIMEDIFF=4
COL_CAUTION_LAPS_INSTINT=5
COL_LAPS_INSTINT= 6
COL_ELAPSED_TIME= 7
COL_LAP2NEXTPIT = 8
#_featureCnt = 9

# added new features
# (rows 9..14 are appended, in this fixed order, by prepare_laptimedata)
COL_LEADER_PITCNT = 9
COL_TOTAL_PITCNT = 10
COL_SHIFT_TRACKSTATUS = 11
COL_SHIFT_LAPSTATUS = 12
COL_SHIFT_LEADER_PITCNT = 13
COL_SHIFT_TOTAL_PITCNT = 14

# index of the last feature row; a matrix with fewer than
# COL_LASTFEATURE + 1 rows is treated as "not yet extended"
COL_LASTFEATURE = 14
# dynamically extended space in simulation
# NOTE(review): the *_SAVE rows are not referenced in this part of the
# file; presumably a simulator uses them as scratch space -- confirm.
COL_TRACKSTATUS_SAVE = COL_LASTFEATURE+1
COL_LAPSTATUS_SAVE = COL_LASTFEATURE+2
COL_CAUTION_LAPS_INSTINT_SAVE = COL_LASTFEATURE+3
COL_LAPS_INSTINT_SAVE= COL_LASTFEATURE+4

COL_ENDPOS = COL_LASTFEATURE+5


# Bit flags combined (bitwise OR) into a feature_mode and tested with
# test_flag(); each flag selects feature rows in get_real_features().
FEATURE_STATUS = 2
FEATURE_PITAGE = 4
FEATURE_LEADER_PITCNT = 8
FEATURE_TOTAL_PITCNT = 16
FEATURE_SHIFT_TRACKSTATUS = 32
FEATURE_SHIFT_LAPSTATUS = 64
FEATURE_SHIFT_LEADER_PITCNT = 128
FEATURE_SHIFT_TOTAL_PITCNT  = 256

# flag -> (long name, one-letter code) used by decode_feature_mode().
# NOTE(review): 'L' and 'T' are each used twice (plain and shifted
# variants); the short string stays unambiguous only because every flag
# has a fixed position in it.
_feature2str= {
    FEATURE_STATUS : ("FEATURE_STATUS",'S'),
    FEATURE_PITAGE : ("FEATURE_PITAGE",'A'),
    FEATURE_LEADER_PITCNT : ("FEATURE_LEADER_PITCNT",'L'),
    FEATURE_TOTAL_PITCNT : ("FEATURE_TOTAL_PITCNT",'T'),
    FEATURE_SHIFT_TRACKSTATUS : ("FEATURE_SHIFT_TRACKSTATUS",'Y'),
    FEATURE_SHIFT_LAPSTATUS : ("FEATURE_SHIFT_LAPSTATUS",'P'),
    FEATURE_SHIFT_LEADER_PITCNT : ("FEATURE_SHIFT_LEADER_PITCNT",'L'),
    FEATURE_SHIFT_TOTAL_PITCNT  : ("FEATURE_SHIFT_TOTAL_PITCNT",'T')
    }


# Bit flags for the oracle_mode argument of make_dataset_byevent();
# MODE_NOLAP / MODE_NOTRACK zero the lap / track status covariates there.
# MODE_ORACLE is 0, i.e. "no flag set".
MODE_ORACLE = 0
MODE_NOLAP = 1
MODE_NOTRACK = 2
MODE_TESTZERO = 4
MODE_TESTCURTRACK = 8
#MODE_STR={MODE_ORACLE:'oracle', MODE_NOLAP:'nolap',MODE_NOTRACK:'notrack',MODE_TEST:'test'}

#_feature_mode = FEATURE_STATUS
def decode_feature_mode(feature_mode):
    """Describe a feature_mode bit mask.

    Prints the long names of the enabled flags (space separated) and
    returns a fixed-width string with one character per entry of
    ``_feature2str``: the flag's short code when it is enabled, '0'
    otherwise.
    """
    entries = list(_feature2str.items())
    enabled = [test_flag(feature_mode, flag) for flag, _ in entries]

    long_names = [names[0] for (_, names), on in zip(entries, enabled) if on]
    short_codes = [names[1] if on else '0'
                   for (_, names), on in zip(entries, enabled)]

    print(' '.join(long_names))

    return ''.join(short_codes)


def add_leader_cnt(selmat, rank_col=COL_RANK, pit_col=COL_LAPSTATUS, shift_len = 0, 
                   dest_col = COL_LEADER_PITCNT,
                   verbose = False):
    """
    Add the "leader pit count" feature to mat(car, feature, lap).

    For every lap the cars are ordered by ``rank_col`` as observed
    ``shift_len`` laps earlier; the pit flags (``pit_col``) of the current
    lap are gathered in that order and each position receives the number
    of flags set by the positions ahead of it (exclusive cumulative sum,
    NaNs counted as 0).

    input:
        selmat    : laptime_data array [car, feature, lap]
        rank_col  : feature row holding the rank
        pit_col   : feature row holding the pit flag
        shift_len : how many laps back the rank order is taken from;
                    the first ``shift_len`` laps get a count of 0
        dest_col  : feature row to write to (``selmat`` is modified in
                    place and returned), or -1 to append a new row to a
                    copy of ``selmat``
        verbose   : print intermediate arrays (laps 190 and later)
    output:
        the matrix carrying the new feature; for each car the trailing
        laps are set to NaN, as many as there are NaNs in ``rank_col``
    """
    dim1, dim2, dim3 = selmat.shape

    # order[k, lap] = index of the car at sorted position k on that lap
    # (np.argsort places NaN ranks last)
    order = np.argsort(selmat[:, rank_col, :], axis=0)

    # pit flags of each lap, arranged by the rank order of lap - shift_len
    pits = np.zeros((dim1, dim3))
    for lap in range(shift_len, dim3):
        pits[:, lap] = selmat[order[:, lap - shift_len], pit_col, lap]

    # exclusive cumulative sum down the sorted positions
    leaderCnt = np.nancumsum(pits, axis=0) - pits

    if verbose:
        print('pits:\n')
        print(pits[:,190:])
        print('leaderCnt raw:\n')
        print(leaderCnt[:,190:])

    # a NaN pit flag makes "cumsum - pits" NaN at that position; use 0
    leaderCnt[np.isnan(leaderCnt)] = 0

    if verbose:
        print('leaderCnt after remove nan:\n')
        print(leaderCnt[:,190:])

    if dest_col == -1:
        # create mode: append one feature row to a copy
        newmat = np.zeros((dim1, dim2 + 1, dim3))
        dest_col = dim2
        newmat[:, :dim2, :] = selmat.copy()
    else:
        # update mode: write into the caller's array
        newmat = selmat

    # scatter the per-position counts back to the cars.
    # NOTE(review): the counts were gathered with the order of
    # lap - shift_len but are scattered with the order of the current lap;
    # confirm this is intended when shift_len > 0.
    for lap in range(dim3):
        newmat[order[:, lap], dest_col, lap] = leaderCnt[:, lap]

    # sync length to rank_col: blank as many trailing laps as the rank
    # row has NaNs
    for rec in newmat:
        nan_count = int(np.isnan(rec[rank_col, :]).sum())
        if nan_count > 0:
            rec[dest_col, -nan_count:] = np.nan

    return newmat

def add_allpit_cnt(selmat, rank_col=COL_RANK, pit_col=COL_LAPSTATUS, 
                   dest_col = COL_TOTAL_PITCNT,verbose = False):
    """
    Add the "total pit count" feature to mat(car, feature, lap).

    The feature is, for every lap, the NaN-ignoring sum of ``pit_col``
    over all cars; every car receives the same per-lap value.

    input:
        selmat   : laptime_data array [car, feature, lap]
        rank_col : feature row holding the rank (used only to find how
                   many trailing laps to blank per car)
        pit_col  : feature row holding the pit flag
        dest_col : feature row to write to (``selmat`` is modified in
                   place and returned), or -1 to append a new row to a
                   copy of ``selmat``
        verbose  : print intermediate arrays (laps 190 and later)
    output:
        the matrix carrying the new feature; for each car the trailing
        laps are set to NaN, as many as there are NaNs in ``rank_col``
    """
    dim1, dim2, dim3 = selmat.shape

    # per-lap total over cars, shape (lap,)
    totalCnt = np.nansum(selmat[:, pit_col, :], axis=0).reshape(-1)

    if verbose:
        print('pits:\n')
        # the original printed an undefined name here (NameError);
        # show the pit flags the sum was computed from
        print(selmat[:, pit_col, 190:])
        print('totalCnt raw:\n')
        print(totalCnt[190:])

    #remove nans
    totalCnt[np.isnan(totalCnt)] = 0

    if verbose:
        print('totalCnt after remove nan:\n')
        print(totalCnt[190:])

    if dest_col == -1:
        # create mode: append one feature row to a copy
        newmat = np.zeros((dim1, dim2 + 1, dim3))
        dest_col = dim2
        newmat[:, :dim2, :] = selmat.copy()
    else:
        # update mode: write into the caller's array
        newmat = selmat

    # same per-lap totals for every car (broadcast over the car axis)
    newmat[:, dest_col, :] = totalCnt

    # sync length to rank_col: blank as many trailing laps as the rank
    # row has NaNs
    for rec in newmat:
        nan_count = int(np.isnan(rec[rank_col, :]).sum())
        if nan_count > 0:
            rec[dest_col, -nan_count:] = np.nan

    return newmat

def add_shift_feature(selmat, rank_col=COL_RANK, shift_col=COL_LAPSTATUS, shift_len = 2, 
                      dest_col = -1,verbose = False):
    """
    Add a left-shifted copy of one feature to mat(car, feature, lap).

    For each car, the values of ``shift_col`` on the laps where
    ``rank_col`` is not NaN are moved ``shift_len`` laps earlier, so lap
    ``t`` of the new feature holds the source value of lap
    ``t + shift_len``. The last ``shift_len`` valid laps are filled with
    0 and every lap after the valid ones is NaN.

    warning: these are oracle features, be careful not to let future rank positions leaking

    input:
        selmat    : laptime_data array [car, feature, lap]
        rank_col  : feature row whose non-NaN laps define the valid laps
        shift_col : feature row to shift
        shift_len : number of laps to shift by
        dest_col  : feature row to write to (``selmat`` is modified in
                    place and returned), or -1 to append a new row to a
                    copy of ``selmat``
        verbose   : unused, kept for a signature parallel to the other
                    add_* helpers
    output:
        the matrix carrying the new feature
    """
    dim1, dim2, dim3 = selmat.shape

    if dest_col == -1:
        # create mode: append one feature row to a copy
        newmat = np.zeros((dim1, dim2 + 1, dim3))
        dest_col = dim2
        newmat[:, :dim2, :] = selmat.copy()
    else:
        # update mode: write into the caller's array
        newmat = selmat

    for car in range(dim1):
        # read the source values first: in update mode newmat is selmat,
        # so blanking the destination row before reading could destroy
        # the data when dest_col coincides with shift_col or rank_col
        rec = selmat[car]
        recnnz = rec[shift_col, ~np.isnan(rec[rank_col, :])]
        reclen = len(recnnz)

        # set empty status by default
        newmat[car, dest_col, :] = np.nan

        # valid laps default to 0, then receive the shifted values
        newmat[car, dest_col, :reclen] = 0
        # with fewer valid laps than shift_len there is nothing to copy;
        # the unguarded slice ":reclen-shift_len" would turn negative and
        # raise a broadcast ValueError
        if reclen > shift_len:
            newmat[car, dest_col, :reclen - shift_len] = recnnz[shift_len:]

    return newmat


def prepare_laptimedata(prediction_length, freq, 
                       test_event = 'Indy500-2018',
                       train_ratio=0.8,
                       context_ratio = 0.,
                       shift_len = -1):
    """
    prepare the laptime data for training

    1. remove short ts
    2. rerank the tss
    3. create new features

    Reads the module-level globals ``laptime_data``, ``events``,
    ``events_id``, ``_train_len`` and ``_test_train_len``.

    input: 
        laptime_data   ; global var, list of
                         [eventid, carids, array[car, feature, lap]]
        prediction_length ; forecast horizon, also the default shift
        freq           ; unused here, kept for a signature parallel to
                         make_dataset_byevent
        test_event     ; events after this one are not processed
        train_ratio    ; fraction of the longest series used as the
                         minimum training length (0 -> global lengths)
        context_ratio  ; only used for the informational print
        shift_len      ; shift for the shifted features; negative means
                         use prediction_length
    output:
        data  ; new representation of laptime_data, list of
                [eventid, {new_rowid: carno}, array[car, feature, lap]]

    NOTE: ``laptime_data.copy()`` is a shallow copy, so the rank row of
    the selected cars is rewritten inside the caller's arrays.
    """
    _laptime_data = laptime_data.copy()

    test_eventid = events_id[test_event]
    run_ts = COL_RANK

    # check shift len
    if shift_len < 0:
        shift_len = prediction_length
    print('prepare_laptimedata shift len:', shift_len)

    #_data: eventid, carids, datalist[carnumbers, features, lapnumber]->[laptime, rank, track, lap]]
    new_data = []
    for _data in _laptime_data:
        #skip eid > test_eventid (stops at the first later event)
        if _data[0] > test_eventid:
            print('skip this event:', events[_data[0]])
            break

        test_mode = events[_data[0]] == test_event

        #statistics on the ts length
        ts_len = [ _entry.shape[1] for _entry in _data[2]]
        train_len = int(np.max(ts_len) * train_ratio)
        if train_len == 0:
            #use global train_len
            train_len = _train_len if not test_mode else _test_train_len

        if context_ratio != 0.:
            # add this part to train set
            context_len = int(np.max(ts_len) * context_ratio)
        else:    
            context_len = prediction_length*2
        if context_len < 10:
            context_len = 10

        print(f'before ====event:{events[_data[0]]}, prediction_len={prediction_length},train_len={train_len}, max_len={np.max(ts_len)}, min_len={np.min(ts_len)},context_len={context_len}')

        # use to check the dimension of features
        input_feature_cnt = _data[2].shape[1]
        update_mode = input_feature_cnt >= COL_LASTFEATURE + 1
        if not update_mode:
            print('create new features mode, feature_cnt:', input_feature_cnt)
        else:
            print('update features mode, feature_cnt:', input_feature_cnt)

        # keep only the cars whose valid (non-NaN) series is long enough
        sel_rows = []
        for rowid in range(_data[2].shape[0]):
            # rec[features, lapnumber] -> [laptime, rank, track_status, lap_status,timediff]]
            totallen = int(np.sum(~np.isnan(_data[2][rowid][run_ts, :])))
            if totallen < train_len + prediction_length:
                print(f'rerank a short ts: carid={_data[1][rowid]}，len={totallen}')
                continue
            sel_rows.append(rowid)

        #get selected matrix (integer dtype so an empty selection still
        # works as an index array)
        sel_idx = np.array(sel_rows, dtype=int)
        selmat = _data[2][sel_idx]

        #rerank due to short ts removed
        mask = np.isnan(selmat[:,COL_RANK,:])

        idx = np.argsort(selmat[:,COL_RANK,:], axis=0)
        # the builtin float replaces np.float, which NumPy 1.24 removed
        true_rank = np.argsort(idx, axis=0).astype(float)
        true_rank[mask] = np.nan

        if test_mode:
            #
            # for historical code mismatch, simulation does not run rerank
            #
            _data[2][sel_idx,COL_RANK,:] = true_rank + 1
        else:
            _data[2][sel_idx,COL_RANK,:] = true_rank

        # update the carno dict: new row id -> car number
        new_carids = {}
        for rowid, carid in enumerate(sel_idx):
            new_carids[rowid] = _data[1][carid]

        # debug prints only for the first event
        verbose = _data[0] == 0

        def _dest(col):
            # update an existing row, or append a new one (-1)
            return col if update_mode else -1

        # add new features
        # add leaderPitCnt
        data2_intermediate = add_leader_cnt(_data[2][sel_idx], shift_len = shift_len,
                                            dest_col=_dest(COL_LEADER_PITCNT), verbose = verbose)

        # add totalPit
        data2_intermediate = add_allpit_cnt(data2_intermediate,
                                            dest_col=_dest(COL_TOTAL_PITCNT))

        #
        # add shift features, a fixed order, see the MACROS 
        #COL_SHIFT_TRACKSTATUS = 11
        #COL_SHIFT_LAPSTATUS = 12
        #COL_SHIFT_LEADER_PITCNT = 13
        #COL_SHIFT_TOTAL_PITCNT = 14
        #
        for shift_col, target_col in ((COL_TRACKSTATUS, COL_SHIFT_TRACKSTATUS),
                                      (COL_LAPSTATUS, COL_SHIFT_LAPSTATUS),
                                      (COL_LEADER_PITCNT, COL_SHIFT_LEADER_PITCNT),
                                      (COL_TOTAL_PITCNT, COL_SHIFT_TOTAL_PITCNT)):
            data2_intermediate = add_shift_feature(data2_intermediate,
                                                   dest_col=_dest(target_col),
                                                   shift_col=shift_col,
                                                   shift_len = shift_len)

        new_data.append([_data[0], new_carids, data2_intermediate])

    return new_data


def get_real_features(feature_mode, rec, endpos):
    """
    Build the list of real-valued covariate rows selected by feature_mode.

    input:
        feature_mode : bitwise OR of FEATURE_* flags
        rec          : feature matrix [feature, lap] of one car
        endpos       : number of leading laps to keep; a value <= 0
                       means all laps of ``rec``
    output:
        list of 1-d slices ``rec[row, :endpos]`` in this fixed order:
        track status and lap status (FEATURE_STATUS), laps in stint
        (FEATURE_PITAGE), leader pit count, total pit count, then the
        shifted track status, lap status, leader pit count and total
        pit count -- each present only when its flag is set
    """
    #check endpos
    if endpos <= 0:
        endpos = rec.shape[1]

    # flag -> feature rows it contributes, in output order
    flag_rows = (
        (FEATURE_STATUS, (COL_TRACKSTATUS, COL_LAPSTATUS)),
        (FEATURE_PITAGE, (COL_LAPS_INSTINT,)),
        (FEATURE_LEADER_PITCNT, (COL_LEADER_PITCNT,)),
        (FEATURE_TOTAL_PITCNT, (COL_TOTAL_PITCNT,)),
        (FEATURE_SHIFT_TRACKSTATUS, (COL_SHIFT_TRACKSTATUS,)),
        (FEATURE_SHIFT_LAPSTATUS, (COL_SHIFT_LAPSTATUS,)),
        (FEATURE_SHIFT_LEADER_PITCNT, (COL_SHIFT_LEADER_PITCNT,)),
        (FEATURE_SHIFT_TOTAL_PITCNT, (COL_SHIFT_TOTAL_PITCNT,)),
    )

    selected = []
    for flag, rows in flag_rows:
        if not test_flag(feature_mode, flag):
            continue
        selected.extend(rec[row, :endpos] for row in rows)

    return selected

def make_dataset_byevent(_laptime_data, prediction_length, freq, 
                       useeid = False,
                       run_ts=COL_LAPTIME, 
                       test_event = 'Indy500-2018',
                       use_global_dict = True,
                       oracle_mode = MODE_ORACLE,
                       half_moving_win = True,
                       train_ratio=0.8,
                       log_transform = False,
                       context_ratio = 0.,
                       dorerank = True,
                       test_cars = None
                ):
    """
    Build GluonTS train/test datasets from per-event lap data.

    Every series of a non-test event goes to the train set as a whole.
    For the test event, one test entry is emitted per end position
    (rolling back one lap at a time from the full length down to just
    above context_len + prediction_length); when context_ratio != 0 the
    first context_len laps are also added to the train set.

    Reads the module-level globals ``_feature_mode``, ``events``,
    ``events_id``, ``global_carids``, ``_train_len`` and
    ``_test_train_len``.

    input:
        _laptime_data  ; list of [eventid, carids, array[car, feature, lap]]
        prediction_length, freq ; forecast horizon and series frequency
        useeid         ; add the event id as a second static category
        run_ts         ; feature row used as the target
        test_event     ; name of the event that forms the test set
        use_global_dict; map car numbers through global_carids, otherwise
                         use the row index as car id
        oracle_mode    ; MODE_* flags; MODE_NOTRACK / MODE_NOLAP zero the
                         track / lap status covariates
        half_moving_win, dorerank ; unused, kept for interface
                         compatibility
        train_ratio, context_ratio ; fractions of the longest series used
                         for the minimum train length / context length
        log_transform  ; train targets become log(target + 1)
        test_cars      ; optional collection of car numbers to keep
                         (None or empty keeps all)
    output:
        train_ds, test_ds (ListDataset) and the underlying lists
        train_set, test_set
    """    
    #global setting
    feature_mode = _feature_mode

    # avoid a shared mutable default argument
    if test_cars is None:
        test_cars = []

    # NOTE(review): the ``freq`` argument of pd.Timestamp was removed in
    # pandas 2.0; kept because the GluonTS version used here is unknown.
    start = pd.Timestamp("01-01-2019", freq=freq)  # can be different for each time series

    train_set = []
    test_set = []

    totalTSCnt = 0
    totalTSLen = 0
    # not used below; the lookup fails fast on an unknown test_event
    test_eventid = events_id[test_event]

    #_data: eventid, carids, datalist[carnumbers, features, lapnumber]->[laptime, rank, track, lap]]
    for _data in _laptime_data:
        _train = []
        _test = []

        test_mode = events[_data[0]] == test_event

        #statistics on the ts length
        ts_len = [ _entry.shape[1] for _entry in _data[2]]
        train_len = int(np.max(ts_len) * train_ratio)
        if train_len == 0:
            #use global train_len
            train_len = _train_len if not test_mode else _test_train_len

        if context_ratio != 0.:
            # add this part to train set
            context_len = int(np.max(ts_len) * context_ratio)
        else:    
            context_len = prediction_length*2
        if context_len < 10:
            context_len = 10

        print(f'after ====event:{events[_data[0]]}, prediction_len={prediction_length},train_len={train_len}, max_len={np.max(ts_len)}, min_len={np.min(ts_len)},context_len={context_len}')

        # process for each ts
        for rowid in range(_data[2].shape[0]):
            # rec[features, lapnumber] -> [laptime, rank, track_status, lap_status,timediff]]
            rec = _data[2][rowid].copy()

            #remove nan(only tails)
            nans, x= nan_helper(rec[run_ts,:])
            nan_count = np.sum(nans)             
            rec = rec[:, ~nans]

            # remove short ts
            totallen = rec.shape[1]

            totalTSCnt += 1
            totalTSLen += totallen

            if ( totallen < train_len + prediction_length):
                print(f'a short ts: carid={_data[1][rowid]}，len={totallen}')
                continue                

            if use_global_dict:
                carno = _data[1][rowid]
                carid = global_carids[_data[1][rowid]]
            else:
                #simulation dataset, todo, fix the carids as decoder
                carno = rowid
                carid = rowid

            #check carno in test_cars, testmode only
            if len(test_cars)>0 and carno not in test_cars:
                continue

            if useeid:
                static_cat = [carid, _data[0]]    
            else:
                static_cat = [carid]    

            #first, get target a copy    
            # target can be COL_XXSTATUS
            target_val = rec[run_ts,:].copy().astype(np.float32)
            if log_transform:
                target_val = np.log(target_val + 1.0)

            # selection of features
            if test_flag(oracle_mode, MODE_NOTRACK):                
                rec[COL_TRACKSTATUS, :] = 0
            if test_flag(oracle_mode, MODE_NOLAP):                
                rec[COL_LAPSTATUS, :] = 0

            # full-length covariates; computed up front so the feature
            # count reported below is defined even when a test series
            # yields no window
            real_features = get_real_features(feature_mode, rec, -1)

            test_rec_cnt = 0
            if not test_mode:
                # all go to train set
                _train.append({'target': target_val, 
                            'start': start, 
                            'feat_static_cat': static_cat,
                            'feat_dynamic_real': real_features
                          })

            else:
                if context_ratio != 0.:
                    #add [0, context_len] to train set 
                    _train.append({'target': target_val[:context_len],  
                                'start': start, 
                                'feat_static_cat': static_cat,
                                'feat_dynamic_real': get_real_features(feature_mode, rec, context_len)
                              })

                # testset
                # one test ts per end position, moving back one lap at a time
                # NOTE(review): the test target is taken from rec, not from
                # target_val, so it ignores log_transform and is zeroed when
                # run_ts is a status row cleared by oracle_mode -- confirm
                # whether that is intended.
                for endpos in range(totallen, context_len+prediction_length, -1):
                    _test.append({'target': rec[run_ts,:endpos].astype(np.float32), 
                            'start': start, 
                            'feat_static_cat': static_cat,
                            'feat_dynamic_real': get_real_features(feature_mode, rec, endpos)
                             })

                    test_rec_cnt += 1

            #check feature cnt
            featureCnt = len(real_features)

            #add one ts
            print(f'carno:{carno}, totallen:{totallen}, nancount:{nan_count}, test_reccnt:{test_rec_cnt},featureCnt:{featureCnt}')

        train_set.extend(_train)
        test_set.extend(_test)

    print(f'train len:{len(train_set)}, test len:{len(test_set)}, totsl TsCnt:{totalTSCnt}, total ts len:{totalTSLen}')

    train_ds = ListDataset(train_set, freq=freq)
    test_ds = ListDataset(test_set, freq=freq)    

    return train_ds, test_ds, train_set, test_set

### Test Code

In [10]:
# inlap status (selects the input database via dbid and names the dataset):
# 0 , no inlap
# 1 , set previous lap
# 2 , set the next lap
_inlap_status = 0

#
# _feature_mode is a bitwise OR of FEATURE_* flags.
# 506 = 2+8+16+32+64+128+256, i.e. every flag except FEATURE_PITAGE (4).
# _featureCnt is the feature-row count of the input database (the "v9"
# part of dbid below).
#
_feature_mode = 506
_featureCnt = 9

#
# training parameters
#
freq = "1min"
_train_len = 40
prediction_length = 2

context_ratio = 0.
context_length =  40
contextlen = context_length

# the target series is the rank row
dataset='rank'
_run_ts = COL_RANK

_test_event = 'Indy500-2018'
year = '2018'

#
# string map
#
inlapstr = {0:'noinlap',1:'inlap',2:'outlap'}
weightstr = {True:'weighted',False:'noweighted'}
catestr = {True:'cate',False:'nocate'}
# decode_feature_mode also prints the long flag names as a side effect
cur_featurestr = decode_feature_mode(_feature_mode)
print('feature_mode:', _feature_mode, cur_featurestr)

#
# input data parameters
#
years = ['2013','2014','2015','2016','2017','2018','2019']
events = [f'Indy500-{x}' for x in years]
# event name -> event id (position in ``events``)
events_id={key:idx for idx, key in enumerate(events)}
dbid = f'Indy500_{years[0]}_{years[-1]}_v{_featureCnt}_p{_inlap_status}'
_dataset_id = '%s-%s'%(inlapstr[_inlap_status], cur_featurestr)

# standard output file names
LAPTIME_DATASET = f'laptime_rank_timediff_pit-oracle-{dbid}.pickle' 
STAGE_DATASET = f'stagedata-{dbid}.pickle' 
EVALUATION_RESULT_DF = f'evaluation_result_d{dataset}.csv'
LONG_FORECASTING_DFS = f'long_forecasting_dfs_d{dataset}.pickle'
FORECAST_FIGS_DIR = f'forecast-figs-d{dataset}/'   

# minimum train length used for the test event when train_ratio is 0
_test_train_len = 40

FEATURE_STATUS FEATURE_LEADER_PITCNT FEATURE_TOTAL_PITCNT FEATURE_SHIFT_TRACKSTATUS FEATURE_SHIFT_LAPSTATUS FEATURE_SHIFT_LEADER_PITCNT FEATURE_SHIFT_TOTAL_PITCNT
feature_mode: 506 S0LTYPLT


In [11]:
# input and output both live under outdir
outdir = 'test/'
outputRoot = outdir
os.makedirs(outdir, exist_ok=True)

_task_dir = f'{outdir}/'


# NOTE: pickle.load can execute arbitrary code; only load trusted files.
print('Load laptime and stage dataset:',outputRoot + LAPTIME_DATASET, outputRoot + STAGE_DATASET)
with open(outputRoot + LAPTIME_DATASET, 'rb') as f:
    global_carids, laptime_data = pickle.load(f, encoding='latin1') 
with open(outputRoot + STAGE_DATASET, 'rb') as f:
    stagedata = pickle.load(f, encoding='latin1') 


#
#dbname, train_ds, test_ds = makedbs()   
#
useeid = False
interpolate = False
#ipstr = '-ip' if interpolate else '-noip'
ipstr = '%s-%s'%('ip' if interpolate else 'noip', 'eid' if useeid else 'noeid')
dbname = _task_dir + f'gluontsdb-{dataset}-oracle-{ipstr}-all-all-f{freq}-t{prediction_length}-r{_test_event}-indy-{year}.pickle'


# cardinality of the static categorical features: car id, plus event id
# when useeid is set
if useeid:
    cardinality = [len(global_carids), len(laptime_data)]
else:
    cardinality = [len(global_carids)]


shift = 'shift'
laptimedb = _task_dir + f'gluontsdb-{dataset}-oracle-{ipstr}-all-all-f{freq}-t{prediction_length}-r{_test_event}-indy-{year}-{shift}.pickle'    

# first pass: prepare_laptimedata reads the global laptime_data (9 feature
# rows here, so the new feature rows are appended)
prepared_laptimedata = prepare_laptimedata(prediction_length, freq, test_event = _test_event,
                       train_ratio=0, context_ratio = 0., shift_len = prediction_length)


# second pass on the already extended data; with COL_LASTFEATURE + 1 rows
# present this is expected to take the "update features" branch
laptime_data = prepared_laptimedata
prepared_laptimedata = prepare_laptimedata(prediction_length, freq, test_event = _test_event,
                       train_ratio=0, context_ratio = 0., shift_len = prediction_length)


train_ds, test_ds,_,_ = make_dataset_byevent(prepared_laptimedata, prediction_length,freq,
                                     useeid=useeid, run_ts=_run_ts,
                                    test_event=_test_event, log_transform =False,
                                    context_ratio=0, train_ratio = 0, dorerank =True)


# save the GluonTS datasets and the extended lap data
with open(dbname, 'wb') as f:
    savedata = [freq, prediction_length, cardinality, train_ds, test_ds]
    pickle.dump(savedata, f, pickle.HIGHEST_PROTOCOL)

with open(laptimedb, 'wb') as f:
    pickle.dump(prepared_laptimedata, f, pickle.HIGHEST_PROTOCOL)
    

Load laptime and stage dataset: test/laptime_rank_timediff_pit-oracle-Indy500_2013_2019_v9_p0.pickle test/stagedata-Indy500_2013_2019_v9_p0.pickle
prepare_laptimedata shift len: 2
before ====event:Indy500-2013, prediction_len=2,train_len=40, max_len=200, min_len=200,context_len=10
create new features mode, feature_cnt: 9
rerank a short ts: carid=4，len=3
rerank a short ts: carid=6，len=34
pits:

[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0. nan nan  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.  1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 

carno:3, totallen:200, nancount:0, test_reccnt:0,featureCnt:8
carno:5, totallen:200, nancount:0, test_reccnt:0,featureCnt:8
carno:6, totallen:190, nancount:10, test_reccnt:0,featureCnt:8
carno:7, totallen:198, nancount:2, test_reccnt:0,featureCnt:8
carno:8, totallen:200, nancount:0, test_reccnt:0,featureCnt:8
carno:9, totallen:168, nancount:32, test_reccnt:0,featureCnt:8
carno:10, totallen:177, nancount:23, test_reccnt:0,featureCnt:8
carno:11, totallen:200, nancount:0, test_reccnt:0,featureCnt:8
carno:12, totallen:200, nancount:0, test_reccnt:0,featureCnt:8
carno:14, totallen:200, nancount:0, test_reccnt:0,featureCnt:8
carno:15, totallen:44, nancount:156, test_reccnt:0,featureCnt:8
carno:16, totallen:200, nancount:0, test_reccnt:0,featureCnt:8
carno:17, totallen:200, nancount:0, test_reccnt:0,featureCnt:8
carno:18, totallen:200, nancount:0, test_reccnt:0,featureCnt:8
carno:19, totallen:198, nancount:2, test_reccnt:0,featureCnt:8
carno:20, totallen:175, nancount:25, test_reccnt:0,featur

### TestCode: add_leader_cnt

In [None]:
# toy matrix [car=4, feature=2, lap=10]: feature 0 is the rank (cars 1 and
# 3 end with NaN laps), feature 1 is the pit flag
a = np.array([[[2, 5, 9, 6, 6, 3, 7, 7, 0, 2],
        [0, 0, 1, 1, 0, 1, 1, 0, 1, 1]],
        [[5, 8, 8, 6, 4, 4, 9, np.nan, np.nan, np.nan],
        [1, 1, 1, 0, 1, 1, 1, 0, 0, 1]],
       [[7, 1, 3, 2, 6, 9, 9, 4, 4, 9],
        [1, 0, 0, 0, 0, 1, 0, 1, 0, 1]],
       [[8, 4, 9, 1, 7, 0, 7, 2, np.nan, np.nan],
        [0, 1, 0, 0, 0, 1, 0, 1, 1, 1]]])

results like this:

    array([[[2., 5., 9., 6., 6., 3., 7., 7., 0., 2.],
        [0., 0., 1., 1., 0., 1., 1., 0., 1., 1.],
        [0., 1., 1., 0., 1., 1., 0., 2., 0., 0.]],

       [[5., 8., 8., 6., 4., 4., 9., 3., 8., 7.],
        [1., 1., 1., 0., 1., 1., 1., 0., 0., 1.],
        [0., 1., 0., 1., 0., 2., 1., 1., 2., 2.]],

       [[7., 1., 3., 2., 6., 9., 9., 4., 4., 9.],
        [1., 0., 0., 0., 0., 1., 0., 1., 0., 1.],
        [1., 0., 0., 0., 1., 3., 2., 1., 2., 3.]],

       [[8., 4., 9., 1., 7., 0., 7., 2., 1., 6.],
        [0., 1., 0., 0., 0., 1., 0., 1., 1., 1.],
        [2., 0., 2., 0., 1., 0., 1., 0., 1., 1.]]])

In [None]:
# dest_col=-1 appends the new feature row; the default dest_col
# (COL_LEADER_PITCNT = 9) would index past the 2 feature rows of ``a``
mat1 = add_leader_cnt(a, 0, 1, dest_col=-1)
mat1

In [None]:
# dest_col=-1 appends the new feature row; the default dest_col
# (COL_TOTAL_PITCNT = 10) would index past the feature rows of ``mat1``
mat2 = add_allpit_cnt(mat1, rank_col=0, pit_col = 1, dest_col=-1)
mat2

In [None]:
# shift feature row 2 left by the default shift_len (2); the default
# dest_col=-1 appends the result as a new row
add_shift_feature(mat2, rank_col=0, shift_col = 2)

### TestCode: remove nan

In [None]:
# 2-d fixture with NaNs in the middle, at the tail and in a full row
a = np.array([
 [ 0,  0,  0,  0,  0,  0,  0, np.nan, np.nan,  0],
 [ 0,  1,  0,  0,  0,  1,  0,  0,  0,  0],
 [ 0,  1,  0, np.nan, np.nan,  1,  0,  0,  0,  0],
 [ 0,  1,  0,  0,  0,  1,  0,  0,  0,  0],
 [ 0,  1,  0,  0,  0, np.nan, np.nan, np.nan, np.nan, np.nan],
 [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]])


In [None]:
# boolean mask of the NaN entries
idx = np.isnan(a)

In [None]:
# display the mask
idx

In [None]:
# replace every NaN by 0 in place, then display the array
a[idx]=0
a

In [None]:
# sum of every flag except FEATURE_PITAGE: 2+8+16+32+64+128+256 = 506,
# the value assigned to _feature_mode
a=np.array([FEATURE_STATUS,
FEATURE_LEADER_PITCNT,
FEATURE_TOTAL_PITCNT,
FEATURE_SHIFT_TRACKSTATUS,
FEATURE_SHIFT_LAPSTATUS,
FEATURE_SHIFT_LEADER_PITCNT,
FEATURE_SHIFT_TOTAL_PITCNT])
np.sum(a)

In [None]:
# display the flag value
FEATURE_STATUS