In [1]:
import pandas as pd
import numpy as np
import os 

### Read data from data files

In [2]:
def read_data(path=None, file=None):
    
    if path and not file:
        files_list = os.listdir(path)
        li = []    
        
        for file in files_list:
            filepath = path + '/' + file
            df = pd.read_csv(filepath, index_col=0)
            li.append(df)
        
        dataframe = pd.concat(li, axis=0, ignore_index=True)
    
    elif file and not path:
        dataframe = pd.read_csv(file, index_col=0)
        
    elif file and path:
        filepath = path + '/' + file
        dataframe = pd.read_csv(filepath, index_col=0)
        
    else:
        dataframe = None
        
    return dataframe

In [3]:
df_2013 = read_data(file='./data/Data2013.csv')
df_2014 = read_data(file='./data/Data2014.csv')
df_2015 = read_data(file='./data/Data2015.csv')
df_2016 = read_data(file='./data/Data2016.csv')
df_2017 = read_data(file='./data/Data2017.csv')
df_2018 = read_data(file='./data/Data2018.csv')
df_2019 = read_data(file='./data/Data2019.csv')

In [4]:
df_2013.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,1.0,1.006015,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
2,1.021548,2.0,0.0,2.0,1.017281,2.0,1.029052,2.0,0.0,2.0,...,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,1.018563,2.0
3,1.044093,3.0,1.010881,3.0,1.040279,3.0,1.05209,3.0,1.005662,3.0,...,0.0,3.0,0.0,3.0,1.011268,3.0,0.0,3.0,1.041447,3.0
4,1.066638,4.0,1.033982,4.0,1.063278,4.0,1.075127,4.0,1.028592,4.0,...,1.018794,4.0,1.020992,4.0,1.03418,4.0,1.015575,4.0,1.06433,4.0


In [5]:
df_2018.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.023845,1.0,1.021091,1.0,1.018493,1.0,1.013808,1.0,1.015944,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
2,1.04769,2.0,1.044842,2.0,1.042346,2.0,1.03741,2.0,1.039227,2.0,...,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0,0.0,2.0
3,1.071536,3.0,1.068593,3.0,1.066199,3.0,1.061013,3.0,1.06251,3.0,...,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0,0.0,3.0
4,1.095381,4.0,1.092345,4.0,1.090051,4.0,1.084616,4.0,1.085792,4.0,...,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0


In [6]:
df_2018.iloc[[-1],:].max(axis=1)

10782    1976.0135
dtype: float64

In [7]:
df_2018.iloc[[-1],:]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
10782,200.90918,1402.0027,200.81365,1317.9259,200.984613,1484.898,200.776592,1519.0526,0.0,0.0,...,0.0,0.0,200.71606,1387.2449,200.147612,143.6974,200.858939,1348.7667,199.588564,360.975


In [8]:
dist_columns = [ str(idx) for idx in df_2018.columns if int(idx) % 2 == 0 ]
index_max_dist = df_2018[dist_columns].idxmax(1)

In [9]:
index_max_dist

0        0
1        0
2        0
3        0
4        0
        ..
10778    4
10779    4
10780    4
10781    4
10782    4
Length: 10783, dtype: object

In [10]:
index_max_dist.value_counts()

0     3553
4     3465
50     980
18     729
58     516
24     489
40     261
8      177
10     176
6      130
44      87
26      66
34      58
2       55
62      41
dtype: int64

In [11]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

### Data processing

In [155]:
# Function for all 33 cars
from scipy.stats import rankdata

def create_data(dataframe, num_cars=10, rank_step=10, frame_step=1, skip_step=5, frame_size=25, normalization=True):
    
    # Start of the frame 
    start_row = 0 
    
    # End of the frame 
    end_row = start_row + skip_step * frame_size
    
    # Max row that can be accessed
    limit = end_row + max(rank_step, frame_size)
    
    # Label row (rank row)
    label_row = end_row + rank_step
    
    # Length of the dataframe 
    length_df = len(dataframe)    
    
    # List of distance columns
    dist_columns = [ str(idx) for idx in dataframe.columns if int(idx) % 2 == 0 ]
    # List of distance columns
    pit_columns = [ str(idx) for idx in dataframe.columns if int(idx) % 2 != 0 ]
    
    # Get the index of max distance column
    index_max_dist = dataframe[dist_columns].idxmax(1)
    
    # Lists to store data and labels
    data = []
    labels = []
    
    if normalization:
        # Scale values for pitstop column
        dataframe[pit_columns] = scaler.fit_transform(dataframe[pit_columns])
    
    while limit < length_df:
        
        # Rows to append as a frame
        indexes = [ i for i in range(start_row, end_row, skip_step) ]
        
        # Transform dataframe to scale values
        tmp_data = dataframe.iloc[indexes,:]
        
        if normalization:
        # Get max and min value for last row of dataframe
            max_value = dataframe[dist_columns].iloc[[indexes[-1]],:].max(axis=1)
            min_value = dataframe[dist_columns][dataframe[dist_columns] > 0].iloc[[indexes[0]],:].min(axis=1)

        
            # Normalize by the max value
            tmp_data[dist_columns] = (tmp_data[dist_columns] - float(min_value))/(float(max_value) - float(min_value))
            tmp_data[dist_columns]  = tmp_data[dist_columns].replace([-np.Inf, np.Inf, np.NaN], -1)
        
                
        # Append the rows to data list 
        data.append(tmp_data.values)
        
        # Label row with probability distribution of        
        label_row_df = rankdata(dataframe.iloc[[label_row],:][dist_columns].values[0])
        label_row_df = 34 - label_row_df 
        label_row_df[label_row_df>1] = 0
        
        labels.append(label_row_df)

        start_row = start_row + frame_step
        end_row = start_row + skip_step * frame_size
        label_row = end_row + rank_step
        limit = end_row + max(rank_step, frame_size)
        
    return np.array(data), np.array(labels)     

In [156]:
# Create data for each year

x_2013, y_2013 = create_data(df_2013, rank_step=5, frame_step=3, skip_step=1, frame_size=10)
x_2014, y_2014 = create_data(df_2014, rank_step=5, frame_step=3, skip_step=1, frame_size=10)
x_2015, y_2015 = create_data(df_2015, rank_step=5, frame_step=3, skip_step=1, frame_size=10)
x_2016, y_2016 = create_data(df_2016, rank_step=5, frame_step=3, skip_step=1, frame_size=10)
x_2017, y_2017 = create_data(df_2017, rank_step=5, frame_step=3, skip_step=1, frame_size=10)
x_2018, y_2018 = create_data(df_2018, rank_step=5, frame_step=3, skip_step=1, frame_size=10)
x_2019, y_2019 = create_data(df_2019, rank_step=5, frame_step=3, skip_step=1, frame_size=10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in

In [157]:
x_2013.shape

(3195, 10, 66)

In [158]:
y_2013.shape

(3195, 33)

### Define training data and labels

In [159]:
X_train = np.vstack((x_2013,x_2014,x_2015,x_2016,x_2017))

In [160]:
X_train.shape

(17780, 10, 66)

In [161]:
X_train[1000][9]

array([ 9.99436234e-01,  9.15303851e-01,  9.91224675e-01,  9.08891705e-01,
        9.98863739e-01,  9.18810413e-01,  9.96956510e-01,  8.76972177e-01,
        9.95457621e-01,  8.31938877e-01,  9.87687984e-01,  6.06153029e-01,
        9.89568712e-01,  6.07305793e-01,  9.85158940e-01,  5.01263344e-01,
        9.92496340e-01,  9.01146113e-01,  9.95635088e-01,  5.72972077e-01,
        1.00000000e+00,  9.17925944e-01,  9.18107069e-01,  7.14825811e-01,
        9.92336698e-01,  9.11029681e-01,  9.99657828e-01,  9.17426873e-01,
        9.97853220e-01,  9.14120277e-01,  9.96179785e-01,  9.11089666e-01,
        9.99112105e-01,  9.12414275e-01, -2.50498941e+00,  0.00000000e+00,
        9.80896624e-01,  5.61337005e-01,  9.98148315e-01,  9.16780121e-01,
        9.92735163e-01,  9.06018717e-01, -2.50498941e+00,  0.00000000e+00,
        9.94361766e-01,  9.01584918e-01, -2.50498941e+00,  0.00000000e+00,
        9.86148328e-01,  8.23116454e-01,  9.90605756e-01,  8.57104568e-01,
        9.46353681e-01,  

In [162]:
Y_train = np.vstack((y_2013,y_2014,y_2015,y_2016, y_2017))

In [163]:
Y_train[1000]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [164]:
Y_train.shape

(17780, 33)

In [165]:
Y_train = Y_train.astype(np.float)

### Define validation data and labels

In [166]:
X_val = x_2018
Y_val = y_2018.astype(np.float)

In [167]:
Y_val

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]])

In [168]:
frame_size = 10
n_features = 66
n_out = 33

In [169]:
import tensorflow as tf

BUFFER_SIZE=200
BATCH_SIZE=16

train_data_single = tf.data.Dataset.from_tensor_slices((X_train, Y_train))
train_data_single = train_data_single.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE).repeat()

val_data_single = tf.data.Dataset.from_tensor_slices((X_val, Y_val))
val_data_single = val_data_single.batch(BATCH_SIZE).repeat()

### Model creation

In [192]:
from random import randint
from numpy import array
from numpy import argmax
from numpy import array_equal
from tensorflow.python.keras import Sequential
from tensorflow.python.keras.layers import LSTM
from tensorflow.python.keras.layers import Dense, Flatten, Dropout
from tensorflow.python.keras.layers import TimeDistributed
from tensorflow.python.keras.layers import RepeatVector
#from attention_decoder import AttentionDecoder

'''
# define model

model = Sequential()
model.add(LSTM(150, input_shape=(frame_size, n_features), return_sequences=True))
model.add(AttentionDecoder(150, n_out))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
'''

model = tf.keras.models.Sequential()
model.add(LSTM(128, input_shape=X_train.shape[-2:], return_sequences=True))
model.add(LSTM(64, return_sequences=True))
model.add(Flatten())
model.add(Dense(33, activation='softmax'))

opt = tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, amsgrad=False)
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])


In [193]:
model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_29 (LSTM)               (None, 10, 128)           99840     
_________________________________________________________________
lstm_30 (LSTM)               (None, 10, 64)            49408     
_________________________________________________________________
flatten_10 (Flatten)         (None, 640)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 33)                21153     
Total params: 170,401
Trainable params: 170,401
Non-trainable params: 0
_________________________________________________________________


In [194]:
for x, y in val_data_single.take(1):
    print(model.predict(x).shape)

(16, 33)


### Train the model

In [195]:
single_step_history = model.fit(train_data_single, epochs=10,
                                            steps_per_epoch=X_train.shape[0]//BATCH_SIZE,
                                            validation_data=val_data_single,
                                            validation_steps=X_val.shape[0])

Train for 1111 steps, validate for 3588 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
