In [None]:
class RunType:
    """Notebook-wide switches controlling how much diagnostic output is produced."""
    debug = False # specify whether debug is enabled (gates the extra print/display calls)
    show_data = True # specify whether data information is displayed (table previews and plots)
class DataPrep:
    """Settings used when turning the raw tracking rows into training windows."""
    step_size = 5  # number of consecutive samples in each input window; the sample after the window is its label
    standardize = True  # passed to get_data(); when True each trajectory's coordinates have their mean subtracted
    train_test_split = 0.80  # fraction of items placed in the training split (the rest go to the test split)

## Imports

In [None]:
import numpy as np 
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import random
#print(sns.__version__)

## Data

In [None]:
# Print the full path of every file found under ../input so the available
# dataset files are visible in the notebook output.
for dirname, _, filenames in os.walk('../input'):
    
    for filename in filenames:
        # Rebind to the joined directory + file name before printing.
        filename = os.path.join(dirname, filename)
        print(filename)

In [None]:
# Load the four competition CSV files into module-level DataFrames.
# When RunType.show_data is set, each table's first two rows and its
# describe() summary are displayed right after it is read.
print("reading 1....")
tracking_table = pd.read_csv("../input/big-data-derby-2022/nyra_tracking_table.csv",low_memory=False)
if RunType.show_data:
    display(tracking_table[0:2])
    display(tracking_table.describe())

print("reading 2....")
# To load only a subset of columns, pass usecols=[...] to read_csv
# (read_csv has no `columns` keyword).
start_table = pd.read_csv("../input/big-data-derby-2022/nyra_start_table.csv",low_memory=False)
if RunType.show_data:
    display(start_table[0:2])
    display(start_table.describe())

print("reading 3....")
race_table = pd.read_csv("../input/big-data-derby-2022/nyra_race_table.csv",low_memory=False)
if RunType.show_data:
    display(race_table[0:2])
    display(race_table.describe())

print("reading 4....")
complete = pd.read_csv("../input/big-data-derby-2022/nyra_2019_complete.csv",low_memory=False)
if RunType.show_data:
    # Preview the table that was just loaded; this previously showed
    # tracking_table a second time.
    display(complete[0:2])
    display(complete.describe())

print("Complete!")

## Data Display

In [None]:
# For each track id, scatter-plot longitude vs. latitude of the rows whose
# race_date falls in the chosen month, coloured by race number.
# The whole cell is skipped unless RunType.show_data is set.
if RunType.show_data:
    unique_id = tracking_table['track_id'].unique()
    interested_month = 7  # compared with the second '-'-separated field of race_date
    for track_id in unique_id:
        print(f"==========={track_id}===========")
        # Narrow to this track, then to the rows of the chosen month.
        tracking0 = tracking_table[tracking_table["track_id"] == track_id]
        tracking1 = tracking0[tracking0["race_date"].str.split('-').str[1].astype(int) == interested_month]
        if(RunType.debug):
            print("Entry Count:",len(tracking1))

        # One figure per track; the style call affects every later figure as well.
        plt.style.use("dark_background")
        plt.figure(figsize = (16, 8))
        sns.scatterplot(data = tracking1, x = "longitude", y = "latitude", hue = "race_number", palette = "Paired")

        plt.title(f"track_id = {track_id}, Longitude and Latitude by Race Number")
        plt.legend()
        plt.show()

## Data Preparation
Plan: split the tracking data into separate trajectories by grouping on:
1. Track ID
2. Month
3. Race Number
4. Program Number

In [None]:
def get_data(standardize=False):
    """Split the module-level ``tracking_table`` into per-group coordinate trajectories.

    Rows are grouped by track id, by the second '-'-separated field of
    ``race_date`` (the month for YYYY-MM-DD dates), by race number and by
    program number. Each group is stored under the label
    ``"{track_id}_{month}_{race_number}_{program_number}"``.

    Args:
        standardize: When True, the group's mean longitude and mean latitude
            are subtracted from its coordinates.

    Returns:
        tuple: ``(proc_data, data)`` where ``proc_data`` maps each label to a
        list of ``[longitude, latitude]`` rows (NaN entries are dropped from a
        row, so a row may be shorter than two elements) and ``data`` maps each
        label to the group's DataFrame.
    """
    # NOTE(review): grouping by month only means races with the same number on
    # different dates of that month share one label, and rows keep file order
    # (no explicit sort) - confirm this is intended.
    cols = ['longitude', 'latitude']
    data = {}
    proc_data = {}

    for track_id in tracking_table['track_id'].unique():
        tracking0 = tracking_table[tracking_table["track_id"] == track_id]
        # Parse the month once per track instead of once per month iteration.
        months = tracking0['race_date'].str.split('-').str[1].astype(int)
        unique_months = months.unique()
        if RunType.debug:
            print(len(unique_months))
            display(unique_months)
        for interested_month in unique_months:
            tracking1 = tracking0[months == interested_month]
            # Derive the unique values from the same casted series that is used
            # for filtering, so the comparison can never be int-vs-str.
            race_numbers = tracking1["race_number"].astype(int)
            unique_race_num = race_numbers.unique()
            if RunType.debug:
                print(len(unique_race_num))
                display(unique_race_num)
            print(f"{track_id}_{interested_month} race count: {len(unique_race_num)}")
            for race_num in unique_race_num:
                tracking2 = tracking1[race_numbers == race_num]
                # Same reasoning as above: compare str with str, whatever the
                # dtype pandas inferred for the column.
                program_numbers = tracking2["program_number"].astype(str)
                for prog_num in program_numbers.unique():
                    label = f"{track_id}_{interested_month}_{race_num}_{prog_num}"
                    if RunType.debug:
                        print(label)
                    temp_df = tracking2[program_numbers == prog_num]

                    if standardize:
                        long_mean = temp_df["longitude"].mean()
                        lat_mean = temp_df["latitude"].mean()
                        if RunType.debug:
                            print(long_mean,lat_mean)
                        # assign() builds a new frame, so nothing is written
                        # into a filtered slice of tracking_table.
                        temp_df = temp_df.assign(
                            longitude=temp_df["longitude"].astype(float) - long_mean,
                            latitude=temp_df["latitude"].astype(float) - lat_mean,
                        )
                    # `e == e` is False only for NaN, so NaN coordinates are dropped.
                    proc_data[label] = [[e for e in row if e == e] for row in temp_df[cols].values.tolist()]
                    data[label] = temp_df
    return proc_data, data

    



In [None]:
# train_data format: one entry per "{track_id}_{month}_{race_number}_{program_number}"
# label, each holding that group's list of [longitude, latitude] rows:
# track-race-id_0 : [(l1,l1),(l2,l2),(l3,l3),(l4,l4)]
# track-race-id_1 : [(l1,l1),(l2,l2),(l3,l3),(l4,l4)]
# disp_data maps the same labels to the corresponding DataFrames.
train_data, disp_data = get_data(DataPrep.standardize)



In [None]:
# count =0
# for i,d in train_data.items():
#     print(d)
#     count +=1
#     if count == 5:
#         break;

In [None]:
def prep_training_data(data,step_size):
    """Build sliding-window (inputs, target) pairs from a sequence.

    For every start index ``s`` in ``range(len(data) - step_size)`` the pair
    consists of the ``step_size`` consecutive items beginning at ``s`` and the
    item that immediately follows them.

    Args:
        data: Indexable sequence of samples.
        step_size: Number of samples in each input window.

    Returns:
        list: ``(window, target)`` tuples, where ``window`` is a list of
        ``step_size`` items; empty when ``len(data) <= step_size``.
    """
    # example: [l1, l2, l3, l4] with step_size 2 -> ([l1, l2], l3), ([l2, l3], l4)
    return [
        (
            [data[pos] for pos in range(start, start + step_size)],
            data[start + step_size],
        )
        for start in range(len(data) - step_size)
    ]

def prep_training_data_2(data,step_size):
    """Build sliding windows and their next-step labels as two DataFrames.

    Args:
        data: Indexable sequence of samples.
        step_size: Number of samples in each input window.

    Returns:
        tuple: ``(vals, labels)``. ``vals`` has one row per window (the
        ``step_size`` consecutive samples), ``labels`` has one row per window
        holding the sample that follows it.
    """
    # example: [l1, l2, l3, l4] with step_size 2
    #   windows -> [l1, l2], [l2, l3]     targets -> l3, l4
    window_count = len(data) - step_size
    windows = [
        [data[pos] for pos in range(start, start + step_size)]
        for start in range(window_count)
    ]
    targets = [data[start + step_size] for start in range(window_count)]
    return pd.DataFrame.from_dict(windows),pd.DataFrame.from_dict(targets)
        
    

In [None]:
# want:
# track-race-id_0 : [
#([(l1,l1),(l2,l2)],(l3,l3)),
#([(l2,l2),(l3,l3)],(l4,l4))
#]
# track-race-id_1 : [
#([(l1,l1),(l2,l2)],(l3,l3)),
#([(l2,l2),(l3,l3)],(l4,l4))
#]

# for id, d in train_data.items():
#     if RunType.debug:
#         tmp = prep_training_data(d,DataPrep.step_size)
#         print(tmp[0])
#     train_data[id] = prep_training_data(d,DataPrep.step_size)
# all labels: have individual batches of data
# One entry per trajectory label: all_data[k] is that trajectory's DataFrame of
# input windows and all_labels[k] is the matching DataFrame of next-step labels.
all_data = []
all_labels = []

for i,vals in train_data.items():
    # `vals` starts as the trajectory's raw sample list and is rebound here to
    # the DataFrame of windows returned by prep_training_data_2.
    vals,labels = prep_training_data_2(vals,DataPrep.step_size)
    all_data.append(vals)
    all_labels.append(labels)

In [None]:
def split_data(data,percent):
    """Randomly split a dict into a training dict and a test dict.

    ``int(percent * len(data))`` entries are chosen at random (via the
    ``random`` module, so seed it for reproducible splits) for the training
    set; the remaining entries form the test set. Insertion order is kept
    within each split.

    Args:
        data: Mapping of id -> item.
        percent: Fraction (0..1) of entries to place in the training set.

    Returns:
        tuple: ``(train_set, test_set)`` dictionaries.
    """
    val_count = int(percent * len(data))
    # A set gives O(1) membership checks inside the loop below.
    train_positions = set(random.sample(range(0, len(data)), val_count))
    train_set = {}
    test_set = {}

    for position, (id, d) in enumerate(data.items()):
        if position in train_positions:
            train_set[id] = d
        else:
            test_set[id] = d
    print("train_len: ",len(train_set))
    print("test_len: ",len(test_set))
    return train_set,test_set
        
def split_data_2(data,labels,percent):
    """Randomly split parallel ``data`` / ``labels`` sequences into train and test parts.

    ``int(percent * len(data))`` positions are chosen at random (via the
    ``random`` module, so seed it for reproducible splits) for the training
    split; all other positions go to the test split. ``data[k]`` and
    ``labels[k]`` always end up in the same split, and original order is kept
    within each split.

    Args:
        data: Indexable sequence of inputs.
        labels: Indexable sequence of labels, indexed in parallel with ``data``.
        percent: Fraction (0..1) of positions to place in the training split.

    Returns:
        tuple: ``(train_data, train_labels, test_data, test_labels)`` lists.
    """
    val_count = int(percent * len(data))
    # A set gives O(1) membership checks inside the loop below.
    train_positions = set(random.sample(range(0, len(data)), val_count))

    train_data = []
    train_labels = []
    test_data = []
    test_labels = []
    print("datalen: ",len(data))
    # Report the label count itself; this previously printed len(data) twice.
    print("labellen: ",len(labels))
    for loc in range(len(data)):
        if loc in train_positions:
            train_data.append(data[loc])
            train_labels.append(labels[loc])
        else:
            test_data.append(data[loc])
            test_labels.append(labels[loc])
    print("train_len: ",len(train_data))
    print("test_len: ",len(test_data))
    return train_data,train_labels,test_data,test_labels

In [None]:

# now have data prepared in the desired steps with a step size 
# all_data / all_labels hold one DataFrame per trajectory, so this split assigns
# whole trajectories (not individual windows) to the train or test side.
#train_set, test_set = split_data(train_data,DataPrep.train_test_split)
x_train,y_train,x_test,y_test = split_data_2(all_data,all_labels,DataPrep.train_test_split)

# count =0
# for d in train_data:
#     print(d)
#     count +=1
#     if count == 5:
#         break;



In [None]:
import torch #pytorch
import torch.nn as nn
from torch.autograd import Variable 

# Convert the train/test splits to tensors. The torch.autograd.Variable wrapper
# is deprecated and returns a plain tensor, so it is dropped here.
# NOTE(review): x_train / y_train / x_test / y_test are lists of per-trajectory
# DataFrames (see split_data_2); torch.Tensor likely cannot consume them as-is -
# confirm and stack them into numeric arrays first if needed.
X_train_tensors = torch.Tensor(x_train)
X_test_tensors = torch.Tensor(x_test)

y_train_tensors = torch.Tensor(y_train)
y_test_tensors = torch.Tensor(y_test)