# GENERATE PERSONAL DATASETS
This notebook explains how I create my personal dataset, which is used in a complementary notebook: https://www.kaggle.com/code/josmejagamarra/tps-oct-22-personal-subm

In [None]:
#IMPORT THE LIBRARIES
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None) 
import gc
import tensorflow as tf
import time

# 1. CONFIGURE THE TPU

In [None]:
# Try to attach to a TPU; on any ordinary failure fall back to the default
# distribution strategy with a smaller batch size.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    # Scale the batch size with the number of replicas in the strategy.
    BATCH_SIZE = 4096 * strategy.num_replicas_in_sync
    print("TPU")
except Exception:
    # `except Exception` (rather than a bare `except:`) keeps the fallback
    # but lets KeyboardInterrupt / SystemExit propagate, so the cell can
    # still be interrupted while it waits for the TPU.
    tpu = None
    strategy = tf.distribute.get_strategy()
    BATCH_SIZE=512
    print("CPU")

# 2. LOAD THE TRAINING DATA

In [None]:
start_time = time.time()

#Load the dtypes
dtypes_df = pd.read_csv('/kaggle/input/tabular-playground-series-oct-2022/train_dtypes.csv')
dtypes_df['dtype']=dtypes_df['dtype'].replace(['float64','float32'],'float16') #Use float16
dtypes = {k: v for (k, v) in zip(dtypes_df.column, dtypes_df.dtype)}

#Filter the useless columns
# Only the header is needed here, so a single row is read.
all_columns =list(pd.read_csv("/kaggle/input/tabular-playground-series-oct-2022/train_0.csv",nrows=1))
useless_columns = ['event_id','event_time','player_scoring_next','team_scoring_next']
usecols = [i for i in all_columns if i not in useless_columns]

#Read the train data and concatenate it
# The ten chunk files train_0.csv ... train_9.csv are read in order with the
# same column filter and dtypes. Building the list inline means the
# individual chunk frames are released as soon as the concatenation is done.
data = pd.concat(
    [
        pd.read_csv(
            f'/kaggle/input/tabular-playground-series-oct-2022/train_{part}.csv',
            usecols=usecols,
            dtype=dtypes,
        )
        for part in range(10)
    ],
    axis=0,
).fillna(0)
del(dtypes_df,dtypes,all_columns,useless_columns,usecols)
_=gc.collect()

print("--- %s seconds ---" % (time.time() - start_time))

# 3. FEATURE ENGINEERING

In [None]:
#Distance between player and the ball
def dist_ball_player (data):
    """Add a `p{i}_ball` column for each player 0-5.

    Each new column holds the Euclidean distance between the player's
    (`p{i}_pos_x`, `p{i}_pos_y`, `p{i}_pos_z`) and the ball's
    (`ball_pos_x`, `ball_pos_y`, `ball_pos_z`).

    The frame is modified in place and also returned.
    """
    ball_x = data['ball_pos_x']
    ball_y = data['ball_pos_y']
    ball_z = data['ball_pos_z']
    for player in range(6):
        dx = data[f'p{player}_pos_x'] - ball_x
        dy = data[f'p{player}_pos_y'] - ball_y
        dz = data[f'p{player}_pos_z'] - ball_z
        data[f'p{player}_ball'] = np.sqrt(dx**2 + dy**2 + dz**2)
    return data

#VECTOR BETWEEN BALL AND GATE
def vect_ball_gate (data):
    """Add `ball_gate{A,B}_{x,y,z}` columns: ball position minus a gate point.

    Gate A is taken at (0, -100, 0) and gate B at (0, 100, 0), so only the
    y component actually shifts. The frame is modified in place and returned.
    """
    gate_y = {"A": -100, "B": 100}
    for gate, y_center in gate_y.items():
        prefix = f'ball_gate{gate}'
        data[prefix + '_x'] = data['ball_pos_x'] - 0
        data[prefix + '_y'] = data['ball_pos_y'] - y_center
        data[prefix + '_z'] = data['ball_pos_z'] - 0
    return data

#DISTANCE BETWEEN BOOST POSITION AND PLAYER
# [x, y, z] coordinates of the six boost locations, keyed `boost{j}_pos`.
boost_pos_dic={'boost0_pos':[-61.4, -81.9, 0],
              'boost1_pos':[61.4, -81.9, 0],
              'boost2_pos':[-71.7, 0, 0],
              'boost3_pos':[71.7, 0, 0],
              'boost4_pos':[-61.4, 81.9, 0],
              'boost5_pos':[61.4, 81.9, 0]}

def dist_boost_player (data):
    """Add a `p{i}_boost{j}` column for every player i (0-5) and boost j (0-5).

    Each column holds the Euclidean distance between player i's position
    (`p{i}_pos_x`, `p{i}_pos_y`, `p{i}_pos_z`) and the fixed coordinates of
    boost j taken from `boost_pos_dic`.

    The frame is modified in place and also returned.
    """
    for i in range(6):
        for j in range(6):
            # Look the coordinates up by the *boost* index j. Using the
            # player index here would make all six columns of a player
            # identical and relative to the wrong boost.
            boost_x, boost_y, boost_z = boost_pos_dic[f'boost{j}_pos']
            data[f'p{i}_boost{j}'] = np.sqrt(
                (data[f'p{i}_pos_x']-boost_x)**2+
                (data[f'p{i}_pos_y']-boost_y)**2+
                (data[f'p{i}_pos_z']-boost_z)**2
            )
    return data

#BALL VELOCITY VALUE
def ball_vel_val (data):
    """Add a `ball_vel` column: the magnitude of the ball velocity vector.

    Computed from `ball_vel_x`, `ball_vel_y` and `ball_vel_z`.
    The frame is modified in place and also returned.
    """
    vx = data['ball_vel_x']
    vy = data['ball_vel_y']
    vz = data['ball_vel_z']
    data['ball_vel'] = np.sqrt(vx**2 + vy**2 + vz**2)
    return data

#PLAYER VELOCITY VALUE
def player_vel_val (data):
    """Add a `p{i}_vel` column for each player 0-5.

    Each column is the magnitude of that player's velocity vector, computed
    from `p{i}_vel_x`, `p{i}_vel_y` and `p{i}_vel_z`.
    The frame is modified in place and also returned.
    """
    for player in range(6):
        vx = data[f'p{player}_vel_x']
        vy = data[f'p{player}_vel_y']
        vz = data[f'p{player}_vel_z']
        data[f'p{player}_vel'] = np.sqrt(vx**2 + vy**2 + vz**2)
    return data

#APPLY THE FEATURE ENGINEERING
start_time = time.time()
##
# Each step adds its columns to the frame and returns it, so the steps can
# be chained; the order is the order in which the new columns are appended.
data = (
    data
    .pipe(dist_ball_player)
    .pipe(vect_ball_gate)
    .pipe(dist_boost_player)
    .pipe(ball_vel_val)
    .pipe(player_vel_val)
)
##
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# MOVE THE TARGET COLUMNS TO THE END
# pop() removes the column and hands it back; assigning it again under the
# same name appends it as the last column. Team A is moved first, then
# team B, so the frame ends with [..., team_A_..., team_B_...].
data['team_A_scoring_within_10sec'] = data.pop('team_A_scoring_within_10sec')
data['team_B_scoring_within_10sec'] = data.pop('team_B_scoring_within_10sec')

_=gc.collect()

# 4. GENERATE USEFUL COLUMN LISTS

In [None]:
# One list of column names per player: every column whose name starts with
# that player's prefix (raw columns and the engineered ones alike).
P0, P1, P2, P3, P4, P5 = (
    [col for col in data.columns if col.startswith(prefix)]
    for prefix in ("p0", "p1", "p2", "p3", "p4", "p5")
)

# Everything that is neither a per-player column nor one of the two targets.
Basic = list(filter(
    lambda col: not col.startswith((
        "p0", "p1", "p2", "p3", "p4", "p5",
        "team_A_scoring_within_10sec",
        "team_B_scoring_within_10sec",
    )),
    data.columns,
))

In [None]:
#DEFINE UTILS
# Distinct game ids in order of first appearance, split into 10 groups so
# the training data can be written out one group at a time.
game_nums = data['game_num'].unique()
game_numbers = np.array_split(game_nums,10) #split games into 10 bins

# Column labels without 'game_num'. Dropping the label from the column index
# gives the same result as data.drop(columns=[...]).columns, but without
# building a second full-size DataFrame just to read its column names.
columns_t = data.columns.drop('game_num')
ds_size = data.shape[0]
# Number of output files per group; with 1 every record of a group goes to
# a single file.
NUM_SHARDS = 1

from tensorflow.data import Dataset, TFRecordDataset
import os

# 5. SAVE THE INPUT & TARGET DATA

In [None]:
# SAVE THE DATA FROM TEAM A - INPUT 1
# For each of the 10 game groups, write the Basic + player 0-2 columns
# (without 'game_num') as serialized float16 tensors to a TFRecord file.
with strategy.scope():
    start_time = time.time()
    ##
    for i in range(10):
        print(f"saving tfrecords {i}")
        current_games=game_numbers[i]
        df=data.query("game_num in @current_games")
        df=df[Basic+P0+P1+P2]
        df=df.drop(columns=['game_num'])
        PATH_PREFIX = f'/kaggle/working/Team_A_input/train_{i}/feats.tfrecord'
        # Create the output directory up front so the writer never depends
        # on it already existing (nothing else in the notebook creates it).
        os.makedirs(os.path.dirname(PATH_PREFIX), exist_ok=True)

        ds_feats = Dataset.from_tensor_slices(df.astype("float16"))
        ds_feats= ds_feats.map(tf.io.serialize_tensor)

        def reduce_func(key, dataset):
            # One output file per key: PATH_PREFIX followed by the key.
            filename = tf.strings.join([PATH_PREFIX, tf.strings.as_string(key)])#place into different shards different parts of dataset
            writer = tf.data.experimental.TFRecordWriter(filename) 
            # Drop the enumerate() index and write only the serialized row.
            writer.write(dataset.map(lambda _, x: x))
            return tf.data.Dataset.from_tensors(filename)

        # Key every row by (row index % NUM_SHARDS) and hand each key's rows
        # to reduce_func as one window (window size is tf.int64.max).
        ds_feats = ds_feats.enumerate()
        dataset = ds_feats.apply(tf.data.experimental.group_by_window( 
            lambda i, _: i % NUM_SHARDS, reduce_func, tf.int64.max
        ))

        # Iterate through the dataset to trigger data writing.
        for _ in dataset:
            pass
    ##
    print("--- %s seconds ---" % (time.time() - start_time))

del(df, ds_feats,dataset)
_=gc.collect()

In [None]:
# SAVE DATA FROM TEAM B - INPUT 2
# For each of the 10 game groups, write the Basic + player 3-5 columns
# (without 'game_num') as serialized float16 tensors to a TFRecord file.
with strategy.scope():
    start_time = time.time()
    ##
    for i in range(10):
        print(f"saving tfrecords {i}")
        current_games=game_numbers[i]
        df=data.query("game_num in @current_games")
        df=df[Basic+P3+P4+P5]
        df=df.drop(columns=['game_num'])
        PATH_PREFIX = f'/kaggle/working/Team_B_input/train_{i}/feats.tfrecord'
        # Create the output directory up front so the writer never depends
        # on it already existing (nothing else in the notebook creates it).
        os.makedirs(os.path.dirname(PATH_PREFIX), exist_ok=True)

        ds_feats = Dataset.from_tensor_slices(df.astype("float16"))
        ds_feats= ds_feats.map(tf.io.serialize_tensor)

        def reduce_func(key, dataset):
            # One output file per key: PATH_PREFIX followed by the key.
            filename = tf.strings.join([PATH_PREFIX, tf.strings.as_string(key)])#place into different shards different parts of dataset
            writer = tf.data.experimental.TFRecordWriter(filename)
            # Drop the enumerate() index and write only the serialized row.
            writer.write(dataset.map(lambda _, x: x))
            return tf.data.Dataset.from_tensors(filename)

        # Key every row by (row index % NUM_SHARDS) and hand each key's rows
        # to reduce_func as one window (window size is tf.int64.max).
        ds_feats = ds_feats.enumerate()
        dataset = ds_feats.apply(tf.data.experimental.group_by_window( 
            lambda i, _: i % NUM_SHARDS, reduce_func, tf.int64.max
        ))

        # Iterate through the dataset to trigger data writing.
        for _ in dataset:
            pass
    ##
    print("--- %s seconds ---" % (time.time() - start_time))

del(df, ds_feats,dataset)
_=gc.collect()

In [None]:
#SAVE DATA FROM TARGET
# Write the two target columns, one serialized float16 tensor per row, to a
# single TFRecord file.
PATH_PREFIX = '/kaggle/working/Target/target.tfrecord'
# Create the output directory up front so the writer never depends on it
# already existing (nothing else in the notebook creates it).
os.makedirs(os.path.dirname(PATH_PREFIX), exist_ok=True)
df = data[["team_A_scoring_within_10sec","team_B_scoring_within_10sec"]]
df = Dataset.from_tensor_slices(df.astype("float16"))
df = df.map(tf.io.serialize_tensor)
writer = tf.data.experimental.TFRecordWriter(PATH_PREFIX)
writer.write(df)

# The training frame is not needed past this point.
del(df, data)
gc.collect()

# 6. SAVE THE TEST DATA

In [None]:
# LOAD THE DATASET FROM TEST
start_time = time.time()
##
# Column -> dtype mapping shipped with the dataset, with every 32/64-bit
# float column downgraded to float16.
dtypes_df = pd.read_csv('/kaggle/input/tabular-playground-series-oct-2022/test_dtypes.csv')
dtypes_df['dtype'] = dtypes_df['dtype'].replace({'float64': 'float16', 'float32': 'float16'})
dtypes = dict(zip(dtypes_df['column'], dtypes_df['dtype']))
# Header of the test file (only one row is read to get the column names).
all_columns = pd.read_csv("/kaggle/input/tabular-playground-series-oct-2022/test.csv", nrows=1).columns.tolist()
test = pd.read_csv('/kaggle/input/tabular-playground-series-oct-2022/test.csv', dtype=dtypes)
# Feature frame: the test data without the 'id' column, missing values set to 0.
data_t = test.drop(columns=['id']).fillna(0)
##
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# APPLYING ALL FEATURE FUNCTIONS TO THE TEST DATA
# Same steps, in the same order, as for the training frame.
data_t = (
    data_t
    .pipe(dist_ball_player)
    .pipe(vect_ball_gate)
    .pipe(dist_boost_player)
    .pipe(ball_vel_val)
    .pipe(player_vel_val)
)

In [None]:
# SAVE DATA FROM TEST- INPUT 1
# Write the Basic + player 0-2 columns of the test frame, one serialized
# float16 tensor per row, to a single TFRecord file.
PATH_PREFIX = '/kaggle/working/Test/test_in_1.tfrecord'
# Create the output directory up front so the writer never depends on it
# already existing (nothing else in the notebook creates it).
os.makedirs(os.path.dirname(PATH_PREFIX), exist_ok=True)
# 'game_num' is excluded by name rather than by position, so the selection
# stays correct even if it is not the first entry of Basic.
df = data_t[[col for col in Basic if col != 'game_num']+P0+P1+P2]
df = tf.data.Dataset.from_tensor_slices(df.astype("float16"))
df = df.map(tf.io.serialize_tensor)
writer = tf.data.experimental.TFRecordWriter(PATH_PREFIX)
writer.write(df)

In [None]:
# SAVE DATA FROM TEST- INPUT 2
# Write the Basic + player 3-5 columns of the test frame, one serialized
# float16 tensor per row, to a single TFRecord file.
PATH_PREFIX = '/kaggle/working/Test/test_in_2.tfrecord'
# Create the output directory up front so the writer never depends on it
# already existing (nothing else in the notebook creates it).
os.makedirs(os.path.dirname(PATH_PREFIX), exist_ok=True)
# 'game_num' is excluded by name rather than by position, so the selection
# stays correct even if it is not the first entry of Basic.
df = data_t[[col for col in Basic if col != 'game_num']+P3+P4+P5]
df = tf.data.Dataset.from_tensor_slices(df.astype("float16"))
df = df.map(tf.io.serialize_tensor)
writer = tf.data.experimental.TFRecordWriter(PATH_PREFIX)
writer.write(df)