# ❗️❗️❗️DISCLAIMER: My notebook uses the model architecture of [this notebook](https://www.kaggle.com/code/alexryzhkov/tps-2022-10-fastai-with-multistart-and-tta) by [@alexryzhkov](https://www.kaggle.com/alexryzhkov) as its starting point. My notebook aims to showcase the *SPEED* of training on TPU and to show that a reconstruction head can be trained together with a standard logloss head without a noticeable change in speed. If you enjoyed it, please upvote [@alexryzhkov](https://www.kaggle.com/alexryzhkov)'s notebook first, and upvote this one too if you like.❗️❗️❗️






This notebook runs slower than the previous version because the number of epochs and the early-stopping patience have been increased.
The model could be improved by tuning the learning rate and other NN hyperparameters. Another possibility is to increase the number of inference passes on the test set by enabling dropout at inference time as well.

An interesting finding is that, even though I shuffle the players inside the network, the reconstruction loss keeps decreasing.

This month's TPS showed me how incredibly fast a TPU is, and also how complex it is to set up a tabular pipeline that runs on TPU: bugs are often difficult to spot and hard to fix.

Hope this notebook helps someone learn something new. 

I hope to learn from other Kagglers at the end of this TPS by studying the top solutions.

# Required imports

In [None]:
import pandas as pd
import gc
from pathlib import Path
import tensorflow as tf
from tensorflow.data import Dataset, TFRecordDataset
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense,Dropout,Input,GaussianNoise,BatchNormalization,Lambda
from tensorflow.keras.losses import  BinaryCrossentropy,MeanSquaredError
from tensorflow.keras.optimizers import Adam
import tensorflow_addons as tfa

# TPU preparation

In [None]:
# Look up the Google Cloud Storage path of the Kaggle dataset "tps-oct-2022-tfrecords".
# Author's note: this lookup is meant to be executed on a CPU session; the resulting
# path can then be written down and assigned to GCS_DS_PATH directly.
from kaggle_datasets import KaggleDatasets
GCS_DS_PATH = KaggleDatasets().get_gcs_path("tps-oct-2022-tfrecords")
# Bare expression: shows the resolved path as the cell output.
GCS_DS_PATH

In [None]:
# Pick the distribution strategy and batch size: TPU when one can be
# initialised, otherwise the default (CPU/GPU) strategy with a smaller batch.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    # Global batch size: 4096 samples per replica.
    BATCH_SIZE = 4096 * strategy.num_replicas_in_sync
    print("TPU")
except Exception as err:
    # `Exception` (not a bare `except:`) so that KeyboardInterrupt/SystemExit
    # still propagate; the reason for the fallback is reported, not hidden.
    print(f"TPU initialisation failed ({err!r}); falling back to the default strategy")
    tpu = None
    strategy = tf.distribute.get_strategy()
    BATCH_SIZE=512
    print("CPU")

# Dataset setup

In [None]:
# Let tf.data choose the degree of parallelism for the transformations below.
AUTO = tf.data.experimental.AUTOTUNE

# One dataset per training fold (train_0 ... train_9), kept separate so that
# any single fold can later be held out for validation.
datasets=[]

# Reading option: records may be delivered in non-deterministic order,
# which the author uses for faster TPU data reads.
ignore_order = tf.data.Options()
ignore_order.experimental_deterministic = False


def _parse_train_record(record):
    """Decode one serialized float32 tensor and pin its static shape to 187 values."""
    return tf.ensure_shape(tf.io.parse_tensor(record, out_type=tf.float32),(187,))


def _split_inputs_targets(row):
    """Turn a 187-value row into (inputs, targets).

    The inputs are all values except the last two. The targets are a 3-tuple:
    the second-to-last value, the last value (each wrapped in a length-1 list)
    and the inputs again, which serve as the reconstruction target.
    """
    features=row[:-2]
    return features,([row[-2]],[row[-1]],features)


with strategy.scope():
    for i in range(10):
        PATH=tf.io.gfile.glob(os.path.join(GCS_DS_PATH,f'train_{i}/feats.tfrecord*'))
        ds = TFRecordDataset(PATH, num_parallel_reads=AUTO).with_options(ignore_order)
        ds_all_feats = ds.map(_parse_train_record, num_parallel_calls=AUTO)
        dataset=ds_all_feats.map(_split_inputs_targets, num_parallel_calls=AUTO)

        datasets.append(dataset)

In [None]:
#train  validation split
def get_train_valid(datasets,valid_idx=0):
    """Split the per-fold datasets into a training set and a validation set.

    Args:
        datasets: iterable of dataset objects; each must provide a
            ``concatenate(other)`` method returning the combined dataset.
        valid_idx: zero-based position of the fold to hold out for validation.

    Returns:
        ``(train_ds, valid_ds)`` where ``valid_ds`` is the fold at ``valid_idx``
        and ``train_ds`` is every other fold concatenated in their original
        order (``None`` when there is no other fold).

    Raises:
        IndexError: if no fold exists at position ``valid_idx``.
    """
    train_ds=None
    valid_ds=None
    found_valid=False
    for i,dataset in enumerate(datasets):
        if i==valid_idx:
            valid_ds=dataset
            found_valid=True
        elif train_ds is None:
            train_ds=dataset
        else:
            train_ds=train_ds.concatenate(dataset)

    # Without this check an out-of-range index would only surface as an
    # UnboundLocalError on the return statement.
    if not found_valid:
        raise IndexError(f"valid_idx={valid_idx} does not match any dataset")

    return train_ds,valid_ds

# Model definition

## scheduler from [this notebook](https://www.kaggle.com/code/tolgadincer/tf-keras-learning-rate-schedulers/notebook)

In [None]:
import matplotlib.pyplot as plt
def plot_scheduler2(step, schedulers):
    """Plot learning-rate schedules on a linear and a log-scaled axis side by side.

    Args:
        step: number of schedule positions to evaluate (0 .. step-1).
        schedulers: a single scheduler or a list of schedulers. Each one is
            called with an integer position, its result must expose
            ``.numpy()``, and its ``name`` attribute is used as legend label.
    """
    scheduler_list = schedulers if isinstance(schedulers, list) else [schedulers]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[10, 3])
    for sched in scheduler_list:
        x = range(step)
        y = [sched(position).numpy() for position in x]
        # Left axis: linear scale; right axis: same curve on a log scale.
        for axis, use_log_scale in ((ax1, False), (ax2, True)):
            axis.plot(x, y, label=sched.name)
            axis.set_xlabel('Epoch')
            axis.set_ylabel('Learning Rate')
            if use_log_scale:
                axis.set_yscale('log')
            axis.legend()
    plt.show()
    
# Cyclical learning-rate schedule handed to the Adam optimizer in create_model().
# The rate moves between the initial and the maximal value; see the
# tensorflow_addons documentation for the exact role of step_size and gamma.
exp_clr = tfa.optimizers.ExponentialCyclicalLearningRate(
    name='ExponentialCyclicalLearningRate',
    scale_mode='iterations',
    initial_learning_rate=7e-5,
    maximal_learning_rate=8e-3,
    step_size=150,
    gamma=0.9985,
)
# Show the first 15000 positions of the schedule.
plot_scheduler2(step=15000, schedulers=exp_clr)

In [None]:
# Index helpers consumed by augment().

# Offsets 0..27 repeated three times: the position of every feature inside
# each of the three 28-feature slots of one team.
displacement=tf.constant(list(range(28))*3,dtype=tf.int64)

# Slot numbers 0, 1, 2 of one team (presumably one slot per player - confirm
# against the feature layout of the TFRecords).
indices=tf.range(3,dtype=tf.int64)

# Positions 0..16: the leading features that augment() keeps in place
# (named "ball" by the author).
indices_ball=tf.range(17,dtype=tf.int64)

#Augmentation function -> maybe we can even shuffle teams, the model seems to be resistent to player reordering ???!!!
def augment(x):
    """Randomly reorder the three 28-feature slots inside each team.

    Layout assumed along the last axis of ``x`` (185 values): positions 0-16
    are left untouched, positions 17-100 hold team A and positions 101-184
    hold team B, each team being three consecutive slots of 28 features.

    One permutation is drawn per team per call, so when ``x`` is a batch every
    row of that batch receives the same reordering.

    Args:
        x: tensor whose last axis has the 185-feature layout described above.

    Returns:
        Tensor of the same shape as ``x`` with the slots of each team permuted.
    """
    def _shuffled_team(offset):
        # Draw a random slot order, then expand it to per-feature gather indices:
        # slot * 28 + position inside the slot + start of the team block.
        slot_order=tf.random.shuffle(indices)
        gather_idx=tf.repeat(slot_order,28,axis=-1)*28+displacement+offset
        return tf.gather(x,gather_idx,axis=-1)

    #shuffle team A
    a=_shuffled_team(17)

    #shuffle team B
    b=_shuffled_team(101)

    ball=tf.gather(x,indices_ball,axis=-1)

    return tf.concat([ball,a,b],axis=-1)

In [None]:
def create_model():
    """Build the network with two scoring heads and a reconstruction head.

    The 185 input features go through ``augment`` (random reordering of the
    per-team feature slots) and a dense trunk. Two sigmoid heads, "teamA" and
    "teamB", read the trunk output directly. A deeper branch on top of the
    trunk ends in the linear "reconstruction" head with 185 outputs.

    Returns:
        ``(model, model_inference)``. ``model`` exposes all three outputs and is
        compiled with binary cross-entropy for the two sigmoid heads and mean
        squared error (weight 0.7) for the reconstruction head.
        ``model_inference`` shares the same layers and weights but exposes only
        the two sigmoid outputs; it is not compiled.
    """
    inputs=Input(shape=[185])

    #need to check better this logic
    #Shuffle players inside their own team
    # NOTE: the Lambda layer has no training switch, so the shuffle is also
    # applied by predict(); repeated predictions on the same data can differ.
    # Both returned models use this augmented path; an extra un-augmented pass
    # through the trunk would not feed any output, so none is built.
    x=Lambda(augment)(inputs)#this should make reconstruction very hard -> has to reorder players , well actually it seems that having this or not is not so important for reconstruction loss

    # Trunk: BN + light noise, four Dense/BN/Dropout stages, final Dense/Dropout.
    trunk_layers=[BatchNormalization(),GaussianNoise(0.005)]
    for units in (4096,2048,2048,1024):
        trunk_layers.append(Dense(units,activation=tfa.activations.mish))
        trunk_layers.append(BatchNormalization())
        trunk_layers.append(Dropout(0.75))
    trunk_layers.append(Dense(512,activation=tfa.activations.mish))
    trunk_layers.append(Dropout(0.7))
    sequential=tf.keras.Sequential(trunk_layers)

    x=sequential(x)

    # Scoring heads: one sigmoid probability per team.
    denseA=Dense(1,activation="sigmoid",name="teamA")
    denseB=Dense(1,activation="sigmoid",name="teamB")

    out1=denseA(x)
    out2=denseB(x)

    # Reconstruction branch: two Dense/BN/Dropout stages, then a linear layer
    # producing one value per input feature.
    for _ in range(2):
        x=Dense(1024,activation=tfa.activations.mish)(x)
        x=BatchNormalization()(x)
        x=Dropout(0.8)(x)

    out3=Dense(185,name="reconstruction")(x)

    model=Model(inputs=inputs,outputs=[out1,out2,out3])

    model_inference=Model(inputs=inputs,outputs=[out1,out2])

    #higher BS -> better increase the Learning rate
    model.compile(
        optimizer=Adam(learning_rate=exp_clr),
        loss=[BinaryCrossentropy(from_logits=False),BinaryCrossentropy(from_logits=False),MeanSquaredError()],
        metrics=["accuracy"],
        loss_weights=[1,1,0.7],
    )
    return model,model_inference

In [None]:
# Submission frame: start from the sample file and zero both prediction
# columns so that per-fold predictions can be accumulated into them.
df=pd.read_csv("../input/tabular-playground-series-oct-2022/sample_submission.csv")
df["team_A_scoring_within_10sec"]=0
df["team_B_scoring_within_10sec"]=0


def _parse_test_record(record):
    """Decode one serialized float32 tensor and pin its static shape to 185 values."""
    return tf.ensure_shape(tf.io.parse_tensor(record, out_type=tf.float32),(185,))


with strategy.scope():
    PATH=tf.io.gfile.glob(os.path.join(GCS_DS_PATH,'test/feats.tfrecord'))
    # The non-deterministic read option of the training pipeline is not applied
    # here; predictions are later added to df by row position.
    ds = TFRecordDataset(PATH, num_parallel_reads=AUTO)

    ds_test = ds.map(_parse_test_record, num_parallel_calls=AUTO)

    # Fixed batches of 512*8 rows, cached so repeated predict() calls reuse them.
    batched_dataset=ds_test.batch(512*8).cache()

# Training 

In [None]:
# Build one throw-away training model just to report its parameter count;
# the inference model returned alongside it is not needed here.
model,_=create_model()
Number_of_parameters=model.count_params()
print(f"The model has {Number_of_parameters:,} parameters")

In [None]:
import numpy as np
#code adapted from https://stackoverflow.com/questions/64556120/early-stopping-with-multiple-conditions
#NEXT step -> save best for A, best for B -> stop when both stop improving (so if only A improve keep going and update only best for A)
#             can do inference using the 2 models separately         
class CustomEarlyStopping(tf.keras.callbacks.Callback):
    """Early stopping on the sum of the teamA and teamB validation losses.

    After every epoch the callback adds ``val_teamA_loss`` and
    ``val_teamB_loss``. When the sum improves, the current model weights are
    remembered; when it has not improved for ``patience`` consecutive epochs,
    training is stopped. At the end of training the remembered best weights
    are put back into the model, whether training stopped early or ran through
    all requested epochs.

    Args:
        patience: number of consecutive non-improving epochs after which
            training is stopped.
    """

    def __init__(self, patience=0):
        super().__init__()
        self.patience = patience
        self.best_weights = None

    def on_train_begin(self, logs=None):
        # The number of epoch it has waited when loss is no longer minimum.
        self.wait = 0
        # The epoch the training stops at.
        self.stopped_epoch = 0
        # Initialize the best as infinity (np.Inf no longer exists in NumPy 2).
        self.loss = np.inf
        # Forget weights remembered during a previous fit() with this instance.
        self.best_weights = None

    def on_epoch_end(self, epoch, logs=None):
        loss1=logs.get('val_teamA_loss')
        loss2=logs.get('val_teamB_loss')
        loss=loss1+loss2
        # Stop once the summed validation loss has not improved for 'patience' epochs.
        if np.less(loss, self.loss):
            self.loss = loss
            self.wait = 0
            # Record the best weights if current results is better (less).
            self.best_weights = self.model.get_weights()
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.stopped_epoch = epoch
                self.model.stop_training = True

    def on_train_end(self, logs=None):
        # Restoring here covers both the early stop and a run that reached the
        # last epoch. best_weights is None only when no epoch ever improved on
        # the initial infinity (e.g. NaN losses); there is nothing to restore then.
        if self.best_weights is not None:
            print("Restoring model weights from the end of the best epoch.")
            self.model.set_weights(self.best_weights)

In [None]:
# Every individual test-time prediction vector (all folds, all passes);
# used later for the median ensemble.
predsA=[]
predsB=[]

# Number of folds and of prediction passes per fold. Passes can differ from
# each other because the model shuffles the feature slots at inference too.
N_FOLDS=len(datasets)
N_TTA=5

with strategy.scope():
    gc.collect()
    for i in range(N_FOLDS):
        # Stop on the summed teamA+teamB validation loss instead of using one
        # built-in EarlyStopping per head.
        callbacks=[CustomEarlyStopping(patience=10)]
        print(f"---------------------------  start fold {i}  ---------------------------")
        train_ds,valid_ds=get_train_valid(datasets,i)
        #Very important caching reduces training time to 10% after first epoch!!!!
        train=train_ds.shuffle(500000).batch(BATCH_SIZE).cache().prefetch(AUTO).shuffle(150)
        valid=valid_ds.shuffle(500000).batch(BATCH_SIZE).cache().prefetch(AUTO).shuffle(150)
        model,model_inference=create_model()
        model.fit(train,validation_data=valid,epochs=60,callbacks=callbacks,verbose=2)

        # Exactly N_TTA prediction passes over the test set; no pass is run
        # and then thrown away.
        fold_predsA=[]
        fold_predsB=[]
        for tta_pass in range(N_TTA):
            preds=model_inference.predict(batched_dataset,verbose=2)
            fold_predsA.append(preds[0][:,0])
            fold_predsB.append(preds[1][:,0])

        predsA.extend(fold_predsA)
        predsB.extend(fold_predsB)

        # Add this fold's mean over the passes to the running sum in df.
        df["team_A_scoring_within_10sec"]+=sum(fold_predsA)/N_TTA
        df["team_B_scoring_within_10sec"]+=sum(fold_predsB)/N_TTA

        #save models to disk 
        save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
        model.save(f'./model_{i}', options=save_locally) # saving in Tensorflow's "SavedModel" format
        del model,preds,fold_predsA,fold_predsB,train,valid
        gc.collect()

    # Turn the per-fold sums into the mean over all folds.
    df["team_A_scoring_within_10sec"]/=N_FOLDS
    df["team_B_scoring_within_10sec"]/=N_FOLDS

# Prediction

In [None]:
# Write the fold-averaged predictions as the first submission file,
# then display the first rows as the cell output.
df.to_csv("submission.csv",index=False)
df.head()

In [None]:
# Replace the averaged columns by the element-wise median over every
# individual prediction vector collected in predsA / predsB.
stacked_A=np.array(predsA)
stacked_B=np.array(predsB)
df["team_A_scoring_within_10sec"]=np.median(stacked_A,axis=0)
df["team_B_scoring_within_10sec"]=np.median(stacked_B,axis=0)

In [None]:
# Display the first rows of the median-based predictions.
df.head()

In [None]:
# Write the median-based predictions as a second submission file,
# then display the first rows as the cell output.
df.to_csv("submission_median.csv",index=False)
df.head()