In [43]:
import gc
import random

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn import preprocessing
from pathlib import Path
from sklearn.model_selection import train_test_split

%matplotlib inline

In [44]:
# TODO
# - Use the TF functional API
# - Explore the data to bring out trends
# - With a softmax there must be no NaN; NaN == missing values in the test data
# - The data is symmetric for goal situations, so running the model on A alone
#   gives A's probability and running it on B gives B's probability
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.10.0


In [45]:
### Notebook-wide settings
# Filesystem locations, all relative to the notebook's working directory.
DATA_FOLDER, CHECKPOINTS_FOLDER, OUT_FOLDER = "./data", "./checkpoints", "./out"
# Checkpoint filename template; the "{epoch:04d}" placeholder is filled in
# later (e.g. with str.format(epoch=...)).
CHECKPOINTS_PATH = "/".join((CHECKPOINTS_FOLDER, "cp-{epoch:04d}.ckpt"))

# Training hyper-parameters handed to model.fit.
BATCH_SIZE = 128
EPOCHS = 3

In [46]:
%%time
def reduce_memory_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Only columns whose dtype is one of int8/16/32/64 or float16/32/64 are
    touched; every other column is left as it is. The frame is modified in
    place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    Range checks are inclusive, so a column whose extreme value sits exactly
    on a dtype limit (e.g. a max of 127) still gets the narrow dtype.
    Float columns are chosen by value range only; casting to float16/float32
    reduces precision.
    """
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            # Try the integer widths from narrowest to widest. If min/max are
            # NaN (empty column) no comparison succeeds and the column is kept.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(candidate)
                if c_min >= bounds.min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                bounds = np.finfo(candidate)
                if c_min >= bounds.min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max not comparable (NaN/inf).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the percentage against a zero-byte starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(
            "Mem. usage decreased to {:.2f} Mb ({:.1f}% reduction)".format(
                end_mem, reduction
            )
        )
    return df

def reduce_mem_usage(df):
    """Shrink a DataFrame in place by narrowing column dtypes.

    * Signed-integer columns are cast to the narrowest signed integer type
      whose (inclusive) range covers the column's min and max.
    * Floating-point columns are cast to float16 or float32 when the value
      range allows it, otherwise to float64 (this reduces precision).
    * Object / string columns are converted to ``category``.
    * Every other dtype (bool, unsigned integers, datetimes, categoricals,
      pandas extension dtypes, ...) is left untouched, because routing those
      through the numeric path either changes their meaning or raises.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy signed-int ('i') and float ('f') columns are downcast.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'i':
            # If min/max are NaN (empty column) nothing matches and the
            # column keeps its dtype.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                bounds = np.iinfo(candidate)
                if c_min >= bounds.min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                bounds = np.finfo(candidate)
                if c_min >= bounds.min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting footprint.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


CPU times: user 12 µs, sys: 0 ns, total: 12 µs
Wall time: 15.3 µs


In [47]:
# Column-name -> dtype mapping, applied at read time to keep the frames small
dtypes_df = pd.read_csv(DATA_FOLDER+"/train_dtypes.csv")
dtypes = dict(zip(dtypes_df.column, dtypes_df.dtype))

# TEST frame: read it, replace missing values with the -1000.0 sentinel,
# then downcast the numeric columns
test_df = pd.read_csv(f"{DATA_FOLDER}/test.csv", dtype=dtypes)
test_df = reduce_memory_usage(test_df.fillna(-1000.0))


Mem. usage decreased to 74.89 Mb (44.0% reduction)


In [48]:
# Load the ten TRAIN shards, downcasting each one right after it is read
train_df_list = []
for i in range(10):
    print(f"Optimizing {i} df")
    shard_path = f"{DATA_FOLDER}/train_{i}.csv"
    shard_df = pd.read_csv(shard_path, dtype=dtypes)
    train_df_list.append(reduce_memory_usage(shard_df))

Optimizing 0 df
Mem. usage decreased to 260.33 Mb (40.9% reduction)
Optimizing 1 df
Mem. usage decreased to 262.67 Mb (40.9% reduction)
Optimizing 2 df
Mem. usage decreased to 257.43 Mb (40.9% reduction)
Optimizing 3 df
Mem. usage decreased to 256.11 Mb (40.9% reduction)
Optimizing 4 df
Mem. usage decreased to 256.51 Mb (40.9% reduction)
Optimizing 5 df
Mem. usage decreased to 253.59 Mb (40.9% reduction)
Optimizing 6 df
Mem. usage decreased to 249.98 Mb (40.9% reduction)
Optimizing 7 df
Mem. usage decreased to 255.51 Mb (40.9% reduction)
Optimizing 8 df
Mem. usage decreased to 260.62 Mb (40.9% reduction)
Optimizing 9 df
Mem. usage decreased to 254.69 Mb (40.9% reduction)


In [49]:
def create_model(version="v3", n_features=54):
    """Build and compile one of the candidate dense classifiers.

    Only the requested architecture is instantiated, so no unused networks
    are built and compiled on every call.

    Parameters
    ----------
    version : {"v1", "v2", "v3"}, default "v3"
        * ``"v1"``: three 256-unit blocks (dropout 0.1 / 0.3 on the last two),
          3-unit softmax head, categorical cross-entropy, AUC + accuracy.
        * ``"v2"``: three 256-unit blocks (dropout 0.05 / 0.05 / 0.1),
          2-unit sigmoid head, binary cross-entropy, AUC + recall.
        * ``"v3"``: 512 / 256 / 128-unit blocks (dropout 0.1 / 0.1 / 0.05),
          2-unit sigmoid head, binary cross-entropy, AUC + recall.
    n_features : int, default 54
        Width of the input layer.

    Returns
    -------
    tf.keras.Sequential
        The compiled model (Adam optimizer).

    Raises
    ------
    ValueError
        If ``version`` is not one of the known architectures.
    """
    # Per version: ([(hidden units, dropout rate or None), ...], (output units, activation))
    architectures = {
        "v1": ([(256, None), (256, 0.1), (256, 0.3)], (3, 'softmax')),
        "v2": ([(256, 0.05), (256, 0.05), (256, 0.1)], (2, 'sigmoid')),
        "v3": ([(512, 0.1), (256, 0.1), (128, 0.05)], (2, 'sigmoid')),
    }
    if version not in architectures:
        raise ValueError(
            f"Unknown model version {version!r}; expected one of {sorted(architectures)}"
        )
    hidden_blocks, (n_outputs, out_activation) = architectures[version]

    # Each hidden block is Dense(relu) -> BatchNormalization -> optional Dropout.
    layers = []
    for position, (units, drop_rate) in enumerate(hidden_blocks):
        if position == 0:
            layers.append(
                tf.keras.layers.Dense(units, activation='relu', input_shape=[n_features])
            )
        else:
            layers.append(tf.keras.layers.Dense(units, activation='relu'))
        layers.append(tf.keras.layers.BatchNormalization())
        if drop_rate is not None:
            layers.append(tf.keras.layers.Dropout(drop_rate))
    layers.append(tf.keras.layers.Dense(n_outputs, activation=out_activation))
    model = tf.keras.models.Sequential(layers)

    # Metrics get explicit names so the log keys ('auc', 'recall') stay stable.
    # NOTE(review): unnamed Keras metric instances appear to receive numeric
    # suffixes ('recall_1', ...) when several exist in one session, which would
    # hide them from callbacks that look up 'recall' - confirm.
    if version == "v1":
        model.compile(
            optimizer='adam',
            # The head already applies softmax, so the loss receives
            # probabilities rather than logits.
            loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
            metrics=[tf.keras.metrics.AUC(name='auc'), 'accuracy']
        )
    else:
        # Accuracy is not meaningful here: recall (the hit rate on goals) is
        # more relevant, and AUC / the ROC curve is better suited to judging
        # false vs. true positives.
        model.compile(
            optimizer='adam',
            loss=tf.keras.losses.BinaryCrossentropy(),
            metrics=[
                tf.keras.metrics.AUC(name='auc'),
                tf.keras.metrics.Recall(name='recall'),
            ]
        )

    return model

# Build the network returned by create_model and print its layer summary.
model = create_model()
model.summary()


Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_32 (Dense)            (None, 512)               28160     
                                                                 
 batch_normalization_24 (Bat  (None, 512)              2048      
 chNormalization)                                                
                                                                 
 dropout_20 (Dropout)        (None, 512)               0         
                                                                 
 dense_33 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_25 (Bat  (None, 256)              1024      
 chNormalization)                                                
                                                                 
 dropout_21 (Dropout)        (None, 256)              

In [50]:
# Re-create the model (a fresh instance replaces whatever `model` was bound to
# before this cell ran) and print its layer summary.
model = create_model()
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_44 (Dense)            (None, 512)               28160     
                                                                 
 batch_normalization_33 (Bat  (None, 512)              2048      
 chNormalization)                                                
                                                                 
 dropout_28 (Dropout)        (None, 512)               0         
                                                                 
 dense_45 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_34 (Bat  (None, 256)              1024      
 chNormalization)                                                
                                                                 
 dropout_29 (Dropout)        (None, 256)             

In [51]:
# CALLBACKS CONFIGURATION

class AccuracyCallback(tf.keras.callbacks.Callback):
    """Stop training once the epoch's 'accuracy' log value exceeds a threshold.

    Parameters
    ----------
    threshold : float, default 0.93
        Training stops at the end of the first epoch whose logged accuracy is
        strictly greater than this value.
    """

    def __init__(self, threshold=0.93):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check the logged accuracy and request a stop when it is high enough."""
        # `logs` may be None; treat that like an empty dict instead of relying
        # on a shared mutable default argument.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > self.threshold:
            print(f"\nReached {self.threshold*100}% accuracy so cancelling training!")

            # Stop training once the above condition is met
            self.model.stop_training = True

class RecallCallback(tf.keras.callbacks.Callback):
    """Stop training once the epoch's recall log value exceeds a threshold.

    Parameters
    ----------
    threshold : float, default 0.93
        Training stops at the end of the first epoch whose logged recall is
        strictly greater than this value.
    metric : str, default 'recall'
        Key to look up in the epoch logs. If the exact key is missing, a key
        of the form ``'<metric>_<digits>'`` is accepted as a fallback.
    """

    def __init__(self, threshold=0.93, metric='recall'):
        super().__init__()
        self.threshold = threshold
        self.metric = metric

    def _logged_value(self, logs):
        """Return the logged value for ``self.metric`` or None when absent."""
        if self.metric in logs:
            return logs[self.metric]
        # NOTE(review): Keras appears to suffix auto-named metrics
        # ('recall_1', ...) when several instances exist - confirm. The
        # fallback keeps the callback working in that case.
        prefix = self.metric + "_"
        for key, value in logs.items():
            if key.startswith(prefix) and key[len(prefix):].isdigit():
                return value
        return None

    def on_epoch_end(self, epoch, logs=None):
        """Check the logged recall and request a stop when it is high enough."""
        recall = self._logged_value(logs or {})
        if recall is not None and recall > self.threshold:
            print(f"\nReached {self.threshold*100}%  so cancelling training!")

            # Stop training once the above condition is met (this assignment
            # was missing, so the message was printed but training went on).
            self.model.stop_training = True

# Instantiate the metric-threshold callbacks defined above.
acc_callback = AccuracyCallback()
rec_callback = RecallCallback()

# Weights-only checkpoint writer targeting CHECKPOINTS_PATH.
# NOTE(review): Keras documents an integer save_freq as a number of batches,
# so this would save every 150*BATCH_SIZE batches - confirm that is intended
# rather than "every 150 batches".
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=CHECKPOINTS_PATH,
    save_weights_only=True,
    save_freq=150*BATCH_SIZE,
    verbose=1
)
# Learning-rate schedule: starts at 1e-8 and is multiplied by 10 every
# 20 epochs (the original note called this a "plateau" scheduler).
lr_schedule = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: 1e-8 * 10**(epoch / 20)
)

# Only the recall callback is active; the others are left commented out.
callbacks_list = [rec_callback]#, lr_schedule, cp_callback, acc_callback]

In [52]:
# Columns dropped from every training shard before features/targets are sliced.
to_remove = [
    'game_num', 
    'event_id', 
    'event_time', 
    'player_scoring_next', 
    'team_scoring_next',
]

# One Keras History object per training shard.
history_list = []

# Save the initial (untrained) weights as the "epoch 0" checkpoint.
model.save_weights(CHECKPOINTS_PATH.format(epoch=0))

# Fit the same model on each shard in turn, so the weights carry over from
# one shard to the next.
for i, df in enumerate(train_df_list):
    #df['no_team_scored'] = np.logical_xor(df['team_A_scoring_within_10sec'],df['team_B_scoring_within_10sec'])
    #df['no_team_scored'] = (~df['no_team_scored']).astype(int)
        
    df_drop = df.drop(to_remove, axis=1)
    # improvement: use fillna to account for "destructions" and to avoid NaN in the predictions
    #df_drop = df.dropna(inplace=False, axis=0)

    # Missing values are replaced by the -1000.0 sentinel (same value as the test frame).
    df_drop = df_drop.fillna(-1000.0)

        
    # Label-based column slices: features, then the two target columns.
    features = df_drop.loc[:,'ball_pos_x':'boost5_timer']
    #target = df_drop.loc[:,'team_A_scoring_within_10sec':'no_team_scored']
    target = df_drop.loc[:,'team_A_scoring_within_10sec':'team_B_scoring_within_10sec']

    # test_size is generally 10-15 % # previous: 0.05
    X_train, X_val, Y_train, Y_val = train_test_split(features, target, test_size = 0.10, random_state=41)
        
    print(f'Training dataset number {i}')
        
    history = model.fit(
        X_train, 
        Y_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        verbose=1,
        validation_data=(X_val, Y_val),
        callbacks=callbacks_list,
    )

    # can we plot learning history on multiple training ?
    history_list.append(history)
    
    # memory management to lower chance of kernel crashing
    # NOTE(review): train_df_list still references each shard, so `del df`
    # only removes the loop name; df_drop/features/target are not deleted here.
    del df, X_train, X_val, Y_train, Y_val
    gc.collect()
    print()

Training dataset number 0
Epoch 1/3
Epoch 2/3
Epoch 3/3

Training dataset number 1
Epoch 1/3
Epoch 2/3
Epoch 3/3

Training dataset number 2
Epoch 1/3
Epoch 2/3
Epoch 3/3

Training dataset number 3
Epoch 1/3
Epoch 2/3
Epoch 3/3

Training dataset number 4
Epoch 1/3
Epoch 2/3
Epoch 3/3

Training dataset number 5
Epoch 1/3
Epoch 2/3
Epoch 3/3

Training dataset number 6
Epoch 1/3
Epoch 2/3
Epoch 3/3

Training dataset number 7
Epoch 1/3
Epoch 2/3
Epoch 3/3

Training dataset number 8
Epoch 1/3
Epoch 2/3
Epoch 3/3

Training dataset number 9
Epoch 1/3
Epoch 2/3
Epoch 3/3



In [53]:
#latest = tf.train.latest_checkpoint(CHECKPOINTS_FOLDER)

In [54]:
#model = create_model()
#model.load_weights(latest)

In [55]:
# Test model loaded from latest checkpoints
#model.save(CHECKPOINTS_FOLDER+'/my_model.h5')

In [56]:
#new_model = tf.keras.models.load_model(CHECKPOINTS_FOLDER+'/my_model.h5')

# Show the model architecture
#new_model.summary()

In [61]:
# Select the feature columns from the test frame (the same label slice,
# 'ball_pos_x' through 'boost5_timer', that the training loop uses) and run
# inference with the trained model.
test = test_df.loc[:,'ball_pos_x':'boost5_timer']
preds = model.predict(test)




In [62]:
# Display the array returned by model.predict for a quick visual check.
preds

array([[0.02608083, 0.00367771],
       [0.01589172, 0.04301617],
       [0.01437513, 0.03655634],
       ...,
       [0.04801385, 0.00723238],
       [0.01273773, 0.021108  ],
       [0.06388223, 0.00713387]], dtype=float32)

In [63]:
# Make sure the output directory exists: DataFrame.to_csv does not create
# missing parent folders and fails if OUT_FOLDER is absent.
Path(OUT_FOLDER).mkdir(parents=True, exist_ok=True)

# Fill the sample submission with the two predicted probabilities
# (column 0 -> team A, column 1 -> team B) and write it out.
ss = pd.read_csv(DATA_FOLDER+'/sample_submission.csv')
ss['team_A_scoring_within_10sec'] = preds[:,0]
ss['team_B_scoring_within_10sec'] = preds[:,1]
ss.to_csv(OUT_FOLDER+'/Submission.csv', index=False)
ss.head()

Unnamed: 0,id,team_A_scoring_within_10sec,team_B_scoring_within_10sec
0,0,0.026081,0.003678
1,1,0.015892,0.043016
2,2,0.014375,0.036556
3,3,0.015553,0.029662
4,4,0.026748,0.020111


In [60]:
"""
history = model.history
def plot_graphs(history, metric):
    plt.plot(history[metric])
    plt.plot(history[f'val_{metric}'])
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, f'val_{metric}'])
    plt.show()
    
#plot_graphs(history, "accuracy")
plot_graphs(history, "recall")
plot_graphs(history, "loss")
"""

'\nhistory = model.history\ndef plot_graphs(history, metric):\n    plt.plot(history[metric])\n    plt.plot(history[f\'val_{metric}\'])\n    plt.xlabel("Epochs")\n    plt.ylabel(metric)\n    plt.legend([metric, f\'val_{metric}\'])\n    plt.show()\n    \n#plot_graphs(history, "accuracy")\nplot_graphs(history, "recall")\nplot_graphs(history, "loss")\n'