**Load Libs**

In [17]:
import math
# Native python libs
import os
from datetime import datetime
from typing import Any

In [18]:
# pip installed libs
import numpy as np
import pandas as pd
import tensorflow as tf
import sklearn

**Paths**

In [19]:
BASE_PATH = f"{os.path.abspath('')}\\.."

**Kaggle**

In [20]:
KAGGLE = False

In [21]:
KAGGLE_PATH = "/kaggle" if KAGGLE else f"{BASE_PATH}\\kaggle"

In [22]:
def submission_path_exists() -> str:
    directory = f"{KAGGLE_PATH}\\working\\{datetime.now().strftime('%d%m%Y')}"
    if not os.path.exists(directory):
        os.mkdir(directory)
        print(f"Created new output directory for today at '{directory}'")
    return directory

In [23]:
INPUT_PATH = f"{KAGGLE_PATH}\\input\\goodreads-books-reviews-290312"
OUTPUT_PATH = submission_path_exists()
SUBMISSION_PATH = f"{OUTPUT_PATH}\\{datetime.now().strftime('%H%M%S')}_submission.csv"

**Tensorboard & General Monitoring**

In [24]:
TENSORBOARD_LOGS_PATH = f"{BASE_PATH}\\tensorboard_logs"
KERAS_TUNER_MONITOR_PATH = f"{OUTPUT_PATH}\\keras_tuner_monitoring"
MONITOR_PATH = f"{OUTPUT_PATH}\\monitoring.csv"

In [25]:
# Machine Learning tensorboard paths
TENSORBOARD_LOGS_PATH_ML = f"{TENSORBOARD_LOGS_PATH}\\ML"
LINEAR = f"{TENSORBOARD_LOGS_PATH_ML}\\Linear"
MLP = f"{TENSORBOARD_LOGS_PATH_ML}\\MLP"

In [26]:
# Deep Learning tensorboard paths
TENSORBOARD_LOGS_PATH_DL = f"{TENSORBOARD_LOGS_PATH}\\DL"
CNN = f"{TENSORBOARD_LOGS_PATH_DL}\\CNN"
RESNET = f"{TENSORBOARD_LOGS_PATH_DL}\\ResNet"
RNN = f"{TENSORBOARD_LOGS_PATH_DL}\\RNN"
SIMPLE_RNN = f"{TENSORBOARD_LOGS_PATH_DL}\\SimpleRNN"
TRANSFORMER = f"{TENSORBOARD_LOGS_PATH_DL}\\Transformer"

In [27]:
# Test if path is good
os.path.abspath(TENSORBOARD_LOGS_PATH)

'C:\\Users\\juanm\\OneDrive\\Bureau\\ESGI - Projets\\4IABD\\Projet Deep Learning\\tensorboard_logs'

**Hyperparameters**

In [28]:
# Fix
CLASSES = 6

In [29]:
# Adjustable
BATCH_SIZE = 1024  # Big batch size, small learning rate
VOCAB_SIZE = 20000
SEQUENCE_LENGTH = 256
EMBEDDING_DIMS = 128
EPOCHS = 100
TRIALS = 10

**Load Datasets**

In [30]:
train_dataset = pd.read_csv(f"{INPUT_PATH}\\goodreads_train.csv",
                            usecols=['review_text', 'rating'])

In [31]:
test_dataset = pd.read_csv(f"{INPUT_PATH}\\goodreads_test.csv",
                           usecols=['review_text'])

**GPU/TPU MultiThreading Setup**

In [32]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)

    strategy = tf.distribute.experimental.TPUStrategy
except ValueError:
    strategy = tf.distribute.get_strategy()
    print('Number of replicas:', strategy.num_replicas_in_sync)

Number of replicas: 1


In [33]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
except ValueError:
    tpu = None
    gpus = tf.config.experimental.list_logical_devices("GPU")

In [34]:
if tpu:
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu, )
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
elif len(gpus) > 1:
    strategy = tf.distribute.MultiWorkerMirroredStrategy([gpu.name for gpu in gpus])
    print('Running on multiple GPUs ', [gpu.name for gpu in gpus])
elif len(gpus) == 1:
    strategy = tf.distribute.get_strategy()
    print('Running on single GPU ', gpus[0].name)
else:
    strategy = tf.distribute.get_strategy()
    print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)

Running on single GPU  /device:GPU:0
Number of accelerators:  1


**NLP**

In [35]:
# Create a TextVectorization layer
vectorize_layer = tf.keras.layers.TextVectorization(standardize=None,
                                                    output_sequence_length=SEQUENCE_LENGTH,
                                                    output_mode='int')

In [36]:
%%time
with strategy.scope():
    vectorize_layer.adapt(train_dataset['review_text'], batch_size=BATCH_SIZE * strategy.num_replicas_in_sync)

CPU times: total: 1min 2s
Wall time: 1min 28s


In [37]:
vectorize_layer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'and',
 'I',
 'to',
 'a',
 'of',
 'is',
 'was',
 'in',
 'that',
 'it',
 'this',
 'for',
 'but',
 'with',
 'book',
 'her',
 'as',
 'The',
 'so',
 'not',
 'she',
 'have',
 'be',
 'on',
 'you',
 'like',
 'just',
 'my',
 'about',
 'are',
 'really',
 'he',
 'me',
 'at',
 'his',
 'read',
 'all',
 'one',
 'more',
 'from',
 'they',
 'what',
 'an',
 'story',
 'love',
 'has',
 'had',
 'how',
 'It',
 'This',
 'who',
 'because',
 'out',
 'up',
 'or',
 'by',
 'when',
 'were',
 "I'm",
 'would',
 'their',
 'much',
 'some',
 'get',
 "didn't",
 'very',
 'if',
 '-',
 'there',
 'characters',
 'will',
 'into',
 'can',
 "it's",
 'even',
 'think',
 'first',
 'And',
 "don't",
 'than',
 'But',
 'know',
 'book.',
 'also',
 'been',
 'it.',
 'other',
 'loved',
 'only',
 'good',
 'reading',
 'time',
 'see',
 'did',
 'way',
 'we',
 'him',
 'could',
 'them',
 'no',
 'which',
 'little',
 'do',
 'still',
 'going',
 'being',
 'books',
 'She',
 'things',
 'too',
 'felt',
 "It's",
 'made',
 'feel

In [38]:
def vectorize_text(text: Any, label: Any) -> Any:
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

**Creating Dataset For Models**

In [39]:
train_dataset, validation_dataset = sklearn.model_selection.train_test_split(train_dataset, test_size=0.2)

In [40]:
def dataset_from_raw_data(x: np.ndarray, y: np.ndarray, batch_size: int = BATCH_SIZE) -> Any:
    # Create dataset
    dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size)
    # Vectorize
    dataset = dataset.map(vectorize_text)
    print(dataset.element_spec)
    return dataset

**Linear**

In [41]:
def linear() -> tf.keras.Sequential:
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(VOCAB_SIZE + 1, EMBEDDING_DIMS),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(CLASSES, activation='sigmoid'),
    ])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=["accuracy"])
    return model

**MLP**

In [42]:
def mlp() -> tf.keras.Sequential:
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(VOCAB_SIZE + 1, EMBEDDING_DIMS))
    model.add(tf.keras.layers.GlobalAveragePooling1D())
    model.add(tf.keras.layers.Dense(units=512, activation='relu'))
    model.add(tf.keras.layers.Dense(units=384, activation='relu'))
    model.add(tf.keras.layers.Dense(CLASSES, activation='sigmoid'))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=["accuracy"])
    return model

**CNN**

In [43]:
def cnn() -> tf.keras.Sequential:
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int32))
    model.add(tf.keras.layers.Embedding(VOCAB_SIZE + 1, EMBEDDING_DIMS))
    model.add(tf.keras.layers.Reshape((math.isqrt(SEQUENCE_LENGTH), math.isqrt(SEQUENCE_LENGTH), -1),
                                      input_shape=(None, SEQUENCE_LENGTH)))

    # Conv & pooling tf.keras.layers
    model.add(tf.keras.layers.Conv2D(filters=8, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.Conv2D(filters=8, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.MaxPool2D())
    model.add(tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.Conv2D(filters=16, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.MaxPool2D())
    model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.MaxPool2D())

    # Fully connected tf.keras.layers
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(units=32, activation='relu'))
    model.add(tf.keras.layers.Dense(units=16, activation='relu'))
    model.add(tf.keras.layers.Dense(units=8, activation='relu'))
    model.add(tf.keras.layers.Dense(CLASSES, activation='softmax'))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=["accuracy"])
    return model

**Simpler RNN**

In [44]:
def simple_rnn() -> tf.keras.Sequential:
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int32))
    model.add(tf.keras.layers.Embedding(VOCAB_SIZE + 1, EMBEDDING_DIMS))
    model.add(tf.keras.layers.SimpleRNN(512))
    model.add(tf.keras.layers.Dense(CLASSES, activation='sigmoid'))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2),
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=["accuracy"])
    return model

**Utilitary For Monitoring**

In [45]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [46]:
def tensorboard_logs(model_name: str) -> tf.keras.callbacks.TensorBoard:
    return tf.keras.callbacks.TensorBoard(f"{globals()[model_name.upper()]}"
                                          f"_BS_{BATCH_SIZE}"
                                          f"_MAXFEAT_{VOCAB_SIZE}"
                                          f"_EMBEDDING_{EMBEDDING_DIMS}"
                                          f"_SEQLEN_{SEQUENCE_LENGTH}"
                                          f"_EPOCHS_{EPOCHS}"
                                          f"_TRIALS_{TRIALS}")

In [47]:
def epochs_logs(model_name: str) -> tf.keras.callbacks.CSVLogger:
    return tf.keras.callbacks.CSVLogger(f"{globals()[model_name.upper()]}"
                                        f"_BS_{BATCH_SIZE}"
                                        f"_MAXFEAT_{VOCAB_SIZE}"
                                        f"_EMBEDDING_{EMBEDDING_DIMS}"
                                        f"_SEQLEN_{SEQUENCE_LENGTH}"
                                        f"_EPOCHS_{EPOCHS}"
                                        f"_TRIALS_{TRIALS}.csv")

In [49]:
# In case of multiple models training
for model in [linear, mlp, cnn, simple_rnn]:
    for batch_size in [1024, 15600, 56333, 18000, 23256]:
        x, y = sklearn.model_selection.train_test_split(train_dataset, test_size=0.2)
        x = dataset_from_raw_data(train_dataset['review_text'], train_dataset['rating'], batch_size=batch_size)
        y = dataset_from_raw_data(validation_dataset['review_text'], validation_dataset['rating'],
                                  batch_size=batch_size)
        model_to_train = model()
        model_to_train.fit(x, validation_data=y, epochs=EPOCHS,
                           callbacks=[stop_early, tensorboard_logs(model.__name__), epochs_logs(model.__name__)])
        eval_result = model.evaluate(validation_dataset)
        model_to_train.save(f"{OUTPUT_PATH}\\"
                            f"{model.__name__}"
                            f"_loss_{eval_result[0]}"
                            f"_acc_{eval_result[1]}")

(TensorSpec(shape=(None, 256), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))
(TensorSpec(shape=(None, 256), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))
Epoch 1/100
 35/704 [>.............................] - ETA: 1:10 - loss: 1.7853 - accuracy: 0.2807


KeyboardInterrupt



**Evaluation**

In [None]:
models = [dir for root, dirs, files in os.walk(f'{KAGGLE_PATH}/working') for dir in dirs if dir.__contains__("acc")]
sort_models_per_acc = sorted(models, key=lambda x: float(x[x.find('_acc_') + 5:]), reverse=True)
sort_models_per_loss = sorted(models, key=lambda x: float(x[x.find('_loss_') + 6:x.find('_acc_')]))
print(sort_models_per_acc)
print(sort_models_per_loss)

In [None]:
best_model = tf.keras.models.load_model(f"{KAGGLE_PATH}/working/{sort_models_per_acc[0]}")

**Submission**

In [None]:
submission = pd.DataFrame()
submission['review_id'] = [data.decode("utf-8") for data in test_dataset['review_id']]
submission['rating'] = best_model.predict(test_dataset)

In [None]:
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"Submission registered at {SUBMISSION_PATH}")