**Load Libs**

In [257]:
# Native python libs
import os
import sys
import math
import collections
from datetime import datetime
from typing import Any, Union

import keras.optimizers

In [258]:
# pip installed libs
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_tuner as kt

**Paths**

In [259]:
BASE_PATH = f"{os.path.abspath('')}\\.."

**Kaggle**

In [260]:
KAGGLE = False

In [261]:
KAGGLE_PATH = "kaggle" if KAGGLE else f"{BASE_PATH}\\kaggle"

In [262]:
def output_path_exists() -> str:
    directory = f"{KAGGLE_PATH}\\working\\{datetime.now().strftime('%d%m%Y')}"
    if not os.path.exists(directory):
        os.mkdir(directory)
        print(f"Created new output directory for today at '{directory}'")
    return directory

In [263]:
INPUT_PATH = f"{KAGGLE_PATH}\\input\\goodreads-books-reviews-290312"
OUTPUT_PATH = f"{output_path_exists()}\\{datetime.now().strftime('%H%M%S')}_submission.csv"

**Tensorboard**

In [264]:
TENSORBOARD_LOGS_PATH = f"{BASE_PATH}\\tensorboard_logs"

In [265]:
# Machine Learning tensorboard paths
TENSORBOARD_LOGS_PATH_ML = f"{TENSORBOARD_LOGS_PATH}\\ML"
LINEAR = f"{TENSORBOARD_LOGS_PATH_ML}\\Linear"
MLP = f"{TENSORBOARD_LOGS_PATH_ML}\\MLP"

In [266]:
# Deep Learning tensorboard paths
TENSORBOARD_LOGS_PATH_DL = f"{TENSORBOARD_LOGS_PATH}\\DL"
CNN = f"{TENSORBOARD_LOGS_PATH_DL}\\CNN"
RNN = f"{TENSORBOARD_LOGS_PATH_DL}\\RNN"
TRANSFORMER = f"{TENSORBOARD_LOGS_PATH_DL}\\Transformer"

In [267]:
# Test if path is good
os.path.abspath(TENSORBOARD_LOGS_PATH)

'C:\\Users\\juanm\\OneDrive\\Bureau\\ESGI - Projets\\4IABD\\Projet Deep Learning\\tensorboard_logs'

**Hyperparameters**

In [268]:
# Fix
CLASSES = 6
BATCH_SIZE = 32  # Train = 900000 | Test = 478033
BUFFER = 50000

In [269]:
# Adjustable
VOCAB_SIZE = 20000
SEQUENCE_LENGTH = 256
EMBEDDING_DIMS = 128
EPOCHS = 10
LEARNING_RATE = 0.001
NUM_HEADS = 2
FF_DIMS = 4 * EMBEDDING_DIMS  # In paper, value used is 4 * EMBEDDING_DIMS
DROPOUT_RATE = 0  # Between 0 and 1

**Load Data**

In [270]:
def load_csv_data(path: str,
                  batch_size: int = BATCH_SIZE,
                  buffer: int = BUFFER,
                  separator: Union[list[str], str] = ',',
                  columns: list[str] = None) -> collections.OrderedDict:
    # Load data -> tensors
    dataset = tf.data.experimental.make_csv_dataset(
        path,
        batch_size=batch_size,
        field_delim=separator,
        select_columns=columns,
        shuffle=False,
        prefetch_buffer_size=buffer,
        num_rows_for_inference=None
    )
    # Get an iterator over the dataset
    iterator = dataset.as_numpy_iterator()
    return next(iterator)

**Train Dataset**

In [271]:
%%time
# Load training dataset
train_dataset = load_csv_data(f"{INPUT_PATH}\\goodreads_train.csv",
                              batch_size=900000,
                              columns=['review_text', 'rating'])

CPU times: total: 57.8 s
Wall time: 2min 22s


In [272]:
train_ratings = train_dataset['rating']

**Test Dataset**

In [273]:
%%time
# Load test dataset
test_dataset = load_csv_data(f"{INPUT_PATH}\\goodreads_test.csv",
                             batch_size=1024,
                             columns=['review_id', 'review_text'])

CPU times: total: 35.3 s
Wall time: 1min 4s


In [274]:
test_review_ids = test_dataset['review_id']

**NLP**

In [275]:
# Create a TextVectorization layer
vectorize_layer = tf.keras.layers.TextVectorization(standardize=None,
                                                    output_sequence_length=SEQUENCE_LENGTH,
                                                    output_mode='int')  # int avec couche d'embedding sinon tf_idf

In [276]:
%%time
# Fit the layer to the input text data
vectorize_layer.adapt(train_dataset['review_text'], batch_size=BATCH_SIZE)

CPU times: total: 2min 57s
Wall time: 6min 9s


In [253]:
vectorize_layer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'and',
 'I',
 'to',
 'a',
 'of',
 'is',
 'was',
 'in',
 'that',
 'it',
 'this',
 'for',
 'but',
 'with',
 'book',
 'her',
 'as',
 'The',
 'so',
 'not',
 'she',
 'have',
 'be',
 'on',
 'you',
 'like',
 'just',
 'my',
 'about',
 'are',
 'really',
 'he',
 'me',
 'at',
 'his',
 'read',
 'all',
 'one',
 'more',
 'from',
 'they',
 'what',
 'an',
 'story',
 'love',
 'has',
 'had',
 'how',
 'It',
 'This',
 'who',
 'because',
 'out',
 'up',
 'or',
 'by',
 'when',
 'were',
 "I'm",
 'would',
 'their',
 'much',
 'some',
 'get',
 "didn't",
 'very',
 'if',
 '-',
 'there',
 'characters',
 'will',
 'into',
 'can',
 "it's",
 'even',
 'think',
 'first',
 'And',
 "don't",
 'than',
 'But',
 'know',
 'book.',
 'also',
 'been',
 'it.',
 'other',
 'loved',
 'only',
 'good',
 'reading',
 'time',
 'see',
 'did',
 'way',
 'we',
 'him',
 'could',
 'them',
 'no',
 'which',
 'little',
 'do',
 'still',
 'going',
 'being',
 'books',
 'She',
 'things',
 'too',
 'felt',
 "It's",
 'made',
 'feel

In [254]:
def vectorize_text(text: Any, label: Any) -> Any:
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

**Creating Dataset For Models**

In [255]:
def dataset_from_raw_data(x: np.ndarray, y: np.ndarray, batch_size: int = BATCH_SIZE) -> Any:
    y = tf.keras.utils.to_categorical(y, dtype='int32')
    dataset = tf.data.Dataset.from_tensor_slices((x, y))#.batch(batch_size)
    dataset = dataset.map(vectorize_text)
    print(dataset)
    return dataset

In [256]:
%%time
train_dataset = dataset_from_raw_data(train_dataset['review_text'], train_dataset['rating'])

<MapDataset element_spec=(TensorSpec(shape=(None, 256), dtype=tf.int64, name=None), TensorSpec(shape=(6,), dtype=tf.int32, name=None))>
CPU times: total: 2 s
Wall time: 8.64 s


**Linear**

In [186]:
def linear(hp):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(VOCAB_SIZE + 1, EMBEDDING_DIMS),
        tf.keras.layers.GlobalAveragePooling1D(),
        tf.keras.layers.Dense(CLASSES, activation='sigmoid'),
    ])

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                  metrics=["accuracy"])
    return model

**MLP**

In [187]:
def mlp(hp):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(VOCAB_SIZE + 1, EMBEDDING_DIMS))
    model.add(tf.keras.layers.GlobalAveragePooling1D())
    hp_units = hp.Int('units', min_value=32, max_value=512, step=32)
    model.add(tf.keras.layers.Dense(units=hp_units, activation='relu'))
    model.add(tf.keras.layers.Dense(units=hp_units, activation='relu'))
    model.add(tf.keras.layers.Dense(CLASSES, activation='sigmoid'))

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                  metrics=["accuracy"])
    return model

**CNN**

In [188]:
# CNN init
def cnn(hp):
    model = tf.keras.Sequential()
    model.add(tf.keras.Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int32))
    model.add(tf.keras.layers.Embedding(VOCAB_SIZE + 1, EMBEDDING_DIMS))
    model.add(tf.keras.layers.Reshape((math.isqrt(SEQUENCE_LENGTH), math.isqrt(SEQUENCE_LENGTH), -1),
                                      input_shape=(None, SEQUENCE_LENGTH)))
    # Conv & pooling layers
    hp_filters_1 = hp.Int('filters', min_value=8, max_value=32, step=8)
    model.add(tf.keras.layers.Conv2D(filters=hp_filters_1, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.Conv2D(filters=hp_filters_1, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.MaxPool2D())
    hp_filters_2 = hp.Int('filters', min_value=16, max_value=64, step=16)
    model.add(tf.keras.layers.Conv2D(filters=hp_filters_2, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.Conv2D(filters=hp_filters_2, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.MaxPool2D())
    hp_filters_3 = hp.Int('filters', min_value=32, max_value=128, step=32)
    model.add(tf.keras.layers.Conv2D(filters=hp_filters_3, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.Conv2D(filters=hp_filters_3, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.MaxPool2D())
    hp_filters_4 = hp.Int('filters', min_value=64, max_value=256, step=64)
    model.add(tf.keras.layers.Conv2D(filters=hp_filters_4, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.Conv2D(filters=hp_filters_4, kernel_size=(3, 3), activation='tanh', padding='same'))
    model.add(tf.keras.layers.MaxPool2D())
    # Fully connected layers
    model.add(tf.keras.layers.Flatten())
    hp_units_1 = hp.Int('units', min_value=64, max_value=256, step=64)
    model.add(tf.keras.layers.Dense(units=hp_units_1, activation='relu'))
    hp_units_2 = hp.Int('units', min_value=32, max_value=128, step=32)
    model.add(tf.keras.layers.Dense(units=hp_units_2, activation='relu'))
    hp_units_3 = hp.Int('units', min_value=16, max_value=64, step=16)
    model.add(tf.keras.layers.Dense(units=hp_units_3, activation='relu'))
    model.add(tf.keras.layers.Dense(CLASSES, activation='softmax'))

    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                  metrics=["accuracy"])
    return model

**ResNets / HighwayNets**

**RNN**

**Transformer**

In [189]:
inputs = tf.keras.Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int32)

x = tf.keras.layers.Embedding(VOCAB_SIZE + 1, EMBEDDING_DIMS)(inputs)

# Add the multi-head self-attention layer
x = tf.keras.layers.MultiHeadAttention(EMBEDDING_DIMS, NUM_HEADS)(x, x, x)
# Regularization
x = tf.keras.layers.Dropout(rate=DROPOUT_RATE)(x)
# Add the feed-forward layer
x = tf.keras.layers.Dense(FF_DIMS, activation='relu')(x)
# Regularization
x = tf.keras.layers.Dropout(rate=DROPOUT_RATE)(x)

# Add a dense layer for output
outputs = tf.keras.layers.Dense(VOCAB_SIZE + 1)(x)

# Create the model
transformer = tf.keras.Model(inputs=inputs, outputs=outputs)

In [190]:
"""class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self,
                 num_heads: int = NUM_HEADS,
                 ff_dim: int = FF_DIMS,
                 embed_dim: int = EMBEDDING_DIMS,
                 dropout_rate: int = 0):
        # TODO : NUM_HEADS HV A IMPLEMENTER
        super().__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(ff_dim, activation="relu"),
             tf.keras.layers.Dense(embed_dim),]
        )
        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)"""

'class TransformerBlock(tf.keras.layers.Layer):\n    def __init__(self,\n                 num_heads: int = NUM_HEADS,\n                 ff_dim: int = FF_DIMS,\n                 embed_dim: int = EMBEDDING_DIMS,\n                 dropout_rate: int = 0):\n        # TODO : NUM_HEADS HV A IMPLEMENTER\n        super().__init__()\n        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)\n        self.ffn = tf.keras.Sequential(\n            [tf.keras.layers.Dense(ff_dim, activation="relu"),\n             tf.keras.layers.Dense(embed_dim),]\n        )\n        self.layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n        self.layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)\n        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)\n        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)\n\n    def call(self, inputs, training):\n        attn_output = self.att(inputs, inputs)\n        attn_output = self.dropout1(attn_outp

In [191]:
"""class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    def __init__(self,
                 maxlen: int = SEQUENCE_LENGTH,
                 vocab_size: int = VOCAB_SIZE,
                 embed_dim: int = EMBEDDING_DIMS):
        super().__init__()
        self.token_emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = tf.keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, dataset):
        maxlen = tf.shape(dataset)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        dataset = self.token_emb(dataset)
        return dataset + positions"""

'class TokenAndPositionEmbedding(tf.keras.layers.Layer):\n    def __init__(self,\n                 maxlen: int = SEQUENCE_LENGTH,\n                 vocab_size: int = VOCAB_SIZE,\n                 embed_dim: int = EMBEDDING_DIMS):\n        super().__init__()\n        self.token_emb = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)\n        self.pos_emb = tf.keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)\n\n    def call(self, dataset):\n        maxlen = tf.shape(dataset)[-1]\n        positions = tf.range(start=0, limit=maxlen, delta=1)\n        positions = self.pos_emb(positions)\n        dataset = self.token_emb(dataset)\n        return dataset + positions'

In [192]:
"""inputs = tf.keras.Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int64)
embedding_layer = TokenAndPositionEmbedding()
transformer = embedding_layer(inputs)
transformer_block = TransformerBlock(transformer)
transformer = transformer_block(transformer)
transformer = tf.keras.layers.GlobalAveragePooling1D()(transformer)
transformer = tf.keras.layers.Dense(20, activation="relu")(transformer)
outputs = tf.keras.layers.Dense(CLASSES, activation="softmax")(transformer)

transformer = tf.keras.Model(inputs=inputs, outputs=outputs)"""

'inputs = tf.keras.Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int64)\nembedding_layer = TokenAndPositionEmbedding()\ntransformer = embedding_layer(inputs)\ntransformer_block = TransformerBlock(transformer)\ntransformer = transformer_block(transformer)\ntransformer = tf.keras.layers.GlobalAveragePooling1D()(transformer)\ntransformer = tf.keras.layers.Dense(20, activation="relu")(transformer)\noutputs = tf.keras.layers.Dense(CLASSES, activation="softmax")(transformer)\n\ntransformer = tf.keras.Model(inputs=inputs, outputs=outputs)'

**Training**

In [193]:
%%time
train_dataset = dataset_from_raw_data(train_dataset['review_text'], train_dataset['rating'])

<BatchDataset element_spec=(TensorSpec(shape=(None, None, 256), dtype=tf.int64, name=None), TensorSpec(shape=(None, 6), dtype=tf.int32, name=None))>
CPU times: total: 15.6 ms
Wall time: 98.8 ms


In [194]:
counter = 0
for data in train_dataset:
    counter += 1
print(counter)

1024


In [148]:
"""%%time
from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor() as executor:
    model_functions = [linear, mlp, cnn]
    logs = [LINEAR, MLP, CNN]
    futures = [executor.submit(train_over_model, model_function, log) for model_function in model_functions for log in logs]

    for future in as_completed(futures):
        future.result()
        POUR LE RAPPORT"""

'%%time\nfrom concurrent.futures import ThreadPoolExecutor, as_completed\n\nwith ThreadPoolExecutor() as executor:\n    model_functions = [linear, mlp, cnn]\n    logs = [LINEAR, MLP, CNN]\n    futures = [executor.submit(train_over_model, model_function, log) for model_function in model_functions for log in logs]\n\n    for future in as_completed(futures):\n        future.result()\n        POUR LE RAPPORT'

In [157]:
models = [linear, mlp, cnn]
paths = [LINEAR, MLP, CNN]

In [158]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [162]:
%%time
trained_models = []

strategy = tf.distribute.MultiWorkerMirroredStrategy()

with strategy.scope():
    for model, path in zip(models, paths):
        # Init model
        tuner = kt.BayesianOptimization(model,
                                        objective='val_accuracy',
                                        max_trials=EPOCHS,
                                        alpha=1e-4,
                                        beta=2.6)
        print(0)
        tuner.search(train_dataset,
                     epochs=EPOCHS,
                     callbacks=[stop_early])
        best_hps = tuner.get_best_hyperparameters(num_trials=EPOCHS)
        print(best_hps)
        model = tuner.hypermodel.build(best_hps)
        # Train over model
        history = model.fit(train_dataset,
                            epochs=EPOCHS,
                            callbacks=[tf.keras.callbacks.TensorBoard(f"{path}"
                                                                      f"_BS_{BATCH_SIZE}"
                                                                      f"_MAXFEAT_{VOCAB_SIZE}"
                                                                      f"_EMBEDDING_{EMBEDDING_DIMS}"
                                                                      f"_SEQLEN_{SEQUENCE_LENGTH}"
                                                                      f"_LR_{LEARNING_RATE}")])
        val_acc_per_epoch = history.history['val_accuracy']
        best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
        print(f"Best epoch: {best_epoch}")
        trained_models.append(model)

INFO:tensorflow:Single-worker MultiWorkerMirroredStrategy with local_devices = ('/device:GPU:0',), communication = CommunicationImplementation.AUTO
INFO:tensorflow:Reloading Tuner from .\untitled_project\tuner0.json
0


KeyError: <keras_tuner.engine.trial.Trial object at 0x00000210927CCAF0>

In [273]:
trained_models

[<keras.engine.sequential.Sequential at 0x134f7b7a140>,
 <keras.engine.sequential.Sequential at 0x134fbaf2c80>,
 <keras.engine.sequential.Sequential at 0x134ef70dde0>]

**Computing Accuracy** _(To choose best algorithm for submission)_

In [276]:
labels_predicted = 0  #[testing_model(trained_model, train_dataset) for trained_model in trained_models]

In [277]:
len(labels_predicted[0])

32

In [None]:
for label in labels_predicted:
    print(label)

In [280]:
accuracy_metric = tf.keras.metrics.Accuracy()
accuracy_metric.update_state(train_ratings, labels_predicted)
print(accuracy_metric.result().numpy())

ValueError: Shapes (1024,) and (3, 32) are incompatible

**Testing**

In [None]:
%%time
test_dataset = dataset_from_raw_data(test_dataset['review_text'],
                                     np.random.default_rng().integers(0, CLASSES, 1024))

In [None]:
# Test the model
def predict_model(model: tf.keras.Sequential, dataset: Any) -> np.ndarray:
    model_predict = 0
    for text, label in dataset:
        model_predict = model.predict(text)
    return model_predict.argmax(axis=1)

**Submission**

In [None]:
# Submission code
sample_submission = pd.read_csv(INPUT_PATH + "\\goodreads_sample_submission.csv")

In [None]:
%%time
# Getting data for csv file
sample_submission['rating'] = predict_model(cnn)
sample_submission['review_id'] = [data.decode("utf-8") for data in test_review_ids]

In [None]:
sample_submission.to_csv(OUTPUT_PATH, index=False)
print(f"CSV registered at {OUTPUT_PATH}")