**Load Libs**

In [1651]:
# Native python libs
import os
import collections

In [1652]:
# pip installed libs
import numpy as np
import pandas as pd
import tensorflow as tf

**Paths**

In [1653]:
# Project root: the parent of the directory the notebook kernel runs in.
# os.path.join / os.pardir use the host OS separator instead of a hardcoded "\\",
# so the path also resolves outside Windows.
BASE_PATH = os.path.join(os.path.abspath(''), os.pardir)

**Kaggle**

In [1654]:
# Local mirror of the Kaggle directory layout: input data and the working/output file.
# Built with os.path.join so the separator matches the host OS.
KAGGLE_PATH = os.path.join(BASE_PATH, "kaggle")
INPUT_PATH = os.path.join(KAGGLE_PATH, "input", "goodreads-books-reviews-290312")
OUTPUT_PATH = os.path.join(KAGGLE_PATH, "working", "submission.csv")

**Tensorboard**

In [1655]:
# TensorBoard log root, with one sub-directory per model family (ML / DL).
# Built with os.path.join so the separator matches the host OS.
TENSORBOARD_LOGS_PATH = os.path.join(BASE_PATH, "tensorboard_logs")
TENSORBOARD_LOGS_PATH_ML = os.path.join(TENSORBOARD_LOGS_PATH, "ML")
TENSORBOARD_LOGS_PATH_DL = os.path.join(TENSORBOARD_LOGS_PATH, "DL")

In [1656]:
# Sanity check: resolve the TensorBoard log directory to an absolute path and display it
os.path.abspath(TENSORBOARD_LOGS_PATH)

'C:\\Users\\juanm\\OneDrive\\Bureau\\ESGI - Projets\\4IABD\\Projet Deep Learning\\tensorboard_logs'

In [1657]:
# Register the TensorBoard notebook extension so the %tensorboard magic is available
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [1658]:
%tensorboard --logdir {TENSORBOARD_LOGS_PATH}

Launching TensorBoard...

**Hyperparameters**

In [1659]:
CLASSES = 6               # number of units in the model's output layer
BATCH_SIZE = 1_024        # default batch size for the helpers below (author's note: size of csv file = 478033)
MAX_FEATURES = 20_000     # input_dim of the Embedding layer
EMBEDDING_DIM = 128       # output_dim of the Embedding layer
SEQUENCE_LENGTH = 250     # output_sequence_length of the TextVectorization layer
EPOCHS = 100              # number of passes requested from fit()
LEARNING_RATE = 1e-3      # learning rate handed to the Adam optimizer

**Load Data**

In [1660]:
from typing import Any, Union


def load_csv_data(path: str,
                  batch_size: int = BATCH_SIZE,
                  separator: Union[list[str], str] = ',',
                  columns: Union[list[str], str, None] = None) -> collections.OrderedDict:
    """Read the first batch of a CSV file into NumPy arrays.

    Only the first batch is returned, i.e. at most ``batch_size`` rows taken in
    file order (shuffling is disabled).

    Args:
        path: Path (or file pattern) of the CSV file to read.
        batch_size: Maximum number of rows to return.
        separator: Field delimiter, forwarded as ``field_delim``.
        columns: Columns to keep, forwarded as ``select_columns``; ``None`` keeps all.

    Returns:
        The first batch produced by the dataset, indexable by column name.

    Raises:
        ValueError: If the CSV yields no rows at all.
    """
    # Load data -> tensors
    dataset = tf.data.experimental.make_csv_dataset(
        path,
        batch_size=batch_size,
        field_delim=separator,
        select_columns=columns,
        # Read the file once: the default (None) repeats it forever, so a batch_size
        # larger than the file would silently wrap around and duplicate rows.
        num_epochs=1,
        shuffle=False
    )
    # Get an iterator over the dataset and pull the first batch only
    iterator = dataset.as_numpy_iterator()
    first_batch = next(iterator, None)
    if first_batch is None:
        raise ValueError(f"No rows could be read from {path!r}")
    return first_batch

**Train Dataset**

In [1661]:
# Training split: keep only the review text and its rating
train_dataset = load_csv_data(
    f"{INPUT_PATH}\\goodreads_train.csv",
    columns=['review_text', 'rating'],
)

**Test Dataset**

In [1662]:
# Test split: a single batch of 478033 rows, keeping the review ids and texts
test_dataset = load_csv_data(
    f"{INPUT_PATH}\\goodreads_test.csv",
    batch_size=478033,
    columns=['review_id', 'review_text'],
)

In [1663]:
# Keep the review ids aside now: `test_dataset` is rebound to a tf.data.Dataset
# later in the notebook, and the ids are needed for the submission file.
test_review_ids = test_dataset['review_id']

**NLP**

In [1664]:
# Create a TextVectorization layer.
# max_tokens caps the vocabulary at MAX_FEATURES so every token id stays within the
# input_dim of the Embedding layer; without the cap, adapt() may learn a larger vocabulary.
# standardize=None leaves case and punctuation untouched.
vectorize_layer = tf.keras.layers.TextVectorization(max_tokens=MAX_FEATURES,
                                                    standardize=None,
                                                    output_sequence_length=SEQUENCE_LENGTH,
                                                    output_mode='int')  # 'int' to feed an embedding layer, otherwise 'tf_idf'

In [1665]:
# Fit the layer to the input text data (learns the vocabulary from the training reviews).
# Must run before `train_dataset` is rebound to a tf.data.Dataset further down.
vectorize_layer.adapt(train_dataset['review_text'])

In [1666]:
# Inspect the learned vocabulary; the first two entries are '' and '[UNK]'
# (presumably the padding and out-of-vocabulary tokens).
vectorize_layer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'and',
 'a',
 'I',
 'to',
 'is',
 'of',
 'in',
 'this',
 'that',
 'was',
 'for',
 'with',
 'book',
 'it',
 'her',
 'but',
 'you',
 'The',
 'my',
 'are',
 'an',
 'very',
 'story',
 'not',
 'be',
 'as',
 'have',
 'his',
 'he',
 'she',
 'from',
 'on',
 'has',
 'about',
 'one',
 'love',
 'read',
 'by',
 'me',
 'all',
 'This',
 'at',
 'so',
 'just',
 'who',
 'like',
 'will',
 'really',
 'more',
 'loved',
 'author',
 'they',
 'what',
 'characters',
 'And',
 'had',
 'their',
 'It',
 'when',
 'up',
 'first',
 'some',
 'how',
 'But',
 'were',
 '-',
 'He',
 'would',
 'there',
 'into',
 'can',
 'absolutely',
 'book.',
 'much',
 'out',
 'sexy',
 'or',
 'could',
 'honest',
 'books',
 'She',
 'because',
 "can't",
 'story.',
 'provided',
 'if',
 'even',
 'two',
 'other',
 'There',
 'no',
 'get',
 'also',
 'reading',
 'know',
 "I'm",
 'him',
 'your',
 'want',
 'think',
 'been',
 'than',
 'only',
 "don't",
 'return',
 'did',
 'between',
 'life',
 'we',
 'which',
 "didn't",
 'am'

In [1667]:
def vectorize_text(text: Any, label: Any) -> Any:
    """Map a (text, label) pair to (token ids, label) using ``vectorize_layer``.

    A trailing axis is appended to ``text`` before it is handed to the layer;
    ``label`` is passed through untouched.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

**Normalizing Data**

In [1668]:
def normalize_data(x: np.ndarray, y: np.ndarray, batch_size: int = BATCH_SIZE,
                   num_classes: int = CLASSES) -> Any:
    """Build a batched, vectorized dataset of (token ids, one-hot label) pairs.

    Args:
        x: Raw review texts.
        y: Integer class labels, one per text.
        batch_size: Number of samples per batch.
        num_classes: Width of the one-hot encoding. Fixing it keeps the label
            shape constant even when the largest class is absent from ``y``.

    Returns:
        A ``tf.data.Dataset`` whose elements are ``vectorize_text`` outputs.
    """
    # Without num_classes, the one-hot width would be inferred as max(y) + 1.
    y = tf.keras.utils.to_categorical(y, num_classes=num_classes)
    dataset = tf.data.Dataset.from_tensor_slices((x, y))
    dataset = dataset.batch(batch_size)
    # Vectorize after batching so the layer processes a whole batch per call.
    dataset = dataset.map(vectorize_text)
    print(dataset)
    return dataset

**Training**

In [1669]:
# NOTE(review): this rebinds `train_dataset` from the dict returned by load_csv_data to a
# batched tf.data.Dataset, so this cell cannot be re-run without reloading the CSV first.
train_dataset = normalize_data(train_dataset['review_text'], train_dataset['rating'])

<MapDataset element_spec=(TensorSpec(shape=(None, 250), dtype=tf.int64, name=None), TensorSpec(shape=(None, 6), dtype=tf.float32, name=None))>


**MLP**

In [1670]:
# Define the model: embedding -> average over the sequence axis -> two hidden layers -> class probabilities
mlp = tf.keras.Sequential([
    tf.keras.layers.Embedding(MAX_FEATURES, EMBEDDING_DIM),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    # softmax rather than sigmoid: each review has exactly one rating (labels are one-hot
    # and predictions are taken with argmax), so the outputs should form one distribution.
    tf.keras.layers.Dense(CLASSES, activation='softmax'),
])

In [1671]:
# Print the layer-by-layer architecture and parameter counts
mlp.summary()

Model: "sequential_40"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_40 (Embedding)    (None, None, 128)         2560000   
                                                                 
 global_average_pooling1d_26  (None, 128)              0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_120 (Dense)           (None, 32)                4128      
                                                                 
 dense_121 (Dense)           (None, 32)                1056      
                                                                 
 dense_122 (Dense)           (None, 6)                 198       
                                                                 
Total params: 2,565,382
Trainable params: 2,565,382
Non-trainable params: 0
___________________________________________

In [1672]:
# Compile the model.
# The targets are one-hot vectors with a single active class, so categorical
# cross-entropy is the matching loss; binary cross-entropy would score each
# class as an independent yes/no problem.
mlp.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
              loss="categorical_crossentropy",
              metrics=['accuracy'])

In [1673]:
# Fit the MLP; the TensorBoard run directory is named after the hyperparameters
mlp.fit(
    train_dataset,
    epochs=EPOCHS,
    callbacks=[
        tf.keras.callbacks.TensorBoard(
            log_dir=(
                f"{TENSORBOARD_LOGS_PATH_ML}\\MLP"
                f"_BS_{BATCH_SIZE}"
                f"_MAXFEAT_{MAX_FEATURES}"
                f"_EMBEDDING_{EMBEDDING_DIM}"
                f"_SEQUENCELEN_{SEQUENCE_LENGTH}"
                f"_LR_{LEARNING_RATE}"
            )
        )
    ],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1c08cdf8d60>

**Testing**

In [1674]:
# Placeholder labels: normalize_data requires a label array, but the test set has no ratings.
# The length is derived from the loaded review ids instead of a hardcoded row count.
test_ratings = np.random.default_rng().integers(0, CLASSES, len(test_review_ids))

In [1675]:
# One batch covering every test row; the size follows the label array instead of a hardcoded count.
# NOTE(review): this rebinds `test_dataset` from the loaded dict to a tf.data.Dataset.
test_dataset = normalize_data(test_dataset['review_text'], test_ratings, batch_size=len(test_ratings))

<MapDataset element_spec=(TensorSpec(shape=(None, 250), dtype=tf.int64, name=None), TensorSpec(shape=(None, 6), dtype=tf.float32, name=None))>


In [1676]:
def testing_model(model, dataset):
    for text, label in dataset.take(BATCH_SIZE):
        model_predict = model.predict(text)
    return model_predict.argmax(axis=1)

**Submission**

In [1677]:
# Load the sample submission file as the DataFrame to fill in
sample_submission = pd.read_csv(f"{INPUT_PATH}\\goodreads_sample_submission.csv")

In [1678]:
# Fill the submission columns: the predicted class per review, then the
# review ids decoded from bytes to str
sample_submission['rating'] = testing_model(mlp, test_dataset)
sample_submission['review_id'] = [raw_id.decode("utf-8") for raw_id in test_review_ids]



In [1679]:
# Write the submission without the DataFrame index and report where it went
sample_submission.to_csv(path_or_buf=OUTPUT_PATH, index=False)
print(f"CSV registered at {OUTPUT_PATH}")

CSV registered at C:\Users\juanm\OneDrive\Bureau\ESGI - Projets\4IABD\Projet Deep Learning\test_juan\..\kaggle\working\submission.csv
