**Init**

In [261]:
# Native python libs
import os

In [262]:
# pip installed libs
import numpy as np
import pandas as pd
import tensorflow as tf

In [263]:
BASE_PATH = os.path.abspath('') + "\\.."
TENSORBOARD_LOGS_PATH = BASE_PATH + "\\tensorboard_logs"
KAGGLE_PATH = BASE_PATH + "\\kaggle"
INPUT_PATH = KAGGLE_PATH + "\\input\\goodreads-books-reviews-290312"
OUTPUT_PATH = KAGGLE_PATH + "\\working\\submission.csv"

In [264]:
# Test if path is good
os.path.abspath(TENSORBOARD_LOGS_PATH)

'C:\\Users\\juanm\\OneDrive\\Bureau\\ESGI - Projets\\4IABD\\Projet Deep Learning\\tensorboard_logs'

In [265]:
#!tensorboard --logdir PATH

**Pre-Treatment For Machine Learning Algorithms**

In [266]:
# Load the data
train_dataset = tf.data.experimental.make_csv_dataset(
    '../kaggle/input/goodreads-books-reviews-290312/goodreads_train.csv',
    batch_size=256,
    field_delim=',',
    select_columns=['review_id', 'review_text', 'rating']
)

test_dataset = tf.data.experimental.make_csv_dataset(
    '../kaggle/input/goodreads-books-reviews-290312/goodreads_test.csv',
    batch_size=256,
    field_delim=',',
    select_columns=['review_id', 'review_text']
)

In [267]:
# Get an iterator over the train_dataset
iterator_train = train_dataset.as_numpy_iterator()
train_dataset = next(iterator_train)

In [268]:
# Get an iterator over the test_dataset
iterator_test = test_dataset.as_numpy_iterator()
test_dataset = next(iterator_test)

In [269]:
# Init every ds
train_review_ids, test_review_ids = train_dataset['review_id'], test_dataset['review_id']
train_reviews, test_reviews = train_dataset['review_text'], test_dataset['review_text']
train_ratings = train_dataset['rating']

In [270]:
test_review_ids

array([b'df83f7aa82d9a4a8a17fba8b5a0b907f',
       b'f81bcee7ca0207d6458f360cff6f372a',
       b'b2ad786016834b37c577990095a89275',
       b'a98604ae885a4974f10ac0bd647660fd',
       b'edb9579fa816010ce2bd03b9b7a4d52c',
       b'0c38c5f2078e7879c018800c468b4f3d',
       b'e7a95dab5140bd44ca68020f8f7b3b42',
       b'a6bbbacb5e884c6e634db1a6fb34f1cf',
       b'86bb1cab4ab4a5c67e48fc60811a5fe2',
       b'4382640e779d9def7e1b6de22c6b2e1c',
       b'e6971763c16496ba9168df958e05841f',
       b'6cdc81fbf7bd131fb8e0a27f711415a2',
       b'ec885df7c9c55400d57bcf2cbf2a7c91',
       b'9c9d247c0d9f6658c2d725df427cf4d0',
       b'35d6ca41a695abf6cdebefd63a815de4',
       b'752960831b6394390a7885b9fcb885b3',
       b'0ba18556d6c6f6da3fa83cb573234aed',
       b'3d6f5d2d0cd69cbb1c73dc43668dcce5',
       b'c96e91481368a5dabfba6c21fe365e16',
       b'7cd11679e4120e3e0afe1ff496adce18',
       b'd1db52053ae48eccc56beba2b2a5f84a',
       b'fe8afe5d82e32980e7bb9f4939e9bc8f',
       b'a3725dea76f04d06fe18bb2

**NLP**

In [271]:
from typing import Any


def convert_to_nlp(dataset: Any):
    # Create a TextVectorization layer
    vectorizer = tf.keras.layers.TextVectorization(output_sequence_length=100, output_mode='tf_idf') # Ou int avec couche d'embedding
    # Fit the layer to the input text data
    vectorizer.adapt(dataset)
    # Transform the text data into integer sequences
    return vectorizer(dataset)

**Transforming Data**

In [272]:
# Labelling review ids
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
test_review_ids = le.fit_transform(test_review_ids)

In [273]:
# Vectorize text
train_reviews = convert_to_nlp(train_reviews)
test_reviews = convert_to_nlp(train_reviews)

**Training**

In [274]:
# Convert the target labels to one-hot encoded arrays
train_ratings_one_hot = tf.keras.utils.to_categorical(train_ratings)

In [275]:
# Getting in desired shape
train_reviews = [[i] for i in train_reviews]
train_ratings_one_hot = [[i] for i in train_ratings_one_hot]

In [276]:
tf.print(train_reviews)

[[[35 7 33 ... 0 0 0]],
 [[48 204 5 ... 5846 10 213]],
 [[11 18 2452 ... 1491 1245 143]],
 [[4 296 542 ... 176 1828 11]],
 [[15 12 122 ... 24 27 6]],
 [[2882 14 30 ... 13 2 492]],
 [[2513 27 201 ... 0 0 0]],
 [[12 14 13 ... 3192 1787 3]],
 [[371 138 3657 ... 3634 3 5312]],
 [[6435 515 12 ... 0 0 0]],
 [[818 816 230 ... 67 68 9]],
 [[3851 1247 41 ... 0 0 0]],
 [[109 2 71 ... 0 0 0]],
 [[4 37 38 ... 35 200 116]],
 [[4 651 6 ... 1477 377 1137]],
 [[12 14 50 ... 57 56 20]],
 [[4357 4 121 ... 4 79 74]],
 [[371 138 2 ... 3 436 7]],
 [[2079 4 37 ... 83 24 259]],
 [[4 161 12 ... 15 2 155]],
 [[6 354 241 ... 63 17 12]],
 [[6452 138 2201 ... 0 0 0]],
 [[87 980 5 ... 0 0 0]],
 [[20 176 30 ... 70 95 10]],
 [[4775 12 14 ... 0 0 0]],
 [[2203 8 6 ... 1781 25 54]],
 [[117 2 534 ... 0 0 0]],
 [[156 100 6 ... 0 0 0]],
 [[6 887 5917 ... 0 0 0]],
 [[4668 12 84 ... 0 0 0]],
 [[122 331 846 ... 0 0 0]],
 [[6 985 1843 ... 0 0 0]],
 [[75 105 23 ... 0 0 0]],
 [[20 1944 4 ... 0 0 0]],
 [[118 5 803 ... 6408 15 86

In [277]:
# Create a dataset from the sequences
train_dataset = tf.data.Dataset.from_tensor_slices((train_reviews, train_ratings_one_hot))

In [278]:
tf.print(train_dataset)

<TensorSliceDataset element_spec=(TensorSpec(shape=(1, 100), dtype=tf.int64, name=None), TensorSpec(shape=(1, 6), dtype=tf.float32, name=None))>


**MLP**

In [279]:
# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(6, activation='sigmoid')
])

In [280]:
# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
              loss="binary_crossentropy",
              metrics=['accuracy'])

In [281]:
# Train the model
model.fit(train_dataset,
          epochs=10,
          callbacks=[tf.keras.callbacks.TensorBoard(TENSORBOARD_LOGS_PATH)])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1df9926f850>

**Testing**

In [282]:
# Create a dataset from the sequences
test_dataset = tf.data.Dataset.from_tensor_slices(test_reviews)
tf.print(test_dataset)

<TensorSliceDataset element_spec=TensorSpec(shape=(100,), dtype=tf.int64, name=None)>


In [283]:
model_predict = model.predict(test_reviews)



In [284]:
model_predict

array([[-0.33717933, -0.4791053 , -0.36599943,  0.30232665,  0.39007258,
         0.31291068],
       [-0.27274454, -0.4502117 , -0.3560969 ,  0.22734278,  0.45071563,
         0.18510158],
       [-0.31516358, -0.47748885, -0.6254769 ,  0.08597627,  0.37846076,
         0.34188947],
       ...,
       [-0.25502488, -0.4990398 , -0.534453  ,  0.15738311,  0.44957456,
         0.37385756],
       [-0.23866393, -0.44232222, -0.3837485 ,  0.18750687,  0.36443415,
         0.33929458],
       [-0.32992333, -0.530186  , -0.5629255 ,  0.17183001,  0.5568629 ,
         0.36359507]], dtype=float32)

**Submission**

In [285]:
# Submission code
sample_submission = pd.read_csv(INPUT_PATH + "\\goodreads_sample_submission.csv")
print(np.asarray(sample_submission['rating']).shape)
sample_submission.rating = np.argmax(model_predict, axis=1)
sample_submission.to_csv(OUTPUT_PATH, index=test_review_ids)

(478033,)


ValueError: Length of values (256) does not match length of index (478033)