# Mount Google Drive (Colab)

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset and
# GloVe files from paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Check how much RAM the Colab runtime has

In [None]:
from psutil import virtual_memory

# Total physical memory of the runtime, converted from bytes to decimal gigabytes.
total_bytes = virtual_memory().total
ram_gb = total_bytes / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Below 20 GB the runtime is reported as a standard (not high-RAM) one.
ram_message = 'Not using a high-RAM runtime' if ram_gb < 20 else 'You are using a high-RAM runtime!'
print(ram_message)

Your runtime has 89.6 gigabytes of available RAM

You are using a high-RAM runtime!


# Import libraries and set up the TensorBoard log directory

In [None]:
from sklearn.metrics import accuracy_score,classification_report
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm
from tensorflow import keras
from keras import layers
import tensorflow as tf
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split


In [None]:
# Load the TensorBoard notebook extension; it provides the %tensorboard magic used later.
%load_ext tensorboard

In [None]:
import datetime
import os

# One sub-directory of "logs" per run, named after the start time
# (YYYYMMDD-HHMMSS), so successive runs do not overwrite each other.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", run_stamp)

In [None]:
# Show the log directory chosen for this run.
print(logdir)

logs/20230105-221224


# Test if a GPU is available for the training

In [None]:
# Ask TensorFlow once for the default GPU device name; an empty string means no GPU.
gpu_name = tf.test.gpu_device_name()
if gpu_name:
    print('Default GPU Device: {}'.format(gpu_name))
else:
    print('No GPU found. Please use a GPU to train your neural network.')

Default GPU Device: /device:GPU:0


# Import Train Dataset 

In [None]:
# Load the labelled Goodreads reviews from Drive and preview the first rows.
train_csv_path = '/content/drive/MyDrive/ESGI/DL - Book/data/goodreads_train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,Sun Jul 30 07:44:10 -0700 2017,Wed Aug 30 00:00:26 -0700 2017,Sat Aug 26 12:05:52 -0700 2017,Tue Aug 15 13:23:18 -0700 2017,28,1
1,8842281e1d1347389f2ab93d60773d4d,16981,a5d2c3628987712d0e05c4f90798eb67,3,Recommended by Don Katz. Avail for free in Dec...,Mon Dec 05 10:46:44 -0800 2016,Wed Mar 22 11:37:04 -0700 2017,,,1,0
2,8842281e1d1347389f2ab93d60773d4d,28684704,2ede853b14dc4583f96cf5d120af636f,3,"A fun, fast paced science fiction thriller. I ...",Tue Nov 15 11:29:22 -0800 2016,Mon Mar 20 23:40:27 -0700 2017,Sat Mar 18 23:22:42 -0700 2017,Fri Mar 17 23:45:40 -0700 2017,22,0
3,8842281e1d1347389f2ab93d60773d4d,27161156,ced5675e55cd9d38a524743f5c40996e,0,Recommended reading to understand what is goin...,Wed Nov 09 17:37:04 -0800 2016,Wed Nov 09 17:38:20 -0800 2016,,,5,1
4,8842281e1d1347389f2ab93d60773d4d,25884323,332732725863131279a8e345b63ac33e,4,"I really enjoyed this book, and there is a lot...",Mon Apr 25 09:31:23 -0700 2016,Mon Apr 25 09:31:23 -0700 2016,Sun Jun 26 00:00:00 -0700 2016,Sat May 28 00:00:00 -0700 2016,9,1


# Dataset pre-processing (Tokenizer and GloVe: Global Vectors for Word Representation)

In [None]:
# Inputs are the raw review texts, targets are the rating column.
reviews = df['review_text']
ratings = df['rating']

# Hold out 20% of the rows; stratifying on the rating keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    ratings,
    test_size=0.2,
    shuffle=True,
    stratify=ratings,
    random_state=25,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Vocabulary size kept by the tokenizer, and the fixed sequence length fed to the model.
max_features = 40000
maxlen = 300

# Fit the vocabulary on the training split only; words outside the
# `max_features` most frequent ones are mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=max_features, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

cv_train = tokenizer.texts_to_sequences(X_train)
# Pad/truncate every sequence to `maxlen` tokens (0 is the padding id).
# Use the `maxlen` variable rather than a literal so the padded width always
# matches the model's Input(shape=(maxlen,)) when the value is tuned.
padded_in = pad_sequences(cv_train, maxlen=maxlen, value=0)

In [None]:
# Sanity check: token ids of the first training review, then the shape of the padded matrix.
print(cv_train[0])
print(padded_in.shape)


[40, 29, 6, 193, 137, 13, 12, 6, 227, 14, 18, 65, 162, 3250, 15, 12, 34, 11, 34, 38092, 326, 9, 75, 37, 94, 231, 1513, 784, 55, 132, 24, 121, 521, 1866, 5, 2, 53, 11, 6, 294, 402, 7, 3360, 3, 76, 26, 2, 67, 5, 593, 173, 86, 72, 168, 136, 70, 20, 63, 713, 2, 211, 76, 26, 372, 16, 13, 73, 5728, 670, 4482, 3138, 24687, 2835, 4086, 3, 1, 6592, 4962, 4376, 326, 133, 758, 7, 749, 5, 378, 1221, 53, 309, 6, 129, 4739, 3, 1]
(720000, 300)


In [None]:
EMBEDDING_FILE = "/content/drive/MyDrive/ESGI/DL - Book/input/glove.6B.200d.txt"


def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, vector)`` pair.

    Args:
        word: first field of the line (the token).
        *arr: remaining fields, the vector components as strings.

    Returns:
        Tuple of the word and a float32 NumPy array built from ``arr``.
    """
    return word, np.asarray(arr, dtype='float32')


# Read the embedding file line by line; the context manager guarantees the
# file handle is closed once the dictionary has been built.
with open(EMBEDDING_FILE, encoding="utf8") as glove_file:
    embeddings_index = dict(get_coefs(*line.strip().split()) for line in glove_file)

In [None]:
# np.stack requires a real sequence: passing the dict_values view directly is
# deprecated in NumPy and rejected by newer releases, so materialise a list first.
all_embs = np.stack(list(embeddings_index.values()))
# Global mean / standard deviation over every component of every vector; used
# to draw random vectors for words that have no pretrained embedding.
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

  if (await self.run_code(code, result,  async_=asy)):


(-0.008671864, 0.38186216)

In [None]:
word_index = tokenizer.word_index
embed_size = 200
# The matrix must have exactly `max_features` rows: that is the input_dim the
# Embedding layer is built with, and tokenizer ids start at 1 (0 is padding),
# so sizing it with len(word_index) would be one row short for a small
# vocabulary and raise IndexError on the last word.
nb_words = max_features
# Start from random vectors drawn with the same mean/std as the pretrained
# ones, so words missing from the embedding file get a comparable scale.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    # Ids at or beyond nb_words are never emitted by the tokenizer (num_words cap).
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Create the CNN + BiLSTM model:

In [None]:
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, Dropout

# Two stacked [Conv1D x6 -> MaxPooling -> BiLSTM -> Dropout] stages followed by
# a dense classifier head.
model = keras.Sequential(
    [
        layers.Input(shape=(maxlen,)),
        # Embedding initialised from the precomputed embedding matrix.
        layers.Embedding(max_features, embed_size, weights=[embedding_matrix]),

        # Stage 1
        layers.Conv1D(filters=300, kernel_size=1, padding='same', activation='relu'),
        layers.Conv1D(filters=300, kernel_size=1, padding='same', activation='relu'),
        layers.Conv1D(filters=30, kernel_size=4, padding='same', activation='relu'),
        layers.Conv1D(filters=30, kernel_size=4, padding='same', activation='relu'),
        layers.Conv1D(filters=30, kernel_size=1, padding='same', activation='relu'),
        layers.Conv1D(filters=30, kernel_size=1, padding='same', activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Bidirectional(LSTM(32, return_sequences=True)),
        layers.Dropout(0.1),

        # Stage 2
        layers.Conv1D(filters=600, kernel_size=1, padding='same', activation='relu'),
        layers.Conv1D(filters=600, kernel_size=1, padding='same', activation='relu'),
        layers.Conv1D(filters=60, kernel_size=4, padding='same', activation='relu'),
        layers.Conv1D(filters=60, kernel_size=4, padding='same', activation='relu'),
        layers.Conv1D(filters=60, kernel_size=1, padding='same', activation='relu'),
        layers.Conv1D(filters=60, kernel_size=1, padding='same', activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Bidirectional(LSTM(32, return_sequences=True)),
        layers.Dropout(0.1),

        # Classifier head
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.50),
        # One unit per class (presumably ratings 0-5). Softmax, not sigmoid:
        # the classes are mutually exclusive and the model is trained with
        # sparse_categorical_crossentropy, which expects a probability
        # distribution over the classes.
        layers.Dense(6, activation='softmax'),
    ]
)

## Testing a lighter model (Unsuccessful)

In [None]:
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, Dropout

# Lighter variant: two [Conv1D x2 -> MaxPooling -> BiLSTM -> Dropout] stages
# followed by a dense classifier head. Rebinds `model`.
model = keras.Sequential(
    [
        layers.Input(shape=(maxlen,)),
        # Embedding initialised from the precomputed embedding matrix.
        layers.Embedding(max_features, embed_size, weights=[embedding_matrix]),

        # Stage 1
        layers.Conv1D(filters=300, kernel_size=1, padding='same', activation='relu'),
        layers.Conv1D(filters=30, kernel_size=4, padding='same', activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Bidirectional(LSTM(32, return_sequences=True)),
        layers.Dropout(0.1),

        # Stage 2
        layers.Conv1D(filters=600, kernel_size=1, padding='same', activation='relu'),
        layers.Conv1D(filters=60, kernel_size=4, padding='same', activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Bidirectional(LSTM(32, return_sequences=True)),
        layers.Dropout(0.1),

        # Classifier head
        layers.Flatten(),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.50),
        # One unit per class (presumably ratings 0-5). Softmax, not sigmoid:
        # the classes are mutually exclusive and the model is trained with
        # sparse_categorical_crossentropy, which expects a probability
        # distribution over the classes.
        layers.Dense(6, activation='softmax'),
    ]
)

In [None]:
# Optional F1 metric via tensorflow_addons (currently disabled):
#import tensorflow_addons as tfa
# Creation of the F1 metric
#f1_metric = tfa.metrics.F1Score(num_classes=5, average='micro')

# Adam with a small learning rate; integer labels, hence the sparse loss.
adam = keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=adam,
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],  # F1 = F1_score
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 300, 200)          8000000   
                                                                 
 conv1d (Conv1D)             (None, 300, 300)          60300     
                                                                 
 conv1d_1 (Conv1D)           (None, 300, 30)           36030     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 150, 30)          0         
 )                                                               
                                                                 
 bidirectional (Bidirectiona  (None, 150, 64)          16128     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 150, 64)           0

In [None]:

# TensorBoard callback writing to this run's log directory, recording weight
# histograms and embeddings every epoch.
callbacks = tf.keras.callbacks.TensorBoard(
    log_dir=logdir,
    histogram_freq=1,
    embeddings_freq=1,
)

epochs=8, batch_size=32
1. Epoch 4/8 18000/18000 [==============================] - 497s 28ms/step - loss: 0.9456 - acc: 0.5985 - val_loss: 0.9565 - val_acc: 0.5933

20000 - 200 - 4 - 64 (run label format: max_features - maxlen - epochs - batch_size)
1. loss: 0.9544 - acc: 0.5942 - val_loss: 0.9600 - val_acc: 0.5922  
# MORE EPOCHS WITH A SMALLER BATCH SIZE, SINCE THERE IS NO OVERFITTING -> BETTER SCORE WITH MORE WORDS

30000 - 250 - 6 - 64

Epoch 1/6 - loss: 1.1477 - acc: 0.5047 - val_loss: 1.0116 - val_acc: 0.5675
Epoch 2/6 - loss: 1.0070 - acc: 0.5712 - val_loss: 0.9991 - val_acc: 0.5737
Epoch 3/6 - loss: 0.9693 - acc: 0.5879 - val_loss: 0.9633 - val_acc: 0.5908
Epoch 4/6 - loss: 0.9429 - acc: 0.5994 - val_loss: 0.9756 - val_acc: 0.5829
Epoch 5/6 - loss: 0.9226 - acc: 0.6095 - val_loss: 0.9576 - val_acc: 0.5929
Epoch 6/6 - loss: 0.9031 - acc: 0.6190 - val_loss: 0.9678 - val_acc: 0.5878

# Overfitting 

40000 - 300 - 5 - 32

Epoch 1/5
18000/18000 [==============================] - 419s 23ms/step - loss: 1.1014 - acc: 0.5280 - val_loss: 0.9907 - val_acc: 0.5758
Epoch 2/5
18000/18000 [==============================] - 412s 23ms/step - loss: 0.9720 - acc: 0.5864 - val_loss: 0.9474 - val_acc: 0.5951
Epoch 3/5
18000/18000 [==============================] - 412s 23ms/step - loss: 0.9307 - acc: 0.6050 - val_loss: 0.9324 - val_acc: 0.6035
Epoch 4/5
18000/18000 [==============================] - 410s 23ms/step - loss: 0.9013 - acc: 0.6190 - val_loss: 0.9373 - val_acc: 0.6033
Epoch 5/5
18000/18000 [==============================] - 410s 23ms/step - loss: 0.8767 - acc: 0.6309 - val_loss: 0.9310 - val_acc: 0.6061

# Slight overfitting, but best score so far -> try a larger batch size and one more epoch, or shrink the model a little further

# Overfitting is more visible with a batch size of 64 (val_acc ≈ 0.6025)



# Train the model :

In [None]:
# Train on the padded training sequences, holding out the last 20% of them
# for validation. The TensorBoard callback must be passed explicitly;
# otherwise nothing is written to `logdir` and TensorBoard has nothing to show.
history = model.fit(
    padded_in,
    y_train,
    epochs=5,
    batch_size=32,
    verbose=1,
    validation_split=0.20,
    callbacks=[callbacks],
)  # 8 epochs gave an F1 of 0.58150

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Try to use the Tensorboard feature (unsuccessful)

In [None]:
%tensorboard dev upload --logdir \ './lien/vers/logs/'

ERROR: Timed out waiting for TensorBoard to start. It may still be running as pid 7760.

In [None]:
# Open TensorBoard on the parent "logs" directory (the directory that contains each run's logdir).
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 25688), started 1:08:08 ago. (Use '!kill 25688' to kill it.)

# Load the test dataset and predict its ratings

In [None]:
# Load the unlabelled Goodreads test reviews from Drive and preview the first rows.
test_csv_path = '/content/drive/MyDrive/ESGI/DL - Book/data/goodreads_test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head()

Unnamed: 0,user_id,book_id,review_id,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,b9450d1c1f97f891c392b1105959b56e,7092507,5c4df7e70e9b438c761f07a4620ccb7c,** spoiler alert ** \n This is definitely one ...,Sat Nov 10 06:06:13 -0800 2012,Sun Nov 11 05:38:36 -0800 2012,Sun Nov 11 05:38:36 -0800 2012,Sat Nov 10 00:00:00 -0800 2012,1,0
1,b9450d1c1f97f891c392b1105959b56e,5576654,8eaeaf13213eeb16ad879a2a2591bbe5,"** spoiler alert ** \n ""You are what you drink...",Fri Nov 09 21:55:16 -0800 2012,Sat Nov 10 05:41:49 -0800 2012,Sat Nov 10 05:41:49 -0800 2012,Fri Nov 09 00:00:00 -0800 2012,1,0
2,b9450d1c1f97f891c392b1105959b56e,15754052,dce649b733c153ba5363a0413cac988f,Roar is one of my favorite characters in Under...,Fri Nov 09 00:25:50 -0800 2012,Sat Nov 10 06:14:10 -0800 2012,Sat Nov 10 06:14:10 -0800 2012,Fri Nov 09 00:00:00 -0800 2012,0,0
3,b9450d1c1f97f891c392b1105959b56e,17020,8a46df0bb997269d6834f9437a4b0a77,** spoiler alert ** \n If you feel like travel...,Thu Nov 01 00:28:39 -0700 2012,Sat Nov 03 11:35:22 -0700 2012,Sat Nov 03 11:35:22 -0700 2012,Thu Nov 01 00:00:00 -0700 2012,0,0
4,b9450d1c1f97f891c392b1105959b56e,12551082,d11d3091e22f1cf3cb865598de197599,3.5 stars \n I read and enjoyed the first two ...,Thu Oct 18 00:57:00 -0700 2012,Mon Apr 01 23:00:51 -0700 2013,Sat Mar 30 00:00:00 -0700 2013,Fri Mar 29 00:00:00 -0700 2013,0,0


In [None]:
# Keep only the id (needed for the submission) and the text (model input).
df_test = df_test[['review_id', 'review_text']]
# Encode with the tokenizer fitted on the training split, then pad to the
# same `maxlen` the model was built with (not a hard-coded literal).
testing_sequences = tokenizer.texts_to_sequences(df_test['review_text'])
testing_sequences = pad_sequences(testing_sequences, maxlen=maxlen)

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

# Run the trained model on the padded test sequences; the result is reduced
# to one class per row with argmax in a later cell.
ypred=model.predict(testing_sequences)



# Create the CSV Submission file :

In [None]:
# The submission only needs the review id, so remove the text column.
df_test = df_test.drop(columns=['review_text'])
df_test.head()

Unnamed: 0,review_id
0,5c4df7e70e9b438c761f07a4620ccb7c
1,8eaeaf13213eeb16ad879a2a2591bbe5
2,dce649b733c153ba5363a0413cac988f
3,8a46df0bb997269d6834f9437a4b0a77
4,d11d3091e22f1cf3cb865598de197599


In [None]:
# Predicted class = index of the highest score in each row of `ypred`.
saida = np.argmax(ypred, axis=1)
# Assign the array positionally. Wrapping it in pd.Series would align on the
# index and silently produce NaN ratings if df_test ever had a non-default
# index; a length mismatch now raises instead.
df_test['rating'] = saida
df_test.head()

Unnamed: 0,review_id,rating
0,5c4df7e70e9b438c761f07a4620ccb7c,4
1,8eaeaf13213eeb16ad879a2a2591bbe5,3
2,dce649b733c153ba5363a0413cac988f,4
3,8a46df0bb997269d6834f9437a4b0a77,4
4,d11d3091e22f1cf3cb865598de197599,4


In [None]:
# Write the submission (review_id, rating) to Drive without the DataFrame index.
df_test.to_csv('/content/drive/MyDrive/ESGI/DL - Book/submission-CNN-40000-300-5-32-reducedModel.csv', index=False)