In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

data = keras.datasets.imdb

# Keep only the 88 000 most frequent words of the dataset; very rare words
# (seen only once or twice) are left out of the model's vocabulary.
(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=88000)

# Each review is a list of integers, one integer per word. That is convenient
# for the computer but unreadable for us, so a word <-> integer mapping is
# built below to decode reviews back into text.
print(train_data[0])

# get_word_index() gives a dictionary from word (key) to integer (value).
# The integers in train_data are shifted by 3 relative to that dictionary so
# that 0, 1, 2 and 3 stay reserved for special tokens; adding 3 here makes
# word_index agree with the values found in train_data.
word_index = {word: rank + 3 for word, rank in data.get_word_index().items()}

# Special tokens. <PAD> is the filler used later to bring every review to the
# same length (a 100-word review is extended with padding at its end).
word_index.update({
    "<PAD>": 0,      # padding
    "<START>": 1,
    "<UNK>": 2,      # unknown
    "<UNUSED>": 3,
})

# Inverse mapping (integer -> word), used to turn encoded reviews into text.
reverse_word_index = {index: word for word, index in word_index.items()}

# ? → what to return if the key doesn’t exist in the dictionary.
def decode_review(text):
    """Convert a sequence of integer word ids into a space-separated string.

    Each id is looked up in ``reverse_word_index``; an id that has no entry
    there is rendered as "?".
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, "?"))
    return " ".join(words)
    

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [19]:
# Show the first test review as readable text.
print(decode_review(test_data[0]))
# Lengths of the first two test reviews.
print(*(len(test_data[i]) for i in (0, 1)))

<START> please give this one a miss br br kristy swanson and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite lacklustre so all you madison fans give this a miss
68 260


In [20]:
# Preprocessing: put the data into a form the model can accept.
#
# Reviews of different lengths will not work for the model: the input shape
# has to be known in order to size the network. We could stretch every review
# to the length of the longest one, but instead an arbitrary length (250) is
# chosen and every review is brought to it, either by appending <PAD> or by
# removing words.
#
# pad_sequences arguments used here:
#   value   - what gets inserted as filler (the <PAD> id)
#   padding - "post": filler is added after the review, not before it
#   maxlen  - the common length
train_data, test_data = (
    keras.preprocessing.sequence.pad_sequences(
        reviews,
        value=word_index["<PAD>"],
        padding="post",
        maxlen=250,
    )
    for reviews in (train_data, test_data)
)

# Number of reviews in each split, then the lengths of the first two test
# reviews after padding.
print(*map(len, (train_data, test_data)))
print(*map(len, test_data[:2]))

25000 25000
250 250


In [22]:
# Define the model, passing the layers as a list.
#
# The output should say whether a review is positive or negative, so the
# network ends in a single sigmoid neuron whose value lies between 0 and 1.
#
# We do not know in advance which words are similar to each other, so the
# first layer is an Embedding: it produces one word vector per word id and
# learns to place related words close together. A word vector can live in a
# space of any dimension; 16 dimensions per word are used here.
# GlobalAveragePooling1D then collapses the word vectors of a review into a
# single vector, which feeds a small dense hidden layer.
model = keras.Sequential([
    keras.layers.Embedding(88000, 16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(1, activation="sigmoid"),
])

model.summary()
print("----------------")
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Hold out the first 10 000 training reviews for validation and train on the
# remainder.
x_val, x_train = train_data[:10000], train_data[10000:]
y_val, y_train = train_labels[:10000], train_labels[10000:]

fitModel = model.fit(
    x_train,
    y_train,
    epochs=40,
    batch_size=512,
    validation_data=(x_val, y_val),
    verbose=1,
)



----------------
Epoch 1/40
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 45ms/step - accuracy: 0.5376 - loss: 0.6918 - val_accuracy: 0.6693 - val_loss: 0.6850
Epoch 2/40
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - accuracy: 0.7015 - loss: 0.6804 - val_accuracy: 0.7175 - val_loss: 0.6672
Epoch 3/40
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.7402 - loss: 0.6569 - val_accuracy: 0.7634 - val_loss: 0.6339
Epoch 4/40
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 43ms/step - accuracy: 0.7897 - loss: 0.6157 - val_accuracy: 0.7840 - val_loss: 0.5897
Epoch 5/40
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.8084 - loss: 0.5652 - val_accuracy: 0.8040 - val_loss: 0.5409
Epoch 6/40
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.8313 - loss: 0.5087 - val_accuracy: 0.8240 - val_loss: 0.4902
Epoch 7/40
[1m30

In [23]:
# Evaluate the trained model on the held-out test set and show what
# evaluate() returned.
results = model.evaluate(x=test_data, y=test_labels)
print(results)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8767 - loss: 0.3125
[0.3198740482330322, 0.8753600120544434]


In [24]:
# One padded review: shape (250,), i.e. a sequence of 250 word ids.
test_review = test_data[0]
# Insert a new leading axis (position 0) so the shape becomes (1, 250).
test_review_dim = test_review[np.newaxis, ...]
predict = model.predict(test_review_dim)

print("Review : ")
print(decode_review(test_review))
print(f"Prediction : {predict[0]}")
print(f"Actual : {test_labels[0]}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step
Review : 
<START> please give this one a miss br br kristy swanson and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work out and his performance was quite lacklustre so all you madison fans give this a miss <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <P

In [25]:
# Without a saved model, the model has to be retrained every time we want to
# make a prediction in a new session.
# In this case that is fine because training takes only a few minutes,
# but in most real cases it takes hours or days,
# so we save the trained model.

In [27]:
# To save a model: name_of_model.save(<path>).
# The ".keras" extension is used for the saved file here (older tutorials
# often use ".h5" instead).
model.save(filepath="model.keras")

In [30]:
# Once the model is saved, none of the training code above is needed to make
# predictions: the saved file just has to be loaded back in.
# This also lets us train a bunch of different models, tweak their
# hyperparameters (such as the number of neurons in the hidden layers),
# and keep only the best one, i.e. the one with the highest accuracy.
model_saved = keras.models.load_model(filepath="model.keras")

In [34]:
# Using `with open(...)` so the file does not have to be closed by hand afterwards.
# We need to remove , . ( ) : " from the text so that, when the words are split on " ",
# we get clean words: for example art rather than art, and The rather than "The.
# Often the big problem is the data: you have to get it into a good shape first.


def review_encode(s, num_words=88000):
    """Encode a tokenized review as the list of integer ids the model expects.

    Args:
        s: iterable of word strings (already split, punctuation removed).
        num_words: vocabulary size of the model. A word whose id in
            ``word_index`` is not below this bound is encoded as <UNK>, so
            that no id falls outside the embedding table. The default matches
            the ``num_words=88000`` used when the training data was loaded.

    Returns:
        A list of ints that starts with 1 (<START>) followed by one id per
        non-empty word; words missing from ``word_index`` become 2 (<UNK>).
    """
    encoded = [1]  # 1 is <START>: every review begins with it
    for word in s:
        if not word:
            # An empty token (e.g. produced by consecutive spaces) is not a
            # word, so it must not be turned into a spurious <UNK>.
            continue
        token_id = word_index.get(word.lower(), 2)  # 2 is <UNK>
        # Ids beyond the model's vocabulary are treated as unknown words.
        encoded.append(token_id if token_id < num_words else 2)
    return encoded


# Read test.txt one review per line, clean and encode each review, and print
# the model's prediction for it.
with open("test.txt", encoding="utf-8") as f:
    # Iterate over the file directly instead of loading every line at once.
    for line in f:
        # Remove , . ) ( : " so that splitting yields bare words.
        line = line.translate(str.maketrans("", "", ',.)(:"')).strip()
        if not line:
            continue  # a blank line holds no review to classify

        # Split on any run of whitespace so repeated spaces do not produce
        # empty tokens.
        nline = line.split()
        encode = review_encode(nline)  # sequence of word ids
        # Same padding settings as the training data, so the input has the
        # length the model was trained on.
        encode = keras.preprocessing.sequence.pad_sequences(
            [encode],
            value=word_index["<PAD>"],
            padding="post",
            maxlen=250,
        )

        # NOTE(review): `model_saved`, loaded from "model.keras" in an earlier
        # cell, could be used here instead of the in-memory `model`.
        predict = model.predict(encode)
        print(line)
        print(encode)
        print(predict[0])
        
                                                                                                             

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step
I watched this movie last night and I was truly impressed The acting was phenomenal especially the lead actor who delivered a powerful and emotional performance The storyline was engaging from start to finish with unexpected twists that kept me hooked The soundtrack matched the scenes perfectly and elevated the emotional impact I highly recommend this film to anyone looking for a meaningful and unforgettable experience
[[    1    13   296    14    20   236   314     5    13    16   371  1555
      4   116    16  6814   262     4   485   284    37  2132     6   976
      5   921   239     4   769    16  1728    39   380     8  1363    19
   2076  1299    15   828    72  3305     4   816  4722     4   139   950
      5 11214     4   921  1488    13   545   386    14    22     8   259
    267    18     6  3191     5  3210   585     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0