In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib as plt
import tensorflow_hub as hub
from keras import layers
import bert
import re
import transformers
from transformers import AutoModel, BertTokenizerFast
from tensorflow.python.keras.models import save_model








  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the reviews from a CSV file resolved relative to the current working directory.
df = pd.read_csv('IMDB_Dataset.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Pre-compiled pattern: a '<', one or more characters that are not '>', then a '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(txt):
    """Return ``txt`` with every substring matched by ``TAG_RE`` deleted.

    Matches are replaced by the empty string, so the text on both sides of
    a tag is joined without any separator.
    """
    without_tags = TAG_RE.sub('', txt)
    return without_tags

In [None]:
def preflitrage_txt(x):
    """Clean one raw review string and return the cleaned text.

    The steps run in this order:
      1. strip tags with ``remove_tags``;
      2. replace every character that is not an ASCII letter with a space;
      3. replace a single letter surrounded by whitespace with one space;
      4. collapse every run of whitespace into one space.
    """
    # (pattern, replacement) pairs, applied sequentially after tag removal.
    substitutions = (
        ('[^a-zA-Z]', ' '),
        (r"\s+[a-zA-Z]\s+", ' '),
        (r'\s+', ' '),
    )

    phrase = remove_tags(x)
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [None]:
# Run the text-cleaning function over every entry of the 'review' column.
phrases = list(df['review'])
reviews = [preflitrage_txt(raw_review) for raw_review in phrases]
print(df.columns.values)


['review' 'sentiment']


In [None]:
# List the distinct values present in the 'sentiment' column.
df.sentiment.unique()

array(['positive', 'negative'], dtype=object)

In [None]:
# Encode the labels as integers: "positive" -> 1, any other value -> 0.
y = np.array([1 if label == "positive" else 0 for label in df['sentiment']])
# Show one cleaned review as a sanity check.
print(reviews[10])

Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines At first it was very odd and pretty funny but as the movie progressed didn find the jokes or oddness funny anymore Its low budget film thats never problem in itself there were some pretty interesting characters but eventually just lost interest imagine this film would appeal to stoner who is currently partaking For something similar but better try Brother from another planet 


In [None]:
# Load the tokenizer directly from Hugging Face
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Tokenize a sample sentence
tokens = tokenizer.tokenize("don't be so judgmental")
print("Tokens :", tokens)

# Convert the tokens to their IDs
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("IDs des tokens :", tokens_ids)


Tokens : ['don', "'", 't', 'be', 'so', 'judgment', '##al']
IDs des tokens : [2123, 1005, 1056, 2022, 2061, 8689, 2389]


In [None]:
def tokenize_reviews(text_reviews):
    """Return the list of token IDs for one review string.

    The text is first split into tokens by the module-level ``tokenizer``,
    then each token is mapped to its ID. No truncation is applied here.
    """
    wordpieces = tokenizer.tokenize(text_reviews)
    return tokenizer.convert_tokens_to_ids(wordpieces)


# Token-ID lists for all cleaned reviews, in the same order as `reviews`.
tokenized_reviews = list(map(tokenize_reviews, reviews))

Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors


In [None]:
import random

# One [token_ids, label, length] record per review.
reviews_with_len = [
    [token_ids, y[idx], len(token_ids)]
    for idx, token_ids in enumerate(tokenized_reviews)
]

# Shuffle first, then sort by length: list.sort is stable, so reviews of equal
# length end up in random relative order.
random.shuffle(reviews_with_len)
reviews_with_len.sort(key=lambda record: record[2])

# Drop the length field, keeping (token_ids, label) pairs in sorted order.
sorted_reviews_labels = [
    (token_ids, label) for token_ids, label, _ in reviews_with_len
]


def data_generator():
    """Yield the (token_ids, label) pairs of ``sorted_reviews_labels`` in order."""
    yield from sorted_reviews_labels


# Variable-length int32 sequence paired with a scalar int32 label.
dataset_signature = (
    tf.TensorSpec(shape=(None,), dtype=tf.int32),
    tf.TensorSpec(shape=(), dtype=tf.int32),
)
processed_dataset = tf.data.Dataset.from_generator(
    data_generator,
    output_signature=dataset_signature,
)

BATCH_SIZE = 32
# Pad each batch's sequences to the longest sequence in that batch.
batched_dataset = processed_dataset.padded_batch(
    BATCH_SIZE,
    padded_shapes=((None,), ()),
)

# Display the first batch.
next(iter(batched_dataset))


(<tf.Tensor: shape=(32, 21), dtype=int32, numpy=
 array([[ 2054,  5896,  2054,  2466,  2054,  6752,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 3191,  1996,  2338,  5293,  1996,  3185,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 3078,  5436,  3078,  3257,  3532,  7613,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 2062, 23873,  3993,  2062, 11259,  2172,  2172,  2062, 14888,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 2023,  3185,  2003,  6659,  2021,  2009,  2038,  2070,  2204,
          3896,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 1045,  2876,  9278,  2023,  2028,  2130,  2006,  7922, 12635,
    

In [None]:
import math

# Number of batches produced by padded_batch (the last batch may be partial).
TOTAL_BATCHES = math.ceil(len(sorted_reviews_labels) / BATCH_SIZE)
# Hold out one tenth of the batches for evaluation.
TEST_BATCHES = TOTAL_BATCHES // 10
# Fixed seed so the train/test split is the same on every run.
SPLIT_SEED = 42

# Dataset.shuffle returns a NEW dataset and does not modify the receiver, so
# its result must be used. Previously the return value was discarded, which
# meant no shuffling took place: since the batches are ordered by review
# length, the test split consisted only of the shortest reviews.
#
# reshuffle_each_iteration=False keeps the batch order identical every time the
# dataset is iterated; without it, take/skip would select different batches on
# each epoch and test batches would leak into training.
shuffled_dataset = batched_dataset.shuffle(
    TOTAL_BATCHES,  # buffer covers every batch -> full shuffle
    seed=SPLIT_SEED,
    reshuffle_each_iteration=False,
)
test_data = shuffled_dataset.take(TEST_BATCHES)
train_data = shuffled_dataset.skip(TEST_BATCHES)


In [None]:
class TEXT_MODEL(tf.keras.Model):
    """Text classifier built from an embedding and three parallel 1D convolutions.

    The token IDs are embedded, passed through three Conv1D layers with kernel
    sizes 2, 3 and 4, each globally max-pooled over the sequence axis, and the
    pooled features are concatenated and fed to a dense layer, dropout and an
    output layer.

    Args:
        vocabulary_size: Number of rows of the embedding table.
        embedding_dimensions: Size of each embedding vector.
        cnn_filters: Number of filters of each convolution.
        dnn_units: Number of units of the hidden dense layer.
        model_output_classes: Number of target classes. For 2 (or fewer) the
            output layer is a single sigmoid unit; otherwise it is a softmax
            layer with one unit per class (which requires a matching
            categorical loss at compile time).
        dropout_rate: Drop probability of the dropout layer.
        training: Accepted for backward compatibility; not used. The training
            mode is taken from the ``training`` argument of ``call``.
        name: Name of the model.

    Note:
        Because the convolutions use ``padding="valid"``, input sequences must
        contain at least 4 tokens (the largest kernel size).
    """

    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)

        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")

        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")

        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        # The pooling layer has no weights, so one instance serves all branches.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)

        # Honour model_output_classes instead of ignoring it. The binary case
        # is unchanged: one sigmoid unit giving the positive-class probability.
        if model_output_classes <= 2:
            self.last_dense = layers.Dense(units=1, activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Batch of token-ID sequences.
            training: Whether the call is made in training mode. It is
                forwarded to the dropout layer so that dropout is only active
                while training, not during evaluation or prediction.

        Returns:
            The output of the final dense layer.
        """
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l)
        l_1 = self.pool(l_1)
        l_2 = self.cnn_layer2(l)
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3)

        concatenated = tf.concat([l_1, l_2, l_3], axis=1)
        concatenated = self.dense_1(concatenated)
        # Was hard-coded to training=True, which kept dropout switched on at
        # inference time and made evaluation/prediction results noisy.
        concatenated = self.dropout(concatenated, training=training)
        model_output = self.last_dense(concatenated)
        return model_output


        
        


In [None]:
# Hyperparameters passed to TEXT_MODEL and to fit().
VOCAB_LENGTH = len(tokenizer.vocab)  # number of entries in the tokenizer vocabulary
EMB_DIM = 200          # passed as embedding_dimensions
CNN_FILTERS = 100      # passed as cnn_filters
DNN_UNITS = 256        # passed as dnn_units
OUTPUT_CLASSES = 2     # passed as model_output_classes
DROPOUT_RATE = 0.2     # passed as dropout_rate

NB_EPOCHS = 5          # number of training epochs passed to fit()

In [None]:
# Instantiate the classifier with the hyperparameter constants.
text_model = TEXT_MODEL(
    vocabulary_size=VOCAB_LENGTH,
    embedding_dimensions=EMB_DIM,
    cnn_filters=CNN_FILTERS,
    dnn_units=DNN_UNITS,
    model_output_classes=OUTPUT_CLASSES,
    dropout_rate=DROPOUT_RATE,
)

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric.
text_model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)


In [None]:
# Train on the training split for NB_EPOCHS epochs.
text_model.fit(train_data, epochs= NB_EPOCHS)
# Write the trained model to an HDF5 file in the current working directory.
# NOTE(review): save_model is imported from the private tensorflow.python.keras
# package while the layers come from the standalone keras package; confirm that
# this call works for a subclassed model with the installed versions, or
# consider the model's own public save/save_weights methods.
save_model(text_model, "sentiment_model_retrain.h5")


Epoch 1/5
    982/Unknown [1m93s[0m 89ms/step - accuracy: 0.7948 - loss: 0.4066

In [None]:
# Evaluate on the held-out batches; with the compiled configuration the result
# is [loss, accuracy].
results = text_model.evaluate(test_data)
print(results)



[1m156/156[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 58ms/step - accuracy: 0.8953 - loss: 0.4888
[0.43716928362846375, 0.8986378312110901]


In [None]:
def predict_review(model, review_text, min_length=4):
    """Classify one raw review as positive or negative.

    The text is cleaned with ``preflitrage_txt``, converted to token IDs with
    the module-level ``tokenizer`` and fed to ``model`` as a batch of one.

    Args:
        model: Model whose ``predict`` returns, for a batch of one sequence,
            an array whose first element is the positive-class probability.
        review_text: Raw review text.
        min_length: Minimum number of token IDs sent to the model. Shorter
            (or empty) inputs are right-padded with 0, the same value used to
            pad the training batches. The default of 4 matches the largest
            convolution kernel of ``TEXT_MODEL``, which uses "valid" padding
            and therefore cannot process shorter sequences.

    Returns:
        "Avis positif 😊" when the predicted probability is at least 0.5,
        otherwise "Avis négatif 😞".
    """
    cleaned_review = preflitrage_txt(review_text)
    tokenized_review = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(cleaned_review))

    # Guard against very short or empty reviews, which would otherwise make
    # the convolution layers fail.
    missing = min_length - len(tokenized_review)
    if missing > 0:
        tokenized_review = tokenized_review + [0] * missing

    # Explicit dtype: an empty nested list would otherwise become float32.
    review_tensor = tf.convert_to_tensor([tokenized_review], dtype=tf.int32)
    prediction = model.predict(review_tensor)

    # predict returns a (1, 1) array; compare a plain scalar rather than
    # relying on the truthiness of a one-element array.
    positive_probability = float(prediction[0][0])

    if positive_probability >= 0.5:
        return "Avis positif 😊"
    else:
        return "Avis négatif 😞"


test_review = "I got bricked up from that movie"
print(predict_review(text_model, test_review))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
Avis positif 😊
