<a href="https://colab.research.google.com/github/AhmedCoolProjects/ESI/blob/main/GRU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BY AHMED BARGDY

In [34]:
# Préparation de l’environnement du travail

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, classification_report

from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import GRU, Dense, Embedding, Bidirectional

In [65]:
def handle_bad_lines(line):
    print(f"Skipping line: {line}")

data_url = "https://firebasestorage.googleapis.com/v0/b/esi-school-resources.appspot.com/o/text_mining%2Ftps%2FIMDB%20Dataset.csv?alt=media&token=005703f2-e4f5-4d48-8c82-77dabb6dc938"

# Chargement et Exploration du Jeu de Données
df_data = pd.read_csv(data_url, header=0, encoding='utf-8', engine="python", on_bad_lines=handle_bad_lines)


# Exploration du dataset.

In [4]:
# Afficher les premières lignes du dataset :
df_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
# Afficher des informations sur les variables :
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1613 entries, 0 to 1612
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     1613 non-null   object
 1   sentiment  1613 non-null   object
dtypes: object(2)
memory usage: 25.3+ KB


In [6]:
# Afficher des statistiques descriptives :
df_data.describe()

Unnamed: 0,review,sentiment
count,1613,1613
unique,1613,2
top,One of the other reviewers has mentioned that ...,positive
freq,1,820


In [7]:
#  les valeurs uniques dans les colonnes catégorielles :
print(f"Unique values in sentiment: {df_data['sentiment'].unique()}")


Unique values in sentiment: ['positive' 'negative']


In [8]:
#  la distribution des valeurs dans les colonnes catégorielles :
print(f"la distribution des valeurs de sentiment: \n{df_data['sentiment'].value_counts()}")

la distribution des valeurs de sentiment: 
positive    820
negative    793
Name: sentiment, dtype: int64


# Pre-processing

In [9]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
from nltk.stem import PorterStemmer

In [11]:
# Nettoyer et prétraiter les textes (enlever la ponctuation, convertir en minuscules, etc.).
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def clean_text(text):
    # Supprimer la ponctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Convertir en minuscules
    text = text.lower()
    # Supprimer les stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Appliquer la racinisation (stemming)
    words = [stemmer.stem(word) for word in words]
    # Rejoindre les mots pour former le texte traité
    text = ' '.join(words)
    return text

In [12]:
# encodage
df_data['encoded_sentiment'] = df_data['sentiment'].map({'positive': 1, 'negative': 0})

In [13]:
df_data['cleaned_review'] = df_data['review'].apply(clean_text)

In [14]:
df_data.head()

Unnamed: 0,review,sentiment,encoded_sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,1,one review mention watch 1 oz episod youll hoo...
1,A wonderful little production. <br /><br />The...,positive,1,wonder littl product br br film techniqu unass...
2,I thought this was a wonderful way to spend ti...,positive,1,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,0,basic there famili littl boy jake think there ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter mattei love time money visual stun film...


In [16]:
# remove unused columns
df_data = df_data[['cleaned_review', 'encoded_sentiment']]

In [17]:
df_data.head()

Unnamed: 0,cleaned_review,encoded_sentiment
0,one review mention watch 1 oz episod youll hoo...,1
1,wonder littl product br br film techniqu unass...,1
2,thought wonder way spend time hot summer weeke...,1
3,basic there famili littl boy jake think there ...,0
4,petter mattei love time money visual stun film...,1


In [18]:
# Tokenizing the text
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(df_data['cleaned_review'])

In [19]:
# converting text to sequences
sequences = tokenizer.texts_to_sequences(df_data['cleaned_review'])

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [39]:
max_length = 200
vocab_size = 10000
sequence_length = 50

In [40]:
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

In [41]:
# Displaying the first sequence as an example
print("Original text:\n", df_data['cleaned_review'][0])
print("\nTokenized sequence:\n", sequences[0])
print("\nPadded sequence:\n", padded_sequences[0])
print("\nLen Padded sequence:\n", len(padded_sequences[0]))

Original text:
 one review mention watch 1 oz episod youll hook right exactli happen mebr br first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use wordbr br call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awaybr br would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti surreal couldnt say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison ex

# Train and Test Data

In [42]:
x = df_data['cleaned_review']
y = df_data['encoded_sentiment']

tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(x)

# Convertir les textes en séquences d'entiers
sequences = tokenizer.texts_to_sequences(x)
# Remplir les séquences pour qu'elles aient toutes la même longueur
x_padded = pad_sequences(sequences, maxlen=sequence_length, padding='post', truncating='post')
# Diviser les données en ensembles d'entraînement et de test
x_train, x_test, y_train, y_test = train_test_split(x_padded, y, test_size=0.2, random_state=42)

# 4.	Construction du Modèle GRU

In [60]:
def create_model(vocab_size=vocab_size, embedding_dim=50, sequence_length=sequence_length, gru_units=50, dense_units=1, activation='sigmoid'):
  """
  lstm_units: nombre de neurones LSTM dans la couche LSTM, plus il est élevé, plus d'apprentissage complexe
  embedding_dim: Dim de l'espace d'incorporation
  input_length: longueur des séquences en entrée
  activation: la fonction appliquée à la sortie 'sigmoid' est souvent utilisé pour produire des valuers entre 0 et 1 pour la probabilité de la classe positive.
  """
  model = Sequential()
  model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length))
  model.add(GRU(units=gru_units, dropout=0.2, recurrent_dropout=0.2))
  model.add(Dense(units=dense_units, activation=activation))
  return model

def compile_model(model):
  # Compiler le modèle
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return model

def fit_model(model):
  # Entraîner le modèle
  model.fit(x_train, y_train, epochs=10, batch_size=128, validation_data=(x_test, y_test))
  return model

def evaluate_model(model):
  loss, accuracy = model.evaluate(x_test, y_test, verbose=2)
  print('---------------Loss - Accuracy----------------')
  print(f'Loss on test set: {loss:.4f}')
  print(f'Accuracy on test set: {accuracy:.4f}')
  y_pred = model.predict(x_test)
  y_pred_binary = (y_pred > 0.5).astype(int)
  precision = accuracy_score(y_test, y_pred_binary)
  print('---------------Precision----------------')
  print(f'Precision on test set: {precision:.4f}')
  print('---------------Classification Report----------------')
  print(classification_report(y_test, y_pred_binary))

In [61]:
model = create_model()
model = compile_model(model)
print('---------------Summary----------------')
print(model.summary())
print('---------------Training----------------')
model = fit_model(model)
print('---------------Evaluation----------------')
evaluate_model(model)

---------------Summary----------------
Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 50, 50)            500000    
                                                                 
 gru_9 (GRU)                 (None, 50)                15300     
                                                                 
 dense_9 (Dense)             (None, 1)                 51        
                                                                 
Total params: 515351 (1.97 MB)
Trainable params: 515351 (1.97 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
---------------Training----------------
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
---------------Evaluation----------------
11/11 - 0s - loss: 0.8339 - accuracy: 0.7430 

In [64]:
model = create_model(gru_units=100)
model = compile_model(model)
print('---------------Summary----------------')
print(model.summary())
print('---------------Training----------------')
model = fit_model(model)
print('---------------Evaluation----------------')
evaluate_model(model)

---------------Summary----------------
Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 50, 50)            500000    
                                                                 
 gru_12 (GRU)                (None, 100)               45600     
                                                                 
 dense_12 (Dense)            (None, 1)                 101       
                                                                 
Total params: 545701 (2.08 MB)
Trainable params: 545701 (2.08 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
---------------Training----------------
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
---------------Evaluation----------------
11/11 - 0s - loss: 1.3324 - accuracy: 0.7152

In [63]:
model = create_model(activation='tanh')
model = compile_model(model)
print('---------------Summary----------------')
print(model.summary())
print('---------------Training----------------')
model = fit_model(model)
print('---------------Evaluation----------------')
evaluate_model(model)

---------------Summary----------------
Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, 50, 50)            500000    
                                                                 
 gru_11 (GRU)                (None, 50)                15300     
                                                                 
 dense_11 (Dense)            (None, 1)                 51        
                                                                 
Total params: 515351 (1.97 MB)
Trainable params: 515351 (1.97 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
---------------Training----------------
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
---------------Evaluation----------------
11/11 - 0s - loss: 1.3041 - accuracy: 0.6409

# Interpretation des résultats