<a href="https://colab.research.google.com/github/Baflee/teki/blob/main/ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SimpleRNN
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.utils import to_categorical
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import seaborn as sns
import matplotlib.pyplot as plt

In [29]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [30]:
data = pd.read_csv("/content/Twitter_Data.csv")
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [31]:
data.dropna(how='any', inplace=True)
data.isnull().sum()

Unnamed: 0,0
clean_text,0
category,0


In [32]:
texts = data['clean_text'].tolist()
labels = data['category'].tolist()

In [33]:
# prompt: Text preprocessing - stopwords removal and porter stemming algorithm

# Remove stopwords and apply Porter Stemming
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

processed_texts = []
for text in texts:
    # Convert to lowercase and remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())
    # Tokenize the text
    words = text.split()
    # Remove stopwords and apply stemming
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    processed_texts.append(' '.join(words))

# Update the 'texts' variable with the processed texts
texts = processed_texts

print("Preprocessing complete.")
# Display first 5 processed texts as an example
print(texts[:5])

Preprocessing complete.
['modi promis minimum govern maximum govern expect begin difficult job reform state take year get justic state busi exit psu templ', 'talk nonsens continu drama vote modi', 'say vote modi welcom bjp told rahul main campaign modi think modi relax', 'ask support prefix chowkidar name modi great servic confus read crustal clear crass filthi nonsens see abus come chowkidar', 'answer among power world leader today trump putin modi may']


In [34]:
# prompt: Tokenization and fixing the vocabulary size

# Tokenization
# Define the maximum number of words to keep in the vocabulary
max_words = 10000  # Adjust this number as needed

tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)

# Convert texts to sequences of integers
sequences = tokenizer.texts_to_sequences(texts)

# Display some sequences as an example
print("Example sequences:")
print(sequences[:5])

# Display vocabulary size (limited by max_words)
print(f"Vocabulary size (limited to max_words): {len(tokenizer.word_index)}")
print(f"Number of unique words actually found: {len(tokenizer.word_index) + 1 if '<OOV>' in tokenizer.word_index else len(tokenizer.word_index)}")

Example sequences:
[[2, 55, 696, 27, 1531, 27, 271, 884, 1095, 57, 1201, 114, 29, 16, 22, 985, 114, 265, 3018, 2638, 1102], [88, 978, 373, 724, 9, 2], [11, 9, 2, 760, 4, 459, 24, 392, 110, 2, 38, 2, 2573], [44, 26, 2950, 53, 119, 2, 85, 938, 1103, 193, 1, 381, 4736, 2965, 978, 42, 370, 31, 53], [236, 782, 32, 74, 41, 65, 785, 3032, 2, 98]]
Vocabulary size (limited to max_words): 88091
Number of unique words actually found: 88092


In [35]:
# prompt: Padding the sequence

# Padding sequences to a fixed length
vocab_size = 10000
maxlen = 100  # Define the maximum sequence length

padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')

print(f"Padded sequences shape: {padded_sequences.shape}")
print("Example padded sequences:")
print(padded_sequences[:5])

# Encode the labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Convert labels to one-hot encoding for categorical crossentropy loss
num_classes = len(label_encoder.classes_)
one_hot_labels = to_categorical(encoded_labels, num_classes=num_classes)

print(f"Encoded labels shape: {encoded_labels.shape}")
print(f"One-hot labels shape: {one_hot_labels.shape}")
print("Example encoded labels:")
print(encoded_labels[:5])
print("Example one-hot labels:")
print(one_hot_labels[:5])

Padded sequences shape: (162969, 100)
Example padded sequences:
[[   2   55  696   27 1531   27  271  884 1095   57 1201  114   29   16
    22  985  114  265 3018 2638 1102    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [  88  978  373  724    9    2    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0   

In [36]:
# prompt: prepare labels and split the data

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, one_hot_labels, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (130375, 100)
X_test shape: (32594, 100)
y_train shape: (130375, 3)
y_test shape: (32594, 3)


In [37]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, SimpleRNN, Dense

# Paramètres du modèle
embedding_dim = 50         # Dimension de l’espace d’incorporation
input_dim = vocab_size     # Taille du vocabulaire
max_length = maxlen        # Longueur maximale d’entrée (des séquences)
output_dim = num_classes   # Nombre de classes à prédire

# Création du modèle MLP enrichi de RNN
model = Sequential([
    Embedding(input_dim=input_dim, output_dim=embedding_dim, input_length=max_length),
    Bidirectional(LSTM(32, return_sequences=True)),
    Bidirectional(SimpleRNN(32)),
    Dense(64, activation='relu'),
    Dense(output_dim, activation='softmax')
])

# Compilation
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Résumé du modèle
model.summary()




In [38]:
# Entraînement du modèle avec validation croisée sur 10 % des données d'entraînement
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.1,
    verbose=1
)

# Évaluation sur les données de test
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 1/5
[1m3667/3667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m424s[0m 114ms/step - accuracy: 0.7370 - loss: 0.6249 - val_accuracy: 0.8557 - val_loss: 0.4072
Epoch 2/5
[1m3667/3667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m437s[0m 112ms/step - accuracy: 0.8694 - loss: 0.3685 - val_accuracy: 0.8630 - val_loss: 0.3920
Epoch 3/5
[1m3667/3667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m454s[0m 116ms/step - accuracy: 0.8836 - loss: 0.3222 - val_accuracy: 0.8606 - val_loss: 0.4055
Epoch 4/5
[1m3667/3667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m448s[0m 117ms/step - accuracy: 0.9017 - loss: 0.2678 - val_accuracy: 0.8553 - val_loss: 0.4343
Epoch 5/5
[1m3667/3667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m428s[0m 114ms/step - accuracy: 0.9173 - loss: 0.2260 - val_accuracy: 0.8466 - val_loss: 0.4692
Test Loss: 0.4720
Test Accuracy: 0.8460


In [41]:
from sklearn.metrics import classification_report
import numpy as np

# 6. Évaluation du modèle

# Prédiction sur l'ensemble de test
y_pred_probs = model.predict(X_test)  # Probabilités de chaque classe

[1m1019/1019[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 28ms/step


In [52]:
y_pred = np.argmax(y_pred_probs, axis=1)  # Classe prédite (indice)
y_true = np.argmax(y_test, axis=1)  # Classe réelle (indice)

# Récupération des noms de classes depuis le label encoder
categories = label_encoder.classes_

# Affichage du rapport de classification
print("Rapport de classification :")
print(classification_report(y_true, y_pred, target_names=[str(x) for x in categories]))

Rapport de classification :
              precision    recall  f1-score   support

        -1.0       0.79      0.77      0.78      7152
         0.0       0.84      0.90      0.87     11067
         1.0       0.88      0.85      0.86     14375

    accuracy                           0.85     32594
   macro avg       0.84      0.84      0.84     32594
weighted avg       0.85      0.85      0.85     32594



In [53]:
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc}")

[1m1019/1019[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 27ms/step - accuracy: 0.8418 - loss: 0.4856
Test Accuracy: 0.8459839224815369
