<a href="https://colab.research.google.com/github/AlexandreISEN/CalculatriceWEB/blob/main/Classification_commentaires_toxiques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Résumé du projet**

---

Ce projet vise à développer un modèle d'intelligence artificielle capable de classifier des commentaires en fonction de leur degré et type de toxicité.
Pour cela, il est nécessaire de créer et d'entraîner un modèle de deep learning utilisant de
l'embedding et des RNNs, puis de l'intégrer dans un pipeline qui prend en entrée une phrase brute, effectue tous les traitements et retourne une classification des commentaires donnés en entrée.

---



# Importation des packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

# Importation des données

Ajoutez un raccourci de ce dossier à votre google drive :

https://drive.google.com/drive/folders/1mx-CAzT10YKrmxHfYDP_1Oef7PVGUr7s?usp=sharing

In [3]:
from imblearn.under_sampling import RandomUnderSampler
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
df = pd.read_csv('/content/drive/MyDrive/data_classification_commentaires_toxiques/train.csv')
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Etude du jeu de données

In [5]:
# Create a composite label that combines all target columns into one string per row
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['composite_label'] = df[label_cols].astype(str).agg('-'.join, axis=1)

X = df['comment_text']
y = df['composite_label']

print("Feature samples:")
print(X.head())
print("\nTarget samples:")
print(y.head())

Feature samples:
0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

Target samples:
0    0-0-0-0-0-0
1    0-0-0-0-0-0
2    0-0-0-0-0-0
3    0-0-0-0-0-0
4    0-0-0-0-0-0
Name: composite_label, dtype: object


# Préparation des données

In [6]:
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X.to_frame(), y)

# Before RUS: Check original composite label distribution
print("Original Composite Label Distribution:\n")
print(y.value_counts())

print("\nTotal samples before RUS:", len(y))

# After RUS (assuming y_res is your resampled target)
print("\nResampled Composite Label Distribution:\n")
print(y_res.value_counts())

print("\nTotal samples after RUS:", len(y_res))

Original Composite Label Distribution:

composite_label
0-0-0-0-0-0    143346
1-0-0-0-0-0      5666
1-0-1-0-1-0      3800
1-0-1-0-0-0      1758
1-0-0-0-1-0      1215
1-1-1-0-1-0       989
1-0-1-0-1-1       618
0-0-1-0-0-0       317
0-0-0-0-1-0       301
1-1-1-0-1-1       265
0-0-1-0-1-0       181
1-1-1-0-0-0       158
1-0-0-0-0-1       136
1-0-0-0-1-1       134
1-0-1-1-1-0       131
1-0-0-1-0-0       113
1-1-1-1-1-0        64
1-0-1-1-1-1        56
0-0-0-0-0-1        54
1-1-0-0-0-0        41
1-0-1-0-0-1        35
1-1-1-1-1-1        31
0-0-0-0-1-1        28
0-0-0-1-0-0        22
0-0-1-0-1-1        18
1-0-0-1-1-0        16
1-1-0-0-1-0        14
1-1-0-1-0-0        11
1-0-1-1-0-0        11
1-0-0-1-0-1         7
1-1-0-0-1-1         7
1-1-1-0-0-1         6
1-1-1-1-0-0         4
0-0-1-0-0-1         3
1-1-0-0-0-1         3
0-0-0-1-1-0         3
1-0-0-1-1-1         3
0-0-1-1-0-0         2
0-0-1-1-1-0         2
1-1-0-1-1-0         1
1-1-0-1-0-1         1
Name: count, dtype: int64

Total samples b

In [7]:
import re
import nltk
import spacy
import string
#import autocorrect

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#from autocorrect import Speller

# Download necessary resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

# Load spaCy model
nlp = spacy.load('en_core_web_sm')

# Initialize spell checker and lemmatizer
#spell = Speller(lang='en')
lemmatizer = WordNetLemmatizer()

# Define stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [8]:
def clean_text(text):
    # Lowercasing
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Handle contractions (e.g., "don't" -> "do not")
    contractions = {
        "n't": " not", "'re": " are", "'s": " is", "'d": " would",
        "'ll": " will", "'t": " not", "'ve": " have", "'m": " am"
    }
    for contraction, full_form in contractions.items():
        text = text.replace(contraction, full_form)

    # Spell-checking
    #text = spell(text)

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join words back into a single string
    cleaned_text = ' '.join(tokens)

    return cleaned_text

In [9]:
# Apply cleaning function to all comments
df['cleaned_text'] = df['comment_text'].apply(clean_text)

# Verify the results
df[['comment_text', 'cleaned_text']].head()

Unnamed: 0,comment_text,cleaned_text
0,Explanation\nWhy the edits made under my usern...,explanation edits made username hardcore metal...
1,D'aww! He matches this background colour I'm s...,daww match background colour im seemingly stuc...
2,"Hey man, I'm really not trying to edit war. It...",hey man im really trying edit war guy constant...
3,"""\nMore\nI can't make any real suggestions on ...",cant make real suggestion improvement wondered...
4,"You, sir, are my hero. Any chance you remember...",sir hero chance remember page thats


In [10]:
print("Before Cleaning:")
print(y.value_counts())  # Check label distribution before cleaning

print("\nAfter Cleaning:")
df = df[df['cleaned_text'].str.strip() != '']  # Remove empty comments
y_cleaned = df['composite_label']
print(y_cleaned.value_counts())  # Check label distribution after cleaning

Before Cleaning:
composite_label
0-0-0-0-0-0    143346
1-0-0-0-0-0      5666
1-0-1-0-1-0      3800
1-0-1-0-0-0      1758
1-0-0-0-1-0      1215
1-1-1-0-1-0       989
1-0-1-0-1-1       618
0-0-1-0-0-0       317
0-0-0-0-1-0       301
1-1-1-0-1-1       265
0-0-1-0-1-0       181
1-1-1-0-0-0       158
1-0-0-0-0-1       136
1-0-0-0-1-1       134
1-0-1-1-1-0       131
1-0-0-1-0-0       113
1-1-1-1-1-0        64
1-0-1-1-1-1        56
0-0-0-0-0-1        54
1-1-0-0-0-0        41
1-0-1-0-0-1        35
1-1-1-1-1-1        31
0-0-0-0-1-1        28
0-0-0-1-0-0        22
0-0-1-0-1-1        18
1-0-0-1-1-0        16
1-1-0-0-1-0        14
1-1-0-1-0-0        11
1-0-1-1-0-0        11
1-0-0-1-0-1         7
1-1-0-0-1-1         7
1-1-1-0-0-1         6
1-1-1-1-0-0         4
0-0-1-0-0-1         3
1-1-0-0-0-1         3
0-0-0-1-1-0         3
1-0-0-1-1-1         3
0-0-1-1-0-0         2
0-0-1-1-1-0         2
1-1-0-1-1-0         1
1-1-0-1-0-1         1
Name: count, dtype: int64

After Cleaning:
composite_label
0-0-0-

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
# Define hyperparameters
MAX_VOCAB_SIZE = 20000  # Maximum words in vocabulary
MAX_SEQUENCE_LENGTH = 100  # Maximum length of each sequence (adjustable)

# Initialize the Tokenizer
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")

# Fit the tokenizer on the cleaned text
tokenizer.fit_on_texts(df['cleaned_text'])

# Convert text to sequences of numbers
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])

In [13]:
# Pad sequences to ensure uniform input size
X_padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Check the shape of your processed data
print("Shape of padded sequences:", X_padded.shape)

Shape of padded sequences: (159553, 100)


In [14]:
import pickle

# Save tokenizer for later use
with open('tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

In [15]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2025-03-23 17:06:52--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-03-23 17:06:52--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-03-23 17:06:53--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [16]:
import numpy as np

# Path to GloVe file (choose 50D, 100D, 200D, or 300D)
GLOVE_PATH = "glove.6B.100d.txt"  # Using 100D vectors

# Load GloVe word embeddings
embeddings_index = {}

with open(GLOVE_PATH, 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]  # First value is the word
        vector = np.asarray(values[1:], dtype='float32')  # The rest are vector values
        embeddings_index[word] = vector

print(f"Loaded {len(embeddings_index)} word vectors.")

Loaded 400000 word vectors.


In [17]:
# Define Embedding Dimensions (should match the GloVe file used)
EMBEDDING_DIM = 100

# Get vocab size from tokenizer
word_index = tokenizer.word_index  # Word to index mapping
num_words = min(MAX_VOCAB_SIZE, len(word_index) + 1)  # Vocabulary size

# Initialize Embedding Matrix
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

# Fill embedding matrix with GloVe vectors (if available)
for word, i in word_index.items():
    if i < num_words:  # Ensure index is within vocab limit
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector  # Use GloVe vector

print("Embedding Matrix shape:", embedding_matrix.shape)

Embedding Matrix shape: (20000, 100)


In [18]:
from tensorflow.keras.layers import Embedding

embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    input_length=MAX_SEQUENCE_LENGTH,
    weights=[embedding_matrix],  # Use GloVe weights
    trainable=False  # Set to True if I want to fine-tune
)



# Entraînement du modèle baseline

In [19]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Bidirectional, Dense, Dropout, Embedding
from tensorflow.keras.optimizers import Adam

In [20]:
# Define Model
model = Sequential()

# 1️⃣ Embedding Layer (Pretrained GloVe)
model.add(embedding_layer)  # Uses the embedding_layer we created earlier

# 2️⃣ Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.3))  # Dropout to prevent overfitting

# 3️⃣ Another LSTM Layer (Optional)
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.3))

# 4️⃣ Fully Connected Dense Layer
model.add(Dense(32, activation='relu'))  # Intermediate dense layer
model.add(Dropout(0.2))

# 5️⃣ Output Layer (Multi-label Classification)
model.add(Dense(6, activation='sigmoid'))  # 6 units for 6 labels

# Compile the Model
model.compile(
    loss='binary_crossentropy',  # Best for multi-label classification
    optimizer=Adam(learning_rate=0.001),  # Adam optimizer
    metrics=['accuracy']  # Track accuracy
)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [43]:
# Split the composite label and convert it to a binary matrix
y = df['composite_label'].str.split('-').apply(lambda x: [int(i) for i in x]).values
y = np.array(y.tolist())  # Convert to a NumPy array with the correct dtype

# Check the shape and type of y
print(y.shape)  # It should be (num_samples, 6)
print(y.dtype)  # Should print int
print(X_padded.shape)  # Should be (num_samples, max_sequence_length)

(159553, 6)
int64
(159553, 100)


In [22]:
print(X_padded.shape)  # Should print (num_samples, sequence_length)
print(y.shape)  # Should print (num_samples, 6)

(159553, 100)
(159553, 6)


In [40]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [42]:
history = model.fit(
    X_padded, y,
    epochs=10,
    batch_size=16,  # Trying a smaller batch size
    validation_split=0.2,
    callbacks=[early_stopping]
)
model.save("my_toxic_comment_model.keras")  # Saves the entire model (architecture + weights)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (<ipython-input-42-8bae41d21e87>, line 5)

In [24]:
model.save("my_toxic_comment_model.keras")  # Saves the entire model (architecture + weights)

In [25]:
import numpy as np

# Example toxic comment
new_comment = ["I will kill you !!! Kill yourself ! Die ! Look out ! I'll kill you !"]

# Tokenize and pad the input
new_comment_seq = tokenizer.texts_to_sequences(new_comment)  # Tokenize
new_comment_padded = pad_sequences(new_comment_seq, maxlen=MAX_SEQUENCE_LENGTH)  # Pad

# Make a prediction
prediction = model.predict(new_comment_padded)

# Convert predictions to readable labels
threshold = 0.5  # Set a threshold for classification
predicted_labels = (prediction > threshold).astype(int)

# Show results
for label, pred in zip(label_cols, predicted_labels[0]):  # label_cols = ['toxic', 'severe_toxic', ...]
    print(f"{label}: {'Yes' if pred else 'No'}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 435ms/step
toxic: Yes
severe_toxic: No
obscene: Yes
threat: No
insult: Yes
identity_hate: No


In [41]:
from keras.models import load_model
model = load_model("my_toxic_comment_model.keras")
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit(
    X_padded, y,
    epochs=5,  # Set more epochs
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping]
)

Epoch 1/5


KeyboardInterrupt: 

In [37]:
def reset_model(model):
    for layer in model.layers:
        if hasattr(layer, 'kernel_initializer') and hasattr(layer, 'bias_initializer'):
            layer.kernel.assign(layer.kernel_initializer(layer.kernel.shape))
            layer.bias.assign(layer.bias_initializer(layer.bias.shape))

In [38]:
reset_model("my_toxic_comment_model.keras")

AttributeError: 'str' object has no attribute 'layers'

In [27]:
import re
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
# Load tokenizer
with open("tokenizer.pkl", "rb") as file:
    tokenizer = pickle.load(file)

# Load trained model
model = tf.keras.models.load_model("my_toxic_comment_model.keras")

# Define maximum sequence length (must match training settings)
MAX_SEQUENCE_LENGTH = 100  # Adjust based on what you used

In [31]:
def preprocess_text(text):
    # Load NLTK and other required modules inside the function (ensures portability)
    import nltk
    import string
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer

    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('wordnet')

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Lowercasing
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Convert text into tokenized sequence
    seq = tokenizer.texts_to_sequences([" ".join(tokens)])

    # Pad sequence
    padded_seq = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding="post")

    return padded_seq

In [32]:
LABELS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

def predict_toxicity(text):
    processed_text = preprocess_text(text)  # Preprocess the input
    predictions = model.predict(processed_text)[0]  # Get predictions

    # Convert predictions into a readable format
    predicted_labels = {label: float(pred) for label, pred in zip(LABELS, predictions)}
    return predicted_labels

In [35]:
sample_text = "You are the worst person ever!"
output = predict_toxicity(sample_text)

print("Toxicity Prediction:", output)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Toxicity Prediction: {'toxic': 0.6652194857597351, 'severe_toxic': 0.0003941649047192186, 'obscene': 0.029128815978765488, 'threat': 0.006673396099358797, 'insult': 0.08321651071310043, 'identity_hate': 0.0007753943209536374}


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Your Code

# Itération de la modélisation

In [44]:
# Print Model Summary
model.summary()

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Define Model
model = Sequential()

# 1️⃣ Embedding Layer (Pretrained GloVe)
model.add(embedding_layer)  # Uses the embedding_layer from previous setup

# 2️⃣ Bidirectional LSTM Layer with L2 Regularization
model.add(Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(0.01))))
model.add(Dropout(0.4))  # Increased dropout to 40%

# 3️⃣ Another LSTM Layer (Reduced to 32 Units)
model.add(Bidirectional(LSTM(32, kernel_regularizer=l2(0.01))))
model.add(Dropout(0.4))  # Increased dropout

# 4️⃣ Fully Connected Dense Layer
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.01)))
model.add(Dropout(0.3))  # More dropout

# 5️⃣ Output Layer (Multi-label Classification)
model.add(Dense(6, activation='sigmoid'))  # 6 units for 6 labels

# Compile the Model with a Lower Learning Rate
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.0005),  # Lowered learning rate
    metrics=['accuracy']
)

# Print Model Summary
model.summary()

In [45]:
# Split the composite label and convert it to a binary matrix
y = df['composite_label'].str.split('-').apply(lambda x: [int(i) for i in x]).values
y = np.array(y.tolist())  # Convert to a NumPy array with the correct dtype

# Check the shape and type of y
print(y.shape)  # It should be (num_samples, 6)
print(y.dtype)  # Should print int
print(X_padded.shape)  # Should be (num_samples, max_sequence_length)

(159553, 6)
int64
(159553, 100)


In [46]:
history = model.fit(
    X_padded, y,
    epochs=10,
    batch_size=16,  # Try a smaller batch size
    validation_split=0.2,
    callbacks=[early_stopping]
)
model.save("my_toxic_comment_model_2.keras")  # Saves the entire model (architecture + weights)

Epoch 1/10
[1m7978/7978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m209s[0m 26ms/step - accuracy: 0.7827 - loss: 0.5328 - val_accuracy: 0.9941 - val_loss: 0.1420
Epoch 2/10
[1m7978/7978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 27ms/step - accuracy: 0.9941 - loss: 0.1471 - val_accuracy: 0.9941 - val_loss: 0.1431
Epoch 3/10
[1m7978/7978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 24ms/step - accuracy: 0.9945 - loss: 0.1454 - val_accuracy: 0.9941 - val_loss: 0.1410
Epoch 4/10
[1m7978/7978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 27ms/step - accuracy: 0.9946 - loss: 0.1436 - val_accuracy: 0.9941 - val_loss: 0.1410
Epoch 5/10
[1m7978/7978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 25ms/step - accuracy: 0.9942 - loss: 0.1424 - val_accuracy: 0.9941 - val_loss: 0.1410
Epoch 6/10
[1m7978/7978[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 24ms/step - accuracy: 0.9940 - loss: 0.1406 - val_accuracy: 0.9941 - val_loss: 0.141

In [47]:
# Load tokenizer
with open("tokenizer.pkl", "rb") as file:
    tokenizer = pickle.load(file)

# Load trained model
model = tf.keras.models.load_model("my_toxic_comment_model.keras")

# Define maximum sequence length (must match training settings)
MAX_SEQUENCE_LENGTH = 100  # Adjust based on what you used

In [48]:
def preprocess_text(text):
    # Load NLTK and other required modules inside the function (ensures portability)
    import nltk
    import string
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer

    nltk.download('stopwords')
    nltk.download('punkt')
    nltk.download('wordnet')

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # Lowercasing
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Convert text into tokenized sequence
    seq = tokenizer.texts_to_sequences([" ".join(tokens)])

    # Pad sequence
    padded_seq = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding="post")

    return padded_seq

In [49]:
LABELS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

def predict_toxicity(text):
    processed_text = preprocess_text(text)  # Preprocess the input
    predictions = model.predict(processed_text)[0]  # Get predictions

    # Convert predictions into a readable format
    predicted_labels = {label: float(pred) for label, pred in zip(LABELS, predictions)}
    return predicted_labels

In [50]:
sample_text = "You are the worst person ever!"
output = predict_toxicity(sample_text)

print("Toxicity Prediction:", output)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 517ms/step
Toxicity Prediction: {'toxic': 0.6652194857597351, 'severe_toxic': 0.0003941649047192186, 'obscene': 0.029128815978765488, 'threat': 0.006673396099358797, 'insult': 0.08321651071310043, 'identity_hate': 0.0007753943209536374}
