# LSTM Model for Profanity and Inappropriate Content Detection

In [1]:
# Load improved complex dataset
import pandas as pd
df = pd.read_csv('Profanity_Dataset_Complex.csv')
df.head()

Unnamed: 0,text,label
0,What kind of stupid garbage is this? It broke ...,1
1,إزاي منتج بالسوء ده يتباع؟ كله نصب واحتيال، وك...,1
2,Experience unmatched comfort with our orthoped...,0
3,المنتج دا قذر لأبعد الحدود، ريحته مقرفة وشكله ...,1
4,تم تصنيع هذا المنتج باستخدام أحدث تقنيات العزل...,0


In [2]:
# Preprocessing
import re
import string
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    return text
df['processed'] = df['text'].apply(preprocess_text)

In [3]:
# Tokenization & Padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['processed'])
sequences = tokenizer.texts_to_sequences(df['processed'])
padded = pad_sequences(sequences, padding='post', maxlen=100)
labels = df['label'].values

In [4]:
# Train/Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(padded, labels, test_size=0.2, random_state=42)

In [5]:
# Build the LSTM Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential([
    Embedding(10000, 64, input_length=100),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test),
                    callbacks=[early_stop], batch_size=32)

Epoch 1/20




[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 262ms/step - accuracy: 0.6008 - loss: 0.6859 - val_accuracy: 1.0000 - val_loss: 0.5957
Epoch 2/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 105ms/step - accuracy: 1.0000 - loss: 0.4771 - val_accuracy: 1.0000 - val_loss: 0.0853
Epoch 3/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 100ms/step - accuracy: 1.0000 - loss: 0.0675 - val_accuracy: 1.0000 - val_loss: 0.0126
Epoch 4/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 119ms/step - accuracy: 1.0000 - loss: 0.0172 - val_accuracy: 1.0000 - val_loss: 0.0034
Epoch 5/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step - accuracy: 1.0000 - loss: 0.0072 - val_accuracy: 1.0000 - val_loss: 0.0014
Epoch 6/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 108ms/step - accuracy: 1.0000 - loss: 0.0046 - val_accuracy: 1.0000 - val_loss: 7.6213e-04
Epoch 7/20
[1m10/10[0m [32m━━━━

In [11]:
# Prediction Function
from tensorflow.keras.preprocessing.sequence import pad_sequences

def predict_lstm(text):
    seq = tokenizer.texts_to_sequences([preprocess_text(text)])
    padded_seq = pad_sequences(seq, padding='post', maxlen=100)
    prob = model.predict(padded_seq)[0][0]
    return {
        'text': text,
        'prob_offensive': float(prob),
        'recommendation': 'REJECT' if prob > 0.5 else 'APPROVE'
    }
predict_lstm("هذا الشيء كأنه معمول من زبالة المصنع، ولا يسوى قشرة بصلة، المنتج خرا بصراحة وما ينفعش لأي استخدام.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step


{'text': 'هذا الشيء كأنه معمول من زبالة المصنع، ولا يسوى قشرة بصلة، المنتج خرا بصراحة وما ينفعش لأي استخدام.',
 'prob_offensive': 0.00019563609384931624,
 'recommendation': 'APPROVE'}

In [12]:
model.save("lstm_profanity_model.h5")

