In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from datasets import load_dataset

In [2]:
# Load civil_comments dataset
dataset = load_dataset("google/civil_comments")
print(dataset['train'][0])

{'text': "This is so cool. It's like, 'would you want your mother to read this??' Really great idea, well done!", 'toxicity': 0.0, 'severe_toxicity': 0.0, 'obscene': 0.0, 'threat': 0.0, 'insult': 0.0, 'identity_attack': 0.0, 'sexual_explicit': 0.0}


In [None]:
# load data
df = pd.DataFrame(dataset['train'])

labels = df[df.columns[2:]].values

# texts
texts = df['text'].astype(str).values

In [4]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Apply cleaning
df['text'] = df['text'].map(lambda x: clean_text(x))

In [5]:
df['text'].isnull().sum()

np.int64(0)

In [6]:
is_empty_string = df['text'] == ''
count_empty_string = is_empty_string.sum()
print("Number of values ​​that equal an empty string:", count_empty_string)

Number of values ​​that equal an empty string: 339


In [7]:
empty_string_df = df[is_empty_string]
print("DataFrame containing empty text:\n", empty_string_df)


DataFrame containing empty text:
         text  toxicity  severe_toxicity  obscene  threat  insult  \
276                0.1              0.0      0.0     0.0     0.0   
316                0.0              0.0      0.0     0.0     0.0   
10598              0.0              0.0      0.0     0.0     0.0   
20487              0.0              0.0      0.0     0.0     0.0   
33283              0.0              0.0      0.0     0.0     0.0   
...      ...       ...              ...      ...     ...     ...   
1766576            0.0              0.0      0.0     0.0     0.0   
1767014            0.0              0.0      0.0     0.0     0.0   
1785002            0.0              0.0      0.0     0.0     0.0   
1795472            0.0              0.0      0.0     0.0     0.0   
1796225            0.0              0.0      0.0     0.0     0.0   

         identity_attack  sexual_explicit  
276                  0.0              0.1  
316                  0.0              0.0  
10598            

In [8]:
empty_string_rows = df[df['text'] == '']
print(f"Number of rows with empty string in 'text': {len(empty_string_rows)}")


Number of rows with empty string in 'text': 339


In [9]:
# Remove the rows where text has an empty string
df = df[df['text'] != '']

In [10]:
empty_string_rows = df[df['text'] == '']
print(f"Number of rows with empty string in 'text': {len(empty_string_rows)}")


Number of rows with empty string in 'text': 0


In [11]:
texts = df['text'].astype(str).values


In [None]:
filtered_texts = []
filtered_labels = []

for text, label in zip(texts, labels):
    if text.strip():  
        filtered_texts.append(text)
        filtered_labels.append(label)

texts = filtered_texts
labels = filtered_labels

In [13]:
# save clean dataset 
import numpy as np

# NumPy arrays
texts = np.array(texts)
labels = np.array(labels)

# save
np.save('clean_texts.npy', texts)
np.save('clean_labels.npy', labels)

In [None]:
# Create a vectorizer
vectorizer = TextVectorization(
    max_tokens=20000,
    output_sequence_length=300,  
    output_mode='int'
)

# Training the vectorizer on texts
vectorizer.adapt(texts)

MemoryError: Unable to allocate 12.5 GiB for an array with shape (1804512,) and data type <U1855

In [None]:
vectorizer('Hello, Deep learing is non eazy')

In [None]:
# Convert data to Dataset
dataset = tf.data.Dataset.from_tensor_slices((texts, labels))

# Apply vectorization
dataset = dataset.map(lambda x, y: (vectorizer(x), y), num_parallel_calls=tf.data.AUTOTUNE)

# Data organization
dataset = dataset.cache()
dataset = dataset.shuffle(1804874)
dataset = dataset.batch(32)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

In [None]:
for x_batch, y_batch in dataset.take(1):
    print("x_batch shape:", x_batch.shape)
    print("y_batch shape:", y_batch.shape)

In [None]:
len(dataset)

In [None]:
batch_X , batch_Y = dataset.as_numpy_iterator().next()

In [None]:
batch_Y.shape

In [None]:
batch_X.shape

In [None]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dropout, Bidirectional, Dense, Embedding

In [None]:
model = Sequential()
model.add(Input(shape=(300,)))
# Create the embedding layer
model.add(Embedding(input_dim=20001, output_dim=32, input_length=300))
# Bidirectional LSTM Layer
model.add(Bidirectional(LSTM(32, activation='tanh')))
# Feature extractor Fully connected layers
model.add(Dense(128, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
# Final layer
model.add(Dense(6, activation='sigmoid'))

In [None]:
model.compile(loss='BinaryCrossentropy', optimizer='Adam')

In [None]:
model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True
)

In [None]:
history = model.fit(train, 
                    epochs=5, 
                    validation_data=val,
                    batch_size=64,
                    callbacks=[early_stop]       
                   )

In [None]:
model.save('final_model.h5')
print("model saved successfully!")


In [None]:
from matplotlib import pyplot as plt
plt.figure(figsize=(8,5))
pd.DataFrame(history.history).plot()
plt.show()