In [58]:
!pip install tiktoken



In [59]:
import re
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from typing import List, Tuple

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay, confusion_matrix

import keras
import keras_nlp
from keras import layers
from keras import regularizers
import keras_tuner as kt

import tensorflow as tf
from tensorflow import data as tf_data
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Dataset

In [60]:
# Loading the dataset

dataset = load_dataset("yaful/DeepfakeTextDetect")

In [61]:
df_train = dataset["train"].to_pandas()
df_train

Unnamed: 0,text,label,src
0,White girls very rarely date Asian men. Even i...,1,cmv_human
1,I am a 23 year old male Indian American male. ...,1,cmv_human
2,"Take three people, Persons A, B, and C. They l...",1,cmv_human
3,(A) Work part-time in high school; Then go to ...,1,cmv_human
4,When police introduce a new form of speed prev...,1,cmv_human
...,...,...,...
319066,Noisy Intermediate-Scale Quantum (NISQ) machin...,1,sci_gen_human
319067,Recent years have seen rising needs for locati...,1,sci_gen_human
319068,The ongoing neural revolution in machine trans...,1,sci_gen_human
319069,Let D be a set of n pairwise disjoint unit dis...,1,sci_gen_human


In [62]:
df_test = dataset["test"].to_pandas()
df_test

Unnamed: 0,text,label,src
0,Little disclaimer: this deals with US laws and...,1,cmv_human
1,"Read: Mentally Retarded Downs. See, we've got ...",1,cmv_human
2,"If any of you frequent rbadhistory, there is a...",1,cmv_human
3,"I believe in a flat tax system, where everyone...",1,cmv_human
4,"Edit: Ok guy's, my views have been changed on ...",1,cmv_human
...,...,...,...
56814,We consider the recovery of a source term f (x...,1,sci_gen_human
56815,"Self-supervised learning (SlfSL), aiming at le...",1,sci_gen_human
56816,Recurrent neural networks (RNNs) have achieved...,1,sci_gen_human
56817,Deep reinforcement learning (DRL) is a booming...,1,sci_gen_human


In [63]:
# Shuffling the datasets

df_train = df_train.sample(frac=1).reset_index(drop=True)
df_test = df_test.sample(frac=1).reset_index(drop=True)

In [64]:
# Model constants.
max_features = 20000
embedding_dim = 128
sequence_length = 500


vectorize_layer = layers.TextVectorization(
    standardize="lower_and_strip_punctuation",
    max_tokens=max_features,
    output_sequence_length=sequence_length,
    output_mode="int"
)

text_data = df_train['text']
vectorize_layer.adapt(text_data)

## Model

In [65]:
def build_block(last_layer: tf.Tensor, neuron_num: int) -> tf.Tensor:
    
    x = layers.Dense(neuron_num, activation='relu')(last_layer)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(int(neuron_num*1.5), activation='relu')(x)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dropout(0.2)(x)
    
    return x

In [75]:
def make_model(sequence_length: int) -> keras.Model:
    """
        Modèle FFN.

        Input:
            - input_shape - a (height, width, chan) tuple, the shape of the input images
            - num_classes - number of classes the model must learn

        Output:
            model - a Keras Model() instance
    """
    
    x_input = layers.Input(shape=(sequence_length,), dtype="int64")

    x = layers.Embedding(max_features, embedding_dim)(x_input)
    x = layers.Dropout(0.2)(x)

    x = build_block(x, 128)
    
    x = build_block(x, 256)
    
    x = layers.GlobalMaxPooling1D()(x)

    x = build_block(x, 256)
    
    x = build_block(x, 128)
    
    x = build_block(x, 32)

    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

    model = keras.models.Model(x_input, predictions, name='FFN_v1')
    
    return model

In [76]:
model = make_model(sequence_length)

initial_learning_rate = 0.001
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=100000,
    decay_rate=0.97,
    staircase=True
)


model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [77]:
print("Nombre de couches : ", len(model.layers))

Nombre de couches :  30


## Training

In [None]:
X_train = df_train['text']
y_train = df_train['label']

In [None]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

sequences = tokenizer.texts_to_sequences(X_train)

X_train_padded = pad_sequences(sequences, maxlen=sequence_length, padding='post')

print("Shape of X_train_padded:", X_train_padded.shape)

In [81]:
epochs = 10

callbacks = [
    keras.callbacks.ModelCheckpoint("save_at_{epoch}.keras")
]

history = model.fit(
    x=X_train_padded,
    y=y_train,
    epochs=epochs,
    callbacks=callbacks,
    batch_size=64,
    validation_split=0.2
)

Epoch 1/10
[1m   7/3989[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:19[0m 20ms/step - accuracy: 0.5030 - loss: 0.9163  

I0000 00:00:1714938778.772673     236 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
W0000 00:00:1714938778.797220     236 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m3989/3989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.7286 - loss: 0.5434

W0000 00:00:1714938862.647328     234 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update
W0000 00:00:1714938863.763699     234 graph_launch.cc:671] Fallback to op-by-op mode because memset node breaks graph update


[1m3989/3989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 23ms/step - accuracy: 0.7286 - loss: 0.5433 - val_accuracy: 0.6783 - val_loss: 0.7732
Epoch 2/10
[1m3989/3989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 19ms/step - accuracy: 0.8443 - loss: 0.3546 - val_accuracy: 0.7567 - val_loss: 0.7262
Epoch 3/10
[1m3989/3989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 19ms/step - accuracy: 0.8692 - loss: 0.3082 - val_accuracy: 0.7591 - val_loss: 0.7223
Epoch 4/10
[1m3989/3989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 19ms/step - accuracy: 0.8837 - loss: 0.2763 - val_accuracy: 0.7503 - val_loss: 0.8931
Epoch 5/10
[1m3989/3989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 19ms/step - accuracy: 0.8967 - loss: 0.2506 - val_accuracy: 0.7209 - val_loss: 0.8081
Epoch 6/10
[1m3989/3989[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 19ms/step - accuracy: 0.9056 - loss: 0.2307 - val_accuracy: 0.7715 - val_loss: 0.7904
Epoch 7/10
[1

In [88]:
X_test = df_test['text']
y_test = df_test['label']

In [89]:
sequences = tokenizer.texts_to_sequences(X_test)

X_test_padded = pad_sequences(sequences, maxlen=sequence_length, padding='post')

print("Shape of X_train_padded:", X_test_padded.shape)

Shape of X_train_padded: (56819, 500)


In [90]:
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

[1m1776/1776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.5915 - loss: 1.4172
Test Accuracy: 58.90%


## End to End Model

In [93]:
def create_end_model():
    inputs = layers.Input(shape=(1,), dtype="string")

    indices = vectorize_layer(inputs)

    outputs = model(indices)

    end_to_end_model = keras.models.Model(inputs, outputs)
    end_to_end_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    
    return end_to_end_model

In [95]:
end_to_end_model = create_end_model()
end_to_end_model.evaluate(X_test, y_test)

[1m1776/1776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - accuracy: 0.5122 - loss: 1.7458


[1.7640995979309082, 0.5108854174613953]