In [1]:
import re
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset
from typing import List, Tuple

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay, confusion_matrix

import keras
import keras_nlp
from keras import layers
from keras import regularizers
import keras_tuner as kt

import tensorflow as tf
from tensorflow import data as tf_data
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-05-18 16:48:57.628125: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-18 16:48:57.628213: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-18 16:48:57.718197: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Dataset

In [2]:
# Fetch the DeepfakeTextDetect corpus with the Hugging Face `datasets` loader.
# The returned object is indexed by split name below (e.g. "train", "test").

dataset = load_dataset(path="yaful/DeepfakeTextDetect")

Downloading data: 100%|██████████| 233M/233M [00:01<00:00, 175MB/s]  
Downloading data: 100%|██████████| 42.3M/42.3M [00:00<00:00, 141MB/s] 
Downloading data: 100%|██████████| 41.9M/41.9M [00:00<00:00, 135MB/s] 
Downloading data: 100%|██████████| 1.29M/1.29M [00:00<00:00, 13.2MB/s]
Downloading data: 100%|██████████| 2.13M/2.13M [00:00<00:00, 21.6MB/s]


Generating train split:   0%|          | 0/319071 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/56792 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/56819 [00:00<?, ? examples/s]

Generating test_ood_gpt split:   0%|          | 0/1562 [00:00<?, ? examples/s]

Generating test_ood_gpt_para split:   0%|          | 0/2362 [00:00<?, ? examples/s]

In [3]:
# Convert the "train" split to a pandas DataFrame. The bare name on the last
# line makes the notebook display a preview of the frame.
df_train = dataset["train"].to_pandas()
df_train

Unnamed: 0,text,label,src
0,White girls very rarely date Asian men. Even i...,1,cmv_human
1,I am a 23 year old male Indian American male. ...,1,cmv_human
2,"Take three people, Persons A, B, and C. They l...",1,cmv_human
3,(A) Work part-time in high school; Then go to ...,1,cmv_human
4,When police introduce a new form of speed prev...,1,cmv_human
...,...,...,...
319066,Noisy Intermediate-Scale Quantum (NISQ) machin...,1,sci_gen_human
319067,Recent years have seen rising needs for locati...,1,sci_gen_human
319068,The ongoing neural revolution in machine trans...,1,sci_gen_human
319069,Let D be a set of n pairwise disjoint unit dis...,1,sci_gen_human


In [4]:
# Convert the "test" split to a pandas DataFrame. The bare name on the last
# line makes the notebook display a preview of the frame.
df_test = dataset["test"].to_pandas()
df_test

Unnamed: 0,text,label,src
0,Little disclaimer: this deals with US laws and...,1,cmv_human
1,"Read: Mentally Retarded Downs. See, we've got ...",1,cmv_human
2,"If any of you frequent rbadhistory, there is a...",1,cmv_human
3,"I believe in a flat tax system, where everyone...",1,cmv_human
4,"Edit: Ok guy's, my views have been changed on ...",1,cmv_human
...,...,...,...
56814,We consider the recovery of a source term f (x...,1,sci_gen_human
56815,"Self-supervised learning (SlfSL), aiming at le...",1,sci_gen_human
56816,Recurrent neural networks (RNNs) have achieved...,1,sci_gen_human
56817,Deep reinforcement learning (DRL) is a booming...,1,sci_gen_human


In [5]:
# Shuffling the datasets
#
# A fixed seed keeps the row order identical across kernel restarts. This
# matters because the training cell uses `validation_split=0.2`, which holds
# out the tail of the (shuffled) training rows: without a seed, a different
# validation set is drawn on every run and results are not reproducible.
SEED = 42

df_train = df_train.sample(frac=1, random_state=SEED).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=SEED).reset_index(drop=True)

In [6]:
# Hyperparameters shared by the text vectorizer, the model and the training loop.
max_features = 20_000    # vocabulary cap: TextVectorization max_tokens / Embedding input size
embedding_dim = 128      # width of each token embedding vector
sequence_length = 500    # token ids per example (vectorizer output length, model input length)
epochs = 5               # number of passes over the training data in model.fit

In [7]:
def make_model(sequence_length: int) -> keras.Model:
    """
        Build an (uncompiled) two-layer bidirectional LSTM binary classifier.

        Architecture, in order: int64 token-id input -> Embedding ->
        Bidirectional LSTM(64) returning the full sequence ->
        Bidirectional LSTM(64) returning only its final output ->
        Dense(1, sigmoid).

        The embedding size is taken from the module-level constants
        `max_features` (vocabulary size) and `embedding_dim`.

        Input:
            - sequence_length - an int, the number of token ids per example

        Output:
            model - a Keras Model() instance named 'RNN_LSTM_v1'
    """
    token_ids = layers.Input(shape=(sequence_length,), dtype="int64")

    embedded = layers.Embedding(max_features, embedding_dim)(token_ids)

    # First recurrent layer keeps every timestep so the second one can consume
    # a sequence; the second collapses it to a single vector per example.
    per_step_features = layers.Bidirectional(
        layers.LSTM(64, return_sequences=True)
    )(embedded)
    summary_vector = layers.Bidirectional(layers.LSTM(64))(per_step_features)

    # Single sigmoid unit: one probability per example.
    probability = layers.Dense(1, activation="sigmoid", name="predictions")(summary_vector)

    return keras.models.Model(inputs=token_ids, outputs=probability, name='RNN_LSTM_v1')

In [None]:
# Train and evaluate one model per n-gram size (unigrams, then trigrams).

# Texts and labels are the same for every n-gram setting, so select them once
# instead of on every loop iteration.
X_train = df_train['text']
y_train = df_train['label']
X_test = df_test['text']
y_test = df_test['label']
text_data = X_train  # corpus the vectorizer vocabulary is fitted on

# NOTE(review): with 319,071 training rows, batch_size=64 and a 20% validation
# split, 5 epochs amount to roughly 20k optimizer steps, so a staircase decay
# every 100,000 steps never fires and the learning rate stays at its initial
# value for the whole run. Kept unchanged; lower `decay_steps` if decay is
# actually intended.
initial_learning_rate = 0.001
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=100000,
    decay_rate=0.97,
    staircase=True
)

for i in range(2):
    ngram_size = i * 2 + 1  # 1 (unigrams), then 3 (trigrams)

    # In Keras a tuple passed to `ngrams` is the explicit list of n-gram sizes
    # to emit, NOT a scikit-learn style (min, max) range. `(n, n)` therefore
    # asks for size-n n-grams twice, duplicating every token and halving the
    # amount of text that fits in `sequence_length`. A one-element tuple
    # requests exactly the size we want.
    vectorize_layer = layers.TextVectorization(
        standardize="lower_and_strip_punctuation",
        max_tokens=max_features,
        output_sequence_length=sequence_length,
        output_mode="int",
        ngrams=(ngram_size,),
    )

    # Build the vocabulary from the training texts only.
    vectorize_layer.adapt(text_data)

    # Fresh model and optimizer for each n-gram setting.
    model = make_model(sequence_length)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr_schedule),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    X_train_vectorized = vectorize_layer(X_train)

    # Include the n-gram size in the checkpoint name so the second run does
    # not overwrite the first run's files. `{{epoch}}` stays a literal
    # `{epoch}` placeholder that ModelCheckpoint fills in at save time.
    callbacks = [
        keras.callbacks.ModelCheckpoint(f"save_ngram{ngram_size}_at_{{epoch}}.keras")
    ]

    history = model.fit(
        x=X_train_vectorized,
        y=y_train,
        epochs=epochs,
        callbacks=callbacks,
        batch_size=64,
        validation_split=0.2
    )

    # Evaluate on the held-out test split using the same vectorizer.
    X_test_vectorized = vectorize_layer(X_test)

    loss, accuracy = model.evaluate(X_test_vectorized, y_test)
    print(f'Test Accuracy ngram {ngram_size}: {accuracy * 100:.2f}%')