In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory

# For example, running this (by clickingrun or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ai-vs-human-text/AI_Human.csv
/kaggle/input/suicidewatch/SuicideWatch.csv
/kaggle/input/suicidal-mental-health-dataset/mental-health.csv
/kaggle/input/suicide-watch/Suicide_Detection.csv


In [2]:
import tensorflow as tf
import keras
from keras import layers, optimizers

from keras_hub.tokenizers import WordPieceTokenizer, compute_word_piece_vocabulary

from keras_hub.layers import TokenAndPositionEmbedding, FNetEncoder

import keras_tuner as kt

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('/kaggle/input/ai-vs-human-text/AI_Human.csv')
df

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0
...,...,...
487230,Tie Face on Mars is really just a big misunder...,0.0
487231,The whole purpose of democracy is to create a ...,0.0
487232,I firmly believe that governments worldwide sh...,1.0
487233,I DFN't agree with this decision because a LFT...,0.0


In [4]:
df=df.rename({'generated' : 'class'}, axis = 1)
df.shape

(487235, 2)

In [5]:
train_df, other = train_test_split(
    df,
    test_size = 0.1,
    random_state = 9730,
    shuffle = True,
    stratify = df['class'],
)

val_df, test_df = train_test_split(
    other,
    test_size = 0.5,
    random_state = 9730,
    shuffle = True,
    stratify = other['class'],
)

In [6]:
train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_df['text'].values, train_df['class'].values)
)

val_dataset = tf.data.Dataset.from_tensor_slices(
    (val_df['text'].values, val_df['class'].values)
)

test_dataset = tf.data.Dataset.from_tensor_slices(
    (test_df['text'].values, test_df['class'].values)
)

In [7]:
# convert the text to lowercase
train_dataset = train_dataset.map(lambda text, label: (tf.strings.lower(text), label))
val_dataset = val_dataset.map(lambda text, label: (tf.strings.lower(text), label))
test_dataset = test_dataset.map(lambda text, label: (tf.strings.lower(text), label))

In [8]:
BATCH_SIZE = 16
MAX_SEQLEN = 512
auto = tf.data.AUTOTUNE

In [9]:
train_dataset = train_dataset.batch(BATCH_SIZE).prefetch(auto)
val_dataset = val_dataset.batch(BATCH_SIZE).prefetch(auto)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(auto)

In [10]:
for text_batch, label_batch in train_dataset.take(1):
    for i in range(3):
        print(f"{text_batch.numpy()[i]}\n{label_batch.numpy()[i]}\n")

b'imagine being able to know the emotions of students during class, such as when they are bored, or when they are confused. that\'s what the new facial action coding system would active. this kind of technology in the classrooms would be a necessity for teacher to effects teach to their students, and identify when they need help, or when they are getting bored during their lesson.\n\nstudents get bored during class, and don\'t pay attention because of that. students also get confused during class, but sometimes a bored student can get confused with a student that is confused on what the teacher is taking about and needs help. the facs would change that, because teacher would be able to analyze whether their students are either board or confused, or enjoying their lessons. it would also help teacher identify which lessons their students are enjoying, and would be able to use that data to plan which lessons to include, and which lessons or activity to throw away became they are boring an

In [11]:
#Calculate Vocabulary Size
vocabulary = set()
train_df['text'].str.lower().str.split().apply(vocabulary.update)
vocabulary_size = len(vocabulary)
print(vocabulary_size)

437885


In [12]:
reserved_tokens = ["[PAD]", "[UNK]"]

In [13]:
def train_word_piece(ds, vocab_size, reserved_tokens):
    word_piece_ds = ds.unbatch().map(lambda x, y: x)
    vocab = compute_word_piece_vocabulary(
        word_piece_ds.batch(1024).prefetch(2),
        vocabulary_size=vocabulary_size,
        reserved_tokens=reserved_tokens,
    )
    return vocab

In [14]:
vocab = train_word_piece(train_dataset, vocabulary_size, reserved_tokens)

In [15]:
print(f'Tokens: {vocab[24:101]}')

Tokens: ['7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\xa0', '¡', '¢', '£', '¨', '©', '«', '¬', '®', '°', '²', '´', '¶', '·', '¸', '¹', 'º', 'Á', 'Â', 'Ã', 'Å', 'É', 'Ë', 'Ñ', 'Ó', 'Ö', '×', 'ß', 'à', 'á', 'â']


In [16]:
tokenizer = WordPieceTokenizer(
    vocabulary = vocab,
    lowercase = False,
    sequence_length = MAX_SEQLEN,
)

In [17]:
input_sentence_ex = train_dataset.take(1).get_single_element()[0][0]
input_tokens_ex = tokenizer(input_sentence_ex)

print("Sentence: ", input_sentence_ex)
print("Tokens: ", input_tokens_ex)
print("Recovered text after detokenizing: ", tokenizer.detokenize(input_tokens_ex))

Sentence:  tf.Tensor(b'imagine being able to know the emotions of students during class, such as when they are bored, or when they are confused. that\'s what the new facial action coding system would active. this kind of technology in the classrooms would be a necessity for teacher to effects teach to their students, and identify when they need help, or when they are getting bored during their lesson.\n\nstudents get bored during class, and don\'t pay attention because of that. students also get confused during class, but sometimes a bored student can get confused with a student that is confused on what the teacher is taking about and needs help. the facs would change that, because teacher would be able to analyze whether their students are either board or confused, or enjoying their lessons. it would also help teacher identify which lessons their students are enjoying, and would be able to use that data to plan which lessons to include, and which lessons or activity to throw away beca

In [18]:
def format_dataset(sentence, label):
    sentence = tokenizer(sentence)
    return ({"input_ids": sentence}, label)

def make_dataset(dataset):
    dataset = dataset.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.shuffle(BATCH_SIZE * 8).prefetch(16).cache()

In [19]:
train_dataset = make_dataset(train_dataset)
val_dataset = make_dataset(val_dataset)
test_dataset = make_dataset(test_dataset)

In [20]:
EMBED_DIM = 128
INTERMEDIATE_DIM = 4 * EMBED_DIM

In [32]:
from tensorflow.keras import layers, regularizers, optimizers, Model
from tensorflow.keras.layers import LSTM, Dense, Input
from tensorflow.keras.models import Model

def create_model(vocabulary_size):
    input_ids = Input(shape=(512,), dtype="int64", name="input_ids")
    
    # Embedding layer with masking
    x = TokenAndPositionEmbedding(
        vocabulary_size=vocabulary_size,
        sequence_length=512,
        embedding_dim=128,
        mask_zero=False,
    )(input_ids)

   
    # Bidirectional LSTM with explicit mask
    x = layers.Bidirectional(
        layers.LSTM(
            32,
            activation="tanh",
            kernel_regularizer=regularizers.l2(0.001),
            use_cudnn=False
        )
    )(x)

    # Dense layer for binary classification
    outputs = Dense(1, activation="sigmoid")(x)

    # Compile the model
    model = Model(input_ids, outputs, name="BiEscalator")
    model.compile(
        optimizer=optimizers.Adam(learning_rate=3e-4),
        loss="binary_crossentropy",
        metrics=["accuracy"]
    )
    return model


## Hyperparameter Tuning

In [24]:
'''import keras_tuner as kt
tuner = kt.BayesianOptimization(
    hypermodel = create_model,
    objective = 'val_loss',
    overwrite = True,
    max_trials = 10
)
tuner.search(train_dataset, validation_data = val_dataset, epochs = 10, verbose = 1)'''

"import keras_tuner as kt\ntuner = kt.BayesianOptimization(\n    hypermodel = create_model,\n    objective = 'val_loss',\n    overwrite = True,\n    max_trials = 10\n)\ntuner.search(train_dataset, validation_data = val_dataset, epochs = 10, verbose = 1)"

# Modelling RAHHHHH


In [34]:
Sherlock = create_model(29383)
Sherlock.summary()

In [38]:
mask = Sherlock.layers[1].compute_mask(train_dataset)
print(mask)

None


In [39]:
reduceLR = keras.callbacks.ReduceLROnPlateau(
    monitor = "val_loss",
    factor = 0.0973,
    patience = 3,
    verbose = 1,
    mode = "auto",
    min_delta = 1e-4,
    cooldown = 0,
    min_lr = 0,
)

In [40]:
earlyStop = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    min_delta = 0.001,
    patience = 10,
    verbose = 1,
    mode = "auto",
    restore_best_weights = True,
)

In [41]:
history = Sherlock.fit(
    train_dataset,
    epochs=50, 
    validation_data=val_dataset,
    callbacks=[earlyStop, reduceLR]
)

Epoch 1/50
[1m27407/27407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m780s[0m 28ms/step - accuracy: 0.9685 - loss: 0.1071 - val_accuracy: 0.9967 - val_loss: 0.0143 - learning_rate: 3.0000e-04
Epoch 2/50
[1m27407/27407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m749s[0m 27ms/step - accuracy: 0.9946 - loss: 0.0201 - val_accuracy: 0.9972 - val_loss: 0.0115 - learning_rate: 3.0000e-04
Epoch 3/50
[1m27407/27407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m748s[0m 27ms/step - accuracy: 0.9967 - loss: 0.0123 - val_accuracy: 0.9982 - val_loss: 0.0083 - learning_rate: 3.0000e-04
Epoch 4/50
[1m27407/27407[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m747s[0m 27ms/step - accuracy: 0.9969 - loss: 0.0122 - val_accuracy: 0.9959 - val_loss: 0.0139 - learning_rate: 3.0000e-04
Epoch 5/50
[1m 3875/27407[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m10:26[0m 27ms/step - accuracy: 0.9984 - loss: 0.0063

KeyboardInterrupt: 

In [42]:
import matplotlib.pyplot as plt

def plot_result(item):
    plt.plot(history.history[item], label=item)
    plt.plot(history.history["val_" + item], label="val_" + item)
    plt.xlabel("Epochs")
    plt.ylabel(item)
    plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14)
    plt.legend()
    plt.grid()
    plt.show()


plot_result("loss")
plot_result("accuracy")

NameError: name 'history' is not defined

In [43]:
Sherlock.evaluate(test_dataset)

[1m1523/1523[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.9981 - loss: 0.0077


[0.00760191585868597, 0.9979886412620544]