In [29]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
import tensorflow_text as text  # Registers the ops.
import tensorflow_hub as hub
import matplotlib.pyplot as plt

# Dataset Loading

#### Load the datasets directly from their input paths; downloading the Kaggle datasets first and then using those copies here threw an error for me.

In [None]:
# Competition training essays. The bare name on the last line renders the
# frame as this cell's output.
train_essay = pd.read_csv("./input/llm-detect-ai-generated-text/train_essays.csv")
train_essay

In [None]:
# Competition test essays; the 'id' and 'text' columns of this frame are used
# later for prediction and for building the submission file.
test_essay = pd.read_csv("./input/llm-detect-ai-generated-text/test_essays.csv")
test_essay

In [None]:
# Submission template shipped with the competition data. This cell used to
# re-read test_essays.csv, which made `sample_sub` a plain copy of `test_essay`.
sample_sub = pd.read_csv("./input/llm-detect-ai-generated-text/sample_submission.csv")
sample_sub

# Data Analysis and Visualization

In [None]:
train_essay.info()

In [None]:
train_essay['prompt_id'].value_counts()

In [None]:
# Bar chart: number of essays per prompt.
sns.countplot(data=train_essay, x='prompt_id')
plt.show()

In [None]:
# Share of essays per prompt, as percentages.
train_essay['prompt_id'].value_counts().plot.pie(autopct="%.1f%%")
plt.title("Prompt ID")
plt.show()

In [None]:
# Share of each target label -- to see what to take as base.
train_essay['generated'].value_counts().plot.pie(autopct="%.1f%%")
plt.title("Target label")
plt.show()

#### We observe that, with respect to 'generated', the data is highly imbalanced, so we will be using 'prompt_id'.

In [None]:
train_essay.head()

In [None]:
# Rows of the competition training set labelled as generated (generated == 1).
is_generated = train_essay['generated'] == 1
ai_df = train_essay.loc[is_generated]
del is_generated  # keep the notebook namespace exactly as before
ai_df

In [None]:
train_essay

# Data Pre-processing

### We will use `stopwords_text`: it is split into individual words, and all of these words are to be removed from the essays loaded earlier.

In [41]:
# Space-separated English stopwords.
# The original literal opened with four double quotes and carried another one
# before the closing triple quote, so after split() the first token was '"i'
# and the last was 'wouldn\'t"' -- neither could ever match a real word.
stopwords_text = "i me my myself we our ours ourselves you you're you've you'll you'd your yours yourself yourselves he him his himself she she's her hers herself it it's its itself they them their theirs themselves what which who whom this that that'll these those am is are was were be been being have has had having do does did doing a an the and but if or because as until while of at by for with about against between into through during before after above below to from up down in out on off over under again further then once here there when where why how all any both each few more most other some such only own same so than too very s t can will just don don't should should've now d ll m o re ve y ain aren aren't couldn couldn't didn didn't doesn doesn't hadn hadn't hasn hasn't haven haven't isn isn't ma mightn mightn't mustn mustn't needn needn't shan shan't shouldn shouldn't wasn wasn't weren weren't won won't wouldn wouldn't"
# One list entry per stopword (split on any whitespace).
stopwords_list = stopwords_text.split()

In [None]:
len(stopwords_list)

# Using custom datasets to feed in more data

#### This was done so that the model trains better on a wider variety of data.

In [43]:
# First external corpus; it is concatenated with df2 and df3 into `train_data` below.
# NOTE(review): presumably already has 'text' and 'generated' columns, since it is
# concatenated without renaming -- confirm against the file.
df = pd.read_csv("./input/dataset-4/Training_Essay_Data.csv")

In [None]:
df

In [None]:
# External corpus: keep only the essay text and its label, then rename the
# label column to 'generated' (positional rename of the two kept columns).
df2 = pd.read_csv("./input/dataset-2/train_drcat_04.csv").loc[:, ['text', 'label']]
df2.columns = ['text', 'generated']
df2

In [46]:
#df2.drop_duplicates(inplace=True,ignore_index=True)

In [47]:
#df2

In [None]:
# Another external corpus. Its columns are renamed by position to
# ('text', 'generated'), the same names df2 was given.
# NOTE(review): the assignment raises unless the CSV has exactly two columns, and it
# presumes they are ordered (essay text, label) -- confirm against the file.
df3 = pd.read_csv("./input/dataset-2/train_essays_RDizzl3_seven_v1.csv")
df3.columns = ['text','generated']
df3

In [None]:
# Stack the three external corpora row-wise under a fresh 0..n-1 index.
train_data = pd.concat([df3, df2, df], ignore_index=True)
train_data

In [50]:
# Drop fully duplicated rows and renumber the index.
train_data = train_data.drop_duplicates(ignore_index=True)

In [None]:
train_data

In [None]:
# Dataset
d1 = pd.read_csv("./input/dataset-3/falcon_180b_v1.csv")
d1

In [None]:
# Dataset
d2 = pd.read_csv("./input/dataset-3/llama_70b_v1.csv")
d2

In [None]:
# Merge the two files and mark every row as generated (label 1).
data = pd.concat([d1, d2], ignore_index=True)
data['generated'] = 1
# Positional rename of the three columns, then keep only text + label.
data.columns = ['text', 'writing_prompt', 'generated']
data = data.loc[:, ['text', 'generated']]
data

In [None]:
# Append the generated-only rows to the de-duplicated external corpora.
Train_Data = pd.concat([train_data, data], ignore_index=True)
Train_Data

In [None]:
# LLM_generated_essay_PaLM.csv; its 'text' and 'generated' columns are the
# ones used in the following cells.
d = pd.read_csv("./input/dataset-3/LLM_generated_essay_PaLM.csv")
d

In [57]:
# Cast the label column to int.
# NOTE(review): presumably so its dtype matches the labels in Train_Data; this
# raises if the column holds missing values -- confirm.
d['generated'] = d['generated'].astype(int)

In [None]:
# Keep only the two columns shared with Train_Data.
dd = d.loc[:, ['text', 'generated']]
dd

In [None]:
# Append the rows of `dd` to the running training frame.
Train_Data = pd.concat([Train_Data, dd], ignore_index=True)
Train_Data

In [60]:
# Remove exact duplicate rows introduced by stacking sources; renumber the index.
Train_Data = Train_Data.drop_duplicates(ignore_index=True)

In [None]:
Train_Data

In [None]:
Train_Data['generated'].value_counts()

In [None]:
# Class shares of the combined training frame, as percentages.
Train_Data['generated'].value_counts().plot.pie(autopct="%.1f%%")
plt.title("Target Column Distributions")
plt.show()

In [None]:
# Absolute row counts per class in the combined training frame.
sns.countplot(x=Train_Data['generated'])
# The original line was `plt.show` without parentheses: it only referenced the
# function (echoing its repr as cell output) and never called it.
plt.show()

# Balancing the dataset

In [65]:
# function to balance label_counts

def balance_labels(df, label_column='generated', random_state=42):
    """Oversample every smaller class up to the size of the largest class.

    Each class with fewer rows than the largest class is topped up with rows
    drawn from that same class with replacement, so every label ends up with
    the same number of rows.

    Args:
        df: DataFrame containing ``label_column``.
        label_column: Name of the column holding the class labels.
        random_state: Seed passed to ``DataFrame.sample`` when drawing the
            duplicated rows (default 42, the value previously hard-coded).

    Returns:
        A new DataFrame with a fresh 0..n-1 index. Classes appear in
        ``value_counts`` order; within a class the original rows come first,
        followed by the resampled duplicates. ``df`` itself is not modified.
        If ``df`` has no labelled rows, a copy of ``df`` is returned.
    """
    label_counts = df[label_column].value_counts()
    if label_counts.empty:
        # Nothing to balance: keep the columns rather than a bare DataFrame().
        return df.copy()
    max_count = label_counts.max()

    # Collect the pieces and concatenate once; growing a frame by repeated
    # concat (seeded with an empty DataFrame) re-copies the data on every pass.
    pieces = []
    for label, count in label_counts.items():
        label_subset = df[df[label_column] == label]
        pieces.append(label_subset)
        deficit = max_count - count
        if deficit > 0:
            pieces.append(
                label_subset.sample(n=deficit, replace=True, random_state=random_state)
            )

    return pd.concat(pieces, ignore_index=True)


In [66]:
# Balance labels in the dataframe.
# Train_Data is rebound to the oversampled frame, so from here on it contains
# duplicated rows of the smaller class.
Train_Data=balance_labels(Train_Data)

In [None]:
# Class shares after balancing, as percentages.
Train_Data['generated'].value_counts().plot.pie(autopct="%.1f%%")
plt.title("Target Column Distributions")
plt.show()

# Model Building

### First, we split the training dataset we have into train and test sets.

In [None]:
# Hold out a small validation slice (0.9% of the rows).
# `random_state` pins the shuffle so the split is the same after a kernel
# restart; `stratify` keeps the class proportions equal on both sides of
# such a tiny hold-out.
# NOTE(review): Train_Data was oversampled with replacement before this split,
# so duplicated rows can land on both sides and the hold-out score is optimistic.
x_train, x_test, y_train, y_test = train_test_split(
    Train_Data.text,
    Train_Data.generated,
    test_size=0.009,
    shuffle=True,
    random_state=42,
    stratify=Train_Data.generated,
)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
x_train

## Using the BERT model for predictions

In [76]:
# Hub handles: the uncased English BERT encoder (L-12, H-768, A-12 per the
# handle name) and the uncased English preprocessing model.
model_path ="https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-12-h-768-a-12/2"
preprocess_path = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3/"

In [77]:
# Model input: one scalar string (a raw essay) per example.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)
# Preprocessing layer loaded from `preprocess_path`; per the model summary it
# yields a dict with 'input_word_ids', 'input_mask' and 'input_type_ids',
# each of shape (None, 128).
preprocessor = hub.KerasLayer(preprocess_path)
encoder_inputs  = preprocessor(text_input) # this is basically the preprocessed text

In [78]:
## Use BERT Model
# trainable=True: the encoder weights are updated during model.fit together
# with the classification head.
encoder = hub.KerasLayer(model_path, trainable=True)
outputs = encoder(encoder_inputs)
pooled_output = outputs['pooled_output'] # [batch_size, 768] -- hidden size per the "h-768" in model_path.
sequence_output = outputs["sequence_output"] # [batch_size, seq_length, 768]; not consumed by the head built from pooled_output.

In [79]:
# Classification head on the pooled BERT output:
# Dropout(0.51) -> Dense(64, relu) -> Dropout(0.3) -> Dense(1, sigmoid).
# The name `dropout` is reused for both dropout tensors.
dropout = tf.keras.layers.Dropout(0.51 , name="dropout1")(pooled_output)
dense_2 = tf.keras.layers.Dense(64 , activation='relu')(dropout)
dropout = tf.keras.layers.Dropout(0.3 , name="dropout2")(dense_2)

# Single sigmoid unit: one value in (0, 1) per input essay.
dense_out = tf.keras.layers.Dense(1 , activation='sigmoid', name='output')(dropout)

# End-to-end model: raw string in, probability out.
model = tf.keras.Model(inputs=text_input, outputs=dense_out)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['input_3[0][0]']                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128)}                                                  

In [80]:
#model.compile(optimizer='adam', loss='binary_crossentropy',metrics=["accuracy"]) # using adam gave better results as compared to rmse
#history = model.fit(x_train, y_train , batch_size=512 , epochs=10 , validation_data=(x_test, y_test)) 
# using smaller batch size gave better predictions as it improved accuracy but the submission accuracy reduced

In [None]:
# Very small learning rate because the whole BERT encoder is being fine-tuned.
# The metric is registered as "accuracy" so Keras logs it as `accuracy` /
# `val_accuracy`. With the previous "acc" the validation metric was logged as
# `val_acc`, the monitored `val_accuracy` never existed, and ModelCheckpoint
# skipped every save.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-6),
              loss='binary_crossentropy',
              metrics=["accuracy"])

checkpoint_filepath = 'checkpoint.hdf5'
metric = 'val_accuracy'  # must match the name Keras logs for the validation metric
# Keep only the weights from the epoch with the highest validation accuracy.
callback_list = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath, monitor=metric,
                    verbose=2, save_best_only=True, mode='max')
history = model.fit(x_train, y_train , batch_size=8, callbacks=[callback_list],epochs=1 , validation_data=(x_test, y_test))
# model.load_weights(checkpoint_filepath)
# Persist the final (last-epoch) model to the "model-bert" path.
model.save("model-bert")

   43/10222 [..............................] - ETA: 6:25:24 - loss: 0.9980 - acc: 0.5058

In [None]:
# Report accuracy on both splits; `loss` and `acc` are overwritten by the
# second call, so after this cell they hold the hold-out values.
# NOTE(review): the first call runs a full forward pass over all of x_train,
# which is expensive given the training ETA shown above -- confirm it is wanted.
loss , acc = model.evaluate(x_train, y_train)
print("Accuracy on Train data:",acc)
loss , acc = model.evaluate(x_test, y_test)
print("Accuracy on Test data:",acc)

In [None]:
test_essay

# Predictions

In [None]:
# Score the competition test essays; the model's output layer is a single
# sigmoid unit, so each row of y_pred holds one value.
y_pred = model.predict(test_essay['text'])

In [None]:
y_pred

In [None]:
y_pred[:,0]

# Saving the output in a CSV file

In [53]:
# Build the submission frame: 'id' comes from test_essay and 'generated' is the
# first (only) column of the model's sigmoid output, y_pred[:, 0].
# Written without the index, in the two-column format required for submissions.
submission = pd.DataFrame({'id': test_essay['id'] , 'generated': y_pred[:,0] })
submission.to_csv('submission.csv', index=False)  # Save the CSV file

In [None]:
# pd.read_csv("/kaggle/working/submission.csv")