In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

csv_files = ['/kaggle/input/daigt-proper-train-dataset/train_drcat_01.csv',
             '/kaggle/input/daigt-proper-train-dataset/train_drcat_02.csv', 
             '/kaggle/input/daigt-proper-train-dataset/train_drcat_03.csv', 
             '/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv']

# Merge CSV files
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    dfs.append(df)

combined_df = pd.concat(dfs, ignore_index=True)

# Shuffle combined data
combined_df = combined_df.sample(
    frac=1, random_state=42).reset_index(drop=True)

# Stratified split: train (80%) and temp (20%)
train_df, temp_df = train_test_split(
    combined_df,
    test_size=0.2,
    stratify=combined_df['label'],
    random_state=42
)

# Stratified split: dev (10%) and test (10%) from temp
dev_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df['label'],
    random_state=42
)

# Save the splits
train_df.to_csv('train.csv', index=False)
dev_df.to_csv('dev.csv', index=False)
test_df.to_csv('test.csv', index=False)

In [2]:
import torch
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")


CUDA available: True
GPU name: Tesla P100-PCIE-16GB


In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict
import pandas as pd
import torch


train_df = pd.read_csv('train.csv')
dev_df = pd.read_csv('dev.csv')
test_df = pd.read_csv('test.csv')

# Take smaller samples to speed up training
train_df = train_df.sample(n=2000, random_state=42)  # or frac=0.1
dev_df = dev_df.sample(n=500, random_state=42)
test_df = test_df.sample(n=500, random_state=42)

# Keep only necessary columns
train_df = train_df[['text', 'label']]
dev_df = dev_df[['text', 'label']]
test_df = test_df[['text', 'label']]


2025-05-10 08:18:54.956988: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746865135.144893      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746865135.203476      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Convert pandas DataFrames to HuggingFace Datasets
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# test_manual_df = pd.read_csv('/kaggle/working/test.csv')

In [None]:
# test_manual = pd.read_csv('/kaggle/working/test.csv')

In [None]:
# test_manual = test_manual[test_manual['label'] != 1]

In [None]:
# test_manual = Dataset.from_pandas(test_manual)

In [5]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [6]:
# Tokenize datasets


def tokenize(example):
    return tokenizer(example['text'], padding='max_length', truncation=True, max_length=512)


train_dataset = train_dataset.map(tokenize, batched=True)
dev_dataset = dev_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [7]:
# test_manual = test_manual.map(tokenize, batched=True)

In [8]:
# Set format for PyTorch
train_dataset.set_format(type='torch', columns=[
                         'input_ids', 'attention_mask', 'label'])
dev_dataset.set_format(type='torch', columns=[
                       'input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=[
                        'input_ids', 'attention_mask', 'label'])

In [9]:
# test_manual.set_format(type='torch', columns=[
#                         'input_ids', 'attention_mask', 'label'])

In [10]:
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=2)

if torch.cuda.is_available():
    model = model.cuda()


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [12]:

training_args = TrainingArguments(
    output_dir="./roberta_ai_detector",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [13]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2

In [14]:
import evaluate
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return accuracy.compute(predictions=preds, references=labels)


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [16]:
# Train
trainer.train()

Step,Training Loss
100,0.2258
200,0.1032
300,0.0291
400,0.0744
500,0.0144
600,0.0193
700,0.0272
800,0.0271
900,0.0169
1000,0.0219


TrainOutput(global_step=2500, training_loss=0.02655877690501511, metrics={'train_runtime': 1475.0288, 'train_samples_per_second': 13.559, 'train_steps_per_second': 1.695, 'total_flos': 5262221107200000.0, 'train_loss': 0.02655877690501511, 'epoch': 10.0})

In [17]:
# Evaluate
trainer.evaluate(test_dataset)

{'eval_loss': 0.04996133968234062,
 'eval_accuracy': 0.994,
 'eval_runtime': 7.5826,
 'eval_samples_per_second': 65.94,
 'eval_steps_per_second': 8.308,
 'epoch': 10.0}

In [18]:
print("Evaluation on Dev Set:")
print(trainer.evaluate(eval_dataset=dev_dataset))

print("Evaluation on Test Set:")
print(trainer.evaluate(eval_dataset=test_dataset))

model.save_pretrained("roberta_ai_detector")
tokenizer.save_pretrained("roberta_ai_detector")

Evaluation on Dev Set:
{'eval_loss': 0.01981492154300213, 'eval_accuracy': 0.998, 'eval_runtime': 7.5718, 'eval_samples_per_second': 66.035, 'eval_steps_per_second': 8.32, 'epoch': 10.0}
Evaluation on Test Set:
{'eval_loss': 0.04996133968234062, 'eval_accuracy': 0.994, 'eval_runtime': 7.5393, 'eval_samples_per_second': 66.319, 'eval_steps_per_second': 8.356, 'epoch': 10.0}


('roberta_ai_detector/tokenizer_config.json',
 'roberta_ai_detector/special_tokens_map.json',
 'roberta_ai_detector/vocab.json',
 'roberta_ai_detector/merges.txt',
 'roberta_ai_detector/added_tokens.json')

In [19]:
# Evaluate performance with metrics
results = trainer.evaluate(test_dataset)
print("Test Set Evaluation Metrics:")
print(results)


Test Set Evaluation Metrics:
{'eval_loss': 0.04996133968234062, 'eval_accuracy': 0.994, 'eval_runtime': 7.516, 'eval_samples_per_second': 66.525, 'eval_steps_per_second': 8.382, 'epoch': 10.0}


In [20]:
predictions = trainer.predict(test_dataset)
logits = predictions.predictions
labels = predictions.label_ids

# Convert logits to predicted class
preds = logits.argmax(axis=-1)


In [21]:
for i in range(10):
    print(f"\nText: {test_df.iloc[i]['text'][:200]}...")
    print(f"Actual Label: {labels[i]}, Predicted: {preds[i]}")



Text: I think art edukation is super impotent for kids. Some peoples might say its not that impotent but I disagree. Arts helps kids with theyre imagination and creativity. Like for example when we do art p...
Actual Label: 1, Predicted: 0

Text: As an 8th grader, I find Ralph Waldo Emerson's statement "To be yourself in a world that is constantly trying to make you something else is the greatest accomplishment" to be incredibly inspiring and ...
Actual Label: 1, Predicted: 1

Text: If some schools offer distance learning as an option for students to attend classes from home. By a way of online or video conferencing the student would benefit.

The first reason is the student can ...
Actual Label: 0, Predicted: 0

Text: Hey there!  As an 8th grader, I'm super excited to write this essay about why having a high paying job is worth it.  I mean, who doesn't want to make lots of money and live a comfortable life, right? ...
Actual Label: 1, Predicted: 1

Text: I think it would be fair to s

In [22]:
import pandas as pd

output_df = test_df.copy()
output_df['predicted_label'] = preds
output_df.to_csv("test_predictions.csv", index=False)


In [23]:
output_df

Unnamed: 0,text,label,predicted_label
6133,I think art edukation is super impotent for ki...,1,0
10820,"As an 8th grader, I find Ralph Waldo Emerson's...",1,1
1659,If some schools offer distance learning as an ...,0,0
4520,"Hey there! As an 8th grader, I'm super excite...",1,1
9690,I think it would be fair to say that more than...,0,0
...,...,...,...
12023,"Summer projects, meant to ensure students cont...",0,0
3069,"Dear TEACHER_NAME,\n\nI am writing you on beha...",0,0
7015,The use of Facial Action Coding System (FACS) ...,1,1
1358,Title: A Seven-Day Event Series at a National ...,1,1


In [None]:
!zip -r file.zip /kaggle/working