# Artificial Text Detection


First, let's install all necassary packages, supress warnings, and mound Google Drive.

In [1]:
!pip install transformers # supports Transformer-based models
!pip install datasets # datasets for experiments
!pip install evaluate # evaluation metrics for experiments
!pip install transformers[torch] # backend for training

Collecting datasets
  Downloading datasets-2.17.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-15.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow, dill, multiprocess, datasets
  Attempting uninstall: pyarrow
    Found exis

In [2]:
from transformers.utils import logging

logging.set_verbosity_error()

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [5]:
!mkdir '/content/drive/My Drive/atd'
!mkdir '/content/drive/My Drive/atd/data'
!mkdir '/content/drive/My Drive/atd/model'
output_path = '/content/drive/My Drive/atd'

mkdir: cannot create directory ‘/content/drive/My Drive/atd’: File exists
mkdir: cannot create directory ‘/content/drive/My Drive/atd/data’: File exists
mkdir: cannot create directory ‘/content/drive/My Drive/atd/model’: File exists


Next, import pandas to manipulate data and fix the random seed abnd tqdm to control exec time .

In [6]:
import pandas as pd # data manipulation & storage
from tqdm.auto import tqdm

In [7]:
from transformers import  set_seed # fix random seed
set_seed(0)

In [8]:
import numpy as np

## Building artifical text detection

Now, let's load the data


In [9]:
# Read dev data from the disk
texts_dev = pd.read_csv("/content/drive/MyDrive/atd/data/dev.csv")
# Rename columns on "text" and "labels"
texts_dev = texts_dev.rename(columns={'Text': 'text', 'Class': 'labels'})
texts_dev

Unnamed: 0,ID,text,labels
0,0,As a German myself I like so much I want...,M
1,1,The trailer certainly did its job attracting t...,H
2,2,The truth is that about _________ that I canno...,M
3,3,"I was surprised at first ive seen it,it looks ...",M
4,4,oh dear. waves of kids being murdered in favel...,H
...,...,...,...
1995,1995,"""Across 110th Street"" was more than just a cli...",H
1996,1996,"first off, I'd like to ~~thank~~ commend you o...",M
1997,1997,I was telling my kids about the bad things in...,M
1998,1998,This is a movie that sadly most people will ne...,H


Now, we split the dataset into three parts (dev dataset on two parts and a separate test dataset is the third part) and create a `DatasetDict` object, which we will further feed to the classifier.

In [10]:
from sklearn.model_selection import train_test_split  # import the train_test_split function from the sklearn library

# Dividing the whole train dataset in train and validation dataset
train = texts_dev
val, train = train_test_split(train, test_size=0.9) # test size is 0.9, so 1800 samples will be train and 200 - validation

# Reset the index of the dataframes after splitting
train.reset_index(inplace=True)
val.reset_index(inplace=True)

In [11]:
# Read test data from the disk
test = pd.read_csv("/content/drive/MyDrive/atd/data/test.csv")
# Rename column on 'text'
test = test.rename(columns={'Text': 'text'})
test

Unnamed: 0,ID,text
0,0,Although The Mole People isn't the best of Uni...
1,1,"This is a terrible, terrible film.<br /><br />..."
2,2,I had been warned about Mike Leigh's 'All or N...
3,3,"Burt Lancaster(who I just thought was great, t..."
4,4,"It's not plot driven, OK; !!!!!! ----------..."
...,...,...
19995,19995,People don't realize this is the first all dig...
19996,19996,I wasn't going to write a review but I had to ...
19997,19997,We're deep into student film urchins; as far a...
19998,19998,Its about time for a Marvel movie to be great....


In [12]:
from datasets import Dataset, DatasetDict # import necessary modules for creating datasets

# Create an empty DatasetDict object which will be train and validation set
ds = DatasetDict()

# add  datasets to the DatasetDict with specified keys
# each dataset is created from a pandas dataframe (train, val)
ds['train'] = Dataset.from_pandas(train)
ds['validation'] = Dataset.from_pandas(val)

print(ds)


DatasetDict({
    train: Dataset({
        features: ['index', 'ID', 'text', 'labels'],
        num_rows: 1800
    })
    validation: Dataset({
        features: ['index', 'ID', 'text', 'labels'],
        num_rows: 200
    })
})


In [13]:
# Create an empty DatasetDict object which will be test set
test_ds = DatasetDict()

# Add  datasets to the DatasetDict with specified keys
# The dataset is created from a pandas dataframe (test_s)
test_ds['test_s'] = Dataset.from_pandas(test)
print(test_ds)

DatasetDict({
    test_s: Dataset({
        features: ['ID', 'text'],
        num_rows: 20000
    })
})


In [14]:
# Save the dataset to disk
ds.save_to_disk(f'{output_path}/data/dataset')

Saving the dataset (0/1 shards):   0%|          | 0/1800 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/200 [00:00<?, ? examples/s]

We define the label convertores.

In [15]:
# map class IDs to labels
id2label = {0: 'H', 1: 'M'}

# map labels to class IDs
label2id = {'H': 0, 'M': 1}


Let stsrt building the model! The first step is to preprocess the texts.

We import the `AutoTokenizer` class from the transformers library.
Then we load a pre-trained tokenizer for the `distilbert-base-uncased` model. A tokenizer is necessary to convert text data into a format that can be fed into the model for processing.

In [16]:
from transformers import AutoTokenizer # import the AutoTokenizer class from the transformers library

# load a pre-trained tokenizer for the 'distilbert-base-uncased' model
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
# Preprocess the texts by tokenizing them using the tokenizer's dictionary and mapping the labels to their respective ids
def preprocess(batch):
    # Tokenize and truncate texts to have 128 tokens and pad, when necessary
    tokenized_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)

    # Convert labels
    tokenized_batch['labels'] = [label2id[label] for label in batch['labels']]

    # Return processed data
    return tokenized_batch

In [18]:
# The separate function for tokenizing and preprocess the test dataset (as it is without the "labels")
def test_preprocess(batch):
    # Tokenize and truncate texts to have 128 tokens and pad, when necessary
    tokenized_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=128)
    return tokenized_batch

This code applies the preprocess function to the dataset ds using batch processing. This means that the function will be applied to the data in chunks or batches, rather than one entry at a time. This can be more memory-efficient and faster.

In [19]:
# Apply the 'preprocess' function to the dataset 'ds' using batch processing
tokenized_ds = ds.map(preprocess, batched=True)
tokenized_ds

Map:   0%|          | 0/1800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'ID', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1800
    })
    validation: Dataset({
        features: ['index', 'ID', 'text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [20]:
# Apply the 'test_preprocess' function to the dataset 'test_ds' using batch processing
tokenized_test_ds = test_ds.map(test_preprocess, batched=True)
tokenized_test_ds

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

DatasetDict({
    test_s: Dataset({
        features: ['ID', 'text', 'input_ids', 'attention_mask'],
        num_rows: 20000
    })
})

Let us define the model architecure. We will use the `distilbert-base-uncased` model as a backbone for binary predicitions.

In [21]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer # import necessary components from the transformers library

# initialize a model for sequence classification (e.g. for text classification)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [22]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [23]:
# Define the training arguments for the model
training_args = TrainingArguments(
    output_dir=f'tmp/',                           # directory to save the model and results
    learning_rate=2e-5,                           # learning rate for optimization
    per_device_train_batch_size=32,               # batch size per GPU for training
    per_device_eval_batch_size=32,                # batch size per GPU for evaluation
    num_train_epochs=5,                           # number of training epochs
    weight_decay=0.01,                            # weight decay for regularization
    evaluation_strategy='epoch',                  # evaluation strategy during training (per epoch)
    save_strategy='epoch',                        # saving strategy during training (per epoch)
    load_best_model_at_end=True,                  # load the best model at the end of training
)

In [24]:
# Intialize the Trainer with necessary components and settings
trainer = Trainer(
    model=model,                                  # model to be trained
    args=training_args,                           # training arguments defined above
    train_dataset=tokenized_ds['train'],          # training dataset
    eval_dataset=tokenized_ds['validation'],      # validation dataset
    tokenizer=tokenizer                           # tokenizer for data processing
)

Finally let's train the model!

In [25]:
# Train the model
trainer.train()

{'eval_loss': 0.1845952868461609, 'eval_runtime': 0.6414, 'eval_samples_per_second': 311.824, 'eval_steps_per_second': 10.914, 'epoch': 1.0}
{'eval_loss': 0.12146129459142685, 'eval_runtime': 0.6638, 'eval_samples_per_second': 301.293, 'eval_steps_per_second': 10.545, 'epoch': 2.0}
{'eval_loss': 0.09179668128490448, 'eval_runtime': 0.7089, 'eval_samples_per_second': 282.146, 'eval_steps_per_second': 9.875, 'epoch': 3.0}
{'eval_loss': 0.08735299855470657, 'eval_runtime': 0.6716, 'eval_samples_per_second': 297.806, 'eval_steps_per_second': 10.423, 'epoch': 4.0}
{'eval_loss': 0.08174071460962296, 'eval_runtime': 0.6812, 'eval_samples_per_second': 293.593, 'eval_steps_per_second': 10.276, 'epoch': 5.0}
{'train_runtime': 112.2818, 'train_samples_per_second': 80.155, 'train_steps_per_second': 2.538, 'train_loss': 0.1332027502227248, 'epoch': 5.0}


TrainOutput(global_step=285, training_loss=0.1332027502227248, metrics={'train_runtime': 112.2818, 'train_samples_per_second': 80.155, 'train_steps_per_second': 2.538, 'train_loss': 0.1332027502227248, 'epoch': 5.0})

Now we are predictiing the performance on two test sets.

In [26]:
# Now make a predictions
predictions = trainer.predict(tokenized_test_ds['test_s']).predictions
# The output of the "base" predictions is an not "normal" array. So the argmax is required to make classification on '0' or '1'
predicted_class_indices = np.argmax(predictions, axis=1)
# And make the back tranformation from the "0" and "1" to the "H" and "M"
predicted_labels = [id2label[i] for i in predicted_class_indices]

In [27]:
# The "normal" data
predicted_class_indices

array([0, 0, 0, ..., 1, 0, 1])

In [28]:
# Actual predicted labels
print(predicted_labels)

['H', 'H', 'H', 'H', 'M', 'M', 'H', 'M', 'H', 'M', 'M', 'H', 'H', 'H', 'M', 'M', 'H', 'H', 'M', 'H', 'M', 'M', 'H', 'M', 'M', 'M', 'M', 'M', 'H', 'H', 'M', 'M', 'M', 'M', 'M', 'H', 'H', 'M', 'M', 'M', 'M', 'H', 'M', 'M', 'M', 'H', 'H', 'M', 'H', 'H', 'H', 'H', 'M', 'M', 'M', 'H', 'H', 'M', 'M', 'M', 'H', 'M', 'H', 'M', 'H', 'M', 'M', 'M', 'H', 'H', 'M', 'M', 'H', 'H', 'H', 'H', 'M', 'H', 'M', 'M', 'M', 'M', 'H', 'M', 'H', 'H', 'H', 'M', 'H', 'H', 'M', 'M', 'H', 'H', 'M', 'M', 'M', 'M', 'H', 'M', 'M', 'M', 'H', 'H', 'H', 'M', 'H', 'M', 'H', 'M', 'H', 'M', 'M', 'M', 'M', 'H', 'M', 'H', 'H', 'M', 'H', 'M', 'H', 'M', 'M', 'M', 'H', 'M', 'H', 'M', 'M', 'M', 'M', 'M', 'H', 'M', 'M', 'H', 'M', 'M', 'M', 'H', 'M', 'M', 'M', 'M', 'M', 'M', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'M', 'M', 'H', 'M', 'H', 'M', 'H', 'H', 'H', 'M', 'M', 'M', 'H', 'M', 'M', 'H', 'M', 'H', 'H', 'H', 'M', 'H', 'M', 'H', 'M', 'H', 'H', 'M', 'M', 'M', 'H', 'H', 'H', 'H', 'M', 'M', 'H', 'H', 'H', 'M', 'H', 'H', 'H', 'H', 'H',

In [29]:
# Check the len of the labels array to ensure that it is actually 20000 predicted labels
print(len(predicted_labels))

20000


In [30]:
# Make IDs array
ids = [i for i in range(20000)]

In [31]:
print(len(ids))

20000


In [32]:
# Final predictions dataframe with the necessary format
predictions_final = pd.DataFrame(
    {'ID': ids,
     'Class': predicted_labels,
    })


In [33]:
# Checking the dataframe
predictions_final.head(10)

Unnamed: 0,ID,Class
0,0,H
1,1,H
2,2,H
3,3,H
4,4,M
5,5,M
6,6,H
7,7,M
8,8,H
9,9,M


In [34]:
# Making an csv file and delete "index" column
predictions_final.to_csv('/content/drive/MyDrive/atd/data/output.csv', index=False)