# Imports and Downloads

In [None]:
!pip install spacy
!pip install datasets
!python -m spacy download en_core_web_sm
!pip install transformers==3.2.0
!pip install transformers[sentencepiece]
!pip install sentencepiece

In [None]:
import concurrent.futures
import os
import random
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import DistilBertTokenizer, AlbertTokenizer, BertTokenizer, ElectraTokenizer
from transformers import AlbertForSequenceClassification, DistilBertForSequenceClassification, BertForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the NLTK corpora requested by this notebook.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Module-level NLP helpers shared by the pre-processing functions.
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anuja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anuja\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def seed_everything(seed):
    """Seed every random number generator this notebook may draw from.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), then puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)  # stdlib RNG, independent of the NumPy/PyTorch generators
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Disable cuDNN autotuning and request deterministic kernels so repeated
    # runs with the same seed take the same code paths.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(1)

# Training Dataset Loading

In [None]:
ds = load_dataset("Jinyan1/COLING_2025_MGT_en")

In [35]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text'],
        num_rows: 610767
    })
    dev: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text'],
        num_rows: 261758
    })
})

In [22]:
training_dataset = ds["train"]
train_dataset = training_dataset.to_pandas()

In [36]:
train_dataset.head()

Unnamed: 0,text,label
0,hitler plan succession power structure death s...,1
1,bush administration turned attention iraq argu...,0
2,best pedi pho attentive rush time come vega co...,0
3,m mv loch seaforth passenger sailing month tes...,0
4,malta participated eurovision song contest son...,0


In [None]:
# Keep only the model input and target columns. The explicit .copy() makes
# the result an independent frame, so assigning to its columns later does not
# trigger pandas' SettingWithCopyWarning.
train_dataset = train_dataset[['text', 'label']].copy()
print("Length of the dataset: ", len(train_dataset))

Length of the dataset:  610767


# Development Dataset Loading

In [40]:
ds = load_dataset("Jinyan1/COLING_2025_MGT_en")

In [41]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text'],
        num_rows: 610767
    })
    dev: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text'],
        num_rows: 261758
    })
})

In [42]:
development_dataset = ds["dev"]
dev_dataset = development_dataset.to_pandas()

In [44]:
dev_dataset.head()

Unnamed: 0,id,source,sub_source,lang,model,label,text
0,e0c8d183-c377-4af0-a12c-2204d75cd5f0,m4gt,peerread,en,gpt4,1,"The paper titled ""A Transition-Based Directed ..."
1,2b6d18d5-993f-486f-a631-986f46ec7ba0,mage,wp,en,text-davinci-003,1,"(Apologies for two submissions, but need to wr..."
2,327667aa-fbe2-46dc-b2e9-31c7618845ca,mage,cmv,en,7B,1,WARNING: WALL OF TEXT!!! I also jump from topi...
3,2fa81a5f-a6da-4cd5-ab0b-22ea4e0464b9,m4gt,outfox,en,cohere,1,Emotion recognition through facial feedback ha...
4,61b935f0-3a00-4441-a185-34216bc5b55a,mage,eli5,en,gpt-3.5-turbo,1,Several things. 1. The cooling effect of air c...


In [45]:
# Keep only the model input and target columns. The explicit .copy() makes
# the result an independent frame, so assigning to its columns later does not
# trigger pandas' SettingWithCopyWarning.
dev_dataset = dev_dataset[['text', 'label']].copy()
print("Length of the dataset: ", len(dev_dataset))

Length of the dataset:  261758


In [46]:
dev_dataset.head()

Unnamed: 0,text,label
0,"The paper titled ""A Transition-Based Directed ...",1
1,"(Apologies for two submissions, but need to wr...",1
2,WARNING: WALL OF TEXT!!! I also jump from topi...,1
3,Emotion recognition through facial feedback ha...,1
4,Several things. 1. The cooling effect of air c...,1


# Testing Dataset Loading

In [3]:
ds = pd.read_json('test_set_en_with_label.jsonl', lines=True)

In [5]:
ds.head()

Unnamed: 0,text,language,label,source,model,prompt_flag,prompt,domain,paper_id,binary,mixset_category,testset_id
0,"Hello, Thanks for sharing your health concern ...",English,0,CUDRT,human,,hello Dr. ! I am married since 2 years and my ...,GPT3.5_QA,,,,0
1,"In primary school, especially in the countrysi...",English,0,ieltsduck,human,,,ielts,,,,1
2,The advent of artificial intelligence (AI) has...,English,1,ieltsduck,gpt-4o-mini-2024-07-18,improved,Please act as a student preparing for the IELT...,ielts,,,,2
3,Unemployment insurance through options refers ...,English,1,CUDRT,GPT3.5,,Unemployment Insurance Through Options,GPT3.5_QA,,,,3
4,The long exposure feature on DSLR cameras work...,English,1,CUDRT,ChatGLM,,How the long exposure feature on DSLR Cameras ...,ChatGLM_QA,,,,4


In [6]:
# Keep only the model input and target columns. The explicit .copy() makes
# the result an independent frame, so assigning to its columns later does not
# trigger pandas' SettingWithCopyWarning.
test_dataset = ds[['text', 'label']].copy()
print("Length of the dataset: ", len(test_dataset))

Length of the dataset:  73941


In [7]:
test_dataset.head()

Unnamed: 0,text,label
0,"Hello, Thanks for sharing your health concern ...",0
1,"In primary school, especially in the countrysi...",0
2,The advent of artificial intelligence (AI) has...,1
3,Unemployment insurance through options refers ...,1
4,The long exposure feature on DSLR cameras work...,1


# Training Dataset Pre-processing

In [3]:
def process_text_batch(batch_texts):
    """Normalise a batch of raw texts into space-joined lemma strings.

    Each text is lower-cased and tokenised with the module-level spaCy object
    ``nlp``. Tokens that are purely alphabetic and not stop words are
    lemmatised with the module-level WordNet ``lemmatizer`` and joined with
    single spaces.

    Args:
        batch_texts: Iterable of texts. ``None``/NaN entries produce an empty
            string; any other non-string value is converted with ``str``.

    Returns:
        list[str]: One cleaned string per input, in the same order.
    """
    processed_batch = []
    for text in batch_texts:
        if not isinstance(text, str):
            # Keep the output aligned 1:1 with the input instead of crashing
            # on a missing value.
            missing = text is None or (isinstance(text, float) and text != text)
            text = '' if missing else str(text)
        # Only the tokenizer output and the lexical flags is_alpha / is_stop
        # are read below, so make_doc() is used to skip the remaining
        # pipeline components that nlp(text) would run.
        doc = nlp.make_doc(text.lower())
        processed_batch.append(' '.join(
            lemmatizer.lemmatize(token.text)
            for token in doc
            if token.is_alpha and not token.is_stop
        ))
    return processed_batch

def preprocess_text_parallel(dataset, batch_size=1000, max_workers=None):
    """Clean the ``text`` column of a DataFrame in batches on a thread pool.

    Args:
        dataset: pandas DataFrame with a ``text`` column. It is not modified.
        batch_size: Number of rows handed to each ``process_text_batch`` call.
        max_workers: Forwarded to ``ThreadPoolExecutor``; ``None`` keeps the
            executor's default.

    Returns:
        A new DataFrame equal to ``dataset`` except that ``text`` holds the
        cleaned strings, in the original row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = dataset['text'].tolist()
    batches = [texts[start:start + batch_size]
               for start in range(0, len(texts), batch_size)]

    processed_texts = []
    # NOTE(review): worker threads share the GIL, so this may give little
    # speed-up for CPU-bound tokenisation; confirm by timing.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in submission order, so rows stay aligned.
        for cleaned_batch in executor.map(process_text_batch, batches):
            processed_texts.extend(cleaned_batch)

    # assign() returns a new frame, so the caller's raw text is preserved and
    # comparisons between the input and the result are meaningful.
    return dataset.assign(text=processed_texts)

In [None]:
cleaned_dataset = preprocess_text_parallel(train_dataset) # 175m 4.8s

In [None]:
(train_dataset['label'] == cleaned_dataset['label']).value_counts()

label
True    610767
Name: count, dtype: int64

In [None]:
cleaned_dataset.to_csv('cleaned_dataset.csv', index=False)

# Development Dataset Pre-processing

In [47]:
def process_text_batch(batch_texts):
    """Normalise a batch of raw texts into space-joined lemma strings.

    Each text is lower-cased and tokenised with the module-level spaCy object
    ``nlp``. Tokens that are purely alphabetic and not stop words are
    lemmatised with the module-level WordNet ``lemmatizer`` and joined with
    single spaces.

    Args:
        batch_texts: Iterable of texts. ``None``/NaN entries produce an empty
            string; any other non-string value is converted with ``str``.

    Returns:
        list[str]: One cleaned string per input, in the same order.
    """
    processed_batch = []
    for text in batch_texts:
        if not isinstance(text, str):
            # Keep the output aligned 1:1 with the input instead of crashing
            # on a missing value.
            missing = text is None or (isinstance(text, float) and text != text)
            text = '' if missing else str(text)
        # Only the tokenizer output and the lexical flags is_alpha / is_stop
        # are read below, so make_doc() is used to skip the remaining
        # pipeline components that nlp(text) would run.
        doc = nlp.make_doc(text.lower())
        processed_batch.append(' '.join(
            lemmatizer.lemmatize(token.text)
            for token in doc
            if token.is_alpha and not token.is_stop
        ))
    return processed_batch

def preprocess_text_parallel(dataset, batch_size=1000, max_workers=None):
    """Clean the ``text`` column of a DataFrame in batches on a thread pool.

    Args:
        dataset: pandas DataFrame with a ``text`` column. It is not modified.
        batch_size: Number of rows handed to each ``process_text_batch`` call.
        max_workers: Forwarded to ``ThreadPoolExecutor``; ``None`` keeps the
            executor's default.

    Returns:
        A new DataFrame equal to ``dataset`` except that ``text`` holds the
        cleaned strings, in the original row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = dataset['text'].tolist()
    batches = [texts[start:start + batch_size]
               for start in range(0, len(texts), batch_size)]

    processed_texts = []
    # NOTE(review): worker threads share the GIL, so this may give little
    # speed-up for CPU-bound tokenisation; confirm by timing.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in submission order, so rows stay aligned.
        for cleaned_batch in executor.map(process_text_batch, batches):
            processed_texts.extend(cleaned_batch)

    # assign() returns a new frame, so the caller's raw text is preserved and
    # comparisons between the input and the result are meaningful.
    return dataset.assign(text=processed_texts)

In [None]:
cleaned_dev_dataset = preprocess_text_parallel(dev_dataset) #127m 14.6s

In [None]:
(dev_dataset['label'] == cleaned_dev_dataset['label']).value_counts()

label
True    261758
Name: count, dtype: int64

In [None]:
cleaned_dev_dataset.to_csv('cleaned_dev_dataset.csv', index=False)

In [52]:
cleaned_dev_dataset.head()

Unnamed: 0,text,label
0,paper titled transition based directed aciclic...,1
1,apology submission need write dear powerfalcon...,1
2,warning wall text jump topic topic transition ...,1
3,emotion recognition facial feedback subject gr...,1
4,thing cooling effect air current taking away i...,1


# Test Dataset Pre-processing

In [15]:
def process_text_batch(batch_texts):
    """Normalise a batch of raw texts into space-joined lemma strings.

    Each text is lower-cased and tokenised with the module-level spaCy object
    ``nlp``. Tokens that are purely alphabetic and not stop words are
    lemmatised with the module-level WordNet ``lemmatizer`` and joined with
    single spaces.

    Args:
        batch_texts: Iterable of texts. ``None``/NaN entries produce an empty
            string; any other non-string value is converted with ``str``.

    Returns:
        list[str]: One cleaned string per input, in the same order.
    """
    processed_batch = []
    for text in batch_texts:
        if not isinstance(text, str):
            # Keep the output aligned 1:1 with the input instead of crashing
            # on a missing value.
            missing = text is None or (isinstance(text, float) and text != text)
            text = '' if missing else str(text)
        # Only the tokenizer output and the lexical flags is_alpha / is_stop
        # are read below, so make_doc() is used to skip the remaining
        # pipeline components that nlp(text) would run.
        doc = nlp.make_doc(text.lower())
        processed_batch.append(' '.join(
            lemmatizer.lemmatize(token.text)
            for token in doc
            if token.is_alpha and not token.is_stop
        ))
    return processed_batch

def preprocess_text_parallel(dataset, batch_size=1000, max_workers=None):
    """Clean the ``text`` column of a DataFrame in batches on a thread pool.

    Args:
        dataset: pandas DataFrame with a ``text`` column. It is not modified.
        batch_size: Number of rows handed to each ``process_text_batch`` call.
        max_workers: Forwarded to ``ThreadPoolExecutor``; ``None`` keeps the
            executor's default.

    Returns:
        A new DataFrame equal to ``dataset`` except that ``text`` holds the
        cleaned strings, in the original row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = dataset['text'].tolist()
    batches = [texts[start:start + batch_size]
               for start in range(0, len(texts), batch_size)]

    processed_texts = []
    # NOTE(review): worker threads share the GIL, so this may give little
    # speed-up for CPU-bound tokenisation; confirm by timing.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in submission order, so rows stay aligned.
        for cleaned_batch in executor.map(process_text_batch, batches):
            processed_texts.extend(cleaned_batch)

    # assign() returns a new frame, so the caller's raw text is preserved and
    # comparisons between the input and the result are meaningful.
    return dataset.assign(text=processed_texts)

In [None]:
cleaned_test_dataset = preprocess_text_parallel(test_dataset) #53m 11.3s

In [None]:
(test_dataset['label'] == cleaned_test_dataset['label']).value_counts()

label
True    73941
Name: count, dtype: int64

In [None]:
cleaned_test_dataset.to_csv('cleaned_test_dataset.csv', index=False)

In [20]:
cleaned_test_dataset.head()

Unnamed: 0,text,label
0,hello thanks sharing health concern gone query...,0
1,primary school especially countryside japan pr...,0
2,advent artificial intelligence ai sparked vigo...,1
3,unemployment insurance option refers use finan...,1
4,long exposure feature dslr camera work allowin...,1


# Tokenization of Training Dataset for the BERT models

In [9]:
cleaned_dataset = pd.read_csv('cleaned_dataset.csv')

In [None]:
# Load one pretrained tokenizer per model, identified by checkpoint name.
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tinybert_tokenizer = BertTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
electra_small_tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
# A document whose tokens were all filtered out is stored in the CSV as an
# empty field and is read back by pd.read_csv as NaN. Map those to '' so they
# are not tokenised as the literal word "nan", and keep every row so the
# encodings stay aligned one-to-one with the label tensor.
cleaned_texts = cleaned_dataset['text'].fillna('').astype(str).tolist()
# Every text is padded to the longest sequence in the batch and truncated at 512 tokens.
DB_inputs = distilbert_tokenizer(cleaned_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
TB_inputs = tinybert_tokenizer(cleaned_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
EL_inputs = electra_small_tokenizer(cleaned_texts, padding=True, truncation=True, max_length=512, return_tensors='pt') # 1 hr

In [11]:
labels = torch.tensor(cleaned_dataset['label'].values)

In [None]:
print(DB_inputs['input_ids'].shape)
print(TB_inputs['input_ids'].shape)
print(EL_inputs['input_ids'].shape)
print(labels.shape)

torch.Size([610767, 512])
torch.Size([610767, 512])
torch.Size([610767, 512])
torch.Size([610767])


In [None]:
# Create the output folder first: torch.save does not create missing parent
# directories, and the encodings above are expensive to recompute.
os.makedirs('intermediates', exist_ok=True)
torch.save(DB_inputs, 'intermediates/DB_inputs.pt')
torch.save(TB_inputs, 'intermediates/TB_inputs.pt')
torch.save(EL_inputs, 'intermediates/EL_inputs.pt')
torch.save(labels, 'intermediates/labels.pt')

# Tokenization of Development Dataset for the BERT models

In [16]:
cleaned_dev_dataset = pd.read_csv('cleaned_dev_dataset.csv')

In [None]:
# Load one pretrained tokenizer per model, identified by checkpoint name.
# These are the same checkpoints used for the training split; they are
# re-created here, presumably so this section can be run on its own.
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tinybert_tokenizer = BertTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
electra_small_tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')

In [None]:
# A document whose tokens were all filtered out is stored in the CSV as an
# empty field and is read back by pd.read_csv as NaN. Map those to '' so they
# are not tokenised as the literal word "nan", and keep every row so the
# encodings stay aligned one-to-one with the label tensor.
cleaned_dev_texts = cleaned_dev_dataset['text'].fillna('').astype(str).tolist()
# Every text is padded to the longest sequence in the batch and truncated at 512 tokens.
DB_dev_inputs = distilbert_tokenizer(cleaned_dev_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
EL_dev_inputs = electra_small_tokenizer(cleaned_dev_texts, padding=True, truncation=True, max_length=512, return_tensors='pt') #
TB_dev_inputs = tinybert_tokenizer(cleaned_dev_texts, padding=True, truncation=True, max_length=512, return_tensors='pt') #32m 4.7s

In [56]:
labels_dev = torch.tensor(cleaned_dev_dataset['label'].values)

In [None]:
print(DB_dev_inputs['input_ids'].shape)
print(EL_dev_inputs['input_ids'].shape)
print(TB_dev_inputs['input_ids'].shape)
print(labels_dev.shape)

torch.Size([261758, 512])
torch.Size([261758, 512])
torch.Size([261758, 512])
torch.Size([261758])


In [None]:
# Create the output folder first: torch.save does not create missing parent
# directories, and the encodings above are expensive to recompute.
os.makedirs('intermediates_dev', exist_ok=True)
torch.save(DB_dev_inputs, 'intermediates_dev/DB_dev_inputs.pt')
# The ELECTRA dev encodings are written next to the other dev artifacts.
# NOTE(review): any loader that still reads 'intermediates/EL_dev_inputs.pt'
# must be pointed at this path.
torch.save(EL_dev_inputs, 'intermediates_dev/EL_dev_inputs.pt')
torch.save(TB_dev_inputs, 'intermediates_dev/TB_dev_inputs.pt')
torch.save(labels_dev, 'intermediates_dev/labels_dev.pt')

# Tokenization of Test Dataset for the BERT models

In [24]:
cleaned_test_dataset = pd.read_csv('cleaned_test_dataset.csv')

In [None]:
# Load one pretrained tokenizer per model, identified by checkpoint name.
# These are the same checkpoints used for the training split; they are
# re-created here, presumably so this section can be run on its own.
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
tinybert_tokenizer = BertTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')
electra_small_tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')

In [None]:
# A document whose tokens were all filtered out is stored in the CSV as an
# empty field and is read back by pd.read_csv as NaN. Map those to '' so they
# are not tokenised as the literal word "nan", and keep every row so the
# encodings stay aligned one-to-one with the label tensor.
cleaned_test_texts = cleaned_test_dataset['text'].fillna('').astype(str).tolist()
# Every text is padded to the longest sequence in the batch and truncated at 512 tokens.
DB_test_inputs = distilbert_tokenizer(cleaned_test_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
EL_test_inputs = electra_small_tokenizer(cleaned_test_texts, padding=True, truncation=True, max_length=512, return_tensors='pt') #
TB_test_inputs = tinybert_tokenizer(cleaned_test_texts, padding=True, truncation=True, max_length=512, return_tensors='pt') #10m 23.7s

In [28]:
labels_test = torch.tensor(cleaned_test_dataset['label'].values)

In [None]:
print(DB_test_inputs['input_ids'].shape)
print(EL_test_inputs['input_ids'].shape)
print(TB_test_inputs['input_ids'].shape)
print(labels_test.shape)

torch.Size([73941, 512])
torch.Size([73941, 512])
torch.Size([73941, 512])
torch.Size([73941])


In [None]:
# Create the output folder first: torch.save does not create missing parent
# directories, and the encodings above are expensive to recompute.
os.makedirs('intermediates', exist_ok=True)
torch.save(DB_test_inputs, 'intermediates/DB_test_inputs.pt')
torch.save(EL_test_inputs, 'intermediates/EL_test_inputs.pt')
torch.save(TB_test_inputs, 'intermediates/TB_test_inputs.pt')
torch.save(labels_test, 'intermediates/labels_test.pt')