# Imports and Downloads

In [None]:
# !pip install spacy
# !pip install datasets
# !python -m spacy download en_core_web_sm
# !pip install transformers==3.2.0
# !pip install transformers[sentencepiece]
# !pip install sentencepiece


Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp39-cp39-win_amd64.whl.metadata (8.3 kB)
Downloading sentencepiece-0.2.0-cp39-cp39-win_amd64.whl (991 kB)
   ---------------------------------------- 0.0/991.5 kB ? eta -:--:--
   --------------------------------------- 991.5/991.5 kB 15.5 MB/s eta 0:00:00
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.2.0


In [1]:
from nltk.corpus import stopwords
from datasets import load_dataset
import re
import concurrent.futures
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
import spacy
from transformers import DistilBertTokenizer, AlbertTokenizer, BertTokenizer
from transformers import AlbertForSequenceClassification, DistilBertForSequenceClassification, BertForSequenceClassification
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anuja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anuja\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Dataset Loading

In [17]:
ds = load_dataset("Jinyan1/COLING_2025_MGT_en")

In [35]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text'],
        num_rows: 610767
    })
    dev: Dataset({
        features: ['id', 'source', 'sub_source', 'lang', 'model', 'label', 'text'],
        num_rows: 261758
    })
})

In [22]:
training_dataset = ds["train"]
train_dataset = training_dataset.to_pandas()

In [36]:
train_dataset.head()

Unnamed: 0,text,label
0,hitler plan succession power structure death s...,1
1,bush administration turned attention iraq argu...,0
2,best pedi pho attentive rush time come vega co...,0
3,m mv loch seaforth passenger sailing month tes...,0
4,malta participated eurovision song contest son...,0


In [None]:
train_dataset = train_dataset[['text', 'label']]
print("Length of the dataset: ", len(train_dataset))

Length of the dataset:  610767


# Dataset Pre-processing

In [None]:
def process_text_batch(batch_texts):
    processed_batch = []
    for text in batch_texts:
        text = text.lower()
        doc = nlp(text)
        tokens = []
        for token in doc:
            if token.is_alpha and not token.is_stop:
                tokens.append(lemmatizer.lemmatize(token.text))
        processed_batch.append(' '.join(tokens))
    return processed_batch

def preprocess_text_parallel(dataset):
    batch_size = 1000
    n = len(dataset)
    processed_texts = []
    batches = []
    for i in range(0, n, batch_size):
        batch = dataset['text'][i:i+batch_size].tolist()
        batches.append(batch)
    
    with concurrent.futures.ThreadPoolExecutor() as executor:
        results = executor.map(process_text_batch, batches)
        
    for batch in results:
        processed_texts.extend(batch)
    
    dataset['text'] = processed_texts
    return dataset

In [None]:
cleaned_dataset = preprocess_text_parallel(train_dataset) # 175m 4.8s

In [None]:
(train_dataset['label'] == cleaned_dataset['label']).value_counts() # Checking if order was preserved

label
True    610767
Name: count, dtype: int64

In [42]:
# save cleaned dataset
cleaned_dataset.to_csv('cleaned_dataset.csv', index=False)

# Tokenization for the BERT models

In [3]:
cleaned_dataset = pd.read_csv('cleaned_dataset.csv')

In [4]:
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
tinybert_tokenizer = BertTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
cleaned_texts = [str(text) for text in cleaned_dataset['text'].values if text is not None]
DB_inputs = distilbert_tokenizer(cleaned_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
AB_inputs = albert_tokenizer(cleaned_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
TB_inputs = tinybert_tokenizer(cleaned_texts, padding=True, truncation=True, max_length=512, return_tensors='pt') #56m

In [11]:
labels = torch.tensor(cleaned_dataset['label'].values)

In [12]:
print(DB_inputs['input_ids'].shape)
print(AB_inputs['input_ids'].shape)
print(TB_inputs['input_ids'].shape)
print(labels.shape)

torch.Size([610767, 512])
torch.Size([610767, 512])
torch.Size([610767, 512])
torch.Size([610767])


In [14]:
# Save tokenized inputs and labels
torch.save(DB_inputs, 'intermediates/DB_inputs.pt')
torch.save(AB_inputs, 'intermediates/AB_inputs.pt')
torch.save(TB_inputs, 'intermediates/TB_inputs.pt')
torch.save(labels, 'intermediates/labels.pt')