In [1]:
!pip install tokenizers transformers




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Tokenizing the Converted CSV Dataset

In [2]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors

tokenizer = Tokenizer(models.BPE())

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
        ("[PAD]", 0),
        ("[MASK]", 3)
    ],
)

trainer = trainers.BpeTrainer(
    vocab_size=5000,
    min_frequency=2,
    special_tokens=["[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

files = ["../QED_data/QED_data.txt"]

tokenizer.train(files, trainer)

tokenizer.save("custom_tokenizer.json")


In [3]:
from transformers import PreTrainedTokenizerFast

hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")

hf_tokenizer.cls_token = "[CLS]"
hf_tokenizer.sep_token = "[SEP]"
hf_tokenizer.pad_token = "[PAD]"
hf_tokenizer.mask_token = "[MASK]"

hf_tokenizer.save_pretrained("custom_tokenizer")


  from .autonotebook import tqdm as notebook_tqdm


('custom_tokenizer\\tokenizer_config.json',
 'custom_tokenizer\\special_tokens_map.json',
 'custom_tokenizer\\tokenizer.json')

In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.13-cp312-cp312-win_amd64.whl.metadata (8.0 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.6.1-py3-none-any.whl.metadata (5.9 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Using cached ai


[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import os
os.environ['USE_TF'] = "0"

In [None]:
import torch

In [None]:
from transformers import RobertaTokenizerFast

tokenizer_path = 'custom_tokenizer'

tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_path)

sample = "e_[ID](X)^(*) e_[ID](X)^(*) to e_[ID](X) e_[ID](X)"
tokens = tokenizer(sample)
print(tokens['input_ids'])

tokenizer.decode(tokens['input_ids'])

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


[1, 37, 32, 24, 5, 29, 138, 193, 32, 24, 5, 29, 138, 168, 193, 32, 24, 5, 29, 6, 193, 32, 24, 5, 29, 6, 2]


'[SEP] e _ I ( X )^(*) Ġe _ I ( X )^(*) Ġto Ġe _ I ( X ) Ġe _ I ( X ) [PAD]'

In [7]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import numpy as np
import torch

# ✅ Load the dataset
csv_path = '../QED_data/processed_dataset.csv'
df = pd.read_csv(csv_path)

# ✅ Split into train, validation, and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# ✅ Tokenization Function
MAX_LENGTH = 44

def tokenize_function(example):
    input_tokens = tokenizer(
        example['text'],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="np"  # 🔥 Change to numpy → Avoid Arrow interference!
    )

    label_tokens = tokenizer(
        example['label'],
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="np"
    )

    return {
        'input_ids': input_tokens['input_ids'][0],
        'attention_mask': input_tokens['attention_mask'][0],
        'labels': label_tokens['input_ids'][0]
    }

# ✅ Step 1: Tokenize Manually
train_data = train_df.apply(tokenize_function, axis=1).tolist()
val_data = val_df.apply(tokenize_function, axis=1).tolist()
test_data = test_df.apply(tokenize_function, axis=1).tolist()

# ✅ Step 2: Convert to Dictionary
def convert_to_dict(data):
    return {
        'input_ids': np.stack([x['input_ids'] for x in data]),
        'attention_mask': np.stack([x['attention_mask'] for x in data]),
        'labels': np.stack([x['labels'] for x in data])
    }

train_dict = convert_to_dict(train_data)
val_dict = convert_to_dict(val_data)
test_dict = convert_to_dict(test_data)

# ✅ Step 3: Create Dataset from Scratch → New Schema!
dataset = DatasetDict({
    "train": Dataset.from_dict(train_dict),
    "validation": Dataset.from_dict(val_dict),
    "test": Dataset.from_dict(test_dict)
})

# ✅ Step 4: Set Format for PyTorch
dataset.set_format(type='torch')

# ✅ Sample Check
print(dataset['train'][0])


{'input_ids': tensor([   1,  199,  205,   32,  147,   32, 2518,    5,   29,    6,  106,  205,
          32,  123,   32,  232,  456,    5,   29,    6,  168,  106,  205,   32,
         147,   32,  466,  228,    5,   29,  138,  106,  205,   32,   87,   32,
         464,  187,    5,   29,  138,    2,    2,    2]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]), 'labels': tensor([  1,  20,  11, 153,   7,  37,  31,  16, 103,  45,  32,  34,  31,  16,
         60,  66,  13,  11,  14,   7,  45,  32,  34,  31,  14,   7,  50,  32,
         86,  60, 139,  11,  14,   7,  50,  32, 114,   7,  50,  32,  85,  60,
         66,   2])}


In [8]:
from torch.utils.data import DataLoader

batch_size = 16

train_loader = DataLoader(dataset['train'], batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset['validation'], batch_size=batch_size)
test_loader = DataLoader(dataset['test'], batch_size=batch_size)


In [14]:
import pickle

### Saving DataLoaders in the Python File

In [17]:
with open(r'../src/Dataloaders/train_loader.pkl', 'wb') as f:
    pickle.dump(train_loader, f)

with open(r'../src/Dataloaders/test_loader.pkl', 'wb') as f:
    pickle.dump(test_loader, f)

with open(r'../src/Dataloaders/val_loader.pkl', 'wb') as f:
    pickle.dump(val_loader, f)

