# Tokenizing the Converted CSV Dataset

In [31]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors

tokenizer = Tokenizer(models.BPE())

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
        ("[PAD]", 0),
        ("[MASK]", 3), 
        ("[STATE_ID]" , 4)
    ],
)

trainer = trainers.BpeTrainer(
    vocab_size=512,
    min_frequency=2,
    special_tokens=["[CLS]", "[SEP]", "[PAD]", "[MASK]" , "[STATE_ID]"]
)

files = ["../QED_data/QED_data.txt"]

tokenizer.train(files, trainer)

tokenizer.save("custom_tokenizer.json")


In [32]:
from transformers import PreTrainedTokenizerFast

hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")

hf_tokenizer.cls_token = "[CLS]"
hf_tokenizer.sep_token = "[SEP]"
hf_tokenizer.pad_token = "[PAD]"
hf_tokenizer.mask_token = "[MASK]"
hf_tokenizer.mask_token = "[STATE_ID]"

hf_tokenizer.save_pretrained("custom_tokenizer")


('custom_tokenizer\\tokenizer_config.json',
 'custom_tokenizer\\special_tokens_map.json',
 'custom_tokenizer\\tokenizer.json')

In [21]:
import os
os.environ['USE_TF'] = "0"

In [22]:
from transformers import RobertaTokenizerFast

tokenizer_path = 'custom_tokenizer'

tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_path)

sample = "e_[ID](X)^(*) e_[ID](X)^(*) to e_[ID](X) e_[ID](X)"
tokens = tokenizer(sample)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


In [25]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import numpy as np
import torch
from transformers import PreTrainedTokenizerFast

# ✅ Load the tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("custom_tokenizer")
tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token is not None else "[PAD]"

# ✅ Load the dataset
csv_path = '../QED_data/processed_dataset.csv'
df = pd.read_csv(csv_path)

# ✅ Clean the DataFrame
# Convert text and label columns to strings, handle missing values
df['text'] = df['text'].astype(str).replace('nan', '')  # Convert to string, replace NaN with empty string
df['label'] = df['label'].astype(str).replace('nan', '')

# ✅ Split into train, validation, and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# ✅ Tokenization Function
MAX_LENGTH = 44

def tokenize_function(example):
    # Ensure inputs are strings and not empty
    text = example['text'] if example['text'] else ""
    label = example['label'] if example['label'] else ""
    
    # Tokenize text
    input_tokens = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="np"
    )

    # Tokenize label
    label_tokens = tokenizer(
        label,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="np"
    )

    return {
        'input_ids': input_tokens['input_ids'][0],
        'attention_mask': input_tokens['attention_mask'][0],
        'labels': label_tokens['input_ids'][0]
    }

# ✅ Step 1: Tokenize Manually
train_data = train_df.apply(tokenize_function, axis=1).tolist()
val_data = val_df.apply(tokenize_function, axis=1).tolist()
test_data = test_df.apply(tokenize_function, axis=1).tolist()

# ✅ Step 2: Convert to Dictionary
def convert_to_dict(data):
    return {
        'input_ids': np.stack([x['input_ids'] for x in data]),
        'attention_mask': np.stack([x['attention_mask'] for x in data]),
        'labels': np.stack([x['labels'] for x in data])
    }

train_dict = convert_to_dict(train_data)
val_dict = convert_to_dict(val_data)
test_dict = convert_to_dict(test_data)

# ✅ Step 3: Create Dataset from Scratch
dataset = DatasetDict({
    "train": Dataset.from_dict(train_dict),
    "validation": Dataset.from_dict(val_dict),
    "test": Dataset.from_dict(test_dict)
})

# ✅ Step 4: Set Format for PyTorch
dataset.set_format(type='torch')

# ✅ Sample Check
print(dataset['train'][0])

{'input_ids': tensor([  1,  24, 189, 158,  33, 424, 207,   6,  30,   7, 107,  82, 189, 219,
         33, 496, 193,   6,  30,   7, 169, 201,  33,  65,  33,   4,   6,  30,
          7, 107, 201,  33, 199,  33,   4,   6,  30, 139,  98,  33,  13,  82,
          6,   2]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'labels': tensor([  1,   5,  65,  33, 115,  87, 215,  71,  49,  33,  16,  89,  53, 248,
         46,  33, 112,  32,  15,  61,  67,  51,  33, 108,  61, 114,   8,  51,
         33,  86,  61,  67,  51,  33, 103,  61,  67,  94,  33,  95,   7, 110,
         67,   2])}


In [26]:
from torch.utils.data import DataLoader

batch_size = 16

train_loader = DataLoader(dataset['train'], batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset['validation'], batch_size=batch_size)
test_loader = DataLoader(dataset['test'], batch_size=batch_size)


In [27]:
import pickle

### Saving DataLoaders in the Python File

In [29]:
with open(r'../src/Dataloaders/train_loader.pkl', 'wb') as f:
    pickle.dump(train_loader, f)

with open(r'../src/Dataloaders/test_loader.pkl', 'wb') as f:
    pickle.dump(test_loader, f)

with open(r'../src/Dataloaders/val_loader.pkl', 'wb') as f:
    pickle.dump(val_loader, f)



In [30]:
tokenizer.vocab_size

512

In [21]:
tokenizer.decode([1, 35632, 48766, 9874, 20758, 28768, 11745, 9693, 45096, 43001, 28449, 42744, 19394, 20729, 3068, 43965, 29878, 40972, 46786, 18185, 42672, 4199, 26545, 17362, 36242, 38463, 48635, 7562, 6366, 23576, 47614, 20166, 4413, 15086, 30211, 27408, 1640, 32132, 32393, 17635, 2346, 49387, 22450, 48115])

'[SEP]'