# Tokenizing the Converted CSV Dataset

In [15]:
import pandas as pd

# Load the preprocessed CSV
csv_path = 'processed_2.csv'
df = pd.read_csv(csv_path)

# Clean the DataFrame
df['text'] = df['text'].astype(str).replace('nan', '')
df['label'] = df['label'].astype(str).replace('nan', '')

# Combine text and label into a list
all_texts = df['text'].tolist() + df['label'].tolist()

# Save to a text file
output_txt_path = 'preprocessed_2.txt'
with open(output_txt_path, 'w', encoding='utf-8') as f:
    for text in all_texts:
        if text:  # Skip empty strings
            f.write(text + '\n')

print(f"Preprocessed data saved to {output_txt_path}")

Preprocessed data saved to preprocessed_2.txt


In [19]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors

tokenizer = Tokenizer(models.BPE())

tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()

tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
        ("[PAD]", 0),
        ("[MASK]", 3), 
        ("[STATE_ID]" , 4)
    ],
)

trainer = trainers.BpeTrainer(
    vocab_size=512,
    min_frequency=2,
    special_tokens=["[CLS]", "[SEP]", "[PAD]", "[MASK]" , "[STATE_ID]"]
)

files = ["preprocessed_2.txt"]

tokenizer.train(files, trainer)

tokenizer.save("custom_tokenizer.json")


In [29]:
from transformers import PreTrainedTokenizerFast

hf_tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")

hf_tokenizer.cls_token = "[CLS]"
hf_tokenizer.sep_token = "[SEP]"
hf_tokenizer.pad_token = "[PAD]"
hf_tokenizer.mask_token = "[MASK]"
hf_tokenizer.mask_token = "[STATE_ID]"

hf_tokenizer.save_pretrained("custom_tokenizer")


('custom_tokenizer\\tokenizer_config.json',
 'custom_tokenizer\\special_tokens_map.json',
 'custom_tokenizer\\tokenizer.json')

512

In [24]:
import os
os.environ['USE_TF'] = "0"

In [30]:
from transformers import RobertaTokenizerFast

tokenizer_path = 'custom_tokenizer'

tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_path)

sample = "e_[ID](X)^(*) e_[ID](X)^(*) to e_[ID](X) e_[ID](X)"
tokens = tokenizer(sample)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


In [27]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import numpy as np
import torch
from transformers import PreTrainedTokenizerFast

# ✅ Load the tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("custom_tokenizer")
tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token is not None else "[PAD]"

# ✅ Load the dataset
csv_path = 'processed_2.csv'
df = pd.read_csv(csv_path)

# ✅ Clean the DataFrame
# Convert text and label columns to strings, handle missing values
df['text'] = df['text'].astype(str).replace('nan', '')  # Convert to string, replace NaN with empty string
df['label'] = df['label'].astype(str).replace('nan', '')

# ✅ Split into train, validation, and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [42]:


MAX_LENGTH_TEXT = 509  # 99th percentile for text
MAX_LENGTH_LABEL = 421 

def tokenize_function(example):
    # Ensure inputs are strings and not empty
    text = example['text'] if example['text'] else ""
    label = example['label'] if example['label'] else ""
    
    # Tokenize text
    input_tokens = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH_TEXT,
        return_tensors="np"
    )

    # Tokenize label
    label_tokens = tokenizer(
        label,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH_LABEL,
        return_tensors="np"
    )

    return {
        'input_ids': input_tokens['input_ids'][0],
        'attention_mask': input_tokens['attention_mask'][0],
        'labels': label_tokens['input_ids'][0]
    }

# ✅ Step 1: Tokenize Manually
train_data = train_df.apply(tokenize_function, axis=1).tolist()
val_data = val_df.apply(tokenize_function, axis=1).tolist()
test_data = test_df.apply(tokenize_function, axis=1).tolist()

# ✅ Step 2: Convert to Dictionary
def convert_to_dict(data):
    return {
        'input_ids': np.stack([x['input_ids'] for x in data]),
        'attention_mask': np.stack([x['attention_mask'] for x in data]),
        'labels': np.stack([x['labels'] for x in data])
    }

train_dict = convert_to_dict(train_data)
val_dict = convert_to_dict(val_data)
test_dict = convert_to_dict(test_data)

# ✅ Step 3: Create Dataset from Scratch
dataset = DatasetDict({
    "train": Dataset.from_dict(train_dict),
    "validation": Dataset.from_dict(val_dict),
    "test": Dataset.from_dict(test_dict)
})

# ✅ Step 4: Set Format for PyTorch
dataset.set_format(type='torch')

# ✅ Sample Check
print(dataset['train'][0])

{'input_ids': tensor([  1,  22,  90,  93, 171,  35, 142,  35, 498, 138,   6,  30,   7,  94,
        171,  35, 123,  35, 221, 440,   6,  30,   7, 157,  94, 171,  35, 142,
         35, 445, 217,   6,  30, 135,  94, 171,  35,  95,  35, 452, 189,   6,
         30, 135, 119,  35,  13,  94, 171,   6,  30,   7,  94, 171,   6,  30,
          7,  77,   6,  30,   7, 119,  35,  12,  94, 171,   6,  30,   7,  94,
        171,   6,  30,   7,  77,   6,  30,   7,  76,  13,  11,  21,   8,  43,
          8,  40,  34,  14,   8, 106, 155, 121,  35, 211, 197,  21,  71,  74,
         35,  17, 233, 186,  35,   4,  35,   4,  71,  74,  35,  17, 233, 200,
         35,   4,  35,   4,  57,   8, 106, 124, 121,  35, 211, 197,  21,  71,
         74,  35,  17, 233, 177,  35,   4,  35,   4,  71, 123,  35, 199, 411,
         35,   4,  35,   4,  57,   8,  37,  72,  43,  35, 164, 146,  13,  71,
         74,  35,  17, 233, 200,  35,   4,  35,   4, 125,  30,  84,  55,   8,
         37,  72,  46,  35, 164,  18, 177,  71,  7

In [31]:
text_lengths = [len(tokenizer(text, add_special_tokens=True)['input_ids']) for text in df['text'].tolist()]
label_lengths = [len(tokenizer(label, add_special_tokens=True)['input_ids']) for label in df['label'].tolist()]

# Print statistics for text
print("Sequence length statistics for 'text' column:")
print(f"Max sequence length (text): {max(text_lengths)}")
print(f"90th percentile (text): {np.percentile(text_lengths, 90)}")
print(f"95th percentile (text): {np.percentile(text_lengths, 95)}")
print(f"99th percentile (text): {np.percentile(text_lengths, 99)}")

# Print statistics for label
print("\nSequence length statistics for 'label' column:")
print(f"Max sequence length (label): {max(label_lengths)}")
print(f"90th percentile (label): {np.percentile(label_lengths, 90)}")
print(f"95th percentile (label): {np.percentile(label_lengths, 95)}")
print(f"99th percentile (label): {np.percentile(label_lengths, 99)}")

Sequence length statistics for 'text' column:
Max sequence length (text): 534
90th percentile (text): 482.0
95th percentile (text): 493.0
99th percentile (text): 509.0

Sequence length statistics for 'label' column:
Max sequence length (label): 421
90th percentile (label): 351.0
95th percentile (label): 413.0
99th percentile (label): 418.0


In [43]:
from torch.utils.data import DataLoader

batch_size = 16

train_loader = DataLoader(dataset['train'], batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset['validation'], batch_size=batch_size)
test_loader = DataLoader(dataset['test'], batch_size=batch_size)


In [44]:
import pickle

### Saving DataLoaders in the Python File

In [45]:
with open(r'../src/Dataloaders/train_loader.pkl', 'wb') as f:
    pickle.dump(train_loader, f)

with open(r'../src/Dataloaders/test_loader.pkl', 'wb') as f:
    pickle.dump(test_loader, f)

with open(r'../src/Dataloaders/val_loader.pkl', 'wb') as f:
    pickle.dump(val_loader, f)



In [36]:
tokenizer.vocab_size

512

In [46]:
tokenizer.decode([123, 567, 812, 345])

'del 314'