In [15]:
from transformers import AutoTokenizer
from datasets import Dataset
import json
import os
import re
from datasets import load_from_disk

In [7]:
def load_json_file(file_path):
    """Load a JSON Lines file into a list of parsed objects.

    Every non-blank line of the file is parsed on its own with ``json.loads``.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        list: The parsed objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8; the platform default (e.g. cp1252 on
    # Windows) can mis-decode or reject non-ASCII text.
    with open(file_path, "r", encoding="utf-8") as f:
        # Iterate the handle lazily and skip blank lines (such as a trailing
        # empty line), which json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]
# Data lives in a "data" folder next to (not inside) the current working directory.
BASE_DIR = os.path.dirname(os.getcwd())
DATA_DIR = os.path.join(BASE_DIR, "data")
# Build both split paths from their file names in one pass.
train_path, val_path = (
    os.path.join(DATA_DIR, file_name) for file_name in ("train.json", "dev.json")
)

In [8]:
# Parse the raw train and dev files into lists of JSON objects (one per file line).
train_data_raw = load_json_file(train_path)
eval_data_raw = load_json_file(val_path)

In [9]:
# Flatten into Hugging Face compatible format
def convert_to_input_target(example_dicts):
    """Flatten nested records into a flat list of input/target text pairs.

    Args:
        example_dicts: Iterable of mappings. The keys of each mapping are
            ignored; every value must provide "disfluent" and "original"
            entries.

    Returns:
        list[dict]: One ``{"input_text": ..., "target_text": ...}`` dict per
        nested value, in iteration order, where ``input_text`` is the
        "disfluent" entry and ``target_text`` is the "original" entry.
    """
    return [
        {"input_text": entry["disfluent"], "target_text": entry["original"]}
        for record in example_dicts
        for entry in record.values()
    ]

# Flatten both raw splits into lists of {"input_text", "target_text"} dicts.
train_data = convert_to_input_target(train_data_raw)
eval_data = convert_to_input_target(eval_data_raw)

### Load tokenizer

In [10]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
model_checkpoint = "t5-small"  # or "facebook/bart-base"
# Module-level tokenizer; tokenize_function reads it as a global.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [11]:

# Wrap the flattened example lists (train_data, eval_data) in Hugging Face Dataset objects
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(eval_data)

# Tokenization function
def tokenize_function(examples, max_input_len=64, max_target_len=64):
    """Tokenize a batch of input/target text pairs for seq2seq training.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with "input_text" and "target_text" entries
            (lists of strings when used with ``Dataset.map(..., batched=True)``).
        max_input_len: Length the encoded inputs are padded/truncated to.
        max_target_len: Length the encoded targets are padded/truncated to.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the target token ids in which every padding id is replaced
        by -100.
    """
    model_inputs = tokenizer(
        examples["input_text"],
        max_length=max_input_len,
        padding="max_length",
        truncation=True,
    )
    # `text_target` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager (slated for removal
    # in Transformers v5); it encodes the texts in target mode.
    target_encodings = tokenizer(
        text_target=examples["target_text"],
        max_length=max_target_len,
        padding="max_length",
        truncation=True,
    )
    # Mask out padding tokens in labels: -100 is the label value that
    # Hugging Face seq2seq models exclude from the loss.
    pad_id = tokenizer.pad_token_id  # looked up once instead of per token
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in label]
        for label in target_encodings["input_ids"]
    ]
    return model_inputs

# Tokenize both train and validation
# batched=True hands tokenize_function a batch of rows per call rather than a single row.
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 7182/7182 [00:07<00:00, 972.13 examples/s] 
Map: 100%|██████████| 1000/1000 [00:01<00:00, 974.86 examples/s]


In [12]:
# Show the tokenized training split summary (column names and row count).
tokenized_train

Dataset({
    features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 7182
})

In [13]:
# Show the tokenized validation split summary (column names and row count).
tokenized_val

Dataset({
    features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})

In [14]:
# Output directories for the tokenized splits, placed alongside the raw files in DATA_DIR.
train_path_tokenized = os.path.join(DATA_DIR, "train_tokenize")
val_path_tokenized = os.path.join(DATA_DIR, "dev_tokenize")
# Persist both tokenized splits so later work can reload them instead of re-tokenizing.
tokenized_train.save_to_disk(train_path_tokenized)
tokenized_val.save_to_disk(val_path_tokenized)

Saving the dataset (1/1 shards): 100%|██████████| 7182/7182 [00:00<00:00, 160030.45 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 58980.00 examples/s]


In [17]:
# Reload the tokenized splits from the directories written by save_to_disk.
train_ds = load_from_disk(train_path_tokenized)
val_ds = load_from_disk(val_path_tokenized)

In [18]:
# Show the reloaded training split summary (column names and row count).
train_ds

Dataset({
    features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 7182
})

In [19]:
# Show the reloaded validation split summary (column names and row count).
val_ds

Dataset({
    features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})