<a href="https://colab.research.google.com/github/AlperYildirim1/Pay-Attention-Later/blob/main/Multi3k_data_processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os
import numpy as np
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer
from huggingface_hub import login
from tqdm.auto import tqdm

# --- CONFIGURATION (Edit this section only) ---

# Hugging Face account (user or organisation) that will own the uploaded repositories.
# Replace the placeholder before running.
HF_USERNAME = "Your Username"

# --- General Settings ---
# Hub ID of the dataset to download and process. This is a fixed ID and does not
# depend on HF_USERNAME.
BASE_DATASET_ID = "Yujivus/multi30k-de-en-3k-subset"
# NOTE: LANGUAGE_PAIR and REPRODUCIBILITY_SEED are not referenced by the visible
# part of this script; they are kept for any code that may rely on them.
LANGUAGE_PAIR = "de-en"
# Checkpoint whose tokenizer is used to encode both source and target text.
MODEL_CHECKPOINT = "Helsinki-NLP/opus-mt-de-en"
REPRODUCIBILITY_SEED = 42

# --- Processing Settings ---
# Sequences longer than MAX_LENGTH tokens are truncated by the tokenizer.
MAX_LENGTH = 128
# A sample's bucket is max(input_length, labels_length) // BUCKET_WIDTH.
BUCKET_WIDTH = 4
# Upper bound used when deciding how many samples go into one pre-built batch.
TOKEN_LIMIT_PER_BATCH = 500

# --- Automatic Repository Naming (Do not touch) ---
BASE_NAME = "multi3k-de-en"
BUCKETED_REPO_NAME = f"{BASE_NAME}-bucketed-w{BUCKET_WIDTH}"
PREBATCHED_REPO_NAME = f"{BASE_NAME}-prebatched-w{BUCKET_WIDTH}"

print("--- Configuration ---")
print(f"Target Dataset: {BASE_DATASET_ID}")
print(f"Bucketed Repo to be created: {HF_USERNAME}/{BUCKETED_REPO_NAME}")
print(f"Pre-batched Repo to be created: {HF_USERNAME}/{PREBATCHED_REPO_NAME}")
print("-" * 50)


# Authenticate with the Hugging Face Hub. The token is taken from the HF_TOKEN
# environment variable instead of being pasted into the source, so it cannot leak
# when the notebook is shared. When HF_TOKEN is unset, login() receives None and
# falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

print("\n--- Step 1: Loading Raw Dataset ---")
raw_dataset = load_dataset(BASE_DATASET_ID)
print(f" Raw dataset loaded successfully: {raw_dataset}")

# --- 4. TOKENIZE, BUCKET, AND SORT ---

print("\n--- Step 2: Tokenizing, Bucketing, and Sorting ---")
# Load the tokenizer for MODEL_CHECKPOINT. It is kept at module level because
# process_function calls it for both the source ("de") and target ("en") text.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def process_function(examples):
    """Tokenize a batch of German/English pairs and assign each pair a length bucket.

    Args:
        examples: Batch with flat "de" (source) and "en" (target) columns, as used
            by Multi30k.

    Returns:
        The source tokenizer output, extended with:
        "labels" (target token ids), "input_length" and "labels_length" (token
        counts per sample), and "bucket_id" (the longer of the two lengths,
        integer-divided by BUCKET_WIDTH).
    """
    german_texts = examples["de"]
    english_texts = examples["en"]

    # No padding here: true lengths are needed for bucketing.
    encoded_source = tokenizer(german_texts, max_length=MAX_LENGTH, truncation=True, padding=False)
    encoded_target = tokenizer(text_target=english_texts, max_length=MAX_LENGTH, truncation=True, padding=False)

    label_ids = encoded_target["input_ids"]
    source_lengths = [len(token_ids) for token_ids in encoded_source["input_ids"]]
    target_lengths = [len(token_ids) for token_ids in label_ids]

    encoded_source["labels"] = label_ids
    encoded_source["input_length"] = source_lengths
    encoded_source["labels_length"] = target_lengths

    # A pair's cost is its longer side; pairs with similar cost share a bucket.
    encoded_source["bucket_id"] = [
        max(src_len, tgt_len) // BUCKET_WIDTH
        for src_len, tgt_len in zip(source_lengths, target_lengths)
    ]

    return encoded_source

# Apply process_function in batches, removing the columns found in the raw train
# split so that only the fields produced by process_function remain.
map_options = dict(
    batched=True,
    num_proc=os.cpu_count() or 2,
    remove_columns=raw_dataset["train"].column_names,
    desc="Tokenize & Bucket",
)
processed_dataset = raw_dataset.map(process_function, **map_options)

print("Sorting...")
# Only the train split is reordered by bucket; it is the only split that gets
# pre-batched afterwards.
sorted_train_split = processed_dataset['train'].sort("bucket_id")
processed_dataset['train'] = sorted_train_split

# Publish the tokenized dataset (with the bucket-sorted train split) as a public repo.
print(f" Processing complete. Uploading dataset to '{HF_USERNAME}/{BUCKETED_REPO_NAME}'...")
processed_dataset.push_to_hub(f"{HF_USERNAME}/{BUCKETED_REPO_NAME}", private=False)
print(" Bucketed dataset uploaded successfully.")

# --- 5. PRE-BATCHING ---
# Turn the bucket-sorted train split into lists of row indices. Each list is one
# batch, drawn from a single bucket and sized from TOKEN_LIMIT_PER_BATCH.

print(f"\n--- Step 3: Pre-batching and creating '{HF_USERNAME}/{PREBATCHED_REPO_NAME}' ---")
train_split_for_batching = processed_dataset['train']

bucket_ids = np.array(train_split_for_batching['bucket_id'], dtype=np.int32)
input_lengths = np.array(train_split_for_batching['input_length'], dtype=np.int32)
label_lengths = np.array(train_split_for_batching['labels_length'], dtype=np.int32)
# Per-sample cost: the longer of the source and target sequences.
lengths = np.maximum(input_lengths, label_lengths)

# The split is sorted by bucket_id, so each position where the value changes is
# the first row of a new bucket.
boundaries = np.flatnonzero(np.diff(bucket_ids)) + 1
bucket_indices_list = np.split(np.arange(len(bucket_ids)), boundaries)

all_batches = []
for bucket_indices in tqdm(bucket_indices_list, desc="Creating batches"):
    if bucket_indices.size == 0:
        continue
    max_len_in_bucket = np.max(lengths[bucket_indices])
    # Always at least one sample per batch, even if a single sequence exceeds the limit.
    samples_per_batch = int(max(1, TOKEN_LIMIT_PER_BATCH // (max_len_in_bucket + 1)))
    # Every start offset is below len(bucket_indices), so no chunk is ever empty.
    all_batches.extend(
        bucket_indices[start : start + samples_per_batch].tolist()
        for start in range(0, len(bucket_indices), samples_per_batch)
    )

prebatched_train_split = Dataset.from_dict({"batch_indices": all_batches})

print(f" Pre-batching complete. Uploading dataset to '{HF_USERNAME}/{PREBATCHED_REPO_NAME}'...")
prebatched_train_split.push_to_hub(f"{HF_USERNAME}/{PREBATCHED_REPO_NAME}", split="train", private=False)
print(" Pre-batched (train split) dataset uploaded successfully.")

# ==============================================================================
# --- FINAL RESULT ---
# ==============================================================================
# Print the two repository IDs in a form that can be pasted into a training script.
summary_rule = "="*60
summary_lines = [
    "\n" + summary_rule,
    " ALL STEPS COMPLETED SUCCESSFULLY! ",
    "\nRepository IDs to use in your training script:",
    f'PREBATCHED_REPO_ID = "{HF_USERNAME}/{PREBATCHED_REPO_NAME}"',
    f'ORIGINAL_BUCKETED_REPO_ID = "{HF_USERNAME}/{BUCKETED_REPO_NAME}"',
    summary_rule,
]
print("\n".join(summary_lines))