In [1]:
!pip install huggingface-hub         # if not already installed



In [3]:
from huggingface_hub import notebook_login
notebook_login()  # prompts you to paste the token interactively

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
pip install transformers sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Install necessary libraries (uncomment if not already installed)
# !pip install datasets transformers

from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
import random

# 1. Load the full arXiv summarization dataset
raw_datasets = load_dataset("ccdv/arxiv-summarization")

# 2. Select a random subset of 5,000 samples from the 'train' split
#    (you can also sample across all splits if preferred)
train_full = raw_datasets["train"]
random.seed(42)
indices = random.sample(range(len(train_full)), 5_000)
subset = train_full.select(indices)

# 3. Extract input/output pairs
#    - 'article' is the input
#    - 'abstract' is the target summary
subset = subset.map(lambda ex: {"input_text": ex["article"], "target_text": ex["abstract"]},
                    remove_columns=[col for col in subset.column_names if col not in ["input_text","target_text"]])

# 4. Load your base model’s tokenizer (Llama 3.2 1B)
#    Replace 'path/to/llama-3.2-1b-tokenizer' with the actual model ID or local path
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1b", use_fast=True)

# 5. Tokenization function
def tokenize_fn(examples):
    # Tokenize inputs (articles) and targets (abstracts) separately
    model_inputs = tokenizer(examples["input_text"], truncation=True, padding="max_length", max_length=1024)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["target_text"], truncation=True, padding="max_length", max_length=256)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization
tokenized_ds = subset.map(tokenize_fn, batched=True, remove_columns=subset.column_names)

# 6. Split into train/validation/test (80/10/10)
split_ds = tokenized_ds.train_test_split(test_size=0.20, seed=42)
# From the 20% held out, split into val and test each 50% of that 20%
test_valid = split_ds["test"].train_test_split(test_size=0.50, seed=42)

dataset_dict = DatasetDict({
    "train": split_ds["train"],
    "validation": test_valid["train"],
    "test": test_valid["test"]
})

# 7. Quick sanity check
print(dataset_dict)
print(dataset_dict["train"][0])

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [21]:
# randomly shuffle and select first 5000 samples
small = raw.shuffle(seed=42).select(range(5000))
print(small)

Dataset({
    features: ['article', 'abstract'],
    num_rows: 5000
})


In [22]:
# preview one sample
print(small[0]["article"][:200], "...")   # input
print(small[0]["abstract"][:200], "...")  # target

arp  220 is the nearest ( @xmath3 77  mpc ) example of an ultraluminous infrared galaxy ( ulirg ) that supports star formation at extreme levels . 
 it contains two nuclei separated by 350  pc , both  ...
the cores of arp 220 , the closest ultraluminous infrared starburst galaxy , provide an opportunity to study interactions of cosmic rays under extreme conditions . in this paper , we model the populat ...


In [28]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-1B",
    use_auth_token=True,       # uses your cached token
    trust_remote_code=True     # allows repo’s custom tokenizer code to run
)

OSError: Can't load tokenizer for 'meta-llama/Llama-3.2-1B'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'meta-llama/Llama-3.2-1B' is the correct path to a directory containing all relevant files for a LlamaTokenizerFast tokenizer.

In [10]:
max_input_length  = 2048
max_target_length = 512

def preprocess(example):
    # tokenize article
    inputs = tokenizer(
        example["article"],
        max_length=max_input_length,
        truncation=True,
        padding="max_length",
    )
    # tokenize summary
    targets = tokenizer(
        example["abstract"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )
    return {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "labels": targets.input_ids,
    }

tokenized = small.map(
    preprocess,
    batched=True,
    remove_columns=small.column_names,  # drop raw text columns
)
print(tokenized)


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

NameError: name 'tokenizer' is not defined

In [None]:
# 20% for validation+test, 80% for training
split1 = tokenized.train_test_split(test_size=0.20, seed=42)
train_ds = split1["train"]
val_test_ds = split1["test"]

# split the 20% into 10% val, 10% test
split2 = val_test_ds.train_test_split(test_size=0.50, seed=42)
val_ds = split2["train"]
test_ds = split2["test"]

print(f"Train: {len(train_ds)} examples")
print(f"Validation: {len(val_ds)} examples")
print(f"Test: {len(test_ds)} examples")


In [None]:
train_ds.save_to_disk("arxiv_5k_train")
val_ds.save_to_disk("arxiv_5k_val")
test_ds.save_to_disk("arxiv_5k_test")