# Tokenizer for LLaMA 2

This script handles dataset processing, cleaning, tokenization, and saving tokenized data.
It is designed to work with the SentencePiece tokenizer and PyTorch.

In [None]:
import argparse
import glob
import json
import os
import random
from typing import List
from concurrent.futures import ProcessPoolExecutor
from functools import partial

import numpy as np
import requests
import sentencepiece as spm
import torch
import torch.distributed as dist
from tqdm import tqdm

from tokenizers import Tokenizer

# Authenticate Hugging Face CLI

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
The token `keras-gsoc` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `keras-gsoc`


# Load dataset from Hugging Face

In [None]:
# `load_dataset` comes from the Hugging Face `datasets` library.
from datasets import load_dataset

# The returned object is indexed by split name further down (ds['train']).
ds = load_dataset("codeparrot/codeparrot-valid-near-deduplication")


Repo card metadata block was not found. Setting CardData to empty.


# Convert dataset into a Pandas DataFrame

In [None]:
import pandas as pd
# Materialize the 'train' split as a DataFrame so the cleaning steps can use the pandas API.
df = pd.DataFrame(ds['train'])

In [None]:
df.head()

Unnamed: 0,repo_name,path,copies,size,content,license,hash,line_mean,line_max,alpha_frac,autogenerated
0,pansapiens/mytardis,tardis/apps/mx_views/views.py,3,2892,from django.conf import settings\nfrom django....,bsd-3-clause,-8726488663588781404,37.052632,79,0.65491,False
1,twidi/pytyrant,pytyrant.py,1,14361,"""""""Pure python implementation of the binary To...",mit,-5985833604781467244,25.110909,114,0.572871,False
2,HonzaKral/curator,test_curator/integration/test_time_based.py,1,1872,"from datetime import datetime, timedelta\n\nim...",apache-2.0,-1606032251548790876,40.6,91,0.634615,False
3,chaubold/hytra,tests/core/test_conflictingsegmentations.py,1,5839,"from __future__ import print_function, absolut...",mit,-4230047409174636003,43.572519,125,0.679911,False
4,makelove/OpenCV-Python-Tutorial,ch21-轮廓Contours/21-findContour.py,1,1096,# -*- coding: utf-8 -*-\n\nimport numpy as np\...,mit,-2430169206289489120,31.8,113,0.703252,False


In [None]:
df.shape

(110960, 11)

# --------------------------------
# Cleaning the dataset
# --------------------------------

In [None]:
# Drop the per-file metadata columns in place; repo_name, path and content are kept.
df.drop(columns=['copies','size',"license","hash","line_mean","line_max","alpha_frac","autogenerated"], inplace=True)

## We can extract the file name and repo name and keep them as keywords in our dataset

# Extracting repository name and file path as keywords

In [None]:
def extract_repo_name(string):
    """Return a repository name with its leading owner segment removed.

    Everything up to and including the first '/' is dropped; any further
    '/' characters are removed, so the remaining segments are concatenated
    without a separator (e.g. "pansapiens/mytardis" -> "mytardis").

    A name that contains no '/' has no owner segment and is returned
    unchanged rather than collapsing to an empty string, which is the same
    way extract_path_name treats slash-free input.
    """
    _owner, slash, remainder = string.partition('/')
    if not slash:
        # No owner prefix present: keep the name instead of returning "".
        return string
    return remainder.replace('/', '')
# Overwrite the column with the result of extract_repo_name for every row.
df['repo_name'] = df['repo_name'].apply(extract_repo_name)
def extract_path_name(string):
    """Return a file path with its first directory segment removed.

    A path without any '/' is returned unchanged. Otherwise everything up to
    and including the first '/' is dropped and the remaining '/' characters
    are removed, so the later segments are concatenated with no separator
    (e.g. "tardis/apps/mx_views/views.py" -> "appsmx_viewsviews.py").
    """
    _first, slash, tail = string.partition('/')
    if slash:
        return tail.replace('/', '')
    return string
# Overwrite the column with the result of extract_path_name for every row.
df['path'] = df['path'].apply(extract_path_name)

In [None]:
df.head(10)

Unnamed: 0,repo_name,path,content
0,mytardis,appsmx_viewsviews.py,from django.conf import settings\nfrom django....
1,pytyrant,pytyrant.py,"""""""Pure python implementation of the binary To..."
2,curator,integrationtest_time_based.py,"from datetime import datetime, timedelta\n\nim..."
3,hytra,coretest_conflictingsegmentations.py,"from __future__ import print_function, absolut..."
4,OpenCV-Python-Tutorial,21-findContour.py,# -*- coding: utf-8 -*-\n\nimport numpy as np\...
5,codetransformer,teststest_code.py,from dis import dis\nfrom io import StringIO\n...
6,azure-sdk-for-python,resourcesazure-mgmt-resourceazuremgmtresourcel...,# coding=utf-8\n# ----------------------------...
7,sourcer,test_salesforce.py,from sourcer import Grammar\n\n# This is work ...
8,timtools,sdocfeeders.py,## Copyright 2003-2009 Luc Saffre\n## This fil...
9,zulip,viewshome.py,from __future__ import absolute_import\nfrom t...


In [None]:
df.isnull().sum()

Unnamed: 0,0
repo_name,0
path,0
content,0


# --------------------------------
# Save dataset as JSON batches
# --------------------------------

In [None]:
import pandas as pd
import json

def process_and_dump_json(df, output_folder='Dataset', batch_size=2000):
    """Write ``df`` into ``output_folder`` as numbered JSON batch files.

    The rows are cut into consecutive slices of at most ``batch_size`` rows.
    Each slice is serialized as a list of row dictionaries into
    ``batch_<k>.json`` (k starting at 1). The folder is created if it does
    not exist, and one progress line is printed per file.
    """
    os.makedirs(output_folder, exist_ok=True)
    total_rows = len(df)
    # Ceiling division: a trailing partial batch still gets its own file.
    batch_count = (total_rows + batch_size - 1) // batch_size

    for number in range(1, batch_count + 1):
        first = (number - 1) * batch_size
        last = min(number * batch_size, total_rows)
        records = df.iloc[first:last].to_dict(orient='records')

        filename = os.path.join(output_folder, f"batch_{number}.json")
        with open(filename, 'w') as f:
            # indent=4 keeps the dumped shards human-readable.
            json.dump(records, f, indent=4)

        print(f"Batch {number} dumped to {filename}")

# Dump df with the function's default arguments.
process_and_dump_json(df)

# Let's convert the dataset to .json files for tokenization

In [None]:
import pandas as pd
import json

def process_and_dump_json(df, batch_size=2000):
    """Write ``df`` as numbered JSON batch files in the current working directory.

    The rows are cut into consecutive slices of at most ``batch_size`` rows.
    Each slice is serialized as a list of row dictionaries into
    ``batch_<k>.json`` (k starting at 1), and one progress line is printed
    per file.

    NOTE(review): this definition reuses the name of the earlier
    folder-aware function and has no ``output_folder`` parameter.
    """
    row_total = len(df)
    # Round up so that leftover rows form a final, shorter batch.
    batch_total = (row_total + batch_size - 1) // batch_size

    for idx in range(batch_total):
        lo = idx * batch_size
        hi = min(lo + batch_size, row_total)
        records = df.iloc[lo:hi].to_dict(orient='records')

        filename = f"batch_{idx + 1}.json"
        with open(filename, 'w') as f:
            json.dump(records, f, indent=4)

        print(f"Batch {idx + 1} dumped to {filename}")

# Dump df again using whichever process_and_dump_json definition is currently bound.
process_and_dump_json(df)

# --------------------------------
# Zip and download dataset
# --------------------------------

In [None]:
import shutil

# Zip the contents of the 'Dataset' folder; make_archive appends '.zip' to the base name.
shutil.make_archive('Dataset', 'zip', 'Dataset')

'/content/Dataset.zip'

In [None]:
# google.colab is provided by the Colab runtime.
from google.colab import files

# Hand the archive to the browser for download.
files.download('Dataset.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# --------------------------------
# Tokenization using [Sentencepiece](https://github.com/google/sentencepiece)
# --------------------------------

In [None]:
!unzip Dataset.zip -d Dataset

In [None]:
# Root directory holding the *.json shards; tokenizer files and tokenized .bin output are written beneath it.
DATA_CACHE_DIR = "Dataset"

def train_vocab(vocab_size, num_shards=10):
    """
    Trains a custom sentencepiece BPE tokenizer on the JSON shards in DATA_CACHE_DIR.

    The "content" field of every record in the first `num_shards` shards
    (sorted by filename) is written, one stripped example at a time, to the
    temporary file DATA_CACHE_DIR/tiny.txt, which is then fed to the
    sentencepiece trainer. The trainer output uses the file prefix
    DATA_CACHE_DIR/tok{N}, where N is the vocab size. After training the
    user is asked on stdin whether the temporary file should be deleted.

    Args:
        vocab_size: number of pieces in the trained vocabulary; must be positive.
        num_shards: how many shards to read for vocab training, kept low by
            default for usability in colab.

    Raises:
        AssertionError: if vocab_size is not positive.
        FileNotFoundError: if DATA_CACHE_DIR contains no *.json shards.
    """
    assert vocab_size > 0, "Vocab size must be positive"

    # output file prefix path for sentencepiece
    prefix = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")

    # 1) export a large chunk of text as a single text file tiny.txt
    tiny_file = os.path.join(DATA_CACHE_DIR, "tiny.txt")
    data_dir = os.path.join(DATA_CACHE_DIR)
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
    if not shard_filenames:
        # Fail early with a clear message instead of handing the trainer an empty corpus.
        raise FileNotFoundError(f"No .json shards found in {data_dir}")
    selected_shards = shard_filenames[:num_shards]

    # Report the number of shards actually read, which may be fewer than requested.
    print(f"Writing temporary file {tiny_file} with {len(selected_shards)} shards...")
    with open(tiny_file, "w", encoding="utf-8") as of:
        for shard in tqdm(selected_shards):
            # Read with an explicit encoding so the result does not depend on the platform default.
            with open(shard, "r", encoding="utf-8") as f:
                data = json.load(f)
            for example in data:
                text = example["content"]
                text = text.strip()
                of.write(text + "\n")
    print(f"Size is: {os.path.getsize(tiny_file) / 1024 / 1024:.2f} MB")

    # 2) train the sentencepiece model
    print("Will now train the vocab...")
    spm.SentencePieceTrainer.train(input=tiny_file,
                                   model_prefix=prefix,
                                   model_type="bpe",
                                   vocab_size=vocab_size,
                                   self_test_sample_size=0,
                                   input_format="text",
                                   character_coverage=1.0,
                                   num_threads=os.cpu_count(),
                                   split_digits=True,
                                   allow_whitespace_only_pieces=True,
                                   byte_fallback=True,
                                   unk_surface=r" \342\201\207 ",
                                   normalization_rule_name="identity")

    # 3) optional cleanup, ask the user if they'd like to delete tiny.txt
    dec = input(f"Delete the temporary file {tiny_file}? [y/N] ")
    if dec.lower() == "y":
        os.remove(tiny_file)
        print(f"Deleted {tiny_file}")

    print(f"Trained tokenizer is in {prefix}.model")
    print("Done.")


def process_shard(args, vocab_size):
    """Tokenize one JSON shard and write the token ids to a .bin file.

    Args:
        args: a ``(shard_id, shard)`` pair; ``shard_id`` selects the tqdm
            progress-bar row and ``shard`` is the path of the JSON file.
        vocab_size: selects the tokenizer model via get_tokenizer_model_path
            and the output location. 0 writes the .bin next to the .json;
            any other value writes into DATA_CACHE_DIR/tok{vocab_size}.

    Every example's stripped "content" is encoded with a BOS token and no
    EOS token. All ids are concatenated and stored as raw uint16 values.

    NOTE(review): ``Tokenizer`` is called here as ``Tokenizer(model_path)``
    with ``encode(text, bos=..., eos=...)``; confirm that the imported
    ``Tokenizer`` class actually provides this interface.
    """
    shard_id, shard = args
    enc = Tokenizer(get_tokenizer_model_path(vocab_size))
    with open(shard, "r") as f:
        examples = json.load(f)

    token_stream = []
    for example in tqdm(examples, position=shard_id):
        stripped = example["content"].strip()
        token_stream.extend(enc.encode(stripped, bos=True, eos=False))
    # ids are stored as unsigned 16-bit integers
    token_array = np.array(token_stream, dtype=np.uint16)

    if vocab_size != 0:
        # custom vocab: .bin files go into the tok{N} directory
        bin_name = os.path.basename(shard).replace(".json", ".bin")
        tokenized_filename = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}", bin_name)
    else:
        # Llama 2 vocab: keep the .bin right beside its .json shard
        tokenized_filename = shard.replace(".json", ".bin")

    with open(tokenized_filename, "wb") as f:
        f.write(token_array.tobytes())

    # every example starts with BOS (id 1), so counting BOS counts sequences
    bos_count = (token_array == 1).sum()
    avg_seq_len = token_array.size / bos_count
    print(f"Saved {tokenized_filename}, average seqlen: {avg_seq_len:.2f}")

    
# --------------------------------
# Process dataset for tokenization
# --------------------------------

In [None]:
def pretokenize(vocab_size):
    """Tokenize every *.json shard in DATA_CACHE_DIR using a process pool.

    Args:
        vocab_size: passed through to process_shard. When it is greater than
            0, the DATA_CACHE_DIR/tok{vocab_size} output directory is created
            here, once, before the workers start.

    Raises:
        Any exception raised by process_shard in a worker process is
        re-raised here instead of being silently discarded.
    """
    # iterate the shards and tokenize all of them one by one
    data_dir = os.path.join(DATA_CACHE_DIR)
    shard_filenames = sorted(glob.glob(os.path.join(data_dir, "*.json")))
    if vocab_size > 0:
        # .bin files will be saved into tok{N} directory, create it once here
        bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}")
        os.makedirs(bin_dir, exist_ok=True)

    # process all the shards in a process pool
    fun = partial(process_shard, vocab_size=vocab_size)
    with ProcessPoolExecutor() as executor:
        # executor.map only re-raises a worker's exception when that result is
        # pulled from the returned iterator, so drain it; otherwise a failed
        # shard would go unnoticed and "Done." would still be printed.
        for _ in executor.map(fun, enumerate(shard_filenames)):
            pass
    print("Done.")

# --------------------------------
# PyTorch Dataset for Pretokenized Data
# --------------------------------

In [None]:
class PretokDataset(torch.utils.data.IterableDataset):
    """Loads pretokenized examples from disk and yields them as PyTorch tensors.

    Iteration never terminates: the selected shards are reshuffled and
    replayed forever. Each item is an ``(x, y)`` pair of int64 tensors of
    length ``max_seq_len``, where ``y`` is ``x`` shifted one token ahead.

    Args:
        split: "train" uses every shard except the first (in sorted order);
            any other value uses only the first shard.
        max_seq_len: number of tokens in each yielded tensor.
        vocab_size: used to locate the tok{vocab_size} directory when
            ``vocab_source`` is "custom".
        vocab_source: "llama2" (the .bin files are in DATA_CACHE_DIR) or
            "custom" (the .bin files are in DATA_CACHE_DIR/tok{vocab_size}).
    """

    def __init__(self, split, max_seq_len, vocab_size, vocab_source):
        super().__init__()
        self.split = split
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.vocab_source = vocab_source

    def __iter__(self):
        """Yield (x, y) training pairs forever.

        Raises:
            ValueError: if ``vocab_source`` is neither "llama2" nor "custom".
            AssertionError: if no shard is available for the split, or a
                shard is too small to produce a single full sequence.
        """
        # get worker info within a DataLoader
        worker_info = torch.utils.data.get_worker_info()
        worker_id = worker_info.id if worker_info else 0
        # get DDP rank info
        rank = dist.get_rank() if dist.is_initialized() else 0
        # combine the worker_id and worker_rank to create a unique seed for rng
        seed = 42 + worker_id + 1337 * rank
        rng = random.Random(seed)
        print(f"Created a PretokDataset with rng seed {seed}")
        if self.vocab_source == "llama2":
            # the .bin files are right along the .json files
            bin_dir = os.path.join(DATA_CACHE_DIR)
            shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
        elif self.vocab_source == "custom":
            # the .bin files are in tok{N} directory
            bin_dir = os.path.join(DATA_CACHE_DIR, f"tok{self.vocab_size}")
            shard_filenames = sorted(glob.glob(os.path.join(bin_dir, "*.bin")))
        else:
            # Without this branch an unknown source falls through and fails
            # below with an UnboundLocalError that hides the real mistake.
            raise ValueError(
                f"Unknown vocab_source {self.vocab_source!r}, expected 'llama2' or 'custom'"
            )
        # train/test split. let's use only shard 0 for test split, rest train
        shard_filenames = shard_filenames[1:] if self.split == "train" else shard_filenames[:1]
        assert len(shard_filenames)>0, f"No bin files found in {bin_dir}"
        while True:
            rng.shuffle(shard_filenames)
            for shard in shard_filenames:
                # open the dataset for reading but keep it on disk with memmap
                m = np.memmap(shard, dtype=np.uint16, mode="r")
                num_batches = len(m) // self.max_seq_len
                num_batches -= 1  # drop the last partial batch
                assert num_batches > 0, "this shard is way too small? investigate."
                ixs = list(range(num_batches))
                rng.shuffle(ixs)
                for ix in ixs:
                    start = ix * self.max_seq_len
                    # one extra token so that y can be x shifted by one
                    end = start + self.max_seq_len + 1
                    # calling .astype will copy the data into a new numpy array, now in RAM
                    chunk = torch.from_numpy((m[start:end]).astype(np.int64))
                    x = chunk[:-1]
                    y = chunk[1:]
                    yield x, y

# -----------------------------------------------------------------------------
# public interface functions

def get_tokenizer_model_path(vocab_size):
    """
    Maps a vocab size to the path of its sentencepiece tokenizer model.

    A vocab_size of 0 stands for the default Llama 2 tokenizer and yields
    None. Any other value yields DATA_CACHE_DIR/tok{vocab_size}.model.
    """
    if vocab_size != 0:
        return os.path.join(DATA_CACHE_DIR, f"tok{vocab_size}.model")
    return None

class Task:
    """Namespace for the batch iterator built on top of PretokDataset."""

    @staticmethod
    def iter_batches(batch_size, device, num_workers=0, **dataset_kwargs):
        """Yield (x, y) batches from a PretokDataset, moved to ``device``.

        ``dataset_kwargs`` are forwarded unchanged to the PretokDataset
        constructor. Batches are produced by a DataLoader with pinned
        memory, and both tensors are transferred with ``non_blocking=True``.
        """
        dataset = PretokDataset(**dataset_kwargs)
        loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            pin_memory=True,
            num_workers=num_workers,
        )
        for inputs, targets in loader:
            inputs = inputs.to(device, non_blocking=True)
            targets = targets.to(device, non_blocking=True)
            yield inputs, targets

# Run tokenizer training

In [None]:
# Train a tokenizer with a vocabulary of 4703 pieces.
train_vocab(4703)

Writing temporary file Dataset/tiny.txt with 10 shards...


100%|██████████| 10/10 [00:01<00:00,  7.95it/s]


Size is: 183.39 MB
Will now train the vocab...
Delete the temporary file Dataset/tiny.txt? [y/N] y
Deleted Dataset/tiny.txt
Trained tokenizer is in Dataset/tok4703.model
Done.
