In [None]:
!pip install datasets
!pip install transformers
!pip install tqdm
!pip install nltk
!pip install bitsandbytes
!pip install accelerate
!pip install ijson

In [2]:
from datasets import load_dataset
from tqdm import tqdm
import json
import nltk
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import ijson

# Fetch the NLTK sentence-tokenizer resources needed by nltk.sent_tokenize.
nltk.download('punkt')
nltk.download('punkt_tab')

# Tokenizer shared by the tokenization cells below.
model_id = "NousResearch/Meta-Llama-3-8B"
# The keyword is `use_fast`; it was previously misspelled `user_fast`, so the
# intended option was never actually passed under its real name.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="right", use_fast=True)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/danjietang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/danjietang/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Load dataset and store it.

In [None]:
# Load the clean-wikipedia dataset from the Hugging Face Hub.
# No `split=` is given, so the cells below select the split with dataset["train"].
dataset = load_dataset("HuggingFaceFW/clean-wikipedia")

In [None]:
# Entire wikipedia: write one JSON-encoded article text per line.
# Rows are streamed one at a time (same pattern as the English-only cell)
# instead of indexing dataset["train"]["text"], which on `datasets` releases
# that return a plain list for a column loads every article into memory
# before the first line is written.
with open("entire_wikipedia.jsonl", 'w') as f:
    for item in dataset["train"]:
        f.write(json.dumps(item["text"]) + '\n')

In [None]:
# English wikipedia: keep only rows whose wikicode is "en" and write one
# JSON-encoded article text per line. Only the "train" split is read.
with open("english_wikipedia.jsonl", 'w') as f:
    english_texts = (
        row["text"] for row in dataset["train"] if row["wikicode"] == "en"
    )
    f.writelines(json.dumps(text) + '\n' for text in english_texts)

# Split into sentences.

In [None]:
def split_into_sentences(text: str) -> list[str]:
    """Split ``text`` into sentences using NLTK's ``sent_tokenize``.

    Args:
        text: Plain (already decoded) document text.

    Returns:
        The sentences found in ``text``, in order.
    """
    return nltk.sent_tokenize(text)

# Read one JSON-encoded article per line and write one JSON-encoded sentence per line.
with open("english_wikipedia.jsonl", 'r') as fin, open("english_wikipedia_sentences.jsonl", 'w') as fout:
    for line in tqdm(fin):
        # Each line was written with json.dumps, so decode it before splitting.
        # Feeding the raw line would leak the surrounding quotes, the literal
        # "\n" / "\uXXXX" escape sequences and the trailing newline into the
        # sentences.
        article_text = json.loads(line)
        for sentence in split_into_sentences(article_text):
            fout.write(json.dumps(sentence) + '\n')

# Tokenize training data

In [None]:
# Tokenize each line of the input file and write its input_ids as a JSON list.
# NOTE(review): this reads the per-article file, while the following cells
# treat every tokenized line as a "sentence" of 10-49 tokens — confirm whether
# "english_wikipedia_sentences.jsonl" was the intended input.
with open("english_wikipedia.jsonl", 'r') as fin, open("english_wikipedia_tokenized.jsonl", 'w') as fout:
    for line in tqdm(fin):
        # Lines are JSON-encoded strings; decode first so the tokenizer sees
        # the real text rather than quotes, escape sequences and a trailing
        # newline.
        text = json.loads(line)
        fout.write(json.dumps(tokenizer(text)["input_ids"]) + '\n')

# Count how many qualified sentences there are

In [None]:
# Length window (exclusive on both ends): min_token < length < max_token.
max_token = 49
min_token = 10

# Count the tokenized lines whose length falls inside the window; the result
# sizes the tensor allocated in the next cell.
with open("english_wikipedia_tokenized.jsonl", 'r') as file:
    counter = sum(
        1
        for line in tqdm(file)
        if min_token < len(json.loads(line)) < max_token
    )
print(counter)

# Store tokenized data into tensor

In [None]:
# Build a (num_qualifying_lines, 50) tensor: token ids + EOS, right-padded.
# Relies on `counter` computed by the counting cell above for the row count.
max_token = 49
min_token = 10
seq_len = 50
# NOTE(review): 32000 is used as the padding id (the histogram cell below
# matches on the same value). Confirm it cannot collide with a real token id
# of the tokenizer in use.
pad_token_id = 32000
shape = (counter, seq_len)
counter = 0

# int32 instead of int16: the token ids handled in this notebook go up to at
# least 128001 (the EOS id), which does not fit in int16 (max 32767).
tokenized_tensor = torch.empty(shape, dtype=torch.int32)
eos_token_id = tokenizer.eos_token_id

with open("english_wikipedia_tokenized.jsonl", 'r') as file:
    for line in tqdm(file):
        tokenized_sentence = json.loads(line)
        length = len(tokenized_sentence)
        # Same exclusive window as the counting cell, so row indices line up.
        if length < max_token and length > min_token:
            tokenized_sentence.append(eos_token_id)
            # Right-pad to the fixed row width.
            tokenized_sentence = tokenized_sentence + [pad_token_id] * (seq_len - len(tokenized_sentence))
            sentence_tokenized_tensor = torch.tensor(tokenized_sentence, dtype=torch.int32)
            tokenized_tensor[counter] = sentence_tokenized_tensor
            counter += 1

torch.save(tokenized_tensor, "llama2_wiki_50.pt")

# Split into train and eval

In [None]:
# No trim_padding
# Randomly assign 5% of the rows to eval and the rest to train, keeping the
# original row order inside each subset.
tensor = torch.load("llama2_wiki_50.pt")

eval_ratio = 0.05
n = tensor.shape[0]
n_eval = int(n * eval_ratio)

perm = torch.randperm(n)
eval_indices = perm[:n_eval]
# Fixed: this line previously read `perm[n_e val:]`, which is a syntax error.
train_indices = perm[n_eval:]

# Sort indices to preserve original order
eval_indices, _ = torch.sort(eval_indices)
train_indices, _ = torch.sort(train_indices)

# Split tensors while keeping order
eval_tensor = tensor[eval_indices]
train_tensor = tensor[train_indices]

np.save("llama2_wiki_50_train.npy", train_tensor.numpy())
np.save("llama2_wiki_50_eval.npy", eval_tensor.numpy())

# Load token embedding and store it.

In [None]:
# Load the causal LM only to extract its input token-embedding matrix.
model_id = "NousResearch/Meta-Llama-3-8B"
model = AutoModelForCausalLM.from_pretrained(model_id)

# Raw weight tensor of the embedding layer, detached from autograd.
embedding_weight = model.model.embed_tokens.weight
word_embeddings_tensor = embedding_weight.data
word_embeddings_tensor.requires_grad = False

# Rows are vocabulary entries, columns are embedding dimensions.
num_embeddings, embedding_dim = word_embeddings_tensor.shape

torch.save(word_embeddings_tensor, 'word_embeddings_tensor_llama3.pt')

# Histogram

In [None]:
import matplotlib.pyplot as plt

# Tensor written by the "Store tokenized data into tensor" cell.
tokens = torch.load("llama2_wiki_50.pt")

# Padding id used when that tensor was built.
PAD_ID = 32000

# Per-row number of padding positions, moved to NumPy for plotting.
pad_counts = (tokens == PAD_ID).sum(dim=1)
pad_counts_np = pad_counts.cpu().numpy()

# One bar per integer padding count from 0 to 64.
fig, ax = plt.subplots()
ax.hist(pad_counts_np, bins=range(65), edgecolor='black', align='left')
ax.set_title("Histogram of Padding Tokens per Sequence")
ax.set_xlabel("Number of Padding Tokens")
ax.set_ylabel("Number of Sequences")
ax.set_xlim(0, 64)
plt.show()

# Preprocess multiple languages

In [None]:
import pysbd
from datasets import load_dataset
from transformers import AutoTokenizer
import os

# 2. Define a worker class to manage segmenters
# We initialize segmenters once per process to save memory/time
class SentenceProcessor:
    """Count, per document, the sentences whose token length is acceptable.

    A sentence qualifies when it tokenizes to between 10 and 48 tokens
    inclusive (no special tokens added). Tokenization uses the notebook-level
    ``tokenizer``.
    """

    def __init__(self):
        # One pysbd segmenter per supported wikicode, created once so that
        # every batch reuses them.
        self.segs = {
            lang: pysbd.Segmenter(language=lang, clean=False)
            for lang in ("en", "es", "fr", "de", "zh")
        }

    def _count_valid(self, code, text):
        """Return the number of qualifying sentences in one document (0 on any skip)."""
        segmenter = self.segs.get(code)
        if segmenter is None:
            # Language without a segmenter: nothing to count.
            return 0

        try:
            # Segmentation can raise on some documents (observed with the
            # German rules); such a document is skipped instead of aborting
            # the whole run.
            sentences = segmenter.segment(text)
        except Exception:
            return 0

        if not sentences:
            return 0

        # Tokenize all sentences of the document in a single call.
        token_lists = tokenizer(sentences, add_special_tokens=False)["input_ids"]
        return len([ids for ids in token_lists if 10 <= len(ids) <= 48])

    def process_batch(self, batch):
        """Map a batch with "wikicode" and "text" columns to per-document counts.

        Returns:
            ``{"valid_sentence_count": [...]}`` with one integer per input row.
        """
        pairs = zip(batch["wikicode"], batch["text"])
        return {
            "valid_sentence_count": [
                self._count_valid(code, text) for code, text in pairs
            ]
        }

# 3. Execution
# Driver: count qualifying sentences over the whole dataset with a parallel map.
# The guard keeps this code from re-running if the module is imported elsewhere.
if __name__ == "__main__":
    # Load the dataset
    dataset = load_dataset("HuggingFaceFW/clean-wikipedia", split="train")

    # Initialize processor
    processor = SentenceProcessor()

    # Apply the mapping in parallel
    # num_proc should generally be the number of CPU cores you have
    # Each batch of 100 rows is replaced by a single "valid_sentence_count"
    # column holding one count per input row.
    processed_ds = dataset.map(
        processor.process_batch,
        batched=True,
        batch_size=100,
        num_proc=os.cpu_count(),
        remove_columns=dataset.column_names, # Clears memory by dropping raw text
        desc="Counting qualifying sentences"
    )

    # The result is exact because we sum the pre-calculated counts
    final_total = sum(processed_ds["valid_sentence_count"])

    print(f"\nFinal counter value: {final_total}")

In [None]:
import numpy as np
import pysbd
from tqdm import tqdm
from transformers import AutoTokenizer
from datasets import load_dataset

# Configuration: tokenizer and the single "train" split used by this cell.
model_id = "NousResearch/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
dataset = load_dataset("HuggingFaceFW/clean-wikipedia", split="train")

# One pysbd segmenter per supported wikicode, built once and looked up by
# language code inside process_batch.
segmenters = {
    lang: pysbd.Segmenter(language=lang, clean=False)
    for lang in ("en", "es", "fr", "de", "zh")
}

def process_batch(batch, min_tokens=10, max_tokens=48, eos_token_id=128001):
    """Segment, tokenize and filter one batch of documents.

    Uses the notebook-level ``segmenters`` and ``tokenizer``. The number of
    returned rows generally differs from the number of input rows, so this is
    meant for a batched map that drops the input columns.

    Args:
        batch: Mapping with parallel "text" and "wikicode" lists.
        min_tokens: Smallest accepted sentence length in tokens (inclusive).
        max_tokens: Largest accepted sentence length in tokens (inclusive),
            not counting the appended EOS. The array-building code that
            follows allocates 49 columns, i.e. the default of 48 plus EOS.
        eos_token_id: Id appended to every kept sentence.

    Returns:
        ``{"token_ids": [...]}`` with one id list (ending in ``eos_token_id``)
        per qualifying sentence.
    """
    all_filtered_ids = []

    for text, lang in zip(batch["text"], batch["wikicode"]):
        segmenter = segmenters.get(lang)
        if segmenter is None:
            # Unsupported language: skip the document.
            continue

        # 1. Segment sentences
        try:
            # Segmentation can raise on some documents (observed with the
            # German rules); skip that document rather than abort the run.
            sentences = segmenter.segment(text)
        except Exception:
            continue

        # Same guard as SentenceProcessor.process_batch: never hand an empty
        # batch to the tokenizer.
        if not sentences:
            continue

        # 2. Tokenize sentences in a sub-batch (faster than one-by-one)
        tokenized = tokenizer(sentences, add_special_tokens=False)["input_ids"]

        # 3. Keep sentences inside the length window and append EOS
        for ids in tokenized:
            if min_tokens <= len(ids) <= max_tokens:
                all_filtered_ids.append(ids + [eos_token_id])

    return {"token_ids": all_filtered_ids}

# This cell re-imports everything else it needs but previously relied on an
# earlier cell for `os`; import it here so os.cpu_count() works on its own.
import os

# Use .map with multiple processes
# Adjust num_proc based on your Mac's core count (e.g., 8 or 10)
# process_batch returns one row per qualifying sentence, so the input columns
# are dropped and the output length differs from the input length.
processed_ds = dataset.map(
    process_batch,
    batched=True,
    batch_size=1000,
    num_proc=os.cpu_count(),
    remove_columns=dataset.column_names,
    desc="Segmenting and Tokenizing"
)

# Convert to final numpy array efficiently
print("Creating final numpy array...")
rows = len(processed_ds)
# Longest kept sentence is 48 tokens plus the appended EOS.
cols = 49
# Using 128002 as padding value: every cell starts as padding and the leading
# positions of each row are then overwritten with the real ids.
final_array = np.full((rows, cols), 128002, dtype=np.int32)

for i, entry in enumerate(tqdm(processed_ds["token_ids"], desc="Filling Array")):
    length = len(entry)
    final_array[i, :length] = entry

np.save("languages_tokenized.npy", final_array)

In [None]:
# Load the numpy array
# Use mmap_mode="r" if the file is very large
data = np.load("languages_tokenized.npy")

eval_ratio = 0.05
n = data.shape[0]
n_eval = int(n * eval_ratio)

# Caps on how many rows are written to each output file.
max_train_rows = 20000000
max_eval_rows = 2000000

# Generate shuffled indices (left in shuffled order; they are not sorted here)
perm = np.random.permutation(n)
# Truncate the index arrays BEFORE gathering rows. The saved arrays are the
# same as gathering everything and slicing afterwards, but only the rows that
# are actually written get copied out of `data`.
eval_indices = perm[:n_eval][:max_eval_rows]
train_indices = perm[n_eval:][:max_train_rows]

# Gather the selected rows
eval_data = data[eval_indices]
train_data = data[train_indices]

# Save the results
np.save("languages_tokenized_50_train.npy", train_data)
np.save("languages_tokenized_50_eval.npy", eval_data)

In [None]:
# Extract the model's input token-embedding matrix and save it to disk.
# NOTE(review): the "Load token embedding and store it" cell earlier in this
# notebook loads the same model and writes the same output file — confirm
# whether this repeat is still needed.
model_id = "NousResearch/Meta-Llama-3-8B"

model = AutoModelForCausalLM.from_pretrained(model_id)

# Access the embedding matrix
word_embeddings_tensor = model.model.embed_tokens.weight.data

# Make sure the saved tensor does not require gradients
word_embeddings_tensor.requires_grad = False

torch.save(word_embeddings_tensor, 'word_embeddings_tensor_llama3.pt')

# Given numpy array, add in image text training pairs

In [3]:
# Load the full tokenized array written by the multi-language preprocessing cell.
data = np.load("languages_tokenized.npy")

In [4]:
# Show (number of rows, row width) of the loaded array.
print(data.shape)

(292951138, 49)


# Make the numpy file smaller by filtering out unused token indices

In [None]:
import numpy as np
from numba import njit, prange

# 1. SCAN: Find which tokens are actually present
@njit(parallel=True)
def get_used_tokens_mask(data, max_token_id=128002):
    """Flag which token ids occur anywhere in a 2-D id array.

    Args:
        data: 2-D array of token ids (indexed as data[i, j]).
        max_token_id: Largest id that is tracked; larger values are ignored.

    Returns:
        uint8 array of length ``max_token_id + 1`` where entry ``t`` is 1 if
        ``t`` appears in ``data`` and 0 otherwise.
    """
    # Create a small bitmask (128KB)
    flags = np.zeros(max_token_id + 1, dtype=np.uint8)
    rows = data.shape[0]
    cols = data.shape[1]
    
    # Rows are iterated with prange; every write stores the same value (1).
    for i in prange(rows):
        for j in range(cols):
            token = data[i, j]
            # NOTE(review): only the upper bound is checked — assumes ids are
            # never negative; confirm.
            if token <= max_token_id:
                flags[token] = 1
    return flags

# 2. MAP: Create the translation dictionary (Old ID -> New ID)
def create_lookup_table(used_flags):
    """Build the old-id -> new-id table from a used-token flag array.

    Ids whose flag equals 1 are numbered consecutively from 0 in ascending
    order of their old id; every other id maps to -1.

    Args:
        used_flags: 1-D array of flags, one per old token id.

    Returns:
        ``(lookup, n_used)`` where ``lookup`` is an int32 array with the same
        length as ``used_flags`` and ``n_used`` is the number of used ids.
    """
    is_used = np.asarray(used_flags) == 1
    n_used = int(np.count_nonzero(is_used))

    # Start with every id marked as unused, then number the used ones in order.
    lookup = np.full_like(used_flags, -1, dtype=np.int32)
    lookup[is_used] = np.arange(n_used, dtype=np.int32)

    return lookup, n_used

# 3. APPLY: Overwrite the 54GB array in-place
@njit(parallel=True)
def apply_reindexing_inplace(data, lookup):
    """Replace every id in ``data`` with ``lookup[id]``, modifying ``data`` itself.

    Args:
        data: 2-D array of old token ids; overwritten with the new ids.
        lookup: 1-D table indexed by old id that gives the new id.

    Returns:
        None. The result is written into ``data``.
    """
    rows = data.shape[0]
    cols = data.shape[1]
    
    for i in prange(rows):
        for j in range(cols):
            old_val = data[i, j]
            # NOTE(review): assumes 0 <= old_val < len(lookup) for every
            # element — confirm against how `lookup` was built.
            # Directly overwrite the memory address
            data[i, j] = lookup[old_val]

# Driver for the three steps above: scan -> build table -> re-index -> save.
print("Step 1: Scanning for used tokens...")
languages_tokenized_array = np.load("languages_tokenized.npy")
used_mask = get_used_tokens_mask(languages_tokenized_array)

print("Step 2: Creating re-indexing map...")
mapping_table, total_unique = create_lookup_table(used_mask)
print(f"Original Range: 0-128002 | New Range: 0-{total_unique - 1}")
# Persist the old-id -> new-id table so the compact ids can be interpreted later.
np.save("mapping_table.npy", mapping_table)

print("Step 3: Applying transformation in-place (no extra RAM used)...")
# Mutates the in-memory array; the input .npy file on disk is not touched.
apply_reindexing_inplace(languages_tokenized_array, mapping_table)

print("Success! Your array has been compacted.")
# The re-indexed array is written to a separate file.
np.save("languages_tokenized_sliced.npy", languages_tokenized_array)

In [None]:
# Load the re-indexed array
# Use mmap_mode="r" if the file is very large
data = np.load("languages_tokenized_sliced.npy")

# 5% of the rows go to eval, the remainder to train.
eval_ratio = 0.05
n = data.shape[0]
n_eval = int(n * eval_ratio)

# Random assignment of rows to the two subsets; each index set is then put
# back into ascending order so rows keep their original relative order.
perm = np.random.permutation(n)
eval_indices = np.sort(perm[:n_eval])
train_indices = np.sort(perm[n_eval:])

# Gather the rows of each subset
eval_data = data[eval_indices]
train_data = data[train_indices]

# Save the results
np.save("languages_tokenized_50_train_sliced.npy", train_data)
np.save("languages_tokenized_50_eval_sliced.npy", eval_data)

In [None]:
import torch

# 1. Get the indices of the tokens you actually kept
# 'used_mask' is the uint8 flag array (1 = token present) returned by
# get_used_tokens_mask in the re-indexing cell; np.where picks its nonzero
# positions. This cell therefore needs that cell to have run in this kernel,
# and also relies on `model_id` and `AutoModelForCausalLM` from earlier cells.
used_indices = np.where(used_mask)[0]

# 2. Convert to a torch tensor
indices_to_keep = torch.tensor(used_indices, dtype=torch.long)

# 3. Slice the embedding matrix
# This selects only the rows we need, in the order of our new IDs
# (used_indices is ascending, matching how create_lookup_table numbers ids)
model = AutoModelForCausalLM.from_pretrained(model_id)
word_embeddings_tensor = model.model.embed_tokens.weight.data
pruned_embeddings = word_embeddings_tensor[indices_to_keep]

print(f"Original shape: {word_embeddings_tensor.shape}")
print(f"Pruned shape:   {pruned_embeddings.shape}")

# 4. Save the pruned version
torch.save(pruned_embeddings, 'word_embeddings_llama3_sliced.pt')