In [None]:
!pip install datasets
!pip install transformers
!pip install tqdm
!pip install nltk
!pip install bitsandbytes
!pip install accelerate

In [None]:
from datasets import load_dataset
from tqdm import tqdm
import json
import nltk
from transformers import AutoTokenizer
from joblib import Parallel, delayed
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Fetch the Punkt sentence-tokenizer data used by nltk.sent_tokenize.
# Recent NLTK releases look the model up under "punkt_tab" rather than "punkt",
# so request both; by default nltk.download() reports an unknown id instead of raising.
nltk.download('punkt')
nltk.download('punkt_tab')

# NOTE(review): later cells hard-code token ids 128001/128002 and write files named
# "llama3_*", which does not look consistent with a Llama-2 checkpoint — confirm
# which model this notebook is meant to target.
model_id = "NousResearch/Llama-2-7b-hf"

# `use_fast` is the keyword AutoTokenizer.from_pretrained recognises for selecting
# the fast tokenizer implementation (the original spelling `user_fast` was a typo).
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="right", use_fast=True)

# Load dataset and store it.

In [None]:
# Load the "20220301.en" configuration of the "wikipedia" dataset and pull the
# "text" column out of its train split.
dataset = load_dataset("wikipedia", "20220301.en", trust_remote_code=True)
train_split = dataset["train"]
training_data = train_split["text"]

# Cache the raw article texts on disk so later cells can start from this file.
with open("training_data.json", mode='w') as out_file:
    json.dump(training_data, out_file, indent=4)

# Split into sentences.

In [None]:
# Reload the article texts cached by the dataset-loading cell.
with open("training_data.json", mode='r') as in_file:
    training_data = json.load(in_file)

def split_into_sentences(text: str) -> list[str]:
    """Split one document into sentences with NLTK's ``sent_tokenize``.

    Args:
        text: The document text to split.

    Returns:
        The sentences reported by ``nltk.sent_tokenize`` for ``text``.
    """
    sentences = nltk.sent_tokenize(text)
    return sentences

# Sentence-split every article in joblib worker processes.
# n_jobs=-1 lets joblib use all available CPUs instead of a hard-coded 64 workers
# (which oversubscribes smaller machines) and matches the other Parallel calls
# in this notebook.
training_data = Parallel(n_jobs=-1)(
    delayed(split_into_sentences)(webpage) for webpage in tqdm(training_data, desc="Processing")
)

# Persist the per-article sentence lists; the tokenization cell reads this file.
with open("sentence_training_data.json", 'w') as file:
    json.dump(training_data, file, indent=4)

# Tokenize training data

In [None]:
# Reload the per-article sentence lists written by the sentence-splitting cell.
with open("sentence_training_data.json", mode='r') as in_file:
    training_data = json.load(in_file)

def tokenize_function(sentences: list[str]) -> list[list[int]]:
    """Replace each sentence in ``sentences`` with its token ids, in place.

    Args:
        sentences: Sentences to tokenize. The list itself is mutated: every
            entry is overwritten with the tokenizer's ``"input_ids"`` for it.

    Returns:
        The same list object that was passed in, now holding the id lists.
    """
    for position, sentence in enumerate(sentences):
        encoded = tokenizer(sentence)
        sentences[position] = encoded["input_ids"]
    return sentences

# Tokenize sequentially in this process (the original author notes that this step
# cannot be executed in parallel). Each document's sentence list is overwritten
# in place with the corresponding lists of token ids.
for document in tqdm(training_data):
    document[:] = [tokenizer(sentence)["input_ids"] for sentence in document]

# Persist the token ids for the filtering cell.
with open("tokenized_training_data.json", mode='w') as out_file:
    json.dump(training_data, out_file, indent=4)

# Filter out sentences that reach or exceed the token limit, then flatten the list.

In [None]:
# Token-count limit used by this cell's filtering step.
max_token = 64

def process_sublist(sublist, max_token):
    """Return the entries of ``sublist`` whose length is below ``max_token``.

    Args:
        sublist: Iterable of sized items (here: lists of token ids).
        max_token: Exclusive upper bound on ``len(item)``.

    Returns:
        A new list with the qualifying items in their original order.
    """
    kept = []
    for tokens in sublist:
        if len(tokens) < max_token:
            kept.append(tokens)
    return kept

# Read the per-document lists of token-id sequences written by the tokenization cell.
with open("tokenized_training_data.json", mode='r') as in_file:
    training_data = json.load(in_file)

# Filter each document in a joblib worker pool; n_jobs=-1 requests all available CPUs.
filter_jobs = (delayed(process_sublist)(sublist, max_token) for sublist in tqdm(training_data))
processed_training_data = Parallel(n_jobs=-1)(filter_jobs)

# Collapse the per-document lists into one flat list of sequences.
flat_training_data = []
for kept_sequences in processed_training_data:
    flat_training_data.extend(kept_sequences)

with open("flatten_data.json", mode='w') as out_file:
    json.dump(flat_training_data, out_file)

# Append the EOS token and pad every sequence to a fixed length.

In [None]:
max_token = 64

# Reload the flat list of token-id sequences written by the filtering cell.
with open("flatten_data.json", mode='r') as in_file:
    flat_training_data = json.load(in_file)

# Terminate every sequence with token id 128001 (mutates the lists in place).
# NOTE(review): 128001 is presumably the end-of-text id of the intended tokenizer —
# confirm it matches the checkpoint named in `model_id`.
for sequence in tqdm(flat_training_data):
    sequence.append(128001)

def pad_list(lst, max_length=64, pad_value=128002):
    """Return a copy of ``lst`` that is right-padded or truncated to ``max_length``.

    Args:
        lst: List to normalise.
        max_length: Target length of the result.
        pad_value: Value appended when ``lst`` is shorter than ``max_length``.

    Returns:
        A new list: ``lst`` followed by ``pad_value`` entries when it is too
        short, otherwise the first ``max_length`` entries of ``lst``.
    """
    missing = max_length - len(lst)
    if missing > 0:
        return lst + [pad_value] * missing
    return lst[:max_length]

# Pad (or truncate) every sequence in a joblib worker pool.
# The target length is passed explicitly from `max_token` so that changing the
# limit at the top of this cell also changes the padding length; previously the
# call silently relied on pad_list's default of 64.
padded_data = Parallel(n_jobs=-1)(
    delayed(pad_list)(lst, max_token) for lst in tqdm(flat_training_data)
)

# Persist the fixed-length sequences for the tensor-conversion cell.
with open("llama3_wiki_64.json", 'w') as file:
    json.dump(padded_data, file)

# Convert into pytorch tensor and store it.

In [None]:
# Reload the padded token-id rows written by the padding cell.
with open("llama3_wiki_64.json", mode='r') as in_file:
    padded_data = json.load(in_file)

# Build a single tensor from the nested lists and serialise it to disk.
tensor = torch.tensor(padded_data)
torch.save(tensor, "llama3_wiki_64.pt")

# Load token embedding and store it.

In [None]:
# Checkpoint whose token-embedding matrix is exported by this cell.
# NOTE(review): the pad id below (128002) and the "llama3" naming used elsewhere in
# this notebook do not look consistent with a Llama-2 checkpoint — confirm the
# intended model.
model_id = "NousResearch/Llama-2-7b-hf"

# 4-bit NF4 quantization settings (double quantization, bfloat16 compute) for loading.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="right")
tokenizer.pad_token_id = 128002

# Grab the raw weight tensor of the input embedding layer.
word_embeddings_tensor = model.model.embed_tokens.weight.data

# The model object is not used again in this cell; drop the reference to it.
del model

# Record vocabulary size and embedding width, and mark the tensor as not requiring gradients.
num_embeddings, embedding_dim = word_embeddings_tensor.shape
word_embeddings_tensor.requires_grad = False

torch.save(word_embeddings_tensor, 'word_embeddings_tensor.pt')