In [None]:
!pip install datasets
!pip install transformers
!pip install tqdm
!pip install nltk
!pip install bitsandbytes
!pip install accelerate

In [None]:
from datasets import load_dataset
from tqdm import tqdm
import json
import nltk
from transformers import AutoTokenizer
from joblib import Parallel, delayed
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import ijson

# Fetch the NLTK resources; nltk.sent_tokenize is used later to split
# articles into sentences.
nltk.download('punkt')
nltk.download('punkt_tab')

# Hugging Face repository id of the model whose tokenizer is used throughout.
model_id = "NousResearch/Llama-2-7b-hf"

# The keyword is `use_fast`; the previous spelling `user_fast` is not a
# recognised option, so the request for the fast tokenizer was never applied.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="right", use_fast=True)

# Load dataset and store it.

In [None]:
# Load the "20220301.en" configuration of the "wikipedia" dataset and keep
# only the "text" column of its train split.
dataset = load_dataset("wikipedia", "20220301.en", trust_remote_code=True)
train_split = dataset["train"]
training_data = train_split["text"]

# Persist the texts to disk; the sentence-splitting cell reloads this file.
with open("training_data.json", 'w') as out_file:
    json.dump(training_data, out_file, indent=4)

# Split into sentences.

In [None]:
def split_into_sentences(text: str) -> list[str]:
    """Split one document into sentences using NLTK's sentence tokenizer.

    Args:
        text: The full text of a single document.

    Returns:
        The sentences of ``text`` in the order NLTK produced them.
    """
    sentences = nltk.sent_tokenize(text)
    return sentences


# Reload the document texts written by the dataset cell.
with open("training_data.json", 'r') as in_file:
    training_data = json.load(in_file)

# Split every document in worker processes (n_jobs=-1); the result holds one
# list of sentences per document.
progress = tqdm(training_data, desc="Processing")
sentence_jobs = (delayed(split_into_sentences)(webpage) for webpage in progress)
training_data = Parallel(n_jobs=-1)(sentence_jobs)

# Persist the per-document sentence lists for the tokenization cell.
with open("sentence_training_data.json", 'w') as out_file:
    json.dump(training_data, out_file, indent=4)

# Tokenize the training data.

In [None]:
# Reload the per-document sentence lists written by the sentence-splitting cell.
with open("sentence_training_data.json", 'r') as file:
    training_data = json.load(file)


def tokenize_function(sentences: list[str]) -> list[list[int]]:
    """Tokenize every sentence of one document.

    The input list is left untouched; a new list is returned. (The previous
    version overwrote the caller's list in place, turning a list of strings
    into a list of token-id lists.)

    Args:
        sentences: The sentences of a single document.

    Returns:
        One list of token ids per sentence, in the same order. An empty
        input yields an empty list.
    """
    return [tokenizer(sentence)["input_ids"] for sentence in sentences]


# The tokenization process cannot be executed in parallel.
# Each document is replaced by its tokenized form as soon as it is processed,
# so the strings and the token ids are never both held for the whole corpus.
for i in tqdm(range(len(training_data))):
    training_data[i] = tokenize_function(training_data[i])

# Written without indentation: with indent=4 every single token id would be
# placed on its own indented line, which inflates the file considerably while
# the parsed JSON content stays exactly the same.
with open("tokenized_training_data.json", 'w') as file:
    json.dump(training_data, file)

# Count how many sentences qualify (token count strictly between `min_token` and `max_token`).

In [None]:
# A sentence qualifies when its token count lies strictly between these bounds.
max_token = 63
min_token = 10

# Iterate over the file with ijson, one top-level array element ("item") at a
# time, and count the qualifying sentences.
with open('tokenized_training_data.json', 'r') as token_file:
    counter = sum(
        1
        for document in ijson.items(token_file, 'item')
        for token_ids in document
        if min_token < len(token_ids) < max_token
    )

print(counter)

# Store the tokenized data in a tensor.

In [None]:
# Hard-coded number of rows — presumably the value printed by the counting
# cell; TODO confirm it still matches the data before running.
num_rows = 117_979_663
# Number of token ids stored per row.
row_width = 64
shape = (num_rows, row_width)

# torch.empty only reserves the memory; its contents are uninitialised until
# every row has been written.
tokenized_tensor = torch.empty(shape, dtype=torch.int16)

In [None]:
# A sentence qualifies when its token count lies strictly between these bounds.
max_token = 63
min_token = 10
counter = 0

# Width of one tensor row and the id used to fill the unused tail of a row.
seq_len = 64
pad_id = 32000

with open('tokenized_training_data.json', 'r') as file:
    for item in ijson.items(file, 'item'):
        for tokenized_sentence in item:
            if not (min_token < len(tokenized_sentence) < max_token):
                continue
            # Terminate the sentence with EOS, then pad on the right up to
            # seq_len. A qualifying sentence has at most max_token - 1 tokens,
            # so the row always fits.
            row = tokenized_sentence + [tokenizer.eos_token_id]
            row += [pad_id] * (seq_len - len(row))
            tokenized_tensor[counter] = torch.tensor(row, dtype=torch.int16)
            counter += 1

# The tensor was pre-allocated with torch.empty, so any row that was not
# written still holds uninitialised memory. Fail loudly instead of saving it.
assert counter == tokenized_tensor.shape[0], (
    f"wrote {counter} rows but the tensor has {tokenized_tensor.shape[0]} rows"
)

In [None]:
# Write the filled token tensor to disk.
torch.save(obj=tokenized_tensor, f="llama2_wiki_64.pt")

# Load token embedding and store it.

In [None]:
model_id = "NousResearch/Llama-2-7b-hf"

# Quantisation settings passed to from_pretrained: 4-bit NF4 with double
# quantisation and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="right")
# Use the same padding id (32000) that this notebook writes into the token
# tensor. The previous value, 128002, did not match it and looks like a
# leftover from a Llama-3 variant of this notebook.
# NOTE(review): confirm that id 32000 maps to a pad token in this tokenizer.
tokenizer.pad_token_id = 32000

# Access the embedding matrix
word_embeddings_tensor = model.model.embed_tokens.weight.data

# Drop the reference to the Llama-2 model because it is no longer used;
# word_embeddings_tensor remains valid on its own.
del model

# Store vocabulary size and embedding dimension
num_embeddings, embedding_dim = word_embeddings_tensor.shape
# The embeddings are exported as plain data, not as a trainable parameter.
word_embeddings_tensor.requires_grad = False

torch.save(word_embeddings_tensor, 'word_embeddings_tensor_llama2.pt')