In [None]:
!pip install tiktoken --quiet
!pip install datasets --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.

In [None]:
# Smoke-test cell: prints a fixed string (presumably only to confirm the runtime is alive).
print("hi")

## **File: `previous_chapters.py`**

**1. tokenizer initialization**

```
# original code
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
```

```
# modified code
from transformers import PreTrainedTokenizerFast

# Load the tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("Aananda-giri/NepaliBPE")
```


**2. tokenizer.encode**

```
# original code
token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
```

```
# modified code
token_ids = tokenizer.encode(txt)
```

**3. tokenizer.decode**
* leave it as it is


In [None]:
# sebastian original tokenizer
import tiktoken

txt = "hello world!"

tokenizer = tiktoken.get_encoding("gpt2")

# Encode: text -> list of integer token ids.
# '<|endoftext|>' is explicitly allowed so it may appear in the input text.
token_ids = tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
print(f'Encoded: {type(token_ids)} {token_ids}')

# Tokenize: decode every id on its own to show the text piece it stands for.
# The ids computed above are reused instead of calling encode() a second time
# without allowed_special, so both views always come from the same encoding.
print([tokenizer.decode([token_id]) for token_id in token_ids])

# Decode: list of ids -> string (decoded once and reused for both type and value).
decoded_text = tokenizer.decode(token_ids)
print(f'Decoded: {type(decoded_text)} {decoded_text}')


Encoded: <class 'list'> [31373, 995, 0]
['hello', ' world', '!']
Decoded: <class 'str'> hello world!


In [None]:
# our tokenizer
from transformers import PreTrainedTokenizerFast

# Fetch the tokenizer published as "Aananda-giri/NepaliBPE"
tokenizer = PreTrainedTokenizerFast.from_pretrained("Aananda-giri/NepaliBPE")

text = "तपाईंलाई कस्तो छ?"

# Encode: text -> token ids
token_ids = tokenizer.encode(text)
encoded_type = type(token_ids)
print(f'Encoded: {encoded_type} {token_ids}')

# Tokenize: text -> token strings
tokens = tokenizer.tokenize(text)
print(f'tokenized: {tokens}')

# Decode: token ids -> text
decoded_text = tokenizer.decode(token_ids)
decoded_type = type(decoded_text)
print(f'Decoded: {decoded_type} {decoded_text}')

Encoded: <class 'list'> [8987, 3199, 186, 1]
tokenized: ['तपाईंलाई</w>', 'कस्तो</w>', 'छ</w>', '<|unk|>']
Decoded: <class 'str'> तपाईंलाई कस्तो छ <|unk|>


## **File: `prepare_dataset.py`**
Use the NepBERTa (Nepali) dataset instead of Project Gutenberg.

**original code:**

* removes non-English text
* combines files from Project Gutenberg and generates 500 MB chunks with file name: \<target_dir\>/combined_{file_counter}.txt

* where `file_counter` starts from 1
* and `target_dir = gutenberg_preprocessed/`


**modified code**
* removes non-Devanagari text
* combines text rows from NepBERTa and generates 500 MB chunks with file name: \<target_dir\>/chunk_{file_counter}.txt

* **upload to huggingface** for ease of use on different notebooks
* access these chunks from here: https://huggingface.co/datasets/Aananda-giri/nepberta-sample


```
# Download `num_chunks_to_save` chunk(s) of about 500 MB each from Hugging Face
# and store every chunk as its own text file inside `target_dir`.
import os
from itertools import islice

from datasets import load_dataset

num_chunks_to_save = 1
target_dir = 'nepberta_sample'

# Load the dataset from the Hugging Face Hub in streaming mode
sampled_dataset_stream = load_dataset("Aananda-giri/nepberta-sample", split="train", streaming=True)

# exist_ok=True: no error when the directory is already present
os.makedirs(target_dir, exist_ok=True)

# Walk the stream ONCE. Calling next(iter(stream)) inside the loop would build a
# fresh iterator on every pass and therefore write the first chunk again and again.
for i, chunk in enumerate(islice(sampled_dataset_stream, num_chunks_to_save), start=1):
    file_path = os.path.join(target_dir, f"combined_{i}.txt")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(chunk['text'])
    # Report the path that was actually written (combined_<i>.txt)
    print(f"Saved chunk {i} to {file_path}")
```

## **File: `pretraining_simple.py`**


**1. tokenizer initialization**

```
# original code
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
```

```
# modified code
from transformers import PreTrainedTokenizerFast

# Load the tokenizer
tokenizer = PreTrainedTokenizerFast.from_pretrained("Aananda-giri/NepaliBPE")
```

**2. Vocab size:**

```
# original code
GPT_CONFIG_124M = {
            "vocab_size": 50257
            ...
```

```
# modified code
GPT_CONFIG_124M = {
            "vocab_size": 50000
            ...
```

# **Run the code**

In [None]:
!pip install datasets --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m471.0/480.6 kB[0m [31m23.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# this location contains files like: prepare_dataset.py, pretraining_simple.py and previous_chapters.py
# NOTE: the path lives under /content/drive/MyDrive; if it does not exist
# (e.g. Google Drive is not mounted -- assumption, confirm), %cd reports
# "[Errno 2] No such file or directory" and the working directory stays unchanged,
# so the script cells below will not find their files.
%cd /content/drive/MyDrive/Research/llm.np/sebastian_gutenberg/

[Errno 2] No such file or directory: '/content/drive/MyDrive/Research/llm.np/sebastian_gutenberg/'
/content


In [None]:
# Download dataset: runs prepare_dataset.py as a separate process.
# It must be started from the directory that contains the script.
!python3 prepare_dataset.py

README.md:   0% 0.00/2.39k [00:00<?, ?B/s]README.md: 100% 2.39k/2.39k [00:00<00:00, 13.5MB/s]
Repo card metadata block was not found. Setting CardData to empty.
Saved chunk 1 to chunk_1.txt


In [None]:
# Launch training with pretraining_bells_n_whistles.py on the text files in
# the nepberta_sample directory, for 5 epochs with batch size 4.
# --output_dir is set to model_checkpoints (presumably where checkpoints are written; confirm in the script).
# source: https://github.com/rasbt/LLMs-from-scratch/blob/main/appendix-D/01_main-chapter-code/appendix-D.ipynb
!python pretraining_bells_n_whistles.py \
  --data_dir "nepberta_sample" \
  --n_epochs 5 \
  --batch_size 4 \
  --output_dir model_checkpoints

Total files: 1
Tokenizing file 1 of 1: nepberta_sample/combined_1.txt
1793
Training ...
Ep 1 (Iter 000000): Train loss 10.966, Val loss 10.977
Ep 1 (Iter 000100): Train loss 8.701, Val loss 8.410
Ep 1 (Iter 000200): Train loss 8.213, Val loss 8.032
Ep 1 (Iter 000300): Train loss 8.104, Val loss 7.724
Ep 1 (Iter 000400): Train loss 8.121, Val loss 7.451
Ep 1 (Iter 000500): Train loss 8.122, Val loss 7.085
Ep 1 (Iter 000600): Train loss 7.244, Val loss 6.794
Ep 1 (Iter 000700): Train loss 7.670, Val loss 6.366
Ep 1 (Iter 000800): Train loss 6.663, Val loss 6.292
Ep 1 (Iter 000900): Train loss 7.567, Val loss 6.315
Ep 1 (Iter 001000): Train loss 7.092, Val loss 5.973
Ep 1 (Iter 001100): Train loss 7.603, Val loss 5.891
Ep 1 (Iter 001200): Train loss 6.382, Val loss 5.798
Ep 1 (Iter 001300): Train loss 6.817, Val loss 5.496
Ep 1 (Iter 001400): Train loss 7.190, Val loss 5.462
Ep 1 (Iter 001500): Train loss 6.995, Val loss 5.476
Ep 1 (Iter 001600): Train loss 6.431, Val loss 5.573
Ep 1 (Ite

In [None]:
# Alternatively: run the simpler version of the training code, with the same
# arguments as the pretraining_bells_n_whistles.py cell.
# source: https://github.com/rasbt/LLMs-from-scratch/tree/main/ch05/03_bonus_pretraining_on_gutenberg
!python pretraining_simple.py \
  --data_dir "nepberta_sample" \
  --n_epochs 5 \
  --batch_size 4 \
  --output_dir model_checkpoints

Total files: 1
Tokenizing file 1 of 1: nepberta_sample/combined_1.txt
Training ...
Ep 1 (Step 0): Train loss 10.700, Val loss 10.543
2024-11-06 15:07:19.711472: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 15:07:19.988826: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 15:07:20.070071: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 15:07:20.482996: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AV

In [None]:
# Optional cleanup, intentionally left commented out: uncommenting the line below
# recursively deletes the model_checkpoints directory (the --output_dir passed to
# the training cells).
# !rm -rf model_checkpoints