In [None]:
# Work from the dataset folder so the relative file names used below resolve.
# NOTE(review): the path looks like a Colab Google Drive mount — confirm Drive is mounted before running.
%cd /content/drive/MyDrive/Research/dataset/hf_dataset

/content/drive/MyDrive/Research/dataset/hf_dataset


In [None]:
# List the contents of the current working directory.
!ls

'0. download_data.ipynb'     clean_date_categories.csv		 iriis_text.txt      tokenize
'1. train_tokenizer.ipynb'   clean_nepberta_data.zip		 NepaliBPE	     vocab_old.json
'3. tokenize_data.ipynb'     IRIISNEPAL_Nepali_Text_Corpus.csv	 nepberta_text.txt   vocab_old.text


## 0. pseudocode.
```
def tokens_generator():
  * for line in lines of text file
    * tokens=tokenize(line)
    * yield each token in tokens, one at a time

def collector(max_len):
  repeat until the generator stops giving tokens:
    while len(collected_tokens) < max_len+1:
      * collected_tokens.append(next token from tokens_generator)
    * get input_tokens, target_tokens and append them to the csv
    * collected_tokens = collected_tokens[stride:] # preserve the tail of the previous data_item for the next data_item
```

## 1. Testing pre-tokenization

In [None]:
class TestTokenizer():
  """Tiny whitespace tokenizer over a fixed number-word vocabulary.

  Used to sanity-check the chunking logic with small, human-readable ids
  instead of loading a real tokenizer.
  """

  def __init__(self):
    # Each number word maps to the integer it names.
    self.vocab = {
        'zero': 0,
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
        'six': 6,
        'seven': 7,
        'eight': 8,
        'nine': 9,
        'ten': 10,
        'eleven': 11,
        'twelve': 12,
        'thirteen': 13,
        'fourteen': 14,
        'fifteen':15
    }

  def encode(self, text, allowed_special=None):
    """Return the vocabulary id of every whitespace-separated word in `text`.

    Args:
      text: string of words taken from `self.vocab`.
      allowed_special: accepted so callers may pass it, but ignored here.

    Returns:
      List of integer ids, one per word (empty list for empty text).

    Raises:
      KeyError: if a word is not in the vocabulary.
    """
    return [self.vocab[word] for word in text.split()]

  def decode(self, tokens):
    """Return the space-joined words for an iterable of vocabulary ids.

    Raises:
      ValueError: if an id is not in the vocabulary.
    """
    # Build the reverse lookup once per call instead of rebuilding the key and
    # value lists for every token. setdefault keeps the first word for an id,
    # matching what list.index() would have returned.
    id_to_word = {}
    for word, token_id in self.vocab.items():
      id_to_word.setdefault(token_id, word)

    try:
      return ' '.join(id_to_word[token] for token in tokens)
    except KeyError as err:
      # Keep ValueError as the error type for unknown ids.
      raise ValueError(f'{err.args[0]!r} is not in vocab') from None

tokenizer = TestTokenizer()
# Only the last expression of a cell is displayed, so the encode result would
# otherwise be computed and silently discarded; check it explicitly instead.
assert tokenizer.encode('one two three') == [1, 2, 3]
tokenizer.decode([1, 2, 3])

'one two three'

In [None]:
import csv
import os
import time
import torch
# from transformers import GPT2Tokenizer
from transformers import PreTrainedTokenizerFast

class LargeFileTokenizer:
    """Streams a text file through a tokenizer and appends fixed-length
    (input, target) chunks to a CSV file.

    Tokens from consecutive lines are concatenated into one stream. Each CSV
    row holds `max_length` input ids and the same window shifted by one token
    as target ids; consecutive windows start `stride` tokens apart. Tokens
    left over at the end of the file that cannot fill a whole window are
    dropped, and nothing is carried over between input files.
    """

    # Print a progress line every this many chunks written.
    PROGRESS_EVERY = 50000

    def __init__(self, tokenizer, max_length, stride, input_file, output_file):
        """Store the configuration and announce the input file.

        Args:
            tokenizer: object whose `encode(text, allowed_special=...)`
                returns an iterable of token ids.
            max_length: number of tokens per input/target chunk (>= 1).
            stride: number of tokens the window advances per chunk (>= 1).
            input_file: path of the UTF-8 text file to tokenize.
            output_file: path of the CSV file that rows are appended to.

        Raises:
            ValueError: if `max_length` or `stride` is smaller than 1. A
                non-positive stride can keep the window from ever advancing,
                so the same chunk would be written forever.
        """
        if max_length < 1:
            raise ValueError(f'max_length must be >= 1, got {max_length}')
        if stride < 1:
            raise ValueError(f'stride must be >= 1, got {stride}')

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride
        self.input_file = input_file
        self.output_file = output_file

        print(f'file: {self.input_file}')

    def tokens_generator(self):
        """Yield token ids one at a time, line by line, from the input file."""
        with open(self.input_file, 'r', encoding='utf-8') as file:
            for line in file:
                # Tokenize the stripped line and allow the end-of-text special token.
                yield from self.tokenizer.encode(line.strip(), allowed_special={'<|endoftext|>'})

    def collect_and_save(self):
        """Chunk the token stream and append the chunks to the CSV file.

        The header row is written when the output file is missing or empty;
        otherwise rows are appended after the existing content.
        """
        output_exists = os.path.exists(self.output_file)
        # An existing but empty file still needs the header row.
        needs_header = not output_exists or os.path.getsize(self.output_file) == 0

        with open(self.output_file, 'a', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)
            if needs_header:
                csvwriter.writerow(['input_ids', 'target_ids'])
            if output_exists:
                print(f'Output file exists: {self.output_file}')
            else:
                print(f'Output file: creating new: {self.output_file}')

            window = []
            count = 0
            for token in self.tokens_generator():
                window.append(token)
                # One extra token is needed so the target can be shifted by one.
                if len(window) <= self.max_length:
                    continue

                input_tokens = window[:self.max_length]
                target_tokens = window[1:self.max_length + 1]
                csvwriter.writerow([input_tokens, target_tokens])

                # Keep the tail of this window as the start of the next one.
                window = window[self.stride:]

                count += 1
                if count % self.PROGRESS_EVERY == 0:
                    print(f'count:{count}')

            print("End of file reached.")

# Example usage
if __name__ == '__main__':
    # Smoke test of the chunking logic on two tiny files with the toy tokenizer.
    # (The full-corpus run with the real tokenizer is the cell in section 2.)
    with open('file1.txt', 'w', encoding='utf-8') as f:
        f.write('zero one two three four \n five six seven')
    with open('file2.txt', 'w', encoding='utf-8') as f:
        f.write('eight nine ten eleven twelve')
    tokenizer = TestTokenizer()

    # File paths
    input_files = ['file1.txt', 'file2.txt']
    output_file = 'tokenized_data_test.csv'

    # collect_and_save() appends to the output file, so remove the CSV left by
    # a previous run; otherwise re-running this cell duplicates every row.
    if os.path.exists(output_file):
        os.remove(output_file)

    # Tokenizer parameters
    max_length = 3
    stride = 2  # each chunk starts 2 tokens after the previous one

    start_time = time.time()
    for input_file in input_files:
        # Tokenize and save to CSV
        tokenizer_obj = LargeFileTokenizer(tokenizer, max_length, stride, input_file, output_file)
        tokenizer_obj.collect_and_save()
    # Elapsed seconds / 60 is minutes.
    print(f'time taken:{(time.time()-start_time)/60} minutes')
    print('done!')

    # Expected output of <!cat tokenized_data_test.csv>:
    #   input_ids,target_ids
    #   "[0, 1, 2]","[1, 2, 3]"
    #   "[2, 3, 4]","[3, 4, 5]"
    #   "[4, 5, 6]","[5, 6, 7]"
    #   "[8, 9, 10]","[9, 10, 11]"
    #
    # 'five six seven' is on a new line of file1.txt, yet the chunks run across
    # the line break without any visible difference (which is good).
    # file2.txt is a new file, so the leftover tokens of file1.txt are not
    # carried over: its first chunk starts at 8.

file: file1.txt
Output file: creating new: tokenized_data_test.csv
End of file reached.
file: file2.txt
Output file exists: tokenized_data_test.csv
End of file reached.
time taken:0.0003130078315734863 hours
done!


## 2. Tokenize the actual data with the trained tokenizer

In [None]:
!rm tokenized_data.csv

rm: cannot remove 'tokenized_data.csv': No such file or directory


In [8]:
import csv
import os
import time
import torch
# from transformers import GPT2Tokenizer
from transformers import PreTrainedTokenizerFast

class LargeFileTokenizer:
    """Streams a text file through a tokenizer and appends fixed-length
    (input, target) chunks to a CSV file.

    Tokens from consecutive lines are concatenated into one stream. Each CSV
    row holds `max_length` input ids and the same window shifted by one token
    as target ids; consecutive windows start `stride` tokens apart. Tokens
    left over at the end of the file that cannot fill a whole window are
    dropped, and nothing is carried over between input files.
    """

    # Print a progress line every this many chunks written.
    PROGRESS_EVERY = 50000

    def __init__(self, tokenizer, max_length, stride, input_file, output_file):
        """Store the configuration and announce the input file.

        Args:
            tokenizer: object whose `encode(text)` returns an iterable of
                token ids.
            max_length: number of tokens per input/target chunk (>= 1).
            stride: number of tokens the window advances per chunk (>= 1).
            input_file: path of the UTF-8 text file to tokenize.
            output_file: path of the CSV file that rows are appended to.

        Raises:
            ValueError: if `max_length` or `stride` is smaller than 1. A
                non-positive stride can keep the window from ever advancing,
                so the same chunk would be written forever.
        """
        if max_length < 1:
            raise ValueError(f'max_length must be >= 1, got {max_length}')
        if stride < 1:
            raise ValueError(f'stride must be >= 1, got {stride}')

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride
        self.input_file = input_file
        self.output_file = output_file

        print(f'file: {self.input_file}')

    def tokens_generator(self):
        """Yield token ids one at a time, line by line, from the input file."""
        with open(self.input_file, 'r', encoding='utf-8') as file:
            for line in file:
                # Tokenize the stripped line with the tokenizer's default settings.
                yield from self.tokenizer.encode(line.strip())

    def collect_and_save(self):
        """Chunk the token stream and append the chunks to the CSV file.

        The header row is written when the output file is missing or empty;
        otherwise rows are appended after the existing content.
        """
        output_exists = os.path.exists(self.output_file)
        # An existing but empty file still needs the header row.
        needs_header = not output_exists or os.path.getsize(self.output_file) == 0

        with open(self.output_file, 'a', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)
            if needs_header:
                csvwriter.writerow(['input_ids', 'target_ids'])
            if output_exists:
                print(f'Output file exists: {self.output_file}')
            else:
                print(f'Output file: creating new: {self.output_file}')

            window = []
            count = 0
            for token in self.tokens_generator():
                window.append(token)
                # One extra token is needed so the target can be shifted by one.
                if len(window) <= self.max_length:
                    continue

                input_tokens = window[:self.max_length]
                target_tokens = window[1:self.max_length + 1]
                csvwriter.writerow([input_tokens, target_tokens])

                # Keep the tail of this window as the start of the next one.
                window = window[self.stride:]

                count += 1
                if count % self.PROGRESS_EVERY == 0:
                    print(f'count:{count}')

            print("End of file reached.")
# Example usage
if __name__ == '__main__':

    # Initialize tokenizer
    # tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # # Save as tokenizer.json
    # tokenizer.save("NepaliBPE/tokenizer.json")
    # Load the tokenizer from its saved tokenizer.json file
    tokenizer = PreTrainedTokenizerFast(tokenizer_file="NepaliBPE/tokenizer.json")
    # tokenizer.encode('टिकटक वनाउने क्रममा तेह्रथुमको मेन्छ्यायेम गाउँपालिकाको खोरुङगा खोलाको खोंचमा खसेर शिक्षिका र एक छात्राको शनिबार अपराह्न मृत्यु भएको छ ।')

    # File paths: both corpora are appended to the same output CSV
    input_files = ['nepberta_text.txt', 'iriis_text.txt']
    output_file = 'tokenized_data.csv'

    # Tokenizer parameters
    max_length = 512
    stride = 384  # int(max_length*.75) = 384 (using stride as 75% of context length)

    start_time = time.time()
    for input_file in input_files:
        # Tokenize and save to CSV
        tokenizer_obj = LargeFileTokenizer(tokenizer, max_length, stride, input_file, output_file)
        tokenizer_obj.collect_and_save()
    # Elapsed seconds / 60 is minutes.
    print(f'time taken:{(time.time()-start_time)/60} minutes')
    print('done!')

file: nepberta_text.txt
Output file: creating new: tokenized_data.csv
count:50000
count:100000
count:150000
count:200000
count:250000
count:300000
count:350000
count:400000
count:450000
count:500000
End of file reached.
file: iriis_text.txt
Output file exists: tokenized_data.csv
count:50000
count:100000
count:150000
count:200000
count:250000
count:300000
count:350000
count:400000
count:450000
count:500000
count:550000
count:600000
count:650000
count:700000
count:750000
count:800000
count:850000
count:900000
count:950000
count:1000000
count:1050000
count:1100000
count:1150000
count:1200000
count:1250000
count:1300000
count:1350000
count:1400000
count:1450000
count:1500000
count:1550000
count:1600000
count:1650000
count:1700000
count:1750000
count:1800000
count:1850000
count:1900000
count:1950000
count:2000000
count:2050000
count:2100000
count:2150000
count:2200000
count:2250000
count:2300000
count:2350000
count:2400000
count:2450000
count:2500000
count:2550000
count:2600000
count:265000