In [1]:
%cd /content/drive/MyDrive/Research/dataset/hf_dataset

/content/drive/MyDrive/Research/dataset/hf_dataset


In [None]:
!ls -l --block-size=G *.csv *.parquet

-rw------- 1 root root 14G Dec 18 12:23 clean_date_categories.csv
-rw------- 1 root root 26G Dec 18 14:40 IRIISNEPAL_Nepali_Text_Corpus.csv
-rw------- 1 root root  3G Dec 29 08:58 test_split.csv
-rw------- 1 root root  2G Dec 29 09:12 test_split.parquet
-rw------- 1 root root 30G Dec 22 05:25 tokenized_data.csv
-rw------- 1 root root  8G Dec 29 05:18 tokenized_data.parquet
-rw------- 1 root root  2G Dec 29 06:52 tokenized_data_train.parquet
-rw------- 1 root root 27G Dec 29 08:58 train_split.csv
-rw------- 1 root root 15G Dec 29 09:09 train_split.parquet


## Tokenized_data.csv -> train.parquet, test.parquet


* 90% train data
* 10% test data

### Reference:
* [querying parquet](https://duckdb.org/2021/06/25/querying-parquet.html)
* [csv to parquet](https://medium.com/@mariusz_kujawski/converting-csv-files-to-parquet-with-polars-pandas-dask-and-dackdb-52a77378349d)

In [12]:
# total rows in tokenized_data.parquet

import duckdb

print(duckdb.query('''
   SELECT count(*)
   FROM 'tokenized_data.parquet'
   ''').fetchall())

# Ref. https://duckdb.org/2021/06/25/querying-parquet.html

[(5312289,)]


In [9]:
total_rows = 5312289
train_size = int(total_rows * 0.9)
print(f'train_size: {train_size}')

train_size: 4781060


In [None]:
!rm train.parquet

In [None]:
# Convert the first train rows of CSV to Parquet (using limit)
import duckdb
import time

duck_time = time.time()
con = duckdb.connect(database=':memory:')

con.execute(f"""
 COPY (SELECT * FROM read_csv_auto('tokenized_data.csv', delim='\t', header=True, SAMPLE_SIZE=1000000) LIMIT {train_size})
 TO 'train.parquet'
 (FORMAT PARQUET, CODEC 'SNAPPY');
""")
duck_time = time.time() - duck_time

print(f"Duckdb time: {duck_time:.2f} seconds")


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Duckdb time: 501.94 seconds


In [10]:
# Number of rows in train.parquet
import duckdb

print(duckdb.query('''
   SELECT count(*)
   FROM 'train.parquet'
   ''').fetchall())

# Ref. https://duckdb.org/2021/06/25/querying-parquet.html

[(4781060,)]


In [5]:
# Convert the test rows of CSV to Parquet (using offset)
import duckdb
import time

duck_time = time.time()
con = duckdb.connect(database=':memory:')

con.execute(f"""
 COPY (SELECT * FROM read_csv_auto('tokenized_data.csv', delim='\t', header=True, SAMPLE_SIZE=1000000) OFFSET {train_size})
 TO 'test.parquet'
 (FORMAT PARQUET, CODEC 'SNAPPY');
""")
duck_time = time.time() - duck_time

print(f"Duckdb time: {duck_time:.2f} seconds")


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Duckdb time: 425.19 seconds


In [6]:
# Number of rows in test.parquet
import duckdb

print(duckdb.query('''
   SELECT count(*)
   FROM 'test.parquet'
   ''').fetchall())

# Ref. https://duckdb.org/2021/06/25/querying-parquet.html

[(531229,)]


In [11]:
!ls -l --block-siz=G *.parquet

-rw------- 1 root root  1G Dec 30 04:24 test.parquet
-rw------- 1 root root  2G Dec 29 09:11 test_split.parquet
-rw------- 1 root root  8G Dec 29 05:18 tokenized_data.parquet
-rw------- 1 root root  2G Dec 29 06:52 tokenized_data_train.parquet
-rw------- 1 root root  7G Dec 30 04:17 train.parquet
-rw------- 1 root root 15G Dec 29 08:59 train_split.parquet


In [13]:
# looks good.
# len_rows(train_split.parquet) + len_rows(test_split.parquet) = len_rows(tokenized_data.parquet)
4781060 + 531229 == 5312289

True

## Get sample data

In [14]:
!pip install datasets --quiet
from datasets import load_dataset
data = load_dataset('parquet', data_files="train.parquet", streaming=True)
print(data)
for n,d in enumerate(data['train']):
  print(d)
  if n>=1:
    break

IterableDatasetDict({
    train: IterableDataset({
        features: ['input_ids,target_ids'],
        num_shards: 1
    })
})
{'input_ids,target_ids': '"[239, 552, 875, 904, 630, 2809, 13407, 6327, 3525, 38388, 4834, 283, 15880, 1227, 4385, 6106, 410, 37792, 12186, 170, 251, 630, 3981, 745, 12622, 22082, 6478, 875, 904, 38388, 4834, 283, 15880, 1227, 750, 834, 3345, 14118, 1017, 3656, 26349, 410, 1076, 170, 251, 2162, 800, 343, 630, 2809, 13407, 6327, 3525, 38388, 4834, 283, 4743, 2852, 5708, 251, 630, 2809, 43214, 6286, 13407, 6327, 3525, 258, 38388, 4834, 283, 6131, 675, 966, 675, 19002, 937, 15880, 5949, 952, 38388, 18818, 4743, 1962, 597, 684, 251, 1489, 9991, 207, 8484, 935, 479, 11787, 18767, 3862, 6131, 675, 966, 675, 19002, 937, 411, 92, 13623, 7945, 750, 14970, 43214, 2186, 3345, 1076, 170, 251, 25266, 328, 3110, 2022, 10835, 1366, 2549, 10090, 4719, 2036, 339, 251, 834, 7823, 750, 17438, 13763, 410, 728, 251, 5175, 20802, 201, 283, 5288, 1188, 43214, 2186, 5396, 487, 271, 35

In [16]:
!pip install datasets --quiet
from datasets import load_dataset
data = load_dataset('parquet', data_files="test.parquet", streaming=True)
print(data)
for n,d in enumerate(data['train']):
  print(d)
  if n>=1:
    break

IterableDatasetDict({
    train: IterableDataset({
        features: ['input_ids,target_ids'],
        num_shards: 1
    })
})
{'input_ids,target_ids': '"[1169, 170, 552, 168, 802, 251, 530, 2549, 3085, 718, 1327, 1257, 7061, 7050, 3318, 797, 1464, 6023, 552, 168, 802, 251, 5175, 6267, 9348, 9045, 1144, 9757, 3055, 152, 3701, 5442, 9056, 8485, 552, 168, 802, 251, 3144, 1839, 414, 1722, 5308, 9876, 3929, 9652, 1469, 1270, 2477, 3976, 552, 168, 802, 251, 630, 2025, 1882, 2675, 2922, 1918, 1794, 2211, 689, 10562, 11024, 552, 168, 802, 251, 1025, 258, 1270, 287, 688, 1327, 1257, 3489, 3561, 896, 1831, 5260, 552, 802, 205, 251, 2731, 2011, 1915, 11060, 7991, 805, 2301, 923, 21297, 251, 3902, 552, 802, 205, 251, 10119, 7906, 9536, 674, 805, 18882, 18689, 8668, 13047, 13659, 152, 552, 802, 205, 251, 718, 1327, 1257, 1554, 1742, 2459, 805, 2301, 923, 21297, 251, 2301, 552, 802, 205, 251, 241, 248, 434, 248, 270, 801, 3021, 2044, 755, 193, 283, 2477, 7456, 4714, 12949, 8481, 2512, 41593, 7912, 

In [15]:
print('hi...')

hi...


## **Push to hub**
`train.parquet` -> `pre_tokenized/iriisnepal_u_nepberta_train_512.parquet`
`test.parquet` -> `pre_tokenized/iriisnepal_u_nepberta_test_512.parquet`

In [18]:
from google.colab import userdata

# put huggingface key in secrets as `HF_TOKEN`
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj="train.parquet", # tokenized_data.csv
    path_in_repo="pre_tokenized/iriisnepal_u_nepberta_train_512.parquet", # iriisnepal_u_nepberta_512.csv
    repo_id="Aananda-giri/nepali_llm_datasets",
    repo_type="dataset",
    token=userdata.get('HF_TOKEN')
)

api.upload_file(
    path_or_fileobj="test.parquet", # tokenized_data.csv
    path_in_repo="pre_tokenized/iriisnepal_u_nepberta_test_512.parquet", # iriisnepal_u_nepberta_512.csv
    repo_id="Aananda-giri/nepali_llm_datasets",
    repo_type="dataset",
    token=userdata.get('HF_TOKEN')
)

train.parquet:   0%|          | 0.00/7.48G [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/846M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Aananda-giri/nepali_llm_datasets/commit/344703cd568921b6341585262040c1e153ae0e83', commit_message='Upload pre_tokenized/iriisnepal_u_nepberta_test_512.parquet with huggingface_hub', commit_description='', oid='344703cd568921b6341585262040c1e153ae0e83', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Aananda-giri/nepali_llm_datasets', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Aananda-giri/nepali_llm_datasets'), pr_revision=None, pr_num=None)

# README.md
 - (add below code to README.md)
 - please refer to [preprocess.ipynb](https://huggingface.co/datasets/Aananda-giri/nepali_llm_datasets/blob/main/preprocess.ipynb) (code from GPT-2 data prepration)

---
- config_name: iriisnepal_u_nepberta_512
  data_files:
  - split: train
    path:
      - "iriisnepal_u_nepberta_512.csv"
\---


## Load from hub

In [19]:
!pip install datasets --quiet

### (error) direct loading

In [20]:
# it loads entire dataset first
from datasets import load_dataset

# use streaming=True to avoid downloading entire dataset
data = load_dataset("Aananda-giri/nepali_llm_datasets", name="iriisnepal_u_nepberta_512", streaming=True)
print(data)

# train data
print(f'\n\n TRAIN DATA')
for n,d in enumerate(data['train']):
  print(d)
  if n >= 2:
    break

# Test data
print(f'\n\n TEST DATA')
for n,d in enumerate(data['train']):
  print(d)
  if n >= 2:
    break

README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/184 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/46 [00:00<?, ?it/s]

IterableDatasetDict({
    train: IterableDataset({
        features: Unknown,
        num_shards: 1
    })
    test: IterableDataset({
        features: Unknown,
        num_shards: 1
    })
})


 TRAIN DATA


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd2 in position 7: invalid continuation byte

### loading from file urls

In [2]:
!pip install datasets --quiet
from datasets import load_dataset

In [4]:
'''
direct loading gives error. lets load from file url (it downloads the ~8GB dataset first)
'''
base_url = "https://huggingface.co/datasets/Aananda-giri/nepali_llm_datasets/resolve/main/pre_tokenized/"
data_files = {"train": base_url + "iriisnepal_u_nepberta_train_512.parquet", "test":base_url + "iriisnepal_u_nepberta_test_512.parquet"}
dataset = load_dataset("parquet", data_files=data_files, cache_dir='hf_cache', streaming=True)

print(dataset)

# train data
print(f'\n\n TRAIN DATA')
for n,d in enumerate(dataset['train']):
  print(d)
  if n >= 2:
    break

# Test data
print(f'\n\n TEST DATA')
for n,d in enumerate(dataset['train']):
  print(d)
  if n >= 2:

    break

IterableDatasetDict({
    train: IterableDataset({
        features: ['input_ids,target_ids'],
        num_shards: 1
    })
    test: IterableDataset({
        features: ['input_ids,target_ids'],
        num_shards: 1
    })
})


 TRAIN DATA
{'input_ids,target_ids': '"[239, 552, 875, 904, 630, 2809, 13407, 6327, 3525, 38388, 4834, 283, 15880, 1227, 4385, 6106, 410, 37792, 12186, 170, 251, 630, 3981, 745, 12622, 22082, 6478, 875, 904, 38388, 4834, 283, 15880, 1227, 750, 834, 3345, 14118, 1017, 3656, 26349, 410, 1076, 170, 251, 2162, 800, 343, 630, 2809, 13407, 6327, 3525, 38388, 4834, 283, 4743, 2852, 5708, 251, 630, 2809, 43214, 6286, 13407, 6327, 3525, 258, 38388, 4834, 283, 6131, 675, 966, 675, 19002, 937, 15880, 5949, 952, 38388, 18818, 4743, 1962, 597, 684, 251, 1489, 9991, 207, 8484, 935, 479, 11787, 18767, 3862, 6131, 675, 966, 675, 19002, 937, 411, 92, 13623, 7945, 750, 14970, 43214, 2186, 3345, 1076, 170, 251, 25266, 328, 3110, 2022, 10835, 1366, 2549, 10090, 4719, 2036, 339, 2

In [9]:
import json
splitted_data_item = d['input_ids,target_ids'].split("\",")
input_ids = json.loads(splitted_data_item[0].replace('\"',''))
print(f'input_ids: {type(input_ids)} {input_ids}')
target_ids = json.loads(splitted_data_item[1].replace('\"',''))
print(f'target_ids: {type(target_ids)} {target_ids}')

input_ids: <class 'list'> [36959, 10552, 2833, 17668, 200, 251, 777, 5794, 308, 601, 8298, 7282, 7282, 287, 393, 1661, 11549, 5379, 280, 36133, 6538, 251, 1415, 160, 26299, 557, 34102, 1410, 27063, 9343, 160, 241, 248, 434, 248, 270, 22744, 414, 936, 16638, 24504, 21802, 979, 393, 1381, 11509, 8905, 18808, 11098, 25150, 1013, 547, 170, 251, 7573, 851, 9945, 1165, 11290, 14455, 164, 285, 22744, 414, 936, 1380, 9568, 979, 393, 24504, 8905, 18808, 11098, 14320, 283, 1013, 382, 170, 251, 1250, 23924, 2272, 25160, 390, 850, 1381, 11509, 48838, 3892, 6834, 2354, 3401, 177, 419, 3797, 152, 11460, 1716, 1084, 24504, 7624, 147, 419, 3797, 998, 170, 251, 24504, 8905, 283, 15998, 639, 285, 24504, 6303, 9965, 702, 152, 12964, 1716, 1084, 24504, 30445, 419, 343, 9965, 8671, 479, 9499, 666, 19033, 2219, 1247, 6569, 6100, 611, 251, 873, 24504, 21020, 15998, 872, 739, 24504, 7227, 343, 419, 6203, 152, 673, 25160, 280, 393, 20181, 15635, 25477, 895, 1619, 1144, 702, 251, 31018, 8188, 367, 328, 8905, 28

In [16]:
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

# This file collects all the relevant code that we covered thus far
# throughout Chapters 2-4.
# This file can be run as a standalone script.

import json

# modified from `import tiktoken`
from transformers import PreTrainedTokenizerFast


import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

# modified. added for create_dataloader_v2
from datasets import load_dataset


#####################################
# Chapter 2
#####################################

def create_dataloader_v3(shuffle=True, drop_last=True, num_workers=0):
    '''
    modified.
    * parameter: text removed
    * parameters: max_length and stride removed : they were set during preparing tokenized_datasets
    * parameter: context_length removed (as dataset is pre-tokenized)
    '''
    # Download the whole dataset
    base_url = "https://huggingface.co/datasets/Aananda-giri/nepali_llm_datasets/resolve/main/pre_tokenized/"
    # data_files = {"train": base_url + "nepberta_" + str(context_length) + ".parquet"}
    data_files = {
        "train": base_url + "iriisnepal_u_nepberta_train_512.parquet",
        "test": base_url + "iriisnepal_u_nepberta_test_512.parquet"
        }
    dataset = load_dataset("parquet", data_files=data_files, cache_dir='hf_cache', streaming=True)

    print(dataset)

    # and split it later
    # dataset = dataset.train_test_split(train_size=train_ratio, seed=42)
    # Convert Hugging Face Dataset to PyTorch tensors (we can directly use the dataset as it is already in the correct format)
    # dataset.set_format(type="torch", columns=['input_ids,target_ids'])  # Directly set columns to torch tensors



    # Define the custom collate_fn function
    def collate_fn(batch):
        # Extract the 'input_ids' and 'target_ids' from the batch and return them as a list of tensors
        input_ids = []
        target_ids = []
        for data_item in batch:
            splitted_data_item = data_item['input_ids,target_ids'].split("\",")
            input_ids.append(torch.tensor(json.loads(splitted_data_item[0].replace('\"',''))))
            # print(f'input_ids: {type(input_ids)} {input_ids}')
            target_ids.append(torch.tensor(json.loads(splitted_data_item[1].replace('\"',''))))

        # Convert to tensors (if not already)
        input_ids_tensor = torch.stack(input_ids)
        target_ids_tensor = torch.stack(target_ids)

        return [input_ids_tensor, target_ids_tensor]


    # Creating the DataLoader for the 'train' split of the dataset with the custom collate_fn
    train_loader = DataLoader(
        dataset['train'],
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
        collate_fn=collate_fn
    )

    val_loader =  DataLoader(
        dataset['test'],
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
        collate_fn=collate_fn
    )

    return train_loader, val_loader


train_loader, val_loader = create_dataloader_v3(shuffle=False, drop_last=True, num_workers=0)

IterableDatasetDict({
    train: IterableDataset({
        features: ['input_ids,target_ids'],
        num_shards: 1
    })
    test: IterableDataset({
        features: ['input_ids,target_ids'],
        num_shards: 1
    })
})


In [34]:
for input_batch, target_batch in train_loader:
  print(f'input_batch: len.{len(input_batch)}len[0]:{len(input_batch[0])}{type(input_batch)} {input_batch} \n\ntarget_batch: len.{len(target_batch)}len[0]:{type(target_batch)} {target_batch}')
  break

input_batch: len.1len[0]:512<class 'torch.Tensor'> tensor([[  239,   552,   875,   904,   630,  2809, 13407,  6327,  3525, 38388,
          4834,   283, 15880,  1227,  4385,  6106,   410, 37792, 12186,   170,
           251,   630,  3981,   745, 12622, 22082,  6478,   875,   904, 38388,
          4834,   283, 15880,  1227,   750,   834,  3345, 14118,  1017,  3656,
         26349,   410,  1076,   170,   251,  2162,   800,   343,   630,  2809,
         13407,  6327,  3525, 38388,  4834,   283,  4743,  2852,  5708,   251,
           630,  2809, 43214,  6286, 13407,  6327,  3525,   258, 38388,  4834,
           283,  6131,   675,   966,   675, 19002,   937, 15880,  5949,   952,
         38388, 18818,  4743,  1962,   597,   684,   251,  1489,  9991,   207,
          8484,   935,   479, 11787, 18767,  3862,  6131,   675,   966,   675,
         19002,   937,   411,    92, 13623,  7945,   750, 14970, 43214,  2186,
          3345,  1076,   170,   251, 25266,   328,  3110,  2022, 10835,  1366,
 

In [24]:
print(type(data), len(data))

print(type(data[0]), len(data[0]))

<class 'list'> 2
<class 'torch.Tensor'> 1
