# Creating dataset in json format

The dataset is written as a JSON list of sentence pairs, one object per pair: `{"en": "text", "ne": "translated-text"}`.

In [23]:
import os

def get_file_names_without_extension(folder_path):
    """Collect the extension-less names of the regular files in a folder.

    Only direct children of ``folder_path`` are inspected; sub-directories are
    ignored. Files that differ only by extension (e.g. ``corpus.en`` and
    ``corpus.ne``) collapse into a single entry because the result is a set.

    Args:
        folder_path: Directory to scan.

    Returns:
        A set of file names with their final extension removed.
    """
    return {
        os.path.splitext(entry)[0]
        for entry in os.listdir(folder_path)
        # Skip directories (and anything else that is not a regular file)
        if os.path.isfile(os.path.join(folder_path, entry))
    }


# Directory that is scanned for corpus files.
folder_path = "./dataset"  # Replace this with the actual folder path
# Module-level set of base names; main() in a later cell iterates over it.
file_names_set = get_file_names_without_extension(folder_path)

# Print the set of file names without extensions
print(file_names_set)




{'NNC', 'bible', 'NLC', 'PR_improved', 'the guardian', 'gnome_final', 'globalvoices_improved'}


In [24]:
# Display the discovered base names using the notebook's repr of the set.
file_names_set

{'NLC',
 'NNC',
 'PR_improved',
 'bible',
 'globalvoices_improved',
 'gnome_final',
 'the guardian'}

In [33]:
import json

def read_file(file_path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Whitespace at the very start and very end of the file is stripped before
    splitting on newlines, so blank lines at either end are dropped while
    blank lines in the middle are kept as empty strings.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line. An empty or whitespace-only file
        yields an empty list.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read().strip()

    # Splitting an empty string would give [''], i.e. one phantom empty line
    # that later turns into a bogus empty record, so handle that case here.
    return content.split('\n') if content else []

def combine_data(english_data, nepali_data, combined_data=None):
    """Pair English and Nepali lines into ``{"en": ..., "ne": ...}`` records.

    Args:
        english_data: Iterable of English sentences.
        nepali_data: Iterable of Nepali sentences, matched to ``english_data``
            by position.
        combined_data: Optional list to append the new records to. When
            omitted, a fresh list is created on every call, so records never
            leak from one call into the next.

    Returns:
        The list that received the records (``combined_data`` itself when one
        was passed in).
    """
    if combined_data is None:
        combined_data = []

    # zip() stops at the shorter input: if the two sides have different
    # lengths the surplus lines are silently dropped.
    combined_data.extend(
        {"en": eng, "ne": nep} for eng, nep in zip(english_data, nepali_data)
    )
    return combined_data


def write_combined_data(combined_data, output_file):
    """Serialise the records to ``output_file`` as indented UTF-8 JSON.

    Args:
        combined_data: JSON-serialisable object (here: a list of dicts).
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, 'w', encoding='utf-8') as file:
        # ensure_ascii=False keeps non-ASCII text readable instead of
        # escaping every character as \uXXXX.
        json.dump(combined_data, file, ensure_ascii=False, indent=4)


def main(file_names=None, dataset_dir="./dataset", output_file="dataset.json"):
    """Merge every ``<name>.en`` / ``<name>.ne`` pair into one JSON file.

    Args:
        file_names: Iterable of base names (without extension) to process.
            Defaults to the notebook-level ``file_names_set``.
        dataset_dir: Directory containing the ``.en`` and ``.ne`` files.
        output_file: Path of the JSON file to write.

    Raises:
        OSError: If an expected ``.en`` or ``.ne`` file cannot be opened.
    """
    if file_names is None:
        # Fall back to the set built by the file-discovery cell.
        file_names = file_names_set

    # Explicit accumulator: starts empty on every run, so calling main()
    # repeatedly never duplicates records, and an empty input still produces
    # a valid (empty) JSON list.
    combined_data = []

    # Sorted so the record order is reproducible; iteration order of a set of
    # strings is not guaranteed to be the same from one run to the next.
    for file_name in sorted(file_names):
        english_data = read_file(os.path.join(dataset_dir, file_name + ".en"))
        nepali_data = read_file(os.path.join(dataset_dir, file_name + ".ne"))
        combine_data(english_data, nepali_data, combined_data)

    write_combined_data(combined_data, output_file)

# Run the conversion only when this code is executed as the main program,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


In [34]:
def load_json_file(file_path):
    """Parse the UTF-8 JSON file at ``file_path`` and return the decoded object."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

In [35]:
# Read the generated file back in to sanity-check what was written.
data = load_json_file('dataset.json')

In [36]:
# Number of top-level entries loaded from dataset.json.
len(data)

177334

In [37]:
import datasets
from datasets import load_dataset
# Load the JSON file with the Hugging Face `datasets` JSON loader.
# The recorded output shows the result is a DatasetDict with one `train` split.
dataset = load_dataset("json", data_files="dataset.json")

Downloading data files: 100%|██████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3731.59it/s]
Extracting data files: 100%|████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 170.11it/s]
Generating train split: 177334 examples [00:00, 185018.96 examples/s]


In [38]:
# Show the dataset summary (splits, feature names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['en', 'ne'],
        num_rows: 177334
    })
})

In [43]:
# Log in to the Hugging Face Hub via the command-line tool.
# NOTE(review): the recorded output of this cell ends in a traceback right at
# the "Token:" prompt — presumably the prompt cannot read input when launched
# with `!` from a notebook; confirm, and log in from a terminal if so.
!huggingface-cli login


        _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
        _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
        _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
        _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
        _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

        To login, `huggingface_hub` now requires a token generated from https://huggingface.co/settings/tokens .
        
Token: Traceback (most recent call last):
  File "/Users/cohlem/anaconda3/bin/huggingface-cli", line 11, in <module>
    sys.exit(main())
  File "/Users/cohlem/anaconda3/lib/python3.10/site-packages/huggingface_hub/commands/huggingface_cli.py", line 45, in main
    service.run()
  Fil

In [45]:
# Upload the dataset to the Hub repository "CohleM/english-to-nepali".
# NOTE(review): presumably requires a valid Hub login beforehand — confirm.
dataset.push_to_hub("CohleM/english-to-nepali")

Pushing dataset shards to the dataset hub:   0%|                                                                     | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|                                                                          | 0/178 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:  24%|███████████████▍                                                | 43/178 [00:00<00:00, 426.90ba/s][A
Creating parquet from Arrow format: 100%|███████████████████████████████████████████████████████████████| 178/178 [00:00<00:00, 604.36ba/s][A
Pushing dataset shards to the dataset hub: 100%|█████████████████████████████████████████████████████████████| 1/1 [00:26<00:00, 26.06s/it]
Downloading metadata: 100%|█████████████████████████████████████████████████████████████████████████████| 28.0/28.0 [00:00<00:00, 73.7kB/s]
