In [1]:
import os
import re
from tqdm import tqdm
import json
from datasets import Dataset, DatasetDict

def get_ner_tag(word):
    """Map a word to an integer tag derived purely from its letter casing.

    Returns 1 for title-cased words, 2 for fully upper-cased words and 0
    for everything else (lower-cased words, mixed case, or strings with
    no cased characters at all).
    """
    # A lower-cased word is tag 0, the same as the fallback, so only the
    # non-lowercase words need the remaining checks. Title case is tested
    # before upper case, so a single capital letter such as "A" yields 1.
    if not word.islower():
        if word.istitle():
            return 1
        if word.isupper():
            return 2
    return 0

def process_sentence(sentence):
    """Split a sentence into word tokens and tag each one by its casing.

    Tokens are maximal runs of word characters delimited by word
    boundaries; punctuation and whitespace are dropped. Returns a
    ``(tokens, ner_tags)`` pair of equally long lists, where each tag is
    produced by ``get_ner_tag`` for the token at the same position.
    """
    words = [match.group(0) for match in re.finditer(r'\b\w+\b', sentence)]
    tags = list(map(get_ner_tag, words))
    return words, tags

def create_dataset_from_txt_files(folder_path):
    """Build a column-oriented dataset from every ``.txt`` file in a folder.

    Each line of each file becomes one sample: the line is stripped,
    tokenized and tagged by ``process_sentence``. A blank line therefore
    yields a sample whose ``tokens`` and ``ner_tags`` are empty lists.

    Args:
        folder_path: Directory to scan (non-recursively) for ``.txt`` files.

    Returns:
        A dict with three parallel lists:
        ``"id"`` (unique string ids ``"0"``, ``"1"``, ... in sample order),
        ``"tokens"`` (list of token lists) and ``"ner_tags"`` (list of tag
        lists).
    """
    dataset = {"id": [], "tokens": [], "ner_tags": []}

    # os.listdir returns entries in arbitrary order; sorting makes the ids
    # and any later positional split reproducible between runs. Filtering
    # first also gives tqdm a known total for its progress bar.
    txt_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith(".txt")
    )

    for filename in tqdm(txt_files):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            # Stream the file line by line instead of loading it whole.
            for sentence in f:
                tokens, ner_tags = process_sentence(sentence.strip())
                # Running sample counter: unique per sentence, even when a
                # file holds several lines (the file index alone was not).
                dataset["id"].append(str(len(dataset["id"])))
                dataset["tokens"].append(tokens)
                dataset["ner_tags"].append(ner_tags)

    return dataset

def split_dataset(dataset, train_ratio=0.7, validation_ratio=0.15):
    """Partition a column-oriented dataset into train/validation/test parts.

    The split is positional (no shuffling): the first ``train_ratio`` share
    of the samples goes to train, the next ``validation_ratio`` share to
    validation, and whatever remains to test. Sizes are truncated to whole
    samples, so rounding leftovers end up in the test part.

    Args:
        dataset: Dict with parallel lists under "id", "tokens", "ner_tags".
        train_ratio: Fraction of samples assigned to the train part.
        validation_ratio: Fraction of samples assigned to validation.

    Returns:
        A ``(train, validation, test)`` tuple of dicts with the same three
        keys as the input.
    """
    columns = ("id", "tokens", "ner_tags")
    total = len(dataset["id"])
    train_end = int(total * train_ratio)
    validation_end = train_end + int(total * validation_ratio)

    def take(start, stop):
        # Cut the same window out of every column so rows stay aligned.
        return {name: dataset[name][start:stop] for name in columns}

    return (
        take(None, train_end),
        take(train_end, validation_end),
        take(validation_end, None),
    )

def convert_to_hf_dataset(dataset):
    """Wrap a column dict in a Hugging Face ``Dataset``.

    Only the 'id', 'tokens' and 'ner_tags' columns are carried over, in
    that order; any other keys in ``dataset`` are ignored.
    """
    columns = {name: dataset[name] for name in ('id', 'tokens', 'ner_tags')}
    return Dataset.from_dict(columns)

def create_dataset_dict(folder_path):
    """Run the full pipeline: read files, split, and package as DatasetDict.

    Reads every ``.txt`` file under ``folder_path``, splits the resulting
    samples with the default ratios of ``split_dataset``, and returns a
    ``DatasetDict`` with 'train', 'validation' and 'test' entries.
    """
    # Read the raw samples and cut them into the three positional parts.
    parts = split_dataset(create_dataset_from_txt_files(folder_path))

    # split_dataset returns the parts in exactly this order.
    split_names = ('train', 'validation', 'test')

    # Convert each part to a Hugging Face Dataset and bundle them by name.
    return DatasetDict({
        split_name: convert_to_hf_dataset(part)
        for split_name, part in zip(split_names, parts)
    })

# Directory containing the input .txt files. This is a hard-coded absolute
# path on the author's machine; change it before running elsewhere.
folder_path = '/home/vahan/Documents/NER_data/final_data'

# Run the whole pipeline: read the files, split the samples, and wrap the
# train/validation/test parts in a DatasetDict.
dataset_dict = create_dataset_dict(folder_path)

# Print a summary of the DatasetDict (splits, features, row counts).
print(dataset_dict)

# Persist the DatasetDict in Hugging Face's on-disk format under the
# relative path 'final_dataset' (optional step).
dataset_dict.save_to_disk('final_dataset')

1239961it [03:27, 5967.46it/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 867972
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 185994
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 185995
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/867972 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/185994 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/185995 [00:00<?, ? examples/s]