In [4]:
import pandas
import pathlib
import os
from datasets import Dataset ,concatenate_datasets, DatasetDict , Sequence , Value , Features , ClassLabel

In [5]:
def convert_tags(labels):
    '''
    Map dataset-specific NER tags onto one shared tag spelling.

    Tags that appear as keys in the mapping below (e.g. 'B-pers') are replaced
    by their unified form (e.g. 'B_PER'); every other tag is passed through
    unchanged.

    Parameters
    ----------
    labels : list of str
        The tags of one row.

    Returns
    -------
    list of str
        A new list with the same length and order as `labels`. The input
        list itself is left untouched.
    '''
    # One entry per source tag (the original literal listed the *-fac keys twice).
    maps = {
        'B-loc': 'B_LOC', 'I-loc': 'I_LOC',
        'B-pers': 'B_PER', 'I-pers': 'I_PER',
        'B-org': 'B_ORG', 'I-org': 'I_ORG',
        'B-fac': 'B_FAC', 'I-fac': 'I_FAC',
        'B-event': 'B_EVE', 'I-event': 'I_EVE',
        'B-pro': 'B_PRO', 'I-pro': 'I_PRO',
    }
    # Build a fresh list so callers that keep a reference to the raw labels
    # do not see them silently rewritten.
    return [maps.get(label, label) for label in labels]


In [6]:
def generate_dataset(token_path,label_path,shuffle=True):
    '''
    Build train / test / validation datasets from two parallel text files.

    Line i of `token_path` holds the whitespace-separated tokens of row i and
    line i of `label_path` holds one tag per token of that row. Tags are
    normalised with `convert_tags` and stored as a `ClassLabel` sequence over
    a fixed, alphabetically sorted tag list.

    Parameters
    ----------
    token_path : str or path-like
        UTF-8 text file with one row of tokens per line.
    label_path : str or path-like
        UTF-8 text file with one row of tags per line.
    shuffle : bool, default True
        Currently unused: rows always keep their file order. The parameter is
        kept so existing calls keep working.
        NOTE(review): decide whether this flag should be honoured or removed.

    Returns
    -------
    tuple
        `(train, test, validation)`: the first 80% of the rows, the next 10%,
        and the remaining rows, in file order.

    Raises
    ------
    ValueError
        If the two files have a different number of rows, or if a row has a
        different number of tokens and tags.
    '''
    # Read both files into lists of lists (one inner list per row).
    with open(token_path, encoding='utf-8', mode='r') as token_file, \
            open(label_path, encoding='utf-8', mode='r') as label_file:
        tokens = [row.strip('\n').split() for row in token_file]
        labels = [row.strip('\n').split() for row in label_file]

    # Verifying data: the files must be parallel row by row ...
    if len(tokens) != len(labels):
        raise ValueError("Number of rows in tokens and labels doesn't match!")

    # ... and token by token. Tag conversion keeps row lengths, so this can be
    # checked before converting; the message names the offending row.
    for row_number, (token_row, label_row) in enumerate(zip(tokens, labels), start=1):
        if len(token_row) != len(label_row):
            raise ValueError(
                f"Number of tokens and labels in row {row_number} doesn't match! "
                f"({len(token_row)} tokens, {len(label_row)} labels)"
            )

    # Aligning the tag spelling of both corpora.
    labels = [convert_tags(label_row) for label_row in labels]

    # Fixed tag inventory shared by both corpora. It is sorted so that the
    # integer id of each tag is the same no matter which corpus is loaded.
    tags = sorted({
        'O',
        'B_DAT', 'I_DAT',
        'B_EVE', 'I_EVE',
        'B_FAC', 'I_FAC',
        'B_LOC', 'I_LOC',
        'B_MON', 'I_MON',
        'B_ORG', 'I_ORG',
        'B_PCT', 'I_PCT',
        'B_PER', 'I_PER',
        'B_PRO', 'I_PRO',
        'B_TIM', 'I_TIM',
    })

    # Creating Dataset object
    dataset_dict = {
        'tokens' : tokens,
        'ner_tags' : labels
    }

    ner_tags = ClassLabel(num_classes=len(tags) , names=tags)
    dataset = Dataset.from_dict(
        mapping = dataset_dict,
        features = Features({
            'tokens' : Sequence(feature=Value('string')),
            'ner_tags' : Sequence(ner_tags)
        })
    )

    # Contiguous 80% / 10% / 10% split boundaries (row indices).
    train_split = round(dataset.num_rows * 0.8)
    test_split = round(dataset.num_rows * 0.9)

    train = dataset.select(range(0,train_split))
    test = dataset.select(range(train_split,test_split))
    validation = dataset.select(range(test_split,dataset.num_rows))

    return train , test , validation

In [7]:
# Raw input files, relative to the notebook's working directory.
# Each corpus comes as a tokens file plus a labels file.
arman_tokens, arman_labels = (
    './PEYMA_ARMAN/arman-tokens.txt',
    './PEYMA_ARMAN/arman-labels.txt',
)

peyma_tokens, peyma_labels = (
    './PEYMA_ARMAN/peyma-tokens.txt',
    './PEYMA_ARMAN/peyma-labels.txt',
)

In [8]:
# `corpus` starts empty here; the per-source dictionaries below each hold the
# three splits of one raw corpus.
corpus = DatasetDict()

# generate_dataset returns its splits in (train, test, validation) order, so
# they are paired positionally with the split names.
corpus_arman = DatasetDict(
    zip(('train', 'test', 'validation'), generate_dataset(arman_tokens, arman_labels))
)
corpus_peyma = DatasetDict(
    zip(('train', 'test', 'validation'), generate_dataset(peyma_tokens, peyma_labels))
)

In [9]:
# Merge the two corpora split by split: ARMAN rows first, then PEYMA rows.
for split_name in ('train', 'test', 'validation'):
    corpus[split_name] = concatenate_datasets(
        [corpus_arman[split_name], corpus_peyma[split_name]]
    )

In [11]:
def create_tag_names(batch):
    """
    Produce a readable `ner_tags_names` column from the integer `ner_tags`.

    Every id in `batch['ner_tags']` is translated with `tags.int2str`, where
    `tags` is a module-level name that must be assigned before this function
    is called.

    Returns a dict with the single key 'ner_tags_names' holding the translated
    values in the same order as the ids.
    """
    readable_names = []
    for tag_id in batch['ner_tags']:
        readable_names.append(tags.int2str(tag_id))
    return {'ner_tags_names': readable_names}

In [12]:
# `tags` is the label feature wrapped by the 'ner_tags' sequence column;
# create_tag_names reads it as a module-level name, so it must be set first.
ner_tags_column = corpus['train'].features['ner_tags']
tags = ner_tags_column.feature

# Add the readable 'ner_tags_names' column to every split.
corpus = corpus.map(create_tag_names)

Map:   0%|          | 0/26417 [00:00<?, ? examples/s]

Map:   0%|          | 0/3303 [00:00<?, ? examples/s]

Map:   0%|          | 0/3302 [00:00<?, ? examples/s]

In [13]:
# Shuffle every split with a fixed seed, then write the merged corpus to disk.
shuffle_seed = 42
output_dir = "PEYMA_ARMAN_MIXED.hf"

corpus = corpus.shuffle(shuffle_seed)
corpus.save_to_disk(output_dir)

Saving the dataset (0/1 shards):   0%|          | 0/26417 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3303 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3302 [00:00<?, ? examples/s]