In [17]:
import pandas
import pathlib
import os
from datasets import Dataset ,load_dataset,concatenate_datasets, DatasetDict , Sequence , Value , Features , ClassLabel

In [2]:
# Corpus-specific tag spellings mapped to the shared tag names. Built once when
# the cell runs (not once per row) and listing every key exactly once.
_TAG_ALIASES = {
    'B-loc': 'B_LOC', 'I-loc': 'I_LOC',
    'B-pers': 'B_PER', 'I-pers': 'I_PER',
    'B-org': 'B_ORG', 'I-org': 'I_ORG',
    'B-fac': 'B_FAC', 'I-fac': 'I_FAC',
    'B-event': 'B_EVE', 'I-event': 'I_EVE',
    'B-pro': 'B_PRO', 'I-pro': 'I_PRO',
}


def convert_tags(labels):
    '''
    Rewrite the tags of one row to the shared tag names (aligning the label
    sets of the two datasets).

    Parameters
    ----------
    labels : list of str
        Tags of a single row. The list is modified in place.

    Returns
    -------
    list of str
        The same list object that was passed in. Tags with an entry in the
        alias table are replaced; every other tag is left untouched.
    '''
    for index, label in enumerate(labels):
        replacement = _TAG_ALIASES.get(label)
        # Only write back when an alias exists, so unmapped tags keep their value.
        if replacement is not None:
            labels[index] = replacement
    return labels


In [3]:
def generate_dataset(token_path,label_path,shuffle=True):
    '''
    Build train / test / validation datasets from two parallel text files.

    Every line of ``token_path`` is split on whitespace into the tokens of one
    row, and the line at the same position in ``label_path`` into that row's
    tags. The tags are passed through convert_tags so that both source
    datasets share one tag set. Rows are then split in file order: the first
    80% become train, the next 10% test and the remaining rows validation.

    Parameters
    ----------
    token_path : str
        Path of the UTF-8 text file holding the tokens, one row per line.
    label_path : str
        Path of the UTF-8 text file holding the tags, one row per line.
    shuffle : bool
        Kept for backward compatibility only. The value is never read, so the
        rows are always split in file order.

    Returns
    -------
    tuple
        ``(train, test, validation)`` Dataset objects with the columns
        'tokens' (sequence of strings) and 'ner_tags' (sequence of class labels).

    Raises
    ------
    ValueError
        If the two files have a different number of rows, if any row has a
        different number of tokens and tags, or if a tag is not part of the
        known tag set. (ValueError is a subclass of the Exception raised by
        earlier versions of this function.)
    '''

    def _read_rows(path):
        # One whitespace-split list per line of the file.
        with open(path , encoding='utf-8' , mode='r') as handle:
            return [row.strip('\n').split() for row in handle]

    # List of lists for tokens and labels
    tokens = _read_rows(token_path)
    labels = _read_rows(label_path)

    # Verifying data
    if len(tokens) != len(labels):
        raise ValueError("Number of rows in tokens and labels doesn't match!")

    # Aligning the tag names of both source datasets
    labels = list(map(convert_tags,labels))

    # Verifying every row after aligning, reporting the offending row number
    for row_number , (token , label) in enumerate(zip(tokens,labels), start=1):
        if len(token) != len(label):
            raise ValueError(
                "Number of tokens and labels in row {} doesn't match! "
                "({} tokens, {} labels)".format(row_number, len(token), len(label))
            )

    # Fixed, sorted tag set shared by both datasets. The sorted order defines
    # the integer id of every class label, so it must stay stable.
    tags = sorted({
        'B_DAT', 'B_LOC', 'B_MON', 'B_ORG', 'B_PCT', 'B_PER', 'B_TIM',
        'I_DAT', 'I_LOC', 'I_MON', 'I_ORG', 'I_PCT', 'I_PER', 'I_TIM',
        'B_FAC', 'I_FAC', 'B_EVE', 'I_EVE', 'B_PRO', 'I_PRO', 'O',
    })

    # Verifying tags: every tag found in the data must belong to the tag set
    unknown_tags = sorted({tag for row in labels for tag in row} - set(tags))
    if unknown_tags:
        raise ValueError('Wrong tags detected! Unknown tags: {}'.format(unknown_tags))

    # Creating Dataset object
    dataset_dict = {
        'tokens' : tokens,
        'ner_tags' : labels
    }

    ner_tags = ClassLabel(num_classes=len(tags) , names=tags)
    dataset = Dataset.from_dict(
        mapping = dataset_dict,
        features = Features({
            'tokens' : Sequence(feature=Value('string')),
            'ner_tags' : Sequence(ner_tags)
        })
    )

    # Returning train, test and validation splits (80% / 10% / 10% in file order)
    train_split = round(dataset.num_rows * 0.8)
    test_split = round(dataset.num_rows * 0.9)

    train = dataset.select(range(0,train_split))
    test = dataset.select(range(train_split,test_split))
    validation = dataset.select(range(test_split,dataset.num_rows))

    return train , test , validation

In [4]:
# Parallel token / label files of the ARMAN corpus.
arman_tokens, arman_labels = (
    './PEYMA_ARMAN/arman-tokens.txt',
    './PEYMA_ARMAN/arman-labels.txt',
)

# Parallel token / label files of the PEYMA corpus.
peyma_tokens, peyma_labels = (
    './PEYMA_ARMAN/peyma-tokens.txt',
    './PEYMA_ARMAN/peyma-labels.txt',
)

In [5]:
# Empty container for the merged corpus, plus one DatasetDict per source
# corpus. generate_dataset returns its splits in the order train, test,
# validation, which is paired with the split names below.
corpus = DatasetDict()
corpus_arman = DatasetDict(
    zip(('train', 'test', 'validation'), generate_dataset(arman_tokens, arman_labels))
)
corpus_peyma = DatasetDict(
    zip(('train', 'test', 'validation'), generate_dataset(peyma_tokens, peyma_labels))
)

In [6]:
# Merge the two datasets into one: for every split, the ARMAN rows come first
# and the PEYMA rows follow.
for split_name in ('train', 'test', 'validation'):
    corpus[split_name] = concatenate_datasets(
        [corpus_arman[split_name], corpus_peyma[split_name]]
    )

In [7]:
def create_tag_names(batch, label_feature=None):
    """
    Return a new column holding the tag name of every integer tag id, to
    improve readability.

    Parameters
    ----------
    batch : mapping
        Must provide 'ner_tags', an iterable of integer tag ids. Despite the
        parameter name, each id is converted individually, which matches a
        single example as passed by ``map`` without ``batched=True``.
    label_feature : object, optional
        Object exposing ``int2str(idx)``. When omitted, the module-level
        ``tags`` variable is used; it is looked up at call time, so it must
        still hold the label feature when this function runs.

    Returns
    -------
    dict
        ``{'ner_tags_names': [...]}`` with one tag name per id in 'ner_tags'.
    """
    if label_feature is None:
        # Backward-compatible fallback to the notebook-level global.
        label_feature = tags
    return {'ner_tags_names' : [label_feature.int2str(idx) for idx in batch['ner_tags']]}

In [8]:
# Publish the label feature of the 'ner_tags' column as the global `tags`
# (create_tag_names reads it by name), then add the readable column to
# every split.
train_features = corpus['train'].features
tags = train_features['ner_tags'].feature
corpus = corpus.map(function=create_tag_names)

Map:   0%|          | 0/26417 [00:00<?, ? examples/s]

Map:   0%|          | 0/3303 [00:00<?, ? examples/s]

Map:   0%|          | 0/3302 [00:00<?, ? examples/s]

In [15]:
# Shuffle every split with a fixed seed, then persist the shuffled corpus.
# NOTE(review): save_to_disk writes a dataset directory, not a single JSON
# file, so the ".json" suffix of the target name is misleading. The name is
# kept unchanged so existing readers of that path keep working.
corpus = corpus.shuffle(seeds=42)
corpus.save_to_disk(dataset_dict_path="PEYMA_ARMAN_Mixed.json")

Saving the dataset (0/1 shards):   0%|          | 0/26417 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3303 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3302 [00:00<?, ? examples/s]

In [23]:
# One pandas DataFrame per split, converted in the order train, test, validation.
df_train, df_test, df_validation = (
    corpus[split_name].to_pandas()
    for split_name in ('train', 'test', 'validation')
)

In [24]:
# Write each split to its own CSV file in the working directory.
for csv_path, split_frame in (
    ('train.csv', df_train),
    ('test.csv', df_test),
    ('validation.csv', df_validation),
):
    split_frame.to_csv(csv_path)

In [57]:
import pandas as pd

# Zeroed counter template. Its key order also fixes the column order of the
# resulting table.
tag_counts = {
    "B_DAT": 0, "B_EVE": 0, "B_FAC": 0, "B_LOC": 0, "B_MON": 0,
    "B_ORG": 0, "B_PCT": 0, "B_PER": 0, "B_PRO": 0, "B_TIM": 0,
    "I_DAT": 0, "I_EVE": 0, "I_FAC": 0, "I_LOC": 0, "I_MON": 0,
    "I_ORG": 0, "I_PCT": 0, "I_PER": 0, "I_PRO": 0, "I_TIM": 0,
    "O":0
}
dicts = {}
# Iterate through each split of the dataset and count every tag occurrence
splits = ["train","test", "validation" ]
for split in splits:
    print(split)
    ner_tags = corpus[split]["ner_tags_names"]
    # Fresh copy per split so the counts of one split never leak into the next.
    tag_counts_temp = tag_counts.copy()
    # The row variable is deliberately not called `tags`: that global holds
    # the label feature read by create_tag_names, and rebinding it here would
    # break re-running the mapping cell.
    for sentence_tags in ner_tags:
        for tag in sentence_tags:
            tag_counts_temp[tag] += 1
    print(tag_counts_temp)
    dicts[split] = tag_counts_temp

# Convert the per-split counts to a DataFrame: one row per split, one column per tag
df = pd.DataFrame.from_dict(dicts, orient="index", columns=tag_counts.keys())



# Display the resulting DataFrame
print(df)

train
{'B_DAT': 1512, 'B_EVE': 1379, 'B_FAC': 1334, 'B_LOC': 13040, 'B_MON': 446, 'B_ORG': 15762, 'B_PCT': 266, 'B_PER': 11371, 'B_PRO': 1719, 'B_TIM': 224, 'I_DAT': 1939, 'I_EVE': 4600, 'I_FAC': 2222, 'I_LOC': 4254, 'I_MON': 1314, 'I_ORG': 21347, 'I_PCT': 308, 'I_PER': 7160, 'I_PRO': 1736, 'I_TIM': 375, 'O': 747216}
test
{'B_DAT': 185, 'B_EVE': 218, 'B_FAC': 124, 'B_LOC': 1868, 'B_MON': 53, 'B_ORG': 2017, 'B_PCT': 27, 'B_PER': 1566, 'B_PRO': 281, 'B_TIM': 27, 'I_DAT': 245, 'I_EVE': 697, 'I_FAC': 237, 'I_LOC': 511, 'I_MON': 142, 'I_ORG': 2843, 'I_PCT': 31, 'I_PER': 1075, 'I_PRO': 345, 'I_TIM': 37, 'O': 92214}
validation
{'B_DAT': 161, 'B_EVE': 143, 'B_FAC': 192, 'B_LOC': 1539, 'B_MON': 28, 'B_ORG': 2180, 'B_PCT': 33, 'B_PER': 1335, 'B_PRO': 172, 'B_TIM': 30, 'I_DAT': 217, 'I_EVE': 520, 'I_FAC': 349, 'I_LOC': 494, 'I_MON': 54, 'I_ORG': 2923, 'I_PCT': 34, 'I_PER': 813, 'I_PRO': 136, 'I_TIM': 39, 'O': 96857}
            B_DAT  B_EVE  B_FAC  B_LOC  B_MON  B_ORG  B_PCT  B_PER  B_PRO  \
trai

In [58]:
# Render the per-split tag-count table (rows: splits, columns: tags) as the cell output.
df

Unnamed: 0,B_DAT,B_EVE,B_FAC,B_LOC,B_MON,B_ORG,B_PCT,B_PER,B_PRO,B_TIM,...,I_EVE,I_FAC,I_LOC,I_MON,I_ORG,I_PCT,I_PER,I_PRO,I_TIM,O
train,1512,1379,1334,13040,446,15762,266,11371,1719,224,...,4600,2222,4254,1314,21347,308,7160,1736,375,747216
test,185,218,124,1868,53,2017,27,1566,281,27,...,697,237,511,142,2843,31,1075,345,37,92214
validation,161,143,192,1539,28,2180,33,1335,172,30,...,520,349,494,54,2923,34,813,136,39,96857
