# Load data

In [77]:
from datasets import load_dataset,Dataset,load_from_disk ,concatenate_datasets, DatasetDict , Sequence , Value , Features , ClassLabel
# Load the two source corpora from disk (both paths are relative to the working directory).
english = load_from_disk("Conell_en.hf")
persian = load_from_disk("PEYMA_ARMAN_MIXED.hf")

In [78]:
# Display the English DatasetDict to inspect its splits, columns, and row counts.
english

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_names'],
        num_rows: 11343
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_names'],
        num_rows: 2890
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_names'],
        num_rows: 2555
    })
})

# Removing tags not shared by both datasets

In [79]:
# B_/I_ tag names that the filtering cells below remove from the Persian corpus.
# The label map applied to the English data in this notebook only covers
# PER / ORG / LOC / O, so sentences carrying any of these tags are dropped.
tags = [
    'B_DAT', 'B_EVE', 'B_FAC', 'B_MON', 'B_PCT', 'B_PRO', 'B_TIM',
    'I_DAT', 'I_EVE', 'I_FAC', 'I_MON', 'I_PCT', 'I_PRO', 'I_TIM',
]

In [80]:
tags = set(tags)
# Keep an immutable snapshot under a dedicated name: later cells rebind the
# global `tags` (as a loop variable and as the final label list), which would
# silently change what `remove_tags` filters if the lookup stayed global.
EXCLUDED_TAGS = frozenset(tags)


def remove_tags(example, excluded_tags=EXCLUDED_TAGS):
    """
    Return `example` unchanged, or None when it contains an excluded tag.

    Parameters
    ----------
    example : iterable of str
        The `ner_tags_names` value of one row (the tag names of one sentence).
    excluded_tags : collection of str, optional
        Tags that disqualify a sentence. Defaults to the snapshot of `tags`
        taken when this cell was executed.

    Returns
    -------
    The same `example` object when none of its tags is excluded, otherwise
    None so that a following `dropna()` can discard the row.
    """
    if any(tag in excluded_tags for tag in example):
        return None
    return example

In [81]:
import pandas
def _without_excluded_rows(split_name):
    """
    Convert one split of `persian` to a DataFrame and drop filtered sentences.

    `remove_tags` replaces the `ner_tags_names` value of every sentence that
    contains an excluded tag with None; `dropna()` then removes those rows
    (along with any other row holding a missing value).
    """
    frame = persian[split_name].to_pandas()
    frame['ner_tags_names'] = frame['ner_tags_names'].apply(remove_tags)
    return frame.dropna()


df_train = _without_excluded_rows('train')
df_test = _without_excluded_rows('test')
df_valid = _without_excluded_rows('validation')

In [82]:
# Collect every tag name that survives the filtering and show the distinct values.
# A comprehension keeps its loop variables local, so the module-level `tags`
# defined in earlier cells is no longer overwritten by this inspection cell.
res = [tag for sentence_tags in df_train['ner_tags_names'] for tag in sentence_tags]
set(res)

{'B_LOC', 'B_ORG', 'B_PER', 'I_LOC', 'I_ORG', 'I_PER', 'O'}

In [83]:
# Rebuild the Persian DatasetDict from the filtered frames.
# When `dropna()` removed rows, the frames no longer carry a plain RangeIndex,
# and `Dataset.from_pandas` would store that index as an extra
# `__index_level_0__` column. `preserve_index=False` keeps the schema limited
# to the real data columns so it lines up with the English dataset.
persian_aligned = DatasetDict({
    'train': Dataset.from_pandas(df_train, preserve_index=False),
    'test': Dataset.from_pandas(df_test, preserve_index=False),
    'validation': Dataset.from_pandas(df_valid, preserve_index=False),
})

# Convert tags to a unified naming scheme

In [84]:

def convert_tags(example):
    '''
    Rewrite the hyphenated tag names of one example ('B-LOC', 'I-PER', ...)
    into the underscore form ('B_LOC', 'I_PER', ...) so both datasets use the
    same label spelling.

    The example is updated in place and the same object is returned.
    A tag that is not in the lookup table raises KeyError.
    '''
    hyphen_to_underscore = {
        'O': 'O',
        'B-PER': 'B_PER', 'I-PER': 'I_PER',
        'B-ORG': 'B_ORG', 'I-ORG': 'I_ORG',
        'B-LOC': 'B_LOC', 'I-LOC': 'I_LOC',
    }

    example['ner_tags_names'] = [
        hyphen_to_underscore[original_tag]
        for original_tag in example['ner_tags_names']
    ]
    return example


# Shared label inventory; the list order matches the ids used by `create_tag_number`.
tags = ['O', 'B_PER', 'I_PER', 'B_ORG', 'I_ORG', 'B_LOC', 'I_LOC']
ner_tags = ClassLabel(names=tags, num_classes=len(tags))

# Rename the English tag names to the underscore convention.
english_aligned = english.map(convert_tags)

# Cast the 'ner_tags' column of both corpora to a sequence of that ClassLabel.
english_aligned = english_aligned.cast_column('ner_tags', Sequence(ner_tags))
persian_aligned = persian_aligned.cast_column('ner_tags', Sequence(ner_tags))

Map:   0%|          | 0/11343 [00:00<?, ? examples/s]

Map:   0%|          | 0/2890 [00:00<?, ? examples/s]

Map:   0%|          | 0/2555 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/11343 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2890 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2555 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/21838 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2693 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2779 [00:00<?, ? examples/s]

In [108]:
# Show the 'ner_tags' feature type of both training splits side by side.
tuple(
    split_dict['train'].features['ner_tags']
    for split_dict in (persian_aligned, english_aligned)
)

(Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 Sequence(feature=Value(dtype='string', id=None), length=-1, id=None))

# Convert each tag to a number

In [109]:
def create_tag_number(batch):
    """
    Map the tag names of one example to their integer ids.

    Parameters
    ----------
    batch : mapping
        Must contain 'ner_tags_names', an iterable of tag-name strings.
        Despite the parameter name, the value is treated as the flat tag
        list of a single example.

    Returns
    -------
    dict
        {'ner_tags': [...]} holding one integer id per tag name, in order.

    Raises
    ------
    KeyError
        If a tag name is not one of the seven known labels.
    """
    tag_to_id = {'O':0, 'B_PER':1, 'I_PER':2, 'B_ORG':3, 'I_ORG':4, 'B_LOC':5, 'I_LOC':6}

    return {'ner_tags' : [tag_to_id[name] for name in batch['ner_tags_names']]}

In [110]:
# Recompute the integer 'ner_tags' column of both corpora from their tag names,
# so the ids follow the mapping defined in `create_tag_number`.
persian_aligned = persian_aligned.map(create_tag_number)
english_aligned  = english_aligned.map(create_tag_number)

Map:   0%|          | 0/21838 [00:00<?, ? examples/s]

Map:   0%|          | 0/2693 [00:00<?, ? examples/s]

Map:   0%|          | 0/2779 [00:00<?, ? examples/s]

Map:   0%|          | 0/11343 [00:00<?, ? examples/s]

Map:   0%|          | 0/2890 [00:00<?, ? examples/s]

Map:   0%|          | 0/2555 [00:00<?, ? examples/s]

In [121]:
# Concatenate the two datasets, split by split, into a single DatasetDict.
corpus = DatasetDict({
    split: concatenate_datasets([persian_aligned[split], english_aligned[split]])
    for split in ('train', 'test', 'validation')
})


In [112]:
# Sanity check: list the distinct tag ids present in the Persian training split.
# A comprehension keeps its loop variables local, so the module-level `tags`
# label list is not overwritten by this inspection cell.
res = [tag for sentence_tags in persian_aligned['train']['ner_tags'] for tag in sentence_tags]
set(res)

{0, 1, 2, 3, 4, 5, 6}

In [113]:
# Sanity check: list the distinct tag ids present in the English training split.
# A comprehension keeps its loop variables local, so the module-level `tags`
# label list is not overwritten by this inspection cell.
res = [tag for sentence_tags in english_aligned['train']['ner_tags'] for tag in sentence_tags]
set(res)

{0, 1, 2, 3, 4, 5, 6}

In [122]:
# Shuffle every split with a fixed seed, then save the merged dataset to disk.
corpus = corpus.shuffle(seed=42)
corpus.save_to_disk("ner_dataset_fa_en.hf")

Saving the dataset (0/1 shards):   0%|          | 0/33181 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5583 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5334 [00:00<?, ? examples/s]