# Download and load dataset

In [63]:
from datasets import load_dataset,Dataset, load_from_disk ,concatenate_datasets, DatasetDict , Sequence , Value , Features , ClassLabel
# Load the "conll2003" dataset by name; the cells below read its
# 'train', 'validation' and 'test' splits.
conll = load_dataset("conll2003")

Found cached dataset conll2003 (C:/Users/11/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

In [64]:
# Show the label names declared for the 'ner_tags' feature of the train split.
conll['train'].features['ner_tags'].feature.names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [65]:
# Build a ClassLabel over the nine NER tag names so that each integer label id
# can be converted to its string name with `int2str`.
tags = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
tags_names = ClassLabel(num_classes=len(tags) , names=tags)
def create_tag_names(batch):
    """
    Add a human-readable copy of the NER labels.

    Each integer id in ``batch['ner_tags']`` is translated to its string
    name through the module-level ``tags_names`` ClassLabel.

    Returns a dict with the single key ``'ner_tags_names'`` whose value is
    the list of names, in the same order as the ids.
    """
    label_names = []
    for label_id in batch['ner_tags']:
        label_names.append(tags_names.int2str(label_id))
    return {'ner_tags_names': label_names}

In [66]:
# Run create_tag_names over the dataset so every split gains a
# 'ner_tags_names' column alongside the integer 'ner_tags'.
conll = conll.map(create_tag_names)

Loading cached processed dataset at C:\Users\11\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-5e807906eac5bcfd.arrow
Loading cached processed dataset at C:\Users\11\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-b068e7a3ef702cd1.arrow
Loading cached processed dataset at C:\Users\11\.cache\huggingface\datasets\conll2003\conll2003\1.0.0\9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98\cache-e33a7f87947c4ce6.arrow


# Aligning

## Remove sentences with MISC tags to align the data

In [67]:
# Preview the tag names of the first training sentence.
# Index the row first and the column second: selecting the column first
# loads the whole 'ner_tags_names' column just to show one element.
conll['train'][0]['ner_tags_names']

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [68]:
def remove_tags(example):
    """
    Flag a tag sequence that contains a MISC label.

    ``example`` is an iterable of tag names. Returns ``None`` as soon as a
    'B-MISC' or 'I-MISC' tag is found; otherwise returns ``example`` itself,
    unchanged.
    """
    for label in example:
        if label in {'B-MISC', 'I-MISC'}:
            return None
    return example

In [69]:
import pandas
# Convert each split to a pandas DataFrame (order: train, test, validation)
# so the tag filter below can be applied row by row.
df_train, df_test, df_valid = (
    conll[split_name].to_pandas()
    for split_name in ('train', 'test', 'validation')
)

In [70]:
# Flag sentences that contain a MISC tag: remove_tags returns None for them
# and returns the tag sequence unchanged otherwise.
df_train['ner_tags_names'] = df_train['ner_tags_names'].apply(remove_tags)
df_test['ner_tags_names'] = df_test['ner_tags_names'].apply(remove_tags)
df_valid['ner_tags_names'] = df_valid['ner_tags_names'].apply(remove_tags)

# Drop only the flagged rows. Restricting dropna to 'ner_tags_names' keeps a
# missing value in any other column from silently removing extra sentences.
df_train = df_train.dropna(subset=['ner_tags_names'])
df_valid = df_valid.dropna(subset=['ner_tags_names'])
df_test = df_test.dropna(subset=['ner_tags_names'])

In [71]:
# Sanity check: gather every tag left in the filtered training frame and show
# the distinct values. A comprehension is used so its loop variables stay
# local and do not overwrite the module-level `tags` label list.
res = [tag for row_tags in df_train['ner_tags_names'] for tag in row_tags]
set(res)

{'B-LOC', 'B-ORG', 'B-PER', 'I-LOC', 'I-ORG', 'I-PER', 'O'}

In [72]:
# Wrap the filtered DataFrames back into datasets, one per split.
filtered_conill = DatasetDict({
    'train': Dataset.from_pandas(df_train),
    'test': Dataset.from_pandas(df_test),
    'validation': Dataset.from_pandas(df_valid),
})

In [73]:
# Inspect the rebuilt splits: column names and row counts after filtering.
filtered_conill

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_names', '__index_level_0__'],
        num_rows: 11343
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_names', '__index_level_0__'],
        num_rows: 2890
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags', 'ner_tags_names', '__index_level_0__'],
        num_rows: 2555
    })
})

In [74]:
# Remove the columns that are not kept in the saved dataset, leaving
# 'tokens', 'ner_tags' and 'ner_tags_names'. DatasetDict.remove_columns
# applies the removal to every split, so the column list is written once.
columns_to_drop = ['__index_level_0__', 'pos_tags', 'chunk_tags', 'id']
filtered_conill = filtered_conill.remove_columns(columns_to_drop)

In [79]:
# Persist the filtered splits to the relative path "Conell_en.hf".
filtered_conill.save_to_disk("Conell_en.hf")

Saving the dataset (0/1 shards):   0%|          | 0/11343 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2890 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2555 [00:00<?, ? examples/s]

In [80]:
# Final check of the saved dataset's columns and row counts per split.
filtered_conill

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_names'],
        num_rows: 11343
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_names'],
        num_rows: 2890
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'ner_tags_names'],
        num_rows: 2555
    })
})