In [23]:
# imports
import pandas as pd
from collections import Counter
from scripts.load_data import extract_labeled_tokens, label_mapping, read_tsv_file

## Loading in the data

In [24]:
# paths to the data
path_train_original = "data/da_news/da_news_train.tsv"
path_dev_original = "data/da_news/da_news_dev.tsv"
path_test_original = "data/da_news/da_news_test.tsv"

path_train_emerging = "data/no_overlap_da_news/da_news_train.tsv"
path_dev_emerging = "data/no_overlap_da_news/da_news_dev.tsv"
path_test_emerging = "data/no_overlap_da_news/da_news_test.tsv"

In [25]:
# create mapping
label2id, id2label = label_mapping(path_train_original)

In [26]:
# read in the datasets
train_data_original = read_tsv_file(path_train_original, label2id)
dev_data_original = read_tsv_file(path_dev_original, label2id)
test_data_original = read_tsv_file(path_test_original, label2id)

train_data_emerging = read_tsv_file(path_train_emerging, label2id)
dev_data_emerging = read_tsv_file(path_dev_emerging, label2id)
test_data_emerging = read_tsv_file(path_test_emerging, label2id)

# Exploration of token overlap of original data

In [28]:
# checking for token overlap between the train, dev and test sets
def check_token_overlap(train_data, dev_data, test_data):
    train_tokens = extract_labeled_tokens(train_data)
    dev_tokens = extract_labeled_tokens(dev_data)
    test_tokens = extract_labeled_tokens(test_data)

    print('overlap between train and dev:', len(train_tokens & dev_tokens))
    print('overlap between dev and test:', len(dev_tokens & test_tokens))
    print('overlap between train and test:', len(train_tokens & test_tokens))

check_token_overlap(train_data_original, dev_data_original, test_data_original)

overlap between train and dev: 256
overlap between dev and test: 78
overlap between train and test: 219


# Sizes of the datasets before and after removing overlap

In [27]:
print("Before removing overlap")
print('\tSize of training set:', len(train_data_original))
print('\tSize of dev set:', len(dev_data_original))
print('\tSize of test set:', len(test_data_original))

print("\nAfter fixing overlap")
print('\tSize of training set:', len(train_data_emerging))
print('\tSize of dev set:', len(dev_data_emerging))
print('\tSize of test set:', len(test_data_emerging))

Before emerging entities
	Size of training set: 4383
	Size of dev set: 564
	Size of test set: 565

After emerging entities
	Size of training set: 4411
	Size of dev set: 549
	Size of test set: 552


# Exploring occurence of -part and -deriv in original data

In [29]:
# loading data sets with Pandas
news_train = pd.read_csv("data/da_news/da_news_train.tsv", sep="\t", header=None, names=["Token", "Tag", "Nested"], usecols=["Token", "Tag"])
news_test = pd.read_csv("data/da_news/da_news_test.tsv", sep="\t", header=None, names=["Token", "Tag", "Nested"], usecols=["Token", "Tag"])
news_dev = pd.read_csv("data/da_news/da_news_dev.tsv", sep="\t", header=None, names=["Token", "Tag", "Nested"], usecols=["Token", "Tag"], engine="python", on_bad_lines="skip")

In [30]:
# percentage of part/deriv tags in original training data
part_deriv = news_train["Tag"].str.contains("part|deriv").sum()

non_O_tag = (news_train["Tag"] != "O").sum()

print(part_deriv/non_O_tag*100)

12.109955423476968


In [31]:
# number of part/deriv by type in training data
tags = ["B-LOCpart", "B-LOCderiv", "B-ORGpart", "B-ORGderiv", "B-PERpart", "B-PERderiv", "B-MISCpart", "B-MISCderiv",
        "I-LOCpart", "I-LOCderiv", "I-ORGpart", "I-ORGderiv", "I-PERpart", "I-PERderiv", "I-MISCpart", "I-MISCderiv"]

for tag in tags:
    count = (news_train["Tag"]==tag).sum()
    print(f"{tag}: {count}")

B-LOCpart: 17
B-LOCderiv: 232
B-ORGpart: 51
B-ORGderiv: 4
B-PERpart: 6
B-PERderiv: 1
B-MISCpart: 13
B-MISCderiv: 2
I-LOCpart: 0
I-LOCderiv: 0
I-ORGpart: 0
I-ORGderiv: 0
I-PERpart: 0
I-PERderiv: 0
I-MISCpart: 0
I-MISCderiv: 0


# Counting length of LOC and ORG named entities in original data

### Counting LOC

In [32]:
# counting the token length of locations
def count_loc(tags):
    '''
    This function counts span lengths for named entities with -LOC tags
    '''
    i = 0
    LOC_length = []

    while i < len(tags):
        if tags[i] == 'B-LOC':
            length = 1  
            i += 1
            while i < len(tags) and tags[i] == 'I-LOC':
                length += 1
                i += 1
            LOC_length.append(length)
        else:
            i += 1

    return Counter(LOC_length)

In [33]:
# length of tokens in train
count_loc(news_train["Tag"].tolist())

Counter({1: 388, 2: 27, 3: 3, 4: 2})

In [34]:
# length of tokens in dev
count_loc(news_dev["Tag"].tolist())

Counter({1: 52, 2: 4})

In [35]:
# length of tokens in test
count_loc(news_test["Tag"].tolist())

Counter({1: 47})

### Counting ORG

In [36]:
# counting the token length of organizations
def count_org(tags):
    '''
    This function counts span lengths for named entities with -ORG tags
    '''
    i = 0
    ORG_length = []

    while i < len(tags):
        if tags[i] == 'B-ORG':
            length = 1  
            i += 1
            while i < len(tags) and tags[i] == 'I-ORG':
                length += 1
                i += 1
            ORG_length.append(length)
        else:
            i += 1

    return Counter(ORG_length)

In [37]:
# length of tokens in train
count_org(news_train["Tag"].tolist())

Counter({1: 260, 2: 102, 3: 48, 4: 6, 5: 1})

In [38]:
# length of tokens in dev
count_org(news_dev["Tag"].tolist())

Counter({1: 36, 2: 7, 3: 4})

In [39]:
# length of tokens in test
count_org(news_test["Tag"].tolist())

Counter({1: 35, 2: 21, 3: 5})

# Named Entity Distribution in Original Data

In [46]:
# counting the occurence of a specific NER tag in the data
def count_tag(data, specific_tag):
    '''
    This function counts the number of times a specific NER tag appears in the data.
    '''
    return sum(tag == specific_tag for sentence in data for tag in sentence["ner_tags"])

# counting the total number of NER tags
def total_tags(data):
    '''
    This function counts the total number of NER tags in the data.
    '''
    return sum(len(sentence["ner_tags"]) for sentence in data)

# defining the NER tags
NER_tags = ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-MISC", "I-MISC", "O"]

# printing count and percentage of each NER tag in the dataset
def counts_and_percentages(data):
    '''
    This function counts and prints the occurence of each NER tag and calculates the percentage of each tag relative to the total number of tags in the dataset.
    '''
    total = total_tags(data)
    for tag in NER_tags:
        count = count_tag(data, tag)
        percentage = (count / total) * 100 
        print(f"{tag}: {count} ({percentage:.2f}%)")

# printing results for overlapping data
counts_and_percentages(train_data_original)
counts_and_percentages(dev_data_original)
counts_and_percentages(test_data_original)

# Named Entity Distribution in Non-Overlapping Data

In [48]:
# printing results for non-overlapping data
counts_and_percentages(train_data_emerging)
counts_and_percentages(dev_data_emerging)
counts_and_percentages(test_data_emerging)

B-PER: 1353 (1.65%)
I-PER: 1039 (1.27%)
B-ORG: 891 (1.09%)
I-ORG: 507 (0.62%)
B-LOC: 898 (1.10%)
I-LOC: 78 (0.10%)
B-MISC: 260 (0.32%)
I-MISC: 285 (0.35%)
O: 76593 (93.52%)
B-PER: 94 (0.99%)
I-PER: 30 (0.32%)
B-ORG: 85 (0.90%)
I-ORG: 21 (0.22%)
B-LOC: 70 (0.74%)
I-LOC: 8 (0.08%)
B-MISC: 60 (0.63%)
I-MISC: 17 (0.18%)
O: 9094 (95.94%)
B-PER: 117 (1.25%)
I-PER: 53 (0.57%)
B-ORG: 82 (0.88%)
I-ORG: 28 (0.30%)
B-LOC: 58 (0.62%)
I-LOC: 6 (0.06%)
B-MISC: 35 (0.37%)
I-MISC: 13 (0.14%)
O: 8958 (95.81%)
