In [31]:
import pandas as pd
from collections import Counter
from scripts.load_data import extract_labeled_tokens, label_mapping, read_tsv_file

In [32]:
# loading data sets with Pandas
news_train = pd.read_csv("data/da_news/da_news_train.tsv", sep="\t", header=None, names=["Token", "Tag", "Nested"], usecols=["Token", "Tag"])
news_test = pd.read_csv("data/da_news/da_news_test.tsv", sep="\t", header=None, names=["Token", "Tag", "Nested"], usecols=["Token", "Tag"])
news_dev = pd.read_csv("data/da_news/da_news_dev.tsv", sep="\t", header=None, names=["Token", "Tag", "Nested"], usecols=["Token", "Tag"], engine="python", on_bad_lines="skip")

In [35]:
# loading data sets as tsv
path_train = "data/da_news/da_news_train.tsv"
path_dev = "data/da_news/da_news_dev.tsv"
path_test = "data/da_news/da_news_test.tsv"

# create mapping
label2id, id2label = label_mapping(path_train)

# read in the DaN+ data
train_data = read_tsv_file(path_train, label2id)
dev_data = read_tsv_file(path_dev, label2id)
test_data = read_tsv_file(path_test, label2id)

# Size and Token Overlap of Original Data

In [36]:
# dataset sizes
def check_dataset_sizes(train_data, dev_data, test_data):
    total = len(train_data) + len(dev_data) + len(test_data)
    print("train size:", len(train_data))
    print("dev size:", len(dev_data))
    print("test size:", len(test_data))
    print("total dataset size:", total)

# token overlap
def check_token_overlap(train_data, dev_data, test_data):
    train_tokens = extract_labeled_tokens(train_data)
    dev_tokens = extract_labeled_tokens(dev_data)
    test_tokens = extract_labeled_tokens(test_data)

    print('overlap between train and dev:', len(train_tokens & dev_tokens))
    print('overlap between dev and test:', len(dev_tokens & test_tokens))
    print('overlap between train and test:', len(train_tokens & test_tokens))

check_dataset_sizes(train_data, dev_data, test_data)
check_token_overlap(train_data, dev_data, test_data)

train size: 4383
dev size: 564
test size: 565
total dataset size: 5512
overlap between train and dev: 256
overlap between dev and test: 78
overlap between train and test: 219


# Checking for Part and Deriv

In [13]:
# frequency of part/deriv tags in training data
part_deriv = news_train["Tag"].str.contains("part|deriv").sum()

non_O_tag = (news_train["Tag"] != "O").sum()

print(part_deriv/non_O_tag*100)

12.109955423476968


In [14]:
# number of part/deriv by type in training data
tags = ["B-LOCpart", "B-LOCderiv", "B-ORGpart", "B-ORGderiv", "B-PERpart", "B-PERderiv", "B-MISCpart", "B-MISCderiv",
        "I-LOCpart", "I-LOCderiv", "I-ORGpart", "I-ORGderiv", "I-PERpart", "I-PERderiv", "I-MISCpart", "I-MISCderiv"]

for tag in tags:
    count = (news_train["Tag"]==tag).sum()
    print(f"{tag}: {count}")

B-LOCpart: 17
B-LOCderiv: 232
B-ORGpart: 51
B-ORGderiv: 4
B-PERpart: 6
B-PERderiv: 1
B-MISCpart: 13
B-MISCderiv: 2
I-LOCpart: 0
I-LOCderiv: 0
I-ORGpart: 0
I-ORGderiv: 0
I-PERpart: 0
I-PERderiv: 0
I-MISCpart: 0
I-MISCderiv: 0


# Named Entity Distribution in Danish News Data

In [15]:
# counting for B-PER
((news_train["Tag"] == 'B-PER').sum()+(news_test["Tag"] == 'B-PER').sum()+(news_dev["Tag"] == 'B-PER').sum())

747

In [16]:
# counting for I-PER
((news_train["Tag"] == 'I-PER').sum()+(news_test["Tag"] == 'I-PER').sum()+(news_dev["Tag"] == 'I-PER').sum())

526

In [18]:
# counting the token length of locations
def count_loc(tags):
    i = 0
    LOC_length = []

    while i < len(tags):
        if tags[i] == 'B-LOC':
            length = 1  
            i += 1
            while i < len(tags) and tags[i] == 'I-LOC':
                length += 1
                i += 1
            LOC_length.append(length)
        else:
            i += 1

    return Counter(LOC_length)

In [19]:
# length of tokens in train
count_loc(news_train["Tag"].tolist())

Counter({1: 388, 2: 27, 3: 3, 4: 2})

In [21]:
# length of tokens in dev
count_loc(news_dev["Tag"].tolist())

Counter({1: 52, 2: 4})

In [20]:
# length of tokens in test
count_loc(news_test["Tag"].tolist())

Counter({1: 47})

In [22]:
# counting the token length of locations
def count_org(tags):
    i = 0
    ORG_length = []

    while i < len(tags):
        if tags[i] == 'B-ORG':
            length = 1  
            i += 1
            while i < len(tags) and tags[i] == 'I-ORG':
                length += 1
                i += 1
            ORG_length.append(length)
        else:
            i += 1

    return Counter(ORG_length)

In [23]:
# length of tokens in train
count_org(news_train["Tag"].tolist())

Counter({1: 260, 2: 102, 3: 48, 4: 6, 5: 1})

In [24]:
# length of tokens in dev
count_org(news_dev["Tag"].tolist())

Counter({1: 36, 2: 7, 3: 4})

In [25]:
# length of tokens in test
count_org(news_test["Tag"].tolist())

Counter({1: 35, 2: 21, 3: 5})