In [1]:
import pandas as pd
from pathlib import Path
from pprint import pprint

# Setup

In [2]:
path_to_entity_data = Path("../../data/raw/entities/")
path_to_processed_data = Path("../../data/processed/entities")
path_to_test_data = Path("../../data") / "test"
path_to_output_data = Path("../../data/processed/")

In [3]:
# Example to print the first few lines of the file
file_path = path_to_entity_data / 'wikigold.conll.txt'
with open(file_path, 'r') as file:
    for _ in range(5):
        print(file.readline().strip())

010 I-MISC
is O
the O
tenth O
album O


In [27]:
entities = pd.read_csv(path_to_entity_data / 'wikigold.conll.txt', sep=" ", names=['Token', 'Tag'], header=None, skip_blank_lines=False)


In [28]:
entities.columns = entities.columns.str.lower()
entities.token = entities.token.str.lower()

In [29]:
# Using case=False to make the search case-insensitive
entities[entities['token'].fillna('').str.contains("merkel", case=False)]

Unnamed: 0,token,tag


# Entity Data

In [4]:
entity_data = pd.read_csv(path_to_entity_data / "ner_dataset.csv", delimiter=",", encoding="latin1")

In [5]:
entity_data.columns

Index(['Sentence #', 'Word', 'POS', 'Tag'], dtype='object')

In [6]:
entity_data.columns = entity_data.columns.str.lower()

In [7]:
# we only need the words and the tags
entity_data = entity_data.drop(columns=["sentence #", "pos"])

In [8]:
entity_data.head()

Unnamed: 0,word,tag
0,Thousands,O
1,of,O
2,demonstrators,O
3,have,O
4,marched,O


In [9]:
len(entity_data)

1048575

In [10]:
pprint(entity_data.tag.unique().tolist(), compact=True)

['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim', 'B-art',
 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve', 'I-eve', 'I-nat']


In [11]:
# we only need people
relevant_tags = ["B-per", "I-per"]

In [12]:
entity_data = entity_data[entity_data['tag'].isin(relevant_tags)]

In [13]:
len(entity_data)

34241

In [14]:
entity_data.loc[:, 'word'] = entity_data['word'].str.lower()

In [15]:
entity_data.head()

Unnamed: 0,word,tag
42,bush,B-per
270,president,B-per
271,mahmoud,I-per
272,ahmadinejad,I-per
331,thomas,B-per


In [16]:
entity_data.to_csv(path_to_output_data / "entity-list.csv", index=False)

# Test-Data

In [4]:
test_data = pd.read_csv(path_to_test_data / "testset_1.csv")

In [5]:
len(test_data)

11027

In [18]:
test_data.columns = ['truth', 'equivalent']
test_data = test_data.sort_values(by='truth', ascending=True)
test_cases = test_data['truth'].unique()
print("Testset")
print(f"Testcases (all): {len(test_data)}")
print(f"Testcases (unique): {len(test_cases)}")

Testset
Testcases (all): 11027
Testcases (unique): 499


In [19]:
test_data

Unnamed: 0,truth,equivalent
8370,49ers-1990,patriots-2015
8351,49ers-1990,cowboys-1993
8352,49ers-1990,cowboys-1994
8353,49ers-1990,49ers-1995
8354,49ers-1990,cowboys-1996
...,...,...
9632,zika-2016,ebola-2015
9630,zika-2016,sars-2003
9629,zika-2016,plague-1994
9628,zika-2016,cholera-1990


In [20]:
test_data['word'] = test_data['truth'].apply(lambda x: x.split('-')[0])

In [21]:
test_data

Unnamed: 0,truth,equivalent,word
8370,49ers-1990,patriots-2015,49ers
8351,49ers-1990,cowboys-1993,49ers
8352,49ers-1990,cowboys-1994,49ers
8353,49ers-1990,49ers-1995,49ers
8354,49ers-1990,cowboys-1996,49ers
...,...,...,...
9632,zika-2016,ebola-2015,zika
9630,zika-2016,sars-2003,zika
9629,zika-2016,plague-1994,zika
9628,zika-2016,cholera-1990,zika


In [22]:
merged_data = pd.merge(test_data, entity_data, how='left')

In [23]:
merged_data

Unnamed: 0,truth,equivalent,word,tag
0,49ers-1990,patriots-2015,49ers,
1,49ers-1990,cowboys-1993,49ers,
2,49ers-1990,cowboys-1994,49ers,
3,49ers-1990,49ers-1995,49ers,
4,49ers-1990,cowboys-1996,49ers,
...,...,...,...,...
557940,zika-2016,ebola-2015,zika,
557941,zika-2016,sars-2003,zika,
557942,zika-2016,plague-1994,zika,
557943,zika-2016,cholera-1990,zika,


In [24]:
null_tag_rows = merged_data[merged_data['tag'].isna()]

In [25]:
null_tag_rows["word"].unique()

array(['49ers', 'athens', 'atlanta', 'barcelona', 'bates', 'beijing',
       'berry', 'blasio', 'brennan', 'bridges', 'brody', 'broncos',
       'bulls', 'cage', 'cavaliers', 'celtics', 'cholera', 'colts',
       'courier', 'cowboys', 'crowe', 'dempsey', 'dicaprio', 'dinkins',
       'ebola', 'edberg', 'euro', 'freeh', 'giants', 'hanks', 'heat',
       'hollande', 'hopkins', 'katrina', 'kidman', 'knicks', 'lakers',
       'lange', 'larson', 'malaria', 'mavericks', 'mitterrand', 'modi',
       'nicholson', 'pacino', 'patriots', 'pistons', 'plague', 'quayle',
       'rams', 'ravens', 'redskins', 'rockets', 'sampras', 'sars',
       'seahawks', 'sessions', 'shelton', 'spacey', 'spurs', 'steelers',
       'streep', 'warrior', 'webster', 'whitaker', 'yeltsin', 'zika'],
      dtype=object)

In [26]:
merged_data['tag'] = merged_data['tag'].apply(lambda x: "unknown" if pd.isna(x) else x)

In [27]:
merged_data.to_csv(path_to_test_data / "testset_1_enriched.csv", index=False)

In [28]:
merged_data

Unnamed: 0,truth,equivalent,word,tag
0,49ers-1990,patriots-2015,49ers,unknown
1,49ers-1990,cowboys-1993,49ers,unknown
2,49ers-1990,cowboys-1994,49ers,unknown
3,49ers-1990,49ers-1995,49ers,unknown
4,49ers-1990,cowboys-1996,49ers,unknown
...,...,...,...,...
557940,zika-2016,ebola-2015,zika,unknown
557941,zika-2016,sars-2003,zika,unknown
557942,zika-2016,plague-1994,zika,unknown
557943,zika-2016,cholera-1990,zika,unknown
