In [1]:
import pandas as pd
from pathlib import Path
from pprint import pprint

# Setup

In [47]:
# Data directories, all relative to the location of this notebook.
path_to_output_data = Path("../../data") / "processed"
path_to_entity_data = path_to_output_data / "entities"
# NOTE(review): resolves to the same directory as path_to_entity_data —
# confirm whether a different folder was intended.
path_to_processed_data = path_to_output_data / "entities"
path_to_test_data = Path("../../data/test")

# Entity Data

In [49]:
# Read the comma-separated entity file, decoding it as latin1.
entity_data = pd.read_csv(
    path_to_entity_data / "persons_pantheon_data.csv",
    sep=",",
    encoding="latin1",
)

In [50]:
# Show which columns the entity file provides.
entity_data.columns

Index(['name', 'domain'], dtype='object')

In [51]:
# Normalise the column labels to lowercase.
entity_data.columns = entity_data.columns.map(str.lower)

In [52]:
# Keep only the entity names: the 'domain' column is dropped because only
# 'name' is used in the cells below.
# errors="ignore" makes this cell idempotent — re-running it after 'domain'
# has already been removed no longer raises a KeyError.
entity_data = entity_data.drop(columns=["domain"], errors="ignore")

In [53]:
# Preview the first rows after dropping the 'domain' column.
entity_data.head()

Unnamed: 0,name
0,"""noynoy"""
1,'ulukalala
2,13th
3,14th
4,`abbas


In [54]:
# Number of rows in the entity list.
entity_data.shape[0]

12123

Convert tokens to lowercase

In [55]:
# Lowercase every entity name and write the result back into the same column.
lowercased_names = entity_data["name"].str.lower()
entity_data.loc[:, "name"] = lowercased_names

In [56]:
# Preview the first rows to check the lowercased names.
entity_data.head()

Unnamed: 0,name
0,"""noynoy"""
1,'ulukalala
2,13th
3,14th
4,`abbas


In [57]:
# Write the entity names to disk, omitting the DataFrame index.
entity_list_file = path_to_output_data / "entity-list.csv"
entity_data.to_csv(entity_list_file, index=False)

# Test Data

In [58]:
# Load the first test set with pandas' default CSV settings.
testset_file = path_to_test_data / "testset_1.csv"
test_data = pd.read_csv(testset_file)

In [59]:
# Number of rows in the test set.
test_data.shape[0]

11027

In [60]:
# Name the two columns, order the rows by the 'truth' label and report how many
# rows and how many distinct 'truth' labels the test set contains.
test_data.columns = ["truth", "equivalent"]
test_data = test_data.sort_values("truth")
test_cases = test_data["truth"].unique()
print(
    "Testset",
    f"Testcases (all): {len(test_data)}",
    f"Testcases (unique): {len(test_cases)}",
    sep="\n",
)

Testset
Testcases (all): 11027
Testcases (unique): 499


In [62]:
# Derive the entity name from the 'truth' label, which in the displayed rows
# has the form "<name>-<year>" (e.g. "agassi-1999").
# Split on the LAST hyphen only, so a hyphenated name keeps its full form
# ("day-lewis-2012" -> "day-lewis" instead of "day"); a label without any
# hyphen is kept unchanged. The vectorised .str accessor replaces the
# row-wise apply/lambda.
test_data['name'] = test_data['truth'].str.rsplit('-', n=1).str[0]

In [80]:
# Show the extracted entity names.
test_data["name"]

8370    49ers
8351    49ers
8352    49ers
8353    49ers
8354    49ers
        ...  
9632     zika
9630     zika
9629     zika
9628     zika
9631     zika
Name: name, Length: 11027, dtype: object

In [83]:
# Spot check: is the test name "49ers" present in the entity list?
entity_data.loc[entity_data["name"] == "49ers", "name"]

Series([], Name: name, dtype: object)

In [87]:
# Keep only the test rows whose entity name occurs in the entity list.
# The entity names are de-duplicated first: a name that appears more than once
# in entity_data would otherwise make the inner join emit every matching test
# row once per duplicate. When the names are already unique the result is
# unchanged.
merged_data = pd.merge(
    test_data,
    entity_data.drop_duplicates(subset="name"),
    how="inner",
    on="name",
)

In [88]:
# Inspect the result of the inner merge on 'name'.
merged_data

Unnamed: 0,truth,equivalent,name
0,agassi-1999,djokovic-2015,agassi
1,agassi-1999,sampras-1994,agassi
2,agassi-1999,djokovic-2014,agassi
3,agassi-1999,nadal-2013,agassi
4,agassi-1999,djokovic-2012,agassi
...,...,...,...
8266,yeltsin-1999,putin-2000,yeltsin
8267,yeltsin-1999,putin-2001,yeltsin
8268,yeltsin-1999,putin-2002,yeltsin
8269,yeltsin-1999,putin-2004,yeltsin


In [94]:
# Store the filtered test set next to the original one, omitting the index.
filtered_testset_file = path_to_test_data / "testset_1_filtered.csv"
merged_data.to_csv(filtered_testset_file, index=False)

In [95]:
# Display the merged frame that was just written to testset_1_filtered.csv.
merged_data

Unnamed: 0,truth,equivalent,name
0,agassi-1999,djokovic-2015,agassi
1,agassi-1999,sampras-1994,agassi
2,agassi-1999,djokovic-2014,agassi
3,agassi-1999,nadal-2013,agassi
4,agassi-1999,djokovic-2012,agassi
...,...,...,...
8266,yeltsin-1999,putin-2000,yeltsin
8267,yeltsin-1999,putin-2001,yeltsin
8268,yeltsin-1999,putin-2002,yeltsin
8269,yeltsin-1999,putin-2004,yeltsin
