In [202]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import os

In [203]:
# Constant paths
DATA_FOLDER = 'data/'
PICKLES_FOLDER = 'pickles/'
WIKILITE_FOLDER = DATA_FOLDER + 'wiki_lite/'
SUBMISSIONS_FOLDER = 'submissions/'

# Create folders if they don't exist
if not os.path.exists(PICKLES_FOLDER):
    os.makedirs(PICKLES_FOLDER)

if not os.path.exists(SUBMISSIONS_FOLDER):
    os.makedirs(SUBMISSIONS_FOLDER)

In [204]:
try:
    # Load data from pickles
    train_df = pd.read_pickle(PICKLES_FOLDER + 'train_df.pkl')
    test_df = pd.read_pickle(PICKLES_FOLDER + 'test_df.pkl')
    
    enwinki_redirects = pd.read_pickle(PICKLES_FOLDER + 'enwinki_redirects.pkl')
    item_aliases = pd.read_pickle(PICKLES_FOLDER + 'item_aliases.pkl')
    properties = pd.read_pickle(PICKLES_FOLDER + 'properties.pkl')
    statements = pd.read_pickle(PICKLES_FOLDER + 'statements.pkl')
    wiki_items = pd.read_pickle(PICKLES_FOLDER + 'wiki_items.pkl')
except:
    # Load data from csv
    train_df = pd.read_csv(DATA_FOLDER + 'train.csv')
    test_df = pd.read_csv(DATA_FOLDER + 'test.csv')
    
    enwinki_redirects = pd.read_csv(WIKILITE_FOLDER + 'enwiki_redirects.tsv', sep='\t')
    item_aliases = pd.read_csv(WIKILITE_FOLDER + 'item_aliases.csv')
    properties = pd.read_csv(WIKILITE_FOLDER + 'property.csv')
    statements = pd.read_csv(WIKILITE_FOLDER + 'statements.csv')
    wiki_items = pd.read_csv(WIKILITE_FOLDER + 'wiki_items.csv')
  
    # Save to pickles  
    train_df.to_pickle(PICKLES_FOLDER + 'train_df.pkl')
    test_df.to_pickle(PICKLES_FOLDER + 'test_df.pkl')
    
    enwinki_redirects.to_pickle(PICKLES_FOLDER + 'enwinki_redirects.pkl')
    item_aliases.to_pickle(PICKLES_FOLDER + 'item_aliases.pkl')
    properties.to_pickle(PICKLES_FOLDER + 'properties.pkl')
    statements.to_pickle(PICKLES_FOLDER + 'statements.pkl')
    wiki_items.to_pickle(PICKLES_FOLDER + 'wiki_items.pkl')

## Data exploration

In [205]:
print('Redirects:')
display(enwinki_redirects)

print('Aliases:')
display(item_aliases)

print('Properties:')
display(properties)

print('Statements:')
display(statements)

print('Wiki items:')
display(wiki_items)

Redirects:


Unnamed: 0,!,Exclamation mark
0,! (CONFIG.SYS directive),CONFIG.SYS
1,! (Donnie Vie Album),Donnie Vie
2,! (Donnie Vie album),Donnie Vie
3,! (The Song Formerly Known As),Unit (album)
4,! (album),! (disambiguation)
...,...,...
8558897,󠁽,Tags (Unicode block)
8558898,󠁾,Tags (Unicode block)
8558899,󠁿,Tags (Unicode block)
8558900,􍁷,Private Use Areas


Aliases:


Unnamed: 0,item_id,en_alias
0,1,Our Universe
1,1,The Universe
2,1,The Cosmos
3,1,cosmos
4,2,Blue Planet
...,...,...
1495310,76858465,Photinus luciferin 4-monooxygenase (adenosine ...
1495311,76858465,firefly luciferase
1495312,76868858,12 Canum Venaticorum
1495313,76868858,Alpha Canum Venaticorum


Properties:


Unnamed: 0,property_id,en_label,en_description
0,6,head of government,"head of the executive power of this town, city..."
1,10,video,"relevant video. For images, use the property P..."
2,14,traffic sign,"graphic symbol describing the item, used at th..."
3,15,route map,image of route map at Wikimedia Commons
4,16,highway system,system (or specific country specific road type...
...,...,...,...
6980,7663,Scienza a due voci ID,identifier for an Italian female scientist in ...
6981,7665,FMV World ID,identifier of a video game in the FMV World da...
6982,7666,Visuotinė lietuvių enciklopedija ID,identifier for an item in the online edition o...
6983,7667,Hellenic Civil Aviation Authority airport code,


Statements:


Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,398,497745
1,1,398,1133705
2,1,398,1139177
3,1,398,7439451
4,1,398,15241043
...,...,...,...
26903183,77240068,31,16970
26903184,77240068,131,23556
26903185,77242291,17,145
26903186,77242291,131,23311


Wiki items:


Unnamed: 0,item_id,en_label,en_description,wikipedia_title
0,1,Universe,totality of space and all contents,Universe
1,2,Earth,third planet from the Sun in the Solar System,Earth
2,3,life,matter capable of extracting energy from the e...,Life
3,4,death,permanent cessation of vital functions,Death
4,5,human,"common name of Homo sapiens, unique extant spe...",Human
...,...,...,...,...
5216231,77042017,HR 4523,,HD 102365
5216232,77043280,Charlie Johnston,,Charlie Johnstone
5216233,77231860,Aldo Rossi,musician,Aldo Rossi (musician)
5216234,77240068,Ebenezer Baptist Church,"church in Atlanta, Georgia, USA",Ebenezer Baptist Church


Merging `item_aliases` and `wiki_items` on `item_id` to get the `wikipedia_title` for each `en_alias`:

In [206]:
merged_wiki_items = wiki_items.merge(item_aliases, how='left', on='item_id')
merged_wiki_items.head()

Unnamed: 0,item_id,en_label,en_description,wikipedia_title,en_alias
0,1,Universe,totality of space and all contents,Universe,Our Universe
1,1,Universe,totality of space and all contents,Universe,The Universe
2,1,Universe,totality of space and all contents,Universe,The Cosmos
3,1,Universe,totality of space and all contents,Universe,cosmos
4,2,Earth,third planet from the Sun in the Solar System,Earth,Blue Planet


## Trying to find links to test tokens in the wiki items and train tokens

In [207]:
test_df_mod = test_df.copy(deep=True)
train_df_mod = train_df.copy(deep=True)

In [208]:
# Beginning of the URL to wikipedia
URL = 'http://en.wikipedia.org/wiki/'

In [209]:
train_df_mod['full_mention'] = train_df_mod['full_mention'].str.lower()

# We only keep the tokens that have a wiki_url
train_df_mod = train_df_mod[train_df_mod['wiki_url'].notnull() & (train_df_mod['wiki_url'] != '--NME--')]
train_df_mod.head()

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url
3,4,German,B,german,http://en.wikipedia.org/wiki/Germany
7,8,British,B,british,http://en.wikipedia.org/wiki/United_Kingdom
14,15,BRUSSELS,B,brussels,http://en.wikipedia.org/wiki/Brussels
18,19,European,B,european commission,http://en.wikipedia.org/wiki/European_Commission
26,27,German,B,german,http://en.wikipedia.org/wiki/Germany


In [210]:
test_df_mod['full_mention'] = test_df_mod['full_mention'].str.lower()
test_df_mod.head()

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url
0,0,-DOCSTART- (947testa CRICKET),,,
1,1,CRICKET,,,
2,2,-,,,
3,3,LEICESTERSHIRE,B,leicestershire,?
4,4,TAKE,,,


In [211]:
merged_wiki_items['en_alias'] = merged_wiki_items['en_alias'].str.lower()
merged_wiki_items['wikipedia_title_lower'] = merged_wiki_items['wikipedia_title'].str.lower()

For each token in `test_df_mod`, we try to find a link in `train_df_mod` and `merged_wiki_items`:

### 1. Just pandas

In [212]:
pandas = False

In [213]:
if pandas:
    aliases_matching = 0
    
    for index, row in tqdm(test_df_mod.iterrows(), total=test_df_mod.shape[0]):
        if str(row['wiki_url']) == 'nan' or row['wiki_url'] != '?':
            continue
            
        token = row['full_mention']
        train_url = train_df_mod[train_df_mod['full_mention'] == token]['wiki_url'].values
        
        if len(train_url) > 0:
            test_df_mod.at[index, 'wiki_url'] = train_url[0]
            
        else:
            wiki_title = merged_wiki_items[(merged_wiki_items['wikipedia_title_lower'] == token) | (merged_wiki_items['en_alias'] == token)]['wikipedia_title'].values
            
            if len(wiki_title) > 0:
                aliases_matching += 1
                test_df_mod.at[index, 'wiki_url'] = URL + wiki_title[0].replace(' ', '_')

### 2. Using dictionaries

In [214]:
wiki_items_dict = pd.Series(merged_wiki_items['wikipedia_title'].values, index=merged_wiki_items['en_alias']).to_dict()

wiki_items['wikipedia_title_lower'] = wiki_items['wikipedia_title'].str.lower()
wiki_titles_dict = pd.Series(wiki_items['wikipedia_title'].values, index=wiki_items['wikipedia_title_lower']).to_dict()

train_df_dict = pd.Series(train_df_mod['wiki_url'].values, index=train_df_mod['full_mention']).to_dict()

In [216]:
dictionaries = True

In [217]:
if dictionaries:
    aliases_matching = 0
    
    for index, row in tqdm(test_df_mod.iterrows(), total=test_df_mod.shape[0]):
        if str(row['wiki_url']) == 'nan' or row['wiki_url'] != '?':
            continue
            
        token = row['full_mention']
        train_url = train_df_dict.get(token)
        
        if train_url is not None:
            test_df_mod.at[index, 'wiki_url'] = train_url
            
        else:
            wiki_title = wiki_titles_dict.get(token)
            
            if wiki_title is not None:
                aliases_matching += 1
                test_df_mod.at[index, 'wiki_url'] = URL + wiki_title.replace(' ', '_')
            else:
                wiki_title = wiki_items_dict.get(token)
                
                if wiki_title is not None:
                    aliases_matching += 1
                    test_df_mod.at[index, 'wiki_url'] = URL + wiki_title.replace(' ', '_')

100%|██████████| 104890/104890 [00:02<00:00, 46332.80it/s]


### Let's look at the results

In [218]:
aliases_matching

350

Let's see how many tokens we found links for:

In [219]:
print(f"Previously we had {test_df[test_df['wiki_url'] == '?']['wiki_url'].count()} tokens without a link")
print(f"Now we have {test_df_mod[test_df_mod['wiki_url'] == '?']['wiki_url'].count()} tokens without a link")

Previously we had 9166 tokens without a link
Now we have 547 tokens without a link


In [220]:
submission = test_df_mod[['id', 'wiki_url']]
submission.loc[:, 'wiki_url'] = submission['wiki_url'].apply(lambda x: 'NOT_FOUND' if not str(x).startswith('http') else x)

name = 'submission_train_title_alias_dict.csv'

submission.to_csv(SUBMISSIONS_FOLDER + name, index=False)

### Looking at differences in results

In [221]:
name_1 = 'submission_train_title_alias_lower_separate.csv'
name_2 = 'submission_train_alias_title_dict.csv'

df1 = pd.read_csv(SUBMISSIONS_FOLDER + name_1)
df2 = pd.read_csv(SUBMISSIONS_FOLDER + name_2)

diff = pd.merge(df1[df1['wiki_url'] != df2['wiki_url']], df2[df1['wiki_url'] != df2['wiki_url']], how='inner', on='id')
diff

Unnamed: 0,id,wiki_url_x,wiki_url_y
0,69,http://en.wikipedia.org/wiki/Surrey_County_Cri...,http://en.wikipedia.org/wiki/Surrey
1,143,http://en.wikipedia.org/wiki/Simmons,http://en.wikipedia.org/wiki/Simmons_University
2,264,http://en.wikipedia.org/wiki/Surrey_County_Cri...,http://en.wikipedia.org/wiki/Surrey
3,315,http://en.wikipedia.org/wiki/Surrey_County_Cri...,http://en.wikipedia.org/wiki/Surrey
4,519,http://en.wikipedia.org/wiki/Chester-le-Street,http://en.wikipedia.org/wiki/Riverside_Ground
...,...,...,...
1169,104572,http://en.wikipedia.org/wiki/Seville,http://en.wikipedia.org/wiki/Sevilla_FC
1170,104590,http://en.wikipedia.org/wiki/Heracles,http://en.wikipedia.org/wiki/USS_Asheville_(PF-1)
1171,104617,http://en.wikipedia.org/wiki/The_Irishman_(197...,http://en.wikipedia.org/wiki/The_Irishman
1172,104819,http://en.wikipedia.org/wiki/1998_FIFA_World_Cup,http://en.wikipedia.org/wiki/Cricket_World_Cup
