# Project 3. InfoExplorers.

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import os
from tqdm import tqdm
from tqdm.auto import tqdm
import networkx as nx
import heapq
import random

## Data loading

In [73]:
# Constant paths
DATA_FOLDER = 'data/'
PICKLES_FOLDER = 'pickles/'
WIKILITE_FOLDER = DATA_FOLDER + 'wiki_lite/'
SUBMISSIONS_FOLDER = 'submissions/'
CORRECTED_FOLDER = 'corrected/'

# Create folders if they don't exist
if not os.path.exists(PICKLES_FOLDER):
    os.makedirs(PICKLES_FOLDER)

if not os.path.exists(SUBMISSIONS_FOLDER):
    os.makedirs(SUBMISSIONS_FOLDER)

In [23]:
# Train data loading
try:
    train_df = pd.read_pickle(PICKLES_FOLDER + 'train_df.pkl')
except:
    train_df = pd.read_csv(DATA_FOLDER + 'train.csv')
    train_df.to_pickle(PICKLES_FOLDER + 'train_df.pkl')

# Test data loading  
try:
    test_df = pd.read_pickle(PICKLES_FOLDER + 'test_df.pkl')
except:
    test_df = pd.read_csv(DATA_FOLDER + 'test.csv')
    test_df.to_pickle(PICKLES_FOLDER + 'test_df.pkl')

# Redirects loading
try:
    enwinki_redirects = pd.read_pickle(PICKLES_FOLDER + 'enwiki_redirects.pkl')
except:
    enwinki_redirects = pd.read_csv(WIKILITE_FOLDER + 'enwiki_redirects.tsv', names=['en_title', 'en_redirect_title'],
                                    sep='\t')
    enwinki_redirects.to_pickle(PICKLES_FOLDER + 'enwiki_redirects.pkl')

# Aliases loading  
try:
    item_aliases = pd.read_pickle(PICKLES_FOLDER + 'item_aliases.pkl')
except:
    item_aliases = pd.read_csv(WIKILITE_FOLDER + 'item_aliases.csv')
    item_aliases.to_pickle(PICKLES_FOLDER + 'item_aliases.pkl')

# Properties loading
try:
    properties = pd.read_pickle(PICKLES_FOLDER + 'property.pkl')
except:
    properties = pd.read_csv(WIKILITE_FOLDER + 'property.csv')
    properties.to_pickle(PICKLES_FOLDER + 'property.pkl')

# Statements loading
try:
    statements = pd.read_pickle(PICKLES_FOLDER + 'statements.pkl')
except:
    statements = pd.read_csv(WIKILITE_FOLDER + 'statements.csv')
    statements.to_pickle(PICKLES_FOLDER + 'statements.pkl')

# Wiki items loading 
try:
    wiki_items = pd.read_pickle(PICKLES_FOLDER + 'wiki_items.pkl')
except:
    wiki_items = pd.read_csv(WIKILITE_FOLDER + 'wiki_items.csv')
    wiki_items.to_pickle(PICKLES_FOLDER + 'wiki_items.pkl')

## Part 1: Using existing datasets

### Data preprocessing

Merging `item_aliases` and `wiki_items` on `item_id` to get the `wikipedia_title` for each `en_alias`:

In [24]:
merged_wiki_items = wiki_items.merge(item_aliases, how='left', on='item_id')
merged_wiki_items.head()

Unnamed: 0,item_id,en_label,en_description,wikipedia_title,en_alias
0,1,Universe,totality of space and all contents,Universe,Our Universe
1,1,Universe,totality of space and all contents,Universe,The Universe
2,1,Universe,totality of space and all contents,Universe,The Cosmos
3,1,Universe,totality of space and all contents,Universe,cosmos
4,2,Earth,third planet from the Sun in the Solar System,Earth,Blue Planet


In [25]:
# Copying the dataframes to modify them
test_df_mod = test_df.copy(deep=True)
train_df_mod = train_df.copy(deep=True)

In [65]:
# Beginning of the URL to wikipedia
URL = 'http://en.wikipedia.org/wiki/'
LEN_URL = len(URL)

In [27]:
# We need to lowercase the tokens to avoid problems with case in future steps
train_df_mod['full_mention_lower'] = train_df_mod['full_mention'].str.lower()

# We only keep the tokens that have a wiki_url
train_df_mod = train_df_mod[train_df_mod['wiki_url'].notnull() & (train_df_mod['wiki_url'] != '--NME--')]

# We also lowercase the tokens in the test data for future steps
test_df_mod['full_mention'] = test_df_mod['full_mention'].str.lower()

We transform:
- `en_alias` and `wikipedia_title` in `merged_wiki_items`
- `wikipedia_title` in `wiki_items`
- `en_title` in `redirects`

to lowercase to avoid problems with case in future steps

In [28]:
merged_wiki_items['en_alias_lower'] = merged_wiki_items['en_alias'].str.lower()
merged_wiki_items['wikipedia_title_lower'] = merged_wiki_items['wikipedia_title'].str.lower()

wiki_items['wikipedia_title_lower'] = wiki_items['wikipedia_title'].str.lower()

enwinki_redirects['en_title_lower'] = enwinki_redirects['en_title'].str.lower()

### Creating dictionaries for faster lookup

In [29]:
DICT_FOLDER = PICKLES_FOLDER + 'dictionaries/'

In [30]:
try:
    aliases_dict = pickle.load(open(DICT_FOLDER + 'aliases_dict.pkl', 'rb'))
except:
    aliases_dict = pd.Series(merged_wiki_items['wikipedia_title'].values,
                             index=merged_wiki_items['en_alias_lower']).to_dict()
    pickle.dump(aliases_dict, open(DICT_FOLDER + 'aliases_dict.pkl', 'wb'))

In [31]:
try:
    titles_dict = pickle.load(open(DICT_FOLDER + 'titles_dict.pkl', 'rb'))
except:
    titles_dict = pd.Series(wiki_items['wikipedia_title'].values, index=wiki_items['wikipedia_title_lower']).to_dict()
    pickle.dump(titles_dict, open(DICT_FOLDER + 'titles_dict.pkl', 'wb'))


In [32]:
try:
    train_dict = pickle.load(open(DICT_FOLDER + 'train_dict.pkl', 'rb'))
except:
    train_dict = pd.Series(train_df_mod['wiki_url'].values, index=train_df_mod['full_mention_lower']).to_dict()
    pickle.dump(train_dict, open(DICT_FOLDER + 'train_dict.pkl', 'wb'))

In [33]:
try:
    redirects_dict = pickle.load(open(DICT_FOLDER + 'redirects_dict.pkl', 'rb'))
except:
    redirects_dict = pd.Series(enwinki_redirects['en_redirect_title'].values,
                               index=enwinki_redirects['en_title_lower']).to_dict()
    pickle.dump(redirects_dict, open(DICT_FOLDER + 'redirects_dict.pkl', 'wb'))

In [34]:
# We want to keep the count of the number of matches we find with aliases and redirects
aliases_matching = 0
redirects_matching = 0

for index, row in tqdm(test_df_mod.iterrows(), total=test_df_mod.shape[0]):
    if str(row['wiki_url']) == 'nan' or row['wiki_url'] != '?':
        continue

    token = row['full_mention']
    train_url = train_dict.get(token)

    if train_url is not None:
        # We found a link in the train data and it is the true identity so we can use it
        test_df_mod.at[index, 'wiki_url'] = train_url
        continue

    else:
        # TODO: explain the order of the if statements in the report
        wiki_title = titles_dict.get(token)

        if wiki_title is not None:
            aliases_matching += 1

        else:
            wiki_title = aliases_dict.get(token)

            if wiki_title is not None:
                aliases_matching += 1

    if wiki_title is not None:
        redirect_title = redirects_dict.get(wiki_title.lower())

        if redirect_title is not None:
            redirects_matching += 1
            test_df_mod.at[index, 'wiki_url'] = URL + redirect_title.replace(' ', '_')

        else:
            test_df_mod.at[index, 'wiki_url'] = URL + wiki_title.replace(' ', '_')


  0%|          | 0/104890 [00:00<?, ?it/s]

In [35]:
aliases_matching

2107

In [36]:
redirects_matching

393

Let's see how many tokens we found links for:

In [37]:
print(f"Previously we had {test_df[test_df['wiki_url'] == '?']['wiki_url'].count()} tokens without a link")
print(f"Now we have {test_df_mod[test_df_mod['wiki_url'] == '?']['wiki_url'].count()} tokens without a link")

Previously we had 9166 tokens without a link
Now we have 547 tokens without a link


### Creating submission file for second part

In [38]:
result_first_part = test_df_mod[['id', 'wiki_url']]
result_first_part.loc[:, 'wiki_url'] = result_first_part['wiki_url'].apply(lambda x: 'NOT_FOUND' if not (str(x).startswith('http') or str(x) == '?') else x)
# result_first_part.loc[:, 'wiki_url'] = result_first_part['wiki_url'].apply(lambda x: 'NOT_FOUND' if not (str(x).startswith('http')) else x)

# name = 'tr_title_alias_dict_redirects_url'
# 
# result_first_part.to_csv(SUBMISSIONS_FOLDER + name + '.csv', index=False)

## Part 2: Knowledge Graph

In [42]:
def get_connection(only_train=False):
    """
    Function retrieving the connection between the wiki_items from the statements.csv file.
    
    Parameters
    ----------
    only_train: bool, optional (default=False)
        If True, only the connections between the items in the train data are returned.
    
    Returns
    -------  
    connection: pd.DataFrame
        DataFrame containing the connections between the wiki_items.
    """
    
    connection = statements
    if only_train:
        list_of_ids = pd.read_csv('train_wiki.csv')  # obtain by running get_wiki.ipynb
        list_of_ids = list_of_ids.item_id.tolist()
        all_ids = set(connection['source_item_id'].unique()).union(set(connection['target_item_id'].unique()))

        filtered_ids = set(list_of_ids).intersection(all_ids)

        connection = connection[
            connection['source_item_id'].isin(filtered_ids) | connection['target_item_id'].isin(filtered_ids)]
    return connection


In [43]:
def add_edges(G, connection, progress=True):
    """
    Function adding the edges to the graph.
    
    Parameters
    ----------
    G: nx.Graph
        Graph to which the edges are added.
    connection: pd.DataFrame
        DataFrame containing the connections between the wiki_items.
    progress: bool, optional (default=True)
        If True, a progress bar is shown.
        
    Returns
    -------
    G: nx.Graph
        Graph with the added edges. 
    """
    
    if progress:
        for source_item_id, _, target_item_id in tqdm(connection.iloc, total=len(connection)):
            G.add_edge(source_item_id, target_item_id)
    else:
        G = nx.from_pandas_edgelist(connection, 'source_item_id', 'target_item_id', create_using=nx.Graph)
    return G

In [44]:
connection = get_connection()

In [45]:
G = nx.Graph()  # Undirected Graph
G = add_edges(G, connection, progress=True)

  0%|          | 0/26903188 [00:00<?, ?it/s]

In [46]:
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")

Number of nodes: 4906271
Number of edges: 24528246


In [47]:
pickle_filename = "graph_undirected_full.pkl"

# Save graph pickle to the file
with open(PICKLES_FOLDER + pickle_filename, 'wb') as pickle_file:
    pickle.dump(G, pickle_file)

In [48]:
# Specify the filename for the pickle file
pickle_filename = "graph_undirected_full.pkl"

# Load the graph from the pickle file
with open(PICKLES_FOLDER + pickle_filename, 'rb') as pickle_file:
    G = pickle.load(pickle_file).to_undirected()

In [78]:
# Taking the result from the first part
partial = result_first_part

testdf = test_df.copy(deep=True)
testdf = testdf.merge(partial, on='id')
display(testdf)

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url_x,wiki_url_y
0,0,-DOCSTART- (947testa CRICKET),,,,NOT_FOUND
1,1,CRICKET,,,,NOT_FOUND
2,2,-,,,,NOT_FOUND
3,3,LEICESTERSHIRE,B,LEICESTERSHIRE,?,http://en.wikipedia.org/wiki/Leicestershire_Co...
4,4,TAKE,,,,NOT_FOUND
...,...,...,...,...,...,...
104885,104885,brother,,,,NOT_FOUND
104886,104886,",",,,,NOT_FOUND
104887,104887,Bobby,B,Bobby,?,http://en.wikipedia.org/wiki/Bobby
104888,104888,.,,,,NOT_FOUND


In [58]:
redirect = enwinki_redirects
temp = testdf[partial.wiki_url == "?"].merge(redirect, left_on='full_mention', right_on='en_title')[['id', 'en_redirect_title']]

display(temp)

Unnamed: 0,id,en_redirect_title
0,795,Universities in the United Kingdom
1,975,Universities in the United Kingdom
2,4396,David Barr
3,4402,Michael Sullivan
4,4552,United States Amateur Championships
...,...,...
342,103998,Predrag Mijatović
343,104041,Predrag Mijatović
344,104124,Mijatović
345,104310,De Graafschap


In [79]:
testdf = testdf.merge(temp, on='id', how='left')
testdf['en_redirect_title'] = testdf.apply(lambda row: row.full_mention if pd.isna(row['en_redirect_title']) else row['en_redirect_title'], axis=1)

testdf = testdf.drop(columns=['wiki_url_x', 'full_mention'])
testdf.rename(columns={'wiki_url_y': 'wiki_url'}, inplace=True)
display(testdf)

Unnamed: 0,id,token,entity_tag,wiki_url,en_redirect_title
0,0,-DOCSTART- (947testa CRICKET),,NOT_FOUND,
1,1,CRICKET,,NOT_FOUND,
2,2,-,,NOT_FOUND,
3,3,LEICESTERSHIRE,B,http://en.wikipedia.org/wiki/Leicestershire_Co...,LEICESTERSHIRE
4,4,TAKE,,NOT_FOUND,
...,...,...,...,...,...
104885,104885,brother,,NOT_FOUND,
104886,104886,",",,NOT_FOUND,
104887,104887,Bobby,B,http://en.wikipedia.org/wiki/Bobby,Bobby
104888,104888,.,,NOT_FOUND,


In [62]:
wiki_item = wiki_items[['item_id', 'en_label', 'wikipedia_title']]
# wiki_item = wiki_item[wiki_item['item_id'].isin(filtered_ids)]

# wiki_item['en_label'] = wiki_item['en_label'] + " " + wiki_item['wikipedia_title']
col_ = wiki_item.wikipedia_title.str.lower().str
lower_case_wiki_item_titles = wiki_item.wikipedia_title.str.lower().str
display(wiki_item)

Unnamed: 0,item_id,en_label,wikipedia_title
0,1,Universe,Universe
1,2,Earth,Earth
2,3,life,Life
3,4,death,Death
4,5,human,Human
...,...,...,...
5216231,77042017,HR 4523,HD 102365
5216232,77043280,Charlie Johnston,Charlie Johnstone
5216233,77231860,Aldo Rossi,Aldo Rossi (musician)
5216234,77240068,Ebenezer Baptist Church,Ebenezer Baptist Church


In [80]:
testdf['wiki_title'] = testdf.wiki_url.str[LEN_URL:].str.replace('_', ' ')
testdf = testdf.merge(wiki_item, left_on='wiki_title', right_on='wikipedia_title', how='left').drop(
    columns=['wikipedia_title', 'en_label', 'wiki_title'])
display(testdf)

Unnamed: 0,id,token,entity_tag,wiki_url,en_redirect_title,item_id
0,0,-DOCSTART- (947testa CRICKET),,NOT_FOUND,,
1,1,CRICKET,,NOT_FOUND,,
2,2,-,,NOT_FOUND,,
3,3,LEICESTERSHIRE,B,http://en.wikipedia.org/wiki/Leicestershire_Co...,LEICESTERSHIRE,3229147.0
4,4,TAKE,,NOT_FOUND,,
...,...,...,...,...,...,...
104885,104885,brother,,NOT_FOUND,,
104886,104886,",",,NOT_FOUND,,
104887,104887,Bobby,B,http://en.wikipedia.org/wiki/Bobby,Bobby,289262.0
104888,104888,.,,NOT_FOUND,,


In [None]:
def get_dist(certain_list, candidate, fill_na=9999):
    """
    Function calculating the distance between the full mention and a candidate.
    
    Parameters
    ----------
    certain_list: list
        List containing the already assigned entities.
    candidate: float
        Candidate id.
        
    Returns
    -------
    distance: float
        Distance between the full mention and the candidate.
    """
    
    distance_list = []
    subset_certain = list(certain_list)
    random.shuffle(subset_certain)
    subset_size = 10
    subset_certain = subset_certain[:subset_size]
    
    for assigned_entity in subset_certain:
        try:
            shortest = nx.shortest_path_length(G, source=assigned_entity, target=candidate)
            distance_list.append(shortest)
        except:
            distance_list.append(fill_na)
            
    return sum(distance_list) / subset_size if distance_list else fill_na

In [69]:
def get_all_dist(filtered_ls, full_mention_train):
    """
    Function calculating the distance between the full mention and all the candidates.
    
    Parameters
    ----------
    filtered_ls: pd.DataFrame
        DataFrame containing the candidates.
    full_mention_train: list
        List of the already assigned entities.
        
    Returns
    -------
    distances: list
        List containing the distances between the full mention and all the candidates.
    """
    
    distances = []
    
    for candidate, _, title in filtered_ls.iloc:
        distances.append((title, candidate, get_dist(full_mention_train, candidate)))
        
    return distances

In [70]:
def get_info(current_document):
    """
    Function retrieving the information from the current document.
     
    Parameters
    ----------
    current_document: list
        List containing all the rows of the current document.
        
    Returns
    -------
    found: pd.DataFrame
        DataFrame containing the rows of the current document that have a link.
    not_found: pd.DataFrame
        DataFrame containing the rows of the current document that have to be assigned.
    item_id_train: set
        Set containing the ids of the entities that have a link.
    full_mention_found: dict
        Dictionary containing the full mentions that have a link and the corresponding link.
    mention_test: set
        Set containing the full mentions that have to be assigned.
    """
    
    if len(current_document):
        document_df = pd.DataFrame(current_document)
        full_mention_ = document_df[document_df['wiki_url'] != 'NOT_FOUND']
        found = full_mention_[full_mention_['wiki_url'] != '?']
        item_id_train = set(found.item_id.tolist())
        full_mention_found = dict(zip(found['full_mention'].str.lower(), found['wiki_url']))

        not_found = full_mention_[full_mention_['wiki_url'] == '?'].copy()
        not_found.full_mention = not_found.full_mention.str.lower()
        mention_test = set(not_found.full_mention.tolist())
        
        return found, not_found, item_id_train, full_mention_found, mention_test
    
    else:
        return None, None, None, None, None

In [87]:
def find_doc_range(df):
    """
    Function returning a zipped tuple of end indexes and start indexes of documents.
    """
    # TODO doc
    def check_docstart(row):
        """
        True if the current row is the row where the document starts.
        """
        if pd.notnull(row['token']) and 'DOCSTART' in row['token']:
            return True
        else:
            return False
    data = df.copy()
    data['docstart_id'] = df.apply(check_docstart, axis=1)
    start_ids = data[data['docstart_id']]['id'].values
    print(len(start_ids))
    end_ids = start_ids[1:] - 1
    end_ids = np.append(end_ids,len(df))
    docs_range = zip(start_ids, end_ids)
    
    return docs_range

In [None]:
def split_document_findings(document_dataframe):
    """
    Function retrieving and separating entities depending on weather they are associated with a found url or not.
    """
    #TODO doc.
    document = document_dataframe[document_dataframe['wiki_url'] != 'NOT_FOUND']
    found_links = document[document['wiki_url'] != '?']
    not_found_links = document[document['wiki_url'] == '?']
    return document, found_links, not_found_links

In [None]:
def find_equal_string_candidates(mention, full_mention_found):
    #TODO doc.
    return set([url for found_mention, url in full_mention_found.items() if mention in found_mention])

In [1]:
def find_substring_candidates(mention, saved_candidates):
    #TODO make sure all dataframe have a lower case version for comparaison and use it.
    #TODO doc.
    if mention not in saved_candidates.keys(): 
        candidates = wiki_item.loc[lower_case_wiki_item_titles.contains(r'\b{}\b'.format(mention), na=False)]
        saved_candidates[mention] = candidates
        return candidates
    else:
        return saved_candidates[uncertain_word]
    
    

SyntaxError: invalid syntax (3954033972.py, line 1)

In [None]:
def find_best_candidate(candidates_for_mention, found_ids):
    """
    Function returning the best candidates for the mention.
    """
    distances = get_all_dist(candidates_for_mention, found_ids)
    obtained_distances = len(distances)
    if obtained_distances == 1:
        title, _, _ = distances[0]
        return URL + title.replace(' ', '_')
    elif obtained_distances >=1:
        first_candidate, second_candidate = heapq.nsmallest(2, distances,key=lambda x: x[-1])       
        if first_candidate[-1] < second_candidate[-1]:
            title, _, _ = first_candidate
            return URL + title.replace(' ', '_')
    return None

Define the global constants and variables used for graph decisions.

In [None]:
MAX_CANDIDATES = 50
MAX_FETCHING_TRIES = 3
SEED = 42
OLD_SIZE = np.inf
current_document = []
saved_candidates = {}

random.seed(SEED)

In [None]:
testdf['full_mention'] = testdf['full_mention'].str.lower()
docs_range = find_doc_range(testdf)

In [88]:
for start_ids, end_ids in docs_range:
    current_document = testdf.iloc[start_ids:end_ids]
    display(current_document)
    break #TODO remove
    #TODO make sure that everything is lowercase when comparing strings. IN ALL FUNCTIONS.
    
    # We are interested in the rows that should have a link
    current_document, found_links, not_found_links = split_document_findings(current_document)
    
    found_links_count = len(found_links)
    
    for _ in range(MAX_FETCHING_TRIES):
        #Shuffle mentions TODO check shuffle works and does not mess up the indexes
        random_mentions = not_found_links[['item_id','full_mention']].sample(frac=1)
        
        #Find candidates
        candidates_urls = pd.DataFrame({
            'item_id': random_mentions['item_id'],
            'candidates_url': random_mentions.apply(lambda mention: find_equal_string_candidates(mention, dict(zip(found['full_mention'], found['wiki_url']))))
        })
        len_candidates_urls = candidates_urls['candidates_url'].apply(len)
        
        #for single candidates, retrieve the value and attribute it.
        single_candidates = candidates_urls[len_candidates_urls == 1]
        for idx, row in single_candidates.iterrows():
            #TODO check indexes not messed up
            found_links.at[row['item_id'], 'wiki_url'] = row['candidates_url'][0]
        
        # Update not found links
        not_found_links = not_found_links[~not_found_links['item_id'].isin(single_candidates['item_id'])]

        
        #try finding the best for substrings
        candidates_urls = pd.DataFrame({
            'item_id': random_mentions['item_id'],
            'candidates_url': random_mentions.apply(lambda mention: find_substring_candidates(mention, saved_candidates))
        })
        
        # Calculate the number of candidates for each mention
        len_candidates_urls = candidates_urls['candidates_url'].apply(len)
        
        # Process single candidates
        single_candidates = candidates_urls[len_candidates_urls == 1]
        for idx, row in single_candidates.iterrows():
            #TODO check indexes not messed up
            found_links.at[idx, 'wiki_url'] = row['candidates_url'][0]
        
        # Filter for multi candidates
        multi_candidates = candidates_urls[(len_candidates_urls > 1) & (len_candidates_urls <= MAX_CANDIDATES)]
        
        # Apply find_best_candidate for each mention in multi_candidates
        for not_found_id in multi_candidates['item_id']:
            #TODO check indexes not messed up
            best_candidate = find_best_candidate(multi_candidates[multi_candidates['item_id'] == not_found_id]['candidates_url'].iloc[0], found_links['item_id'])
            
            if best_candidate is not None:
                found_links.at['item_id', 'wiki_url'] = best_candidate
        
        if found_links_count < len(found_links):
            found_links_count = len(found_links)
        else:
            break
            
# TODO re-concat not_found and found.

447


Unnamed: 0,id,token,full_mention,wiki_url
0,0,-DOCSTART- (947testa CRICKET),,NOT_FOUND
1,1,CRICKET,,NOT_FOUND
2,2,-,,NOT_FOUND
3,3,LEICESTERSHIRE,LEICESTERSHIRE,http://en.wikipedia.org/wiki/Leicestershire_Co...
4,4,TAKE,,NOT_FOUND
...,...,...,...,...
451,451,reached,,NOT_FOUND
452,452,108,,NOT_FOUND
453,453,for,,NOT_FOUND
454,454,three,,NOT_FOUND


In [81]:
id = 1000
MAX_CANDIDATES = 50
OLD_SIZE = np.inf
current_document = []
saved_candidates = {}

for index in tqdm(range(len(testdf)), desc="Processing", total=len(testdf)):
    row = testdf.iloc[index]
    current_document.append({
        'id': row['id'],
        'token': row['token'],
        'full_mention': row['en_redirect_title'],
        'wiki_url': row['wiki_url'],
        'item_id': row['item_id']
    })

    try:
        if row['token'].startswith('-DOCSTART-'):
            document_df = pd.DataFrame(current_document)
            current_document = []

            full_mention_ = document_df[document_df['wiki_url'] != 'NOT_FOUND']
            found = full_mention_[full_mention_['wiki_url'] != '?']
            item_id_train = set(found.item_id.tolist())
            full_mention_found = dict(zip(found['full_mention'].str.lower(), found['wiki_url']))

            not_found = full_mention_[full_mention_['wiki_url'] == '?'].copy()
            not_found.full_mention = not_found.full_mention.str.lower()
            mention_test = set(not_found.full_mention.tolist())

            old_size = OLD_SIZE
            new_size = len(mention_test)
            correct_links = []
            for tries in range(3):
                if new_size < old_size:
                    random_order = list(mention_test)
                    random.shuffle(random_order)

                    for uncertain_word in (random_order):
                        correct_link = None
                        matching_urls = set(
                            [url for mention, url in full_mention_found.items() if uncertain_word in mention])
                        if len(matching_urls) == 1:  # if uncertain_word is a part of the correct entity
                            full_mention_found[uncertain_word] = list(matching_urls)[0]
                        elif len(matching_urls) > 1:
                            print(uncertain_word, "seems to belong to", matching_urls)
                        else:
                            filtered_ls = wiki_item.loc[col_.contains(r'\b{}\b'.format(uncertain_word),na=False)] if uncertain_word not in saved_candidates.keys() else \
                            saved_candidates[uncertain_word]
                            saved_candidates[uncertain_word] = filtered_ls
                            no_candidates = len(filtered_ls)
                            if not no_candidates:
                                print("No match for", uncertain_word)
                            elif no_candidates == 1:
                                full_mention_found[uncertain_word] = URL + \
                                                                     filtered_ls.wikipedia_title.tolist()[0].replace(
                                                                         ' ', '_')
                                item_id_train.add(filtered_ls.item_id.tolist()[0])
                            elif no_candidates < MAX_CANDIDATES:
                                distances = get_all_dist(filtered_ls, item_id_train)
                                if len(distances) > 1:
                                    first_candidate, second_candidate = heapq.nsmallest(2, distances,
                                                                                        key=lambda x: x[-1])
                                    if first_candidate[-1] < second_candidate[-1]:
                                        title, choice, _ = first_candidate
                                        full_mention_found[
                                            uncertain_word] = URL + title.replace(' ', '_')
                                        item_id_train.add(choice)
                                    else:
                                        print("can not decide between", first_candidate, "and", second_candidate)
                                else:
                                    title, choice, _ = distances[0]
                                    full_mention_found[
                                        uncertain_word] = URL + title.replace(' ', '_')
                                    item_id_train.add(choice)

                            else:
                                print(uncertain_word, "has too many candidates", no_candidates)
                    old_size = len(mention_test)
                    mention_test.difference_update(full_mention_found.keys())
                    new_size = len(mention_test)
            if len(not_found):
                not_found.wiki_url = not_found.full_mention.map(full_mention_found)
                not_found.to_csv(f'{CORRECTED_FOLDER}{id}.csv', index=False)
                id += 1
    except Exception as e:
        pass

Processing:   0%|          | 0/104890 [00:00<?, ?it/s]

transylvania has too many candidates 82
given has too many candidates 1929
serbia and montenegro has too many candidates 64
southeast asia has too many candidates 67
No match for mohammed idris
No match for carroll a. campbell jr.
No match for pirelli cables
azerbaijani has too many candidates 140
azerbaijani has too many candidates 140
republika has too many candidates 103
No match for brazilian-born
donato has too many candidates 132
No match for inverness thistle f.c.
No match for inverness thistle f.c.
fernández has too many candidates 874
inter milan has too many candidates 58
inter milan has too many candidates 58


  filtered_ls = wiki_item.loc[col_.contains(r'\b{}\b'.format(uncertain_word),


No match for labour party (uk)
No match for london-based
bahraini has too many candidates 123
No match for interstates 80
No match for warner bros.
afc asian cup has too many candidates 124
cuttitta seems to belong to {'http://en.wikipedia.org/wiki/Massimo_Cuttitta', 'http://en.wikipedia.org/wiki/Marcello_Cuttitta'}
No match for 1995 world cup
afc asian cup has too many candidates 124
asian cup has too many candidates 171
warne has too many candidates 52
No match for michael divenuto
No match for ny islanders
No match for ny rangers
anaheim has too many candidates 94
sacramento has too many candidates 436
golden state has too many candidates 83
No match for ny jets
green bay has too many candidates 224
national has too many candidates 33477
No match for uefa cups
No match for uefa cups


  filtered_ls = wiki_item.loc[col_.contains(r'\b{}\b'.format(uncertain_word),


No match for northwest (disambiguation)


  filtered_ls = wiki_item.loc[col_.contains(r'\b{}\b'.format(uncertain_word),


No match for salient (geography)
tempe has too many candidates 62
No match for cocker spaniels
national alliance has too many candidates 83
uganda has too many candidates 686
uganda has too many candidates 686
national has too many candidates 33477
national has too many candidates 33477
national has too many candidates 33477
No match for oestersund
No match for oestersund


  filtered_ls = wiki_item.loc[col_.contains(r'\b{}\b'.format(uncertain_word),


No match for united provinces (1937–50)
No match for inverness thistle f.c.
No match for elgin city f.c.
No match for inverness thistle f.c.
No match for n.ireland
No match for n.ireland
No match for michael divenuto
No match for michael divenuto
afc asian cup has too many candidates 124
afc asian cup has too many candidates 124
asian cup has too many candidates 171
afc asian cup has too many candidates 124
afc asian cup has too many candidates 124
asian cup has too many candidates 171
sacramento has too many candidates 436
golden state has too many candidates 83
sacramento has too many candidates 436
golden state has too many candidates 83
No match for ny islanders
No match for ny rangers
anaheim has too many candidates 94
No match for ny rangers


In [82]:
partial = result_first_part
testdf = test_df[['id', 'token', 'full_mention']]
testdf = testdf.merge(partial, on='id')
display(testdf)

Unnamed: 0,id,token,full_mention,wiki_url
0,0,-DOCSTART- (947testa CRICKET),,NOT_FOUND
1,1,CRICKET,,NOT_FOUND
2,2,-,,NOT_FOUND
3,3,LEICESTERSHIRE,LEICESTERSHIRE,http://en.wikipedia.org/wiki/Leicestershire_Co...
4,4,TAKE,,NOT_FOUND
...,...,...,...,...
104885,104885,brother,,NOT_FOUND
104886,104886,",",,NOT_FOUND
104887,104887,Bobby,Bobby,http://en.wikipedia.org/wiki/Bobby
104888,104888,.,,NOT_FOUND


In [83]:
corrected_df = pd.DataFrame()
counter = 0
for i in range(100000):
    try:
        corrected_df = pd.concat([corrected_df, pd.read_csv(f'{CORRECTED_FOLDER}{i}.csv')[['id', 'wiki_url']]],
                                 ignore_index=True)
        counter += 1
    except Exception as e:
        pass
corrected_df = corrected_df.drop_duplicates('id')
counter

109

In [84]:
merged_df = pd.merge(testdf, corrected_df, on='id', how='left', suffixes=('_original', '_update'))

merged_df['wiki_url'] = merged_df['wiki_url_update'].combine_first(merged_df['wiki_url_original'])

merged_df = merged_df.drop(['wiki_url_original', 'wiki_url_update'], axis=1)
print(len(merged_df[merged_df.wiki_url == '?']))
merged_df.loc[merged_df['wiki_url'] == '?', 'wiki_url'] = 'NOT_FOUND'
merged_df

352


Unnamed: 0,id,token,full_mention,wiki_url
0,0,-DOCSTART- (947testa CRICKET),,NOT_FOUND
1,1,CRICKET,,NOT_FOUND
2,2,-,,NOT_FOUND
3,3,LEICESTERSHIRE,LEICESTERSHIRE,http://en.wikipedia.org/wiki/Leicestershire_Co...
4,4,TAKE,,NOT_FOUND
...,...,...,...,...
104885,104885,brother,,NOT_FOUND
104886,104886,",",,NOT_FOUND
104887,104887,Bobby,Bobby,http://en.wikipedia.org/wiki/Bobby
104888,104888,.,,NOT_FOUND


In [85]:
merged_df[['id', 'wiki_url']].to_csv(SUBMISSIONS_FOLDER + 'submission.csv', index=False)