In [4]:
import heapq
import os
import pickle
import random
import re

import networkx as nx
import pandas as pd
from tqdm.auto import tqdm


In [2]:
def get_connection(only_train = False, statements_path = 'statements.csv', train_ids_path = 'train_wiki.csv'):
    """Load the statement (edge) table, optionally restricted to training items.

    Parameters
    ----------
    only_train : bool
        When True, keep only the statements whose source OR target item id
        appears in the `item_id` column of `train_ids_path`.
    statements_path : str
        CSV file with (at least) `source_item_id` and `target_item_id` columns.
    train_ids_path : str
        CSV file with an `item_id` column (obtained by running get_wiki.ipynb).
        Only read when `only_train` is True.

    Returns
    -------
    pandas.DataFrame
        The (possibly filtered) statement table.
    """
    connection = pd.read_csv(statements_path)
    if not only_train:
        return connection

    # Ids that never occur in the table cannot match any row, so testing
    # membership in the training ids directly selects the same rows as first
    # intersecting them with every id present in the table.
    # dropna(): a missing id must never count as a match.
    train_ids = set(pd.read_csv(train_ids_path)['item_id'].dropna().tolist())
    keep = (
        connection['source_item_id'].isin(train_ids)
        | connection['target_item_id'].isin(train_ids)
    )
    return connection[keep]

def add_edges(G,connection, progress= True):
    """Add one edge per row of `connection` to the graph `G` and return `G`.

    Parameters
    ----------
    G : graph object exposing `add_edge` / `add_edges_from`
        The graph to extend in place (e.g. `nx.Graph()` or `nx.DiGraph()`).
    connection : pandas.DataFrame
        Must contain `source_item_id` and `target_item_id` columns.
    progress : bool
        True shows a tqdm progress bar while inserting edge by edge;
        False inserts all edges in one bulk call (faster).

    Returns
    -------
    The same `G` that was passed in, so both modes keep the caller's graph type.
    """
    # Select the endpoints by column name instead of relying on the table
    # having exactly three columns in a fixed order.
    edge_pairs = zip(connection['source_item_id'], connection['target_item_id'])
    if progress:
        for source_item_id, target_item_id in tqdm(edge_pairs, total=len(connection)):
            G.add_edge(source_item_id, target_item_id)
    else:
        # The previous fast path built a brand-new graph from an undefined
        # name; bulk-insert into the supplied graph instead.
        G.add_edges_from(edge_pairs)
    return G

In [None]:
# Load the full statement table (only_train defaults to False, so no filtering by training ids).
connection = get_connection()

In [None]:
G = nx.Graph() # Undirected graph; use nx.DiGraph() instead for a directed graph
G = add_edges(G, connection, progress = True) # progress = False is faster (use it for the submission run)


In [None]:
# Sanity check: report the size of the graph that was just built.
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")

In [None]:
import pickle
# Cache file for the graph so later sessions can skip rebuilding it from the CSV.
pickle_filename = "graph_undirected_full.pkl"

# Serialise the graph G to the pickle file (binary mode is required by pickle).
with open(pickle_filename, 'wb') as pickle_file:
    pickle.dump(G, pickle_file)

In [5]:
import pickle

# Specify the filename for the pickle file
pickle_filename = "graph_undirected_full.pkl"

# Load the graph from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only load a file you created yourself.
with open(pickle_filename, 'rb') as pickle_file:
    # to_undirected() is applied to whatever graph the file holds before binding it to G.
    G = pickle.load(pickle_file).to_undirected()

In [6]:
# Partial predictions; in the displayed result '?' marks mentions that still lack a link.
partial = pd.read_csv('submission_train_aliases_uncertain.csv')
testdf = pd.read_csv('test.csv')
# Both frames carry a wiki_url column, so the merge yields wiki_url_x (from test.csv)
# and wiki_url_y (from the partial predictions).
testdf = testdf.merge(partial, on='id')
testdf

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url_x,wiki_url_y
0,0,-DOCSTART- (947testa CRICKET),,,,NOT_FOUND
1,1,CRICKET,,,,NOT_FOUND
2,2,-,,,,NOT_FOUND
3,3,LEICESTERSHIRE,B,LEICESTERSHIRE,?,http://en.wikipedia.org/wiki/Leicestershire_Co...
4,4,TAKE,,,,NOT_FOUND
...,...,...,...,...,...,...
104885,104885,brother,,,,NOT_FOUND
104886,104886,",",,,,NOT_FOUND
104887,104887,Bobby,B,Bobby,?,http://en.wikipedia.org/wiki/Bobby_Timmons
104888,104888,.,,,,NOT_FOUND


In [8]:
# Redirect table without a header row: the merge below joins the mention text
# against column 0 and keeps column 1.
redirect = pd.read_csv('enwiki_redirects.tsv', sep='\t', header=None)
# Select the still-unresolved rows through testdf's own column (wiki_url_y is
# partial's wiki_url after the merge) instead of a boolean mask taken from
# `partial`, which is only correct while both frames share the same index.
temp = testdf[testdf['wiki_url_y'] == "?"].merge(redirect, left_on='full_mention', right_on=0)[['id',1]]
temp

Unnamed: 0,id,1
0,795,Universities in the United Kingdom
1,975,Universities in the United Kingdom
2,4396,David Barr
3,4402,Michael Sullivan
4,4552,United States Amateur Championships
...,...,...
351,103998,Predrag Mijatović
352,104041,Predrag Mijatović
353,104124,Mijatović
354,104310,De Graafschap


In [9]:
# Attach the value from `temp` (column 1) to each row; rows without one get NaN.
testdf = testdf.merge(temp, on='id', how='left')
# Fall back to the original mention text where column 1 is missing.
# Vectorised replacement for the former row-by-row apply(axis=1).
testdf[1] = testdf[1].fillna(testdf['full_mention'])

testdf =  testdf.drop(columns=['wiki_url_x',  'full_mention'] )
testdf

Unnamed: 0,id,token,entity_tag,wiki_url_y,1
0,0,-DOCSTART- (947testa CRICKET),,NOT_FOUND,
1,1,CRICKET,,NOT_FOUND,
2,2,-,,NOT_FOUND,
3,3,LEICESTERSHIRE,B,http://en.wikipedia.org/wiki/Leicestershire_Co...,LEICESTERSHIRE
4,4,TAKE,,NOT_FOUND,
...,...,...,...,...,...
104885,104885,brother,,NOT_FOUND,
104886,104886,",",,NOT_FOUND,
104887,104887,Bobby,B,http://en.wikipedia.org/wiki/Bobby_Timmons,Bobby
104888,104888,.,,NOT_FOUND,


In [10]:
# Keep only the three columns used below.
wiki_item = pd.read_csv('wiki_items.csv')[['item_id', 'en_label','wikipedia_title']]
#wiki_item = wiki_item[wiki_item['item_id'].isin(filtered_ids)]

# en_label becomes "<label> <title>" (both strings joined by a space).
wiki_item['en_label'] = wiki_item['en_label'] + " " + wiki_item['wikipedia_title']
# col_ is the .str accessor of the lower-cased titles, kept for repeated .contains() searches.
col_ = wiki_item.wikipedia_title.str.lower().str
wiki_item

Unnamed: 0,item_id,en_label,wikipedia_title
0,1,Universe Universe,Universe
1,2,Earth Earth,Earth
2,3,life Life,Life
3,4,death Death,Death
4,5,human Human,Human
...,...,...,...
5216231,77042017,HR 4523 HD 102365,HD 102365
5216232,77043280,Charlie Johnston Charlie Johnstone,Charlie Johnstone
5216233,77231860,Aldo Rossi Aldo Rossi (musician),Aldo Rossi (musician)
5216234,77240068,Ebenezer Baptist Church Ebenezer Baptist Church,Ebenezer Baptist Church


In [11]:
# Strip the 29-character URL prefix and turn underscores back into spaces to
# recover the article title; values shorter than the prefix yield ''.
url_prefix_len = len('http://en.wikipedia.org/wiki/')
title_from_url = testdf['wiki_url_y'].str[url_prefix_len:].str.replace('_', ' ')

# Look up the item_id of every title, then drop the helper columns again.
testdf = (
    testdf.assign(wiki_title=title_from_url)
    .merge(wiki_item, left_on='wiki_title', right_on='wikipedia_title', how='left')
    .drop(columns=['wikipedia_title', 'en_label', 'wiki_title'])
)
testdf

Unnamed: 0,id,token,entity_tag,wiki_url_y,1,item_id
0,0,-DOCSTART- (947testa CRICKET),,NOT_FOUND,,
1,1,CRICKET,,NOT_FOUND,,
2,2,-,,NOT_FOUND,,
3,3,LEICESTERSHIRE,B,http://en.wikipedia.org/wiki/Leicestershire_Co...,LEICESTERSHIRE,3229147.0
4,4,TAKE,,NOT_FOUND,,
...,...,...,...,...,...,...
104885,104885,brother,,NOT_FOUND,,
104886,104886,",",,NOT_FOUND,,
104887,104887,Bobby,B,http://en.wikipedia.org/wiki/Bobby_Timmons,Bobby,132341.0
104888,104888,.,,NOT_FOUND,,


In [39]:
def get_all_dist(filtered_ls, full_mention_train):
    """Score every candidate row by its graph distance to the known items.

    Parameters
    ----------
    filtered_ls : pandas.DataFrame
        Candidate rows; must contain `item_id` and `wikipedia_title` columns.
    full_mention_train : iterable
        Item ids passed to `get_dist` as the anchor set.

    Returns
    -------
    list of (title, item_id, distance) tuples, one per candidate row.
    """
    # Columns are addressed by name so the result does not depend on the
    # frame's column order or on having exactly three columns.
    candidates = zip(filtered_ls['item_id'], filtered_ls['wikipedia_title'])
    return [
        (title, candidate, get_dist(full_mention_train, candidate))
        for candidate, title in candidates
    ]
    
def get_dist(certain_list, b, fill_na = 9999, subset_size = 10):
    """Mean shortest-path length in the global graph `G` from sampled anchors to `b`.

    Parameters
    ----------
    certain_list : iterable
        Anchor node ids. At most `subset_size` of them, chosen at random,
        are used.
    b : hashable
        Target node id.
    fill_na : number
        Distance substituted when an anchor or `b` is not in the graph or no
        path exists; also returned when there are no anchors at all.
    subset_size : int
        Maximum number of anchors to sample.

    Returns
    -------
    number
        The mean over the anchors actually sampled. Dividing by the real
        sample count (not by `subset_size`) keeps scores comparable when
        fewer than `subset_size` anchors are available.
    """
    anchors = list(certain_list)
    if len(anchors) > subset_size:
        anchors = random.sample(anchors, subset_size)

    distance_list = []
    for a in anchors:
        try:
            distance_list.append(nx.shortest_path_length(G, source=a, target=b))
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # Only "unreachable" / "unknown node" are expected here; anything
            # else (including KeyboardInterrupt) must propagate.
            distance_list.append(fill_na)

    if not distance_list:
        return fill_na
    return sum(distance_list) / len(distance_list)

def get_info(current_document):
    """Split one document's token records into linked and still-unlinked mentions.

    `current_document` is a list of dicts with the keys `full_mention`,
    `wiki_url` and `item_id`. Rows whose `wiki_url` is 'NOT_FOUND' are
    ignored; rows with '?' are the unresolved mentions.

    Returns a 5-tuple
    (found, not_found, item_id_train, full_mention_found, mention_test):
      found              - rows that already have a URL
      not_found          - copy of the '?' rows, `full_mention` lower-cased
      item_id_train      - set of item ids of the `found` rows
      full_mention_found - dict lower-cased mention -> URL of the `found` rows
      mention_test       - set of lower-cased unresolved mentions
    For an empty document all five entries are None.
    """
    if not len(current_document):
        return None, None, None, None, None

    frame = pd.DataFrame(current_document)
    mentions = frame[frame['wiki_url'] != 'NOT_FOUND']
    is_pending = mentions['wiki_url'] == '?'

    found = mentions[~is_pending]
    item_id_train = set(found['item_id'].tolist())
    lowered_found = found['full_mention'].str.lower()
    full_mention_found = dict(zip(lowered_found, found['wiki_url']))

    not_found = mentions[is_pending].copy()
    not_found['full_mention'] = not_found['full_mention'].str.lower()
    mention_test = set(not_found['full_mention'].tolist())

    return found, not_found, item_id_train, full_mention_found, mention_test

WIKI_URL_PREFIX = 'http://en.wikipedia.org/wiki/'
MAX_CANDIDATES = 50   # mentions matching this many titles or more are left unresolved
MAX_PASSES = 3        # upper bound on resolution passes per document
OUTPUT_DIR = 'corrected'


def _title_to_url(title):
    """Build the Wikipedia URL for an article title (spaces become underscores)."""
    return WIKI_URL_PREFIX + title.replace(' ', '_')


def _lookup_candidates(uncertain_word):
    """Return the `wiki_item` rows whose lower-cased title contains the mention as a whole word.

    Results are memoised in the module-level `saved_candidates` dict because
    each lookup scans every title.
    """
    if uncertain_word not in saved_candidates:
        # re.escape: the mention is data, not a pattern, so characters such as
        # '.', '(' or '+' must match literally (unescaped they could mis-match
        # or raise re.error). The lookarounds behave like \b for ordinary
        # words and still work when the mention starts or ends with punctuation.
        pattern = r'(?<!\w){}(?!\w)'.format(re.escape(uncertain_word))
        saved_candidates[uncertain_word] = wiki_item.loc[col_.contains(pattern, na=False)]
    return saved_candidates[uncertain_word]


def _resolve_mention(uncertain_word, full_mention_found, item_id_train):
    """Try to link one unresolved mention, recording a success in place.

    On success `full_mention_found[uncertain_word]` is set to the chosen URL;
    when the link comes from the title search, the chosen item id is also
    added to `item_id_train` so it can serve as an anchor for later mentions.
    """
    # 1) The mention is a substring of a mention already linked in this document.
    matching_urls = {
        url for mention, url in full_mention_found.items()
        if isinstance(mention, str) and uncertain_word in mention
    }
    if len(matching_urls) == 1:
        full_mention_found[uncertain_word] = next(iter(matching_urls))
        return
    if len(matching_urls) > 1:
        print(uncertain_word, "seems to belong to", matching_urls)
        return

    # 2) Otherwise search the article titles.
    filtered_ls = _lookup_candidates(uncertain_word)
    no_candidates = len(filtered_ls)
    if not no_candidates:
        print("No match for", uncertain_word)
    elif no_candidates == 1:
        full_mention_found[uncertain_word] = _title_to_url(filtered_ls.wikipedia_title.tolist()[0])
        item_id_train.add(filtered_ls.item_id.tolist()[0])
    elif no_candidates < MAX_CANDIDATES:
        # At least two candidates here, so there are always two distances to compare.
        distances = get_all_dist(filtered_ls, item_id_train)
        first_candidate, second_candidate = heapq.nsmallest(2, distances, key=lambda x: x[-1])
        if first_candidate[-1] < second_candidate[-1]:
            title, choice, _ = first_candidate
            full_mention_found[uncertain_word] = _title_to_url(title)
            item_id_train.add(choice)
        else:
            print("can not decicide between", first_candidate, "and", second_candidate)
    else:
        print(uncertain_word, "has too much candidates", no_candidates)


# to_csv does not create missing directories; without this every write fails.
os.makedirs(OUTPUT_DIR, exist_ok=True)

id=1000  # NOTE(review): shadows the builtin id(); name kept unchanged for now
current_document = []
saved_candidates = {}

# Rows are visited last-to-first and buffered until a -DOCSTART- row is
# reached; the buffer then holds exactly one document.
for index in tqdm(range(len(testdf)-1, -1, -1), desc="Processing", total=len(testdf)):
    row = testdf.iloc[index]
    current_document.append({
                    'id': row['id'],
                    'token': row['token'],
                    'full_mention': row[1],
                    'wiki_url': row['wiki_url_y'],
                    'item_id': row['item_id']
                })

    # str(): a token that pandas parsed as NaN is a float and has no .startswith.
    if not str(row['token']).startswith('-DOCSTART-'):
        continue

    document, current_document = current_document, []
    # A document without any '?' row has nothing to correct and produces no file.
    if not any(entry['wiki_url'] == '?' for entry in document):
        continue

    try:
        found, not_found, item_id_train, full_mention_found, mention_test = get_info(document)
        # Linked mentions without a matched item id (NaN) are not graph nodes
        # and would only add noise to the distance scores.
        item_id_train = {item for item in item_id_train if pd.notna(item)}
        # Guard against missing mention texts, which cannot be searched.
        mention_test = {mention for mention in mention_test if isinstance(mention, str)}

        # A later pass can succeed where an earlier one failed because every
        # resolved mention adds anchors; stop once a pass makes no progress.
        for _ in range(MAX_PASSES):
            random_order = list(mention_test)
            random.shuffle(random_order)
            for uncertain_word in random_order:
                _resolve_mention(uncertain_word, full_mention_found, item_id_train)

            unresolved_before = len(mention_test)
            mention_test.difference_update(full_mention_found.keys())
            if len(mention_test) == unresolved_before:
                break

        not_found['wiki_url'] = not_found['full_mention'].map(full_mention_found)
        not_found.to_csv(os.path.join(OUTPUT_DIR, f'{id}.csv'), index=False)
        id +=1
    except Exception as e:
        # Best effort: one bad document must not stop the run, but say so.
        print(f"Document starting at row id {row['id']} skipped: {e!r}")

Processing:   0%|          | 0/104890 [00:00<?, ?it/s]

can not decicide between ('2015–16 Rayo Vallecano season', 20921826, 1.9) and ('2018–19 Rayo Vallecano season', 59655144, 1.9)
oviedo has too much candidates 87
zaragoza has too much candidates 128
can not decicide between ('Extremadura UD', 994224, 2.0) and ('Canal Extremadura Televisión', 2935928, 2.0)
can not decicide between ('Real Sociedad B', 1067750, 1.9) and ('1981–82 Real Sociedad season', 4580229, 1.9)
tenerife has too much candidates 101
can not decicide between ('Racing de Santander', 12236, 2.0) and ('2007–08 Racing de Santander season', 16824433, 2.0)
can not decicide between ('Real Sociedad Gimnástica Española', 2479165, 2.0) and ('2009–10 Real Sociedad season', 4616132, 2.0)
zaragoza has too much candidates 128
can not decicide between ('2012–13 Rayo Vallecano season', 4628582, 1.9) and ('2014–15 Rayo Vallecano season', 17515428, 1.9)
oviedo has too much candidates 87
tenerife has too much candidates 101
tenerife has too much candidates 101
oviedo has too much candidate

KeyboardInterrupt: 

In [None]:
# Reload the inputs from disk, keeping only the test.csv columns listed below.
partial = pd.read_csv('submission_train_aliases_uncertain.csv')
testdf = pd.read_csv('test.csv')[['id', 'token', 'full_mention']]
testdf = testdf.merge(partial, on='id')
testdf

In [None]:
# Collect the per-document result files corrected/<i>.csv in ascending i.
# Frames are gathered in a list and concatenated once (concatenating inside
# the loop re-copies everything read so far on every iteration).
corrected_frames = []
for i in range (100000):
    path = f'corrected/{i}.csv'
    # Most numbers have no file; skip them explicitly instead of hiding every
    # possible error behind a blanket except.
    if not os.path.exists(path):
        continue
    corrected_frames.append(pd.read_csv(path)[['id','wiki_url']])

counter = len(corrected_frames)  # number of files that were read
if corrected_frames:
    # Keep the first prediction per id.
    corrected_df = pd.concat(corrected_frames, ignore_index=True).drop_duplicates('id')
else:
    # No result files: an empty, correctly typed frame keeps the later merge working.
    corrected_df = pd.DataFrame({
        'id': pd.Series(dtype='int64'),
        'wiki_url': pd.Series(dtype='object'),
    })
counter

In [None]:
# Join the corrections onto the predictions; the shared wiki_url column is
# split into wiki_url_original / wiki_url_update.
merged_df = testdf.merge(corrected_df, on='id', how='left', suffixes=('_original', '_update'))

# Prefer the corrected URL and fall back to the original where none exists.
merged_df['wiki_url'] = merged_df['wiki_url_update'].combine_first(merged_df['wiki_url_original'])
merged_df = merged_df.drop(columns=['wiki_url_original', 'wiki_url_update'])

# Report how many mentions are still unresolved, then mark them NOT_FOUND.
still_unresolved = merged_df['wiki_url'] == '?'
print(int(still_unresolved.sum()))
merged_df.loc[still_unresolved, 'wiki_url'] = 'NOT_FOUND'
merged_df

In [None]:
# Write the final submission file with the id and wiki_url columns only.
merged_df[['id', 'wiki_url']].to_csv('submission.csv', index=False)