In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import os
import networkx as nx

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
aliases = pd.read_csv('data/item_aliases.csv')
property = pd.read_csv('data/property.csv')
wiki_items = pd.read_csv('data/wiki_items.csv')



In [3]:
statements = pd.read_csv('data/statements.csv')
enwinki_redirects = pd.read_csv('data/enwiki_redirects.tsv', sep='\t')

In [4]:
statements

Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,398,497745
1,1,398,1133705
2,1,398,1139177
3,1,398,7439451
4,1,398,15241043
...,...,...,...
26903183,77240068,31,16970
26903184,77240068,131,23556
26903185,77242291,17,145
26903186,77242291,131,23311


In [5]:
display(statements.iloc[0])

source_item_id           1
edge_property_id       398
target_item_id      497745
Name: 0, dtype: int64

In [6]:
aliases

Unnamed: 0,item_id,en_alias
0,1,Our Universe
1,1,The Universe
2,1,The Cosmos
3,1,cosmos
4,2,Blue Planet
...,...,...
1495310,76858465,Photinus luciferin 4-monooxygenase (adenosine ...
1495311,76858465,firefly luciferase
1495312,76868858,12 Canum Venaticorum
1495313,76868858,Alpha Canum Venaticorum


In [7]:
wiki_items

Unnamed: 0,item_id,en_label,en_description,wikipedia_title
0,1,Universe,totality of space and all contents,Universe
1,2,Earth,third planet from the Sun in the Solar System,Earth
2,3,life,matter capable of extracting energy from the e...,Life
3,4,death,permanent cessation of vital functions,Death
4,5,human,"common name of Homo sapiens, unique extant spe...",Human
...,...,...,...,...
5216231,77042017,HR 4523,,HD 102365
5216232,77043280,Charlie Johnston,,Charlie Johnstone
5216233,77231860,Aldo Rossi,musician,Aldo Rossi (musician)
5216234,77240068,Ebenezer Baptist Church,"church in Atlanta, Georgia, USA",Ebenezer Baptist Church


In [8]:
statements

Unnamed: 0,source_item_id,edge_property_id,target_item_id
0,1,398,497745
1,1,398,1133705
2,1,398,1139177
3,1,398,7439451
4,1,398,15241043
...,...,...,...
26903183,77240068,31,16970
26903184,77240068,131,23556
26903185,77242291,17,145
26903186,77242291,131,23311


In [9]:
def to_edge(row):
    print(row)
    return (row['source_item_id'], row['target_item_id'])

In [10]:
edges = statements[['source_item_id', 'target_item_id']].values
edges

array([[       1,   497745],
       [       1,  1133705],
       [       1,  1139177],
       ...,
       [77242291,      145],
       [77242291,    23311],
       [77242291,  1537151]])

In [11]:
#Takes 44 seconds.
# Create a generator for edges
edge_generator = ((row[0], row[1]) for row in edges)

# Create a graph with these edges
G = nx.Graph()
G.add_edges_from(edge_generator)

In [12]:
#non_na = test[~test.wiki_url.isna()]

In [13]:
#test

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url
0,0,-DOCSTART- (947testa CRICKET),,,
1,1,CRICKET,,,
2,2,-,,,
3,3,LEICESTERSHIRE,B,LEICESTERSHIRE,?
4,4,TAKE,,,
...,...,...,...,...,...
104885,104885,brother,,,
104886,104886,",",,,
104887,104887,Bobby,B,Bobby,?
104888,104888,.,,,


In [14]:
def find_doc_range(df):
    def check_docstart(row):
        # This function returns the 'id' if 'token' contains 'DOCSTART', otherwise NaN
        if pd.notnull(row['token']) and 'DOCSTART' in row['token']:
            return True
        else:
            return False # Using pandas NA to handle missing values

    # Apply the function across the DataFrame.
    data = df.copy()
    data['docstart_id'] = df.apply(check_docstart, axis=1)
    start_ids = data[data['docstart_id']]['id'].values
    print(len(start_ids))
    end_ids = start_ids[1:] - 1
    end_ids = np.append(end_ids,len(df))
    docs_range = zip(start_ids, end_ids)
    return docs_range


In [15]:
def compute_distance(G, possible_vertices, document_vertices):
    """
    Computes the shortest path distance from each vertex in possible_vertices
    to each vertex in document_vertices in the graph G.

    Parameters:
    G (nx.Graph): The graph containing the vertices.
    possible_vertices (list): A list of vertices from which distances are calculated.
    document_vertices (list): A list of vertices representing the document.

    Returns:
    dict: A dictionary where keys are vertices from possible_vertices and values
    are dictionaries with document vertices as keys and distances as values.
    """
    distances = {}
    for vertex in possible_vertices:
        distances[vertex] = {}
        for doc_vertex in document_vertices:
            try:
                # Compute the shortest path length
                distance = nx.shortest_path_length(G, source=vertex, target=doc_vertex)
                distances[vertex][doc_vertex] = distance
            except nx.NetworkXNoPath:
                # In case there is no path between vertex and doc_vertex
                distances[vertex][doc_vertex] = float('inf')
    return distances

In [16]:
test_df_mod = test.copy(deep=True)
train_df_mod = train.copy(deep=True)
URL = 'http://en.wikipedia.org/wiki/'
merged_wiki_items = wiki_items.merge(aliases, how='left', on='item_id')
merged_wiki_items['en_alias'] = merged_wiki_items['en_alias'].str.lower()
merged_wiki_items

Unnamed: 0,item_id,en_label,en_description,wikipedia_title,en_alias
0,1,Universe,totality of space and all contents,Universe,our universe
1,1,Universe,totality of space and all contents,Universe,the universe
2,1,Universe,totality of space and all contents,Universe,the cosmos
3,1,Universe,totality of space and all contents,Universe,cosmos
4,2,Earth,third planet from the Sun in the Solar System,Earth,blue planet
...,...,...,...,...,...
5853141,77042017,HR 4523,,HD 102365,
5853142,77043280,Charlie Johnston,,Charlie Johnstone,
5853143,77231860,Aldo Rossi,musician,Aldo Rossi (musician),
5853144,77240068,Ebenezer Baptist Church,"church in Atlanta, Georgia, USA",Ebenezer Baptist Church,


In [17]:
for index, row in tqdm(test_df_mod.iterrows(), total=test_df_mod.shape[0]):
    if str(row['wiki_url']) == 'nan' or row['wiki_url'] != '?':
        continue
        
    token = row['full_mention']
    train_url = train_df_mod[train_df_mod['full_mention'] == token]['wiki_url'].values
    
    if len(train_url) > 0:
        test_df_mod.at[index, 'wiki_url'] = train_url[0]
        
    else:
        wiki_title = merged_wiki_items[merged_wiki_items['en_alias'] == token]['wikipedia_title'].values
        
        if len(wiki_title) > 0:
            test_df_mod.at[index, 'wiki_url'] = URL + wiki_title[0].replace(' ', '_')

100%|██████████| 104890/104890 [04:12<00:00, 415.86it/s] 


In [18]:
try:
    test_df_mod = pd.read_pickle('pkl/' + 'opti_test_df.pkl')
except:
    # Load data from csv
    test_df_mod.to_pickle('pkl/' + 'opti_test_df.pkl')

In [19]:
(test_df_mod['wiki_url'] == '?').sum()

2759

In [20]:
non_na_test = test_df_mod[~test_df_mod.wiki_url.isna()]

In [21]:
test_doc_range =  find_doc_range(test)

for start, end in test_doc_range:
    current_doc = non_na_test[(non_na_test.id >= start) & (non_na_test.id <= end)]
    not_found = current_doc[current_doc['wiki_url'] == '?']
    found = current_doc[current_doc['wiki_url'].str.startswith('http')]
    if end < 500:
        display(current_doc)
        display(not_found)
        display(found)
        

447


Unnamed: 0,id,token,entity_tag,full_mention,wiki_url,wikipedia_title
3,3,LEICESTERSHIRE,B,LEICESTERSHIRE,?,
13,13,LONDON,B,LONDON,http://en.wikipedia.org/wiki/London,
16,16,West,B,West Indian,?,
19,19,Phil,B,Phil Simmons,?,
28,28,Leicestershire,B,Leicestershire,http://en.wikipedia.org/wiki/Leicestershire_Co...,
30,30,Somerset,B,Somerset,http://en.wikipedia.org/wiki/Somerset_County_C...,
65,65,Essex,B,Essex,http://en.wikipedia.org/wiki/Essex_County_Cric...,
67,67,Derbyshire,B,Derbyshire,http://en.wikipedia.org/wiki/Derbyshire_County...,
69,69,Surrey,B,Surrey,http://en.wikipedia.org/wiki/Surrey_County_Cri...,
76,76,Kent,B,Kent,http://en.wikipedia.org/wiki/Kent_County_Crick...,


Unnamed: 0,id,token,entity_tag,full_mention,wiki_url,wikipedia_title
3,3,LEICESTERSHIRE,B,LEICESTERSHIRE,?,
16,16,West,B,West Indian,?,
19,19,Phil,B,Phil Simmons,?,
101,101,Grace,B,Grace Road,?,
143,143,Simmons,B,Simmons,?,
169,169,Peter,B,Peter Such,?,
233,233,Such,B,Such,?,
309,309,Mark,B,Mark Butcher,?,
364,364,Tom,B,Tom Moody,?,
371,371,Chris,B,Chris Adams,?,


Unnamed: 0,id,token,entity_tag,full_mention,wiki_url,wikipedia_title
13,13,LONDON,B,LONDON,http://en.wikipedia.org/wiki/London,
28,28,Leicestershire,B,Leicestershire,http://en.wikipedia.org/wiki/Leicestershire_Co...,
30,30,Somerset,B,Somerset,http://en.wikipedia.org/wiki/Somerset_County_C...,
65,65,Essex,B,Essex,http://en.wikipedia.org/wiki/Essex_County_Cric...,
67,67,Derbyshire,B,Derbyshire,http://en.wikipedia.org/wiki/Derbyshire_County...,
69,69,Surrey,B,Surrey,http://en.wikipedia.org/wiki/Surrey_County_Cri...,
76,76,Kent,B,Kent,http://en.wikipedia.org/wiki/Kent_County_Crick...,
87,87,Nottinghamshire,B,Nottinghamshire,http://en.wikipedia.org/wiki/Nottinghamshire_C...,
92,92,Somerset,B,Somerset,http://en.wikipedia.org/wiki/Somerset_County_C...,
104,104,Leicestershire,B,Leicestershire,http://en.wikipedia.org/wiki/Leicestershire_Co...,


In [48]:
def find_possible_indexes(df, text_of_unknown):
    if 'en_label' in df.columns:
        # Case-insensitive search for the text in the 'en_label' column
        matches = df['en_label'].str.contains(text_of_unknown, case=False, na=False)
        return df[matches].item_id  # Returns the indexes of matching rows
    else:
        return "Column 'en_label' not found in the DataFrame"

# Example usage
indexes = find_possible_indexes(wiki_items, 'LEICESTERSHIRE')
print(indexes)


20587         23106
534819       778255
666909       988954
694720      1051582
992738      1842129
             ...   
4671827    23468498
4741410    24993978
4828311    27958026
4944387    30594846
5165169    56290361
Name: item_id, Length: 121, dtype: int64


In [25]:
wiki_items.loc[indexes]

Unnamed: 0,item_id,en_label,en_description,wikipedia_title
20587,23106,Leicestershire,ceremonial county in England (use Q21272890 fo...,Leicestershire
534819,778255,"Moira, Leicestershire",village in United Kingdom,"Moira, Leicestershire"
666909,988954,South Leicestershire,Parliamentary constituency in the United Kingd...,South Leicestershire (UK Parliament constituency)
694720,1051582,North West Leicestershire,Parliamentary constituency in the United Kingd...,North West Leicestershire (UK Parliament const...
992738,1842129,Leicestershire Senior League,,Leicestershire Senior League
...,...,...,...,...
4671827,23468498,Leicestershire Rugby Union,,Leicestershire Rugby Union
4741410,24993978,Leicestershire Police and Crime Commissioner,,Leicestershire Police and Crime Commissioner
4828311,27958026,2017 Leicestershire County Council election,,2017 Leicestershire County Council election
4944387,30594846,Leicestershire County Cup,,Leicestershire County Cup


In [26]:
test_doc_range =  find_doc_range(test)

for start, end in test_doc_range:
    current_doc = test[(test.id >= start) & (test.id <= end)]
    # Ensure NaN values are handled, and join the tokens with a space
    document_string = current_doc['token'].fillna('').str.cat(sep=' ')
    if end < 500:
        print(document_string)


447
-DOCSTART- (947testa CRICKET) CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY .  LONDON 1996-08-30  West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship .  Their stay on top , though , may be short-lived as title rivals Essex , Derbyshire and Surrey all closed in on victory while Kent made up for lost time in their rain-affected match against Nottinghamshire .  After bowling Somerset out for 83 on the opening morning at Grace Road , Leicestershire extended their first innings by 94 runs before being bowled out for 296 with England discard Andy Caddick taking three for 83 .  Trailing by 213 , Somerset got a solid start to their second innings before Simmons stepped in to bundle them out for 174 .  Essex , however , look certain to regain their top spot after Nasser Hussain and Peter Such gave them a firm grip on their match against York

In [27]:
wiki_items

Unnamed: 0,item_id,en_label,en_description,wikipedia_title
0,1,Universe,totality of space and all contents,Universe
1,2,Earth,third planet from the Sun in the Solar System,Earth
2,3,life,matter capable of extracting energy from the e...,Life
3,4,death,permanent cessation of vital functions,Death
4,5,human,"common name of Homo sapiens, unique extant spe...",Human
...,...,...,...,...
5216231,77042017,HR 4523,,HD 102365
5216232,77043280,Charlie Johnston,,Charlie Johnstone
5216233,77231860,Aldo Rossi,musician,Aldo Rossi (musician)
5216234,77240068,Ebenezer Baptist Church,"church in Atlanta, Georgia, USA",Ebenezer Baptist Church


In [28]:
URL_LEN = len(URL)

In [29]:
train.wiki_url.loc[3][URL_LEN:]

'Germany'

In [33]:
test_doc_range =  find_doc_range(test)

for start, end in test_doc_range:
    current_doc = non_na_test[(non_na_test.id >= start) & (non_na_test.id <= end)]
    not_found = current_doc[current_doc['wiki_url'] == '?']
    found = current_doc[current_doc['wiki_url'].str.startswith('http')]
    if end < 500:
        #display(current_doc)
        #display(not_found)
        #display(found)
        titles = found['wikipedia_title'].apply(lambda row: row[URL_LEN:])
        #display(titles)
        # Use `isin` to match titles with 'wikipedia_title' in 'wiki_items' and extract 'item_id'
        vertices = wiki_items[wiki_items['wikipedia_title'].isin(titles)]['item_id'].values
        display(vertices)
        #vertices = wiki_items[wiki_items.wikipedia_title.isin(found.wiki_url.apply(lambda row: row[URL_LEN]))].item_id.values
        #display(vertices)
        

447


array([858684])

In [31]:
wiki_items[wiki_items.item_id.isin(vertices)]

Unnamed: 0,item_id,en_label,en_description,wikipedia_title
13,21,England,"country in north-west Europe, part of the Unit...",England
73,84,London,capital and largest city of the United Kingdom,London
377,408,Australia,country in Oceania,Australia


In [32]:
test_df_mod

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url,wikipedia_title
0,0,-DOCSTART- (947testa CRICKET),,,,
1,1,CRICKET,,,,
2,2,-,,,,
3,3,LEICESTERSHIRE,B,LEICESTERSHIRE,?,
4,4,TAKE,,,,
...,...,...,...,...,...,...
104885,104885,brother,,,,
104886,104886,",",,,,
104887,104887,Bobby,B,Bobby,?,
104888,104888,.,,,,


In [37]:

def preprocess_title(title):
    try:
        return str(title).replace(' ', '').replace('_', '').lower()
    except:
        print(title) 

In [49]:
test_doc_range =  find_doc_range(test)

for start, end in test_doc_range:
    current_doc = non_na_test[(non_na_test.id >= start) & (non_na_test.id <= end)]
    not_found = current_doc[current_doc['wiki_url'] == '?']
    found = current_doc[current_doc['wiki_url'].str.startswith('http')]
    if end < 500:
        #display(current_doc)
        #display(not_found)
        #display(found)
        found_titles = found['wiki_url'].apply(lambda row: row[URL_LEN:]).apply(preprocess_title)
        not_found_per_idx = not_found['full_mention'].apply(lambda row: find_possible_indexes(wiki_items, row))
        display(not_found_per_idx)
        wiki_titles = wiki_items['wikipedia_title'].apply(preprocess_title)
        #display(titles)
        # Use `isin` to match titles with 'wikipedia_title' in 'wiki_items' and extract 'item_id'
        doc_vertices = wiki_items[wiki_titles.isin(found_titles)]['item_id'].values
        
        distances = compute_distance(G, not_found_per_idx.iloc[0],vertices)
        display(distances)
        #vertices = wiki_items[wiki_items.wikipedia_title.isin(found.wiki_url.apply(lambda row: row[URL_LEN]))].item_id.values
        #display(vertices)
        

447


Unnamed: 0,20587,534819,666909,694720,992738,995139,1151338,1203777,1232983,1244653,...,3515131,4819783,579356,2141199,2141200,2141201,2141202,2141203,2141204,2843774
3,23106.0,778255.0,988954.0,1051582.0,1842129.0,1853930.0,2358899.0,2516218.0,2612441.0,2645192.0,...,,,,,,,,,,
16,,,,,,,,,,,...,,,,,,,,,,
19,,,,,,,,,,,...,,,,,,,,,,
101,,,,,,,,,,,...,,,,,,,,,,
143,,,,,,,,,,,...,,,,,,,,,,
169,,,,,,,,,,,...,,,,,,,,,,
233,,,,,,,,,,,...,,,,,,,,,,
309,,,,,,,,,,,...,,,,,,,,,,
364,,,,,,,,,,,...,7816918.0,27831051.0,,,,,,,,
371,,,,,,,,,,,...,,,860227.0,5105679.0,5105681.0,5105682.0,5105684.0,5105685.0,5105686.0,


NodeNotFound: Either source 5865259.0 or target 21 is not in G

In [41]:
wiki_items[wiki_items.item_id.isin(vertices)]

Unnamed: 0,item_id,en_label,en_description,wikipedia_title
13,21,England,"country in north-west Europe, part of the Unit...",England
73,84,London,capital and largest city of the United Kingdom,London
377,408,Australia,country in Oceania,Australia
748457,1155836,The Oval,"cricket ground in Kennington, South London",The Oval
760665,1187032,Headingley Stadium,sports venue,Headingley Stadium
977029,1788018,Derbyshire County Cricket Club,english cricket club,Derbyshire County Cricket Club
1126814,2278396,Paul Johnson,squash player from England,Paul Johnson (squash player)
1271284,2725632,Surrey County Cricket Club,English cricket club,Surrey County Cricket Club
1271297,2725667,Worcestershire County Cricket Club,english cricket team,Worcestershire County Cricket Club
1271309,2725705,Warwickshire County Cricket Club,english cricket club,Warwickshire County Cricket Club


In [47]:
# Check if the source and target nodes exist in the graph
source_node = 20587
target_node = 21

if not G.has_node(source_node):
    print(f"Source node {source_node} not found in the graph.")
    # Optionally add the node: G.add_node(source_node)

if not G.has_node(target_node):
    print(f"Target node {target_node} not found in the graph.")
    # Optionally add the node: G.add_node(target_node)

# If both nodes exist, perform your operation
if G.has_node(source_node) and G.has_node(target_node):
    # Example operation: finding a shortest path
    path = nx.shortest_path(G, source=source_node, target=target_node)
    print(path)


Source node 20587 not found in the graph.


In [54]:
wiki_items[wiki_items['en_label'].str.contains('Phil Simon', case=False, na=False)]

Unnamed: 0,item_id,en_label,en_description,wikipedia_title
3138482,7182408,Phil Simon,non-fiction writer,Phil Simon
