# Project 3. InfoExplorers.

In [1]:
import heapq
import os
import pickle
import random
import re
import warnings

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Data loading

In [2]:
# Constant paths (all relative to the notebook's working directory)
DATA_FOLDER = 'data/'
PICKLES_FOLDER = 'pickles/'
WIKILITE_FOLDER = DATA_FOLDER + 'wiki_lite/'
SUBMISSIONS_FOLDER = 'submissions/'
CORRECTED_FOLDER = 'corrected/'

# Create the output folders if they don't exist.
# `exist_ok=True` makes the call idempotent and avoids the check-then-create race.
for output_folder in (PICKLES_FOLDER, SUBMISSIONS_FOLDER):
    os.makedirs(output_folder, exist_ok=True)

In [3]:
def _load_cached_csv(pickle_name, csv_path, **read_csv_kwargs):
    """
    Load a DataFrame from its pickle cache, rebuilding the cache from the CSV if needed.

    Parameters
    ----------
    pickle_name: str
        File name of the cache inside PICKLES_FOLDER.
    csv_path: str
        Path of the source CSV/TSV file used when the cache cannot be read.
    **read_csv_kwargs
        Extra keyword arguments forwarded to `pd.read_csv`.

    Returns
    -------
    frame: pd.DataFrame
        The cached frame, or the freshly parsed one (which is then cached).
    """
    pickle_path = PICKLES_FOLDER + pickle_name
    try:
        # NOTE: pickles can execute code on load; these caches are assumed to be
        # produced locally by this notebook.
        return pd.read_pickle(pickle_path)
    except Exception:
        # Missing or unreadable cache: fall back to the raw file (best effort),
        # without swallowing KeyboardInterrupt/SystemExit like a bare `except:` would.
        frame = pd.read_csv(csv_path, **read_csv_kwargs)
        frame.to_pickle(pickle_path)
        return frame


# Train data loading
train_df = _load_cached_csv('train_df.pkl', DATA_FOLDER + 'train.csv')

# Test data loading
test_df = _load_cached_csv('test_df.pkl', DATA_FOLDER + 'test.csv')

# Redirects loading
enwinki_redirects = _load_cached_csv(
    'enwiki_redirects.pkl',
    WIKILITE_FOLDER + 'enwiki_redirects.tsv',
    names=['en_title', 'en_redirect_title'],
    sep='\t',
)

# Aliases loading
item_aliases = _load_cached_csv('item_aliases.pkl', WIKILITE_FOLDER + 'item_aliases.csv')

# Properties loading
properties = _load_cached_csv('property.pkl', WIKILITE_FOLDER + 'property.csv')

# Statements loading
statements = _load_cached_csv('statements.pkl', WIKILITE_FOLDER + 'statements.csv')

# Wiki items loading
wiki_items = _load_cached_csv('wiki_items.pkl', WIKILITE_FOLDER + 'wiki_items.csv')

## Part 1: Using existing datasets

### Data preprocessing

Merging `item_aliases` and `wiki_items` on `item_id` to get the `wikipedia_title` for each `en_alias`:

In [4]:
# Left join: every wiki item is kept, with one row per English alias it has.
merged_wiki_items = pd.merge(wiki_items, item_aliases, on='item_id', how='left')
merged_wiki_items.head()

Unnamed: 0,item_id,en_label,en_description,wikipedia_title,en_alias
0,1,Universe,totality of space and all contents,Universe,Our Universe
1,1,Universe,totality of space and all contents,Universe,The Universe
2,1,Universe,totality of space and all contents,Universe,The Cosmos
3,1,Universe,totality of space and all contents,Universe,cosmos
4,2,Earth,third planet from the Sun in the Solar System,Earth,Blue Planet


In [5]:
# Work on copies so the frames loaded from disk stay untouched
# (`DataFrame.copy()` is a deep copy by default).
test_df_mod, train_df_mod = test_df.copy(), train_df.copy()

In [6]:
# Beginning of the URL to wikipedia
URL = 'http://en.wikipedia.org/wiki/'
# Length of the prefix, used later to slice the page title out of a full URL
LEN_URL = len(URL)

In [7]:
# Lowercased copy of the train mentions, so later lookups are case-insensitive
train_df_mod = train_df_mod.assign(full_mention_lower=train_df_mod['full_mention'].str.lower())

# Keep only the train rows that carry a real link (neither missing nor '--NME--')
has_real_link = train_df_mod['wiki_url'].notnull() & train_df_mod['wiki_url'].ne('--NME--')
train_df_mod = train_df_mod[has_real_link]

# The test mentions are lowercased in place of the original column
test_df_mod = test_df_mod.assign(full_mention=test_df_mod['full_mention'].str.lower())

We transform:
- `en_alias` and `wikipedia_title` in `merged_wiki_items`
- `wikipedia_title` in `wiki_items`
- `en_title` in `enwinki_redirects`

to lowercase to avoid problems with case in future steps

In [8]:
# Add a `<column>_lower` companion column to each lookup table (in place).
for frame, source_column in (
    (merged_wiki_items, 'en_alias'),
    (merged_wiki_items, 'wikipedia_title'),
    (wiki_items, 'wikipedia_title'),
    (enwinki_redirects, 'en_title'),
):
    frame[source_column + '_lower'] = frame[source_column].str.lower()

### Creating dictionaries for faster lookup

In [9]:
DICT_FOLDER = PICKLES_FOLDER + 'dictionaries/'

# The dictionary caches below are written into this folder; create it up front so
# the first `pickle.dump` does not fail on a fresh checkout.
os.makedirs(DICT_FOLDER, exist_ok=True)

In [10]:
# Lowercased alias -> list of Wikipedia titles sharing that alias.
try:
    # `with` closes the file handle even if unpickling fails.
    with open(DICT_FOLDER + 'aliases_dict.pkl', 'rb') as pickle_file:
        aliases_dict = pickle.load(pickle_file)
except Exception:
    # Cache missing or unreadable: rebuild it from the merged items table.
    aliases_dict = (
        merged_wiki_items.groupby('en_alias_lower')['wikipedia_title']
        .agg(list)
        .to_dict()
    )

    os.makedirs(DICT_FOLDER, exist_ok=True)
    with open(DICT_FOLDER + 'aliases_dict.pkl', 'wb') as pickle_file:
        pickle.dump(aliases_dict, pickle_file)

In [11]:
# Lowercased Wikipedia title -> list of original-case titles.
try:
    # `with` closes the file handle even if unpickling fails.
    with open(DICT_FOLDER + 'titles_dict.pkl', 'rb') as pickle_file:
        titles_dict = pickle.load(pickle_file)
except Exception:
    # Cache missing or unreadable: rebuild it from the items table.
    titles_dict = (
        wiki_items.groupby('wikipedia_title_lower')['wikipedia_title']
        .agg(list)
        .to_dict()
    )

    os.makedirs(DICT_FOLDER, exist_ok=True)
    with open(DICT_FOLDER + 'titles_dict.pkl', 'wb') as pickle_file:
        pickle.dump(titles_dict, pickle_file)

In [12]:
# Lowercased train mention -> wiki_url (for repeated mentions the last row wins).
try:
    # `with` closes the file handle even if unpickling fails.
    with open(DICT_FOLDER + 'train_dict.pkl', 'rb') as pickle_file:
        train_dict = pickle.load(pickle_file)
except Exception:
    # Cache missing or unreadable: rebuild it from the filtered train data.
    train_dict = pd.Series(train_df_mod['wiki_url'].values, index=train_df_mod['full_mention_lower']).to_dict()

    os.makedirs(DICT_FOLDER, exist_ok=True)
    with open(DICT_FOLDER + 'train_dict.pkl', 'wb') as pickle_file:
        pickle.dump(train_dict, pickle_file)

In [13]:
# Lowercased `en_title` -> `en_redirect_title` from the redirects table.
try:
    # `with` closes the file handle even if unpickling fails.
    with open(DICT_FOLDER + 'redirects_dict.pkl', 'rb') as pickle_file:
        redirects_dict = pickle.load(pickle_file)
except Exception:
    # Cache missing or unreadable: rebuild it from the redirects table.
    redirects_dict = pd.Series(enwinki_redirects['en_redirect_title'].values,
                               index=enwinki_redirects['en_title_lower']).to_dict()

    os.makedirs(DICT_FOLDER, exist_ok=True)
    with open(DICT_FOLDER + 'redirects_dict.pkl', 'wb') as pickle_file:
        pickle.dump(redirects_dict, pickle_file)

In [14]:
# We want to keep the count of the number of matches we find with aliases and redirects.
# NOTE: `aliases_matching` is incremented for unique exact-title matches as well as
# for unique alias matches (same counting as before this refactoring).
aliases_matching = 0
redirects_matching = 0


def _unique_title_for(token):
    """
    Return the single Wikipedia title matching `token`, or None.

    Exact (lowercased) titles are tried first; aliases are only consulted when the
    token is not a title at all. Ambiguous matches (more than one title) and
    non-string titles (e.g. NaN) yield None.
    """
    wiki_titles = titles_dict.get(token)
    if wiki_titles is None:
        wiki_titles = aliases_dict.get(token)

    if wiki_titles is not None and len(wiki_titles) == 1 and isinstance(wiki_titles[0], str):
        return wiki_titles[0]
    return None


for index, row in tqdm(test_df_mod.iterrows(), total=test_df_mod.shape[0]):
    # Only rows marked '?' need a link (this also skips missing values).
    if row['wiki_url'] != '?':
        continue

    token = row['full_mention']
    train_url = train_dict.get(token)

    if train_url is not None:
        # We found a link in the train data and it is the true identity so we can use it
        test_df_mod.at[index, 'wiki_url'] = train_url
        continue

    wiki_title = _unique_title_for(token)
    if wiki_title is None:
        continue
    aliases_matching += 1

    # Follow the redirect, if any, so the URL points at the canonical page.
    redirect_title = redirects_dict.get(wiki_title.lower())
    if isinstance(redirect_title, str):
        redirects_matching += 1
        wiki_title = redirect_title

    test_df_mod.at[index, 'wiki_url'] = URL + wiki_title.replace(' ', '_')


100%|██████████| 104890/104890 [00:01<00:00, 64066.55it/s]


In [15]:
# `aliases_matching` is incremented for unique exact-title matches as well as for
# unique alias matches, so this figure covers both kinds of match.
print(f"We used {aliases_matching} entity aliases for matching urls")

We used 1919 entity aliases for matching urls


In [16]:
# Number of matched titles that had an entry in `redirects_dict` and were replaced
# by their redirect target.
print(f"We used {redirects_matching} link redirections for matching urls")

We used 380 link redirections for matching urls


Let's see how many tokens we found links for:

In [17]:
# Count the '?' placeholders before and after the first matching pass.
unlinked_before = test_df['wiki_url'].eq('?').sum()
unlinked_after = test_df_mod['wiki_url'].eq('?').sum()

print(f"Previously we had {unlinked_before} tokens without a link")
print(f"Now we have {unlinked_after} tokens without a link")

Previously we had 9166 tokens without a link
Now we have 735 tokens without a link


### Sanitize the first-part results for the second part

In [18]:
# Explicit copy: assigning into a column selection of `test_df_mod` would otherwise
# be a chained assignment (SettingWithCopyWarning).
result_first_part = test_df_mod[['id', 'wiki_url']].copy()

# Keep real links ('http...') and still-unresolved entities ('?');
# everything else (including missing values) becomes 'NOT_FOUND'.
url_text = result_first_part['wiki_url'].astype(str)
keep_url = url_text.str.startswith('http') | (url_text == '?')
result_first_part['wiki_url'] = result_first_part['wiki_url'].where(keep_url, 'NOT_FOUND')


If you want to submit only the first part, uncomment and run the code below to create the required CSV file.

In [19]:
# result_first_part.loc[:, 'wiki_url'] = result_first_part['wiki_url'].apply(lambda x: 'NOT_FOUND' if not (str(x).startswith('http')) else x)
# name = 'tr_title_alias_dict_redirects_url'
# result_first_part.to_csv(SUBMISSIONS_FOLDER + name + '.csv', index=False)

## Part 2: Knowledge Graph

In [20]:
def add_edges(G, connection, progress=True):
    """
    Function adding the edges to the graph.

    Parameters
    ----------
    G: nx.Graph
        Graph to which the edges are added (modified in place).
    connection: pd.DataFrame
        DataFrame containing the connections between the wiki_items. It must
        provide the columns `source_item_id` and `target_item_id`; any other
        column is ignored.
    progress: bool, optional (default=True)
        If True, a progress bar is shown.

    Returns
    -------
    G: nx.Graph
        The same graph object, with the added edges.
    """
    # Columns are accessed by name, so the result no longer depends on the column
    # order of `connection`, and no per-row Series is built.
    edge_pairs = zip(connection['source_item_id'], connection['target_item_id'])

    if progress:
        edge_pairs = tqdm(edge_pairs, total=len(connection))

    # Both modes now add to the graph that was passed in; previously
    # `progress=False` ignored `G` and returned a brand-new graph.
    G.add_edges_from(edge_pairs)
    return G

In [21]:
# File name (inside PICKLES_FOLDER) of the cached knowledge graph
PICKLE_FILENAME = "graph_undirected_full.pkl"

In [22]:
try:
    # Load the graph from the pickle file
    with open(PICKLES_FOLDER + PICKLE_FILENAME, 'rb') as pickle_file:
        G = pickle.load(pickle_file)
except Exception:
    # Cache missing or unreadable: rebuild the graph from the statements table.
    # (`except Exception` keeps the best-effort fallback but lets Ctrl-C through.)
    G = nx.Graph()  # Undirected Graph
    G = add_edges(G, statements, progress=True)

    # Save graph pickle to the file
    with open(PICKLES_FOLDER + PICKLE_FILENAME, 'wb') as pickle_file:
        pickle.dump(G, pickle_file)

In [23]:
# Quick sanity check on the size of the loaded graph.
num_nodes, num_edges = G.number_of_nodes(), G.number_of_edges()

print(f"Number of nodes: {num_nodes}", f"Number of edges: {num_edges}", sep="\n")

Number of nodes: 4906271
Number of edges: 24528246


In [24]:
# Taking the result from the first part
# `DataFrame.update` modifies `test_df` in place: the non-NA values of
# `result_first_part` (columns 'id' and 'wiki_url') overwrite the matching cells,
# aligned on the row index.
test_df.update(result_first_part)
display(test_df)

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url
0,0,-DOCSTART- (947testa CRICKET),,,NOT_FOUND
1,1,CRICKET,,,NOT_FOUND
2,2,-,,,NOT_FOUND
3,3,LEICESTERSHIRE,B,LEICESTERSHIRE,http://en.wikipedia.org/wiki/Leicestershire_Co...
4,4,TAKE,,,NOT_FOUND
...,...,...,...,...,...
104885,104885,brother,,,NOT_FOUND
104886,104886,",",,,NOT_FOUND
104887,104887,Bobby,B,Bobby,http://en.wikipedia.org/wiki/Bobby
104888,104888,.,,,NOT_FOUND


Fetch item_ids for each entity:

In [25]:
# Slim view of the items table, plus a reusable `.str` accessor over its lowercased titles.
wiki_item = wiki_items.loc[:, ['item_id', 'en_label', 'wikipedia_title']]
lower_case_wiki_item_titles = wiki_item['wikipedia_title'].str.lower().str

# Turn each URL back into a page title, join on it to fetch the item_id,
# then drop the helper columns again.
linked_titles = test_df['wiki_url'].str[LEN_URL:].str.replace('_', ' ')
test_df = (
    test_df.assign(wikipedia_title=linked_titles)
    .merge(wiki_item, on='wikipedia_title', how='left')
    .drop(columns=['wikipedia_title', 'en_label'])
)

### Functions 

#### Distance functions

In [26]:
def get_dist(doc_assigned_ids, candidate_id, fill_na=9999, subset_size=10):
    """
    Function calculating the distance between the full mention and a candidate.

    Parameters
    ----------
    doc_assigned_ids: list
        List containing the already assigned entities.
    candidate_id: float
        Candidate id.
    fill_na: float, optional (default=9999)
        Distance used when no path exists or a node is missing from the graph,
        and returned when there is no assigned entity to compare against.
    subset_size: int, optional (default=10)
        Maximum number of assigned entities (randomly drawn) used for the average.

    Returns
    -------
    distance: float
        Average distance between the candidate and the sampled assigned entities
        in the global graph `G`.
    """
    # Retrieve only a random subset of ids of the document to bound the cost
    subset_certain = list(doc_assigned_ids)
    random.shuffle(subset_certain)
    subset_certain = subset_certain[:subset_size]

    if not subset_certain:
        return fill_na

    distance_list = []
    for assigned_entity in subset_certain:
        try:
            shortest = nx.shortest_path_length(G, source=assigned_entity, target=candidate_id)
            distance_list.append(shortest)
        except (nx.NetworkXNoPath, nx.NodeNotFound):
            # Disconnected or unknown node: count it as "very far"
            distance_list.append(fill_na)

    # Divide by the number of distances actually collected, so the result is a true
    # average even when the document has fewer than `subset_size` entities.
    return sum(distance_list) / len(distance_list)

In [27]:
def get_all_dist(candidates, doc_assigned_ids):
    """
    Function calculating the distance between the full mention and all the candidates.

    Parameters
    ----------
    candidates: pd.DataFrame
        DataFrame containing the candidates. It must provide the columns
        `item_id` and `wikipedia_title`; any other column is ignored.
    doc_assigned_ids: list
        List of the already assigned entities.

    Returns
    -------
    distances: list
        List of `(wikipedia_title, item_id, distance)` tuples, one per candidate,
        in the order of the rows of `candidates`.
    """
    # Columns are read by name instead of unpacking each row positionally, so the
    # function does not depend on the column order of `candidates`.
    return [
        (candidate_title, candidate_id, get_dist(doc_assigned_ids, candidate_id))
        for candidate_id, candidate_title in zip(candidates['item_id'], candidates['wikipedia_title'])
    ]

In [28]:
def find_best_candidate(candidate_df, found_ids):
    """
    Pick the candidate that is strictly closest to the already-linked entities.

    Parameters
    -------
    candidate_df: pd.DataFrame
        dataframe containing all url candidates for a mention
    found_ids: pd.Series
        series containing all ids of a given document

    Returns
    -------
    best_candidate_item_id: int
        item_id of the winning candidate, or NaN when there is no candidate or
        the two best candidates are tied.
    best_candidate_url: str
        wikipedia url of the winning candidate, or NaN in the same cases.
    """
    scored = get_all_dist(candidate_df, found_ids)

    # Nothing to choose from.
    if len(scored) == 0:
        return np.nan, np.nan

    # A lone candidate is accepted without looking at its distance.
    if len(scored) == 1:
        (title, item_id, _), = scored
        return item_id, title_to_wiki_url(title)

    # Several candidates: the best one must beat the runner-up strictly,
    # otherwise the choice is ambiguous and NaNs are returned.
    best, runner_up = heapq.nsmallest(2, scored, key=lambda entry: entry[-1])
    if best[-1] < runner_up[-1]:
        title, item_id, _ = best
        return item_id, title_to_wiki_url(title)

    return np.nan, np.nan

#### String manipulation functions

In [29]:
def find_equal_string_candidates(mention, full_mention_found) -> list[str]:
    """
    Collect the urls of the already-linked mentions of a document that contain the given mention.

    Parameters
    -------
    mention: str
        current mention for which we try to find a url in a given document
    full_mention_found: dict
        dictionary mapping the mentions already linked in the same document to their urls

    Returns
    -------
    candidate_urls: list
        distinct urls whose mention contains `mention` as a substring.

    -------
    Example
        Let's say the document references "roger federer" once, already mapped to
        https://en.wikipedia.org/wiki/Roger_Federer. A later mention "roger" or
        "federer" is contained in it, so that url is returned as a candidate.
    """
    # A set comprehension removes duplicate urls before converting to a list.
    return list({
        url
        for found_mention, url in full_mention_found.items()
        if mention in found_mention
    })

In [30]:
def find_substring_candidates(mention, saved_candidates):
    """
    Function retrieving list of possible candidates for the given mention thanks to the wiki_items dataframe.

    Parameters
    -------
    mention: str
        current mention for which we try to find a url in a given document
    saved_candidates: dict
        dictionary consisting of already found mappings to candidates; it is
        used as a cache and updated in place.

    Returns
    -------
    candidates: pd.DataFrame
        rows of `wiki_item` whose lowercased wikipedia title contains the
        mention as a whole word/phrase.

    -------
    Example
        Let's say the document references Roger once, and we don't know which Roger it is
        This function allows finding any entity containing the word "Roger", and return their respective attributes (Roger Federer, Roger Moore, Roger Waters ...)
    """
    if mention in saved_candidates:
        return saved_candidates[mention]

    # The mention is literal text, not a pattern: escape it so characters such as
    # '.', '+' or '(' neither change the meaning of the regex nor make it invalid.
    # The look-arounds behave like `\b` for mentions that start/end with a word
    # character, and still work when the mention starts/ends with punctuation.
    pattern = r'(?<!\w){}(?!\w)'.format(re.escape(str(mention)))
    candidates = wiki_item.loc[lower_case_wiki_item_titles.contains(pattern, na=False)]
    saved_candidates[mention] = candidates
    return candidates

In [31]:
def title_to_wiki_url(title):
    """
    Build the wikipedia url of a page from its title.

    Parameters
    -------
    title: str
        wikipedia title

    Returns
    -------
    wikipedia_url: str
        the URL prefix followed by the title, with every space turned into an underscore
    """
    page_name = title.replace(' ', '_')
    return f"{URL}{page_name}"

#### Document manipulation functions

In [32]:
def find_doc_range(df):
    """
    Function returning the row range of every document together with the number of documents.

    A document starts at each row whose `token` contains 'DOCSTART'.

    Parameters
    ----------
    df: pd.DataFrame
        DataFrame containing all documents, with the columns `id` and `token`.
        The `id` values are used as positional bounds by the caller, so they are
        expected to match the row positions.

    Returns
    -------
    docs_range: zip
        pairs `(start_id, end_id)` where `end_id` is exclusive, i.e. the rows of
        a document are `df.iloc[start_id:end_id]`.
    docs_number: int
        number of documents found.
    """
    # Vectorised DOCSTART detection; `astype(str)` makes missing or non-string
    # tokens harmless instead of raising.
    is_docstart = df['token'].astype(str).str.contains('DOCSTART', regex=False)
    start_ids = df.loc[is_docstart, 'id'].values

    # Each document ends where the next one starts, and the last one ends at
    # len(df). Ends are exclusive throughout; the previous `start - 1` bound made
    # `iloc[start:end]` drop the last row of every document but the final one.
    end_ids = np.append(start_ids[1:], len(df))

    docs_range = zip(start_ids, end_ids)
    docs_number = len(start_ids)
    return docs_range, docs_number

In [33]:
def split_document_findings(document_dataframe):
    """
    Separate the entity rows of a document according to whether a url was already found for them.

    Parameters
    -------
    document_dataframe: pd.DataFrame
        document containing all the sentences and entities to match

    Returns
    -------
    document:  pd.DataFrame
        rows of the document whose `wiki_url` is not 'NOT_FOUND'
    found_links: pd.DataFrame
        rows of `document` whose `wiki_url` is anything but '?'
    not_found_links: pd.DataFrame
        rows of `document` whose `wiki_url` is still '?'
    """
    is_entity = document_dataframe['wiki_url'].ne('NOT_FOUND')
    document = document_dataframe[is_entity]

    is_unresolved = document['wiki_url'].eq('?')
    return document, document[~is_unresolved], document[is_unresolved]

In [34]:
def find_within_doc_similarity(found_links, not_found_links):
    """
    This function attempts to link unresolved mentions using the mentions already linked in the same document.

    An unresolved mention is linked when exactly one distinct url belongs to
    already-linked mentions that contain it as a substring.

    Parameters
    ----------
    found_links: pd.DataFrame
        A dataframe of found links (columns `full_mention` and `wiki_url` are used).
    not_found_links: pd.DataFrame
        A dataframe of links that were not found.

    Returns
    -------
    updated_found_links: pd.DataFrame
        `found_links` followed by the newly linked rows.
    updated_not_found_links: pd.DataFrame
        The rows of `not_found_links` that are still unresolved.

    Example
    -------
    Let's say the document references Roger Federer once, and it is already mapped to the link https://en.wikipedia.org/wiki/Roger_Federer.
    A later mention "Roger" or "Federer" is contained in it and is mapped to the same link.
    """
    # Build the mention -> url lookup once instead of once per unresolved mention.
    mention_to_url = dict(zip(found_links['full_mention'], found_links['wiki_url']))

    candidate_urls = [
        find_equal_string_candidates(mention, mention_to_url)
        for mention in not_found_links['full_mention']
    ]
    has_single_url = pd.Series(
        [len(urls) == 1 for urls in candidate_urls],
        index=not_found_links.index,
        dtype=bool,
    )

    # For single candidates, retrieve the value and attribute it.
    # `.copy()` keeps the assignment off a slice of the caller's frame.
    single_candidates = not_found_links[has_single_url].copy()
    single_candidates['wiki_url'] = [urls[0] for urls in candidate_urls if len(urls) == 1]

    new_found_links = pd.concat([found_links, single_candidates])
    new_not_found_links = not_found_links[~has_single_url]

    return new_found_links, new_not_found_links

In [35]:
def single_row_df_process(single_row_df):
    """
    Helper turning the first (and only expected) row of a candidate dataframe into an (item_id, wiki_url) record.

    Parameters
    -------
    single_row_df: pd.DataFrame
        dataframe with a single row, providing `item_id` and `wikipedia_title`.

    Returns
    -------
    row: pd.Series
        series with the entries `item_id` and `wiki_url`, the url being built
        from the row's wikipedia title.
    """
    first_row = single_row_df.iloc[0]
    # Build the result directly instead of adding a key to the extracted row.
    return pd.Series(
        {
            'item_id': first_row['item_id'],
            'wiki_url': title_to_wiki_url(first_row['wikipedia_title']),
        },
        name=first_row.name,
    )

In [36]:
def find_substring_similarity(found_links, not_found_links):
    """
    Function selecting the best candidate for each entity with no link found, using the wiki_items dataframe.

    Candidates are the wiki items whose title contains the mention. A mention
    with exactly one candidate is mapped to it; a mention with several
    candidates (at most MAX_CANDIDATES) is mapped to the candidate closest, in
    the knowledge graph, to the entities already linked in the document.

    Parameters
    ----------
    found_links: pd.DataFrame
        A dataframe of found links (must contain the `item_id` column).
    not_found_links: pd.DataFrame
        A dataframe of links that were not found.

    Returns
    -------
    updated_found_links: pd.DataFrame
        `found_links` followed by the rows that could be resolved, with their
        `item_id` and `wiki_url` filled in.
    updated_not_found_links: pd.DataFrame
        The rows of `not_found_links` that are still unresolved.
    """
    # Find all possible candidates from the wiki_item dataframe: one candidate
    # dataframe per unresolved mention, in row order.
    candidate_frames = [
        find_substring_candidates(mention, saved_candidates)
        for mention in not_found_links['full_mention']
    ]
    candidate_counts = pd.Series(
        [len(frame) for frame in candidate_frames],
        index=not_found_links.index,
        dtype='int64',
    )

    # PHASE ONE : Single possibility elimination.
    # If the wiki items have only a single candidate to propose, simply map it.
    is_single = candidate_counts == 1
    single_candidates = not_found_links[is_single].copy()
    if len(single_candidates) != 0:
        resolved_rows = [
            single_row_df_process(frame)
            for frame, single in zip(candidate_frames, is_single)
            if single
        ]
        # Assign column by column: tuple-unpacking the DataFrame produced by
        # `Series.apply` would yield its column labels, not its columns.
        single_candidates['item_id'] = [row['item_id'] for row in resolved_rows]
        single_candidates['wiki_url'] = [row['wiki_url'] for row in resolved_rows]

    # Entities known so far in this document (already found + single matches).
    found_with_singles = pd.concat([found_links, single_candidates])
    known_ids = found_with_singles.loc[found_with_singles['item_id'].notna(), 'item_id']

    # PHASE TWO : Multiple possibility elimination.
    # Filter for multi candidates, due to time remove entities with too many candidates.
    is_multi = (candidate_counts > 1) & (candidate_counts <= MAX_CANDIDATES)
    multi_candidates = not_found_links[is_multi].copy()

    # Using the graph, find the best candidate for each mention, or NaN if undecidable.
    best_matches = [
        find_best_candidate(frame, known_ids)
        for frame, multi in zip(candidate_frames, is_multi)
        if multi
    ]
    if best_matches:
        multi_candidates['item_id'] = [item_id for item_id, _ in best_matches]
        multi_candidates['wiki_url'] = [url for _, url in best_matches]
    resolved_multi = multi_candidates[multi_candidates['wiki_url'].notna()]

    # Update the link-entity maps. Single-candidate matches are kept in the
    # result (they used to be dropped by the final concat).
    new_found_links = pd.concat([found_links, single_candidates, resolved_multi])
    new_not_found_links = not_found_links[~not_found_links['id'].isin(new_found_links['id'])]

    return new_found_links, new_not_found_links

Define the global constants and variables used for graph decisions.

In [37]:
# Mentions with more candidate pages than this are skipped by the graph-based
# disambiguation in `find_substring_similarity`.
MAX_CANDIDATES = 50
# NOTE(review): not referenced anywhere in the visible cells — confirm it is still needed.
MAX_FETCHING_TRIES = 3
# Seed for the `random` module, which `get_dist` uses to shuffle the entity ids.
SEED = 42
# NOTE(review): not referenced anywhere in the visible cells — confirm it is still needed.
OLD_SIZE = np.inf
# Cache: mention -> candidate dataframe, filled by `find_substring_candidates`.
saved_candidates = {}

random.seed(SEED)

In [38]:
# Lowercase the mentions of `test_df`: the candidate search below matches them
# against lowercased wikipedia titles.
test_df['full_mention'] = test_df['full_mention'].str.lower()

In [39]:
# Silence all warnings from here on (pandas is noisy inside the loop).
warnings.filterwarnings(action="ignore")
docs_range, docs_count = find_doc_range(test_df)

for start_id, end_id in tqdm(docs_range, total=docs_count):

    doc_rows = test_df.iloc[start_id:end_id]

    # We are interested in the rows that should have a link
    current_document, found_links, not_found_links = split_document_findings(doc_rows)

    # Nothing to resolve, or nothing to resolve it with: skip the document.
    if len(not_found_links) == 0 or len(found_links) == 0:
        continue

    # Step 1: reuse the links of other mentions of the same document.
    found_links, not_found_links = find_within_doc_similarity(found_links, not_found_links)

    # Step 2: title search + graph distance, only while something is left on both sides.
    if len(not_found_links) != 0 and len(found_links) != 0:
        found_links, not_found_links = find_substring_similarity(found_links, not_found_links)

    # Write whatever was resolved back into the full frame.
    test_df.update(found_links)
    

447it [08:32,  1.15s/it]


In [40]:
# Display the frame as it stands after the per-document loop above.
test_df

Unnamed: 0,id,token,entity_tag,full_mention,wiki_url,item_id
0,0,-DOCSTART- (947testa CRICKET),,,NOT_FOUND,
1,1,CRICKET,,,NOT_FOUND,
2,2,-,,,NOT_FOUND,
3,3,LEICESTERSHIRE,B,leicestershire,http://en.wikipedia.org/wiki/Leicestershire_Co...,3229147.0
4,4,TAKE,,,NOT_FOUND,
...,...,...,...,...,...,...
104885,104885,brother,,,NOT_FOUND,
104886,104886,",",,,NOT_FOUND,
104887,104887,Bobby,B,bobby,http://en.wikipedia.org/wiki/Bobby,289262.0
104888,104888,.,,,NOT_FOUND,


In [42]:
# Anything that is not a real link (including '?' and missing values) is submitted as 'NOT_FOUND'.
url_text = test_df['wiki_url'].astype(str)
test_df['wiki_url'] = test_df['wiki_url'].where(url_text.str.startswith('http'), 'NOT_FOUND')

# Write the submission into the dedicated folder rather than the working
# directory, consistent with SUBMISSIONS_FOLDER defined at the top of the notebook.
name = 'submission_new_3'
os.makedirs(SUBMISSIONS_FOLDER, exist_ok=True)
test_df[['id', 'wiki_url']].to_csv(SUBMISSIONS_FOLDER + name + '.csv', index=False)