In [105]:
'''
Task: map mention URLs that redirect to another Wikipedia page onto the new page ID
Resources: mention_url.csv
           mention.csv
           wiki_id_qid.csv
Outputs: mentions_redirect_qid.csv, intermediate table with the redirect lookup results
         mentions_url_including_redirect.csv, all mention URLs including the redirect URLs
'''
import wikipedia
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import time
import requests
import os

# Query the German-language ('de') Wikipedia and enable the wikipedia package's rate limiting
wikipedia.set_lang('de')
wikipedia.set_rate_limiting(True)

# Derive the project root from the working directory by dropping every path
# component that is exactly 'code'. A plain str.replace('code', '') would also
# mangle unrelated components that merely contain those letters (e.g. 'vscode').
# NOTE(review): presumably the notebook is run from <root>/code — confirm.
_drive, _tail = os.path.splitdrive(os.getcwd())
_kept_parts = [part for part in _tail.split(os.sep) if part != 'code']
# `or os.sep` keeps the result absolute when only the filesystem root is left
current_directory = _drive + (os.sep.join(_kept_parts) or os.sep)
data_path = os.path.join(current_directory, 'data')
output_path = os.path.join(current_directory, 'outputs')
def get_page_id(title):
    """Look up the Wikipedia page id for ``title``, following redirects.

    Parameters
    ----------
    title : str
        Page title handed to ``wikipedia.page`` as-is (``auto_suggest`` is
        disabled, so the title is not rewritten before the lookup).

    Returns
    -------
    The ``pageid`` attribute of the resolved page, or ``None`` when the lookup
    raises an ordinary exception.
    """
    try:
        page = wikipedia.page(title=title, redirect=True, auto_suggest=False)
        return page.pageid
    except Exception:
        # Best-effort lookup: a failed title is recorded as None instead of
        # aborting the whole batch. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit can still stop a long-running job.
        return None

# Read the three input tables from the outputs folder
mention_url_csv = os.path.join(output_path, 'mention_url.csv')
mention_csv = os.path.join(output_path, 'mention.csv')
wiki_id_qid_csv = os.path.join(output_path, 'wiki_id_qid.csv')

df_mention_url = pd.read_csv(mention_url_csv)
df_mention = pd.read_csv(mention_csv)
df_qid = pd.read_csv(wiki_id_qid_csv)

# Left-join mentions with their URL information on 'url', then attach the QID via 'page id'
merged_df = df_mention.merge(df_mention_url, on='url', how='left')
merged_df1 = merged_df.merge(df_qid, on='page id', how='left')

In [109]:
# Keep only the rows whose 'qid' is missing and collect their distinct mentions
qid_is_missing = merged_df1['qid'].isna()
merged_df_na = merged_df1[qid_is_missing]
unique_mentions = set(merged_df_na['mention'])
titles = list(unique_mentions)

In [72]:
# `titles` was already built in the previous cell from merged_df_na['mention'];
# rebuilding it here was redundant, so only report how many distinct mentions it holds.
len(titles)

122498

In [56]:
# Resolve the page id of every title in parallel worker threads.
# executor.map yields results in the order of `titles`, so the two lists line up.
with ThreadPoolExecutor() as pool:
    lookups = pool.map(get_page_id, titles)
    results = list(tqdm(lookups, total=len(titles)))

# One [mention, new page id] row per looked-up title
rows = [[title, page_id] for title, page_id in zip(titles, results)]

df = pd.DataFrame(rows, columns=['mention', 'new page id'])
# Attach the looked-up page id to every row that lacked a QID
merged_df = merged_df_na.merge(df, on='mention', how='left')

In [113]:
# URLs of all rows in merged_df, and how many there are
urls = merged_df['url'].tolist()
len(urls)

127037

In [79]:
def check_url_existence(url, timeout=10, delay=1):
    """Send a HEAD request to ``url`` and report the HTTP status code.

    Parameters
    ----------
    url : str
        Address to probe.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block its worker thread indefinitely.
    delay : float, optional
        Seconds to sleep before sending the request (defaults to the original
        fixed pause of one second).

    Returns
    -------
    int or None
        The response status code, or ``None`` when the request raised a
        ``requests.RequestException`` (which includes timeouts).
    """
    # Pause before every request to slow down the overall request rate
    time.sleep(delay)
    try:
        response = requests.head(url, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code == 429:
        # Make "too many requests" responses visible while the batch is running
        print(f"Error occurred while fetching {url}: Status Code {429}")
    return response.status_code
# Probe the URLs of merged_df (the rows without a QID), not of merged_df1:
# the resulting status list is later stored as a column of merged_df, so it
# must contain exactly one entry per merged_df row.
urls = list(merged_df['url'])
with ThreadPoolExecutor() as executor:
    # executor.map keeps the input order, so error[i] belongs to urls[i]
    error = list(tqdm(executor.map(check_url_existence, urls), total=len(urls)))

100%|██████████| 127037/127037 [2:42:41<00:00, 13.01it/s]   


In [114]:
# Store the freshly checked HTTP status next to each row
merged_df = merged_df.assign(**{'new error': error})

In [115]:
# Display merged_df, which now carries the 'new page id' and 'new error' columns
merged_df

Unnamed: 0,mention,url,page id,error,qid,new page id,new error
0,Aluminiumgehalt,https://de.wikipedia.org/wiki/Aluminiumgehalt,0.0,200.0,,,404
1,hydrophil,https://de.wikipedia.org/wiki/hydrophil,76478.0,200.0,,2256,301
2,EU,https://de.wikipedia.org/wiki/EU,949615.0,200.0,,7604,200
3,"4-""n""-Nonylphenolen",https://de.wikipedia.org/wiki/4-n-Nonylphenol,12381567.0,200.0,,,200
4,östrogen,https://de.wikipedia.org/wiki/%C3%B6strogen,11875637.0,200.0,,11633,200
...,...,...,...,...,...,...,...
127032,Dalvirus,https://de.wikipedia.org/wiki/Dalvirus,0.0,200.0,,,404
127033,Deevirus,https://de.wikipedia.org/wiki/Deevirus,0.0,200.0,,,404
127034,Dobrovirus,https://de.wikipedia.org/wiki/Dobrovirus,0.0,200.0,,,404
127035,Thurisazvirus,https://de.wikipedia.org/wiki/Thurisazvirus,0.0,200.0,,,404


In [80]:
# Write the current state of merged_df to the outputs folder (index column omitted)
redirect_checkpoint_csv = os.path.join(output_path, 'mentions_redirect_qid.csv')
merged_df.to_csv(redirect_checkpoint_csv, index=False)

In [117]:
# Pick the page id to use for every row:
#   - the URL answered 200 or 301 AND a redirected page id was found -> use the new id
#   - otherwise                                                        -> keep the old id
# pd.notna guards both inputs: the status is missing when the request failed, and the
# new page id is missing (None or NaN) when the Wikipedia lookup found nothing.
rows = []
row_values = zip(merged_df['new error'], merged_df['new page id'], merged_df['page id'])
for status, redirected_id, original_id in tqdm(row_values, total=len(merged_df)):
    url_resolves = pd.notna(status) and int(status) in (200, 301)
    if url_resolves and pd.notna(redirected_id):
        rows.append(redirected_id)
    else:
        rows.append(original_id)

100%|██████████| 127037/127037 [00:01<00:00, 114083.89it/s]


In [118]:
# Attach the chosen page id as its own column and show the result
merged_df = merged_df.assign(**{'new page id 1': rows})
merged_df

Unnamed: 0,mention,url,page id,error,qid,new page id,new error,new page id 1
0,Aluminiumgehalt,https://de.wikipedia.org/wiki/Aluminiumgehalt,0.0,200.0,,,404,0.0
1,hydrophil,https://de.wikipedia.org/wiki/hydrophil,76478.0,200.0,,2256,301,2256
2,EU,https://de.wikipedia.org/wiki/EU,949615.0,200.0,,7604,200,7604
3,"4-""n""-Nonylphenolen",https://de.wikipedia.org/wiki/4-n-Nonylphenol,12381567.0,200.0,,,200,12381567.0
4,östrogen,https://de.wikipedia.org/wiki/%C3%B6strogen,11875637.0,200.0,,11633,200,11633
...,...,...,...,...,...,...,...,...
127032,Dalvirus,https://de.wikipedia.org/wiki/Dalvirus,0.0,200.0,,,404,0.0
127033,Deevirus,https://de.wikipedia.org/wiki/Deevirus,0.0,200.0,,,404,0.0
127034,Dobrovirus,https://de.wikipedia.org/wiki/Dobrovirus,0.0,200.0,,,404,0.0
127035,Thurisazvirus,https://de.wikipedia.org/wiki/Thurisazvirus,0.0,200.0,,,404,0.0


In [119]:
# Save merged_df, now including 'new page id 1', under the same file name (index column omitted)
redirect_table_csv = os.path.join(output_path, 'mentions_redirect_qid.csv')
merged_df.to_csv(redirect_table_csv, index=False)

In [120]:
# Build [mention, url, page id, status] rows, converting the page id to int where possible.
rows = []
row_values = zip(
    merged_df['mention'],
    merged_df['url'],
    merged_df['new page id 1'],
    merged_df['new error'],
)
for mention, url, raw_page_id, status in tqdm(row_values, total=len(merged_df)):
    try:
        page_id = int(raw_page_id)
    except (TypeError, ValueError, OverflowError):
        # Missing or non-numeric ids (None, NaN, unparsable text) are kept unchanged;
        # only conversion errors are caught so unrelated failures are not hidden.
        page_id = raw_page_id
    rows.append([mention, url, page_id, status])

100%|██████████| 127037/127037 [00:04<00:00, 30418.09it/s]


In [121]:
# Turn the collected rows into a frame with the same column names as the original tables
result_columns = ['mention', 'url', 'page id', 'error']
df_re = pd.DataFrame(rows, columns=result_columns)
df_re

Unnamed: 0,mention,url,page id,error
0,Aluminiumgehalt,https://de.wikipedia.org/wiki/Aluminiumgehalt,0.0,404
1,hydrophil,https://de.wikipedia.org/wiki/hydrophil,2256.0,301
2,EU,https://de.wikipedia.org/wiki/EU,7604.0,200
3,"4-""n""-Nonylphenolen",https://de.wikipedia.org/wiki/4-n-Nonylphenol,12381567.0,200
4,östrogen,https://de.wikipedia.org/wiki/%C3%B6strogen,11633.0,200
...,...,...,...,...
127032,Dalvirus,https://de.wikipedia.org/wiki/Dalvirus,0.0,404
127033,Deevirus,https://de.wikipedia.org/wiki/Deevirus,0.0,404
127034,Dobrovirus,https://de.wikipedia.org/wiki/Dobrovirus,0.0,404
127035,Thurisazvirus,https://de.wikipedia.org/wiki/Thurisazvirus,0.0,404


In [122]:
# Look up the QID again, this time using the updated 'page id' values
merged_df_qid = df_re.merge(df_qid, on='page id', how='left')
merged_df_qid

Unnamed: 0,mention,url,page id,error,qid
0,Aluminiumgehalt,https://de.wikipedia.org/wiki/Aluminiumgehalt,0.0,404,
1,hydrophil,https://de.wikipedia.org/wiki/hydrophil,2256.0,301,Q41853520
2,EU,https://de.wikipedia.org/wiki/EU,7604.0,200,Q458
3,"4-""n""-Nonylphenolen",https://de.wikipedia.org/wiki/4-n-Nonylphenol,12381567.0,200,
4,östrogen,https://de.wikipedia.org/wiki/%C3%B6strogen,11633.0,200,Q277954
...,...,...,...,...,...
127032,Dalvirus,https://de.wikipedia.org/wiki/Dalvirus,0.0,404,
127033,Deevirus,https://de.wikipedia.org/wiki/Deevirus,0.0,404,
127034,Dobrovirus,https://de.wikipedia.org/wiki/Dobrovirus,0.0,404,
127035,Thurisazvirus,https://de.wikipedia.org/wiki/Thurisazvirus,0.0,404,


In [123]:
# Rows of the full table that already had a QID before the redirect lookup
qid_is_present = merged_df1['qid'].notna()
merged_df_nna = merged_df1.loc[qid_is_present]
merged_df_nna

Unnamed: 0,mention,url,page id,error,qid
0,Getreide,https://de.wikipedia.org/wiki/Getreide,1944.0,200.0,Q12117
1,Gattung,https://de.wikipedia.org/wiki/Gattung%20%28Bio...,4003287.0,200.0,Q34740
2,Fingerhirsen,https://de.wikipedia.org/wiki/Fingerhirsen,3719543.0,200.0,Q163915
3,Familie,https://de.wikipedia.org/wiki/Familie%20%28Bio...,1704.0,200.0,Q35409
4,Süßgräser,https://de.wikipedia.org/wiki/S%C3%BC%C3%9Fgr%...,4951.0,200.0,Q43238
...,...,...,...,...,...
468358,Hyperzyklus,https://de.wikipedia.org/wiki/Hyperzyklus,585746.0,200.0,Q4138754
468359,Quasispezies,https://de.wikipedia.org/wiki/Quasispezies,1011909.0,200.0,Q456225
468360,Java,https://de.wikipedia.org/wiki/Java,2521.0,200.0,Q1100132
468361,Spreitenquerschnitt,https://de.wikipedia.org/wiki/Blattform%23Spre...,1528179.0,200.0,Q521246


In [124]:
# Stack the rows that already had a QID on top of the re-resolved rows,
# renumber the index, and write the combined table without the index column
final_csv_path = os.path.join(output_path, 'mentions_url_including_redirect.csv')
frames_to_stack = [merged_df_nna, merged_df_qid]
concatenated_df = pd.concat(frames_to_stack, ignore_index=True)
concatenated_df.to_csv(final_csv_path, index=False)

In [126]:
# Display the combined table that was written to mentions_url_including_redirect.csv
concatenated_df

Unnamed: 0,mention,url,page id,error,qid
0,Getreide,https://de.wikipedia.org/wiki/Getreide,1944.0,200.0,Q12117
1,Gattung,https://de.wikipedia.org/wiki/Gattung%20%28Bio...,4003287.0,200.0,Q34740
2,Fingerhirsen,https://de.wikipedia.org/wiki/Fingerhirsen,3719543.0,200.0,Q163915
3,Familie,https://de.wikipedia.org/wiki/Familie%20%28Bio...,1704.0,200.0,Q35409
4,Süßgräser,https://de.wikipedia.org/wiki/S%C3%BC%C3%9Fgr%...,4951.0,200.0,Q43238
...,...,...,...,...,...
468358,Dalvirus,https://de.wikipedia.org/wiki/Dalvirus,0.0,404.0,
468359,Deevirus,https://de.wikipedia.org/wiki/Deevirus,0.0,404.0,
468360,Dobrovirus,https://de.wikipedia.org/wiki/Dobrovirus,0.0,404.0,
468361,Thurisazvirus,https://de.wikipedia.org/wiki/Thurisazvirus,0.0,404.0,
