# Import packages

In [1]:
import pywikibot

import pandas as pd
import numpy as np

import random
random.seed(16021997)
from tqdm import tqdm
import concurrent.futures
from itertools import combinations
from datetime import datetime

import requests
import bs4

# Scrape table with list of Wikipedias by number of articles

In [2]:
# Download the Meta-Wiki page that lists all Wikipedia editions.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
res = requests.get("https://meta.wikimedia.org/wiki/List_of_Wikipedias", timeout=30)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')

In [3]:
# Take the first table carrying the "wikitable" class and read its column names
# from the <th> cells of its first row.
table = soup.find('table', class_='wikitable')
if table is None:
    # Fail with a clear message if the page layout changed or the download was not the expected page
    raise ValueError("No table with class 'wikitable' found on the List_of_Wikipedias page")
header_row = table.find('tr')
headers = [header.text.strip() for header in header_row.find_all('th')]
data_wikis = []

In [4]:
# Collect the stripped text of every <td> cell, one list per table row.
# Rows without <td> cells (e.g. the header row) are skipped.
# The list is rebuilt from scratch rather than appended to, so re-running
# this cell does not duplicate rows.
rows_cells = (row.find_all('td') for row in table.find_all('tr'))
data_wikis = [[cell.text.strip() for cell in cells] for cells in rows_cells if cells]

In [5]:
# Build the statistics frame from the scraped rows and discard the '№' column
df_wikis = pd.DataFrame(data_wikis, columns=headers).drop(columns='№')

# These columns are scraped as text; strip the ',' separators and convert to integers
count_columns = ['Articles','All pages','Edits','Admins','Users','Active users','Files','Depth']
for col in count_columns:
    df_wikis[col] = df_wikis[col].str.replace(',', '').astype(int)

In [6]:
# Dropping Cebuano, Swedish, and Waray
excluded_wikis = ['ceb', 'sv', 'war']
df_wikis = df_wikis[~df_wikis.Wiki.isin(excluded_wikis)].reset_index(drop=True)

In [7]:
# Map wiki code -> article count for the first 20 rows of df_wikis
nr_articles_by_wiki = df_wikis.head(20).set_index('Wiki')['Articles'].to_dict()

In [8]:
# Map wiki code -> language name for the first 20 rows of df_wikis
wiki_language_mapping = df_wikis.head(20).set_index('Wiki')['Language'].to_dict()

In [9]:
# Preview the first 20 rows, i.e. the wikis used to build the two lookup dicts above
df_wikis.head(20)

Unnamed: 0,Language,Language (local),Wiki,Articles,All pages,Edits,Admins,Users,Active users,Files,Depth
0,English,English,en,6674881,58432953,1157376624,898,45761953,116318,905325,1190
1,German,Deutsch,de,2812513,7760105,233420350,180,4176594,17079,127881,93
2,French,français,fr,2531823,12614699,205035197,152,4688933,17348,71266,257
3,Dutch,Nederlands,nl,2125560,4542605,64418852,34,1283781,3847,20,18
4,Russian,русский,ru,1924496,7723386,130999307,72,3409670,10292,244090,153
5,Spanish,español,es,1873414,7937147,151650031,59,6883855,15470,0,200
6,Italian,italiano,it,1816174,7747452,133860955,120,2397762,7953,130495,184
7,Egyptian Arabic,مصرى,arz,1618023,2075484,8160822,7,213510,213,1483,0
8,Polish,polski,pl,1572091,3623248,70528643,99,1245247,4307,260,33
9,Japanese,日本語,ja,1378332,4057659,95578498,41,2086006,14284,4513,89


# Extract language connections from a sample of pages

Each wiki is sampled in proportion to its number of articles.

In [10]:
# Sampling fraction applied to each wiki's article count (0.003 = 0.3 %);
# the sampling code below always draws at least one page per wiki.
rescaling_factor = 0.003 # We'll sample only this proportion of pages from each wiki

In [11]:
# Disabled override - presumably for quick test runs on a single wiki; uncomment to use it.
#nr_articles_by_wiki = {'it':100000}

In [12]:
def process_article(site, article):
    """Collect the linked wikis, an identifier and the text length of one article.

    Parameters
    ----------
    site
        pywikibot site the article belongs to; its ``code`` is appended to the
        returned list of wikis.
    article
        Object exposing ``title()`` (e.g. a page yielded by ``site.randompages``).

    Returns
    -------
    tuple
        ``(page_id, wiki_codes, text_length)``:

        * ``page_id`` - result of ``page.data_item()``, or the string
          ``"<site code>:<page title>"`` when the item cannot be retrieved;
        * ``wiki_codes`` - codes of the linked wikis that are keys of
          ``nr_articles_by_wiki``, followed by the article's own wiki code;
        * ``text_length`` - ``len(page.text)``.
    """
    page = pywikibot.Page(site, article.title())
    # Match on the site *code*: the keys of nr_articles_by_wiki are wiki codes and the
    # article's own wiki is appended as site.code below, so using site.lang here
    # could mix two different identifiers.
    # NOTE(review): pywikibot's `lang` may differ from `code` for some wikis - confirm.
    langlinks = [langlink.site.code for langlink in page.langlinks() if langlink.site.code in nr_articles_by_wiki]
    try:
        page_id = page.data_item()
    except Exception:
        # No item could be fetched for this page: fall back to a deterministic key that
        # is unique per article (a wall-clock timestamp could collide across threads).
        page_id = f"{site.code}:{page.title()}"
    return page_id, langlinks + [site.code], len(page.text)

In [13]:
# Slow code version (sequential, one page at a time).
# It is wrapped in a string literal, so it is never executed; the cell only
# displays the text. Unlike the threaded version, it stores just the language
# list per article (no text length).
'''dict_data = {}
for wiki_version, nr_articles in tqdm(nr_articles_by_wiki.items(), desc='Outer loop'):
    site = pywikibot.Site(wiki_version, "wikipedia")
    sampled_articles = list(site.randompages(total=max(int(nr_articles * rescaling_factor), 1), namespaces=[0])) # We want to select at least one article per Wiki

    for article in tqdm(sampled_articles, desc='Inner loop'):
        page = pywikibot.Page(site, article.title())
        langlinks = [langlink.site.lang for langlink in page.langlinks()]
        dict_data[page.data_item()] = langlinks + [wiki_version]'''

'dict_data = {}\nfor wiki_version, nr_articles in tqdm(nr_articles_by_wiki.items(), desc=\'Outer loop\'):\n    site = pywikibot.Site(wiki_version, "wikipedia")\n    sampled_articles = list(site.randompages(total=max(int(nr_articles * rescaling_factor), 1), namespaces=[0])) # We want to select at least one article per Wiki\n\n    for article in tqdm(sampled_articles, desc=\'Inner loop\'):\n        page = pywikibot.Page(site, article.title())\n        langlinks = [langlink.site.lang for langlink in page.langlinks()]\n        dict_data[page.data_item()] = langlinks + [wiki_version]'

In [1]:
# Fast code version: the per-article work (process_article) runs in a thread pool.
# dict_data maps each article identifier to [list of wiki codes, text length].
dict_data = {}

with concurrent.futures.ThreadPoolExecutor() as executor:
    for wiki_version, nr_articles in tqdm(nr_articles_by_wiki.items(), desc='Outer loop'):
        site = pywikibot.Site(wiki_version, "wikipedia")
        # We want to select at least one article per Wiki
        sample_size = max(int(nr_articles * rescaling_factor), 1)
        sampled_articles = list(site.randompages(total=sample_size, namespaces=[0]))

        future_to_article = {executor.submit(process_article, site, article): article for article in sampled_articles}

        for future in tqdm(concurrent.futures.as_completed(future_to_article), desc='Inner loop', total=len(future_to_article)):
            try:
                page_id, languages, text_length = future.result()
            except Exception as error:
                # One failing article must not throw away the whole (long) scrape:
                # report it and carry on with the remaining pages.
                tqdm.write(f"Skipping {future_to_article[future].title()!r} on {wiki_version}: {error!r}")
                continue
            dict_data[page_id] = [languages, text_length]

'dict_data = {}\n\nwith concurrent.futures.ThreadPoolExecutor() as executor:\n    for wiki_version, nr_articles in tqdm(nr_articles_by_wiki.items(), desc=\'Outer loop\'):\n        site = pywikibot.Site(wiki_version, "wikipedia")\n        sampled_articles = list(site.randompages(total=max(int(nr_articles * rescaling_factor), 1), namespaces=[0])) # We want to select at least one article per Wiki\n\n        future_to_article = {executor.submit(process_article, site, article): article for article in sampled_articles}\n\n        for future in tqdm(concurrent.futures.as_completed(future_to_article), desc=\'Inner loop\', total=len(future_to_article)):\n            article = future_to_article[future]\n            result = future.result()\n            dict_data[result[0]] = [result[1], result[2]]'

# Prepare network data (size of nodes and links) and save it

Create df of node sizes:

In [None]:
# Writes the whole df_wikis frame (every remaining wiki, not only the first 20 rows).
# The 'raw/' directory must already exist.
df_wikis.to_csv('raw/Node sizes.csv')

Create df of language pairs:

In [None]:
# Every unordered pair of language names, each starting with a weight of zero
value_combinations = list(combinations(wiki_language_mapping.values(), 2))
df_connections = (
    pd.DataFrame(value_combinations, columns=['lang_1','lang_2'])
    .drop_duplicates()
    .assign(weight=0)
    .reset_index(drop=True)
)

Process the data and add it to the dataframe:

In [None]:
# Each dict_data value is [list of wiki codes, text length]; keep only the list of
# codes, because the weighting loop below iterates over it directly.
# (The original loop target `value[0]` was not a valid unpacking of the items.)
dict_data_final = {key: value[0] for key, value in dict_data.items() if len(value[0]) <= 4} # keeping only articles with at most 4 languages

In [None]:
# Number of sampled articles that passed the "at most 4 languages" filter
len(dict_data_final)

In [None]:
# Weight of a language pair = number of kept articles that exist in both languages.
# Pairs are counted once per article in a dict and then written to the frame in a
# single assignment. This avoids a row-by-row .loc scan of df_connections for every
# article, and makes the cell idempotent (re-running it no longer doubles the weights).
pair_counts = {}
for languages in dict_data_final.values():
    languages_full_names = list({wiki_language_mapping[language] for language in languages})
    for position, first_name in enumerate(languages_full_names):
        # Start at `position` so each unordered pair is visited exactly once
        for second_name in languages_full_names[position:]:
            pair = frozenset((first_name, second_name))
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

df_connections['weight'] = [
    pair_counts.get(frozenset(pair), 0)
    for pair in zip(df_connections['lang_1'], df_connections['lang_2'])
]

In [None]:
# Display the language pairs with their weights
df_connections

In [None]:
# Save the weighted language pairs; the 'raw/' directory must already exist.
df_connections.to_csv('raw/Edges.csv')

Create a second dataframe of connections - long articles only:

In [None]:
# Each dict_data value is [list of wiki codes, text length].
# NOTE(review): this section is titled "long articles only", but the original cell
# applied no length condition at all. The threshold below defaults to 0 (a no-op that
# keeps the previous selection) - set it to the intended minimum text length.
min_article_length = 0
dict_data_final2 = {
    key: value[0]
    for key, value in dict_data.items()
    if len(value[0]) <= 4 and value[1] >= min_article_length
}

In [None]:
# Same weighting as for df_connections, computed on dict_data_final2.
# df_connections2 was never defined in the original notebook, so it is created here
# from the pair list of df_connections with freshly computed weights.
pair_counts2 = {}
for languages in dict_data_final2.values():
    languages_full_names = list({wiki_language_mapping[language] for language in languages})
    for position, first_name in enumerate(languages_full_names):
        # Start at `position` so each unordered pair is visited exactly once
        for second_name in languages_full_names[position:]:
            pair = frozenset((first_name, second_name))
            pair_counts2[pair] = pair_counts2.get(pair, 0) + 1

df_connections2 = df_connections[['lang_1', 'lang_2']].copy()
df_connections2['weight'] = [
    pair_counts2.get(frozenset(pair), 0)
    for pair in zip(df_connections2['lang_1'], df_connections2['lang_2'])
]

In [None]:
# Save the second set of weighted pairs; the 'raw/' directory must already exist.
df_connections2.to_csv('raw/Edges - long articles only.csv')