# Import packages and define sampling function

In [33]:
import pywikibot

import pandas as pd
import numpy as np

import random
random.seed(16021997)
from tqdm import tqdm
import concurrent.futures
from itertools import combinations

import requests
import bs4

# Scrape table with list of Wikipedias by number of articles

In [34]:
# Download the Meta-Wiki page that lists all Wikipedia editions.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
res = requests.get("https://meta.wikimedia.org/wiki/List_of_Wikipedias", timeout=30)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')

In [35]:
# Take the first <table> carrying the 'wikitable' class and read its column
# names from the <th> cells of its first row.
table = soup.find('table', class_='wikitable')
header_row = table.find('tr')
headers = []
for header_cell in header_row.find_all('th'):
    headers.append(header_cell.text.strip())

# Accumulator for the scraped data rows (one list of cell strings per row).
data_wikis = list()

In [36]:
# Rebuild data_wikis from scratch on every run, so that re-executing this cell
# does not append a second copy of every row to the list created earlier.
data_wikis = []
for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Rows without any <td> cell (e.g. the header row, which uses <th>) are skipped.
    if cells:
        data_wikis.append([cell.text.strip() for cell in cells])

In [37]:
# Assemble the scraped rows into a DataFrame and discard the '№' column.
df_wikis = pd.DataFrame(data_wikis, columns=headers).drop(columns='№')

# Strip the commas from the count columns and cast them to integers.
numeric_columns = ['Articles','All pages','Edits','Admins','Users','Active users','Files','Depth']
for column_name in numeric_columns:
    df_wikis[column_name] = df_wikis[column_name].str.replace(',', '').astype(int)

In [38]:
# Drop the Cebuano, Swedish, and Waray editions and renumber the remaining rows.
excluded_wikis = ['ceb', 'sv', 'war']
df_wikis = df_wikis[~df_wikis['Wiki'].isin(excluded_wikis)].reset_index(drop=True)

In [39]:
# Map wiki code -> article count for the first 20 rows of df_wikis.
top_wikis = df_wikis.head(20)
nr_articles_by_wiki = dict(zip(top_wikis['Wiki'], top_wikis['Articles']))

In [40]:
# Map wiki code -> language name for the first 20 rows of df_wikis.
top_wikis = df_wikis.head(20)
wiki_language_mapping = dict(zip(top_wikis['Wiki'], top_wikis['Language']))

In [41]:
# Preview the first 20 rows: the same slice used to build nr_articles_by_wiki
# and wiki_language_mapping.
df_wikis.head(20)

Unnamed: 0,Language,Language (local),Wiki,Articles,All pages,Edits,Admins,Users,Active users,Files,Depth
0,English,English,en,6671977,58390201,1156307364,898,45730862,117087,905115,1189
1,German,Deutsch,de,2810494,7755162,233223523,180,4172723,17140,127766,93
2,French,français,fr,2530286,12605492,204855286,152,4684236,17667,71219,257
3,Dutch,Nederlands,nl,2124993,4540891,64365222,34,1282663,3859,20,18
4,Russian,русский,ru,1923062,7717296,130862850,72,3406128,10449,243980,153
5,Spanish,español,es,1871425,7932009,151502438,58,6878631,15558,0,200
6,Italian,italiano,it,1815326,7743063,133760845,120,2395488,8088,131030,184
7,Egyptian Arabic,مصرى,arz,1617931,2074722,8157539,7,213007,213,1482,0
8,Polish,polski,pl,1571443,3621384,70481442,100,1244001,4313,260,33
9,Japanese,日本語,ja,1377644,4055412,95485382,41,2083725,14225,4614,88


# Extract language connections from a sample of pages

Each wiki is sampled in proportion to its number of articles, with a minimum of one article per wiki.

In [43]:
# Fraction of each wiki's article count to sample (0.003 = 0.3%); the sampling
# loop multiplies a wiki's article count by this value and truncates to an int.
rescaling_factor = 0.003

In [44]:
#nr_articles_by_wiki = {'it':100000}

In [45]:
def process_article(site, article):
    """Collect the language links of one article.

    Parameters
    ----------
    site : pywikibot site object the article was sampled from.
    article : page object; only its ``title()`` is used to rebuild the page.

    Returns
    -------
    tuple
        ``(page.data_item(), languages)``, where ``languages`` holds the
        ``.site.lang`` of every language link that is a key of
        ``nr_articles_by_wiki``, followed by ``site.code`` of the source wiki.
    """
    page = pywikibot.Page(site, article.title())

    # NOTE(review): the filter compares langlink.site.lang with keys taken from the
    # 'Wiki' column, while the source wiki is appended as site.code; confirm that
    # .lang and .code agree for every wiki of interest.
    langlinks = []
    for langlink in page.langlinks():
        lang = langlink.site.lang
        if lang in nr_articles_by_wiki:
            langlinks.append(lang)

    item = page.data_item()
    return item, langlinks + [site.code]

In [42]:
# Disabled earlier draft of the sampling loop. It is wrapped in a bare string
# literal, so none of it is executed. Known problems in this draft:
#   - tqdm(nr_articles_by_wiki) iterates the dict's keys only (no .items()), so
#     the two-name unpacking does not receive (wiki, article count) pairs;
#   - `langlinks + wiki_version` adds a str to a list.
'''dict_data = {}
for wiki_version, nr_articles in tqdm(nr_articles_by_wiki, desc='Outer loop'):
    
    site = pywikibot.Site(wiki_version, "wikipedia") 
    sampled_articles = list(site.randompages(total=max(int(nr_articles * rescaling_factor),1),namespaces=[0])) # we want to grab at least one article per Wiki 
    
    for article in tqdm(sampled_articles, desc='Inner loop'):
        
        page = pywikibot.Page(site, article.title())
        
        langlinks = [langlink.site.lang for langlink in page.langlinks()]
        dict_data[f'{wiki_version}-{article.title()}'] = langlinks + wiki_version'''
# Sample random articles from each selected wiki and record, per article, the
# languages it links to plus the wiki it was drawn from.
# Result: dict_data maps page.data_item() -> list of language codes.
dict_data = {}
for wiki_version, nr_articles in tqdm(nr_articles_by_wiki.items(), desc='Outer loop'):
    site = pywikibot.Site(wiki_version, "wikipedia")
    # Sample size is proportional to the wiki's article count, with a floor of one
    # so that every wiki contributes at least one article.
    sample_size = max(int(nr_articles * rescaling_factor), 1)
    sampled_articles = list(site.randompages(total=sample_size, namespaces=[0]))

    for article in tqdm(sampled_articles, desc='Inner loop'):
        page = pywikibot.Page(site, article.title())
        langlinks = [langlink.site.lang for langlink in page.langlinks()]
        # wiki_version is a str, so it must be wrapped in a list before being
        # concatenated (the previous line had an unmatched ']' and did not parse).
        dict_data[page.data_item()] = langlinks + [wiki_version]

'dict_data = {}\nfor wiki_version, nr_articles in tqdm(nr_articles_by_wiki, desc=\'Outer loop\'):\n    \n    site = pywikibot.Site(wiki_version, "wikipedia") \n    sampled_articles = list(site.randompages(total=max(int(nr_articles * rescaling_factor),1),namespaces=[0])) # we want to grab at least one article per Wiki \n    \n    for article in tqdm(sampled_articles, desc=\'Inner loop\'):\n        \n        page = pywikibot.Page(site, article.title())\n        \n        langlinks = [langlink.site.lang for langlink in page.langlinks()]\n        dict_data[f\'{wiki_version}-{article.title()}\'] = langlinks + wiki_version'

In [None]:
# Disabled threaded variant of the sampling loop. It is wrapped in a bare string
# literal, so none of it is executed. It submits process_article(site, article)
# to a ThreadPoolExecutor for every sampled article and stores each result as
# dict_data[result[0]] = result[1]. The `article` looked up from
# future_to_article is assigned but not used afterwards.
'''dict_data = {}

with concurrent.futures.ThreadPoolExecutor() as executor:
    for wiki_version, nr_articles in tqdm(nr_articles_by_wiki.items(), desc='Outer loop'):
        site = pywikibot.Site(wiki_version, "wikipedia")
        sampled_articles = list(site.randompages(total=max(int(nr_articles * rescaling_factor), 1), namespaces=[0])) # We want to select at least one article per Wiki

        future_to_article = {executor.submit(process_article, site, article): article for article in sampled_articles}

        for future in tqdm(concurrent.futures.as_completed(future_to_article), desc='Inner loop', total=len(future_to_article)):
            article = future_to_article[future]
            result = future.result()
            dict_data[result[0]] = result[1]'''

Outer loop:   0%|                                        | 0/20 [00:00<?, ?it/s]
Inner loop:   0%|                                     | 0/20015 [00:00<?, ?it/s][A
Inner loop:   0%|                           | 1/20015 [00:00<2:28:35,  2.24it/s][A
Inner loop:   0%|                           | 2/20015 [00:00<1:30:11,  3.70it/s][A
Inner loop:   0%|                            | 11/20015 [00:00<17:35, 18.96it/s][A
Inner loop:   0%|                            | 16/20015 [00:01<17:58, 18.54it/s][A
Inner loop:   0%|                            | 19/20015 [00:01<17:28, 19.07it/s][A
Inner loop:   0%|                            | 22/20015 [00:01<21:08, 15.76it/s][A
Inner loop:   0%|                            | 24/20015 [00:01<21:45, 15.31it/s][A
Inner loop:   0%|                            | 29/20015 [00:01<16:30, 20.17it/s][A
Inner loop:   0%|                            | 32/20015 [00:02<22:42, 14.67it/s][A
Inner loop:   0%|                            | 34/20015 [00:02<25:15, 13.18it/s

Inner loop:   2%|▍                          | 326/20015 [00:17<20:11, 16.25it/s][A
Inner loop:   2%|▍                          | 330/20015 [00:17<16:01, 20.48it/s][A
Inner loop:   2%|▍                          | 333/20015 [00:17<15:55, 20.60it/s][A
Inner loop:   2%|▍                          | 337/20015 [00:17<17:39, 18.57it/s][A
Inner loop:   2%|▍                          | 342/20015 [00:17<13:28, 24.35it/s][A
Inner loop:   2%|▍                          | 345/20015 [00:17<16:30, 19.86it/s][A
Inner loop:   2%|▍                          | 350/20015 [00:18<15:51, 20.67it/s][A
Inner loop:   2%|▍                          | 353/20015 [00:18<19:45, 16.59it/s][A
Inner loop:   2%|▍                          | 359/20015 [00:18<13:59, 23.42it/s][A
Inner loop:   2%|▍                          | 363/20015 [00:18<13:57, 23.47it/s][A
Inner loop:   2%|▍                          | 366/20015 [00:19<16:15, 20.15it/s][A
Inner loop:   2%|▍                          | 369/20015 [00:19<18:20, 17.85i

Inner loop:   3%|▉                          | 684/20015 [00:36<17:45, 18.14it/s][A
Inner loop:   3%|▉                          | 688/20015 [00:36<15:52, 20.29it/s][A
Inner loop:   3%|▉                          | 691/20015 [00:36<15:58, 20.17it/s][A
Inner loop:   3%|▉                          | 696/20015 [00:36<13:07, 24.52it/s][A
Inner loop:   3%|▉                          | 699/20015 [00:37<15:43, 20.47it/s][A
Inner loop:   4%|▉                          | 702/20015 [00:37<14:45, 21.81it/s][A
Inner loop:   4%|▉                          | 705/20015 [00:37<15:03, 21.38it/s][A
Inner loop:   4%|▉                          | 709/20015 [00:37<18:18, 17.58it/s][A
Inner loop:   4%|▉                          | 714/20015 [00:37<17:36, 18.27it/s][A
Inner loop:   4%|▉                          | 718/20015 [00:38<16:02, 20.04it/s][A
Inner loop:   4%|▉                          | 721/20015 [00:38<14:53, 21.61it/s][A
Inner loop:   4%|▉                          | 724/20015 [00:38<14:49, 21.69i

Inner loop:   5%|█▎                        | 1044/20015 [00:53<12:52, 24.57it/s][A
Inner loop:   5%|█▎                        | 1047/20015 [00:54<14:51, 21.28it/s][A
Inner loop:   5%|█▎                        | 1051/20015 [00:54<15:20, 20.59it/s][A
Inner loop:   5%|█▎                        | 1055/20015 [00:54<13:03, 24.20it/s][A
Inner loop:   5%|█▎                        | 1058/20015 [00:54<12:50, 24.59it/s][A
Inner loop:   5%|█▍                        | 1062/20015 [00:54<13:58, 22.59it/s][A
Inner loop:   5%|█▍                        | 1067/20015 [00:54<13:59, 22.57it/s][A
Inner loop:   5%|█▍                        | 1072/20015 [00:55<12:14, 25.80it/s][A
Inner loop:   5%|█▍                        | 1075/20015 [00:55<17:52, 17.67it/s][A
Inner loop:   5%|█▍                        | 1079/20015 [00:55<19:02, 16.57it/s][A
Inner loop:   5%|█▍                        | 1086/20015 [00:56<17:46, 17.74it/s][A
Inner loop:   5%|█▍                        | 1091/20015 [00:56<16:58, 18.59i

Inner loop:   7%|█▊                        | 1417/20015 [01:12<15:57, 19.42it/s][A
Inner loop:   7%|█▊                        | 1421/20015 [01:12<13:27, 23.03it/s][A
Inner loop:   7%|█▊                        | 1426/20015 [01:12<11:34, 26.76it/s][A
Inner loop:   7%|█▊                        | 1430/20015 [01:13<14:56, 20.73it/s][A
Inner loop:   7%|█▊                        | 1435/20015 [01:13<11:58, 25.86it/s][A
Inner loop:   7%|█▊                        | 1439/20015 [01:13<13:15, 23.36it/s][A
Inner loop:   7%|█▊                        | 1442/20015 [01:13<14:12, 21.79it/s][A
Inner loop:   7%|█▉                        | 1445/20015 [01:13<16:06, 19.22it/s][A
Inner loop:   7%|█▉                        | 1450/20015 [01:13<13:47, 22.45it/s][A
Inner loop:   7%|█▉                        | 1453/20015 [01:14<13:14, 23.36it/s][A
Inner loop:   7%|█▉                        | 1457/20015 [01:14<13:44, 22.52it/s][A
Inner loop:   7%|█▉                        | 1460/20015 [01:14<14:58, 20.64i

# Prepare network data (size of nodes and links) and save it

Create df of node sizes:

In [None]:
# Write the per-wiki statistics to 'Node sizes.csv'. This saves every row of
# df_wikis (not only the first 20) and includes the DataFrame index as a column.
df_wikis.to_csv('Node sizes.csv')

Create df of language pairs:

In [None]:
# Enumerate every pair of language names from wiki_language_mapping.
value_combinations = list(combinations(wiki_language_mapping.values(), 2))

# Put each pair in ascending order so that (a, b) and (b, a) become the same row.
rows = [tuple(sorted(pair)) for pair in value_combinations]

# One row per distinct language pair, with a fresh 0..n-1 index.
df_connections = (
    pd.DataFrame(rows, columns=['lang_1','lang_2'])
    .drop_duplicates()
    .reset_index(drop=True)
)

Process the data and add it to the dataframe:

In [None]:
# Keep only the articles whose language list has at most 5 entries.
dict_data_final = {}
for item, languages in dict_data.items():
    if len(languages) <= 5:
        dict_data_final[item] = languages