# Import packages and seed the random number generator

In [27]:
import pywikibot

import pandas as pd
import numpy as np

import random
random.seed(16021997)
from tqdm import tqdm
import concurrent.futures
from itertools import combinations

import requests
import bs4

# Scrape table with list of Wikipedias by number of articles

In [28]:
# Download the "List of Wikipedias" page from Meta-Wiki and parse its HTML.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting it surface
# later as a failed table lookup on an error page.
res = requests.get("https://meta.wikimedia.org/wiki/List_of_Wikipedias", timeout=30)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')

In [29]:
# Pick the first <table> carrying the "wikitable" class and read the column
# names from the <th> cells of its first row.
table = soup.find('table', attrs={'class': 'wikitable'})
header_row = table.find('tr')
headers = [th.get_text().strip() for th in header_row.find_all('th')]
# Will hold one list of cell texts per data row of the table.
data_wikis = []

In [30]:
# Collect the stripped text of every <td> cell, one list per table row.
# Rows that contain no <td> cells (such as a row made only of <th> cells)
# are skipped.
# The list is reset first so that re-running this cell on its own does not
# append a second copy of every row.
data_wikis = []
for row in table.find_all('tr'):
    columns = row.find_all('td')
    if columns:
        column_data = [column.text.strip() for column in columns]
        data_wikis.append(column_data)

In [31]:
# Turn the scraped rows into a DataFrame and discard the '№' column.
df_wikis = pd.DataFrame(data_wikis, columns=headers).drop(columns='№')

# These columns are scraped as text; strip any ',' separators and convert
# them to integers.
count_columns = ['Articles', 'All pages', 'Edits', 'Admins', 'Users', 'Active users', 'Files', 'Depth']
for col in count_columns:
    df_wikis[col] = df_wikis[col].str.replace(',', '').astype(int)

In [32]:
# Exclude the Cebuano, Swedish, and Waray wikis, then renumber the rows.
excluded_wikis = ['ceb', 'sv', 'war']
df_wikis = df_wikis[~df_wikis['Wiki'].isin(excluded_wikis)].reset_index(drop=True)

In [33]:
# Wiki code -> article count, for the first 20 rows of df_wikis.
top_wikis = df_wikis.head(20)
nr_articles_by_wiki = dict(zip(top_wikis['Wiki'], top_wikis['Articles']))

In [34]:
# Wiki code -> language name, for the first 20 rows of df_wikis.
top_wikis = df_wikis.head(20)
wiki_language_mapping = dict(zip(top_wikis['Wiki'], top_wikis['Language']))

In [35]:
# Preview the first 20 rows of df_wikis, i.e. the rows the two mappings above are built from.
df_wikis.head(20)

Unnamed: 0,Language,Language (local),Wiki,Articles,All pages,Edits,Admins,Users,Active users,Files,Depth
0,English,English,en,6671977,58390201,1156307364,898,45730862,117087,905115,1189
1,German,Deutsch,de,2810494,7755162,233223523,180,4172723,17140,127766,93
2,French,français,fr,2530286,12605492,204855286,152,4684236,17667,71219,257
3,Dutch,Nederlands,nl,2124993,4540891,64365222,34,1282663,3859,20,18
4,Russian,русский,ru,1923062,7717296,130862850,72,3406128,10449,243980,153
5,Spanish,español,es,1871425,7932009,151502438,58,6878631,15558,0,200
6,Italian,italiano,it,1815326,7743063,133760845,120,2395488,8088,131030,184
7,Egyptian Arabic,مصرى,arz,1617931,2074722,8157539,7,213007,213,1482,0
8,Polish,polski,pl,1571443,3621384,70481442,100,1244001,4313,260,33
9,Japanese,日本語,ja,1377644,4055412,95485382,41,2083725,14225,4614,88


# Extract language connections from a sample of pages

Each wiki is sampled in proportion to its number of articles, with a minimum of one article per wiki.

In [36]:
rescaling_factor = 0.003 # Fraction of each wiki's article count to sample (0.003 = 0.3%); the sampling loop enforces a minimum of one page per wiki

In [37]:
#nr_articles_by_wiki = {'it':100000}

In [38]:
def process_article(site, article):
    """Return an article's Wikidata item and the selected wikis it exists in.

    Parameters
    ----------
    site
        pywikibot site object of the wiki the article belongs to.
    article
        Object exposing ``title()``; the title is used to build the
        ``pywikibot.Page`` that is queried.

    Returns
    -------
    tuple
        ``(item, codes)`` where ``item`` is the result of
        ``page.data_item()`` and ``codes`` is the list of site codes of the
        page's language links that are keys of ``nr_articles_by_wiki``,
        followed by ``site.code``.

    Notes
    -----
    Any exception raised by ``page.data_item()`` or ``page.langlinks()``
    propagates to the caller.
    """
    page = pywikibot.Page(site, article.title())
    # Compare on the site *code*: the keys of nr_articles_by_wiki are the
    # codes passed to pywikibot.Site(), and the source wiki is appended by
    # its code below. A site's `lang` is a separate attribute that is not
    # guaranteed to equal its code, so filtering on it can miss or mislabel
    # a wiki.
    langlinks = [
        langlink.site.code
        for langlink in page.langlinks()
        if langlink.site.code in nr_articles_by_wiki
    ]
    return page.data_item(), langlinks + [site.code]

In [None]:
# Sample random namespace-0 pages from every selected wiki and record, for each
# sampled page, the wikis it has language links to plus the wiki it came from.
# Keys are the pages' Wikidata items; values are lists of wiki codes.
dict_data = {}
for wiki_version, nr_articles in tqdm(nr_articles_by_wiki.items(), desc='Outer loop'):
    site = pywikibot.Site(wiki_version, "wikipedia")
    # Sample size is proportional to the wiki's article count, but never below one.
    sample_size = max(int(nr_articles * rescaling_factor), 1)
    sampled_articles = list(site.randompages(total=sample_size, namespaces=[0]))

    # leave=False clears each inner bar once it finishes, so only the outer bar
    # accumulates in the cell output.
    for article in tqdm(sampled_articles, desc='Inner loop', leave=False):
        page = pywikibot.Page(site, article.title())
        try:
            item = page.data_item()
        except pywikibot.exceptions.NoPageError:
            # The page has no Wikidata item, so there is no key to store it
            # under. Skip it rather than abort the whole (long) run.
            continue
        # Use the site code (not `lang`) so the entries are the same kind of
        # identifier as `wiki_version`, which is appended below.
        langlinks = [langlink.site.code for langlink in page.langlinks()]
        dict_data[item] = langlinks + [wiki_version]

Outer loop:   0%|                                        | 0/20 [00:00<?, ?it/s]
Inner loop:   0%|                                     | 0/20015 [00:00<?, ?it/s][A
Inner loop:   0%|                          | 1/20015 [00:02<14:36:38,  2.63s/it][A
Inner loop:   0%|                           | 2/20015 [00:03<9:10:57,  1.65s/it][A
Inner loop:   0%|                           | 3/20015 [00:05<8:57:50,  1.61s/it][A
Inner loop:   0%|                           | 4/20015 [00:05<6:43:03,  1.21s/it][A
Inner loop:   0%|                           | 5/20015 [00:06<5:27:48,  1.02it/s][A
Inner loop:   0%|                           | 6/20015 [00:06<4:43:04,  1.18it/s][A
Inner loop:   0%|                          | 7/20015 [00:13<14:33:10,  2.62s/it][A
Inner loop:   0%|                          | 8/20015 [00:14<12:54:03,  2.32s/it][A
Inner loop:   0%|                           | 9/20015 [00:15<9:54:12,  1.78s/it][A
Inner loop:   0%|                         | 10/20015 [00:18<11:52:11,  2.14s/it

Inner loop:   0%|                          | 93/20015 [01:32<3:41:40,  1.50it/s][A
Inner loop:   0%|                          | 94/20015 [01:32<3:35:11,  1.54it/s][A
Inner loop:   0%|                          | 95/20015 [01:33<3:41:00,  1.50it/s][A
Inner loop:   0%|                          | 96/20015 [01:34<3:35:28,  1.54it/s][A
Inner loop:   0%|▏                         | 97/20015 [01:34<3:29:43,  1.58it/s][A
Inner loop:   0%|▏                         | 98/20015 [01:35<3:42:09,  1.49it/s][A
Inner loop:   0%|▏                         | 99/20015 [01:36<3:46:44,  1.46it/s][A
Inner loop:   0%|                         | 100/20015 [01:36<3:37:58,  1.52it/s][A
Inner loop:   1%|▏                        | 101/20015 [01:37<3:34:29,  1.55it/s][A
Inner loop:   1%|▏                        | 102/20015 [01:37<3:28:29,  1.59it/s][A
Inner loop:   1%|▏                        | 103/20015 [01:38<3:41:18,  1.50it/s][A
Inner loop:   1%|▏                        | 104/20015 [01:39<3:57:00,  1.40i

In [None]:
# Disabled alternative to the sequential sampling loop: everything below is
# wrapped in a string literal, so none of it is executed. The enclosed code
# would submit one process_article call per sampled page to a
# ThreadPoolExecutor and store each result as dict_data[result[0]] = result[1].
'''dict_data = {}

with concurrent.futures.ThreadPoolExecutor() as executor:
    for wiki_version, nr_articles in tqdm(nr_articles_by_wiki.items(), desc='Outer loop'):
        site = pywikibot.Site(wiki_version, "wikipedia")
        sampled_articles = list(site.randompages(total=max(int(nr_articles * rescaling_factor), 1), namespaces=[0])) # We want to select at least one article per Wiki

        future_to_article = {executor.submit(process_article, site, article): article for article in sampled_articles}

        for future in tqdm(concurrent.futures.as_completed(future_to_article), desc='Inner loop', total=len(future_to_article)):
            article = future_to_article[future]
            result = future.result()
            dict_data[result[0]] = result[1]'''

# Prepare network data (size of nodes and links) and save it

Create df of node sizes:

In [None]:
# Write the whole filtered df_wikis table (all remaining wikis, not only the first 20) to 'Node sizes.csv'.
df_wikis.to_csv('Node sizes.csv')

Create df of language pairs:

In [None]:
# Enumerate every pair of language names from the mapping. Each pair is stored
# with the smaller name (by string comparison) first, so a pair has a single
# orientation; exact duplicate rows are then removed and the index renumbered.
value_combinations = list(combinations(wiki_language_mapping.values(), 2))
rows = [tuple(sorted(pair)) for pair in value_combinations]
df_connections = (
    pd.DataFrame(rows, columns=['lang_1', 'lang_2'])
    .drop_duplicates()
    .reset_index(drop=True)
)

Process the data and add it to the dataframe:

In [None]:
# Retain only the entries whose language list has five elements or fewer.
dict_data_final = {
    item: languages
    for item, languages in dict_data.items()
    if len(languages) <= 5
}