In [1]:
import time, os, pickle
import requests

### 2.1 Create the filters 

The goal here is to collect all the subcategories of "Energy" so that they can be used to filter the links between Wikipedia pages.

In [2]:
def all_categories(title, language, clcontinue=None):
    """Yield the title of every category a Wikipedia page belongs to.

    Sends ``action=query`` / ``prop=categories`` requests to the MediaWiki API
    of the given language edition and keeps following the ``clcontinue``
    token returned by the API until no token is returned any more.

    Args:
        title: Page title, sent as the ``titles`` request parameter.
        language: Language code used as the Wikipedia subdomain (e.g. "en").
        clcontinue: Optional continuation token to resume an earlier query.

    Yields:
        The "title" field of each category entry in the API response.

    Raises:
        KeyError: If a response does not contain ``query`` / ``pages``.
    """
    url = "https://{lang}.wikipedia.org/w/api.php".format(lang=language)

    # Paginate with a loop rather than recursion: the previous implementation
    # recursed into an undefined name (``all_children``) and therefore raised
    # NameError as soon as a page needed more than one response.
    while True:
        params = {
            "action": "query",
            "format": "json",
            "prop": "categories",
            "titles": title,
            "cllimit": "max",
        }
        if clcontinue:
            params["clcontinue"] = clcontinue

        data = requests.get(url, params=params).json()
        clcontinue = data.get("continue", {}).get("clcontinue", None)

        for page in data["query"]["pages"].values():
            # Pages without any category simply have no "categories" key.
            for cat in page.get("categories", []):
                yield cat["title"]

        # A missing (or empty) token means the last batch has been delivered.
        if not clcontinue:
            return
        
def all_subcategories(title, language, cmcontinue=None):
    """Yield the titles of the direct subcategories of a Wikipedia category.

    Uses the MediaWiki ``list=categorymembers`` query restricted to
    ``cmtype=subcat`` and follows the ``cmcontinue`` token until the API no
    longer returns one.

    Args:
        title: Category title, sent as the ``cmtitle`` request parameter.
        language: Language code used as the Wikipedia subdomain.
        cmcontinue: Optional continuation token to resume an earlier query.

    Yields:
        The "title" field of each returned category member.
    """
    endpoint = "https://{lang}.wikipedia.org/w/api.php".format(lang=language)
    token = cmcontinue

    while True:
        query = {
            "action": "query",
            "format": "json",
            "list": "categorymembers",
            "cmtitle": title,
            "cmtype": "subcat",
            "cmlimit": "max",
        }
        if token:
            query["cmcontinue"] = token

        payload = requests.get(endpoint, params=query).json()
        # Read the next token before yielding, then emit this batch.
        token = payload.get("continue", {}).get("cmcontinue", None)
        yield from (member["title"] for member in payload["query"]["categorymembers"])

        if token is None:
            break
        
def get_subcategories(cat_dic, category, language, level=0, max_level=1):
    """Collect the subcategories of ``category`` into ``cat_dic``, depth first.

    Every subcategory title found is stored in ``cat_dic`` mapped to itself.
    Subcategories discovered at a level below ``max_level`` are expanded in
    turn, one level deeper, before the next sibling is processed.

    Args:
        cat_dic: Dictionary that is filled in place.
        category: Title of the category whose subcategories are listed.
        language: Language code forwarded to ``all_subcategories``.
        level: Level assigned to the subcategories of ``category``.
        max_level: Subcategories found at this level are not expanded further.

    Returns:
        The same ``cat_dic`` object that was passed in.
    """
    exhausted = object()
    # Stack of (subcategory iterator, level of the titles it produces).
    pending = [(iter(all_subcategories(category, language)), level)]

    while pending:
        members, depth = pending[-1]
        child = next(members, exhausted)
        if child is exhausted:
            pending.pop()
            continue

        cat_dic[child] = child
        if depth < max_level:
            # Descend into the child before pulling its next sibling.
            pending.append((iter(all_subcategories(child, language)), depth + 1))

    return cat_dic

def get_all_subcategories(category, language, depth):
    """Return a dict holding a category and the subcategories found below it.

    The category name is prefixed with "Category:" and used both as the first
    entry of the result and as the starting point handed to
    ``get_subcategories``.

    Args:
        category: Category name without the "Category:" prefix.
        language: Language code forwarded to ``get_subcategories``.
        depth: Passed to ``get_subcategories`` as ``max_level``.

    Returns:
        Dictionary mapping each collected category title to itself.
    """
    root = "Category:{}".format(category)
    return get_subcategories({root: root}, root, language, max_level=depth)

In [3]:
# List every category attached to the English Wikipedia page "France"
C = all_categories(title="France", language="en")
print(list(C))

['Category:1792 establishments in Europe', 'Category:1792 establishments in France', 'Category:All articles containing potentially dated statements', 'Category:All articles needing additional references', 'Category:All articles with failed verification', 'Category:All articles with unsourced statements', 'Category:Articles containing Dutch-language text', 'Category:Articles containing French-language text', 'Category:Articles containing German-language text', 'Category:Articles containing Italian-language text', 'Category:Articles containing Latin-language text', 'Category:Articles containing Proto-Germanic-language text', 'Category:Articles containing Swedish-language text', 'Category:Articles containing potentially dated statements from 2007', 'Category:Articles containing potentially dated statements from 2008', 'Category:Articles containing potentially dated statements from 2009', 'Category:Articles containing potentially dated statements from 2011', 'Category:Articles containing p

In [4]:
# List the direct subcategories of "Category:Energy" on the English Wikipedia
C = all_subcategories(title="Category:Energy", language="en")
print(list(C))

['Category:Energy by continent', 'Category:Energy by country', 'Category:Energy by region', 'Category:Energy-related lists', 'Category:Energy accidents and incidents', 'Category:Attacks on energy sector', 'Category:Energy democracy', 'Category:Energy development', 'Category:Energy economics', 'Category:Energy education', 'Category:Electric power', 'Category:Energetics', 'Category:Energy conversion', 'Category:Energy (physics)', 'Category:Energy sources', 'Category:Forms of energy', 'Category:History of energy', 'Category:Energy industry', 'Category:Energy infrastructure', 'Category:Energetic materials', 'Category:Energy measurement', 'Category:Energy models', 'Category:Energy organizations', 'Category:People associated with energy', 'Category:Energy policy', 'Category:Energy recovery', 'Category:Energy storage', 'Category:Sustainable energy', 'Category:Energy technology', 'Category:Energy in transport', 'Category:Energy weapons', 'Category:Works about energy', 'Category:Energy stubs']


In [8]:
# Build our filters: collect "Energiewesen" and the subcategories found below
# it on the German Wikipedia (max_level=4), and time the crawl.
lang = "de"
# NOTE(review): `wkp` is not imported in the visible cells and `wiki_wiki` is
# not used in them either -- confirm the import exists, otherwise this line
# raises NameError on a fresh kernel.
wiki_wiki = wkp.Wikipedia(lang)
# perf_counter() is monotonic, so the difference is a reliable elapsed time
# even if the system clock is adjusted while the crawl is running.
t0 = time.perf_counter()
subcat_list = get_all_subcategories("Energiewesen", lang, 4)
t1 = time.perf_counter()
print(t1 - t0, " seconds")

407.95980191230774


In [9]:
# Check the size of our filter: number of distinct category titles collected,
# the root category included
len(subcat_list)

1187

In [10]:
# Save the filter as a pickle in the current working directory; the file name
# carries the language code and the number of collected categories.
cwd = os.getcwd()
filter_path = "{}/energy_subcat-{}-L{}".format(cwd, lang, len(subcat_list))
with open(filter_path, 'wb') as file:
    pickle.dump(subcat_list, file)