<a href="https://colab.research.google.com/github/Dinuda/wiki-stem-extraction/blob/main/urls.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [1]:
import os
import pandas as pd
import numpy as np
import random
from copy import deepcopy
from itertools import chain

## Utils



In [51]:
import requests

api_base = 'https://en.wikipedia.org/w/api.php'

In [60]:
def get_category_members(title, cmcontinue=None):
  params = {
      "action": "query",
      "format": "json",
      "list": "categorymembers",
      "cmtitle": title,
      "cmlimit": "500"
  }

  if cmcontinue:
    params["cmcontinue"] = cmcontinue

  response = requests.get(api_base, params=params)

  return response.json()

def get_all_category_members(title, depth=1):

  page_urls = []

  # check depth of category
  if depth < 0:
    return page_urls

  category_members = get_category_members(title)
  while True:
    for member in category_members["query"]["categorymembers"]:
      page_title = member["title"].replace(" ", "_")
      page_url = f"https://en.wikipedia.org/wiki/{page_title}"

      # check if each member is a page or has sub categories
      if "Category:" in member["title"]:
        # fetch recursively
        page_urls.extend(get_all_category_members(member["title"], depth=depth-1))
      else:
        page_urls.append(page_url)

    if 'continue' in category_members:
      cmcontinue = category_members['continue']['cmcontinue']
      category_members = get_category_members(title, cmcontinue)
      print(cmcontinue)
    else:
      break

  return page_urls

In [61]:
url_bank = set()

## Physics

In [75]:
category2depth = {
    "Category:Concepts_in_physics": 3,
    "Category:Subfields of physics": 2,
    "Category:Physical_systems": 2,
    "Category:Physical modeling": 1,
    "Category:Eponyms_in_physics": 1
}

In [76]:
%%time
url_list = []

for cat, depth in category2depth.items():
    print(f"Getting URLs for {cat} with depth {depth}")
    urls = get_all_category_members(cat, depth=depth)
    print(f"# urls extracted = {len(urls)}")
    url_list.append(urls)

Getting URLs for Category:Concepts_in_physics with depth 3
# urls extracted = 22431
Getting URLs for Category:Subfields of physics with depth 2
# urls extracted = 20630
Getting URLs for Category:Physical_systems with depth 2
# urls extracted = 2206
Getting URLs for Category:Physical modeling with depth 1
# urls extracted = 124
Getting URLs for Category:Eponyms_in_physics with depth 1
# urls extracted = 198
CPU times: user 1min 23s, sys: 1.34 s, total: 1min 25s
Wall time: 2min 40s


In [77]:
physics_urls = list(set(deepcopy(list(chain(*url_list)))))

In [78]:
len(physics_urls)

24799

In [79]:
url_bank.update(physics_urls)
print(f"size of url bank {len(url_bank)}")

size of url bank 24799


In [80]:
random.sample(physics_urls, 5)

['https://en.wikipedia.org/wiki/Blood_rain',
 'https://en.wikipedia.org/wiki/Piezoresistive_effect',
 'https://en.wikipedia.org/wiki/Gauge_gravitation_theory',
 'https://en.wikipedia.org/wiki/Rayleigh–Bénard_convection',
 'https://en.wikipedia.org/wiki/Ampd_Energy']

## Chemistry

In [81]:
category2depth = {
    "Category:Category:Acid–base chemistry": 2,
    "Category:Analytical_chemistry": 2,
    "Category:Astrochemistry": 2,
    "Category:Atmospheric_chemistry": 2,

    "Category:Biochemistry": 2,
    "Category:Chemical_biology": 2,
    "Category:Chemical_bonding": 2,
    "Category:Carbohydrate_chemistry": 2,
    "Category:Chemical_physics": 2,
    "Category:Cheminformatics": 2,
    "Category:Chemical_classification": 2,
    "Category:Chemical_data_pages": 1,
    "Category:Chemical_energy": 2,
    "Category:Chemical_engineering": 2,
    "Category:Environmental_chemistry": 2,
    "Category:Chemical_equipment": 2,
    "Category:Chemistry_experiments": 1,
    "Category:Food_chemistry": 2,
    "Category:General_chemistry": 2,
    "Category:Geochemistry": 2,
    "Category:Green_chemistry": 2,

    "Category:Chemical_industry": 1,
    "Category:Inorganic_chemistry": 2,
    "Category:Laboratory_techniques": 2,
    "Category:Chemistry-related_lists": 1,
    "Category:Medicinal_chemistry": 2,
    "Category:Metallurgy": 2,
    "Category:Microwave_chemistry": 1,
    "Category:Chemical_mixtures": 2,
    "Category:Molecular_physics": 2,
    "Category:Molecules": 1, # 2,
    "Category:Chemical_nomenclature": 1, # 2
    "Category:Nuclear_chemistry": 2,


    "Category:Organic_chemistry": 2,
    "Category:Chemistry_organizations": 1,
    "Category:Photochemistry": 2,
    "Category:Physical_chemistry": 2,
    "Category:Chemical_processes": 2,
    "Category:Chemical_properties": 2,
    "Category:Chemical_reactions": 2,
    "Category:Redox": 2,
    "Category:Chemical_safety": 2,
    "Category:Separation_processes": 2,
    "Category:Chemistry_societies": 1,
    "Category:Chemistry_software": 2,
    "Category:Solid-state_chemistry": 2,
    "Category:Stereochemistry": 2,
    "Category:Stoichiometry": 2,
    "Category:Chemical_structures": 1, # 2,
    "Category:Chemical_substances": 1, # 2,
    "Category:Supramolecular_chemistry": 2,
    "Category:Chemical_synthesis": 2,

    "Category:Textile_chemistry": 2,
    "Category:Theoretical_chemistry": 2,
    "Category:Chemistry_theories": 2,
    "Category:Chemical_warfare": 1,
    "Category:Water_chemistry": 2,
}

In [82]:
%%time
url_list = []

for cat, depth in category2depth.items():
    print(f"Getting URLs for {cat} with depth {depth}")
    urls = get_all_category_members(cat, depth=depth)
    print(f"# urls extracted = {len(urls)}")
    url_list.append(urls)
chemistry_urls = deepcopy(list(chain(*url_list)))
chemistry_urls = list(set(chemistry_urls))
print(len(chemistry_urls))
url_bank.update(chemistry_urls)
print(f"size of URL bank {len(url_bank)}")
random.sample(chemistry_urls, 5)

Getting URLs for Category:Category:Acid–base chemistry with depth 2
# urls extracted = 0
Getting URLs for Category:Analytical_chemistry with depth 2
# urls extracted = 3221
Getting URLs for Category:Astrochemistry with depth 2
# urls extracted = 54
Getting URLs for Category:Atmospheric_chemistry with depth 2
# urls extracted = 27
Getting URLs for Category:Biochemistry with depth 2
page|484c4636324e50324c464432044c322e324850464c042c011b01dcb1dc|35677227
page|4c3248464c50324c0436324432011101dc10|341038
page|484c4650323a44042a010d01dcbfdc|5560666
page|484c4636324e50324c464432044c322e324850464c042c011b01dcb1dc|35677227
page|4c2a2c0f1d010801dcdcdc05|4505175
page|4e422a40400444522e403246402a4c044c442a044e44464c300f25011e01dcb7dcdcdcc5dcdcdcdcdc05|11421969
page|424c484e0f1a010901dcdcdcdc05|15225375
# urls extracted = 16257
Getting URLs for Category:Chemical_biology with depth 2
# urls extracted = 18
Getting URLs for Category:Chemical_bonding with depth 2
# urls extracted = 587
Getting URLs fo

## Astronomy

In [86]:
category2depth = {
    "Category:Materials_science": 2
}

In [87]:
%%time
url_list = []

for cat, depth in category2depth.items():
    print(f"Getting URLs for {cat} with depth {depth}")
    urls = get_all_category_members(cat, depth=depth)
    print(f"# urls extracted = {len(urls)}")
    url_list.append(urls)

astrology_urls = deepcopy(list(chain(*url_list)))
astrology_urls = list(set(astrology_urls))
print(len(astrology_urls))

url_bank.update(astrology_urls)
print(f"size of URL bank {len(url_bank)}")

print(random.sample(astrology_urls, 25))

Getting URLs for Category:Materials_science with depth 2
# urls extracted = 11280
7713
size of URL bank 80588
['https://en.wikipedia.org/wiki/Red_House_Cone', 'https://en.wikipedia.org/wiki/Exact_diagonalization', 'https://en.wikipedia.org/wiki/Flux_(metallurgy)', 'https://en.wikipedia.org/wiki/Antibody_microarray', 'https://en.wikipedia.org/wiki/Wilma_Dierkes', 'https://en.wikipedia.org/wiki/Polymer-protein_hybrid', 'https://en.wikipedia.org/wiki/Rugate_filter', 'https://en.wikipedia.org/wiki/Hardmask', 'https://en.wikipedia.org/wiki/Biomagnetism', 'https://en.wikipedia.org/wiki/Dioxane_tetraketone', 'https://en.wikipedia.org/wiki/Frank_Isakson_Prize', 'https://en.wikipedia.org/wiki/Bromothymol_blue', 'https://en.wikipedia.org/wiki/Secondary_emission', 'https://en.wikipedia.org/wiki/Point_group', 'https://en.wikipedia.org/wiki/Walter_De_Brouwer', 'https://en.wikipedia.org/wiki/Moving_magnet_and_conductor_problem', 'https://en.wikipedia.org/wiki/Nanosocialism', 'https://en.wikipedia.or

## Mathematics

In [89]:
category2depth = {
    "Category:Fields_of_mathematics": 2,
    "Category:Mathematical classification systems": 2,
    "Category:Mathematical_concepts": 3,
    "Category:Mathematical_constants": 2,
    "Category:Mathematical_examples": 2,
    "Category:Mathematical_notation": 2,
    "Category:Outlines_of_mathematics_and_logic": 2,
    "Category:Philosophy_of_mathematics": 1,
    "Category:Mathematical_proofs": 2,
    "Category:Mathematical_terminology": 2,
    "Category:Mathematical_theorems": 2,
    "Category:Mathematical_tools": 2,
}

In [90]:
%%time
url_list = []

for cat, depth in category2depth.items():
    print(f"Getting URLs for {cat} with depth {depth}")
    urls = get_all_category_members(cat, depth=depth)
    print(f"# urls extracted = {len(urls)}")
    url_list.append(urls)

mathematics_urls = deepcopy(list(chain(*url_list)))
mathematics_urls = list(set(mathematics_urls))
print(len(mathematics_urls))

url_bank.update(mathematics_urls)
print(f"size of URL bank {len(url_bank)}")

print(random.sample(mathematics_urls, 25))

Getting URLs for Category:Fields_of_mathematics with depth 2
# urls extracted = 25280
Getting URLs for Category:Mathematical classification systems with depth 2
# urls extracted = 10
Getting URLs for Category:Mathematical_concepts with depth 3
# urls extracted = 18585
Getting URLs for Category:Mathematical_constants with depth 2
# urls extracted = 825
Getting URLs for Category:Mathematical_examples with depth 2
# urls extracted = 8
Getting URLs for Category:Mathematical_notation with depth 2
# urls extracted = 911
Getting URLs for Category:Outlines_of_mathematics_and_logic with depth 2
# urls extracted = 45
Getting URLs for Category:Philosophy_of_mathematics with depth 1
# urls extracted = 524
Getting URLs for Category:Mathematical_proofs with depth 2
# urls extracted = 1121
Getting URLs for Category:Mathematical_terminology with depth 2
# urls extracted = 303
Getting URLs for Category:Mathematical_theorems with depth 2
# urls extracted = 2799
Getting URLs for Category:Mathematical_too

## Technology

In [92]:
category2depth = {
    "Category:Science_and_technology": 2,
    "Category:Technological_comparisons": 2,
    "Category:Mobile_technology": 2,
    "Category:Real-time_technology": 2,
    "Category:Sociology_of_technology": 2,
    "Category:Technology_systems": 2,
    "Category:Technical_specifications": 2,
    "Category:Technology_assessment": 2,
    "Category:Technology_education": 2,
    "Category:Science_and_technology": 2,
    "Category:Technology_hazards": 2,
    "Category:Software": 2,
    "Category:Computing": 2
}

In [93]:
%%time
url_list = []

for cat, depth in category2depth.items():
    print(f"Getting URLs for {cat} with depth {depth}")
    urls = get_all_category_members(cat, depth=depth)
    print(f"# urls extracted = {len(urls)}")
    url_list.append(urls)

technology_urls = deepcopy(list(chain(*url_list)))
technology_urls = list(set(technology_urls))
print(len(technology_urls))

url_bank.update(technology_urls)
print(f"size of URL bank {len(url_bank)}")

print(random.sample(technology_urls, 25))

Getting URLs for Category:Science_and_technology with depth 2
# urls extracted = 3526
Getting URLs for Category:Technological_comparisons with depth 2
# urls extracted = 343
Getting URLs for Category:Mobile_technology with depth 2
# urls extracted = 2739
Getting URLs for Category:Real-time_technology with depth 2
# urls extracted = 696
Getting URLs for Category:Sociology_of_technology with depth 2
# urls extracted = 22
Getting URLs for Category:Technology_systems with depth 2
# urls extracted = 9714
Getting URLs for Category:Technical_specifications with depth 2
# urls extracted = 27
Getting URLs for Category:Technology_assessment with depth 2
# urls extracted = 193
Getting URLs for Category:Technology_education with depth 2
# urls extracted = 8
Getting URLs for Category:Technology_hazards with depth 2
# urls extracted = 1760
Getting URLs for Category:Software with depth 2
page|54324c500858010a01dc09|37679972
page|423a2e4c464e463450044e562a5a011201dcbddc07|43997955
file|30523e3244523e3

## Engineering

In [94]:
category2depth = {
    "Category:Engineering_disciplines": 2,
    "Category:Engineering_concepts": 3,
    "Category:Engineering_equipment": 2,
    "Category:Industrial_equipment": 2,
    "Category:Engineering_organizations": 2,
    "Category:Engineering_projects": 2,
}

In [95]:
%%time
url_list = []

for cat, depth in category2depth.items():
    print(f"Getting URLs for {cat} with depth {depth}")
    urls = get_all_category_members(cat, depth=depth)
    print(f"# urls extracted = {len(urls)}")
    url_list.append(urls)

engineering_urls = deepcopy(list(chain(*url_list)))
engineering_urls = list(set(engineering_urls))
print(len(engineering_urls))

url_bank.update(engineering_urls)
print(f"size of URL bank {len(url_bank)}")

print(random.sample(engineering_urls, 25))

Getting URLs for Category:Engineering_disciplines with depth 2
page|4e2e38423a3050044452422c324c011201dc11|2994664
subcat|0f50e503062c523a40303a44364e042a4430044e504c522e50524c324e042e4642484032503230043a44040f50e501250c01c4dc2a|45719511
subcat|0f52de03062c523a40303a44364e042a4430044e504c522e50524c324e042e4642484032503230043a44040f52de01250c01c4dc2a|30095415
# urls extracted = 41571
Getting URLs for Category:Engineering_concepts with depth 3
# urls extracted = 572
Getting URLs for Category:Engineering_equipment with depth 2
# urls extracted = 1843
Getting URLs for Category:Industrial_equipment with depth 2
# urls extracted = 1692
Getting URLs for Category:Engineering_organizations with depth 2
# urls extracted = 2469
Getting URLs for Category:Engineering_projects with depth 2
# urls extracted = 1077
33534
size of URL bank 160630
['https://en.wikipedia.org/wiki/Covariant_transformation', 'https://en.wikipedia.org/wiki/Talent_scheduling', 'https://en.wikipedia.org/wiki/Michael_Beaumont,_

## Biology

In [97]:
category2depth = {
    "Category:Branches_of_biology": 1,
    "Category:Organisms": 1,
    "Category:Biotechnology": 1,
    "Category:Biological_classification": 1,
    "Category:Biological_concepts": 3,

    "Category:Biological_interactions": 1,
    "Category:Biological_processes": 1,
    "Category:Quantum_biology": 1,
    "Category:Biological_rules": 1,
    "Category:Biological_systems": 1,

    "Category:Biological_techniques_and_tools": 2,
    "Category:Biology_terminology": 2,
    "Category:Biological_waste": 2,
}

In [98]:
%%time
url_list = []

for cat, depth in category2depth.items():
    print(f"Getting URLs for {cat} with depth {depth}")
    urls = get_all_category_members(cat, depth=depth)
    print(f"# urls extracted = {len(urls)}")
    url_list.append(urls)

biology_urls = deepcopy(list(chain(*url_list)))
biology_urls = list(set(biology_urls))
print(len(biology_urls))

url_bank.update(biology_urls)
print(f"size of URL bank {len(url_bank)}")

print(random.sample(biology_urls, 25))

Getting URLs for Category:Branches_of_biology with depth 1
page|504c3a2e38464e2e40324c323a30011201dc11|20651551
page|4c3248464c50324c0436324432011101dc10|341038
# urls extracted = 5720
Getting URLs for Category:Organisms with depth 1
# urls extracted = 765
Getting URLs for Category:Biotechnology with depth 1
# urls extracted = 2971
Getting URLs for Category:Biological_classification with depth 1
# urls extracted = 386
Getting URLs for Category:Biological_concepts with depth 3
# urls extracted = 12407
Getting URLs for Category:Biological_interactions with depth 1
# urls extracted = 388
Getting URLs for Category:Biological_processes with depth 1
# urls extracted = 406
Getting URLs for Category:Quantum_biology with depth 1
# urls extracted = 16
Getting URLs for Category:Biological_rules with depth 1
# urls extracted = 25
Getting URLs for Category:Biological_systems with depth 1
# urls extracted = 148
Getting URLs for Category:Biological_techniques_and_tools with depth 2
# urls extracted =

## Environmental Science

In [100]:
category2depth = {
    "Category:Environmental_science": 2,
}

In [101]:
%%time
url_list = []

for cat, depth in category2depth.items():
    print(f"Getting URLs for {cat} with depth {depth}")
    urls = get_all_category_members(cat, depth=depth)
    print(f"# urls extracted = {len(urls)}")
    url_list.append(urls)

env_sci_urls = deepcopy(list(chain(*url_list)))
env_sci_urls = list(set(env_sci_urls))
print(len(env_sci_urls))

url_bank.update(env_sci_urls)
print(f"size of URL bank {len(url_bank)}")

print(random.sample(env_sci_urls, 25))

Getting URLs for Category:Environmental_science with depth 2
# urls extracted = 7181
5977
size of URL bank 176970
['https://en.wikipedia.org/wiki/Tapinella_panuoides', 'https://en.wikipedia.org/wiki/Epidemiology_of_measles', 'https://en.wikipedia.org/wiki/Living_Planet_Database', 'https://en.wikipedia.org/wiki/Energy_Reduction_Assets', 'https://en.wikipedia.org/wiki/Chernobyl_groundwater_contamination', 'https://en.wikipedia.org/wiki/Naphthalene_poisoning', 'https://en.wikipedia.org/wiki/Soft_water_path', 'https://en.wikipedia.org/wiki/Analytic_element_method', 'https://en.wikipedia.org/wiki/Biocide', 'https://en.wikipedia.org/wiki/Fall_protection', "https://en.wikipedia.org/wiki/Yolanda's_Law", 'https://en.wikipedia.org/wiki/Leaving_the_world_a_better_place', 'https://en.wikipedia.org/wiki/Scientific_and_Technical_Centre_for_Building', 'https://en.wikipedia.org/wiki/Epidemic_dropsy', 'https://en.wikipedia.org/wiki/Fixed_bill', 'https://en.wikipedia.org/wiki/Hydrometeorology', 'https:/

## Art

In [103]:
category2depth = {
    "Category:Digital art": 1,
}

In [104]:
%%time
url_list = []

for cat, depth in category2depth.items():
    print(f"Getting URLs for {cat} with depth {depth}")
    urls = get_all_category_members(cat, depth=depth)
    print(f"# urls extracted = {len(urls)}")
    url_list.append(urls)

art_urls = deepcopy(list(chain(*url_list)))
art_urls = list(set(art_urls))
print(len(art_urls))

url_bank.update(art_urls)
print(f"size of URL bank {len(url_bank)}")

print(random.sample(art_urls, 25))

Getting URLs for Category:Digital art with depth 1
# urls extracted = 464
412
size of URL bank 177295
['https://en.wikipedia.org/wiki/Fetch.AI', 'https://en.wikipedia.org/wiki/Media_Scape', 'https://en.wikipedia.org/wiki/Computational_art', 'https://en.wikipedia.org/wiki/Crown_Fountain', 'https://en.wikipedia.org/wiki/Curio_Cards', 'https://en.wikipedia.org/wiki/The_Dream_Weaver', 'https://en.wikipedia.org/wiki/Telegarden', 'https://en.wikipedia.org/wiki/Heart_Beacon', 'https://en.wikipedia.org/wiki/Ixi_lang', 'https://en.wikipedia.org/wiki/Interactive_art', 'https://en.wikipedia.org/wiki/Andy_Lomas', 'https://en.wikipedia.org/wiki/Pixelization', 'https://en.wikipedia.org/wiki/Ars_Mathematica_(organization)', 'https://en.wikipedia.org/wiki/Nathalie_Lawhead', 'https://en.wikipedia.org/wiki/The_Urban_Conga', 'https://en.wikipedia.org/wiki/Programming_(music)', 'https://en.wikipedia.org/wiki/Roche_Musique', 'https://en.wikipedia.org/wiki/Noise_in_music', 'https://en.wikipedia.org/wiki/Tat

## Developmental psychology

In [105]:
category2depth = {
  "Category:Developmental psychology": 2
}

In [106]:
%%time
url_list = []

for cat, depth in category2depth.items():
    print(f"Getting URLs for {cat} with depth {depth}")
    urls = get_all_category_members(cat, depth=depth)
    print(f"# urls extracted = {len(urls)}")
    url_list.append(urls)

developmental_psy_urls = deepcopy(list(chain(*url_list)))
art_urls = list(set(developmental_psy_urls))
print(len(developmental_psy_urls))

url_bank.update(developmental_psy_urls)
print(f"size of URL bank {len(url_bank)}")

print(random.sample(developmental_psy_urls, 25))

Getting URLs for Category:Developmental psychology with depth 2
# urls extracted = 1574
1574
size of URL bank 178496
['https://en.wikipedia.org/wiki/Leann_Birch', 'https://en.wikipedia.org/wiki/Odds_and_evens_(hand_game)', 'https://en.wikipedia.org/wiki/Michael_Fordham', 'https://en.wikipedia.org/wiki/Secure_attachment', 'https://en.wikipedia.org/wiki/Critical_period', 'https://en.wikipedia.org/wiki/Charles_Brainerd', 'https://en.wikipedia.org/wiki/Baby_Race_(Bluey)', 'https://en.wikipedia.org/wiki/Server_supported_gaming', 'https://en.wikipedia.org/wiki/Dysacusis', 'https://en.wikipedia.org/wiki/Michael_Rutter', 'https://en.wikipedia.org/wiki/Catherine_McBride', 'https://en.wikipedia.org/wiki/Progressive_jackpot', 'https://en.wikipedia.org/wiki/The_Kid_(book)', 'https://en.wikipedia.org/wiki/Pride_&_Joy_(TV_series)', 'https://en.wikipedia.org/wiki/Rochel_Gelman', 'https://en.wikipedia.org/wiki/Mothering_(magazine)', 'https://en.wikipedia.org/wiki/Epidemiology_of_attention_deficit_hype

## Save

In [107]:
url_bank = list(url_bank)
print(f"size of URL bank {len(url_bank)}")

size of URL bank 178496


In [108]:
import json
with open("stem_url_bank.json", "w") as f:
    json.dump(url_bank, f)