In [None]:
import json
import os
import re
import time
import urllib.parse
import urllib.request

import networkx as nx

In [None]:
# Build the list of band page titles from the Wikipedia article
# "List of mainstream rock performers".

baseurl     = "https://en.wikipedia.org/w/api.php?"
action      = "action=query"
title       = "titles=List_of_mainstream_rock_performers"
content     = "prop=revisions&rvprop=content"
dataformat  = "format=json"

# construct the query
query = "%s%s&%s&%s&%s" % (baseurl, action, title, content, dataformat)

# Send an explicit User-Agent header with the request
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
req = urllib.request.Request(query, headers=headers)

# Fetch the response; the timeout keeps a stalled connection from hanging the cell
with urllib.request.urlopen(req, timeout=30) as response:
    data = response.read().decode("utf-8")

json_data = json.loads(data)

# Only one title was requested, so the first entry of "pages" is the list article
wikitext = list(json_data["query"]["pages"].values())[0]["revisions"][0]["*"]

# Each entry is a bullet of the form "* [[Target]]" or "* [[Target|Display]]";
# every match is a (target, display) tuple, with display == "" when there is no pipe
matches = re.findall(r"^\* \[\[(.*?)(?:\|(.*?))?\]\]", wikitext, flags=re.M)

# Keep the link TARGET, not the display label. The band name is used further down
# as the page title to download and as the key that link targets are compared
# against, so for a piped link such as [[Genesis (band)|Genesis]] only the target
# identifies the right article.
bands = [full for full, _display in matches]




In [None]:
# Download the wikitext of every band page and save each one as a text file.
# Files that already exist are left untouched, so the cell can be re-run to
# resume an interrupted download.

# Folder where files will be saved
save_dir = r"C:\Users\alexj\OneDrive - Danmarks Tekniske Universitet\SAS\9. Semester\Social graphs\Assignment 1 data"
os.makedirs(save_dir, exist_ok=True)

baseurl = "https://en.wikipedia.org/w/api.php"
headers = {"User-Agent": "Mozilla/5.0"}

for band in bands:
    # --- Build Wikipedia title: underscores for spaces, everything else percent-encoded ---
    band_for_url = band.replace(" ", "_")
    band_encoded = urllib.parse.quote(band_for_url, safe="_")

    # --- Sanitize filename: replace characters that are not allowed in Windows file names ---
    safe_filename = re.sub(r'[\\/*?:"<>|]', "_", band_for_url)
    filepath = os.path.join(save_dir, safe_filename + ".txt")

    # Already downloaded on a previous run
    if os.path.exists(filepath):
        continue

    # --- API query ---
    # rvslots=main asks for the main content slot of the revision;
    # redirects=1 makes the API resolve redirects, so the saved text is the
    # article itself rather than a "#REDIRECT [[...]]" stub.
    url = (f"{baseurl}?action=query&titles={band_encoded}"
           f"&prop=revisions&rvslots=main&rvprop=content"
           f"&redirects=1&format=json")

    # --- Fetch (the timeout keeps one stalled request from hanging the whole loop) ---
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req, timeout=30) as response:
        data = response.read().decode("utf-8")

    json_data = json.loads(data)
    page = list(json_data["query"]["pages"].values())[0]

    # A page without revisions has no text to save. Skip it instead of raising,
    # so a single missing title does not abort the remaining downloads.
    if "revisions" not in page:
        print(f"No revisions found for {band}, skipping")
        continue

    rev = page["revisions"][0]

    # --- Extract content: legacy responses put the text directly under "*",
    # slot-based responses nest it under slots -> main ---
    if "*" in rev:
        wikitext = rev["*"]
    else:
        wikitext = rev["slots"]["main"]["*"]

    # --- Save ---
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(wikitext)

    print(f"Saved {band}")


In [None]:

# Directory that holds the wikitext files written by the download step
save_dir = r"C:\Users\alexj\OneDrive - Danmarks Tekniske Universitet\SAS\9. Semester\Social graphs\Assignment 1 data"

# Use underscores instead of spaces in every band name
bands = [name.replace(" ", "_") for name in bands]

# Set of the normalised names, for membership tests
band_set = {*bands}

# Empty directed graph to be filled with bands and the links between them
G = nx.DiGraph()


In [None]:


# Fill G: one node per band whose page file exists on disk, and one directed
# edge for every wiki link on that page whose target is another listed band.
for band in bands:
    filename = re.sub(r'[\\/*?:"<>|]', "_", band) + ".txt"
    filepath = os.path.join(save_dir, filename)

    if os.path.exists(filepath):
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
    else:
        print(f"⚠️ Missing file for {band}, skipping")
        continue

    # Node attribute: number of \w+ tokens in the raw wikitext
    word_count = sum(1 for _ in re.finditer(r"\w+", text))
    G.add_node(band, content_length=word_count)

    # Link targets: the text after "[[" up to the first "]", "|" or "#"
    for raw_target in re.findall(r"\[\[([^\]|#]+)", text):
        target = raw_target.strip().replace(" ", "_")

        # Upper-case the first character only; slicing leaves an empty target empty
        target = target[:1].upper() + target[1:]

        # Keep the link only when it points at another band from the list
        if target in band_set:
            G.add_edge(band, target)

print(f"✅ Graph now has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges")


In [None]:
# Write the graph to a GEXF file in the working directory (the file to upload to GitHub)
GEXF_FILENAME = "rock_bands.gexf"
nx.write_gexf(G, GEXF_FILENAME)