# Imports:

In [4]:
import re
import bz2
import os
import json
import xml.etree.ElementTree as ET
from langchain.docstore.document import Document as LangchainDocument

# Cleaning and helper functions to extract relevant information:

In [None]:
# Compile regular expressions once.
infobox_pattern = re.compile(r'\{\{Infobox [^}]+\}\}', flags=re.DOTALL)
sidebar_pattern = re.compile(r'\{\{Sidebar [^}]+\}\}', flags=re.DOTALL)
link_pattern = re.compile(r'\[\[([^|\]]+\|)?([^\]]+)\]\]')
# NOTE(review): only English section headings are listed here; confirm whether
# localized headings should be truncated as well.
references_pattern = re.compile(r'==\s*(References|External links|See also|Notes)\s*==.*', flags=re.DOTALL)
citation_needed_pattern = re.compile(r'\{\{citation needed[^}]*\}\}', flags=re.DOTALL)
cn_pattern = re.compile(r'\{\{cn\}\}', flags=re.DOTALL)
curly_braces_pattern = re.compile(r'\{\{[^}]+\}\}', flags=re.DOTALL)
whitespace_pattern = re.compile(r'\s+')
table_pattern = re.compile(r'\{\|.*?\|\}', flags=re.DOTALL)
line_table_pattern = re.compile(r'\{\|.*$', flags=re.MULTILINE)
# Footnote markup. The self-closing form must be removed before the paired
# form, because "<ref[^>]*>" also matches the opening of "<ref name=x/>".
self_closing_ref_pattern = re.compile(r"<ref[^>]*\/>")
paired_ref_pattern = re.compile(r"<ref[^>]*>.*?</ref>", flags=re.DOTALL)
# Leftover template parameters ("key = value |" and "| key = value").
param_before_pipe_pattern = re.compile(r"\b\w+\s*=\s*[^|{}]+\s*\|\s*")
param_after_pipe_pattern = re.compile(r"\|\s*\w+\s*=\s*[^|{}]+")
# Bracketed / parenthesised fragments and bare URLs.
bracket_paren_url_pattern = re.compile(r"\[.*?\]|\(.*?\)|http\S+")
# Any template still present once whitespace has been collapsed.
leftover_template_pattern = re.compile(r"\{\{.*?\}\}")


def clean_text(text: str) -> str:
    """Clean Wikipedia page text by removing non-essential sections and formatting.

    The steps run in this order:

    1. ``<ref .../>`` and ``<ref>...</ref>`` footnotes are removed.
    2. Infobox/sidebar templates and ``{| ... |}`` tables are removed.
    3. ``[[target|label]]`` links are replaced by their label.
    4. Everything from a References / External links / See also / Notes
       heading onwards is dropped.
    5. Remaining ``{{...}}`` templates are removed and whitespace is
       collapsed to single spaces.
    6. Leftover ``key = value`` template parameters, ``[...]`` and ``(...)``
       fragments and bare URLs are removed.

    Args:
        text: Raw wikitext of a page.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Footnotes go first: running the URL pattern earlier would swallow the
    # closing tag of "<ref>http://...</ref>", and the paired pattern would
    # then delete article text up to the next "</ref>".
    text = self_closing_ref_pattern.sub('', text)
    text = paired_ref_pattern.sub('', text)

    text = infobox_pattern.sub('', text)
    text = sidebar_pattern.sub('', text)
    text = table_pattern.sub('', text)
    text = line_table_pattern.sub('', text)
    text = link_pattern.sub(r'\2', text)
    text = references_pattern.sub('', text)
    text = citation_needed_pattern.sub('', text)
    text = cn_pattern.sub('', text)
    text = curly_braces_pattern.sub('', text)
    text = whitespace_pattern.sub(' ', text)

    # From here on the text contains no newlines, so "." spans everything.
    text = param_before_pipe_pattern.sub('', text)
    text = bracket_paren_url_pattern.sub('', text)
    text = param_after_pipe_pattern.sub('', text)
    text = leftover_template_pattern.sub('', text)
    return text.strip()

# Remove namespace from XML tag.
def strip_namespace(tag: str) -> str:
    """Return the part of *tag* after its last ``}``, or *tag* itself if it has none."""
    # rpartition yields ('', '', tag) when no "}" is present, so the last
    # item is the local tag name in both cases.
    _prefix, _brace, local_name = tag.rpartition("}")
    return local_name

# Extract all category names from the wikitext.
# Danish Wikipedia category tags look like: [[Kategori:SomeCategory]]
# The canonical English prefix ([[Category:...]]) is accepted as well, and a
# tag may carry a sort key after a pipe: [[Kategori:SomeCategory|SortKey]].
_CATEGORY_LINK_PATTERN = re.compile(
    r'\[\[(?:Kategori|Category):(.*?)\]\]', flags=re.IGNORECASE
)


def extract_categories(text: str) -> list:
    """Return the category names referenced by category tags in *text*.

    Args:
        text: Raw wikitext of a page.

    Returns:
        A list of category names in order of appearance. Sort keys (the part
        after ``|``) and surrounding whitespace are stripped, and tags with an
        empty name are skipped.
    """
    names = []
    for link_target in _CATEGORY_LINK_PATTERN.findall(text):
        # Keep only the category name; the sort key is not part of it and
        # would otherwise produce false keyword matches.
        name = link_target.split("|", 1)[0].strip()
        if name:
            names.append(name)
    return names


# Loading pages and categorizing them by topic (e.g. history, grammar, etc.):

In [None]:
# Load pages from a Wikipedia bz2 dump, filter them using provided category keywords,
# and create LangChain documents grouped by category.

def _extract_title_and_text(page_elem):
    """Return ``(title, raw_text)`` read from a parsed ``page`` element."""
    title = "No Title"
    raw_text = ""
    for child in page_elem:
        tag = strip_namespace(child.tag)
        if tag == "title":
            title = child.text or title
        elif tag == "revision":
            for subchild in child:
                if strip_namespace(subchild.tag) == "text":
                    raw_text = subchild.text or ""
                    break
    return title, raw_text


def load_pages_by_category(file_path: str, category_keywords: dict, max_docs_per_category: int = 1):
    """Collect cleaned pages from a bz2-compressed XML dump, grouped by category.

    A page is added to a friendly category when any of that category's
    keywords occurs (case-insensitively, as a substring) in any category name
    returned by ``extract_categories`` for the page. A page may be added to
    several friendly categories.

    Args:
        file_path: Path to the ``.bz2`` XML dump.
        category_keywords: Mapping of friendly category name to an iterable of
            keyword strings.
        max_docs_per_category: Maximum number of documents collected per
            friendly category.

    Returns:
        A dict mapping each friendly category name to a list of
        ``LangchainDocument`` objects whose metadata holds the page title and
        the friendly category.
    """
    docs_by_category = {cat: [] for cat in category_keywords}
    # Lower-case the keywords once instead of once per page and comparison.
    lowered_keywords = {
        cat: [kw.lower() for kw in keywords]
        for cat, keywords in category_keywords.items()
    }

    with bz2.open(file_path, 'rb') as file:
        for _event, elem in ET.iterparse(file, events=("end",)):
            if strip_namespace(elem.tag) != "page":
                continue

            title, raw_text = _extract_title_and_text(elem)
            # Quick check on the raw text: category names, lower-cased once.
            page_categories = [cat.lower() for cat in extract_categories(raw_text)]

            # Cleaning is expensive, so it is done lazily and at most once
            # per page even when the page matches several categories.
            cleaned_text = None
            for friendly_cat, keywords in lowered_keywords.items():
                bucket = docs_by_category[friendly_cat]
                if len(bucket) >= max_docs_per_category:
                    continue
                if not any(kw in cat for cat in page_categories for kw in keywords):
                    continue
                if cleaned_text is None:
                    cleaned_text = clean_text(raw_text)
                bucket.append(
                    LangchainDocument(
                        page_content=cleaned_text,
                        metadata={"title": title, "categories": [friendly_cat]},
                    )
                )

            # Clear the element to free memory.
            elem.clear()

            # Stop reading as soon as every category has reached its quota.
            if all(len(docs) >= max_docs_per_category for docs in docs_by_category.values()):
                break
    return docs_by_category

# Save the LangChain documents (grouped by category) as separate JSON files.
def save_documents_by_category(docs_by_category: dict, output_dir: str = "Documents"):
    """Write one JSON file per category into *output_dir*.

    Each file is named ``doc_<lower-cased category>.json`` and holds a list of
    ``{"page_content": ..., "metadata": ...}`` objects. The directory is
    created if it does not exist, and a summary line is printed per file.
    """
    os.makedirs(output_dir, exist_ok=True)
    for category in docs_by_category:
        # Reduce every document to its JSON-serialisable fields.
        documents = []
        for document in docs_by_category[category]:
            documents.append(
                {"page_content": document.page_content, "metadata": document.metadata}
            )

        target_path = os.path.join(output_dir, f"doc_{category.lower()}.json")
        with open(target_path, 'w', encoding='utf-8') as handle:
            json.dump(documents, handle, ensure_ascii=False, indent=2)

        saved_count = len(documents)
        print(f"Saved {saved_count} documents for category '{category}' to {target_path}")

if __name__ == "__main__":
    # Compressed Wikipedia articles dump to read pages from.
    file_path = "../data/dawiki-latest-pages-articles.xml.bz2"

    # Friendly topic name -> keywords looked for in each page's category names.
    category_keywords = dict(
        History=["histor", "middelalder", "grundlov"],
        Grammar=["grammat", "syntaks", "ordbog"],
        CultureTradition=["Kult", "tradi", "reli", "ritual"],
        ArtEntertainment=[
            "Kunst", "underhold", "film", "musik", "teater", "mode", "TV og Radio",
        ],
        Geography=[
            "geografi", "land", "kommune", "hovedstad", "danmark", "amt", "jorden",
            "Demografi", "Ekspeditioner", "Geodæsi", "Geografer", "Kartografi",
            "Landsdele",
        ],
    )

    # Change "max_docs_per_category" to control how many documents are
    # collected per subject.
    docs_by_category = load_pages_by_category(
        file_path,
        category_keywords,
        max_docs_per_category=2000,
    )

    # Report how many documents were collected for each category.
    for cat, docs in docs_by_category.items():
        loaded_count = len(docs)
        print(f"Loaded {loaded_count} document(s) for category '{cat}'.")

    save_documents_by_category(docs_by_category, output_dir="Documents")

Loaded 2000 document(s) for category 'History'.
Loaded 236 document(s) for category 'Grammar'.
Loaded 2000 document(s) for category 'CultureTradition'.
Loaded 2000 document(s) for category 'ArtEntertainment'.
Loaded 2000 document(s) for category 'Geography'.
Saved 2000 documents for category 'History' to Documents\doc_history.json
Saved 236 documents for category 'Grammar' to Documents\doc_grammar.json
Saved 2000 documents for category 'CultureTradition' to Documents\doc_culturetradition.json
Saved 2000 documents for category 'ArtEntertainment' to Documents\doc_artentertainment.json
Saved 2000 documents for category 'Geography' to Documents\doc_geography.json
