In [None]:
import requests
import bs4

# Alpha Vantage documentation page to scrape
url = "https://www.alphavantage.co/documentation/"

# Fetch the page. The explicit timeout keeps the cell from hanging forever on
# a stalled connection (requests applies no timeout unless one is given).
response = requests.get(url, timeout=30)

def process_function(function: bs4.element.Tag) -> dict[str, str]:
    """
    Extract the documentation that follows one endpoint heading.

    The siblings after the given <h4> tag are scanned in document order and
    the scan stops at the next <h4>, so only content that belongs to this
    heading is collected.

    Args:
        function: The <h4> tag whose text is the endpoint title.

    Returns:
        A dict with the keys:
        - "function_name": stripped text of the <h4> tag
        - "description": text of the <p> tags that come before the
          "API Parameters" <h6>, joined with newlines
        - "api_params": text of the <p> tags between the "API Parameters"
          <h6> and the "Examples" <h6>, joined with newlines
        - "python_code_example": text of the <code class="python"> element
          inside a <div class="python-code"> sibling ("" when there is none;
          the last such block wins when there are several)
    """
    description_parts: list[str] = []
    api_params_parts: list[str] = []
    python_code = ""

    # Parsing state: description first, then parameters once the
    # "API Parameters" heading has been seen.
    in_description = True
    in_api_params = False

    for sib in function.next_siblings:
        if not isinstance(sib, bs4.element.Tag):
            # Bare strings between tags are usually just whitespace/newlines
            continue

        if sib.name == "h4":
            # The next endpoint starts here; everything after it belongs to
            # another function and must not leak into this result.
            break

        if sib.name == "h6":
            heading_text = sib.get_text(strip=True)
            if "API Parameters" in heading_text:
                in_description = False
                in_api_params = True
                continue
            if "Examples" in heading_text:
                # Parameters end here, but keep scanning for the code sample
                in_api_params = False
                continue

        if in_description and sib.name == "p":
            desc_text = sib.get_text(strip=True)
            if desc_text:
                description_parts.append(desc_text)

        if in_api_params and sib.name == "p":
            # A space separator keeps inline <code> names apart from the text
            param_text = sib.get_text(" ", strip=True)
            if param_text:
                api_params_parts.append(param_text)

        # The code sample can show up in any state
        if sib.name == "div" and "python-code" in sib.get("class", []):
            code_block = sib.find("code", class_="python")
            if code_block:
                python_code = code_block.get_text()

    return {
        "function_name": function.get_text(strip=True),
        "description": "\n".join(description_parts),
        "api_params": "\n".join(api_params_parts),
        "python_code_example": python_code,
    }


def process_section(section: bs4.element.Tag, max_functions: int | None = 1) -> None:
    """
    Print a section's title and the extracted details of its functions.

    The section is re-parsed from its own HTML, its <h2> text is printed as
    the title ("Untitled Section" when there is no <h2>), and each selected
    <h4> is passed to process_function and the result printed.

    Args:
        section: The <section> tag to report on.
        max_functions: Upper bound on how many <h4> functions are printed.
            The default of 1 keeps the original "first function only"
            preview; pass None to print every function in the section.
            Negative values are treated as 0.
    """
    section_soup = bs4.BeautifulSoup(str(section), "html.parser")
    section_title_tag = section_soup.find("h2")
    section_title = section_title_tag.get_text(strip=True) if section_title_tag else "Untitled Section"

    print(f"Section Title: {section_title}")

    functions = section_soup.find_all("h4")
    if max_functions is not None:
        functions = functions[: max(max_functions, 0)]

    for number, function_tag in enumerate(functions, 1):
        extracted = process_function(function_tag)
        print(f"Function {number}: {extracted['function_name']}")
        print("Description:")
        print(extracted["description"])
        print("\nAPI Parameters:")
        print(extracted["api_params"])
        print("\nPython Code Example:")
        print(extracted["python_code_example"])
        print("\n" + "-"*80 + "\n")


# Report a failed download; otherwise parse the page and preview one section
if response.status_code != 200:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")
else:
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    # The documentation body is expected in <article class="main-content" role="main">
    article = soup.find("article", class_="main-content", role="main")

    if not article:
        print("No <article> with class 'main-content' and role 'main' found.")
    else:
        sections = article.find_all("section")

        # Only the first <section> (if any) is processed
        for section in sections[:1]:
            process_section(section)

In [67]:
import requests
import bs4
import json

# Alpha Vantage documentation page to scrape
url = "https://www.alphavantage.co/documentation/"

# Bound the request so a stalled connection cannot hang the notebook
# (requests waits indefinitely when no timeout is given).
response = requests.get(url, timeout=30)
if response.status_code != 200:
    raise RuntimeError("Failed to get HTML documentation from AlphaVantage!")

# Parsed document used by the cells below
soup = bs4.BeautifulSoup(response.text, "html.parser")

In [None]:
def process_section(section: bs4.element.Tag) -> tuple[str, dict[str, dict[str, str | list[str]]]]:
    section_title = section.find("h2").text
    # print("Section Title:", section_title)

    collection = {}
    contents = [c for c in section.contents if c != "\n" and not str(c).startswith("<br/")]
    i = 0
    while True:
        try:
            while not str(contents[i]).startswith("<h4"):
                i += 1
        except IndexError:
            break
        func_name = contents[i].text
        if func_name.startswith("Quote Endpoint"):
            func_name = "GLOBAL_QUOTE"
        i += 1
        descr = []
        while str(contents[i]).startswith("<p"):
            descr.append(contents[i].text)
            i += 1
        description = "\n".join(descr)

        assert str(contents[i]) == "<h6><b>API Parameters</b></h6>"
        i += 1
        reqs = []
        opts = []
        while True:
            if str(contents[i]) == "<p><b>❚ Required: <code>apikey</code></b></p>":
                i += 2
                break
            argument = str(contents[i].find("code").text)
            lines = []
            if str(contents[i]).startswith("<p><b>❚ Required: "):
                if argument == "function":
                    code = contents[i + 1].find("code")
                    if code is not None:
                        func_name = code.text.split("=")[1]
                else:
                    while 
                    reqs.append([argument, contents[i + 1].text.strip()])
            else:
                assert str(contents[i]).startswith("<p>❚ Optional: ")
                opts.append([argument, contents[i + 1].text.strip()])
            i += 2
        
        assert func_name is not None
        collection[func_name] = {
            "description": description,
            "args_required": reqs,
            "args_optional": opts
        }
        func_name = None
    return section_title, collection


sections = soup.find_all("section")
section_dict = {}

# Only the third <section> of the page is parsed here; it is echoed first
# so the raw HTML can be compared with the parsed result.
section = sections[2]
print(section)
section_title, collection = process_section(section)
section_dict[section_title] = collection

<section>
<h2 id="intelligence">Alpha Intelligence™</h2>
<p>The APIs in this section contain advanced market intelligence built with our decades of expertise in AI, machine learning, and quantitative finance. We hope these highly differentiated alternative datasets can help turbocharge your trading strategy, market research, and financial software application to the next level. </p>
<br/>
<h4 id="news-sentiment">Market News &amp; Sentiment <span class="popular-label">Trending</span></h4>
<p>Looking for market news data to train your LLM models or to augment your trading strategy? You have just found it. This API returns live and historical market news &amp; sentiment data from a large &amp; growing selection of premier news outlets around the world, covering stocks, cryptocurrencies, forex, and a wide range of topics such as fiscal policy, mergers &amp; acquisitions, IPOs, etc. This API, combined with our core stock API, fundamental data, and technical indicator APIs, can provide you w

AssertionError: 

In [102]:
# Scratch check: str.strip() drops the surrounding spaces and the newline
"   asdaas.\n    ".strip()

'asdaas.'