In [None]:
import requests
import bs4

# URL to fetch
url = "https://www.alphavantage.co/documentation/"

# Send a GET request to the URL
response = requests.get(url)

def process_function(function: bs4.element.Tag) -> dict[str, str]:
    """
    Given an <h4> tag (function), extract:
    - function_name
    - description
    - api_params (arguments)
    - python_code_example
    """
    result = {
        "function_name": "",
        "description": "",
        "api_params": "",
        "python_code_example": "",
    }

    # Extract the function name
    function_name = function.get_text(strip=True)
    result["function_name"] = function_name

    # We'll iterate through siblings to find description, parameters, and code
    siblings = list(function.next_siblings)

    # Helper variables
    description_parts = []
    api_params_parts = []
    python_code = ""
    
    # States to control where we are in the HTML
    in_description = True  # Initially, we are reading description until we find "API Parameters"
    in_api_params = False

    for sib in siblings:
        if isinstance(sib, bs4.NavigableString):
            # often just whitespace or newline
            continue
        
        if isinstance(sib, bs4.element.Tag):
            # Check for headings that mark transitions
            if sib.name == "h6":
                heading_text = sib.get_text(strip=True)
                if "API Parameters" in heading_text:
                    # Found start of API parameters
                    in_description = False
                    in_api_params = True
                    continue
                elif "Examples" in heading_text:
                    # Found examples section; stop API params parsing
                    in_api_params = False
                    # We don't necessarily stop reading because we might still find the python code below
                    continue

            # If we're in the description phase, accumulate <p> text until API params start
            if in_description:
                if sib.name == "p" or sib.name == "br":
                    desc_text = sib.get_text(strip=True)
                    if desc_text:
                        description_parts.append(desc_text)
            
            # If we're in the API params phase, accumulate <p> elements until another section appears
            if in_api_params:
                if sib.name == "p":
                    param_text = sib.get_text(" ", strip=True)
                    if param_text:
                        api_params_parts.append(param_text)

            # Regardless of where we are, look for python code block
            # The python code block is inside <div class="python-code">
            if sib.name == "div" and "python-code" in sib.get("class", []):
                code_block = sib.find("code", class_="python")
                if code_block:
                    python_code = code_block.get_text()
    
    # Join the accumulated parts
    description_str = "\n".join(description_parts)
    api_params_str = "\n".join(api_params_parts)

    # Assign to result
    result["description"] = description_str
    result["api_params"] = api_params_str
    result["python_code_example"] = python_code

    return result


def process_section(section: bs4.element.Tag) -> None:
    section_soup = bs4.BeautifulSoup(str(section), "html.parser")
    section_title_tag = section_soup.find("h2")
    section_title = section_title_tag.get_text(strip=True) if section_title_tag else "Untitled Section"
    
    print(f"Section Title: {section_title}")

    functions = section_soup.find_all("h4")
    for i, function_tag in enumerate(functions):
        extracted = process_function(function_tag)
        print(f"Function {i+1}: {extracted['function_name']}")
        print("Description:")
        print(extracted["description"])
        print("\nAPI Parameters:")
        print(extracted["api_params"])
        print("\nPython Code Example:")
        print(extracted["python_code_example"])
        print("\n" + "-"*80 + "\n")
        break


# Check if the request was successful
if response.status_code == 200:
    # Parse the main HTML content
    soup = bs4.BeautifulSoup(response.text, "html.parser")
    
    # Find the <article> tag with the specific class and role
    article = soup.find("article", class_="main-content", role="main")
    
    if article:
        # Find all <section> tags within this <article>
        sections = article.find_all("section")
        
        # Process each section as its own HTML
        for i, section in enumerate(sections, 1):
            process_section(section)
            break
    else:
        print("No <article> with class 'main-content' and role 'main' found.")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

In [67]:
import requests
import bs4
import json

url = "https://www.alphavantage.co/documentation/"

response = requests.get(url)
if response.status_code != 200:
    raise RuntimeError("Failed to get HTML documentation from AlphaVantage!")

soup = bs4.BeautifulSoup(response.text, "html.parser")

In [128]:
def process_section(section: bs4.element.Tag) -> tuple[str, dict[str, dict[str, str | list[str]]]]:
    section_title = section.find("h2").text
    # print("Section Title:", section_title)

    collection = {}
    contents = [c for c in section.contents if c != "\n" and not str(c).startswith("<br/")]
    i = 0
    while True:
        try:
            while not str(contents[i]).startswith("<h4"):
                i += 1
        except IndexError:
            break
        func_name = contents[i].text
        if func_name.startswith("Quote Endpoint"):
            func_name = "GLOBAL_QUOTE"
        i += 1
        descr = []
        while str(contents[i]).startswith("<p"):
            descr.append(contents[i].text)
            i += 1
        description = "\n".join(descr)

        assert str(contents[i]) == "<h6><b>API Parameters</b></h6>"
        i += 1
        reqs = []
        opts = []
        while True:
            if str(contents[i]) == "<p><b>❚ Required: <code>apikey</code></b></p>":
                i += 2
                break
            argument = str(contents[i].find("code").text)
            if argument == "function":
                code = contents[i + 1].find("code")
                if code is not None:
                    func_name = code.text.split("=")[1] 
                i += 2
                continue

            lines = []
            is_req = str(contents[i]).startswith("<p><b>❚ Required: ")
            i += 1
            while "❚" not in str(contents[i]):
                lines.append(contents[i].text)
                i += 1

            annotated_content = [argument, "\n".join(lines)]
            if is_req:
                reqs.append(annotated_content)
            else:
                opts.append(annotated_content)
        
        assert func_name is not None
        collection[func_name] = {
            "description": description,
            "args_required": reqs,
            "args_optional": opts
        }
        func_name = None
    return section_title, collection


sections = soup.find_all("section")
section_dict = {}
for section in sections:
    section_title, collection = process_section(section)
    section_dict[section_title] = collection

In [131]:
print(json.dumps(section_dict, indent = 4))

{
    "Time Series Stock Data APIs": {
        "TIME_SERIES_INTRADAY": {
            "description": "This API returns current and 20+ years of historical intraday OHLCV time series of the equity specified, covering pre-market and post-market hours where applicable (e.g., 4:00am to 8:00pm Eastern Time for the US market). You can query both raw (as-traded) and split/dividend-adjusted intraday data from this endpoint. The OHLCV data is sometimes called \"candles\" in finance literature.",
            "args_required": [
                [
                    "symbol",
                    "The name of the equity of your choice. For example: symbol=IBM "
                ],
                [
                    "interval",
                    "Time interval between two consecutive data points in the time series. The following values are supported: 1min, 5min, 15min, 30min, 60min"
                ]
            ],
            "args_optional": [
                [
                    "adjusted",
 

In [207]:
print("from typing import Optional")
print("class C:")

print("    def _request(self, *args, **kwargs) -> None: ...")
for section, dict_ in section_dict.items():
    print("#" * (len(section) + 4))
    print("#", section, "#")
    print("#" * (len(section) + 4))

    print()
    for k, v in dict_.items():
        args_req = [a[0] for a in v["args_required"]]
        args_req_str = ", ".join(args_req)
        args_req_request = [f'"{arg}={arg}"' for arg in args_req]

        args_opt = [a[0] for a in v["args_optional"]]
        args_opt_adj = [f"{a[0]}:Optional[any]=None" for a in v["args_optional"]]
        args_opt_str = ", ".join(args_opt_adj)
        args_opt_request = [f'(f"[{arg}=' + "{" + arg + "}] if " + arg + " is not None else [])" for arg in args_opt]

        print("    ", f"def get_{k.lower()}(self, {args_req_str}, {args_opt_str}) -> dict[str, any]:", sep="")
        print('        """')
        print("        ", v["description"], sep = "")
        for arg, desc in v["args_required"]:
            print(f"        {arg} (required)")
            print(f"         - {desc}")
        for arg, desc in v["args_optional"]:
            print(f"        {arg} (optional)")
            print(f"         - {desc}")
        print('        """')
        print(f"""
        return self._send_request(
            function="{k}",
            request_args=[{','.join(args_req_request)}] + {' + '.join(args_opt_request)}
        )
        """)
        break

    break

from typing import Optional
class C:
    def _request(self, *args, **kwargs) -> None: ...
###############################
# Time Series Stock Data APIs #
###############################

    def get_time_series_intraday(self, symbol, interval, adjusted:Optional[any]=None, extended_hours:Optional[any]=None, month:Optional[any]=None, outputsize:Optional[any]=None, datatype:Optional[any]=None) -> dict[str, any]:
        """
        This API returns current and 20+ years of historical intraday OHLCV time series of the equity specified, covering pre-market and post-market hours where applicable (e.g., 4:00am to 8:00pm Eastern Time for the US market). You can query both raw (as-traded) and split/dividend-adjusted intraday data from this endpoint. The OHLCV data is sometimes called "candles" in finance literature.
        symbol (required)
         - The name of the equity of your choice. For example: symbol=IBM 
        interval (required)
         - Time interval between two consecutive dat

In [None]:
from typing import Optional
class C:
    def _request(self, *args, **kwargs) -> None: ...
###############################
# Time Series Stock Data APIs #
###############################

    def get_time_series_intraday(self, symbol, interval, adjusted:Optional[any]=None, extended_hours:Optional[any]=None, month:Optional[any]=None, outputsize:Optional[any]=None, datatype:Optional[any]=None) -> dict[str, any]:
        """
        This API returns current and 20+ years of historical intraday OHLCV time series of the equity specified, covering pre-market and post-market hours where applicable (e.g., 4:00am to 8:00pm Eastern Time for the US market). You can query both raw (as-traded) and split/dividend-adjusted intraday data from this endpoint. The OHLCV data is sometimes called "candles" in finance literature.
        symbol (required)
         - The name of the equity of your choice. For example: symbol=IBM 
        interval (required)
         - Time interval between two consecutive data points in the time series. The following values are supported: 1min, 5min, 15min, 30min, 60min
        adjusted (optional)
         - By default, adjusted=true and the output time series is adjusted by historical split and dividend events. Set adjusted=false to query raw (as-traded) intraday values. 
        extended_hours (optional)
         - By default, extended_hours=true and the output time series will include both the regular trading hours and the extended (pre-market and post-market) trading hours (4:00am to 8:00pm Eastern Time for the US market). Set extended_hours=false to query regular trading hours (9:30am to 4:00pm US Eastern Time) only. 
        month (optional)
         - By default, this parameter is not set and the API will return intraday data for the most recent days of trading. You can use the month parameter (in YYYY-MM format) to query a specific month in history. For example, month=2009-01. Any month in the last 20+ years since 2000-01 (January 2000) is supported.
        outputsize (optional)
         - By default, outputsize=compact. Strings compact and full are accepted with the following specifications: compact returns only the latest 100 data points in the intraday time series; full returns trailing 30 days of the most recent intraday data if the month parameter (see above) is not specified, or the full intraday data for a specific month in history if the month parameter is specified. The "compact" option is recommended if you would like to reduce the data size of each API call. 
        datatype (optional)
         - By default, datatype=json. Strings json and csv are accepted with the following specifications: json returns the intraday time series in JSON format; csv returns the time series as a CSV (comma separated value) file. 
        """

        return self._send_request(
            function="TIME_SERIES_INTRADAY",
            request_args=["symbol=symbol","interval=interval"] + (f"[adjusted={adjusted}] if adjusted is not None else []) + (f"[extended_hours={extended_hours}] if extended_hours is not None else []) + (f"[month={month}] if month is not None else []) + (f"[outputsize={outputsize}] if outputsize is not None else []) + (f"[datatype={datatype}] if datatype is not None else [])
        )
        


SyntaxError: invalid syntax. Maybe you meant '==' or ':=' instead of '='? (3156370365.py, line 29)