# Set environment variables for local development

In [None]:
import json, os
# Copy every entry of the "Values" section of local.settings.json into the
# process environment so later cells can read the settings via os.environ.
# The context manager guarantees the file handle is closed (the original
# left it open until garbage collection).
with open("local.settings.json", encoding="utf-8") as settings_file:
    local_settings = json.load(settings_file)
for k, v in local_settings["Values"].items():
    os.environ[k] = v
from libs.openapi.clients import XandrAPI

In [None]:
from libs.data import register_binding, from_bind
import pandas as pd

# Register the SQL binding for the Xandr dashboard only if it is not already
# available in this session.
if not from_bind("xandr_dashboard"):
    register_binding(
        "xandr_dashboard",
        "Structured",
        "sql",
        url=os.environ["DATABIND_SQL_XANDR"],
        schemas=["dashboard"],
    )
provider = from_bind("xandr_dashboard")

# Most recent [last_modified] value stored in [dashboard].[creatives].
latest_modified = pd.read_sql(
    sql="SELECT MAX([last_modified]) FROM [dashboard].[creatives]",
    con=provider.connect().connection(),
)
# Fall back to the epoch when there is no usable value. The previous nested
# next(iter(...)) form iterated over the fallback *string* (yielding "1") and
# did not treat a NULL aggregate (MAX over an empty table) as "no value".
newest_value = latest_modified.iat[0, 0] if not latest_modified.empty else None
last_creative = "1970-01-01 00:00:00" if pd.isna(newest_value) else newest_value
str(last_creative)

In [None]:
from aiopenapi3 import ResponseSchemaError
# Smoke test: issue a single "GetCreative" request and unpack the raw
# (header, data, raw) triple returned by factory.request().
XA = XandrAPI(asynchronus=False)
factory = XA.createRequest("GetCreative")
try:
    header, data, raw = factory.request(parameters={"num_elements": 100, "start_element": 0})
except Exception as exc:
    # Catch Exception rather than using a bare except so KeyboardInterrupt and
    # SystemExit still propagate, and show what actually went wrong.
    display(f"GetCreative request failed: {exc!r}")

In [None]:
# Page through the "GetCreative" operation and collect every creative.
XA = XandrAPI(asynchronus=False)
start_element = 0
num_elements = 100
creatives = []
while True:
    # Progress indicator: offset of the page about to be requested.
    print(f"{start_element}")
    factory = XA.createRequest("GetCreative")
    response = factory(parameters={"num_elements": num_elements, "start_element": start_element})
    page = response.response
    if page.status != "OK":
        # Previously a non-OK status re-issued the same request forever.
        raise RuntimeError(
            f"GetCreative failed at start_element={start_element}: status={page.status!r}"
        )
    start_element += page.num_elements
    creatives += page.creatives
    # Stop when everything was read, or when a page came back empty (which
    # would otherwise never advance start_element and loop endlessly).
    if start_element >= page.count or not page.num_elements:
        break

In [None]:
import asyncio, hashlib, httpx, logging, orjson, os, pandas as pd, re
from bs4 import BeautifulSoup
from io import StringIO
from urllib.parse import urlparse, parse_qs, unquote_plus

# Directory in which fetched documentation pages and JSON responses are cached.
CACHE_DIR = "xandr_docs_cache"

# Create the cache directory if it doesn't exist. exist_ok=True makes the call
# idempotent and avoids the check-then-create race of a separate exists() test.
os.makedirs(CACHE_DIR, exist_ok=True)


def combine_dicts(dict_list):
    """Merge dictionaries of lists into one dictionary of de-duplicated lists.

    Every key seen in any input dictionary appears in the result, in
    first-seen order. Its value is the union of all lists stored under that
    key; duplicates are removed via ``set``, so the order of items inside each
    list is not defined.
    """
    merged = {}
    for mapping in dict_list:
        for name in mapping:
            merged.setdefault(name, []).extend(mapping[name])
    return {name: list(set(values)) for name, values in merged.items()}


def transform_url(url):
    """Rewrite ``<placeholder text>`` segments as ``{placeholder_text}``.

    Each run of characters enclosed in angle brackets is wrapped in curly
    braces instead, with spaces inside it replaced by underscores. Text
    outside angle brackets is returned unchanged.
    """
    return re.sub(
        r"<([^>]+)>",
        lambda found: "{" + found.group(1).replace(" ", "_") + "}",
        url,
    )


def replace_uppercase_words(text):
    """Wrap dash/underscore-joined uppercase words in curly braces.

    A match is two or more runs of ``A-Z`` joined by ``-`` or ``_`` and
    delimited by word boundaries (for example ``MEMBER_ID`` or
    ``LINE-ITEM-ID``). Dashes inside a match become underscores, so
    ``LINE-ITEM-ID`` turns into ``{LINE_ITEM_ID}``. A single uppercase word
    without a separator is left untouched.
    """

    def as_placeholder(found):
        # Underscores are already in the target form; only dashes change.
        return "{" + found.group(0).replace("-", "_") + "}"

    return re.sub(r"\b[A-Z]+(?:[-_][A-Z]+)+\b", as_placeholder, text)


def replace_camel_case(text):
    """Convert ``{camelCase}`` placeholders to ``{CAMEL_CASE}``.

    Only placeholders that start with one or more lowercase letters followed
    by an uppercase letter are rewritten. An underscore is inserted before
    that first uppercase letter and the whole name is upper-cased. Later
    capitals in the same name do not get their own underscore.
    """
    camel_placeholder = r"\{([a-z]+)([A-Z])([a-zA-Z]*)\}"

    def to_upper_snake(found):
        head, capital, tail = found.group(1), found.group(2), found.group(3)
        joined = head + "_" + capital + tail
        return "{" + joined.upper() + "}"

    return re.sub(camel_placeholder, to_upper_snake, text)


def get_cache_filename(url):
    """Return the cache file path for ``url`` inside ``CACHE_DIR``.

    The file name is the hexadecimal MD5 digest of the UTF-8 encoded URL.
    """
    digest = hashlib.md5(url.encode("utf-8")).hexdigest()
    return os.path.join(CACHE_DIR, digest)


async def fetch_url(client, url):
    """Return the ``"html"`` field of the JSON document at ``url``, using the disk cache.

    If a cache file for ``url`` exists, its text is returned and no request is
    made. Otherwise the URL is fetched with ``client`` (an object exposing an
    awaitable ``get(url, headers=...)``), the ``"html"`` value of the JSON body
    is written to the cache file and returned.

    Raises whatever ``response.raise_for_status()`` raises for an error
    status, and ``KeyError`` when the JSON body has no ``"html"`` key.
    """
    cache_file = get_cache_filename(url)
    # Serve from the cache when this URL has been fetched before.
    if os.path.exists(cache_file):
        with open(cache_file, "r", encoding="utf-8") as file:
            return file.read()

    response = await client.get(url, headers={"accept": "application/json"})
    # Fail loudly on HTTP errors so an error payload is never written to the
    # cache, where it would be served on every later run.
    response.raise_for_status()
    content = response.json()["html"]
    # Cache the response
    with open(cache_file, "w", encoding="utf-8") as file:
        file.write(content)
    return content


async def fetch_json(client, url, headers=None):
    """Return the parsed JSON body of ``url``, using the disk cache.

    If a cache file for ``url`` exists, it is parsed with ``orjson`` and
    returned without making a request. Otherwise the URL is fetched with
    ``client`` using ``headers``, and the decoded JSON is written to the cache
    and returned. The cache key is derived from the URL only; ``headers`` do
    not influence it.

    Raises whatever ``response.raise_for_status()`` raises for an error status.
    """
    cache_file = get_cache_filename(url)
    if os.path.exists(cache_file):
        with open(cache_file, "r", encoding="utf-8") as file:
            return orjson.loads(file.read())

    response = await client.get(url, headers=headers)
    # Without this check a JSON error body (for example an authentication
    # failure) would be cached and returned on every later run.
    response.raise_for_status()
    content = response.json()
    with open(cache_file, "w", encoding="utf-8") as file:
        file.write(orjson.dumps(content).decode("utf-8"))
    return content


async def fetch_data():
    """Scrape the Xandr API documentation into a ``paths``/``schemas`` dictionary.

    Steps, all performed with one shared ``httpx.AsyncClient``:

    1. Fetch the documentation table of contents (JSON) and parse the HTML
       stored under its ``"nav-54"`` key to discover service pages.
    2. Fetch every service page via ``fetch_url`` and read each HTML table
       whose columns are exactly "HTTP Method", "Endpoint" and "Description".
    3. Normalise the endpoint text and derive ``requestBody``, ``url``,
       ``parameters``, ``path``, ``server`` and ``tags`` columns.
    4. Group the collected rows by path and HTTP method into ``paths``.
    5. For every single-segment path, fetch ``https://api.appnexus.com{path}/meta``
       and store the ``response.fields`` value in ``schemas``.

    Returns:
        dict: ``{"paths": paths, "schemas": schemas}``.
    """
    async with httpx.AsyncClient(timeout=None) as client:
        # Table of contents of the API documentation, served as JSON.
        response = await fetch_json(
            client,
            url="https://xandr-be-prod.zoominsoftware.io/bundle/xandr-api/toc/api-getting-started.html",
            headers={"accept": "application/json"},
        )
        # The "nav-54" entry holds an HTML fragment; its first <ul> is the root.
        soup = BeautifulSoup(response["nav-54"], "html.parser")
        root = soup.find("ul")

        # Each entry is (ref, service_name, coroutine, url). `ref` is the text
        # of the first link found in a nested <ul>; every <li> link whose text
        # differs from `ref` is queued as a service page to download.
        tasks = []
        for ul in root.find_all("ul"):
            ref = ul.find("a").text.strip()
            for li in ul.find_all("li"):
                service_name = li.find("a").text.strip()
                if ref != service_name:
                    url = li.find("a").get("href")
                    task = fetch_url(client, url)
                    tasks.append((ref, service_name, task, url))

        # Download (or read from cache) all service pages concurrently.
        results = await asyncio.gather(*(task for _, _, task, _ in tasks))

        all_ops = pd.DataFrame()
        for (ref, service_name, task, url), result in zip(tasks, results):
            if tables := BeautifulSoup(result, "html.parser").find_all("table"):
                for table in tables:
                    df_api = pd.read_html(StringIO(str(table)))[0]
                    # Only tables with exactly these three columns describe operations.
                    if set(df_api.columns) == set(
                        ["HTTP Method", "Endpoint", "Description"]
                    ):
                        df_api.rename(
                            columns={
                                "HTTP Method": "method",
                                "Description": "description",
                            },
                            inplace=True,
                        )
                        # Normalise the scraped endpoint text: force https, make
                        # sure ".com" is followed by one slash, put a space in
                        # front of "(", "POST" and the Important:/Note:/Tip:/
                        # Warning: markers so they can be split on below,
                        # collapse whitespace runs and URL-decode the result.
                        df_api["Endpoint"] = df_api["Endpoint"].apply(
                            lambda x: unquote_plus(
                                re.sub(
                                    r"\s+",
                                    " ",
                                    x.replace("(", " (")
                                    .replace("/ ", "/")
                                    .replace("http", "https")
                                    .replace("httpss", "https")
                                    .replace(".com", ".com/")
                                    .replace(".com//", ".com/")
                                    .replace("- ", "-")
                                    .replace(" ?", "?")
                                    .replace("POST", " POST")
                                    .replace("Important:", " Important:")
                                    .replace("Note:", " Note:")
                                    .replace("Tip:", " Tip:")
                                    .replace("Warning:", " Warning:"),
                                )
                            )
                        )
                        df_api["External Documentation"] = url
                        # Append any Important/Note/Tip/Warning text found in the
                        # endpoint cell to the row's description.
                        # NOTE(review): this assigns to the Series yielded by
                        # iterrows(); pandas documents that such rows may be
                        # copies, so the change may never reach df_api — verify.
                        for _, record in df_api.iterrows():
                            for t in ["Important", "Note", "Tip", "Warning"]:
                                if len(msg := record["Endpoint"].split(f" {t}:")) > 1:
                                    record["description"] += f"\n{t}:{msg[-1]}"
                        # A cell may list several URLs: append one extra row per
                        # distinct non-empty fragment between "https://" markers.
                        for _, record in df_api.iterrows():
                            if (
                                len(endpoints := record["Endpoint"].split("https://"))
                                > 1
                            ):
                                r = dict(record)
                                df_api = pd.concat(
                                    [
                                        df_api,
                                        pd.DataFrame(
                                            [
                                                {
                                                    **r,
                                                    "Endpoint": f"https://{e}".strip(),
                                                }
                                                for e in set(endpoints)
                                                if e
                                            ]
                                        ),
                                    ],
                                    ignore_index=True,
                                )

                        # Drop the rows that still contain more than one "http"
                        # occurrence (the original multi-URL cells).
                        # NOTE(review): rows are dropped in place while the frame
                        # is being iterated — confirm this is safe for the pandas
                        # version in use.
                        for index, record in df_api.iterrows():
                            if len(endpoints := record["Endpoint"].split("http")) > 2:
                                df_api.drop(index, inplace=True)
                        df_api.drop_duplicates(["Endpoint"], keep="first", inplace=True)
                        # requestBody: with braces treated as parentheses, take the
                        # text before " POST", then the last " (" group up to its
                        # first ")" with " JSON" removed; None when there is no group.
                        df_api["requestBody"] = df_api["Endpoint"].apply(
                            lambda x: r[-1].split(")")[0].replace(" JSON", "")
                            if len(
                                r := x.replace("{", "(")
                                .replace("}", ")")
                                .split(" POST")[0]
                                .strip()
                                .split(" (")
                            )
                            > 1
                            else None
                        )
                        # url: parse the first whitespace-delimited token of the
                        # text before " (", after <placeholder> -> {placeholder}.
                        df_api["url"] = df_api["Endpoint"].apply(
                            lambda x: urlparse(
                                transform_url(x.split(" (")[0].strip())
                                .split(" ")[0]
                                .strip()
                            )
                        )
                        # parameters: query-string names mapped to their values,
                        # upper-cased and stripped of curly braces.
                        df_api["parameters"] = df_api["url"].apply(
                            lambda x: {
                                k: [
                                    p.upper().replace("{", "").replace("}", "")
                                    for p in v
                                ]
                                for k, v in parse_qs(x.query).items()
                            }
                        )
                        # path: URL-decoded path with [..] turned into {..},
                        # UPPER_CASE words and camelCase placeholders normalised,
                        # and doubled braces collapsed.
                        df_api["path"] = df_api["url"].apply(
                            lambda x: replace_camel_case(
                                replace_uppercase_words(
                                    unquote_plus(x.path)
                                    .replace("[", "{")
                                    .replace("]", "}")
                                )
                                .replace("{{", "{")
                                .replace("}}", "}")
                            )
                        )
                        df_api["server"] = df_api["url"].apply(lambda x: x.netloc)
                        # Every row is tagged with [ref, service_name].
                        df_api["tags"] = ref
                        df_api["tags"] = df_api["tags"].apply(
                            lambda x: [x, service_name]
                        )
                        # Rows whose path ends in "/meta" are excluded from the result.
                        all_ops = pd.concat(
                            [all_ops, df_api[~df_api["path"].str.endswith("/meta")]]
                        )

        all_ops.drop_duplicates(["Endpoint"], keep="first", inplace=True)
        # Build {path: {method: operation}}. The description concatenates the
        # distinct descriptions and the distinct endpoint strings of the group;
        # both go through set(), so the order of the joined lines is not fixed.
        paths = {
            path: {
                method.lower(): {
                    "description": "\n".join(set(method_df["description"].to_list()))
                    .replace("Note:", "\nNote:")
                    .replace("  ", " ")
                    .replace("\n\n", "\n")
                    + "\n"
                    + "\n".join(set(method_df["Endpoint"].to_list())),
                    "parameters": [
                        #     {
                        #         "name": k,
                        #         "in": "query",
                        #         "schema": {"type": "string"},
                        #         "examples": {
                        #             str(i): ev.replace("ID_VALUE", "ID")
                        #             for i, ev in enumerate(v)
                        #         },
                        #     }
                        #     for k, v in combine_dicts(
                        #         [p for p in method_df["parameters"].to_list() if p]
                        #     ).items()
                    ],
                    "requestBody": list(set([rb.replace(" ", "-") for rb in rbs if rb]))
                    if len(rbs := method_df["requestBody"].to_list())
                    else None,
                    "externalDocs": {
                        "url": method_df["External Documentation"].to_list()[0]
                    },
                }
                for method, method_df in path_df.groupby("method")
            }
            for path, path_df in all_ops.groupby("path")
        }

        # NOTE(review): `refs` is collected here but not used again in this function.
        refs = []
        for path, op in paths.items():
            refs += [o["requestBody"] for o in op.values() if o["requestBody"]]
            # Path-level parameters: one entry per {placeholder} in the path.
            # NOTE(review): the names come from the path template but are
            # emitted with "in": "query" — confirm "path" was not intended.
            op["parameters"] = [
                {
                    "name": k,
                    "in": "query",
                    "schema": {"type": "string"},
                    "required": True,
                }
                for k in re.findall(r"\{([^}]+)\}", path)
            ]

        schemas = {}
        tasks = []

        # Queue a "/meta" request for every path that has a single segment
        # (splitting on "/" gives exactly two parts).
        for path in paths.keys():
            if len(path.split("/")) == 2:
                url = f"https://api.appnexus.com{path}/meta"
                # NOTE(review): a bearer token is hard-coded in the source;
                # consider loading it from configuration and rotating this one.
                headers = {
                    "accept": "application/json",
                    "authorization": "Bearer authn:279514:ac836e26876a3:nym2"
                }
                tasks.append((fetch_json(client, url, headers), path))

        # The queued coroutines are awaited one at a time. A failure for one
        # path is logged and does not stop the remaining paths.
        for (task, path) in tasks:
            try:
                result = await task
                if f := result.get("response").get("fields"):
                    # Schemas are keyed by the path without its leading slash.
                    schemas[path[1:]] = f
            except Exception as e:
                logging.error(f"Error fetching schema for {path}: {e}")

        return {"paths": paths, "schemas": schemas}

In [None]:
def map_type_to_openapi(field):
    """
    Translate a Xandr field description into an OpenAPI ``(type, format)`` pair.

    ``field`` must have a ``'type'`` key. Unknown Xandr types map to
    ``('string', None)``; a format of ``None`` means "no format".
    """
    openapi_types = {
        'int': 'integer',
        'double': 'number',
        'date': 'string',
        'money': 'number',
        'string': 'string',
        'boolean': 'boolean',
    }
    # Only a few Xandr types carry an OpenAPI format hint.
    openapi_formats = {
        'double': 'double',
        'date': 'date-time',
        'money': 'float',
    }
    xandr_type = field['type']
    if xandr_type not in openapi_types:
        return 'string', None
    return openapi_types[xandr_type], openapi_formats.get(xandr_type)

def process_schema_fields(fields):
    """
    Build the OpenAPI ``properties`` mapping for a list of Xandr fields.

    Each field is a dict with ``'name'`` and ``'type'`` keys. Fields of type
    ``'array of objects'`` may carry a nested ``'fields'`` list:

    * exactly one nested field: an array whose items use that nested field's
      own schema (not an object);
    * otherwise: an array of objects whose properties are built recursively.

    Returns a dict mapping each field name to its OpenAPI schema.
    """

    def schema_for(field):
        # Schema for one field, without its name.
        if field['type'] == 'array of objects':
            children = field.get('fields', [])
            if len(children) == 1:
                # Recurse so a lone child that is itself an "array of objects"
                # keeps its structure instead of collapsing to a string.
                return {'type': 'array', 'items': schema_for(children[0])}
            return {
                'type': 'array',
                'items': {'type': 'object', 'properties': process_schema_fields(children)}
            }
        # Regular field
        openapi_type, openapi_format = map_type_to_openapi(field)
        schema = {'type': openapi_type}
        if openapi_format:
            schema['format'] = openapi_format
        return schema

    openapi_fields = {}
    for field in fields:
        openapi_fields[field['name']] = schema_for(field)
    return openapi_fields

def generate_openapi_schemas(schemas):
    """
    Convert a mapping of schema name to field list into OpenAPI object schemas.

    Each resulting schema has type ``object`` and takes its ``properties``
    from ``process_schema_fields``.
    """
    return {
        schema_name: {
            'type': 'object',
            'properties': process_schema_fields(field_list)
        }
        for schema_name, field_list in schemas.items()
    }

In [None]:
with open("xandr.json", "wb") as f:
    f.write(
        orjson.dumps(
            # generate_openapi_schemas(
                await fetch_data(),
            # )
        )
    )

In [None]:
import json
# Show the top-level keys of the cached table-of-contents document. The cache
# file is named after the MD5 hex digest of the URL.
toc_url = "https://xandr-be-prod.zoominsoftware.io/bundle/xandr-api/toc/api-getting-started.html"
toc_cache_path = os.path.join(CACHE_DIR, hashlib.md5(toc_url.encode()).hexdigest())
with open(toc_cache_path) as file:
    toc_document = json.load(file)
    display(list(toc_document.keys()))

The attached JSON file represents "paths" and "schemas" for the Xandr API. Each path may have a "post", "get", "put", and/or "delete" key which represents the operations that can be executed on that path, and may have a "parameters" key which represents parameters that are used with all of the operations for that path. 

The "post" key represents an HTTP POST request for creating a new record.
The "get" key represents an HTTP GET request for reading one or more existing records.
The "put" key represents an HTTP PUT request for updating an existing record.
The "delete" key represents an HTTP DELETE request for deleting an existing record.

Each operation has "description", "parameters", "requestBody", and "externalDocs" keys.

Each key in "schemas" (at the root level of the JSON file) is a list of fields for the object type that a given path manages. Each field has a "name" and a "type" key, and may have a "sort_by", "filter_by", and/or "read_only" key.

Fields that are of the type "int", "double", "date", or "money" can be filtered by min and max. For example:
/campaign?min_id=47
/campaign?min_advertiser_id=20

Fields of the type "date" can be filtered by nmin and nmax as well. The nmin filter lets you find dates that are either null or after the specified date, and the nmax filter lets you find dates that are either null or before the specified date. For example:

/campaign?nmax_start_date=2012-12-20+00:00:00
/campaign?nmin_end_date=2013-01-01+00:00:00
Note the required date/time syntax in the preceding example: YYYY-MM-DD+HH:MM:SS

The following additional field-based filters are supported on API responses:
not_*
like_*
min_*
max_*
nmin_*
nmax_*
having_*
having_min_*
having_max_*
Example:
/placement?like_parent_brand_name=Outback

Some services support search as a query string parameter to look for ID or name. 
For example:
/placement?search=17

To sort, use the sort query string parameter and pass in a list of the fields you'd like to sort by, along with whether each should be ascending (asc) or descending (desc).
For example:
/campaign?advertiser_id=1&sort=id.desc

When getting multiple records, pagination can be used with the start_element and num_elements parameters. If num_elements is not supplied, it defaults to 100 (which is also the maximum value).
/campaign?start_element=20&num_elements=10

By including append=true in the query string of a PUT call, a user can update only a particular child object instead of replacing all child objects. In other words, rather than overwriting an entire array with a new one on a PUT call, you can use append=true on the query string to add a single element to a long array.

Write a Python script that uses the attached JSON file to generate an OpenAPI v3.1.0-compliant component schema and parameter for each schema defined in the JSON file.

Note, OpenAPI only supports the following data types: 
string
number
integer
boolean
array
object

So, mapping the field types to OpenAPI data types with proper formatting would be prudent. For example, if the field type is date, the corresponding OpenAPI data type would be "string" with a format of "date-time".

Also, fields that have the type "array of objects" will also have the key "fields" which represents another object type. In this case fields should be an array of the type "object" and the "fields" will need to be processed recursively to generate a proper OpenAPI schema. 

NOTE: If the field's type is "array of objects" and there is only one item in the "fields" key, then the field's type should be "array", but the schema should not be of the type "object". Instead, the schema should be whatever the type of the one item in "fields" is.