In [1]:
import os
from dotenv import load_dotenv
load_dotenv()   
# Obtain a session token (valid for 30 days)
import requests

# Secret variables, read from the environment (populated from .env by load_dotenv)
LUMAR_SECRET = os.getenv("LUMAR_SECRET")
LUMAR_USER_KEY_ID = os.getenv("LUMAR_USER_KEY_ID")
LUMAR_CRAWL_ID = os.getenv("LUMAR_CRAWL_ID")

if not LUMAR_SECRET:
    raise RuntimeError("Missing LUMAR_SECRET. Put it in a .env file or environment")
# The login mutation declares $userKeyId as non-null, so fail early with a
# clear message instead of sending None and getting an opaque API error.
if not LUMAR_USER_KEY_ID:
    raise RuntimeError("Missing LUMAR_USER_KEY_ID. Put it in a .env file or environment")

# GraphQL endpoint for the Lumar API
url = 'https://api.lumar.io/graphql'
# GraphQL mutation that exchanges the user key for a session token
query = """
mutation LoginWithUserKey($secret: String!, $userKeyId: ObjectID!) {
  createSessionUsingUserKey(input: { userKeyId: $userKeyId, secret: $secret }) {
    token
  }
}
"""
# Variables for the mutation
variables = {
    "secret": LUMAR_SECRET,
    "userKeyId": LUMAR_USER_KEY_ID,
}
# Send the request. Without a timeout, requests waits indefinitely on a
# stalled connection.
response = requests.post(url, json={"query": query, "variables": variables}, timeout=30)
# Surface HTTP-level failures (4xx/5xx) before trying to parse the body.
response.raise_for_status()
# Get the response JSON
data = response.json()
# GraphQL reports failures in an "errors" key; without this check a failed
# login would only show up as a KeyError/TypeError on the lookup below.
if "errors" in data:
    raise RuntimeError(f"Login GraphQL errors: {data['errors']}")
# Extract the session token
token = data["data"]["createSessionUsingUserKey"]["token"]

In [2]:
import os
from dotenv import load_dotenv
load_dotenv()
import requests
import pandas as pd

# Secret variables, read from the environment (populated from .env by load_dotenv)
LUMAR_SECRET = os.getenv("LUMAR_SECRET")
LUMAR_USER_KEY_ID = os.getenv("LUMAR_USER_KEY_ID")
LUMAR_CRAWL_ID = os.getenv("LUMAR_CRAWL_ID")

if not LUMAR_SECRET:
    raise RuntimeError("Missing LUMAR_SECRET. Put it in a .env file or environment")
# The login mutation declares $userKeyId as non-null, so fail early with a
# clear message instead of sending None and getting an opaque API error.
if not LUMAR_USER_KEY_ID:
    raise RuntimeError("Missing LUMAR_USER_KEY_ID. Put it in a .env file or environment")

# Exchange the user key for an API session token.
# NOTE(review): token lifetime is not visible here — confirm against the Lumar docs.
GRAPHQL_URL = "https://api.lumar.io/graphql"
LOGIN_MUTATION = """
mutation LoginWithUserKey($secret: String!, $userKeyId: ObjectID!) {
  createSessionUsingUserKey(input: { userKeyId: $userKeyId, secret: $secret }) {
    token
  }
}
"""

login_variables = {"secret": LUMAR_SECRET, "userKeyId": LUMAR_USER_KEY_ID}
# Without a timeout, requests waits indefinitely on a stalled connection.
resp = requests.post(
    GRAPHQL_URL,
    json={"query": LOGIN_MUTATION, "variables": login_variables},
    timeout=30,
)
# Surface HTTP-level failures (4xx/5xx) before trying to parse the body.
resp.raise_for_status()
data = resp.json()
# GraphQL reports failures in an "errors" key alongside (or instead of) "data".
if "errors" in data:
    raise RuntimeError(f"Login GraphQL errors: {data['errors']}")
token = data["data"]["createSessionUsingUserKey"]["token"]


class LumarAPIClient:
    """Minimal client for the Lumar GraphQL API.

    Fetches the rows of a crawl's unique-links report and converts them to a
    pandas DataFrame.
    """

    def __init__(self, api_token: str, timeout: float = 60):
        """Store the session token and build the request headers.

        Args:
            api_token: Session token sent in the ``x-auth-token`` header.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.api_token = api_token
        self.base_url = "https://api.lumar.io/graphql"
        self.timeout = timeout
        self.headers = {
            "Content-Type": "application/json",
            "apollographql-client-name": "python-client",
            "apollographql-client-version": "1.0.0",
            "x-auth-token": api_token
        }

    def _post(self, query: str, variables: dict) -> dict:
        """POST one GraphQL request and return the decoded JSON payload.

        Raises:
            requests.HTTPError: on a 4xx/5xx HTTP status.
            RuntimeError: if the payload contains a GraphQL ``errors`` key.
        """
        # Without a timeout, requests waits indefinitely on a stalled connection.
        r = requests.post(
            self.base_url,
            headers=self.headers,
            json={"query": query, "variables": variables},
            timeout=self.timeout,
        )
        r.raise_for_status()
        payload = r.json()
        if "errors" in payload:
            raise RuntimeError(f"GraphQL errors: {payload['errors']}")
        return payload

    def fetch_unique_internal_links(self, crawl_id, report_template_code="unique_internal_links") -> list:
        """Return raw nodes list from the API (no aggregation here).

        Follows the report's cursor pagination until ``hasNextPage`` is false.

        Args:
            crawl_id: Crawl identifier passed as the ``$crawlId`` variable.
            report_template_code: Report template to read.

        Returns:
            A list of node dicts with the keys requested in the query
            (``urlTo``, ``primaryUrlFrom``, ``anchorText``).

        Raises:
            requests.HTTPError: on a 4xx/5xx HTTP status.
            RuntimeError: on GraphQL errors, or if the API claims another page
                exists but does not supply a new cursor.
        """
        all_data = []
        after_cursor = None

        query = """
        query GetReportStatForCrawl($crawlId: ObjectID!, $reportTemplateCode: String!, $after: String) {
            getReportStat(input: {crawlId: $crawlId, reportTemplateCode: $reportTemplateCode}) {
                crawlUniqueLinks(after: $after, reportType: Basic) {
                    nodes {
                        urlTo
                        primaryUrlFrom
                        anchorText
                    }
                    pageInfo {
                        endCursor
                        hasNextPage
                    }
                }
            }
        }
        """

        while True:
            variables = {"crawlId": crawl_id, "reportTemplateCode": report_template_code}
            if after_cursor:
                variables["after"] = after_cursor

            payload = self._post(query, variables)
            crawl_data = payload["data"]["getReportStat"]["crawlUniqueLinks"]

            # `or` also covers keys that are present but null, which
            # .get(key, default) would pass through as None.
            all_data.extend(crawl_data.get("nodes") or [])
            page_info = crawl_data.get("pageInfo") or {}

            if not page_info.get("hasNextPage", False):
                break

            next_cursor = page_info.get("endCursor")
            # A missing or unchanged cursor would make the next request fetch
            # the same page again, looping forever; fail loudly instead.
            if not next_cursor or next_cursor == after_cursor:
                raise RuntimeError(
                    "Pagination stalled: API reported hasNextPage without a new endCursor"
                )
            after_cursor = next_cursor

        return all_data

    def to_dataframe(self, raw_data) -> pd.DataFrame:
        """
        Convert raw node list to a DataFrame with columns:
          - target_url
          - anchor_text
          - found_at
          - unique_anchor_text_count (per target_url across all rows)
        Rows are NOT aggregated: each occurrence remains a separate row.

        An empty or None ``raw_data`` yields an empty DataFrame with the same
        four columns.
        """
        output_columns = ["target_url", "anchor_text", "found_at", "unique_anchor_text_count"]
        if not raw_data:
            return pd.DataFrame(columns=output_columns)

        # Normalize column names returned by the API to the desired output columns
        df = pd.DataFrame(raw_data).rename(columns={
            "urlTo": "target_url",
            "primaryUrlFrom": "found_at",
            "anchorText": "anchor_text"
        })

        # Ensure columns exist even if the API omitted a field from every node
        for column in ("target_url", "anchor_text", "found_at"):
            if column not in df.columns:
                df[column] = None

        # Distinct anchor_text values per target_url. nunique() ignores missing
        # anchors, and rows whose target_url is missing get no group, hence
        # the fillna(0).
        unique_counts = df.groupby("target_url")["anchor_text"].nunique()
        df["unique_anchor_text_count"] = df["target_url"].map(unique_counts).fillna(0).astype(int)

        # Keep rows as-is (no aggregation). Return desired column order.
        return df[output_columns].copy()


def main():
    """Fetch the crawl's unique internal links, save them to CSV, and return them.

    Uses the module-level ``token`` and ``LUMAR_CRAWL_ID``.

    Returns:
        The DataFrame that was written to ``lumar_internal_links.csv``.

    Raises:
        RuntimeError: if ``LUMAR_CRAWL_ID`` is not set.
    """
    # The report query declares $crawlId as non-null; fail early with a clear
    # message rather than sending None and getting an opaque API error.
    if not LUMAR_CRAWL_ID:
        raise RuntimeError("Missing LUMAR_CRAWL_ID. Put it in a .env file or environment")

    client = LumarAPIClient(token)
    raw = client.fetch_unique_internal_links(LUMAR_CRAWL_ID)
    df = client.to_dataframe(raw)

    # Save to CSV for downstream use by the Streamlit app
    output_path = "lumar_internal_links.csv"
    df.to_csv(output_path, index=False)
    print(f"Saved {len(df)} rows to {output_path}")
    return df


if __name__ == "__main__":
    main()

Saved 32730 rows to lumar_internal_links.csv
