In [4]:
import os
from dotenv import load_dotenv
load_dotenv()   
# Obtain a Lumar session token (valid for 30 days)
import requests

# Credentials and crawl id, read from the environment (populated by load_dotenv()).
LUMAR_SECRET = os.getenv("LUMAR_SECRET")
LUMAR_USER_KEY_ID = os.getenv("LUMAR_USER_KEY_ID")
LUMAR_CRAWL_ID = os.getenv("LUMAR_CRAWL_ID")

# Both values are required ("!") variables of the login mutation below, so fail
# early with a clear message instead of sending a request that cannot succeed.
if not LUMAR_SECRET:
    raise RuntimeError("Missing LUMAR_SECRET. Put it in a .env file or environment")
if not LUMAR_USER_KEY_ID:
    raise RuntimeError("Missing LUMAR_USER_KEY_ID. Put it in a .env file or environment")

# GraphQL endpoint for Lumar API
url = 'https://api.lumar.io/graphql'
# GraphQL mutation that exchanges a user key for a session token
query = """
mutation LoginWithUserKey($secret: String!, $userKeyId: ObjectID!) {
  createSessionUsingUserKey(input: { userKeyId: $userKeyId, secret: $secret }) {
    token
  }
}
"""
# Variables for the mutation
variables = {
    "secret": LUMAR_SECRET,
    "userKeyId": LUMAR_USER_KEY_ID
}
# Send the request; the timeout keeps the cell from hanging on a stalled connection
response = requests.post(url, json={'query': query, "variables": variables}, timeout=30)
# Surface HTTP-level failures (4xx/5xx) before trying to parse the body
response.raise_for_status()
# Get the response JSON
data = response.json()
# GraphQL reports failures in an "errors" list, often alongside a null "data"
if data.get('errors'):
    raise RuntimeError(f"GraphQL errors during login: {data['errors']}")
# Extract the session token, tolerating a null/missing payload
session_payload = (data.get('data') or {}).get('createSessionUsingUserKey') or {}
token = session_payload.get('token')
if not token:
    raise RuntimeError("Login response did not contain a session token")

In [None]:
import requests
import pandas as pd

class LumarAPIClient:
    """
    Small client for the Lumar GraphQL API.

    Fetches the unique-internal-links report of a crawl and aggregates it
    into one row per target URL.
    """

    def __init__(self, api_token, timeout=30):
        """
        Args:
            api_token: session token, sent in the ``x-auth-token`` header.
            timeout: seconds to wait for each HTTP request (passed to
                ``requests.post``); prevents a stalled connection from
                blocking pagination forever.
        """
        self.api_token = api_token
        self.base_url = "https://api.lumar.io/graphql"
        self.timeout = timeout
        self.headers = {
            "Content-Type": "application/json",
            "apollographql-client-name": "python-client",
            "apollographql-client-version": "1.0.0",
            "x-auth-token": api_token
        }

    def fetch_unique_internal_links(self, crawl_id, report_template_code="unique_internal_links"):
        """
        Fetch all unique internal links data with pagination.

        Args:
            crawl_id: id of the crawl to read the report from.
            report_template_code: report template to query.

        Returns:
            A list of node dicts with the keys requested in the query
            (``urlTo``, ``primaryUrlFrom``, ``anchorText``, ``instanceCount``).

        Raises:
            RuntimeError: on a non-200 response, a GraphQL ``errors`` payload,
                a response without report data, or a pagination cursor that
                does not advance.
        """
        all_data = []
        after_cursor = None
        has_next_page = True

        query = """
        query GetReportStatForCrawl(
            $crawlId: ObjectID!
            $reportTemplateCode: String!
            $after: String
        ) {
            getReportStat(
                input: {crawlId: $crawlId, reportTemplateCode: $reportTemplateCode}
            ) {
                crawlUniqueLinks(after: $after, reportType: Basic) {
                    nodes {
                        urlTo
                        primaryUrlFrom
                        anchorText
                        instanceCount
                    }
                    pageInfo {
                        endCursor
                        hasNextPage
                    }
                }
            }
        }
        """

        while has_next_page:
            variables = {
                "crawlId": crawl_id,
                "reportTemplateCode": report_template_code
            }
            if after_cursor:
                variables["after"] = after_cursor

            response = requests.post(
                self.base_url,
                headers=self.headers,
                json={"query": query, "variables": variables},
                timeout=self.timeout
            )

            if response.status_code != 200:
                raise RuntimeError(f"API request failed with status {response.status_code}: {response.text}")

            data = response.json()
            if data.get('errors'):
                raise RuntimeError(f"GraphQL errors: {data['errors']}")

            # Either level may come back null (e.g. unknown crawl or report);
            # report that explicitly instead of failing on a None subscript.
            report_stat = (data.get('data') or {}).get('getReportStat')
            crawl_data = (report_stat or {}).get('crawlUniqueLinks')
            if not crawl_data:
                raise RuntimeError(
                    f"No report data returned for crawl {crawl_id!r} "
                    f"and report template {report_template_code!r}"
                )

            nodes = crawl_data.get('nodes') or []
            all_data.extend(nodes)

            page_info = crawl_data.get('pageInfo') or {}
            has_next_page = bool(page_info.get('hasNextPage'))
            next_cursor = page_info.get('endCursor')

            print(f"Fetched {len(nodes)} records. Total so far: {len(all_data)}")

            # A missing or unchanged cursor would request the same page again
            # on every iteration, so stop instead of looping forever.
            if has_next_page and (not next_cursor or next_cursor == after_cursor):
                raise RuntimeError("Pagination cursor did not advance; aborting to avoid an endless loop")
            after_cursor = next_cursor

        print(f"Completed! Total records fetched: {len(all_data)}")
        return all_data

    def to_dataframe(self, raw_data):
        """
        Convert raw API data into a DataFrame with aggregated anchor text counts.

        Returns a DataFrame with columns:
          - target_url
          - anchor_texts: aggregated like 'anchor1 (3) | anchor2 (1)'; each
            anchor appears once per target_url with its instanceCount summed
            over all source pages (a missing anchor is rendered as '(n)')
          - unique_anchor_text_count: number of distinct anchorText values
          - total_inlinks: sum of instanceCount for the target_url
          - found_at: semicolon-separated unique primaryUrlFrom values

        Rows whose target URL is missing are not included in the result.
        """
        columns = ['target_url', 'anchor_texts', 'unique_anchor_text_count', 'total_inlinks', 'found_at']
        # handle empty input
        if not raw_data:
            return pd.DataFrame(columns=columns)

        df = pd.DataFrame(raw_data).rename(columns={"urlTo": "target_url", "primaryUrlFrom": "found_at"})
        # ensure instanceCount exists and is integer
        if 'instanceCount' not in df.columns:
            df['instanceCount'] = 1
        df['instanceCount'] = pd.to_numeric(df['instanceCount'], errors='coerce').fillna(0).astype(int)

        # Sum counts per (target_url, anchor) only. Including the source page
        # in this key would list the same anchor once per source page rather
        # than once with its total. Missing anchors are keyed as '' so they
        # collapse into a single '(n)' entry.
        keyed = df.assign(_anchor=df['anchorText'].fillna('').astype(str))
        per_anchor = keyed.groupby(['target_url', '_anchor'], as_index=False)['instanceCount'].sum()
        per_anchor['anchor_texts'] = (
            per_anchor['_anchor'] + ' (' + per_anchor['instanceCount'].astype(str) + ')'
        ).str.strip()
        anchor_strings = per_anchor.groupby('target_url')['anchor_texts'].agg(' | '.join)

        # Compute unique anchor text counts, total inlinks and source pages per target_url
        summary = df.groupby('target_url').agg(
            unique_anchor_text_count=('anchorText', 'nunique'),
            total_inlinks=('instanceCount', 'sum'),
            found_at=('found_at', lambda s: '; '.join(sorted({str(x) for x in s if pd.notna(x)}))),
        )

        result = summary.join(anchor_strings).reset_index()
        # Ensure column order
        return result[columns]

# Usage example
def main():
    """
    Example run: fetch the unique internal links of the configured crawl,
    aggregate them into a DataFrame and write it to lumar_internal_links.csv.

    Uses the module-level session token and crawl id.
    Returns the DataFrame, or None if any step raised (the error is printed).
    """
    session_token, crawl_id = token, LUMAR_CRAWL_ID
    api = LumarAPIClient(session_token)

    try:
        print("Fetching data...")
        links = api.fetch_unique_internal_links(crawl_id)

        print("Converting to DataFrame...")
        df = api.to_dataframe(links)

        print(f"\nDataFrame created with {len(df)} rows")
        print(df.head())

        # Persist the aggregated table next to the notebook/script
        df.to_csv("lumar_internal_links.csv", index=False)
        print("\nSaved to lumar_internal_links.csv")
    except Exception as e:
        # Top-level boundary: report the failure and signal it with None
        print(f"Error: {e}")
        return None

    return df

# Run the example only when this code is executed as the main module, not when
# it is imported; the result is kept in the module-level `df` (None on failure).
if __name__ == "__main__":
    df = main()
