# Fetching Companies and descriptions with Crunchbase

We have a list of the top 100 highest-evaluated Deep Tech startups in Europe. We can now use the Crunchbase API to get information relevant for linking.

In [5]:
# Load the company list and pull out the Crunchbase permalinks.
# The CSV is read without a header row, so columns are labelled 0, 1, ...;
# label 1 (the second column) is taken as the list of permalinks.
import pandas as pd

df = pd.read_csv(
    '../data/company-data/european_deep_tech_list.csv',
    header=None,
)
permalinks = df.loc[:, 1].to_list()



In [2]:
from __future__ import annotations

import json
import os
import time
from typing import Any, Dict, Iterable, List
from urllib.parse import quote

import pandas as pd
import requests
from dotenv import load_dotenv

# ------------------------------------------------------------------
# 1.  Configuration
# ------------------------------------------------------------------
# Populate the environment from a local .env file, then read the key.
load_dotenv()
API_KEY = os.environ.get("CRUNCHBASE_API_KEY")

# URL prefix for single-organization lookups; the permalink is appended.
BASE_URL = "https://api.crunchbase.com/api/v4/entities/organizations"

# Field ids requested when a caller does not pass its own selection.
DEFAULT_FIELDS = [
    "identifier",  # permalink, uuid, name, etc.
    "short_description",
    "description",
    "website",
    "company_type",
    "location_identifiers",
    "funding_total",
    "founded_on",
    "funding_stage",
    "category_groups",
    "categories",
]

# Pause between consecutive calls, in seconds. Chosen to stay just under
# the v4 limit noted by the author (50 req/min = 1.2 s / call).
REQUEST_INTERVAL = 1.3

# ------------------------------------------------------------------
# 2.  Helper to call the API
# ------------------------------------------------------------------
def _get_organization(permalink: str,
                      field_ids: Iterable[str] | None = None,
                      api_key: str | None = None) -> Dict[str, Any] | None:
    """
    Fetch one organization entity from the Crunchbase API.

    Parameters
    ----------
    permalink : str
        Identifier appended to ``BASE_URL`` to address the organization.
    field_ids : iterable of str, optional
        Field ids to request; falls back to ``DEFAULT_FIELDS`` when empty
        or ``None``.
    api_key : str, optional
        Crunchbase user key; falls back to the module-level ``API_KEY``.

    Returns
    -------
    dict or None
        The ``properties`` object of the response on HTTP 200 (an empty
        dict if the key is absent). ``None`` on 404, any other HTTP status,
        a network error, an undecodable body, or a missing API key. Every
        failure is reported with a printed ``[warn]``/``[error]`` line
        instead of raising, so a batch crawl keeps going.
    """
    key = api_key or API_KEY
    if not key:
        # Without a key the request can only fail; say so explicitly.
        print(f"[error] {permalink}: no API key (set CRUNCHBASE_API_KEY or pass api_key)")
        return None

    field_str = ",".join(field_ids or DEFAULT_FIELDS)
    # Escape the permalink so characters such as '/', '?' or spaces cannot
    # change the path or query of the request URL.
    url = f"{BASE_URL}/{quote(str(permalink), safe='')}"
    # The key travels in a header rather than the query string: requests
    # embeds the full URL in its exception messages, which are printed
    # below and would otherwise leak the key into the notebook output.
    headers = {"X-cb-user-key": key}
    params = {"field_ids": field_str}

    try:
        r = requests.get(url, params=params, headers=headers, timeout=15)
        if r.status_code == 200:
            return r.json().get("properties", {})
        elif r.status_code == 404:
            print(f"[warn] {permalink}: not found")
        else:
            print(f"[warn] {permalink}: HTTP {r.status_code} – {r.text}")
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a non-JSON 200 body on requests versions whose
        # JSON decode error does not derive from RequestException.
        print(f"[error] {permalink}: {exc}")
    return None

# ------------------------------------------------------------------
# 3.  Public wrapper for a list of permalinks
# ------------------------------------------------------------------
def fetch_companies(permalinks: Iterable[str],
                    field_ids: Iterable[str] | None = None,
                    sleep: float = REQUEST_INTERVAL,
                    api_key: str | None = None,
                    json_output: str | None = '../data/company-data/companies_raw.json') -> pd.DataFrame:
    """
    Fetch many companies, save the raw JSON array, and return a DataFrame.

    Parameters
    ----------
    permalinks : iterable of str
        Organization permalinks to fetch. Any iterable is accepted
        (including generators); it is materialized once up front.
    field_ids : iterable of str, optional
        Field ids forwarded to ``_get_organization`` for every company.
    sleep : float
        Seconds to wait between consecutive requests.
    api_key : str, optional
        Key forwarded to ``_get_organization``.
    json_output : str or None
        Path the raw list of property dicts is written to as JSON.
        Pass ``None`` to skip writing the file.

    Returns
    -------
    pandas.DataFrame
        ``pd.json_normalize`` of the successfully fetched property dicts;
        companies whose fetch returned ``None`` are omitted.
    """
    # Materialize both iterables: the total is needed for progress output,
    # and a one-shot iterator of field ids would otherwise be exhausted
    # after the first company.
    permalink_list = list(permalinks)
    fields = list(field_ids) if field_ids is not None else None
    total = len(permalink_list)

    results: List[Dict[str, Any]] = []
    for i, pl in enumerate(permalink_list, 1):
        props = _get_organization(pl, fields, api_key)
        if props is not None:
            results.append(props)
            print(f"✓ {i}/{total} fetched {pl}")
        else:
            # Report failures as failures instead of a misleading check mark.
            print(f"✗ {i}/{total} failed {pl}")
        if i < total:
            # Throttle between calls only; no need to wait after the last one.
            time.sleep(sleep)

    # Save raw JSON array of company properties
    if json_output is not None:
        out_dir = os.path.dirname(json_output)
        if out_dir:  # a bare filename has no directory to create
            os.makedirs(out_dir, exist_ok=True)
        with open(json_output, 'w', encoding='utf-8') as jf:
            json.dump(results, jf, indent=2, ensure_ascii=False)
        print(f"Saved raw JSON data for {len(results)} companies to {json_output}")

    # Flatten nested dicts into dotted column names
    return pd.json_normalize(results)

# ------------------------------------------------------------------
# 4.  Usage example
# ------------------------------------------------------------------
if __name__ == "__main__":
    # Read permalinks from the headerless CSV (column label 1 = second column).
    # Missing cells are dropped first; otherwise astype(str) would turn them
    # into the literal permalink "nan" and waste a request on it.
    source_csv = '../data/company-data/european_deep_tech_list.csv'
    df_input = pd.read_csv(source_csv, header=None)
    permalinks = df_input[1].dropna().astype(str).tolist()

    # Fetch data
    df = fetch_companies(permalinks)
    print(f"Fetched and saved data for {len(df)} companies.")

    # ------------------------------------------------------------------
    # 5.  Extract and save to CSV
    # ------------------------------------------------------------------
    summary_csv = '../data/company-data/companies_summary.csv'
    os.makedirs(os.path.dirname(summary_csv), exist_ok=True)

    # reindex (rather than df[[...]]) keeps the export from raising KeyError
    # after the whole crawl when a column is absent, e.g. when nothing was
    # fetched or no company returned a description; such columns are
    # written out empty instead.
    summary_df = (
        df.reindex(columns=['identifier.value', 'description', 'short_description'])
          .rename(columns={'identifier.value': 'value'})
    )
    summary_df.to_csv(summary_csv, index=False, encoding='utf-8')
    print(f"Saved summary CSV for {len(summary_df)} companies to {summary_csv}")


✓ 1/99 fetched anybotics
✓ 2/99 fetched ably
✓ 3/99 fetched accelercomm
✓ 4/99 fetched addionics-london-united-kingdom
✓ 5/99 fetched aerospacelab
✓ 6/99 fetched aleph-alpha
✓ 7/99 fetched alice-bob
✓ 8/99 fetched aqemia
✓ 9/99 fetched automata
✓ 10/99 fetched axelera-ai
✓ 11/99 fetched blickfeld
✓ 12/99 fetched charm-therapeutics
✓ 13/99 fetched core-power-holdings-pte-ltd
✓ 14/99 fetched cado-security
✓ 15/99 fetched causalens
✓ 16/99 fetched contunity
✓ 17/99 fetched cervest
✓ 18/99 fetched cloudnc
✓ 19/99 fetched cognigy
✓ 20/99 fetched cybersmart-2
✓ 21/99 fetched descartes-underwriting
✓ 22/99 fetched destinus
✓ 23/99 fetched diabeloop
✓ 24/99 fetched diogenx
✓ 25/99 fetched e-space
✓ 26/99 fetched eleqtron-3d70
✓ 27/99 fetched emergence-therapeutics
✓ 28/99 fetched envisics
✓ 29/99 fetched evox-therapeutics
✓ 30/99 fetched exotrail
✓ 31/99 fetched fairbrics
✓ 32/99 fetched fairmat-tech
✓ 33/99 fetched flockcover
✓ 34/99 fetched ganymed-robotics
✓ 35/99 fetched geopura
✓ 36/99 fe