# Fetching Companies and descriptions with Crunchbase

We have a list of the top 100 highest-evaluated Deep Tech startups in Europe. We can now use the Crunchbase API to fetch the information that is relevant for linking.

In [5]:
# Load the company list and pull out the Crunchbase permalinks.
# header=None makes pandas label the columns 0, 1, 2, ..., so the second
# column of the file is addressed by the integer label 1.
import pandas as pd

df = pd.read_csv(
    '../data/company-data/european_deep_tech_list.csv',
    header=None,
)
permalinks = df.loc[:, 1].tolist()



In [6]:
"""
Fetch details for a list of Crunchbase company permalinks.

Prerequisites
-------------
pip install requests pandas python-dotenv

Environment
-----------
Create a .env file (or export in your shell) with:
CRUNCHBASE_API_KEY=your_real_api_key_here
"""

from __future__ import annotations
import os
import time
import requests
import pandas as pd
from typing import Iterable, Dict, Any, List
from dotenv import load_dotenv

# ------------------------------------------------------------------
# 1.  Configuration
# ------------------------------------------------------------------
load_dotenv()                                # loads variables from a local .env file, if one exists
# Accept either environment variable name for the Crunchbase key:
# CRUNCHBASE_API_KEY takes precedence, CRUNCHBASE_USER_KEY is the fallback.
# API_KEY stays None when neither is set.
API_KEY        = os.getenv("CRUNCHBASE_API_KEY") or os.getenv("CRUNCHBASE_USER_KEY")
BASE_URL       = "https://api.crunchbase.com/api/v4/entities/organizations"
# Fields you want back (keep the list short to stay within payload limits)
DEFAULT_FIELDS = [
    "identifier",            # permalink, uuid, name, etc.
    "short_description",
    "profile_image_url",
    "homepage_url",
    "cb_url",
    "rank_org",
    "country_code",
    "total_funding_usd",
    "founded_on",
    "last_funding_type",
    "last_funding_on",
]

# polite crawl settings (v4 hard-limit is 50 req/min = 1.2 s / call)
REQUEST_INTERVAL = 1.3       # seconds to wait between consecutive requests

# ------------------------------------------------------------------
# 2.  Helper to call the API
# ------------------------------------------------------------------
def _get_organization(permalink: str,
                      field_ids: Iterable[str] | None = None,
                      api_key: str | None = None) -> Dict[str, Any] | None:
    """
    Fetch one organization entity from the Crunchbase v4 API.

    Parameters
    ----------
    permalink
        Crunchbase permalink of the organization; it is percent-encoded and
        appended to BASE_URL as the last path segment.
    field_ids
        Field ids to request. DEFAULT_FIELDS is used when this is None or
        empty.
    api_key
        Key to authenticate with. Falls back to the module-level API_KEY.

    Returns
    -------
    The decoded JSON response on HTTP 200. None on any other status code or
    when the request itself fails; a "[warn]" / "[error]" line is printed in
    those cases instead of raising.
    """
    # Local import so this helper does not depend on a file-level import.
    from urllib.parse import quote

    api_key   = api_key or API_KEY
    field_str = ",".join(field_ids or DEFAULT_FIELDS)
    # Encode the permalink so characters such as "/", "?" or "#" cannot
    # change the request path.
    slug      = quote(str(permalink), safe="")
    url       = f"{BASE_URL}/{slug}"
    params    = {"field_ids": field_str}
    # Send the key as a header rather than a query parameter: requests puts
    # the full URL into its exception messages, which are printed below, so a
    # key in the query string would end up in the notebook output.
    headers   = {"X-cb-user-key": api_key} if api_key else {}

    try:
        r = requests.get(url, params=params, headers=headers, timeout=15)
        if r.status_code == 200:
            return r.json()          # full v4 entity response
        elif r.status_code == 404:
            print(f"[warn] {permalink}: not found")
        else:
            print(f"[warn] {permalink}: HTTP {r.status_code} – {r.text}")
    except requests.RequestException as exc:
        print(f"[error] {permalink}: {exc}")

    return None

# ------------------------------------------------------------------
# 3.  Public wrapper for a list of permalinks
# ------------------------------------------------------------------
def fetch_companies(permalinks: Iterable[str],
                    field_ids: Iterable[str] | None = None,
                    sleep: float = REQUEST_INTERVAL,
                    api_key: str | None = None) -> pd.DataFrame:
    """
    Fetch many companies and return a DataFrame of flattened data.

    Parameters
    ----------
    permalinks
        Crunchbase permalinks to look up. Any iterable is accepted,
        including generators.
    field_ids
        Field ids forwarded to every request; None lets the helper use its
        default list.
    sleep
        Seconds to pause between consecutive requests. No pause is made
        after the last request, or when the value is not positive.
    api_key
        Optional key forwarded to every request.

    Returns
    -------
    One row per successfully fetched company, built by flattening each
    response's "properties" object with pandas.json_normalize. Companies
    whose request failed, or whose response had no "properties", are
    skipped. The frame is empty when nothing could be fetched.
    """
    # Materialise the inputs once: len() is needed for the progress output,
    # and a one-shot iterator of field ids must survive more than one request.
    permalinks = list(permalinks)
    if field_ids is not None:
        field_ids = list(field_ids)
    total = len(permalinks)

    results: List[Dict[str, Any]] = []
    for i, pl in enumerate(permalinks, 1):
        data = _get_organization(pl, field_ids, api_key)
        # The fields you asked for are in data["properties"]
        properties = data.get("properties") if isinstance(data, dict) else None
        if properties:
            results.append(properties)
            print(f"✓ {i}/{total} {pl}")
        else:
            # Make failures visible in the progress log instead of ticking them.
            print(f"✗ {i}/{total} {pl}")
        if i < total and sleep > 0:
            time.sleep(sleep)        # be polite / avoid 429s
    return pd.json_normalize(results)

# ------------------------------------------------------------------
# 4.  Usage example
# ------------------------------------------------------------------
if __name__ == "__main__":
    # Fetch every permalink and report how many rows came back.
    df = fetch_companies(permalinks)
    n_fetched = len(df)
    print("\nFetched", n_fetched, "companies")
    # Transposed head: one column per company, one row per field.
    preview = df.head().T
    print(preview)
    

# ------------------------------------------------------------------
# 5.  Extract and save company data to CSV
# ------------------------------------------------------------------
# Keep only the identifier.value and short_description columns, renamed to
# company_name and description in the same step.
company_data = df[['identifier.value', 'short_description']].rename(
    columns={
        'identifier.value': 'company_name',
        'short_description': 'description',
    }
)

# Write the two-column table to disk without the index column.
output_file = '../data/company-data/companies_with_descriptions.csv'
company_data.to_csv(output_file, index=False)

n_saved = len(company_data)
print(f"Saved {n_saved} companies to {output_file}")
print("\nFirst few entries:")
print(company_data.head())



✓ 1/99 anybotics
✓ 2/99 ably
✓ 3/99 accelercomm
✓ 4/99 addionics-london-united-kingdom
✓ 5/99 aerospacelab
✓ 6/99 aleph-alpha
✓ 7/99 alice-bob
✓ 8/99 aqemia
✓ 9/99 automata
✓ 10/99 axelera-ai
✓ 11/99 blickfeld
✓ 12/99 charm-therapeutics
✓ 13/99 core-power-holdings-pte-ltd
✓ 14/99 cado-security
✓ 15/99 causalens
✓ 16/99 contunity
✓ 17/99 cervest
✓ 18/99 cloudnc
✓ 19/99 cognigy
✓ 20/99 cybersmart-2
✓ 21/99 descartes-underwriting
✓ 22/99 destinus
✓ 23/99 diabeloop
✓ 24/99 diogenx
✓ 25/99 e-space
✓ 26/99 eleqtron-3d70
✓ 27/99 emergence-therapeutics
✓ 28/99 envisics
✓ 29/99 evox-therapeutics
✓ 30/99 exotrail
✓ 31/99 fairbrics
✓ 32/99 fairmat-tech
✓ 33/99 flockcover
✓ 34/99 ganymed-robotics
✓ 35/99 geopura
✓ 36/99 gourmey
✓ 37/99 h2-green-steel
✓ 38/99 halodi-robotics
✓ 39/99 harbr
✓ 40/99 heart-aerospace
✓ 41/99 helsing
✓ 42/99 hystar-706a
✓ 43/99 iceye
✓ 44/99 iqm
✓ 45/99 inato
✓ 46/99 isar-aerospace
✓ 47/99 kaia-health
✓ 48/99 kili-technology
✓ 49/99 lifebit-ai
✓ 50/99 mablink-bioscience
