Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
d815cef
refactor: use kuzu extension instead of kz
DataLabTechTV Jul 18, 2025
84c73a9
refactor: replace os.path ops with Path ops
DataLabTechTV Jul 18, 2025
ba6de1a
refactor: use ref instead of hardcoded FQN
DataLabTechTV Jul 18, 2025
4035f63
feat: support for loading Parquet into DuckLake from Python
DataLabTechTV Jul 18, 2025
25844f1
fix: any positive ESI is now considered competition, and is separate …
DataLabTechTV Jul 18, 2025
77325bd
feat: edge direction now based on common exports, from highest to low…
DataLabTechTV Jul 18, 2025
ff1f926
feat: add graph analytics module, starting with a CON score
DataLabTechTV Jul 18, 2025
d4d7d9d
refactor: different score reset strategy
DataLabTechTV Jul 18, 2025
8c94f6e
feat: add CLI support for computing the CON score
DataLabTechTV Jul 18, 2025
e29c08f
chore(deps): add jupyterlab, matplotlib, and networkx for graph data …
DataLabTechTV Jul 29, 2025
65defb1
refactor: remove unused import
DataLabTechTV Jul 29, 2025
a36b6c9
feat: networkx graph plot helper to use with notebooks
DataLabTechTV Jul 29, 2025
1d96e63
feat: setup notebook for graph data science
DataLabTechTV Jul 29, 2025
3210fa5
feat: create a basic graph theme matching DLT
DataLabTechTV Jul 29, 2025
ed56184
feat: add edge arrows and node colors per label
DataLabTechTV Jul 29, 2025
02dc859
feat: add graph transparency and improve labels
DataLabTechTV Jul 29, 2025
36cbc33
chore(deps): add adjustText to optionally fix rendering of overlappin…
DataLabTechTV Jul 29, 2025
8c0b6fb
feat: set label w/ prop per node type and render label wo/ overlapping
DataLabTechTV Jul 29, 2025
986a2d6
feat: dominating and weaker economy individual analysis
DataLabTechTV Jul 29, 2025
d8013c4
refactor: no longer setting flags for dominating and weaker
DataLabTechTV Jul 29, 2025
76ef5d4
chore(deps): remove unneeded adjustText and add scipy back as a requi…
DataLabTechTV Jul 30, 2025
62d5ef1
chore(deps): add geopandas to plot maps
DataLabTechTV Jul 30, 2025
266dfca
feat: improve graph plotting and add map plotting
DataLabTechTV Jul 30, 2025
62e54fd
feat: competition network analysis, including community and weak compo…
DataLabTechTV Jul 30, 2025
4b0c792
feat: script to easily convert Jupyter Notebooks to markdown
DataLabTechTV Aug 1, 2025
80d5ef1
feat: trade alignment analysis
DataLabTechTV Aug 1, 2025
da6e848
feat: trade alignment analysis (cont)
DataLabTechTV Aug 1, 2025
afceea8
feat: compare communities and components, study economical pressure
DataLabTechTV Aug 1, 2025
e4f5b62
fix: log file relative path to cwd failed when not directly contained…
DataLabTechTV Aug 5, 2025
9190d2c
feat: add scale to arrow placement, add optional visualization weight
DataLabTechTV Aug 5, 2025
6a3dcb1
feat: revisited the whole notebook, restructuring and adding depth whe…
DataLabTechTV Aug 5, 2025
454d0dd
chore: commit notebook generated during video recording
DataLabTechTV Aug 5, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ ANALYTICS_MART_DB=marts/analytics.sqlite
# KùzuDB configurations
# =====================

MUSIC_TASTE_GRAPH_DB=graphs/music_taste.kz
ECON_COMP_GRAPH_DB=graphs/econ_comp.kz
MUSIC_TASTE_GRAPH_DB=graphs/music_taste.kuzu
ECON_COMP_GRAPH_DB=graphs/econ_comp.kuzu

# Ollama configurations
# =====================
Expand Down
3 changes: 2 additions & 1 deletion dlctl/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from shared.storage import Storage, StoragePrefix

# Log file location: "logs/datalab.log" under the parent of this module's
# directory (parents[1] of the resolved module path).
LOG_FILE = Path(__file__).resolve().parents[1] / "logs/datalab.log"
# Display form of LOG_FILE relative to the current working directory.
# os.path.relpath can walk upwards with ".." segments, whereas
# Path.relative_to raises ValueError when LOG_FILE is not located under cwd.
LOG_FILE_RELATIVE = os.path.relpath(LOG_FILE.resolve(), start=Path.cwd())


@click.group(
Expand All @@ -34,7 +35,7 @@
"logfile_enabled",
is_flag=True,
default=True,
help=f"Disable file logging ({LOG_FILE.relative_to(Path.cwd())})",
help=f"Disable file logging ({LOG_FILE_RELATIVE})",
)
@click.option(
"--version",
Expand Down
71 changes: 71 additions & 0 deletions graph/analytics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
from pathlib import Path

import kuzu
from loguru import logger as log

from shared.lakehouse import Lakehouse
from shared.settings import LOCAL_DIR, env


class GraphAnalytics:
    """Graph analytics computed in place on an existing KùzuDB graph.

    The database location is read from the ``<SCHEMA>_GRAPH_DB`` setting and
    resolved relative to ``LOCAL_DIR``.
    """

    def __init__(self, schema: str):
        """Open a connection to the graph database configured for *schema*.

        Args:
            schema: Schema name; it is upper-cased to build the
                ``<SCHEMA>_GRAPH_DB`` setting name.

        Raises:
            FileNotFoundError: If the resolved database path does not exist.
        """
        dbname = env.str(f"{schema.upper()}_GRAPH_DB")
        db_path = Path(LOCAL_DIR) / dbname

        # Analytics only make sense on an already-built graph, so refuse to
        # continue (rather than opening an empty database) when it is missing.
        if not db_path.exists():
            raise FileNotFoundError(f"db not found: {db_path}")

        db = kuzu.Database(db_path)
        self.conn = kuzu.Connection(db)

        self.lh = Lakehouse()

    @staticmethod
    def _quote_identifier(name: str) -> str:
        """Return *name* wrapped in backticks for interpolation into Cypher.

        Args:
            name: Table, relationship, or property name.

        Returns:
            The backtick-escaped identifier.

        Raises:
            ValueError: If *name* is empty or contains a backtick, which would
                terminate the escaped identifier and let the remainder be
                parsed as Cypher.
        """
        if not name or "`" in name:
            raise ValueError(f"invalid identifier: {name!r}")

        return f"`{name}`"

    def compute_con_scores(
        self,
        node_label: str,
        rel_label: str,
        column_name: str = "con_score",
    ):
        """Compute the common out-neighbors (CON) score and store it per node.

        For every ordered pair of distinct nodes ``(a, b)`` that both point to
        a common node ``c`` through ``rel_label``, the smaller of the two
        relationships' ``esi`` values is taken; these minima are summed over
        all common out-neighbors and then over all ``b`` to give the score of
        ``a``. Nodes without any such pair keep the reset value of ``0.0``.

        Args:
            node_label: Node table whose nodes are scored.
            rel_label: Relationship table connecting ``node_label`` nodes; its
                ``esi`` property is used as the weight.
            column_name: Node property that receives the score. It is added
                to the node table as ``DOUBLE`` if it does not exist yet.

        Raises:
            ValueError: If any of the names is empty or contains a backtick.
        """
        # Validate and quote everything up front so that a bad name fails
        # before any statement has modified the graph, and so that all three
        # statements escape identifiers the same way.
        node = self._quote_identifier(node_label)
        rel = self._quote_identifier(rel_label)
        column = self._quote_identifier(column_name)

        log.info(
            "Computing CON scores for {} nodes via {} rels, storing to {} property",
            node_label,
            rel_label,
            column_name,
        )

        log.debug("Adding {} to {}, if not exists", column_name, node_label)

        self.conn.execute(
            f"""
            ALTER TABLE {node}
            ADD IF NOT EXISTS {column} DOUBLE
            """
        )

        log.debug("Resetting {} on {}", column_name, node_label)

        # Reset first: the scoring query below only touches nodes that share
        # at least one out-neighbor, so stale values would otherwise survive.
        self.conn.execute(
            f"""
            MATCH (c:{node})
            SET c.{column} = 0.0
            """
        )

        log.debug("Computing CON scores")

        self.conn.execute(
            f"""
            MATCH (a:{node})-[ac:{rel}]->(c:{node})
            MATCH (b:{node})-[bc:{rel}]->(c:{node})
            WHERE a <> b
            WITH a, b,
                CASE
                    WHEN ac.esi < bc.esi
                        THEN ac.esi
                    ELSE bc.esi
                END AS min_esi
            WITH a, b, sum(min_esi) AS con_pair
            WITH a, sum(con_pair) AS con_score
            SET a.{column} = con_score
            """
        )
10 changes: 10 additions & 0 deletions graph/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import click
from loguru import logger as log

from graph.analytics import GraphAnalytics
from graph.embedding import NodeEmbedding, NodeEmbeddingAlgo
from graph.ops import KuzuOps
from graph.rag import ContextAssemblerException, GraphRAG, GraphRetrievalException
Expand Down Expand Up @@ -104,6 +105,15 @@ def embeddings(schema: str, dimension: int, batch_size: int, epochs: int, algo:
log.exception(e)


@compute.command(help="Compute common out-neighbors (CON) score")
@click.argument("schema", type=click.STRING)
@click.argument("node_label", type=click.STRING)
@click.argument("rel_label", type=click.STRING)
def con_score(schema: str, node_label: str, rel_label: str):
    """Compute CON scores for ``node_label`` nodes linked by ``rel_label``.

    Opens the graph database configured for ``schema`` and stores the score
    under the default property name chosen by ``compute_con_scores``.
    """
    analytics = GraphAnalytics(schema)
    analytics.compute_con_scores(node_label=node_label, rel_label=rel_label)


@graph.command(help="Reindex embedding property")
@click.argument("schema", type=click.STRING)
def reindex(schema: str):
Expand Down
12 changes: 6 additions & 6 deletions graph/ops.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import shutil
import tempfile
from enum import Enum
from pathlib import Path
from string import Template
from typing import Any, Optional

Expand All @@ -23,15 +23,15 @@ class KuzuTableType(Enum):
class KuzuOps:
def __init__(self, schema: str, overwrite: bool = False):
dbname = env.str(f"{schema.upper()}_GRAPH_DB")
db_path = os.path.join(LOCAL_DIR, dbname)
db_path = Path(LOCAL_DIR) / dbname

if os.path.exists(db_path):
if db_path.exists():
if overwrite:
log.warning(f"Overwriting database: {db_path}")
if os.path.isdir(db_path):
if db_path.is_dir():
shutil.rmtree(db_path)
elif os.path.isfile(db_path):
os.unlink(db_path)
elif db_path.is_file():
db_path.unlink()

db = kuzu.Database(db_path)
self.conn = kuzu.Connection(db)
Expand Down
Loading