In [1]:
import psycopg2 as pg

In [2]:
# Open one connection to the `sec` database and keep a single cursor; the
# name-lookup code in later cells runs its queries through `curr`.
conn = pg.connect(
    dbname='sec',
    user='secapp',
    host='r_363-postgres-1',
)
curr = conn.cursor()

In [3]:
class Node:
    """One code in the path tree, together with its direct children.

    `children` holds the child nodes in insertion order; `child_codes` mirrors
    their codes so that membership can be tested without scanning the list.
    """

    # The tree-building cell creates the root as Node(None), so `code` is not
    # always a string in practice.
    code: str
    children: list["Node"]
    child_codes: set[str]

    def __init__(self, code) -> None:
        self.code, self.children, self.child_codes = code, [], set()

    def get(self, code):
        """Return the first node (depth-first, parents before children) whose
        code equals `code`, or None when this subtree holds no such node."""
        if self.code == code:
            return self
        # Lazy on purpose: stop descending as soon as one child reports a hit.
        hits = (child.get(code) for child in self.children)
        return next((hit for hit in hits if hit), None)

    def insert(self, parent: 'Node', code: str):
        """Attach a child carrying `code` beneath `parent`.

        `parent` is located by identity, searching from this node downwards.
        Returns the child of `parent` that carries `code`, whether it was just
        created or was already there. Returns None when `code` equals
        `parent.code`, or when `parent` is not part of this subtree.
        """
        if parent.code == code:
            return None

        if self is not parent:
            # Not the target yet: hand the request down to each child in turn.
            for child in self.children:
                placed = child.insert(parent, code)
                if placed:
                    return placed
            return None

        if code not in self.child_codes:
            self.child_codes.add(code)
            self.children.append(Node(code))
        matching = [child for child in self.children if child.code == code]
        return matching[0]

    def __repr__(self) -> str:
        return "<Node>({})".format(self.code)

Get the paths by running a query like the one below, substituting the codes of interest for `CHILD` and `ROOT`:
```sql
select path from ncit_tc_with_path where descendant = 'CHILD' and path ~ '\yROOT\y';
```

In [4]:
# Each line of paths.txt holds one path: codes joined by "|", where the first
# code hangs directly off the tree root and every later code nests under the
# one before it, e.g. "C35850|C9315|C132728|C132732|C132733".
with open('../../paths.txt') as f:
    stripped_lines = (line.strip() for line in f)
    # Drop blank lines: "".split("|") yields [""], which would add an
    # empty-code node that printgraph writes as a second "- root" line.
    paths = [line for line in stripped_lines if line]

In [5]:
def printgraph(node: Node, level=0):
    """Write `node` and its descendants to `outfile` as a tab-indented outline.

    A code met for the first time is looked up in `ncit` and written as
    " - (code) name", preferring `display_name` over `pref_name`. A code that
    is already in `visited` is written as a "dupe*" line, but only when the
    previous line written was not itself a dupe marker. A node whose code is
    falsy is written as "- root". Children are walked in every case, one level
    deeper.

    Relies on the module-level `visited`, `printed_dupe`, `outfile` and `curr`.
    """
    global printed_dupe, outfile
    code = node.code
    if code in visited:
        if not printed_dupe:
            marker = "\t" * level + f" - ({code}) dupe* \n"
            outfile.write(marker)
            printed_dupe = True
    else:
        # Any non-dupe line re-arms the dupe marker for the next repeat.
        printed_dupe = False
        if code:
            curr.execute("select pref_name,display_name from ncit where code = %s", (code,))
            pref_name, display_name = curr.fetchone()
            pieces = ["\t" * level, " - (", code, ") ", display_name or pref_name, "\n"]
            outfile.write("".join(pieces))
            visited.add(code)
        else:
            outfile.write("- root\n")
            visited.add(None)
    for child in node.children:
        printgraph(child, level + 1)

In [6]:
# Traversal state read by printgraph: the codes already written, and whether
# the most recent line written was a dupe marker.
visited = set()
printed_dupe = False


# Build the tree: the first code of every path hangs off `root`, and each
# following code becomes a child of the code before it.
root = Node(None)
for path in paths:
    codes = path.split("|")
    parent = root
    current = None
    for code in codes:
        # `parent` is always `root` or a node returned by an earlier insert,
        # so it can adopt the child directly. Starting the search from `root`
        # would walk the tree on every insert only to arrive at this node.
        current = parent.insert(parent, code)
        # insert returns None when `code` repeats the parent's own code; stay
        # on the same parent in that case.
        if current:
            parent = current

In [7]:
# printgraph reads the module-level `visited`, `printed_dupe` and `outfile`.
# Reset the first two here so that re-running this cell writes the full
# outline again instead of treating every node as already seen.
visited = set()
printed_dupe = False
# The with-block closes the file even when printgraph raises part-way through.
with open("../../paths.out.txt", "w") as outfile:
    printgraph(root)

In [None]:
from nltk import Tree

# Fresh traversal state for to_nltk_tree: no code seen yet, no dupe marker
# emitted yet.
visited, printed_dupe = set(), False


def to_nltk_tree_verbose(node: Node):
    """Convert `node` and everything beneath it into an nltk structure.

    Every node is rendered, including codes that occur more than once; the
    `visited` / `printed_dupe` globals are not consulted. A node with children
    becomes a `Tree` labelled "(code) name"; a childless node is returned as
    that label string. The name is `display_name` when set, otherwise
    `pref_name`, both read from `ncit` through the module-level `curr`.

    A node without a code (the root) is labelled "(root)" in both cases.
    Previously it came out as "(root) None" with children and "(None) None"
    without, because no name is ever looked up for it.
    """
    if node.code:
        curr.execute(
            "select pref_name,display_name from ncit where code = %s", (node.code,)
        )
        # NOTE(review): presumably every code in the tree has a row in ncit;
        # fetchone() yields None otherwise and this unpacking fails - confirm.
        pref_name, display_name = curr.fetchone()
        label = f"({node.code}) {display_name or pref_name}"
    else:
        label = "(root)"

    if len(node.children) > 0:
        return Tree(label, [to_nltk_tree_verbose(child) for child in node.children])
    return label


def to_nltk_tree(node: Node):
    """Convert `node` and its descendants into an nltk structure, collapsing
    codes that were already rendered.

    Returns one of:
      * a `Tree` labelled "(code) name" (or "(root)" for a node without a
        code) when the node has children to show;
      * that label as a plain string when it has none;
      * "(code) dupe*" when the code is already in `visited` and the previous
        node rendered was not itself a dupe marker;
      * None when the code is already in `visited` and a dupe marker has just
        been emitted, so the repeat is suppressed.

    Suppressed repeats are filtered out of the child list. Previously the
    implicit None was passed straight to `Tree`, leaving None leaves in the
    result. Children of a repeated code are not descended into.

    Relies on the module-level `visited`, `printed_dupe` and `curr`.
    """
    global printed_dupe
    if node.code in visited:
        if printed_dupe:
            # A marker already stands in for this run of repeats.
            return None
        printed_dupe = True
        return f"({node.code}) dupe*"

    # Any freshly rendered node re-arms the dupe marker.
    printed_dupe = False
    if node.code:
        curr.execute(
            "select pref_name,display_name from ncit where code = %s", (node.code,)
        )
        # NOTE(review): presumably every code in the tree has a row in ncit;
        # fetchone() yields None otherwise and this unpacking fails - confirm.
        pref_name, display_name = curr.fetchone()
        visited.add(node.code)
        label = "(" + node.code + ") " + (display_name or pref_name)
    else:
        visited.add(None)
        label = "(root)"

    # `visited` is updated before descending so that repeats further down
    # the same branch are recognised.
    converted = (to_nltk_tree(child) for child in node.children)
    subtrees = [subtree for subtree in converted if subtree is not None]
    if subtrees:
        return Tree(label, subtrees)
    return label


# Show the unabridged tree as this cell's output. The verbose variant renders
# every node and never consults `visited` / `printed_dupe`.
to_nltk_tree_verbose(root)