# demo: TextGraph + LLMs

## intro

In [1]:
import pathlib
from IPython.display import display, HTML

In [2]:
# Load the introductory HTML blurb from disk and render it inline.
blurb_html: str = pathlib.Path("docs/demo/blurb.1.html").read_text(encoding = "utf-8")
display(HTML(blurb_html))

## parse a document

In [3]:
from icecream import ic
import pandas as pd
import pyvis
import spacy

from textgraph import Node, Edge, RenderPyVis, TextGraph

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Sample passage used throughout the demo. The cells that consume it pass
# `SRC_TEXT.strip()`, so the newlines framing the sentence are dropped before parsing.
# The literal carries no trailing spaces, which would be invisible in the notebook.
SRC_TEXT: str = """
Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
"""

In [5]:
# Instantiate TextGraph, then parse the whitespace-trimmed sample text into a spaCy `Doc`.
tg: TextGraph = TextGraph()
sample_doc: spacy.tokens.doc.Doc = tg.build_doc(SRC_TEXT.strip())

2023-11-28 11:05:59,837 - root - INFO - Initializing word embedding with word2vec.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
# Show the parsed document inline using displaCy's "ent" (entity) style.
spacy.displacy.render(sample_doc, style = "ent", jupyter = True)

In [7]:
# Show the parsed document inline using displaCy's "dep" (dependency) style.
spacy.displacy.render(sample_doc, style = "dep", jupyter = True)

## build a lemma graph from the document

In [8]:
# Build the graph from the parsed document, with debug tracing switched on.
tg.build_graph_embeddings(sample_doc, debug = True)

ic| sent_id: 0
    sent: Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
    sent.start: 0
ic| sent_nodes: [Node(node_id=0, span=Werner Herzog, text='Werner Herzog', pos='PROPN', sents={0}, kind='PERSON', count=1, weight=0.0),
                 Node(node_id=1, span=is, text='is', pos='AUX', sents={0}, kind=None, count=0, weight=0.0),
                 Node(node_id=2, span=a, text='a', pos='DET', sents={0}, kind=None, count=0, weight=0.0),
                 Node(node_id=3, span=remarkable, text='remarkable', pos='ADJ', sents={0}, kind=None, count=0, weight=0.0),
                 Node(node_id=4, span=filmmaker, text='filmmaker', pos='NOUN', sents={0}, kind=None, count=1, weight=0.0),
                 Node(node_id=5, span=and, text='and', pos='CCONJ', sents={0}, kind=None, count=0, weight=0.0),
                 Node(node_id=6, span=an, text='an', pos='DET', sents={0}, kind=None, count=0, weight=0.0),
                 Node(node_

In [9]:
# Infer relations from the same whitespace-trimmed text, with debug tracing switched on.
tg.infer_relations(SRC_TEXT.strip(), debug = True)

ic| src.node_id: 0, dst.node_id: 10, path: [0, 1, 4, 7, 9, 10]
ic| rel: 'country of citizenship', prob: 0.9073653221130371
ic| src.node_id: 0
    dst.node_id: 15
    path: [0, 1, 4, 7, 9, 10, 13, 14, 15]
ic| rel: 'father', prob: 0.5981622934341431
ic| src.node_id: 10, dst.node_id: 0, path: [10, 9, 7, 4, 1, 0]
ic| rel: 'father', prob: 0.46716827154159546
ic| src.node_id: 10, dst.node_id: 15, path: [10, 13, 14, 15]
ic| rel: 'father', prob: 0.6251675486564636
ic| src.node_id: 15
    dst.node_id: 0
    path: [15, 14, 13, 10, 9, 7, 4, 1, 0]
ic| rel: 'father', prob: 0.41431477665901184
ic| src.node_id: 15, dst.node_id: 10, path: [15, 14, 13, 10]
ic| rel: 'country of citizenship', prob: 0.8607672452926636


In [10]:
# Compute the phrase ranks on the graph built so far.
tg.calc_phrase_ranks()

# Dump the edge map for inspection; the trailing `;` keeps the notebook from
# also echoing the expression's value as the cell result.
ic(tg.edges);

ic| tg.edges: {'0.1.nsubj.0': Edge(src_node=0, dst_node=1, kind=<RelEnum.DEP: 0>, rel='nsubj', prob=1.0, count=1),
               '0.10.country_of_citizenship.1': Edge(src_node=0, dst_node=10, kind=<RelEnum.INF: 1>, rel='country of citizenship', prob=0.9073653221130371, count=1),
               '0.15.father.1': Edge(src_node=0, dst_node=15, kind=<RelEnum.INF: 1>, rel='father', prob=0.5981622934341431, count=1),
               '10.0.father.1': Edge(src_node=10, dst_node=0, kind=<RelEnum.INF: 1>, rel='father', prob=0.46716827154159546, count=1),
               '10.15.father.1': Edge(src_node=10, dst_node=15, kind=<RelEnum.INF: 1>, rel='father', prob=0.6251675486564636, count=1),
               '10.9.pobj.0': Edge(src_node=10, dst_node=9, kind=<RelEnum.DEP: 0>, rel='pobj', prob=1.0, count=1),
               '11.10.punct.0': Edge(src_node=11, dst_node=10, kind=<RelEnum.DEP: 0>, rel='punct', prob=1.0, count=1),
               '12.13.det.0': Edge(src_node=12, dst_node=13, kind=<RelEnum.DEP: 

show the resulting entities and phrases extracted from the document, ranked by weight

In [11]:
# Collect the phrases into a DataFrame; the bare name on the last line makes it the cell's displayed result.
df: pd.DataFrame = tg.get_phrases_as_df()
df

Unnamed: 0,node_id,text,pos,kind,count,weight
0,10,Germany,PROPN,GPE,1,0.212239
1,0,Werner Herzog,PROPN,PERSON,1,0.192566
2,15,Dietrich Herzog,PROPN,PERSON,1,0.192566
3,4,filmmaker,NOUN,,1,0.140934
4,7,intellectual,NOUN,,1,0.134073
5,13,son,NOUN,,1,0.127621


## visualize the lemma graph

In [12]:
# Hand the graph's nodes, edges, and lemma graph to the PyVis renderer,
# then build the network object that the following cells lay out and show.
render: RenderPyVis = RenderPyVis(tg.nodes, tg.edges, tg.lemma_graph)
vis_graph: pyvis.network.Network = render.build_lemma_graph()

ic| node.count: 1
    node: Node(node_id=0, span=Werner Herzog, text='Werner Herzog', pos='PROPN', sents={0}, kind='PERSON', count=1, weight=0.19256602205245993)
    nx_node: {'color': '#d2d493',
              'kind': 1,
              'label': 'Werner Herzog',
              'neighbors': 3,
              'shape': 'circle',
              'size': 1,
              'value': 0.19256602205245993}
ic| node.count: 0
    node: Node(node_id=1, span=is, text='is', pos='AUX', sents={0}, kind=None, count=0, weight=0.0)
    nx_node: {'color': 'hsla(72, 19%, 90%, 0.4)',
              'kind': 0,
              'label': '',
              'neighbors': 0,
              'shape': 'star',
              'size': 0,
              'title': 'is',
              'value': 0.0}
ic| node.count: 0
    node: Node(node_id=2, span=a, text='a', pos='DET', sents={0}, kind=None, count=0, weight=0.0)
    nx_node: {'color': 'hsla(72, 19%, 90%, 0.4)',
              'kind': 0,
              'label': '',
              'neighbors':

set the layout parameters

In [13]:
# Select the ForceAtlas2-based physics layout with the settings below.
# NOTE(review): the numeric values have no recorded rationale in this notebook —
# confirm against the pyvis documentation before changing them.
vis_graph.force_atlas_2based(
    gravity = -38,
    central_gravity = 0.01,
    spring_length = 231,
    spring_strength = 0.7,
    damping = 0.8,
    overlap = 0,
)

# Show only the "physics" group of configuration buttons, and switch physics on.
vis_graph.show_buttons(filter_ = [ "physics" ])
vis_graph.toggle_physics(True)

In [14]:
# Prepare the network for notebook rendering, then show it via the "vis.html" page.
vis_graph.prep_notebook()
vis_graph.show("vis.html")

vis.html


## outro

_\[ more parts are in progress and will be added to this demo \]_