# demo: TextGraph + LLMs

## intro

In [1]:
import pathlib
from IPython.display import display, HTML

In [2]:
# Load the pre-written HTML introduction from disk and render it inline.
blurb_path = pathlib.Path("docs/demo/blurb.1.html")
blurb_html = blurb_path.read_text(encoding = "utf-8")
display(HTML(blurb_html))

## parse a document

In [3]:
from icecream import ic
import pandas as pd
import pyvis
import spacy

from textgraph import Edge, Node, RenderPyVis, TextGraph

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Sample passage parsed throughout this demo.  The literal keeps its leading
# and trailing newline; consumers call `SRC_TEXT.strip()` before parsing.
# The opening line previously carried a long run of invisible trailing spaces
# that silently became part of the constant -- removed here.
SRC_TEXT: str = """
Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
Although in fact Werner Herzog never really liked his father, as we all know.
"""

In [5]:
# Create the TextGraph object that the rest of the notebook operates on.
tg: TextGraph = TextGraph()

# Parse the sample passage (surrounding whitespace removed) into a spaCy document.
sample_doc: spacy.tokens.doc.Doc = tg.build_doc(SRC_TEXT.strip())

2023-11-28 15:01:43,808 - root - INFO - Initializing word embedding with word2vec.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
# Visualize the named entities that were recognized in the parsed document.
spacy.displacy.render(sample_doc, style = "ent", jupyter = True)

In [7]:
# Visualize the dependency parse of the parsed document.
spacy.displacy.render(sample_doc, style = "dep", jupyter = True)

## build a lemma graph from the document

In [8]:
# Populate the graph from the parsed document.  With `debug = True` the call
# traces each sentence and its nodes through icecream, as seen in the cell output.
tg.build_graph_embeddings(sample_doc, debug = True)

ic| sent_id: 0
    sent: Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
    sent.start: 0
ic| sent_nodes: [Node(node_id=0, span=Werner Herzog, text='Werner Herzog', pos='PROPN', sents={0}, kind='PERSON', count=1, neighbors=0, weight=0.0),
                 Node(node_id=1, span=is, text='is', pos='AUX', sents={0}, kind=None, count=0, neighbors=0, weight=0.0),
                 Node(node_id=2, span=a, text='a', pos='DET', sents={0}, kind=None, count=0, neighbors=0, weight=0.0),
                 Node(node_id=3, span=remarkable, text='remarkable', pos='ADJ', sents={0}, kind=None, count=0, neighbors=0, weight=0.0),
                 Node(node_id=4, span=filmmaker, text='filmmaker', pos='NOUN', sents={0}, kind=None, count=1, neighbors=0, weight=0.0),
                 Node(node_id=5, span=and, text='and', pos='CCONJ', sents={0}, kind=None, count=0, neighbors=0, weight=0.0),
                 Node(node_id=6, span=an, text='an', pos=

In [9]:
# Rank the phrases in the graph; the weights computed here appear in the
# phrase table and in the node dumps further down.
tg.calc_phrase_ranks()

# Dump the edge dictionary for inspection.  The trailing `;` suppresses the
# cell's return-value echo so only the icecream trace is shown.
ic(tg.edges);

ic| tg.edges: {'0.1.nsubj.0': Edge(src_node=0, dst_node=1, kind=<RelEnum.DEP: 0>, rel='nsubj', prob=1.0, count=1),
               '0.19.nsubj.0': Edge(src_node=0, dst_node=19, kind=<RelEnum.DEP: 0>, rel='nsubj', prob=1.0, count=1),
               '10.9.pobj.0': Edge(src_node=10, dst_node=9, kind=<RelEnum.DEP: 0>, rel='pobj', prob=1.0, count=1),
               '11.10.punct.0': Edge(src_node=11, dst_node=10, kind=<RelEnum.DEP: 0>, rel='punct', prob=1.0, count=1),
               '12.13.det.0': Edge(src_node=12, dst_node=13, kind=<RelEnum.DEP: 0>, rel='det', prob=1.0, count=1),
               '13.10.appos.0': Edge(src_node=13, dst_node=10, kind=<RelEnum.DEP: 0>, rel='appos', prob=1.0, count=1),
               '14.13.prep.0': Edge(src_node=14, dst_node=13, kind=<RelEnum.DEP: 0>, rel='prep', prob=1.0, count=1),
               '15.14.pobj.0': Edge(src_node=15, dst_node=14, kind=<RelEnum.DEP: 0>, rel='pobj', prob=1.0, count=1),
               '16.1.punct.0': Edge(src_node=16, dst_node=1, kind=

Show the resulting phrases and entities extracted from the document, ranked by weight:

In [10]:
# Collect the ranked phrases into a DataFrame.  The bare `df` on the last
# line lets the notebook render it as a table.
df: pd.DataFrame = tg.get_phrases_as_df()
df

Unnamed: 0,node_id,text,pos,kind,count,weight
0,4,filmmaker,NOUN,,1,0.135571
1,7,intellectual,NOUN,,1,0.135571
2,13,son,NOUN,,1,0.131913
3,20,fact,NOUN,,1,0.128481
4,25,father,NOUN,,1,0.128481
5,0,Werner Herzog,PROPN,PERSON,2,0.116932
6,10,Germany,PROPN,GPE,1,0.112995
7,15,Dietrich Herzog,PROPN,PERSON,1,0.110056


## visualize the lemma graph

In [11]:
# Hand the graph's nodes, edges, and lemma graph to the PyVis renderer.
render: RenderPyVis = RenderPyVis(tg.nodes, tg.edges, tg.lemma_graph)

# Convert the lemma graph into a PyVis network ready for layout and display.
pv_graph: pyvis.network.Network = render.build_lemma_graph()

ic| node.count: 2
    node: Node(node_id=0, span=Werner Herzog, text='Werner Herzog', pos='PROPN', sents={0, 1}, kind='PERSON', count=2, neighbors=2, weight=0.11693226085364829)
    nx_node: {'color': 'hsl(65, 46%, 58%)',
              'kind': <NodeKind.ENT: 2>,
              'label': 'Werner Herzog',
              'neighbors': 2,
              'shape': 'circle',
              'size': 2,
              'title': 'werner herzog.PROPN',
              'value': 0.11693226085364829}
ic| node.count: 0
    node: Node(node_id=1, span=is, text='is', pos='AUX', sents={0}, kind=None, count=0, neighbors=0, weight=0.0)
    nx_node: {'color': 'hsla(72, 19%, 90%, 0.4)',
              'kind': <NodeKind.DEP: 0>,
              'label': '',
              'neighbors': 0,
              'shape': 'star',
              'size': 0,
              'title': '1.is.AUX',
              'value': 0.0}
ic| node.count: 0
    node: Node(node_id=2, span=a, text='a', pos='DET', sents={0}, kind=None, count=0, neighbors=1, weig

Set the layout parameters for the graph's physics simulation:

In [12]:
# Tunable settings for the ForceAtlas2-based layout; adjust these to change
# how tightly the nodes cluster.
layout_params = {
    "gravity": -38,
    "central_gravity": 0.01,
    "spring_length": 231,
    "spring_strength": 0.7,
    "damping": 0.8,
    "overlap": 0,
}
pv_graph.force_atlas_2based(**layout_params)

# Show only the physics controls in the rendered page and leave the
# simulation switched on.
pv_graph.show_buttons(filter_ = [ "physics" ])
pv_graph.toggle_physics(True)

In [13]:
# Prepare the network for notebook rendering first, then write the
# interactive page to "vis.html" and display it.
pv_graph.prep_notebook()
pv_graph.show("vis.html")

vis.html


## infer relations

In [14]:
# Relation inference is switched off by default in this demo; set the flag to
# True to run it on the sample passage.
# NOTE(review): the reason this step is disabled is not recorded here --
# confirm before enabling.
RUN_INFER_RELATIONS: bool = False

if RUN_INFER_RELATIONS:
    tg.infer_relations(
        SRC_TEXT.strip(),
        debug = True,
    )

## outro

_\[ more parts are in progress and will be added to this demo \]_