# demo: textgraph

<details>
    <summary><strong>What is this project?</strong></summary>
<p>
Implementation of an LLM-augmented <code>textgraph</code> algorithm for constructing a <em>knowledge graph</em> from raw, unstructured text sources.
</p>
<p>
This code is based on work developed by
<a href="https://derwen.ai/graph" target="_blank">Derwen</a>
in early 2023 for enterprise customer sample apps and our
<a href="https://derwen.ai/cysoni" target="_blank">Cysoni</a>
product.
It integrates code from:
</p>
<ul>
  <li>
    <a href="https://github.com/tomaarsen/SpanMarkerNER/" target="_blank"><code>SpanMarkerNER</code></a>
  </li>
  <li>
    <a href="https://github.com/thunlp/OpenNRE/" target="_blank"><code>OpenNRE</code></a>
  </li>
  <li>
    <a href="https://github.com/DerwenAI/pytextrank/" target="_blank"><code>PyTextRank</code></a>
  </li>
  <li>
    <a href="https://medium.com/@groxli/create-a-spacy-visualizer-with-streamlit-8b9b41b36745" target="_blank"><em>Create a spaCy Visualizer with Streamlit</em></a>
  </li>
</ul>

<p>
This approach was presented in the talks:
</p>
<ul>
  <li>
    <a href="https://derwen.ai/s/mqqm" target="_blank">"Language, Graphs, and AI in Industry"</a>
    <br/>
    <strong>Paco Nathan</strong>, K1st World (2023-10-11)
  </li>
  <li>
    <a href="https://derwen.ai/s/rhvg" target="_blank">"Language Tools for Creators"</a>
    <br/>
    <strong>Paco Nathan</strong>, FOSSY (2023-07-13)
  </li>
</ul>
<p>
Other good tutorials from 2023 that include closely related material:
</p>
<ul>
  <li>
    <a href="https://youtu.be/C9p7suS-NGk?si=7Ohq3BV654ia2Im4" target="_blank">"Natural Intelligence is All You Need™"</a>
    <br/>
    <strong>Vincent Warmerdam</strong>, PyData Amsterdam (2023-09-15)
  </li>
  <li>
    <a href="https://towardsdatascience.com/how-to-convert-any-text-into-a-graph-of-concepts-110844f22a1a" target="_blank">"How to Convert Any Text Into a Graph of Concepts"</a>
    <br/>
    <strong>Rahul Nayak</strong>, <em>Towards Data Science</em> (2023-11-09)
  </li>
</ul>

</details>

## parse a document

In [1]:
from icecream import ic
from textgraph import Node, Edge, TextGraph
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Alternative sample sentence, kept for experimentation. It is NOT parsed
# below; previously it was bound to SRC_TEXT and then silently overwritten.
SRC_TEXT_ALT: str = """
Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
"""

# Sample sentence parsed throughout the rest of this notebook.
SRC_TEXT: str = """
Herzog was born Werner Stipetić in Munich, Nazi Germany, to Elisabeth Stipetić, an Austrian of Croatian descent, and Dietrich Herzog, a German.
"""

In [3]:
# Instantiate TextGraph, then parse the sample text (surrounding whitespace
# stripped) into a spaCy `Doc`.
tg: TextGraph = TextGraph()
sample_doc: spacy.tokens.doc.Doc = tg.build_doc(SRC_TEXT.strip())

2023-11-26 18:03:03,292 - root - INFO - Initializing word embedding with word2vec.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [4]:
# Show the parsed document inline using displaCy's "ent" style.
spacy.displacy.render(sample_doc, style="ent", jupyter=True)

In [5]:
# Show the parsed document inline using displaCy's "dep" style.
spacy.displacy.render(sample_doc, style="dep", jupyter=True)

## build a lemma graph from the document

In [6]:
# Build the graph from the parsed document; `debug=True` makes the call
# emit `ic` traces such as the sentence being processed.
tg.build_graph_embeddings(sample_doc, debug=True)

ic| sent: Herzog was born Werner Stipetić in Munich, Nazi Germany, to Elisabeth Stipetić, an Austrian of Croatian descent, and Dietrich Herzog, a German.


In [7]:
# Infer relations over the same stripped sample text; with `debug=True` each
# candidate (src, dst, path) and its predicted relation/probability is traced.
tg.infer_relations(SRC_TEXT.strip(), debug=True)

ic| src.node_id: 0, dst.node_id: 3, path: [0, 2, 3]
ic| rel: 'sibling', prob: 0.26503854990005493
ic| src.node_id: 0, dst.node_id: 5, path: [0, 2, 3, 4, 5]
ic| rel: 'father', prob: 0.09945425391197205
ic| src.node_id: 0, dst.node_id: 7, path: [0, 2, 3, 4, 5, 8, 7]
ic| rel: 'mother', prob: 0.3116134703159332
ic| src.node_id: 0, dst.node_id: 8, path: [0, 2, 3, 4, 5, 8]
ic| rel: 'country of citizenship', prob: 0.42767634987831116
ic| src.node_id: 0, dst.node_id: 11, path: [0, 2, 10, 11]
ic| rel: 'mother', prob: 0.5403972864151001
ic| src.node_id: 0, dst.node_id: 14, path: [0, 2, 10, 11, 14]
ic| rel: 'country of citizenship', prob: 0.30031293630599976
ic| src.node_id: 0
    dst.node_id: 16
    path: [0, 2, 10, 11, 14, 15, 17, 16]
ic| rel: 'country', prob: 0.17259138822555542
ic| src.node_id: 0, dst.node_id: 20, path: [0, 2, 20]
ic| rel: 'sibling', prob: 0.17631608247756958
ic| src.node_id: 0, dst.node_id: 23, path: [0, 2, 20, 23]
ic| rel: 'father', prob: 0.17636685073375702
ic| src.node_id

In [8]:
# Compute phrase ranks on the graph built so far.
# NOTE(review): presumably this populates the node weights listed by
# `get_phrases()` in the next cell — confirm against the library.
tg.calc_phrase_ranks()

# Dump the edge dictionary for inspection. The trailing `;` keeps the
# notebook from also echoing the expression's value as cell output.
ic(tg.edges);

ic| tg.edges: {'0.11.mother.1': Edge(src_node=0, dst_node=11, kind=<RelEnum.INF: 1>, rel='mother', prob=0.5403972864151001, count=1),
               '0.14.country_of_citizenship.1': Edge(src_node=0, dst_node=14, kind=<RelEnum.INF: 1>, rel='country of citizenship', prob=0.30031293630599976, count=1),
               '0.16.country.1': Edge(src_node=0, dst_node=16, kind=<RelEnum.INF: 1>, rel='country', prob=0.17259138822555542, count=1),
               '0.2.nsubjpass.0': Edge(src_node=0, dst_node=2, kind=<RelEnum.DEP: 0>, rel='nsubjpass', prob=1.0, count=1),
               '0.20.sibling.1': Edge(src_node=0, dst_node=20, kind=<RelEnum.INF: 1>, rel='sibling', prob=0.17631608247756958, count=1),
               '0.23.father.1': Edge(src_node=0, dst_node=23, kind=<RelEnum.INF: 1>, rel='father', prob=0.17636685073375702, count=1),
               '0.3.sibling.1': Edge(src_node=0, dst_node=3, kind=<RelEnum.INF: 1>, rel='sibling', prob=0.26503854990005493, count=1),
               '0.5.father.1': E

show the resulting entities extracted from the document

In [9]:
# Print every phrase node returned by the graph, one `ic` line per node.
# NOTE(review): the captured output appears ordered by descending weight —
# confirm that `get_phrases()` guarantees this ordering.
for node in tg.get_phrases():
    ic(node)

ic| node: Node(node_id=11, span=Elisabeth Stipetić, text='Elisabeth Stipetić', pos='PROPN', kind='PERSON', count=1, weight=0.11075195549074206)
ic| node: Node(node_id=8, span=Germany, text='Germany', pos='PROPN', kind='GPE', count=1, weight=0.10623756211400794)
ic| node: Node(node_id=3, span=Werner Stipetić, text='Werner Stipetić', pos='PROPN', kind='PERSON', count=1, weight=0.10194127764622507)
ic| node: Node(node_id=14, span=Austrian, text='Austrian', pos='ADJ', kind='NORP', count=1, weight=0.10194127764622507)
ic| node: Node(node_id=5, span=Munich, text='Munich', pos='PROPN', kind='GPE', count=1, weight=0.09789182344646799)
ic| node: Node(node_id=20, span=Dietrich Herzog, text='Dietrich Herzog', pos='PROPN', kind='PERSON', count=1, weight=0.09789182344646799)
ic| node: Node(node_id=7, span=Nazi, text='Nazi', pos='PROPN', kind='NORP', count=1, weight=0.09412106370264221)
ic| node: Node(node_id=16, span=Croatian, text='Croatian', pos='ADJ', kind='NORP', count=1, weight=0.0941210637026

## visualize the lemma graph

In [10]:
from textgraph import RenderPyVis
import pyvis

# Hand the graph's nodes, edges, and lemma graph to the PyVis renderer.
render: RenderPyVis = RenderPyVis(tg.nodes, tg.edges, tg.lemma_graph)

# Build the PyVis network that the following cells configure and display.
vis_graph: pyvis.network.Network = render.build_lemma_graph()

ic| node.count: 1
    node: Node(node_id=0, span=Herzog, text='Herzog', pos='PROPN', kind='PERSON', count=1, weight=0.0906637782836777)
    nx_node: {'color': '#d2d493',
              'kind': 1,
              'label': 'Herzog',
              'neighbors': 10,
              'shape': 'circle',
              'size': 1,
              'value': 0.0906637782836777}
ic| node.count: 0
    node: Node(node_id=1, span=was, text='was', pos='AUX', kind=None, count=0, weight=0.0)
    nx_node: {'color': 'hsla(72, 19%, 90%, 0.4)',
              'kind': 0,
              'label': '',
              'neighbors': 1,
              'shape': 'star',
              'size': 0,
              'title': 'was',
              'value': 0.0}
ic| node.count: 1
    node: Node(node_id=2, span=born, text='born', pos='VERB', kind=None, count=1, weight=0.007551185579300494)
    nx_node: {'color': '#c083bb',
              'kind': 2,
              'label': 'born',
              'neighbors': 0,
              'shape': 'square',
   

set the layout parameters

In [11]:
# Configure the force-atlas-2 based layout; the values below are the tuning
# chosen for this demo graph.
vis_graph.force_atlas_2based(
    gravity=-38,
    central_gravity=0.01,
    spring_length=231,
    spring_strength=0.7,
    damping=0.8,
    overlap=0,
)

# Expose only the "physics" controls in the rendered page and turn physics on.
vis_graph.show_buttons(filter_=["physics"])
vis_graph.toggle_physics(True)

In [12]:
# Prepare the network for notebook display, then render it to `vis.html`
# (the file name is echoed as this cell's output).
vis_graph.prep_notebook()
vis_graph.show("vis.html")

vis.html


## outro

_\[ more parts are getting added to this demo \]_