# demo: textgraph

<details>
    <summary><strong>What is this project?</strong></summary>
<p>
Implementation of an LLM-augmented <code>textgraph</code> algorithm for constructing a <em>knowledge graph</em> from raw, unstructured text sources.
</p>
<p>
This code is based on work developed by
<a href="https://derwen.ai/graph" target="_blank">Derwen</a>
in early 2023 for enterprise customer sample apps and our
<a href="https://derwen.ai/cysoni" target="_blank">Cysoni</a>
product.
It integrates code from:
</p>
<ul>
  <li>
    <a href="https://github.com/tomaarsen/SpanMarkerNER/" target="_blank"><code>SpanMarkerNER</code></a>
  </li>
  <li>
    <a href="https://github.com/thunlp/OpenNRE/" target="_blank"><code>OpenNRE</code></a>
  </li>
  <li>
    <a href="https://github.com/DerwenAI/pytextrank/" target="_blank"><code>PyTextRank</code></a>
  </li>
  <li>
    <a href="https://medium.com/@groxli/create-a-spacy-visualizer-with-streamlit-8b9b41b36745" target="_blank"><em>Create a spaCy Visualizer with Streamlit</em></a>
  </li>
</ul>

<p>
This approach was presented in the talks:
</p>
<ul>
  <li>
    <a href="https://derwen.ai/s/mqqm" target="_blank">"Language, Graphs, and AI in Industry"</a>
    <br/>
    <strong>Paco Nathan</strong>, K1st World (2023-10-11)
  </li>
  <li>
    <a href="https://derwen.ai/s/rhvg" target="_blank">"Language Tools for Creators"</a>
    <br/>
    <strong>Paco Nathan</strong>, FOSSY (2023-07-13)
  </li>
</ul>
<p>
Other good tutorials from 2023 that cover closely related material:
</p>
<ul>
  <li>
    <a href="https://youtu.be/C9p7suS-NGk?si=7Ohq3BV654ia2Im4" target="_blank">"Natural Intelligence is All You Need™"</a>
    <br/>
    <strong>Vincent Warmerdam</strong>, PyData Amsterdam (2023-09-15)
  </li>
  <li>
    <a href="https://towardsdatascience.com/how-to-convert-any-text-into-a-graph-of-concepts-110844f22a1a" target="_blank">"How to Convert Any Text Into a Graph of Concepts"</a>
    <br/>
    <strong>Rahul Nayak</strong>, <em>Towards Data Science</em> (2023-11-09)
  </li>
</ul>

</details>

## parse a document

In [1]:
from icecream import ic
import pyvis
import spacy

from textgraph import Node, Edge, TextGraph
from textgraph import RenderPyVis

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Alternate sample sentence. It used to be assigned to `SRC_TEXT` and then
# immediately overwritten; it now lives under its own name so it stays
# available. Assign it to `SRC_TEXT` to run the demo on it instead.
SRC_TEXT_ALT: str = """
Werner Herzog is a remarkable filmmaker and an intellectual originally from Germany, the son of Dietrich Herzog.
"""

# Text parsed by the rest of the notebook; later cells call `SRC_TEXT.strip()`.
SRC_TEXT: str = """
Herzog was born Werner Stipetić in Munich, Nazi Germany, to Elisabeth Stipetić, an Austrian of Croatian descent, and Dietrich Herzog, a German.
"""

In [3]:
# Stand up the pipeline object, then parse the whitespace-trimmed sample
# text into a spaCy document with the `use_llm` flag switched off.
tg: TextGraph = TextGraph()
sample_doc: spacy.tokens.doc.Doc = tg.build_doc(SRC_TEXT.strip(), use_llm = False)

2023-11-26 11:20:43,533 - root - INFO - Initializing word embedding with word2vec.


In [4]:
# Show the parsed document inline using displaCy's "ent" style.
spacy.displacy.render(sample_doc, style = "ent", jupyter = True)

In [5]:
# Show the same document inline using displaCy's "dep" style.
spacy.displacy.render(sample_doc, style = "dep", jupyter = True)

## build a lemma graph from the document

In [6]:
# Feed the parsed document into the graph-building step with `debug` on
# (the `ic|` trace in the output presumably comes from this flag).
tg.build_graph_embeddings(sample_doc, debug = True)

ic| sent: Herzog was born Werner Stipetić in Munich, Nazi Germany, to Elisabeth Stipetić, an Austrian of Croatian descent, and Dietrich Herzog, a German.


In [7]:
# Run relation inference on the trimmed source text (note: the raw text is
# passed here, not `sample_doc`), again with `debug` on.
tg.infer_relations(SRC_TEXT.strip(), debug = True)

ic| src.node_id: 3, dst.node_id: 5, path: [3, 4, 5]
ic| rel: 'residence', prob: 0.393816202878952
ic| src.node_id: 3, dst.node_id: 7, path: [3, 4, 5, 7]
ic| rel: 'country of citizenship', prob: 0.8388875722885132
ic| src.node_id: 3, dst.node_id: 10, path: [3, 2, 9, 10]
ic| rel: 'mother', prob: 0.5407394170761108
ic| src.node_id: 3, dst.node_id: 13, path: [3, 2, 9, 10, 13]
ic| rel: 'country of citizenship', prob: 0.767692506313324
ic| src.node_id: 3
    dst.node_id: 15
    path: [3, 2, 9, 10, 13, 14, 16, 15]
ic| rel: 'country of citizenship', prob: 0.5234690308570862
ic| src.node_id: 3, dst.node_id: 19, path: [3, 2, 19]
ic| rel: 'sibling', prob: 0.37065911293029785
ic| src.node_id: 3, dst.node_id: 22, path: [3, 2, 19, 22]
ic| rel: 'father', prob: 0.16549424827098846
ic| src.node_id: 5, dst.node_id: 3, path: [5, 4, 3]
ic| rel: 'sibling', prob: 0.31588953733444214
ic| src.node_id: 5, dst.node_id: 7, path: [5, 7]
ic| rel: 'country', prob: 0.30468812584877014
ic| src.node_id: 5, dst.node_id

In [8]:
# Run the phrase-ranking step on the graph assembled by the cells above.
tg.calc_phrase_ranks()

# Trace `tg.edges` with icecream. The trailing `;` keeps the notebook from
# also echoing the expression's value, so only the `ic|` trace is displayed.
ic(tg.edges);

ic| tg.edges: {'0.2.nsubjpass.0': Edge(src_node=0, dst_node=2, kind=<RelEnum.DEP: 0>, rel='nsubjpass', prob=1.0, count=1),
               '1.2.auxpass.0': Edge(src_node=1, dst_node=2, kind=<RelEnum.DEP: 0>, rel='auxpass', prob=1.0, count=1),
               '10.13.country_of_citizenship.1': Edge(src_node=10, dst_node=13, kind=<RelEnum.INF: 1>, rel='country of citizenship', prob=0.6766995191574097, count=1),
               '10.15.country_of_citizenship.1': Edge(src_node=10, dst_node=15, kind=<RelEnum.INF: 1>, rel='country of citizenship', prob=0.47829607129096985, count=1),
               '10.19.sibling.1': Edge(src_node=10, dst_node=19, kind=<RelEnum.INF: 1>, rel='sibling', prob=0.4432893991470337, count=1),
               '10.22.father.1': Edge(src_node=10, dst_node=22, kind=<RelEnum.INF: 1>, rel='father', prob=0.21240809559822083, count=1),
               '10.3.sibling.1': Edge(src_node=10, dst_node=3, kind=<RelEnum.INF: 1>, rel='sibling', prob=0.45087841153144836, count=1),
         

show the resulting entities extracted from the document

In [9]:
# Trace every node returned by `tg.get_phrases()` with icecream. The loop
# variable's name is echoed in the `ic| node: ...` output, so renaming it
# would change what gets printed.
for node in tg.get_phrases():
    ic(node)

ic| node: Node(node_id=10, span=Elisabeth Stipetić, text='Elisabeth Stipetić', pos='PROPN', kind='PERSON', count=1, weight=0.12497108235660234)
ic| node: Node(node_id=13, span=Austrian, text='Austrian', pos='ADJ', kind='NORP', count=1, weight=0.12497108235660234)
ic| node: Node(node_id=3, span=Werner Stipetić, text='Werner Stipetić', pos='PROPN', kind='PERSON', count=1, weight=0.11991720772538196)
ic| node: Node(node_id=7, span=Nazi Germany, text='Nazi Germany', pos='PROPN', kind='GPE', count=1, weight=0.11991720772538196)
ic| node: Node(node_id=5, span=Munich, text='Munich', pos='PROPN', kind='GPE', count=1, weight=0.11515368845567156)
ic| node: Node(node_id=19, span=Dietrich Herzog, text='Dietrich Herzog', pos='PROPN', kind='PERSON', count=1, weight=0.11515368845567156)
ic| node: Node(node_id=15, span=Croatian, text='Croatian', pos='ADJ', kind='NORP', count=1, weight=0.11071800754286118)
ic| node: Node(node_id=22, span=German, text='German', pos='PROPN', kind='NORP', count=1, weight=

## visualize the lemma graph

In [10]:
# Hand the graph's nodes, edges, and lemma graph to the PyVis renderer.
# `RenderPyVis` and `pyvis` are imported in the notebook's first code cell,
# alongside the other imports, so a fresh "Restart & Run All" resolves every
# dependency up front.
render: RenderPyVis = RenderPyVis(
    tg.nodes,
    tg.edges,
    tg.lemma_graph,
)

# Build the network object that the following cells lay out and display.
vis_graph: pyvis.network.Network = render.build_lemma_graph()

ic| node.count: 1
    node: Node(node_id=0, span=Herzog, text='Herzog', pos='PROPN', kind=None, count=1, weight=0.018152660693847897)
    nx_node: {'color': '#c083bb',
              'kind': 2,
              'label': 'Herzog',
              'neighbors': 1,
              'shape': 'square',
              'size': 1,
              'value': 0.018152660693847897}
ic| node.count: 0
    node: Node(node_id=1, span=was, text='was', pos='AUX', kind=None, count=0, weight=0.0)
    nx_node: {'color': 'hsla(72, 19%, 90%, 0.4)',
              'kind': 0,
              'label': '',
              'neighbors': 1,
              'shape': 'star',
              'size': 0,
              'title': 'was',
              'value': 0.0}
ic| node.count: 1
    node: Node(node_id=2, span=born, text='born', pos='VERB', kind=None, count=1, weight=0.022174706451270045)
    nx_node: {'color': '#c083bb',
              'kind': 2,
              'label': 'born',
              'neighbors': 0,
              'shape': 'square',
    

set the layout parameters

In [11]:
# Configure the network's ForceAtlas2-based physics layout. The numeric values
# are hand-picked for this demo — NOTE(review): no rationale is recorded in the
# notebook; adjust them to change how tightly nodes cluster.
vis_graph.force_atlas_2based(
    gravity = -38,
    central_gravity = 0.01,
    spring_length = 231,
    spring_strength = 0.7,
    damping = 0.8,
    overlap = 0,
)

# Limit the on-page configuration buttons to the "physics" group, then turn
# physics on. Statement order is kept as-is since all three calls mutate the
# same network's options.
vis_graph.show_buttons(filter_ = [ "physics" ])
vis_graph.toggle_physics(True)

In [12]:
# Prepare the network for in-notebook display, then render it as "vis.html"
# (presumably written relative to the working directory — confirm against the
# pyvis docs). The cell output echoes the file name.
vis_graph.prep_notebook()
vis_graph.show("vis.html")

vis.html


## outro

_\[ more parts are getting added to this demo \]_