## Setup

In [16]:
import pandas as pd
import numpy as np
import os
#from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
#from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
#from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the dataset folder; it is reused for both the input and the output location
data_dir = "cureus"
## Input data directory
inputdirectory = Path("./data_input") / data_dir
## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path("./data_output") / out_dir

## Load Documents

In [17]:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
#loader = DirectoryLoader(inputdirectory, show_progress=True)
#documents = loader.load()
#splitter = RecursiveCharacterTextSplitter(
    #chunk_size=1500,
    #chunk_overlap=150,
    #length_function=len,
    #is_separator_regex=False,
#)

#pages = splitter.split_documents(documents)
#print("Number of chunks = ", len(pages))
#print(pages[3].page_content)
import json
from langchain_text_splitters import RecursiveCharacterTextSplitter
import pandas as pd

## Location of the JSON document inside the input directory
json_file_path = inputdirectory / "test.json"

## Parse the whole file into nested Python objects
with json_file_path.open("r", encoding="utf-8") as f:
    data = json.load(f)

def extract_text_from_json(json_data):
    """Collect the text of every paragraph in a chapter/article/paragraph JSON tree.

    Walks ``json_data["children"]`` -> ``"articles"`` -> ``"paragraphs"`` and,
    for each paragraph, joins the stripped ``"text"`` strings of the entries in
    its ``"text"`` list with a single space.

    Args:
        json_data: Parsed JSON object (a dict). A missing or null
            ``"children"``, ``"articles"``, ``"paragraphs"`` or ``"text"``
            value is treated as empty.

    Returns:
        list[str]: One string per paragraph that contains text, in document
        order. Paragraphs without any usable text are skipped, so an input
        with no text at all yields an empty list (which lets callers detect
        that case with a plain truthiness check).
    """
    text_chunks = []

    for chapter in json_data.get("children") or []:
        for article in chapter.get("articles") or []:
            for paragraph in article.get("paragraphs") or []:
                fragments = [
                    entry["text"].strip()
                    for entry in paragraph.get("text") or []
                    # Ignore malformed entries: non-dict items and entries
                    # whose "text" is missing or not a string.
                    if isinstance(entry, dict) and isinstance(entry.get("text"), str)
                ]
                # Whitespace-only fragments are empty after strip(); leaving
                # them out avoids doubled spaces in the joined paragraph.
                text_content = " ".join(fragment for fragment in fragments if fragment)
                if text_content:
                    text_chunks.append(text_content)

    return text_chunks

# Flatten the parsed JSON into a list of paragraph strings.
documents = extract_text_from_json(data)

# Fail fast when the extractor returned an empty list.
if not documents:
    raise ValueError("No valid text found in the JSON file.")

# Define text splitter
# length_function=len means chunk_size and chunk_overlap are character counts.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)


# Merge all paragraphs into one string, separated by blank lines, and cut it
# into chunks; `pages` holds the resulting chunks.
text_content = "\n\n".join(documents) 
pages = splitter.split_text(text_content)




## Create a dataframe of all the chunks

In [18]:
from langchain.schema import Document
from helpers.df_helpers import documents2Dataframe

# Wrap every chunk in a Document whose metadata records "json" as its source.
# NOTE: this rebinds `documents`, which held plain strings in the previous cell.
documents = [Document(page_content=chunk, metadata={"source": "json"}) for chunk in pages]

# Project helper; the output below shows the resulting columns: text, source, chunk_id.
df = documents2Dataframe(documents)

print(df.shape)
df.head()


(150, 3)


Unnamed: 0,text,source,chunk_id
0,Article 1\nSubject matter\n\n1. The purpose of...,json,542f562d7b814d409a25e8728c0abc58
1,Article 2\nScope\n\n1. This Regulation applies...,json,ac309310e1c747b3a58f998baf718665
2,3. This Regulation does not apply to areas out...,json,0507410e83cb4df198d695b439c8857c
3,5. This Regulation shall not affect the applic...,json,4ecd5a1b2eb541bf836eba7910a795d4
4,11. This Regulation does not preclude the Unio...,json,305f8c1952b34f9cad6217d3592a6640


## Extract Concepts

In [19]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to `True`, both dataframes are regenerated and written to CSV files so that we don't have to compute them again:

        dfne = dataframe of edges

        df = dataframe of chunks


Otherwise, the edge dataframe is read back from `graph.csv` in the output directory.

In [None]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    ## Add the structural columns with the placeholder "Unknown" when they are
    ## missing (presumably df2Graph expects them -- confirm against the helper).
    required_columns = ["chapter", "article", "paragraph"]
    for col in required_columns:
        if col not in df.columns:
            df[col] = "Unknown"

    concepts_list = df2Graph(df, model='zephyr:latest')
    dfg1 = graph2Df(concepts_list)

    ## Create the output folder together with any missing parents; this is a
    ## no-op when the folder already exists, so no separate existence check is needed.
    outputdirectory.mkdir(parents=True, exist_ok=True)

    ## Cache both dataframes so the LLM pass does not have to be repeated
    dfg1.to_csv(outputdirectory / "graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory / "chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory / "graph.csv", sep="|")

## Treat empty strings as missing, drop edges that lack a node or a relation,
## and give every remaining edge a count of 4 (the graph cell later divides
## the count by 4 to obtain the edge weight).
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)

#print(dfg1.shape)
#dfg1.head()



🔹 Processing 150 structured sections in df2Graph...
Here's the output in JSON format based on the provided legal document context:

[
  {
    "node_1": "Article 1",
    "node_2": "Purpose of Regulation",
    "edge": "describes"
  },
  {
    "node_1": "Union",
    "node_2": "Harmonised Rules",
    "edge": "apply to"
  },
  {
    "node_1": "Article 1",

KeyboardInterrupt: 

## Calculating contextual proximity

In [None]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link every pair of distinct nodes that occur in the same text chunk.

    Args:
        df: Edge dataframe with at least the columns ``chunk_id``, ``node_1``
            and ``node_2``.

    Returns:
        A dataframe with the columns ``node_1``, ``node_2``, ``chunk_id`` (the
        comma-joined chunk ids of all co-occurrences), ``count`` (the number
        of co-occurrences) and ``edge`` (always ``"contextual proximity"``).
        Pairs that co-occur exactly once are left out.
    """
    ## One row per (chunk, node) occurrence, whichever side of an edge the node was on
    occurrences = df.melt(
        id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    ## Joining the occurrences with themselves on the chunk id pairs up all
    ## terms occurring in the same text chunk
    pairs = occurrences.merge(occurrences, on="chunk_id", suffixes=("_1", "_2"))

    ## A node paired with itself is a self loop; remove those rows
    same_node = pairs[pairs["node_1"] == pairs["node_2"]].index
    pairs = pairs.drop(index=same_node).reset_index(drop=True)

    ## Per ordered pair: collect the chunk ids and count the co-occurrences
    grouped = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    grouped.columns = ["node_1", "node_2", "chunk_id", "count"]

    ## Empty strings count as missing; pairs with a missing node are discarded
    cleaned = grouped.replace("", np.nan).dropna(subset=["node_1", "node_2"])

    ## Keep only pairs whose count is not 1 and label the relation
    repeated = cleaned[cleaned["count"] != 1]
    return repeated.assign(edge="contextual proximity")


# Derive the co-occurrence edges from the LLM-extracted edge dataframe and
# show the last few rows as a sanity check.
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
6644,voluntary model terms for contracts between pr...,provider of high-risk ai system,"b1fe88c109f84dbea1e2213d0e2a42e3,b1fe88c109f84...",2,contextual proximity
6647,where the commission considers the authorisati...,"5. where, within 15 calendar days of receipt o...","ed3d08d74f514936887ff9b0edaa10ff,ed3d08d74f514...",3,contextual proximity
6650,where the commission considers the authorisati...,commission shall address its decision to the m...,"ed3d08d74f514936887ff9b0edaa10ff,ed3d08d74f514...",2,contextual proximity
6662,widespread infringement,harm,"dd83a85a970b49e3824afb78f46e5523,dd83a85a970b4...",2,contextual proximity
6665,work of a group referred to in paragraph 1,appropriate coordination and cooperation betwe...,"37e6e9d2512a402db5a5763996294241,37e6e9d2512a4...",5,contextual proximity


### Merge both the dataframes

In [None]:
## Stack the LLM-extracted edges on top of the contextual-proximity edges, then
## collapse rows sharing the same (node_1, node_2): chunk ids and edge labels
## are comma-joined and the counts are summed.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ",".join, "count": "sum"})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,(a component of a product or of a system),an ai system,"e2471fe4b8c34d4da50c94b6f7cf2560,e2471fe4b8c34...",contextual proximity,3
1,(a component of a product or of a system),an ai system or a general-purpose ai model,"e2471fe4b8c34d4da50c94b6f7cf2560,e2471fe4b8c34...",contextual proximity,2
2,(a),(b),"b2218fd6411f435eaf18bf3725ef3624,b2218fd6411f4...",contextual proximity,2
3,(a),(c),"b2218fd6411f435eaf18bf3725ef3624,b2218fd6411f4...",contextual proximity,2
4,(a),(d),"b2218fd6411f435eaf18bf3725ef3624,b2218fd6411f4...",contextual proximity,2
...,...,...,...,...,...
2912,where the commission considers the authorisati...,authorisation shall be withdrawn by the market...,ed3d08d74f514936887ff9b0edaa10ff,is withdrawn by,4
2913,where the commission considers the authorisati...,commission shall address its decision to the m...,"ed3d08d74f514936887ff9b0edaa10ff,ed3d08d74f514...",contextual proximity,2
2914,widespread infringement,harm,"dd83a85a970b49e3824afb78f46e5523,dd83a85a970b4...",contextual proximity,2
2915,widespread infringement,union law protecting the interest of individuals,dd83a85a970b49e3824afb78f46e5523,is contrary to,4


## Calculate the NetworkX Graph

In [None]:
## Distinct node labels, in order of first appearance: node_1 values first, then node_2
all_endpoints = pd.concat([dfg["node_1"], dfg["node_2"]])
nodes = pd.unique(all_endpoints)
nodes.shape

(612,)

In [None]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph, keyed by their string form
G.add_nodes_from(str(label) for label in nodes)

## Add edges to the graph: the relation text becomes the edge title and the
## weight is a quarter of the accumulated count
for source, target, relation, pair_count in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(str(source), str(target), title=relation, weight=pair_count / 4)

### Calculate communities for coloring the nodes

In [None]:
# girvan_newman returns a generator; each next() call produces one partition of G.
communities_generator = nx.community.girvan_newman(G)
# The first partition is fetched but not used further in this cell.
top_level_communities = next(communities_generator)
# The second partition is the one used for coloring.
next_level_communities = next(communities_generator)
# Sort the members of each community, then sort the communities themselves,
# so the ordering is stable for a given partition.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  31
[['(a component of a product or of a system)', '(a)', '(a) a strategy for regulatory compliance', '(b)', '(c)', '(d)', '(provider, product manufacturer, deployer, authorised representative, importer or distributor)', '5. where, within 15 calendar days of receipt of the notification referred to in paragraph 3, objections are raised by a member state against an authorisation issued by a market surveillance authority of another member state', 'acceptable residual risk', 'accessibility requirements', 'acting contrary to its obligations pursuant to this regulation', 'adverse legal effect on a person', 'affected persons', 'ai models released under free and open licence', 'ai office', 'ai office and national competent authorities', 'ai office or national competent authorities', 'ai regulatory sandbox', 'ai system', 'ai system for which the provider has concluded that it is not high-risk according to article 6(3)', 'ai system no longer meets requirements set out in 

### Create a dataframe for community colors

In [None]:
import seaborn as sns
# Name of the seaborn palette that colors2Community passes to sns.color_palette.
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Give every community its own color and a 1-based group number.

    Args:
        communities: Sized sequence of communities, each an iterable of node
            names.

    Returns:
        A dataframe with one row per node and the columns ``node``, ``color``
        (hex string) and ``group``. Colors are drawn from the module-level
        ``palette`` and shuffled with the global ``random`` state, so the
        assignment changes from run to run unless that state is seeded.
    """
    ## One hex color per community, in random order
    shuffled = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shuffled)
    ## Walking the shuffled list back to front hands out the colors in the
    ## same order as popping them off its end one by one
    records = [
        {"node": member, "color": shade, "group": number}
        for number, (members, shade) in enumerate(
            zip(communities, reversed(shuffled)), start=1
        )
        for member in members
    ]
    return pd.DataFrame(records)


# Build the node -> (color, group) lookup table and display it.
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,(a component of a product or of a system),#57addb,1
1,(a),#57addb,1
2,(a) a strategy for regulatory compliance,#57addb,1
3,(b),#57addb,1
4,(c),#57addb,1
...,...,...,...
607,national market surveillance authorities,#db57cf,29
608,obligation to put in place a quality managemen...,#db5782,30
609,providers that are financial institutions subj...,#db5782,30
610,participating provider,#5760db,31


### Add colors to the graph

In [None]:
## Copy group and color onto each graph node; the node size is its degree
for name, group_id, shade in zip(colors["node"], colors["group"], colors["color"]):
    attributes = G.nodes[name]
    attributes["group"] = group_id
    attributes["color"] = shade
    attributes["size"] = G.degree[name]

In [None]:
from pyvis.network import Network

## Path of the HTML file to write (despite the name, this is a file, not a directory)
graph_output_directory = "./docs/index.html"

## Make sure the folder that will hold the HTML file exists before it is
## written; this is a no-op when the folder is already there.
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Load nodes, edges and their attributes from the networkx graph
net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html
