## Setup

In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the corpus sub-folder; the output folder reuses the same name
data_dir = "cureus"
out_dir = data_dir

## The source documents are read from this directory
inputdirectory = Path("./data_input") / data_dir
## The generated csv files are written to this directory
outputdirectory = Path("./data_output1") / out_dir

## Load Documents

In [2]:
## Load every document found in the input directory.
## For PDF-only corpora, PyPDFDirectoryLoader(inputdirectory) or
## PyPDFLoader(<path to a single pdf>) can be swapped in here instead.
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split the documents into overlapping chunks; sizes are measured with len()
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

## Preview one chunk. The index is clamped so that a corpus producing fewer
## than four chunks does not raise an IndexError; nothing is printed when
## there are no chunks at all.
preview_index = 3
if pages:
    print(pages[min(preview_index, len(pages) - 1)].page_content)


100%|██████████| 1/1 [00:01<00:00,  1.88s/it]

Number of chunks =  5
（二）排污单位依法终止的；

（三）排污许可证依法被撤销、吊销的；

（四）应当注销的其他情形。

第三十条  有下列情形之一的，可以依法撤销排污许可证，并在全国排污许可证管理信息平台上公告：

（一）超越法定职权审批排污许可证的；

（二）违反法定程序审批排污许可证的；

（三）审批部门工作人员滥用职权、玩忽职守审批排污许可证的；

（四）对不具备申请资格或者不符合法定条件的排污单位审批排污许可证的；

（五）依法可以撤销排污许可证的其他情形。

排污单位以欺骗、贿赂等不正当手段取得排污许可证的，应当依法予以撤销。

第三十一条  上级生态环境主管部门可以对具有审批权限的下级生态环境主管部门的排污许可证审批和执行情况进行监督检查和指导，发现属于《条例》第三十二条规定违法情形的，上级生态环境主管部门应当责令改正。

第三十二条  排污许可证发生遗失、损毁的，排污单位可以向审批部门申请补领。已经办理排污许可证电子证照的排污单位可以根据需要自行打印排污许可证。

第四章  排污管理

第三十三条  排污单位应当依照《条例》规定，严格落实环境保护主体责任，建立健全环境管理制度，按照排污许可证规定严格控制污染物排放。

排污登记单位应当依照国家生态环境保护法律法规规章等管理规定运行和维护污染防治设施，建设规范化排放口，落实排污主体责任，控制污染物排放。

第三十四条  排污单位应当按照排污许可证规定和有关标准规范，依法开展自行监测，保存原始监测记录。原始监测记录保存期限不得少于五年。

排污单位对自行监测数据的真实性、准确性负责，不得篡改、伪造。

第三十五条  实行排污许可重点管理的排污单位，应当依法安装、使用、维护污染物排放自动监测设备，并与生态环境主管部门的监控设备联网。

排污单位发现污染物排放自动监测设备传输数据异常的，应当及时报告生态环境主管部门，并进行检查、修复。

第三十六条  排污单位应当按照排污许可证规定的格式、内容和频次要求记录环境管理台账，主要包括以下内容：

（一）与污染物排放相关的主要生产设施运行情况；发生异常情况的，应当记录原因和采取的措施。

（二）污染防治设施运行情况及管理信息；发生异常情况的，应当记录原因和采取的措施。

（三）污染物实际排放浓度和排放量；发生超标排放情况的，应当记录超标原因和采




## Create a dataframe of all the chunks

In [3]:
## Convert the list of chunks into a dataframe using the project helper
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
## Sanity check: dataframe dimensions, followed by a preview of the first rows
print(df.shape)
df.head()

(5, 3)


Unnamed: 0,text,source,chunk_id
0,排污许可管理办法\n\n（2024年4月1日生态环境部令第32号公布，自2024年7月1日起...,data_input\cureus\P020240408373065009150.txt,26a229efdbdc4edd981b7dfcb1f56fcb
1,第十二条 排污单位承诺执行更加严格的排放限值的，应当在排污许可证副本中记载。\n\n第十三...,data_input\cureus\P020240408373065009150.txt,897f1388a9714c26917587712d099192
2,第二十二条 对具备下列条件的排污单位，颁发排污许可证：\n\n（一）依法取得建设项目环境影...,data_input\cureus\P020240408373065009150.txt,b022491c4f69470294dafbdc770576bb
3,（二）排污单位依法终止的；\n\n（三）排污许可证依法被撤销、吊销的；\n\n（四）应当注销...,data_input\cureus\P020240408373065009150.txt,8fb7334fcd194391a1e869a1121a896e
4,（七）其他排污许可证规定的内容执行情况。\n\n建设项目竣工环境保护设施验收报告中污染源监测...,data_input\cureus\P020240408373065009150.txt,09a9d4bfc9684ba78f6aef52f9e2e241


## Extract Concepts

In [4]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the edge dataframe is regenerated with the LLM, and both dataframes are written to the output directory in CSV format so that we don't have to calculate them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Otherwise, the edge dataframe is read back from the output directory.

In [6]:
## NOTE(review): exploratory call; the `regenerate` cell further down makes the same
## df2Graph call again. The recorded run of this cell ended with a ValueError.
concepts_list = df2Graph(df, model='qwen:4b')

{
    "node_1": "A concept from extracted ontology",
    "node_2": "A related concept from extracted ontology",
    "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"
}
]

ERROR ### Here is the buggy response:  {
    "node_1": "A concept from extracted ontology",
    "node_2": "A related concept from extracted ontology",
    "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"
}
] 


```json
[
    {
        "node_1": "A concept from extracted ontology", 
        "node_2": "A related concept from extracted ontology", 
        "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences" 
    }
]
```

ERROR ### Here is the buggy response:  ```json
[
    {
        "node_1": "A concept from extracted ontology", 
        "node_2": "A related concept from extracted ontology", 
        "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences" 
    }
]
```

ValueError: need at least one array to concatenate

In [36]:
## NOTE(review): ad-hoc inspection of the chunk with row label 5. The dataframe shown
## earlier has only 5 rows (labels 0-4), so this lookup presumably fails on a fresh
## run with the current chunk settings -- confirm or remove.
df['text'][5]

'第二十条  审批部门收到排污单位提交的申请材料后，依照《条例》第九条、第十条要求作出处理。\n\n审批部门可以组织技术机构对排污许可证申请材料进行技术评估，并承担相应费用。技术机构应当遵循科学、客观、公正的原则，提出技术评估意见，并对技术评估意见负责，不得向排污单位收取任何费用。\n\n技术机构开展技术评估应当遵守国家相关法律法规、标准规范，保守排污单位商业秘密。\n\n第二十一条  排污单位采用相应污染防治可行技术的，或者新建、改建、扩建建设项目排污单位采用环境影响报告书（表）批准文件要求的污染防治技术的，审批部门可以认为排污单位采用的污染防治设施或者措施能够达到许可排放浓度要求。\n\n不符合前款规定情形的，排污单位可以通过提供监测数据证明其采用的污染防治设施可以达到许可排放浓度要求。监测数据应当通过使用符合国家有关环境监测、计量认证规定和技术规范的监测设备取得；对于国内首次采用的污染防治技术，应当提供工程试验数据予以证明。\n\n第二十二条  对具备下列条件的排污单位，颁发排污许可证：\n\n（一）依法取得建设项目环境影响报告书（表）批准文件，或者已经办理环境影响登记表备案手续；'

In [None]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    ## Extract concept pairs from the chunks with the LLM and tabulate them as edges
    concepts_list = df2Graph(df, model='qwen:4b')
    dfg1 = graph2Df(concepts_list)
    ## Create the output directory (and any missing parents); no error if it already exists
    outputdirectory.mkdir(parents=True, exist_ok=True)

    ## Cache both dataframes so that the LLM step can be skipped next time
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    ## Reuse the edges cached by a previous run
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

## Treat empty strings as missing values, drop incomplete edges and give every
## LLM-extracted relation a count (weight) of 4.
## The contextual proximity edges calculated later carry their own counts.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

## Calculating contextual proximity

In [None]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link nodes that are mentioned in the same text chunk.

    Every node found in ``node_1`` or ``node_2`` is paired with every other
    node that shares its ``chunk_id``. Pairs are produced in both directions
    (``a -> b`` and ``b -> a``), and pairs seen only once are discarded.

    Args:
        df: Edge dataframe with at least the columns ``node_1``, ``node_2``
            and ``chunk_id``. ``chunk_id`` values must be strings because
            they are joined with commas.

    Returns:
        A new dataframe with the columns ``node_1``, ``node_2``,
        ``chunk_id`` (comma-joined ids of the contributing rows), ``count``
        (number of co-occurrences) and ``edge`` (always the string
        "contextual proximity"). The input dataframe is not modified.
    """
    ## Melt the dataframe into a long list of (chunk_id, node) rows
    nodes_by_chunk = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join with chunk id as the key creates a link between terms occurring in the same text chunk.
    pairs = pd.merge(nodes_by_chunk, nodes_by_chunk, on="chunk_id", suffixes=("_1", "_2"))
    # Drop self loops
    pairs = pairs[pairs["node_1"] != pairs["node_2"]].reset_index(drop=True)
    ## Group and count edges.
    proximity = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    proximity.columns = ["node_1", "node_2", "chunk_id", "count"]
    proximity = proximity.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with 1 count. `assign` returns a new frame, so the label is
    # never written onto a filtered slice (avoids SettingWithCopyWarning).
    return proximity[proximity["count"] != 1].assign(edge="contextual proximity")


## Derive the contextual proximity edges from the LLM-extracted edges and preview the last rows
dfg2 = contextual_proximity(dfg1)
dfg2.tail()

### Merge both the dataframes

In [None]:
## Stack both edge sets, then collapse duplicate (node_1, node_2) pairs:
## chunk ids and edge labels are comma-joined, counts are summed
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ",".join, "count": "sum"})
    .reset_index()
)
dfg

## Calculate the NetworkX Graph

In [None]:
## Every distinct node that appears at either end of an edge
edge_endpoints = [dfg["node_1"], dfg["node_2"]]
nodes = pd.concat(edge_endpoints, axis=0).unique()
nodes.shape

In [None]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph (node ids are always strings)
G.add_nodes_from(str(node) for node in nodes)

## Add edges to the graph; the edge label is stored as `title` and the
## weight is the count scaled down by 4
for source, target, label, count in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(str(source), str(target), title=label, weight=count / 4)

### Calculate communities for coloring the nodes

In [None]:
## girvan_newman returns an iterator over successive levels of communities
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
## Use the second level when there is one. Small graphs can run out of levels
## after the first, so fall back to it instead of raising StopIteration.
next_level_communities = next(communities_generator, top_level_communities)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

### Create a dataframe for community colors

In [None]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Give every community its own colour and group number.

    One hex colour per community is taken from the seaborn palette named by
    the module-level ``palette`` variable. The colours are shuffled with the
    global ``random`` state, so the assignment differs between runs unless
    the caller seeds ``random`` first.

    Args:
        communities: Sized collection of communities, each an iterable of
            node names.

    Returns:
        Dataframe with one row per node and the columns ``node``, ``color``
        (hex string) and ``group`` (1-based community number).
    """
    ## One colour per community, in random order
    hex_colors = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(hex_colors)
    records = []
    for group_id, members in enumerate(communities, start=1):
        shade = hex_colors.pop()
        records.extend(
            {"node": member, "color": shade, "group": group_id} for member in members
        )
    return pd.DataFrame(records)


## Build the node -> colour/group lookup table for the detected communities and display it
colors = colors2Community(communities)
colors

### Add colors to the graph

In [None]:
## Annotate every node with its community group and colour; the node size is its degree
for entry in colors.itertuples(index=False):
    attributes = G.nodes[entry.node]
    attributes['group'] = entry.group
    attributes['color'] = entry.color
    attributes['size'] = G.degree[entry.node]

In [None]:
from pyvis.network import Network

## Path of the html file that the interactive graph is written to
graph_output_directory = "./docs/index.html"
## Make sure the target folder exists before the html file is written into it
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

net = Network(
    notebook=True,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

## Copy nodes, edges and their attributes from the NetworkX graph
net.from_nx(G)
## Layout: force atlas 2 is used; the commented lines are alternative layouts
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
## Show only the physics controls in the rendered page
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory)