## Setup

In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Name of the dataset folder; the same name is reused for the output folder
data_dir = "cureus"
out_dir = data_dir

## Source documents are read from ./data_input/<data_dir>
inputdirectory = Path("./data_input") / data_dir
## Generated csv files are written to ./data_output/<out_dir>
outputdirectory = Path("./data_output") / out_dir

## Load Documents

In [2]:
## Load everything found under the input directory.
## Alternative loaders, kept for reference:
# loader = PyPDFDirectoryLoader(inputdirectory)
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Cut the documents into chunks of at most 1500 characters,
## with 150 characters shared between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))

## Preview one chunk as a sanity check. Chunk 3 is shown when it exists;
## a small corpus falls back to its last chunk instead of raising IndexError.
if pages:
    print(pages[min(3, len(pages) - 1)].page_content)


100%|██████████| 1/1 [00:15<00:00, 15.43s/it]

Number of chunks =  27
tion of random horizontal flip for the image while not ap-

plying any augmentation to the text (Wang et al. 2020; Chen

et al. 2022; Ding et al. 2021). In this study, we comprehen-sively evaluate various data augmentation strategies, thereby

deriving a powerful data augmentation strategy for TBPS.2) Loss function. Designing rational and practical loss

functions is critical to improving performance and has been

an increasingly active research direction in TBPS commu-

nity (Zhang and Lu 2018; Bai et al. 2023a). We take CLIP as

a hotbed and conduct a series of probing studies to analyze

the effectiveness of various loss functions in TBPS. Unlike

the loss functions in existing TBPS methods that are well-

designed mainly from exploring the TBPS task and belong

to the task-oriented loss functions, the loss functions probed

in this study are primarily inspired by VLP communities and

are pretty generic to various cross-modal tasks.These empirical studies abov




In [6]:
import importlib.util
import sys
import os

# Root of the knowledge_graph project checkout. The default is the original
# machine-specific location; set the KG_PROJECT_PATH environment variable to
# run the notebook from a different checkout without editing this cell.
project_path = os.environ.get(
    "KG_PROJECT_PATH",
    'D:/知识图谱与认知智能/大作业/knowledge_graph-main/knowledge_graph-main',
)
# Add the project path to sys.path so that `helpers` can be imported by name
if project_path not in sys.path:
    sys.path.append(project_path)

# Load the df_helpers module directly from its file location
module_path = os.path.join(project_path, 'helpers', 'df_helpers.py')
if not os.path.isfile(module_path):
    # Fail early with the path that was tried, rather than deep inside the loader
    raise FileNotFoundError(f"helpers/df_helpers.py not found at {module_path!r}")
spec = importlib.util.spec_from_file_location("helpers.df_helpers", module_path)
df_helpers = importlib.util.module_from_spec(spec)
spec.loader.exec_module(df_helpers)

# Functions can now be taken from the loaded module, e.g.
#documents2Dataframe = df_helpers.documents2Dataframe
print("Module imported successfully")


Module imported successfully


## Create a dataframe of all the chunks

In [7]:

## Turn the list of chunks into a dataframe with one row per chunk.
## In the captured output the columns are text, source and chunk_id.
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
## Shape is printed so that it is visible next to the head() preview below
print(df.shape)
df.head()

(27, 3)


Unnamed: 0,text,source,chunk_id
0,AbstractText-based Person Search (TBPS) aims t...,data_input\cureus\3.txt,8cae603439ee4c8abc10bb26dcf67cd4
1,available at https://github.com/Flame-Chasers/...,data_input\cureus\3.txt,793e33ea22794c428acd1099139baada
2,for solving these tasks. Considering that the ...,data_input\cureus\3.txt,468073942e8a48ff95077313de58f404
3,tion of random horizontal flip for the image w...,data_input\cureus\3.txt,3584388e37e1420e8413c2d91b216d67
4,"augmentations, and loss functions into CLIP. T...",data_input\cureus\3.txt,34a7ec0f07fe4a5196655586cb15bea5


## Extract Concepts

In [8]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to True, the dataframes are regenerated and both of them are written in csv format so we don't have to calculate them again.

        dfg1 = dataframe of edges

        df = dataframe of chunks


Otherwise, the dataframes are read from the output directory.

In [9]:
## To regenerate the graph with LLM, set this to True
regenerate = True

## Both branches must use the same file names, otherwise the graph written by
## a regenerate run cannot be found again when regenerate is set to False.
graph_csv = outputdirectory / "3_graph.csv"
chunks_csv = outputdirectory / "3_chunks.csv"

if regenerate:
    concepts_list = df2Graph(df, model='gpt-4')
    dfg1 = graph2Df(concepts_list)
    ## Create the output folder (and any missing parents) if needed
    outputdirectory.mkdir(parents=True, exist_ok=True)

    dfg1.to_csv(graph_csv, sep="|", index=False)
    df.to_csv(chunks_csv, sep="|", index=False)
else:
    dfg1 = pd.read_csv(graph_csv, sep="|")

## Treat empty strings as missing, drop incomplete edges, then
## increase the weight of the relation to 4.
## We will assign the weight of 1 when later the contextual proximity will be calculated.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

2


  warn_deprecated(


(254, 5)


Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,abstracttext-based person search (tbps),natural language descriptions,TBPS aims to retrieve the person images using ...,8cae603439ee4c8abc10bb26dcf67cd4,4
1,abstracttext-based person search (tbps),clip,facing the rise of research on the CLIP-based ...,8cae603439ee4c8abc10bb26dcf67cd4,4
2,clip,tbps,conduct a comprehensive empirical study of CLI...,8cae603439ee4c8abc10bb26dcf67cd4,4
3,clip,downstream tasks,performed over various cross-modal downstream ...,8cae603439ee4c8abc10bb26dcf67cd4,4
4,tbps,clip,make the first attempt to conduct a comprehens...,8cae603439ee4c8abc10bb26dcf67cd4,4


## Calculating contextual proximity

In [10]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Link nodes that are mentioned in the same text chunk.

    Every node found in ``node_1`` or ``node_2`` is paired with every other
    node sharing its ``chunk_id``. Pairs are directed, so both ``(a, b)`` and
    ``(b, a)`` appear. Pairs of a node with itself are removed, and so are
    pairs that co-occur only once.

    Args:
        df: Edge dataframe with at least the columns ``chunk_id``,
            ``node_1`` and ``node_2``. It is not modified.

    Returns:
        A dataframe with the columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined chunk ids, one per co-occurrence), ``count`` (number
        of co-occurrences) and ``edge`` (always ``"contextual proximity"``).
        The index is the one left by the grouping step, so it has gaps where
        single-count pairs were filtered out.
    """
    ## Long form: one row per (chunk_id, node) mention
    node_mentions = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])

    ## Self join with chunk id as the key links terms occurring in the same text chunk
    node_pairs = pd.merge(
        node_mentions, node_mentions, on="chunk_id", suffixes=("_1", "_2")
    )
    ## Drop self loops
    node_pairs = node_pairs[node_pairs["node_1"] != node_pairs["node_2"]].reset_index(
        drop=True
    )

    ## Group and count edges
    pair_counts = (
        node_pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    pair_counts.columns = ["node_1", "node_2", "chunk_id", "count"]
    pair_counts = pair_counts.replace("", np.nan).dropna(subset=["node_1", "node_2"])

    ## Drop edges with 1 count. assign() builds a new frame, so the edge label
    ## is not written onto a filtered slice (avoids SettingWithCopyWarning).
    return pair_counts[pair_counts["count"] != 1].assign(edge="contextual proximity")


## Co-occurrence edges derived from the concept edges in dfg1
dfg2 = contextual_proximity(dfg1)
## Show the last rows only; the full frame is large
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
4103,wang,tbps-clip,"559b39db23c6432381c2a4c0d0166e0a,559b39db23c64...",3,contextual proximity
4105,wang,vit-b/16,"559b39db23c6432381c2a4c0d0166e0a,559b39db23c64...",2,contextual proximity
4106,wang,vit-b/32,"559b39db23c6432381c2a4c0d0166e0a,559b39db23c64...",2,contextual proximity
4117,word,sentence,"ce8d1721d4a4428ebecff43790d12d9a,ce8d1721d4a44...",3,contextual proximity
4120,word,trivialaugment,"ce8d1721d4a4428ebecff43790d12d9a,ce8d1721d4a44...",2,contextual proximity


### Merge both the dataframes

In [11]:
## Stack the concept edges on top of the contextual-proximity edges, then fold
## duplicate (node_1, node_2) pairs into one row: chunk ids and edge labels are
## comma-joined and the counts are added up.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,"(in, tn)",c-itc,"bca3cb26c7fc44a08194fd70b87a6a27,bca3cb26c7fc4...",contextual proximity,2
1,"(ip, tp)","(in, tn)",bca3cb26c7fc44a08194fd70b87a6a27,"The paired cross-modal data (Ip, Tp) and (In, ...",4
2,"(ip, tp)",c-itc,"bca3cb26c7fc44a08194fd70b87a6a27,bca3cb26c7fc4...",contextual proximity,2
3,ablation studies,comparisons,"eb7f8f3132364c8ab3ee040bef09dcf6,eb7f8f3132364...",contextual proximity,2
4,ablation studies,experimental analyses,"eb7f8f3132364c8ab3ee040bef09dcf6,eb7f8f3132364...",contextual proximity,3
...,...,...,...,...,...
1663,wang,vit-b/16,"559b39db23c6432381c2a4c0d0166e0a,559b39db23c64...",contextual proximity,2
1664,wang,vit-b/32,"559b39db23c6432381c2a4c0d0166e0a,559b39db23c64...",contextual proximity,2
1665,word,sentence,"ce8d1721d4a4428ebecff43790d12d9a,ce8d1721d4a44...",contextual proximity,3
1666,word,trivialaugment,"ce8d1721d4a4428ebecff43790d12d9a,ce8d1721d4a44...",contextual proximity,2


## Calculate the NetworkX Graph

In [12]:
## Every distinct name that appears on either end of an edge
nodes = pd.concat([dfg[column] for column in ("node_1", "node_2")], axis=0).unique()
nodes.shape

(217,)

In [13]:
import networkx as nx

G = nx.Graph()

## Add nodes to the graph, keyed by their string form
G.add_nodes_from(str(node) for node in nodes)

## Add edges to the graph: the edge text becomes the title and
## a quarter of the count becomes the weight
for source, target, relation, count in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    G.add_edge(str(source), str(target), title=relation, weight=count / 4)

### Calculate communities for coloring the nodes

In [14]:
## The generator is advanced twice: the first result is kept under its own
## name, the second one is the partition used for coloring below.
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)

## Sort the members of each community, then sort the communities themselves
communities = sorted(sorted(members) for members in next_level_communities)
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  5
[['(in, tn)', '(ip, tp)', 'abstracttext-based person search (tbps)', 'academia and industry', 'albef', 'bai et al.', 'baseline', 'bert', 'c-itc', 'cfine', 'clip', 'clip-adapter', 'clip-based tbps method', 'cmpm (zhang and lu 2018)', 'coop', 'cross-modal correspondence', 'cross-modal tasks', 'cross-modality regularization', 'cyclip (goel et al. 2022)', 'data augmentation', 'declip', 'dkl(pi,j∥ˆqi,j) + dkl(pj,i∥ˆqj,i)', 'dkl(p∥q)', 'downstream tasks', 'dropping operation', 'empirical study', 'empirical study on data augmentation and loss function', 'epochalbef', 'few-shot capabilities', 'few-shot clip variants', 'few-shot tbps', 'figure 3', 'filip', 'geometry of the resulting representation space', 'grant nsfc 62002252', 'icfg-pedes', 'image encoder', 'image-text contrastive loss (c-itc)', 'image-text retrieval', 'in-modality regularization', 'irra', 'ivt', 'jiang', 'jiang and ye 2023', 'lcc−it c', 'lci−it c', 'learned representations', 'lgur', 'lr−it c', 'lstm

### Create a dataframe for community colors

In [15]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Give every community its own color and a 1-based group number.

    One hex color per community is taken from the module-level ``palette``
    and shuffled, so the color a community receives changes between runs
    unless ``random`` is seeded.

    Args:
        communities: Sequence of communities, each an iterable of node names.

    Returns:
        A dataframe with one row per node and the columns ``node``,
        ``color`` and ``group``.
    """
    ## Define a color palette with as many entries as there are communities
    shuffled_hex = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(shuffled_hex)
    ## Colors are handed out from the end of the shuffled list
    records = [
        {"node": member, "color": hex_color, "group": group_id}
        for group_id, (members, hex_color) in enumerate(
            zip(communities, reversed(shuffled_hex)), start=1
        )
        for member in members
    ]
    return pd.DataFrame(records)


## One row per node with the color and group number of its community
colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,"(in, tn)",#5784db,1
1,"(ip, tp)",#5784db,1
2,abstracttext-based person search (tbps),#5784db,1
3,academia and industry,#5784db,1
4,albef,#5784db,1
...,...,...,...
212,vlp models,#57db94,3
213,clip+aug,#b9db57,4
214,r-itc,#b9db57,4
215,multiple losses,#db5f57,5


### Add colors to the graph

In [16]:
## Copy group and color onto each graph node; the node size is its degree
for node, group, color in zip(colors["node"], colors["group"], colors["color"]):
    G.nodes[node].update(group=group, color=color, size=G.degree[node])

In [17]:
from pyvis.network import Network

## Despite its name, this is the path of the generated HTML file
graph_output_directory = "./docs/3_index.html"

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

net.from_nx(G)
## Alternative physics layouts are kept commented out for reference
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])
html_content = net.generate_html()

## open() does not create missing folders, so make sure ./docs exists first
Path(graph_output_directory).parent.mkdir(parents=True, exist_ok=True)

# Write the HTML content to a file with UTF-8 encoding
with open(graph_output_directory, 'w', encoding='utf-8') as f:
    f.write(html_content)
# net.show(graph_output_directory, notebook=False)

In [None]:
import pandas as pd

## Save node information: one row per node with the attributes set on the graph
nodes_data = [
    {
        'node': name,
        'group': attrs.get('group'),
        'color': attrs.get('color'),
        'size': attrs.get('size'),
    }
    for name, attrs in G.nodes(data=True)
]
df_nodes = pd.DataFrame(nodes_data)
df_nodes.to_csv("nodes_data.csv", index=False)

## Save edge information: both endpoints plus the edge weight
edges_data = [
    {'node_1': source, 'node_2': target, 'weight': attrs.get('weight')}
    for source, target, attrs in G.edges(data=True)
]
df_edges = pd.DataFrame(edges_data)
df_edges.to_csv("edges_data.csv", index=False)

## Save community membership, numbering the communities from 1
communities_data = [
    {'node': member, 'group': group_id}
    for group_id, members in enumerate(communities, 1)
    for member in members
]
df_communities = pd.DataFrame(communities_data)
df_communities.to_csv("communities_data.csv", index=False)
