## Setup

In [1]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random
regenerate=True

## Input data directory
data_dir = "test"
inputdirectory = Path(f"./data_input/{data_dir}")
## This is where the output csv files will be written
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [6]:
## Dir PDF Loader
# loader = PyPDFDirectoryLoader(inputdirectory)
## File Loader
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = PyPDFLoader("http://localhost:8501/_stcore/upload_file/c603c78c-60f7-4d9b-bb6c-80fec4f9974d/83948113-ac35-43fa-8963-05a4c347d6ac")
# loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
print(pages)


ValueError: Check the url of your file; returned status code 405

## Create a dataframe of all the chunks

In [4]:
from helpers.df_helpers import documents2Dataframe
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(2, 3)


Unnamed: 0,text,source,chunk_id
0,ABSTRACT\n\nZero-day network interruption assa...,data_input\test\zero_day.txt,d2c2552381744b1ea4bb62edbaa5dad9
1,of mounting cyber attacks. With a large portio...,data_input\test\zero_day.txt,bdc68d203d3e4f73994b13d6d0221cd5


## Extract Concepts

In [5]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If regenerate is set to True then the dataframes are regenerated and Both the dataframes are written in the csv format so we dont have to calculate them again. 

        dfne = dataframe of edges

        df = dataframe of chunks


Else the dataframes are read from the output directory

In [8]:
## To regenerate the graph with LLM, set this to True
# regenerate = False

if regenerate:
    concepts_list = df2Graph(df, model='zephyr:latest')
    dfg1 = graph2Df(concepts_list)
    if not os.path.exists(outputdirectory):
        os.makedirs(outputdirectory)
    
    dfg1.to_csv(outputdirectory/"graph.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph.csv", sep="|")

dfg1.replace("", np.nan, inplace=True)
dfg1.dropna(subset=["node_1", "node_2", 'edge'], inplace=True)
dfg1['count'] = 4 
## Increasing the weight of the relation to 4. 
## We will assign the weight of 1 when later the contextual proximity will be calculated.  
print(dfg1.shape)
dfg1.head()

[
   {
       "node_1": "Zero-day network interruption attacks",
       "node_2": "Cyber attacks",
       "edge": "Cyber attacks, also known as zero-day attacks, comprise a regular online security threat that exploit the weaknesses of an organization's infrastructure. These types of attacks have been contributing significantly to data breaches resulting in losses of money, time, and resources."
   },
   {
       "node_1": "Zero-day network interruption attacks",
       "node_2": "Data leaks",
       "edge": "Zero-day attacks are a major contributor to data leaks that have led to losses of money, time, and resources."
   },
   {
       "node_1": "Zero-day network interruption attacks",
       "node_2": "Prevention",
       "edge": "Our solution to detect and prevent such attacks is a desktop application that monitors your network traffic in real time and looks for any anomalies or malicious activity that may be happening and works to minimize the damage caused by prevention."
   },
   {

Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,zero-day network interruption attacks,cyber attacks,"Cyber attacks, also known as zero-day attacks,...",d2c2552381744b1ea4bb62edbaa5dad9,4
1,zero-day network interruption attacks,data leaks,Zero-day attacks are a major contributor to da...,d2c2552381744b1ea4bb62edbaa5dad9,4
2,zero-day network interruption attacks,prevention,Our solution to detect and prevent such attack...,d2c2552381744b1ea4bb62edbaa5dad9,4
3,zero-day network interruption attacks,machine learning,"Keywords: Zero day attacks, Attack detection, ...",d2c2552381744b1ea4bb62edbaa5dad9,4
4,protection,malicious connections,Our software utilizes a network flow collectio...,d2c2552381744b1ea4bb62edbaa5dad9,4


## Calculating contextual proximity

In [9]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    ## Melt the dataframe into a list of nodes
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    dfg_long.drop(columns=["variable"], inplace=True)
    # Self join with chunk id as the key will create a link between terms occuring in the same text chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    # drop self loops
    self_loops_drop = dfg_wide[dfg_wide["node_1"] == dfg_wide["node_2"]].index
    dfg2 = dfg_wide.drop(index=self_loops_drop).reset_index(drop=True)
    ## Group and count edges.
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2.replace("", np.nan, inplace=True)
    dfg2.dropna(subset=["node_1", "node_2"], inplace=True)
    # Drop edges with 1 count
    dfg2 = dfg2[dfg2["count"] != 1]
    dfg2["edge"] = "contextual proximity"
    return dfg2


dfg2 = contextual_proximity(dfg1)
dfg2.tail()

Unnamed: 0,node_1,node_2,chunk_id,count,edge
107,zero-day network interruption attacks,digital infrastructure,"d2c2552381744b1ea4bb62edbaa5dad9,d2c2552381744...",4,contextual proximity
108,zero-day network interruption attacks,machine learning,"d2c2552381744b1ea4bb62edbaa5dad9,d2c2552381744...",4,contextual proximity
109,zero-day network interruption attacks,malicious connections,"d2c2552381744b1ea4bb62edbaa5dad9,d2c2552381744...",4,contextual proximity
110,zero-day network interruption attacks,prevention,"d2c2552381744b1ea4bb62edbaa5dad9,d2c2552381744...",4,contextual proximity
111,zero-day network interruption attacks,protection,"d2c2552381744b1ea4bb62edbaa5dad9,d2c2552381744...",4,contextual proximity


### Merge both the dataframes

In [10]:
dfg = pd.concat([dfg1, dfg2], axis=0)
dfg = (
    dfg.groupby(["node_1", "node_2"])
    .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
    .reset_index()
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,complexity and scale of cyber attacks,zero-day attack,"bdc68d203d3e4f73994b13d6d0221cd5,bdc68d203d3e4...",contextual proximity,3
1,cyber attacks,data leaks,"d2c2552381744b1ea4bb62edbaa5dad9,d2c2552381744...",contextual proximity,2
2,cyber attacks,digital infrastructure,"d2c2552381744b1ea4bb62edbaa5dad9,d2c2552381744...",With a large portion of our public and private...,6
3,cyber attacks,machine learning,"d2c2552381744b1ea4bb62edbaa5dad9,d2c2552381744...",contextual proximity,2
4,cyber attacks,malicious connections,"d2c2552381744b1ea4bb62edbaa5dad9,d2c2552381744...",contextual proximity,2
5,cyber attacks,prevention,"d2c2552381744b1ea4bb62edbaa5dad9,d2c2552381744...",contextual proximity,2
6,cyber attacks,protection,"d2c2552381744b1ea4bb62edbaa5dad9,d2c2552381744...",contextual proximity,2
7,cyber attacks,vulnerabilities in hardware and software,bdc68d203d3e4f73994b13d6d0221cd5,protecting computer systems and networks from ...,4
8,cyber attacks,zero-day attack,"bdc68d203d3e4f73994b13d6d0221cd5,bdc68d203d3e4...",contextual proximity,3
9,cyber attacks,zero-day network interruption attacks,"d2c2552381744b1ea4bb62edbaa5dad9,d2c2552381744...",contextual proximity,8


## Calculate the NetworkX Graph

In [11]:
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(15,)

In [12]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
for node in nodes:
    G.add_node(
        str(node)
    )

## Add edges to the graph
for index, row in dfg.iterrows():
    G.add_edge(
        str(row["node_1"]),
        str(row["node_2"]),
        title=row["edge"],
        weight=row['count']/4
    )

### Calculate communities for coloring the nodes

In [13]:
communities_generator = nx.community.girvan_newman(G)
top_level_communities = next(communities_generator)
next_level_communities = next(communities_generator)
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  3
[['complexity and scale of cyber attacks', 'cyber security landscape', 'no defenses or safeguards', 'unknown vulnerabilities', 'vulnerabilities in hardware and software', 'zero-day attack'], ['cyber attacks', 'data leaks', 'digital infrastructure', 'machine learning', 'malicious connections', 'prevention', 'protection', 'zero-day network interruption attacks'], ['irreparable damage']]


### Create a dataframe for community colors

In [14]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    ## Define a color palette
    p = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(p)
    rows = []
    group = 0
    for community in communities:
        color = p.pop()
        group += 1
        for node in community:
            rows += [{"node": node, "color": color, "group": group}]
    df_colors = pd.DataFrame(rows)
    return df_colors


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,complexity and scale of cyber attacks,#5f57db,1
1,cyber security landscape,#5f57db,1
2,no defenses or safeguards,#5f57db,1
3,unknown vulnerabilities,#5f57db,1
4,vulnerabilities in hardware and software,#5f57db,1
5,zero-day attack,#5f57db,1
6,cyber attacks,#db5f57,2
7,data leaks,#db5f57,2
8,digital infrastructure,#db5f57,2
9,machine learning,#db5f57,2


### Add colors to the graph

In [15]:
for index, row in colors.iterrows():
    G.nodes[row['node']]['group'] = row['group']
    G.nodes[row['node']]['color'] = row['color']
    G.nodes[row['node']]['size'] = G.degree[row['node']]

In [16]:
from pyvis.network import Network

graph_output_directory = "./docs/index.html"

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

net.from_nx(G)
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
net.show_buttons(filter_=["physics"])

net.show(graph_output_directory, notebook=False)

./docs/index.html
