## Setup

In [None]:
!pip install langchain

In [None]:
!pip install -U langchain-community

In [None]:
!pip install unstructured

In [None]:
!pip install libmagic

In [None]:
!pip install unstructured[pdf]

In [2]:
import pandas as pd
import numpy as np
import os
from langchain.document_loaders import PyPDFLoader, UnstructuredPDFLoader, PyPDFium2Loader
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path
import random

## Input data directory: documents are read from ./data_input/<data_dir>
## Alternative dataset, kept for reference:
# data_dir = "cureus"
data_dir = "hamas-economist"
inputdirectory = Path(f"./data_input/{data_dir}")
## Output directory: the csv files are written to ./data_output/<out_dir>
## (same folder name as the input dataset)
out_dir = data_dir
outputdirectory = Path(f"./data_output/{out_dir}")

## Load Documents

In [5]:
## Load everything found under the input directory.
## Alternative loaders, kept for reference:
# loader = PyPDFDirectoryLoader(inputdirectory)
# loader = PyPDFLoader("./data/MedicalDocuments/orf-path_health-n1.pdf")
loader = DirectoryLoader(inputdirectory, show_progress=True)
documents = loader.load()

## Split into chunks of at most 1500 units with a 150-unit overlap;
## length is measured with len(), i.e. in characters.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)

pages = splitter.split_documents(documents)
print("Number of chunks = ", len(pages))
## Preview one chunk. The index is clamped so short inputs (fewer than six
## chunks) do not raise IndexError, and nothing is printed for an empty input.
if pages:
    print(pages[min(5, len(pages) - 1)].page_content)


  0%|          | 0/1 [00:00<?, ?it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
100%|██████████| 1/1 [00:00<00:00, 54.35it/s]

Number of chunks =  6
So far Hamas seems financially bulletproof. Israel has managed to inflict little harm on either its income or its savings; Turkey’s banks have been unco-operative. America’s numerous sanctions are less effective if their targets can keep cash outside its banking system. And Hamas hides its companies well. “Every time you think you’ve got a big fish, it changes its name,” despairs one ex-Treasury official.

In fact, the risk is that Hamas’s finances will improve. As Israel steps up its attacks on Gaza, countries with pro-Palestinian populations may make life even easier for Hamas’s bankers. For months rumours have circulated that some civil servants in Mr Erdogan’s economic ministry are co-ordinating with Hamas’s finance office.

For Israel, the prospect of Hamas growing richer despite the war would be a bitter failure. With its wealth and financial roots intact, Hamas—or a similar organisation—might re-emerge and flourish anew from the destruction. While Gazans ha




## Create a dataframe of all the chunks

In [None]:
!pip install yachalk

In [6]:
from helpers.df_helpers import documents2Dataframe
## Turn the list of chunks into a dataframe. In the captured output below it has
## one row per chunk and three columns: text, source and chunk_id.
df = documents2Dataframe(pages)
print(df.shape)
df.head()

(6, 3)


Unnamed: 0,text,source,chunk_id
0,Inside Hamas’s sprawling financial empire\n\nW...,data_input\hamas-economist\hamas-economist.txt,760165da660e4cb6919f696042512ac2
1,Hamas’s income pays for everything from school...,data_input\hamas-economist\hamas-economist.txt,22b259079a794da2961bfc6ba85aab7b
2,Dodging American sanctions requires some ingen...,data_input\hamas-economist\hamas-economist.txt,1f830bf583c64d58bd16743a06aaee08
3,Can the revenue streams still flowing to Hamas...,data_input\hamas-economist\hamas-economist.txt,2c5914d3324f414bbc913e65864a803b
4,"Meanwhile, Turkey’s banking system helps Hamas...",data_input\hamas-economist\hamas-economist.txt,c287c746668948718a578bab547e4c85


## Extract Concepts

In [7]:
## This function uses the helpers/prompt function to extract concepts from text
from helpers.df_helpers import df2Graph
from helpers.df_helpers import graph2Df

If `regenerate` is set to `True`, the dataframes are regenerated and both of them are written to disk in CSV format so we don't have to calculate them again.

        dfne = dataframe of edges

        df = dataframe of chunks


Otherwise the dataframe of edges is read back from the output directory.

In [8]:
## To regenerate the graph with LLM, set this to True
regenerate = True

if regenerate:
    concepts_list = df2Graph(df, model='gpt-4.1-mini')
    dfg1 = graph2Df(concepts_list)
    ## exist_ok=True replaces the separate exists() check: no window between
    ## check and create, and re-running the cell is a no-op for the directory.
    os.makedirs(outputdirectory, exist_ok=True)

    dfg1.to_csv(outputdirectory/"graph_IP.csv", sep="|", index=False)
    df.to_csv(outputdirectory/"chunks_IP.csv", sep="|", index=False)
else:
    dfg1 = pd.read_csv(outputdirectory/"graph_IP.csv", sep="|")

## Treat blank strings as missing, keep only rows that have both nodes and an
## edge, and give every extracted relation a weight of 4.
## (Contextual-proximity edges computed later are weighted by co-occurrence count.)
## Built as a new frame rather than mutated in place, so the column assignment
## never lands on a filtered intermediate.
dfg1 = (
    dfg1.replace("", np.nan)
    .dropna(subset=["node_1", "node_2", "edge"])
    .assign(count=4)
)
print(dfg1.shape)
dfg1.head()

(104, 5)


Unnamed: 0,node_1,node_2,edge,chunk_id,count
0,hamas,financial empire,Hamas has a sprawling financial empire that su...,760165da660e4cb6919f696042512ac2,4
1,israel,hamas's finances,Israel is powerless to dismantle Hamas's finan...,760165da660e4cb6919f696042512ac2,4
2,istanbul,bosporus,The Bosporus is viewed from Istanbul’s glitzie...,760165da660e4cb6919f696042512ac2,4
3,hamas's financiers,istanbul restaurants,Hamas’s financiers frequent some of Istanbul’s...,760165da660e4cb6919f696042512ac2,4
4,america,man,America has imposed sanctions on a man for fun...,760165da660e4cb6919f696042512ac2,4


## Calculating contextual proximity

In [None]:
def contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Build "contextual proximity" edges between nodes that share a text chunk.

    Args:
        df: Edge table with at least ``node_1``, ``node_2`` and ``chunk_id``
            columns. ``chunk_id`` values must be strings because they are
            concatenated with ``","``.

    Returns:
        A new DataFrame with columns ``node_1``, ``node_2``, ``chunk_id``
        (comma-joined ids, one per co-occurrence), ``count`` (number of
        co-occurrences) and ``edge`` (always ``"contextual proximity"``).
        Every pair appears in both directions, and pairs that co-occur only
        once are dropped. ``df`` itself is not modified.
    """
    ## Melt the dataframe into a long list of (chunk_id, node) mentions
    mentions = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    ).drop(columns=["variable"])
    # Self join on chunk id links every pair of terms occurring in the same text chunk.
    pairs = pd.merge(mentions, mentions, on="chunk_id", suffixes=("_1", "_2"))
    # Drop self loops
    pairs = pairs[pairs["node_1"] != pairs["node_2"]].reset_index(drop=True)
    ## Group and count edges.
    edges = (
        pairs.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    # Flatten the two-level column index produced by the list aggregation.
    edges.columns = ["node_1", "node_2", "chunk_id", "count"]
    edges = edges.replace("", np.nan).dropna(subset=["node_1", "node_2"])
    # Drop edges with a count of 1. assign() returns a new frame, so the label
    # is never written onto a filtered view of another frame.
    return edges[edges["count"] != 1].assign(edge="contextual proximity")


dfg2 = contextual_proximity(dfg1)
dfg2.to_csv('context_prox_df_IP.csv', index=False)
## Kept as the last expression of the cell so the preview is actually rendered;
## an expression in the middle of a cell is evaluated and silently discarded.
dfg2.tail()

In [10]:
import pandas as pd

## The ten contextual-proximity edges with the highest co-occurrence count.
top_10 = dfg2.sort_values(by="count", ascending=False).head(10)

## Plain-text print wraps the wide chunk_id column; see the narrower view in the next cell.
print(top_10)

                       node_1                    node_2  \
1544                    hamas                    israel   
2410                   israel                     hamas   
1498                    hamas                 companies   
740                 companies                     hamas   
1573                    hamas                     redin   
3333                    redin                     hamas   
1173                    firms                     hamas   
1513                    hamas                     firms   
1529                    hamas  hamas's financial empire   
3773  turkey's banking system                     hamas   

                                               chunk_id  count  \
1544  760165da660e4cb6919f696042512ac2,760165da660e4...     31   
2410  760165da660e4cb6919f696042512ac2,760165da660e4...     31   
1498  1f830bf583c64d58bd16743a06aaee08,1f830bf583c64...     19   
740   1f830bf583c64d58bd16743a06aaee08,1f830bf583c64...     19   
1573  1f830bf583c64d

In [12]:
## Same top-10 rows, restricted to the node pair and its count so each row fits on one line.
print(top_10[['node_1', 'node_2', 'count']])


                       node_1                    node_2  count
1544                    hamas                    israel     31
2410                   israel                     hamas     31
1498                    hamas                 companies     19
740                 companies                     hamas     19
1573                    hamas                     redin     14
3333                    redin                     hamas     14
1173                    firms                     hamas     14
1513                    hamas                     firms     14
1529                    hamas  hamas's financial empire     12
3773  turkey's banking system                     hamas     12


### Merge both the dataframes

In [13]:
## Stack the extracted edges and the contextual-proximity edges, then collapse
## rows that share the same ordered (node_1, node_2) pair: chunk ids and edge
## labels are comma-joined, counts are summed.
dfg = (
    pd.concat([dfg1, dfg2], axis=0)
    .groupby(["node_1", "node_2"], as_index=False)
    .agg(
        chunk_id=("chunk_id", ",".join),
        edge=("edge", ','.join),
        count=("count", "sum"),
    )
)
dfg

Unnamed: 0,node_1,node_2,chunk_id,edge,count
0,$100m a year,american officials,"22b259079a794da2961bfc6ba85aab7b,22b259079a794...",contextual proximity,2
1,$100m a year,goods,"22b259079a794da2961bfc6ba85aab7b,22b259079a794...",contextual proximity,2
2,$100m a year,hamas's income,"22b259079a794da2961bfc6ba85aab7b,22b259079a794...",contextual proximity,2
3,$100m a year,import taxes,"22b259079a794da2961bfc6ba85aab7b,22b259079a794...",contextual proximity,2
4,$100m a year,israel,"22b259079a794da2961bfc6ba85aab7b,22b259079a794...",contextual proximity,3
...,...,...,...,...,...
1369,zaher jabarin,recep tayyip erdogan,"2c5914d3324f414bbc913e65864a803b,2c5914d3324f4...",contextual proximity,6
1370,zaher jabarin,revenue streams,"2c5914d3324f414bbc913e65864a803b,2c5914d3324f4...",contextual proximity,4
1371,zaher jabarin,shelter,"2c5914d3324f414bbc913e65864a803b,2c5914d3324f4...",contextual proximity,2
1372,zaher jabarin,turkey,"2c5914d3324f414bbc913e65864a803b,2c5914d3324f4...",contextual proximity,2


## Calculate the NetworkX Graph

In [15]:
## Every distinct node name that appears on either end of an edge.
nodes = pd.concat([dfg['node_1'], dfg['node_2']], axis=0).unique()
nodes.shape

(137,)

In [16]:
import networkx as nx
G = nx.Graph()

## Add nodes to the graph
G.add_nodes_from(str(node) for node in nodes)

## Add edges to the graph.
## dfg is keyed on the ordered pair, so it can hold both an (a, b) and a (b, a)
## row. G is undirected, and add_edge() on an existing edge overwrites its
## attributes, so a later, lighter row would replace the title/weight of a
## heavier one (typically the row carrying the extracted relation). Keep the
## heavier of the two instead.
for node_a, node_b, title, count in zip(
    dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
):
    node_a, node_b = str(node_a), str(node_b)
    weight = count / 4
    if G.has_edge(node_a, node_b) and G[node_a][node_b]["weight"] >= weight:
        continue
    G.add_edge(node_a, node_b, title=title, weight=weight)

### Calculate communities for coloring the nodes

In [17]:
## girvan_newman gives back an iterator; each next() call below pulls one more result from it.
communities_generator = nx.community.girvan_newman(G)
## First result: stored, but not used by the rest of this cell.
top_level_communities = next(communities_generator)
## Second result: the one used for coloring the nodes.
next_level_communities = next(communities_generator)
## Sort the members inside each community, then the communities themselves,
## so the list has a deterministic order.
communities = sorted(map(sorted, next_level_communities))
print("Number of Communities = ", len(communities))
print(communities)

Number of Communities =  3
[['$100m a year', '$360m each year', '$750m per year', 'american officials', 'american sanctions', 'america’s sanctions', 'annual income', 'attacks on gaza', 'ayatollahs', 'basic necessities', 'biggest friendly government', 'board seats', 'border', 'bosporus', 'cooperation with israel', 'coordination with hamas finance office', 'effectiveness', 'egypt', 'erdogan’s economic ministry civil servants', 'financial empire', 'financially bulletproof', 'friendly governments', "funding for hamas's stockpile of arms and fuel", 'gaza', 'gazans', 'goods', 'goods brought into gaza', 'hamas financiers', 'hamas income', 'hamas money', 'hamas savings', "hamas's finances", "hamas's financial base", "hamas's financial empire", "hamas's financiers", "hamas's income", "hamas's physical force", 'import taxes', 'income stream', 'iran', 'israel', 'israel and allies', 'istanbul restaurants', 'larger income stream', 'man', 'missiles', 'money for hamas', 'money-launderers and mining c

### Create a dataframe for community colors

In [None]:
!pip install seaborn

In [18]:
import seaborn as sns
palette = "hls"

## Now add these colors to communities and make another dataframe
def colors2Community(communities) -> pd.DataFrame:
    """Give every community one color and a 1-based group number.

    One hex color per community is taken from the module-level seaborn
    ``palette`` and shuffled with ``random.shuffle``, so the color-to-community
    assignment differs between runs unless ``random`` is seeded.

    Returns a DataFrame with one row per node and the columns ``node``,
    ``color`` and ``group``.
    """
    hex_colors = sns.color_palette(palette, len(communities)).as_hex()
    random.shuffle(hex_colors)
    # Colors are handed out from the end of the shuffled list.
    records = [
        {"node": member, "color": shade, "group": group_no}
        for group_no, (members, shade) in enumerate(
            zip(communities, reversed(hex_colors)), start=1
        )
        for member in members
    ]
    return pd.DataFrame(records)


colors = colors2Community(communities)
colors

Unnamed: 0,node,color,group
0,$100m a year,#5f57db,1
1,$360m each year,#5f57db,1
2,$750m per year,#5f57db,1
3,american officials,#5f57db,1
4,american sanctions,#5f57db,1
...,...,...,...
132,turkish government,#57db5f,2
133,turkish officials,#57db5f,2
134,western regulators,#57db5f,2
135,zaher jabarin,#57db5f,2


### Add colors to the graph

In [19]:
## Copy group and color onto each graph node, and size the node by its degree.
for entry in colors.itertuples(index=False):
    node_attrs = G.nodes[entry.node]
    node_attrs['group'] = entry.group
    node_attrs['color'] = entry.color
    node_attrs['size'] = G.degree[entry.node]

In [None]:
!pip install pyvis

In [20]:
import os
# Make sure the folder that will receive the rendered HTML exists.
os.makedirs('./docs', exist_ok=True)
from pyvis.network import Network

# NOTE: despite the name, this is the path of the output HTML file, not a directory.
graph_output_directory = "./docs/index_IP.html"

net = Network(
    notebook=False,
    # bgcolor="#1a1a1a",
    cdn_resources="remote",
    height="900px",
    width="100%",
    select_menu=True,
    # font_color="#cccccc",
    filter_menu=False,
)

# Load the NetworkX graph G into the pyvis network.
net.from_nx(G)
# Layout: force_atlas_2based is the active choice; the commented-out calls are
# alternative layouts kept for reference.
# net.repulsion(node_distance=150, spring_length=400)
net.force_atlas_2based(central_gravity=0.015, gravity=-31)
# net.barnes_hut(gravity=-18100, central_gravity=5.05, spring_length=380)
# Presumably adds the physics controls to the rendered page — TODO confirm against pyvis docs.
net.show_buttons(filter_=["physics"])

# Render to the HTML file path defined above.
net.show(graph_output_directory, notebook=False)

./docs/index_IP.html


In [21]:
# Get node degrees as a dictionary: {node: degree}
node_degrees = dict(G.degree())

# Sort nodes by degree in descending order and get the top 10
top_10_nodes = sorted(node_degrees.items(), key=lambda x: x[1], reverse=True)[:10]

# Print the results
print("Top 10 nodes by size (degree):")
for node, degree in top_10_nodes:
    print(f"{node}: {degree}")

Top 10 nodes by size (degree):
hamas: 113
israel: 78
turkish government: 38
revenue streams: 36
istanbul: 35
doha: 32
hamas’s bankers: 32
recep tayyip erdogan: 32
zaher jabarin: 32
american officials: 31
