## NER FOR PDF FILES

In [33]:
import os
import spacy
import networkx as nx
import matplotlib.pyplot as plt
from collections import Counter
from pyvis.network import Network
import fitz  # PyMuPDF library
import pandas as pd

In [34]:
# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")

# Create a function to extract text from PDF files
def extract_text_from_pdf(pdf_path):
    with fitz.open(pdf_path) as doc:
        text = ""
        for page in doc:
            text += page.get_text()
    return text

In [35]:
# Dictionary to standardize entity names
ENTITY_NORMALIZATION = {
    "united states": "USA",
    "u.s.": "USA",
    "u.s": "USA",
    "the united states": "USA",
    "us": "USA",
    "america": "USA",

    "united kingdom": "UK",
    "u.k.": "UK",
    "u.k": "UK",
    "uk": "UK",
    "britain": "UK",
    "great britain": "UK",

    "russia": "Russian Federation",
    "russian federation": "Russian Federation",

    "south korea": "Korea",
    "republic of korea": "Korea",
    
    "china prc": "China",
    "people's republic of china": "China",
    
    "european union": "EU",
    
    "google inc.": "Google",
    "alphabet inc.": "Google",
    
    "meta platforms": "Meta",
    "facebook inc.": "Meta",
    
    "apple inc.": "Apple",
    
    "microsoft corporation": "Microsoft",
    "microsoft corp": "Microsoft",
    
    "amazon.com inc.": "Amazon",
    "amazon inc.": "Amazon",
    
    "twitter inc.": "Twitter",
    "x corp.": "Twitter",

    "united nations": "United Nations",
    "un": "United Nations",

    
}

In [36]:
def normalize_entity(entity, entity_type):
    """
    Normalize entities with improved handling of capitalization and variations
    
    Args:
        entity (str): Original entity text
        entity_type (str): SpaCy entity type
    
    Returns:
        str: Normalized entity name
    """
    # Convert to lowercase for matching
    lower_entity = entity.lower().strip()
    
    # First, check direct normalization
    if lower_entity in ENTITY_NORMALIZATION:
        return ENTITY_NORMALIZATION[lower_entity]
    
    # Handle partial matches (useful for variations)
    for key, value in ENTITY_NORMALIZATION.items():
        if key in lower_entity:
            return value
    
    # If no normalization found, return original with consistent capitalization
    if entity_type == "GPE":
        # Capitalize countries
        return ' '.join(word.capitalize() for word in entity.split())
    elif entity_type == "ORG":
        # Capitalize organizations
        return ' '.join(word.capitalize() for word in entity.split())
    
    # Default to original if no normalization applies
    return entity

In [43]:
# Create a function to process the text and extract entities
def process_text(text):
    doc = nlp(text)
    countries = []
    organizations = []

    for ent in doc.ents:
        if ent.label_ == "GPE":
            # countries.append(ent.text)
            normalized_country = normalize_entity(ent.text, "GPE")
            countries.append(normalized_country)

        elif ent.label_ == "ORG":
            # organizations.append(ent.text)
            normalized_org = normalize_entity(ent.text, "ORG")
            organizations.append(normalized_org)

    # Debug print
    print(f"Extracted Countries: {countries}")

    return {
        "Countries": list(set(countries)),
        "Organizations": list(set(organizations))
    }

# Function to filter top entities based on mentions
def filter_top_entities(entity_list, top_n=10):
    flat_list = [item for sublist in entity_list for item in safe_eval(sublist)]
    most_common = [entity for entity, count in Counter(flat_list).most_common(top_n)]
    return most_common

# Function to safely evaluate list-like data
def safe_eval(value):
    try:
        return eval(value) if isinstance(value, str) else []
    except:
        return []

In [44]:
# Process the PDF files and store the results
pdf_directory = "pdfs"  # Replace with your PDF directory path
pdf_data = []

for filename in os.listdir(pdf_directory):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(pdf_directory, filename)
        text = extract_text_from_pdf(pdf_path)
        entities = process_text(text)
        pdf_data.append(entities)


Extracted Countries: ['Kosovo', 'Kosovo', 'Kosovo', 'Kosovo', 'USA', 'Kosovo', 'Iraq', 'Kosovo']
Extracted Countries: ['St', 'Liberia', 'Liberia']
Extracted Countries: ['USA', 'Washington', 'D.c.', 'USA', 'Washington', 'USA', 'USA', 'France', 'Afghanistan', 'USA', 'Orcon', 'USA', 'USA', 'France', 'Afghanistan', 'Iran', 'Pakistan', 'Yemen', 'Somalia', 'Russian Federation', 'China', 'Turkey', 'USA', 'Washington', 'USA', 'Washington', 'USA', 'Washington', 'Washington', 'Washington', 'USA', 'France', 'Afghanistan', 'USA', 'USA', 'USA', 'USA', 'USA', 'Iran', 'Afghanistan', 'Pakistan', 'Yemen', 'Somalia', 'Russian Federation', 'China', 'Turkey']
Extracted Countries: ['St', 'Liberia', 'Iad1', 'Liberia', 'Iad1', 'Iad1', 'Iad1', 'Monrovia', 'Liberia']
Extracted Countries: ['St', 'New York', 'Nairobi', 'New York', 'St', 'Nairobi', 'Gigiri', 'Kenya', 'New York']
Extracted Countries: ['Kosovo', 'Kosovo', 'Kosovo']
Extracted Countries: ['Kosovo', 'Kosovo', 'Kosovo', 'Kosovo', 'Kosovo']
Extracted Co

Director exception handler, message is:
Director error: <class 'KeyboardInterrupt'>: 
Traceback (most recent call last):
    /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.12/site-packages/pymupdf/mupdf.py:59423:_print(): def _print( self, message):
    /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.12/site-packages/pymupdf/mupdf.py:46472:fz_open_document(): return _mupdf.fz_open_document(filename)
    /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.12/site-packages/pymupdf/__init__.py:3002:__init__(): doc = mupdf.fz_open_document(filename)
    /var/folders/fh/sgzv3sqd42s5hhrm7nvc_m280000gn/T/ipykernel_38790/3267630164.py:6:extract_text_from_pdf(): with fitz.open(pdf_path) as doc:
    /var/folders/fh/sgzv3sqd42s5hhrm7nvc_m280000gn/T/ipykernel_38790/2987261712.py:8:<module>(): text = extract_text_from_pdf(pdf_path)
    /opt/homebrew/Caskroom/miniconda/base/envs/myenv/lib/python3.12/site-packages/IPython/core/interactiveshell.py:3577:run_code(): e

Extracted Countries: ['Kosovo', 'Kosovo', 'Kosovo']
Extracted Countries: ['Kosovo', 'Kosovo', 'Beirut', 'Kosovo', 'Airport', 'Fraud', 'Fraud', 'Complicity', 'Complicity']
Extracted Countries: ['Kosovo', 'Kosovo', 'Kosovo', 'Kosovo', 'Kosovo', 'Kosovo', 'Kosovo', 'Kosovo', 'Kosovo', 'Italy', 'The United Kingdom', 'Airport Prishtina', 'Uk', 'London', 'Prishtina', 'Italy', 'Canada', 'UK', '06/2002', 'The United Kingdom', 'Uk', 'Prishtina', 'Kosovo']
Extracted Countries: ['Kosovo', 'Kosovo', 'Airport', 'Airport', 'Kosovo']


In [45]:
df = pd.DataFrame(pdf_data)
df.head(20)

Unnamed: 0,Countries,Organizations
0,"[Kosovo, Iraq, USA]","[Euro, Itf, The Border Boundary Police, Backgr..."
1,"[St, Liberia]","[Strictly, The Investigations Division Of The ..."
2,"[Turkey, Orcon, Russian Federation, Somalia, P...","[Sigint, Noforn, National Security Agency, Eur..."
3,"[Iad1, Monrovia, St, Liberia]","[The United Nations Mission, “fuel Management,..."
4,"[Nairobi, Kenya, Gigiri, St, New York]","[Strictly, The United Nations St/sgb/2003/7, R..."
5,[Kosovo],"[Itf, USA, Contract, Unmik Regulation, Investi..."
6,[Kosovo],"[Tender Documents, The Transport Sector Of The..."
7,"[St, Kosovo, Spuvesek]","[Unmik’s Finance Administrative, Section Vi “t..."
8,"[Poznan, USA]","[Nf, The Poznan Conference, Copenhagen, The Eu..."
9,"[Invoice, Kosovo, Vienna]","[Invoice, Itf, Airport Cargo Department, Value..."


In [40]:
df.tail(20)

Unnamed: 0,Countries,Organizations
24,"[Nairobi, Bratislava, Johannesburg, Fraud, Mon...","[Unhq, The Slovakian Police, The Finance Depar..."
25,"[Airport, Vienna, Iceland]","[Unmuk, Itf, Ycaa, Icaa, USA, Dpko, Schedule V..."
26,"[Tokyo, Japan, Washington, USA]","[Department Of Agriculture, Noforn, National S..."
27,"[Kosovo, Pristina]","[Airlines, The Transport Sector Of The Unmik D..."
28,"[Staff Union, Nairobi, St, Zambia, Olokodana, ...","[Jab, Office Of Legal Affairs, Un, Background ..."
29,"[Port Louis, Victoria, St, Iv, Seychelles, USA]","[Barclay Bank, The Investigations Division, Of..."
30,[],"[United Nations Environment Programme, The Pos..."
31,"[Nairobi, Kenya, Somalia, St, Unon, USA]","[Id/oios Investigation, Un, The United Nations..."
32,"[Pd, Thunderbird, New York]","[Dpko, Manchester Technologies, Sony, Un, Nec,..."
33,"[Kosovo, Pristina, Iceland, Italy]","[Dm 50,000, The Transport Sector Of The Unmik ..."


In [54]:
def filter_top_entities(entity_list, top_n=10):
    """Returns only the most frequently mentioned entities"""
    flat_list = [item for sublist in entity_list if isinstance(sublist, list) for item in sublist]  # Ensure flattening works

    print(f"Flat List of extracted entities: {flat_list}")  # Debugging

    most_common = Counter(flat_list).most_common(top_n)
    print(f"Most common entities: {most_common}")  # Debugging

    return [entity for entity, count in most_common]

In [53]:
print(f"Raw country data in df: {df['Countries'].tolist()}")


Raw country data in df: [['Kosovo', 'Iraq', 'USA'], ['St', 'Liberia'], ['Turkey', 'Orcon', 'Russian Federation', 'Somalia', 'Pakistan', 'France', 'Yemen', 'Afghanistan', 'Iran', 'China', 'Washington', 'D.c.', 'USA'], ['Iad1', 'Monrovia', 'St', 'Liberia'], ['Nairobi', 'Kenya', 'Gigiri', 'St', 'New York'], ['Kosovo'], ['Kosovo'], ['St', 'Kosovo', 'Spuvesek'], ['Poznan', 'USA'], ['Invoice', 'Kosovo', 'Vienna'], ['Kosovo', 'Vendor', 'Pristina'], ['Spain', 'Valentini', 'Rome', 'Italy'], ['Meti', 'United States', 'Japan', 'Washington', 'USA', 'Washington Dc'], ['Kosovo'], ['Kosovo', 'Vendor', 'Pristina'], ['Kosovo'], ['Kosovo', 'Pristina'], ['Respond', 'Japan', 'Tokyo', 'Cancun', 'Washington', 'USA'], ['Meti', 'India', 'Japan', 'Tokyo', 'China', 'USA'], ['Israel', 'Italy', 'Washington', 'Fvey', 'USA'], ['Kosovo'], ['India', 'Bali', 'South Africa', 'Japan', 'Tokyo', 'Germany', 'Indonesia', 'Korea', 'Mexico', 'Brazil', 'USA'], ['Meti', 'USA', 'Canada', 'Japan', 'UK', 'New Zealand'], ['Kosovo',

In [60]:
# Get the top 10 most frequent countries
top_countries = filter_top_entities(df["Countries"], top_n=10)
print(f"Top 10 most frequent countries: {top_countries}")  # Debugging

Flat List of extracted entities: ['Kosovo', 'Iraq', 'USA', 'St', 'Liberia', 'Turkey', 'Orcon', 'Russian Federation', 'Somalia', 'Pakistan', 'France', 'Yemen', 'Afghanistan', 'Iran', 'China', 'Washington', 'D.c.', 'USA', 'Iad1', 'Monrovia', 'St', 'Liberia', 'Nairobi', 'Kenya', 'Gigiri', 'St', 'New York', 'Kosovo', 'Kosovo', 'St', 'Kosovo', 'Spuvesek', 'Poznan', 'USA', 'Invoice', 'Kosovo', 'Vienna', 'Kosovo', 'Vendor', 'Pristina', 'Spain', 'Valentini', 'Rome', 'Italy', 'Meti', 'United States', 'Japan', 'Washington', 'USA', 'Washington Dc', 'Kosovo', 'Kosovo', 'Vendor', 'Pristina', 'Kosovo', 'Kosovo', 'Pristina', 'Respond', 'Japan', 'Tokyo', 'Cancun', 'Washington', 'USA', 'Meti', 'India', 'Japan', 'Tokyo', 'China', 'USA', 'Israel', 'Italy', 'Washington', 'Fvey', 'USA', 'Kosovo', 'India', 'Bali', 'South Africa', 'Japan', 'Tokyo', 'Germany', 'Indonesia', 'Korea', 'Mexico', 'Brazil', 'USA', 'Meti', 'USA', 'Canada', 'Japan', 'UK', 'New Zealand', 'Kosovo', 'London', 'Pristina', 'Vienna', 'Nair

# GRAPHS

In [66]:
import matplotlib.colors as mcolors  # For gradient color mapping

## COUNTRIES

In [79]:
# Create a new graph only for countries
G_countries = nx.Graph()

# Function to add only country-related entities to the graph
def add_country_entities_to_graph(top_entities, color):
    for index, row in df.iterrows():
        excerpt_id = f"{index+1}.pdf"
        # country_list = safe_eval(row["Countries"])
        country_list = row["Countries"]

        # Filter out less frequent countries
        country_list = [country for country in country_list if country in top_entities]
        print(country_list)

        if len(country_list) > 0:
            G_countries.add_node(excerpt_id, type="excerpt", color="white")

        for country in country_list:
            G_countries.add_node(country, type="country", color=color)
            G_countries.add_edge(excerpt_id, country, weight=2)

# Add only country relationships
add_country_entities_to_graph(top_countries, "purple")

# Remove weakly connected nodes (degree < 2)
filtered_nodes = [node for node, degree in dict(G_countries.degree()).items() if degree >= 2]
G_countries = G_countries.subgraph(filtered_nodes)

# Ensure all nodes have a color attribute
for node in G_countries.nodes():
    if "color" not in G_countries.nodes[node]:
        G_countries.nodes[node]["color"] = "gray"

# Get node degrees and find min/max connections
node_degrees = dict(G_countries.degree())
country_degrees = {node: degree for node, degree in node_degrees.items() if node in top_countries}

min_degree = min(country_degrees.values()) if country_degrees else 1
max_degree = max(country_degrees.values()) if country_degrees else 1

# Function to map degree to a gradient color (light green → dark blue)
def get_gradient_color(degree):
    normalized_value = (degree - min_degree) / (max_degree - min_degree + 0.0001)
    color_map = mcolors.LinearSegmentedColormap.from_list("custom", ["#21b9cb", "#a0fff4"])
    return mcolors.to_hex(color_map(normalized_value))

# Assign gradient colors to countries based on their connections
country_colors = {}
for country, degree in country_degrees.items():
    country_colors[country] = get_gradient_color(degree)
    G_countries.nodes[country]["color"] = country_colors[country]

['Kosovo', 'USA']
['St']
['Washington', 'USA']
['St']
['Nairobi', 'St']
['Kosovo']
['Kosovo']
['St', 'Kosovo']
['USA']
['Kosovo']
['Kosovo', 'Pristina']
['Italy']
['Japan', 'Washington', 'USA']
['Kosovo']
['Kosovo', 'Pristina']
['Kosovo']
['Kosovo', 'Pristina']
['Japan', 'Tokyo', 'Washington', 'USA']
['Japan', 'Tokyo', 'USA']
['Italy', 'Washington', 'USA']
['Kosovo']
['Japan', 'Tokyo', 'USA']
['USA', 'Japan']
['Kosovo', 'Pristina']
['Nairobi']
['Airport']
['Tokyo', 'Japan', 'Washington', 'USA']
['Kosovo', 'Pristina']
['Nairobi', 'St', 'USA']
['St', 'USA']
[]
['Nairobi', 'St', 'USA']
[]
['Kosovo', 'Pristina', 'Italy']
['Kosovo']
['Kosovo']
['Washington', 'USA']
['Kosovo', 'Pristina']
['Kosovo', 'Airport', 'Pristina']
['Kosovo', 'Pristina']
['Kosovo']
['Kosovo', 'Airport']
['Kosovo', 'Italy']
['Kosovo', 'Airport']


In [80]:
# Function to display the improved country-only graph in Pyvis
def display_country_pyvis(G, filename="countries_network.html"):
    net = Network(notebook=True, width="1200px", height="800px", bgcolor='#222222', font_color='white', cdn_resources="remote")

    def scale_size(degree):
        return max(10, min(degree * 3, 50))

    for node, data in G.nodes(data=True):
        degree = node_degrees.get(node, 1)

        # if "News" in node:
        #     color = "#87CEEB"
        # else:
        color = country_colors.get(node, "#7B68EE")

        net.add_node(node, size=scale_size(degree), color=color, label=node)

    for src, dst in G.edges():
        if src in country_colors and dst in country_colors:
            edge_color = min(country_colors[src], country_colors[dst])
        elif src in country_colors:
            edge_color = country_colors[src]
        elif dst in country_colors:
            edge_color = country_colors[dst]
        else:
            edge_color = "#CDB7F6"

        net.add_edge(src, dst, width=0.3, color=edge_color)

    net.barnes_hut(
        gravity=-500,
        central_gravity=0.5,
        spring_length=100,
        damping=0.9
    )

    net.show(filename)

In [81]:
# Generate the country-only Pyvis graph with gradient-colored edges
display_country_pyvis(G_countries, "pdf_countries_network.html")

print(f"Number of nodes in G_countries: {len(G_countries.nodes())}")
print(f"Number of edges in G_countries: {len(G_countries.edges())}")

pdf_countries_network.html
Number of nodes in G_countries: 37
Number of edges in G_countries: 66


## ORGANIZATION

In [82]:
from pyvis.network import Network
import networkx as nx
import pandas as pd
from collections import Counter
import matplotlib.colors as mcolors

# ✅ Extract the top 10 most frequent organizations
top_organizations = filter_top_entities(df["Organizations"], top_n=10)
print(f"Top 10 most frequent organizations: {top_organizations}")  # Debugging

# ✅ Create a new graph only for organizations
G_organizations = nx.Graph()

# Function to add only organization-related entities to the graph
def add_organization_entities_to_graph(top_entities, color):
    for index, row in df.iterrows():
        excerpt_id = f"{index+1}.pdf"
        # org_list = safe_eval(row["Organizations"])
        org_list = row["Organizations"]

        # Filter out less frequent entities
        org_list = [org for org in org_list if org in top_entities]

        if len(org_list) > 0:
            G_organizations.add_node(excerpt_id, type="excerpt", color="white")  # Excerpts as white

        for org in org_list:
            G_organizations.add_node(org, type="organization", color=color)  # Organizations with color
            G_organizations.add_edge(excerpt_id, org, weight=2)

# ✅ Add organizations to the graph
add_organization_entities_to_graph(top_organizations, "green")

# ✅ Remove weakly connected nodes (degree < 2)
filtered_nodes = [node for node, degree in dict(G_organizations.degree()).items() if degree >= 2]
G_organizations = G_organizations.subgraph(filtered_nodes)

# ✅ Ensure all nodes have a color attribute
for node in G_organizations.nodes():
    if "color" not in G_organizations.nodes[node]:  
        G_organizations.nodes[node]["color"] = "gray"

# ✅ Get node degrees and find min/max connections
node_degrees = dict(G_organizations.degree())
org_degrees = {node: degree for node, degree in node_degrees.items() if node in top_organizations}

min_degree = min(org_degrees.values()) if org_degrees else 1
max_degree = max(org_degrees.values()) if org_degrees else 1

# ✅ Function to map degree to a gradient color (light green → dark blue)
def get_gradient_color(degree):
    normalized_value = (degree - min_degree) / (max_degree - min_degree + 0.0001)  # Normalize between 0 and 1
    color_map = mcolors.LinearSegmentedColormap.from_list("custom", ["#21b9cb", "#a0fff4"])  # Green → Blue
    return mcolors.to_hex(color_map(normalized_value))

# ✅ Assign gradient colors to organizations based on their connections
org_colors = {}
for org, degree in org_degrees.items():
    org_colors[org] = get_gradient_color(degree)
    G_organizations.nodes[org]["color"] = org_colors[org]  # Apply color

# ✅ Function to display the improved organization-only graph in Pyvis
def display_organization_pyvis(G, filename="organizations_network.html"):
    net = Network(notebook=True, width="1200px", height="800px", bgcolor='#222222', font_color='white', cdn_resources="remote")

    # ✅ Scale node sizes logarithmically
    def scale_size(degree):
        return max(10, min(degree * 3, 50))  # Limits size to prevent giant nodes

    for node, data in G.nodes(data=True):
        degree = node_degrees.get(node, 1)  # Get degree, default to 1

        # ✅ Differentiate node colors
        # if ".pdf" in node:
        #     color = "#87CEEB"  # 🔹 Light blue for pdf
        # else:
        color = org_colors.get(node, "#7B68EE")  # Use the assigned gradient color

        net.add_node(node, size=scale_size(degree), color=color, label=node)

    # ✅ Set edge colors to match organization node colors
    for src, dst in G.edges():
        edge_color = org_colors.get(dst, "#CDB7F6") if dst in org_colors else "#CDB7F6"
        net.add_edge(src, dst, width=0.3, color=edge_color)

    # ✅ Use `barnes_hut()` for a balanced layout
    net.barnes_hut(
        gravity=-500,  # Keeps the layout compact
        central_gravity=0.5,  # Pulls highly connected nodes to the center
        spring_length=100,  # Defines the spacing between nodes
        damping=0.9  # Reduces excess movement
    )

    # Save and display
    net.show(filename)

# ✅ Generate the organization-only Pyvis graph with gradient-colored edges
display_organization_pyvis(G_organizations, "organizations_network.html")


Flat List of extracted entities: ['Euro', 'Itf', 'The Border Boundary Police', 'Background Information', 'Unmik Police Officer', 'Peap', 'Divisional', 'Investigation Task Force Allegation Of Possible Bribery', 'Unmik Police', 'Regional Crime Squad Pristina Region', 'United Nations', 'Airport Engineer', 'Unmik Pillar Iv', 'The Regional Crime Squad', 'Unmik', 'United Nations Interim Administration Mission', 'The Border Police Investigations Unit', 'The Investigation Task Force', 'Strictly', 'The Investigations Division Of The Office Of Internal Oversight Services', 'The United Nations Mission', 'U N I T', 'Sigint', 'Noforn', 'National Security Agency', 'European Aeronautic Defence', 'Eads', 'Nsa', 'Organization: National Security Agency', 'Space Corporation', 'USA', 'The United Nations Mission', '“fuel Management', 'Dpko', 'Id/oios Investigation', 'Siu', 'Security Council', 'Background Information', 'Iad', 'U N I T', 'United Nations', 'Internal Audit Division 1', 'The Organization', 'The