In [1]:
import google.generativeai as genai
import os
from dotenv import load_dotenv

# Pull settings from a local .env file (python-dotenv) into the process
# environment so that GEMINI_KEY can be read via os.getenv below.
load_dotenv()

# System prompt attached to every request made through `model`. It asks for two
# Python list literals separated by '@' (entities, then relationships); the
# parsing helper in this notebook relies on exactly that layout.
SYSTEM_INSTRUCTION = """
Conduct **Relationship Extraction** on the given input. 

### **Expected Output Format**
Respond **ONLY** with two Python lists, separated by `@`:
1. **Entities List**: List of tuples containing (entity_name, classification), where:
    - `entity_name`: Maximum of **three words**.
    - `classification`: Must be one of the following:
        - `PER` (Person)
        - `ORG` (Organisation)
        - `LOC` (Location)
        - `EVT` (Event)
    - **No duplicate entities allowed.**
    - **Every entity must have at least one relationship.**

2. **Relationships List**: List of tuples containing (entity_1, entity_2, relationship_label, relationship_classification), where:
    - `entity_1`, `entity_2`: Must be present in the **Entities List**.
    - `relationship_label`: Maximum of **three words**.
    - `relationship_classification`: Must be one of the following:
        - `PV` (Positive sentiment)
        - `NG` (Negative sentiment)
        - `NE` (Neutral sentiment)
    - **No duplicate relationships allowed.**
    - **Ensure that every identified relationship is valid and meaningful.**
    - **All relationships must have entities that exist in the Entities List.**

### **Processing Rules**
- **Strictly use the specified classifications**; do not identify anything outside them.
- **Ensure high accuracy** in entity recognition and classification.
- **Each entity must be related to at least one other entity**.
- **Ensure that every relationship label is concise, meaningful, and accurate**.
- **Avoid misclassifying generic nouns as entities unless they fit within the specified categories.**

### **Example Output Format**
[("John Doe", "PER"), ("Google", "ORG"), ("New York", "LOC"), ("Tech Summit", "EVT")] @ 
[("John Doe", "Google", "works at", "PV"), ("Tech Summit", "Google", "sponsored by", "PV")]
"""

# Gemini model handle shared by the rest of the notebook.
model = genai.GenerativeModel(
    model_name='gemini-1.5-flash',
    system_instruction=SYSTEM_INSTRUCTION,
)

# Register the API key taken from the GEMINI_KEY environment variable.
genai.configure(api_key=os.getenv("GEMINI_KEY"))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def prompt(input):
    """Send `input` to the notebook-level Gemini `model` and return the reply's text.

    Args:
        input: Content passed straight through to `model.generate_content`.

    Returns:
        The `.text` attribute of the response object.
    """
    return model.generate_content(input).text

def str_to_ls(str):
    """Parse a model reply of the form ``<entities list> @ <relationships list>``.

    Both halves must be Python *literals* (lists of tuples of strings). They are
    parsed with ``ast.literal_eval`` instead of ``eval``: the text comes from an
    LLM, and ``eval`` would execute any expression the reply happened to contain.

    The separator is located by trying each ``@`` in turn and accepting the first
    position where both sides parse, so an ``@`` inside an entity name or label
    (e.g. an e-mail address) no longer breaks the split.

    Args:
        str: Raw reply text (parameter name kept for backward compatibility,
            even though it shadows the builtin).

    Returns:
        Tuple ``(entities, relationships)`` of the two parsed literals.

    Raises:
        ValueError: If no ``@`` splits the text into two valid Python literals.
    """
    import ast  # local import keeps this helper self-contained in the notebook

    text = str.strip()
    split_at = text.find('@')
    while split_at != -1:
        try:
            entities = ast.literal_eval(text[:split_at].strip())
            relationships = ast.literal_eval(text[split_at + 1:].strip())
        except (ValueError, SyntaxError):
            # This '@' is not the separator (or a half is malformed); try the next one.
            split_at = text.find('@', split_at + 1)
            continue
        return entities, relationships

    raise ValueError(
        "Response is not two Python list literals separated by '@': "
        f"{text[:80]!r}"
    )


In [3]:
import pandas as pd
# Load the parsed excerpts; later cells read the 'PDF Path' and 'Text' columns.
dataset_path = "Dataset/wikileaks_parsed.xlsx"
df = pd.read_excel(dataset_path)
df

Unnamed: 0,PDF Path,Text
0,1.pdf,Pristina Airport – Possible administrative irr...
1,1.pdf,Investigative details\n\nIn his/her interviews...
2,10.pdf,"""An interoffice memorandum providing an “outst..."
3,10.pdf,"""Allegation 2 & 3:\n\n(Specifically, three of ..."
4,10.pdf,"""When asked about this in interview, the Divis..."
...,...,...
138,89.pdf,"""Description\n\nTop Secret US National Securit..."
139,9.pdf,"""INTRODUCTION\n\nThis case arises out of an au..."
140,9.pdf,"""BACKGROUND INFORMATION\n\nPristina Internatio..."
141,9.pdf,"""BACKGROUND INFORMATION\n\nPristina Internatio..."


In [4]:
# Collapse the per-chunk rows into one row per PDF: every 'Text' value that
# shares a 'PDF Path' is concatenated with single spaces.
text_by_pdf = df.groupby('PDF Path')['Text']
df_grouped = text_by_pdf.apply(' '.join).reset_index()
df_grouped

Unnamed: 0,PDF Path,Text
0,1.pdf,Pristina Airport – Possible administrative irr...
1,10.pdf,"""An interoffice memorandum providing an “outst..."
2,105.pdf,"""Description\n\nThis is a Secret US National S..."
3,106.pdf,"""Tokyo's Climate Change Officials to Continue ..."
4,107.pdf,"""Description\n\nThis is a secret US National S..."
5,108.pdf,"""Japanese Strive to Avoid Damage to U.S. Relat..."
6,11.pdf,"Until the end of June 2002, responsibility for..."
7,110.pdf,The EU and Japan were engaged as of early Dece...
8,111.pdf,A 22 October meeting attended by German Chance...
9,112.pdf,"Israel has reached out to Europe, including It..."


In [5]:
def _drop_short_lines(document):
    """Return `document` with lines shorter than 10 characters removed and the rest joined by spaces."""
    kept_lines = [line for line in document.split('\n') if len(line) >= 10]
    return ' '.join(kept_lines)

# Filter and re-join in a single pass over the column.
df_grouped['Text'] = df_grouped['Text'].apply(_drop_short_lines)
df_grouped['Text'][23]

'Alleged misappropriation of funds – Manager (Case No. 050/04) The Investigation Task Force (ITF) has been investigating an allegation regarding administrative irregularities regarding the insurance coverage for Pristina Airport by a London-based Company, who then purchased the cover from Insurance Company 1 between 2001 and 2003. Based on extensive documentation obtained from UNMIK Pillar II, the Kosovo Trust Agency (KTA) and Public Enterprise Airport Pristina (PEAP), the ITF determined that the responsible Manager did not undertake efforts in 2002 or 2003 to invite international tenders for the Airport’s insurance coverage. According to UNMIK regulations 1999/2 – Finance Administration Instruction on Public Procurement using Kosovo Consolidated Funds – and in view of the cost of the premiums of more than $125,000 USD, such a tender should have been undertaken. The Manager did not prepare an international tender for at least the period 2 October 2003 to 1 October 2004, instead, he con

In [6]:
import time
from tqdm import tqdm

# Run relationship extraction once per PDF and collect the parsed results.
#
# A single unparsable reply or API error used to abort the whole batch before
# `graph_df` was built. Each document is now retried a few times; documents that
# still fail are recorded in `failed_pdfs` and reported instead of being lost.
MAX_ATTEMPTS = 3       # tries per document before it is recorded as failed
REQUEST_DELAY_S = 5    # pause after every API call (presumably for rate limiting); adjust as necessary

graph_data = []
failed_pdfs = []       # (pdf_file, repr(last error)) for documents that never parsed

for _, row in tqdm(df_grouped.iterrows(), total=df_grouped.shape[0], desc="Processing rows"):
    text = row['Text']
    pdf_file = row['PDF Path']

    last_error = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            # Making the API call and parsing the response
            res = prompt(text)
            entities, relationships = str_to_ls(res)
            last_error = None
        except Exception as exc:  # broad on purpose: API and parse errors are both retried and logged
            last_error = exc
            tqdm.write(f"{pdf_file}: attempt {attempt}/{MAX_ATTEMPTS} failed ({exc!r})")

        time.sleep(REQUEST_DELAY_S)

        if last_error is None:
            # Appending results to list
            graph_data.append({'PDF': pdf_file, 'Entities': entities, 'Relationships': relationships})
            break
    else:
        # Every attempt failed: keep a record rather than dropping the document silently.
        failed_pdfs.append((pdf_file, repr(last_error)))

if failed_pdfs:
    tqdm.write(f"{len(failed_pdfs)} document(s) could not be processed: {[name for name, _ in failed_pdfs]}")

# Convert to DataFrame (explicit columns keep the schema stable even if nothing succeeded)
graph_df = pd.DataFrame(graph_data, columns=['PDF', 'Entities', 'Relationships'])

Processing rows: 100%|██████████| 44/44 [05:51<00:00,  7.99s/it]


In [17]:
# Save the extraction results (one row per PDF) without the DataFrame index.
network_csv_path = 'wiki_leaks_network.csv'
graph_df.to_csv(network_csv_path, index=False)

In [21]:
from pyvis.network import Network

# Render one interactive PyVis graph per PDF and add a colour legend to each file.

# Define dynamic color scheme
node_colors = {
    "PER": "#FFA500",  # Orange
    "ORG": "#007BFF",  # Blue
    "LOC": "#8E44AD",  # Purple
    "EVT": "#1ABC9C",  # Teal
    "MISC": "#BDC3C7"  # Grey
}

relationship_colors = {
    "PV": "#27AE60",  # Green (Positive)
    "NG": "#E74C3C",  # Red (Negative)
    "NE": "#D3D3D3"   # Light Grey (Neutral)
}

# Legend markup depends only on the two colour tables above, so it is built
# once here instead of being re-rendered for every graph.
legend_html = f"""
    <style>
        .legend {{
            position: absolute;
            top: 20px;
            right: 20px;
            background: white;
            padding: 10px;
            border-radius: 5px;
            font-size: 14px;
            font-family: Arial, sans-serif;
            z-index: 1000;
        }}
        .legend-item {{
            display: flex;
            align-items: center;
            margin-bottom: 5px;
        }}
        .legend-color {{
            width: 15px;
            height: 15px;
            margin-right: 5px;
            border-radius: 3px;
        }}
    </style>
    <div class="legend">
        <strong>Legend</strong>
        <div class="legend-item"><div class="legend-color" style="background:{node_colors["PER"]}"></div>Person (PER)</div>
        <div class="legend-item"><div class="legend-color" style="background:{node_colors["ORG"]}"></div>Organisation (ORG)</div>
        <div class="legend-item"><div class="legend-color" style="background:{node_colors["LOC"]}"></div>Location (LOC)</div>
        <div class="legend-item"><div class="legend-color" style="background:{node_colors["EVT"]}"></div>Event (EVT)</div>
        <div class="legend-item"><div class="legend-color" style="background:{node_colors["MISC"]}"></div>Miscellaneous (MISC)</div>
        <hr>
        <div class="legend-item"><div class="legend-color" style="background:{relationship_colors["PV"]}"></div>Positive (PV)</div>
        <div class="legend-item"><div class="legend-color" style="background:{relationship_colors["NG"]}"></div>Negative (NG)</div>
        <div class="legend-item"><div class="legend-color" style="background:{relationship_colors["NE"]}"></div>Neutral (NE)</div>
    </div>
    """

# Make sure the output folder exists before the first graph is written.
output_dir = "wikileaks_graph"
os.makedirs(output_dir, exist_ok=True)

for ind, row in graph_df.iterrows():
    pdf = row['PDF'][:-4]  # drop the 4-character ".pdf" suffix
    # Work on a copy: the MISC entries added below must not leak back into the
    # lists stored in graph_df (otherwise re-running this cell or the CSV export
    # would see altered data).
    entities = list(row["Entities"])
    relationships = row["Relationships"]

    # Any relationship endpoint the model did not declare as an entity is added
    # as a "MISC" node so that every edge has both of its nodes.
    known_entities = {entity for entity, _ in entities}
    for ent1, ent2, rs, classification in relationships:
        for endpoint in (ent1, ent2):
            if endpoint not in known_entities:
                entities.append((endpoint, "MISC"))
                known_entities.add(endpoint)

    # Create PyVis Network
    # NOTE(review): width is given in "vh" (viewport-height) units — confirm whether "100vw"/"100%" was intended.
    net = Network(width="100vh", height="100vh", notebook=True, directed=True, cdn_resources='remote', bgcolor='#222222', font_color='white')

    # Add nodes with color-coded styles
    for entity, classification in entities:
        color = node_colors.get(classification, "#BDC3C7")
        net.add_node(entity, label=entity, size=30, borderWidth=4, borderWidthSelected=8,
                     color={"highlight": {"border": color}, "background": color, "border": color},
                     font={'size': 18, 'color': 'white'})

    # Add edges with relationship colors
    for source, target, rs, classification in relationships:
        net.add_edge(source, target, label=rs, width=5,
                     font={'size': 14, 'align': 'middle', 'color': 'white', "strokeColor": "rgba(0,0,0,0)", "vadjust": -10},
                     color=relationship_colors.get(classification, "#FFFFFF"))

    # Improve physics settings for better visualization
    net.set_options("""
    var options = {
    "physics": {
        "barnesHut": {
            "gravitationalConstant": -5050,
            "centralGravity": 0.75,
            "springLength": 180,
            "damping": 0.5,
            "avoidOverlap": 1
        }
    }
    }
    """)

    file_path = f"{output_dir}/graph_{pdf}.html"
    net.show(file_path)

    # Append the legend to the generated HTML file.
    # NOTE(review): this lands after whatever pyvis wrote (presumably past the
    # closing </html> tag) and relies on browsers tolerating that — confirm.
    with open(file_path, "a") as f:
        f.write(legend_html)

wikileaks_graph/graph_1.html
wikileaks_graph/graph_10.html
wikileaks_graph/graph_105.html
wikileaks_graph/graph_106.html
wikileaks_graph/graph_107.html
wikileaks_graph/graph_108.html
wikileaks_graph/graph_11.html
wikileaks_graph/graph_110.html
wikileaks_graph/graph_111.html
wikileaks_graph/graph_112.html
wikileaks_graph/graph_113.html
wikileaks_graph/graph_114.html
wikileaks_graph/graph_13.html
wikileaks_graph/graph_14.html
wikileaks_graph/graph_15.html
wikileaks_graph/graph_16.html
wikileaks_graph/graph_2.html
wikileaks_graph/graph_21.html
wikileaks_graph/graph_24.html
wikileaks_graph/graph_26.html
wikileaks_graph/graph_27.html
wikileaks_graph/graph_31.html
wikileaks_graph/graph_35.html
wikileaks_graph/graph_36.html
wikileaks_graph/graph_38.html
wikileaks_graph/graph_39.html
wikileaks_graph/graph_4.html
wikileaks_graph/graph_43.html
wikileaks_graph/graph_44.html
wikileaks_graph/graph_45.html
wikileaks_graph/graph_47.html
wikileaks_graph/graph_49.html
wikileaks_graph/graph_5.html
wikil