In [6]:
import google.generativeai as genai
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv (presumably from a local .env file)
# so that os.getenv("GEMINI_KEY") below can find the API key.
load_dotenv()

# Build the Gemini model handle. The system instruction asks the model to answer
# with exactly two Python list literals separated by '@': first a list of
# (entity_name, classification) tuples, then a list of
# (entity_1, entity_2, relationship_label, relationship_classification) tuples.
# The parsing helper in this notebook relies on that '@'-separated format.
model = genai.GenerativeModel(model_name='gemini-1.5-flash',
                                system_instruction="""
Conduct **Relationship Extraction** on the given input. 

### **Expected Output Format**
Respond **ONLY** with two Python lists, separated by `@`:
1. **Entities List**: List of tuples containing (entity_name, classification), where:
    - `entity_name`: Maximum of **three words**.
    - `classification`: Must be one of the following:
        - `PER` (Person)
        - `ORG` (Organisation)
        - `LOC` (Location)
        - `EVT` (Event)
    - **No duplicate entities allowed.**
    - **Every entity must have at least one relationship.**

2. **Relationships List**: List of tuples containing (entity_1, entity_2, relationship_label, relationship_classification), where:
    - `entity_1`, `entity_2`: Must be present in the **Entities List**.
    - `relationship_label`: Maximum of **three words**.
    - `relationship_classification`: Must be one of the following:
        - `PV` (Positive sentiment)
        - `NG` (Negative sentiment)
        - `NE` (Neutral sentiment)
    - **No duplicate relationships allowed.**
    - **Ensure that every identified relationship is valid and meaningful.**
    - **All relationships must have entities that exist in the Entities List.**

### **Processing Rules**
- **Strictly use the specified classifications**; do not identify anything outside them.
- **Ensure high accuracy** in entity recognition and classification.
- **Each entity must be related to at least one other entity**.
- **Ensure that every relationship label is concise, meaningful, and accurate**.
- **Avoid misclassifying generic nouns as entities unless they fit within the specified categories.**

### **Example Output Format**
[("John Doe", "PER"), ("Google", "ORG"), ("New York", "LOC"), ("Tech Summit", "EVT")] @ 
[("John Doe", "Google", "works at", "PV"), ("Tech Summit", "Google", "sponsored by", "PV")]
""")
# Register the API key read from the GEMINI_KEY environment variable.
# NOTE(review): this runs after the model object above is constructed; that is
# only safe if the library creates its client lazily on the first request —
# confirm, or move this call above the model construction.
genai.configure(api_key=os.getenv("GEMINI_KEY"))

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def prompt(input):
    """Send `input` to the module-level Gemini `model` and return the reply text.

    Args:
        input: The content passed straight to `model.generate_content`.
            (The parameter name shadows the builtin `input`; it is kept so
            keyword callers keep working.)

    Returns:
        The `.text` attribute of the response object.
    """
    return model.generate_content(input).text

def str_to_ls(str):
    """Parse a model reply of the form ``<entities list> @ <relationships list>``.

    Both sides are parsed with `ast.literal_eval`, so only Python literals
    (lists, tuples, strings, numbers, ...) are accepted; arbitrary expressions
    in the model output are never executed.

    An '@' may legitimately occur inside an entity name or label (e.g. an
    e-mail address), so every '@' is tried in turn as the separator and the
    first one for which both halves parse as literals is used.

    Args:
        str: Raw response text. (The parameter name shadows the builtin `str`;
            it is kept unchanged for backward compatibility with callers.)

    Returns:
        A tuple ``(entities, relationships)`` holding the two parsed literals.

    Raises:
        ValueError: If no '@' splits the text into two valid Python literals.
    """
    import ast  # local import so this cell works without touching the import cell

    text = str
    sep = text.find('@')
    while sep != -1:
        try:
            entities = ast.literal_eval(text[:sep].strip())
            relationships = ast.literal_eval(text[sep + 1:].strip())
        except (ValueError, SyntaxError, TypeError):
            # This '@' is inside a string or the halves are not literals; try the next one.
            sep = text.find('@', sep + 1)
            continue
        return entities, relationships

    raise ValueError(
        "response is not two Python literals separated by '@': %r" % (text[:200],)
    )


In [8]:
import pandas as pd
# Read the parsed WikiLeaks spreadsheet; the displayed output below shows the
# columns 'PDF Path' and 'Text', with several rows sharing the same 'PDF Path'.
df = pd.read_excel("../Dataset/wikileaks_parsed.xlsx")
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,PDF Path,Text
0,1.pdf,Pristina Airport – Possible administrative irr...
1,1.pdf,Investigative details\n\nIn his/her interviews...
2,10.pdf,"""An interoffice memorandum providing an “outst..."
3,10.pdf,"""Allegation 2 & 3:\n\n(Specifically, three of ..."
4,10.pdf,"""When asked about this in interview, the Divis..."
...,...,...
138,89.pdf,"""Description\n\nTop Secret US National Securit..."
139,9.pdf,"""INTRODUCTION\n\nThis case arises out of an au..."
140,9.pdf,"""BACKGROUND INFORMATION\n\nPristina Internatio..."
141,9.pdf,"""BACKGROUND INFORMATION\n\nPristina Internatio..."


In [9]:
# Collapse the rows of each PDF into a single document: all 'Text' values that
# share a 'PDF Path' are concatenated with a single space between them.
# NOTE(review): ' '.join raises TypeError if any 'Text' cell is not a string
# (e.g. NaN from an empty spreadsheet cell) — confirm the sheet has no blanks.
df_grouped = df.groupby('PDF Path')['Text'].apply(' '.join).reset_index()
# Bare expression so the notebook renders the grouped frame as the cell output.
df_grouped

Unnamed: 0,PDF Path,Text
0,1.pdf,Pristina Airport – Possible administrative irr...
1,10.pdf,"""An interoffice memorandum providing an “outst..."
2,105.pdf,"""Description\n\nThis is a Secret US National S..."
3,106.pdf,"""Tokyo's Climate Change Officials to Continue ..."
4,107.pdf,"""Description\n\nThis is a secret US National S..."
5,108.pdf,"""Japanese Strive to Avoid Damage to U.S. Relat..."
6,11.pdf,"Until the end of June 2002, responsibility for..."
7,110.pdf,The EU and Japan were engaged as of early Dece...
8,111.pdf,A 22 October meeting attended by German Chance...
9,112.pdf,"Israel has reached out to Europe, including It..."


In [10]:
# Split each document on newlines, keep only the lines that are at least
# 10 characters long, and glue the survivors back together with single spaces.
df_grouped['Text'] = df_grouped['Text'].apply(
    lambda doc: ' '.join(line for line in doc.split('\n') if len(line) >= 10)
)

In [11]:
import time
from tqdm import tqdm

# Client-side rate limit: at most MAX_CALLS_PER_WINDOW requests are issued per
# WINDOW_SECONDS. (Presumably chosen to match the API's requests-per-minute
# quota — confirm against the account's actual limit.)
MAX_CALLS_PER_WINDOW = 15
WINDOW_SECONDS = 60

graph_data = []   # one record per successfully parsed PDF
failed_rows = []  # PDFs whose response could not be parsed, kept for inspection/retry

start_time = time.time()  # start of the current rate-limit window
call_count = 0            # requests issued inside the current window

for index, row in tqdm(df_grouped.iterrows(), total=df_grouped.shape[0], desc="Processing rows"):
    text = row['Text']
    pdf_file = row['PDF Path']

    elapsed = time.time() - start_time
    if elapsed >= WINDOW_SECONDS:
        # The window has expired on its own: start a fresh one.
        start_time = time.time()
        call_count = 0
    elif call_count >= MAX_CALLS_PER_WINDOW:
        # Quota for this window is used up: wait out the remainder. The max()
        # guards against a negative duration, which time.sleep rejects.
        time.sleep(max(0.0, WINDOW_SECONDS - elapsed))
        start_time = time.time()
        call_count = 0

    # Making the API call (single call site for both rate-limit branches)
    res = prompt(text)
    call_count += 1

    # Parsing the response. A malformed reply is recorded and skipped so that
    # one bad document does not discard the results already collected.
    try:
        entities, relationships = str_to_ls(res)
    except (ValueError, SyntaxError) as exc:
        failed_rows.append({'PDF': pdf_file, 'Error': repr(exc), 'Response': res})
        continue

    # Appending results to list
    graph_data.append({'PDF': pdf_file, 'Entities': entities, 'Relationships': relationships})

# Convert to DataFrame; explicit columns keep the schema even if nothing parsed.
graph_df = pd.DataFrame(graph_data, columns=['PDF', 'Entities', 'Relationships'])

if failed_rows:
    print(f"{len(failed_rows)} of {df_grouped.shape[0]} documents could not be parsed; see `failed_rows`.")

Processing rows: 100%|██████████| 44/44 [02:39<00:00,  3.62s/it]


In [12]:
# Write the per-PDF extraction results to CSV, omitting the DataFrame index.
# NOTE(review): the 'Entities' and 'Relationships' cells appear to hold Python
# lists of tuples, which CSV stores as their string repr — confirm that
# downstream readers parse them back (e.g. with ast.literal_eval).
graph_df.to_csv('wiki_leaks_network.csv',index=False)