In [36]:
import pandas as pd



# Filter settings, named once so the comments below cannot drift from the values actually used.
SUBJECT_CATEGORY = 'Computer Systems'
PUBLICATION_YEAR = 1985

# Load CSV
df = pd.read_csv("ntrs_complete_csv.csv")

# Parse dates as day/month/year; values that do not match become NaT (errors='coerce')
# and therefore never satisfy the year comparison below.
df['Publication Date'] = pd.to_datetime(df['Publication Date'], format="%d/%m/%Y", errors='coerce')

# Keep rows whose subject is exactly SUBJECT_CATEGORY and whose publication year is
# PUBLICATION_YEAR, then drop any row missing a title or an abstract.
# NOTE(review): this is an exact string match; if a row can list several categories
# in one cell it will not be selected -- confirm against the data.
filtered_df = df[
    (df['Subject Categories'] == SUBJECT_CATEGORY) &
    (df['Publication Date'].dt.year == PUBLICATION_YEAR)
].dropna(subset=['Title', 'Abstract'])


In [38]:
# Row count of the filtered frame, i.e. how many studies will be sent for extraction.
print(filtered_df.shape[0])

53


In [44]:
import getpass
import google.generativeai as genai
import json

# Prompt for the API key without echoing it, so it is not stored in the notebook
# source or in the saved cell output. Surrounding whitespace (common when a key is
# pasted) is trimmed before use.
api_key = getpass.getpass("Enter your Gemini API key: ").strip()

# Stop here on an empty entry instead of configuring the client with a blank key,
# which would presumably only surface once the first request is sent.
if not api_key:
    raise ValueError("No Gemini API key entered; re-run this cell and provide a key.")

# Configure Gemini with the entered key
genai.configure(api_key=api_key)

Enter your Gemini API key:  ········


In [52]:
model = genai.GenerativeModel("models/gemini-2.0-flash")

batch_size = 10
all_triples = []

# Fixed instructions placed at the top of every batch prompt.
PROMPT_HEADER = """
You are a helpful AI that extracts knowledge graph triples from scientific study titles and abstracts.

For each study below, extract the most important knowledge graph triples in the format:
[{"subject": "", "predicate": "", "object": ""}, ...]

Return a JSON object where each key is the study title, and the value is the list of triples.

Studies:
"""

# Keys every triple must carry; the graph-building cell indexes all three.
TRIPLE_KEYS = ("subject", "predicate", "object")


def _build_prompt(batch):
    """Return the extraction prompt for one batch of studies.

    Studies are numbered 1..len(batch) by their position in the batch. The
    DataFrame index is deliberately not used for numbering: `filtered_df` keeps
    the row labels of the original CSV, so it would produce large,
    non-consecutive numbers.

    Args:
        batch: DataFrame slice with 'Title' and 'Abstract' columns.

    Returns:
        The full prompt string (header followed by one entry per study).
    """
    entries = [
        f"\n{position}. Title: {title}\nAbstract: {abstract}\n"
        for position, (title, abstract) in enumerate(
            zip(batch['Title'], batch['Abstract']), start=1
        )
    ]
    return PROMPT_HEADER + "".join(entries)


def _clean_triple(raw):
    """Return `raw` as a dict of three non-empty strings, or None if unusable.

    A triple is usable when it is a dict whose 'subject', 'predicate' and
    'object' values are all scalars (str, int or float) that are non-empty
    after conversion to a stripped string. Any other keys are discarded.
    """
    if not isinstance(raw, dict):
        return None
    cleaned = {}
    for key in TRIPLE_KEYS:
        value = raw.get(key)
        if isinstance(value, bool) or not isinstance(value, (str, int, float)):
            return None
        value = str(value).strip()
        if not value:
            return None
        cleaned[key] = value
    return cleaned


def _parse_triples(text):
    """Extract the triples from one model response.

    The response is expected to contain a JSON object mapping study titles to
    lists of triples, possibly wrapped in extra text (e.g. a Markdown fence),
    so only the span from the first '{' to the last '}' is parsed.

    Args:
        text: Raw response text.

    Returns:
        (triples, skipped): the list of cleaned triples and the number of
        entries that were dropped because they were malformed.

    Raises:
        ValueError: if the text contains no '{' ... '}' span, or if that span
            is not valid JSON (json.JSONDecodeError is a ValueError subclass).
    """
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("response contains no JSON object")
    parsed = json.loads(text[start:end + 1])

    triples = []
    skipped = 0
    for candidates in parsed.values():
        # A non-list value (a bare dict or string) must not be iterated:
        # that would yield its keys or characters instead of triples.
        if not isinstance(candidates, list):
            skipped += 1
            continue
        for candidate in candidates:
            cleaned = _clean_triple(candidate)
            if cleaned is None:
                skipped += 1
            else:
                triples.append(cleaned)
    return triples, skipped


skipped_triples = 0

for i in range(0, len(filtered_df), batch_size):
    batch = filtered_df.iloc[i:i+batch_size]
    prompt = _build_prompt(batch)

    try:
        # A fresh chat per batch keeps every request independent of earlier ones.
        chat = model.start_chat()
        response = chat.send_message(prompt)
        batch_triples, batch_skipped = _parse_triples(response.text.strip())
    except Exception as e:
        # Best effort: report the failed batch (by row positions in filtered_df)
        # and carry on with the remaining ones.
        print(f"Error in batch {i}-{i+len(batch)}: {e}")
        continue

    all_triples.extend(batch_triples)
    skipped_triples += batch_skipped

if skipped_triples:
    print(f"Skipped {skipped_triples} malformed entries in model responses.")

# Save output
with open("triples_batched.json", "w", encoding="utf-8") as f:
    json.dump(all_triples, f, indent=2, ensure_ascii=False)

print(f"Extracted {len(all_triples)} triples from {len(filtered_df)} studies.")

✅ Extracted 319 triples from 53 studies.


Nodes: 484, Edges: 319


In [66]:
from pyvis.network import Network
import networkx as nx
import json

# Load triples. The file is written as UTF-8 with ensure_ascii=False, so it must be
# read back as UTF-8 explicitly; without `encoding`, open() uses the platform's
# locale encoding, which can fail or garble non-ASCII text.
with open("triples_batched.json", encoding="utf-8") as f:
    triples = json.load(f)

# Create graph: one directed edge per triple, subject -> object, labelled with the
# predicate. A multigraph keeps parallel edges, so several predicates between the
# same pair of nodes are all retained.
G = nx.MultiDiGraph()
for triple in triples:
    G.add_edge(triple["subject"], triple["object"], label=triple["predicate"])

# Render with PyVis into a standalone HTML file
net = Network(height="750px", width="100%", notebook=True, directed=True)
net.from_nx(G)
net.show("knowledge_graph.html")

knowledge_graph.html


test_graph.html
