# Import Required Libraries
Import libraries for PDF parsing, NLP, knowledge graph construction, and visualization.

In [3]:
!uv pip install PyPDF2 spacy networkx matplotlib scipy

import PyPDF2
import spacy
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
import scipy
from pyvis.network import Network
import tempfile
import os

[2mAudited [1m5 packages[0m [2min 34ms[0m[0m


In [4]:
# Download spaCy model only if not already installed
import spacy.util
model_name = 'en_core_web_sm'
if not spacy.util.is_package(model_name):
    import subprocess
    subprocess.run(['python', '-m', 'spacy', 'download', model_name], check=True)

# Parse PDF to Extract Text
Load a research paper PDF and extract its text content using PyPDF2.

In [5]:
# Extract text from PDF
pdf_path = 'pdfs/zhen1.pdf'  # Change to your PDF file path
text = ''
with open(pdf_path, 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    for page in reader.pages:
        text += page.extract_text() or ''

print(text[:1000])  # Preview the first 1000 characters

In Review, , pp. 1–8
doi:
Transfer learning improves performance in volumetric
electron microscopy organelle segmentation across
tissues
Ronald Xie,1,2,3,Ben Mulcahy,4Ali Darbandi,4Sagar Marwah,4Fez Ali,1Yuna Lee,1
Gunes Parlakgul,5Gokhan Hotamisligil,6,7Bo Wang,2,8,11,12,Sonya MacParland,8,9
Mei Zhen3,4and Gary D. Bader1,3,4,12,13
1The Donnelly Centre, University of Toronto, Toronto, Ontario, Canada,2Peter Munk Cardiac Centre and Joint Department of Medical
Imaging, University Health Network, Toronto, Canada,3Department of Molecular Genetics, University of Toronto, Toronto, Ontario,
Canada,4Lunenfeld-Tanenbaum Research Institute, Mount Sinai Hospital, Toronto, Ontario, Canada,5University of California, Berkeley,
Berkeley, CA, USA,6Sabri ¨Ulker Center of Metabolic Research and Department of Molecular Metabolism, Harvard T.H. Chan School of
Public Health, Boston, MA, USA,7Broad Institute of MIT and Harvard, Cambridge, MA, USA,8Department of Laboratory Medicine and
Pathobiology, Temerty 

# Extract Concepts from Text
Use spaCy to identify and extract key concepts (noun phrases and named entities) from the extracted text.

In [6]:
# Extract concepts using spaCy with improved filtering for high-level concepts
import string
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# Helper function to check if a phrase is high-level
def is_high_level(phrase):
    words = [w for w in phrase.split() if w.lower() not in STOP_WORDS and w not in string.punctuation]
    # At least 2 words, not all stopwords, not too short
    return len(words) >= 2 and len(phrase) > 6

# Extract noun phrases and named entities
candidates = []
for np in doc.noun_chunks:
    if is_high_level(np.text.strip()):
        candidates.append(np.text.strip())
for ent in doc.ents:
    if is_high_level(ent.text.strip()):
        candidates.append(ent.text.strip())

# Count frequency and keep only those that appear more than once
concept_counts = Counter(candidates)
concepts = set([c for c, count in concept_counts.items() if count > 1])

print(f"Extracted {len(concepts)} high-level concepts. Sample:", list(concepts)[:10])

Extracted 192 high-level concepts. Sample: ['University Health Network', 'Jody Clements', 'Klaus H Maier-Hein', 'Huxley K Hoffman', 'Jurgen AW Heymann', 'Rajeev Parvathala', 'Rand Init', 'mouse cortex', 'Toronto General\nResearch Institute', 'Peter Li']


# Build Knowledge Graph
Construct a knowledge graph where nodes are concepts and edges represent co-occurrence within the same sentence.

In [10]:
# Build knowledge graph from concept co-occurrence in sentences
graph = nx.Graph()
graph.add_nodes_from(concepts)

# Add edges based on co-occurrence in sentences
for sent in doc.sents:
    sent_concepts = set()
    for np in sent.noun_chunks:
        if np.text.strip() in concepts:
            sent_concepts.add(np.text.strip())
    for ent in sent.ents:
        if ent.text.strip() in concepts:
            sent_concepts.add(ent.text.strip())
    # Add edges between all pairs of concepts in the sentence
    for c1 in sent_concepts:
        for c2 in sent_concepts:
            if c1 != c2:
                graph.add_edge(c1, c2)

print(f"Graph has {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges.")

Graph has 192 nodes and 472 edges.


# Visualize Knowledge Graph
Visualize the constructed knowledge graph using NetworkX and Matplotlib (non-interactive visualization).

In [11]:
# Visualize the knowledge graph interactively with PyVis (save HTML only, improved readability)
from pyvis.network import Network

# Create a PyVis network with larger node and font sizes
net = Network(height='800px', width='100%', notebook=True, bgcolor='#ffffff', font_color='black')
net.barnes_hut()

# Add nodes and edges with larger size and font
for node in graph.nodes():
    net.add_node(node, label=node, size=30, font={"size": 28})
for source, target in graph.edges():
    net.add_edge(source, target)

# Set physics options for a tighter layout and better initial zoom
net.set_options('''
var options = {
  "nodes": {
    "font": {"size": 28},
    "size": 30
  },
  "edges": {
    "color": {"inherit": true},
    "smooth": false
  },
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -20000,
      "centralGravity": 0.5,
      "springLength": 150,
      "springConstant": 0.04,
      "damping": 0.09,
      "avoidOverlap": 1
    },
    "minVelocity": 0.75
  }
}
''')

# Save to HTML file (you can open this file outside Jupyter)
output_path = 'knowledge_graph.html'
net.show(output_path)
print(f"Interactive knowledge graph saved to {output_path}")

knowledge_graph.html
Interactive knowledge graph saved to knowledge_graph.html
