# Import Required Libraries
Import libraries for PDF parsing, NLP, knowledge graph construction, and visualization.

In [54]:
!uv pip install PyPDF2 spacy networkx matplotlib scipy openai python-dotenv

import PyPDF2
import spacy
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict
import scipy
from pyvis.network import Network
import tempfile
import os
import openai
from dotenv import load_dotenv
import re

[2mAudited [1m7 packages[0m [2min 56ms[0m[0m


In [55]:
# Download spaCy model only if not already installed
import spacy.util
model_name = 'en_core_web_sm'
if not spacy.util.is_package(model_name):
    import subprocess
    subprocess.run(['python', '-m', 'spacy', 'download', model_name], check=True)

# Parse PDF to Extract Text
Load a research paper PDF and extract its text content using PyPDF2.

In [None]:
# Extract text from PDF
pdf_path = 'pdfs/mooc1.pdf'  # Change to your PDF file path
text = ''
with open(pdf_path, 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    for page in reader.pages:
        text += page.extract_text() or ''

print(text[:1000])  # Preview the first 1000 characters

ORIGINAL RESEARCH
published: 14 September 2021
doi: 10.3389/fpsyg.2021.751492
Frontiers in Psychology | www.frontiersin.org 1 September 2021 | Volume 12 | Article 751492Editedby:
XiyingLi,
ShaanxiNormalUniversity,China
Reviewedby:
YanDong,
BeijingNormalUniversity,China
ZhiqiangMa,
JiangnanUniversity,China
XiaofeiQi,
DurhamUniversity,UnitedKingdom
*Correspondence:
JunyiLi
junyili@sicnu.edu.cn
XuechenDing
dingxuechen_psy@163.com
Specialtysection:
Thisarticlewassubmittedto
EducationalPsychology,
asectionofthejournal
FrontiersinPsychology
Received: 01August2021
Accepted: 11August2021
Published: 14September2021
Citation:
WuC,LiJ,ZhangY,LanC,ZhouK,
WangY,LuLandDingX(2021)Can
MOOCInstructorBePortrayedby
SemanticFeatures?UsingDiscourse
andClusteringAnalysistoIdentify
Lecture-StyleofInstructorsin
MOOCs.Front.Psychol.12:751492.
doi:10.3389/fpsyg.2021.751492Can MOOC Instructor Be Portrayed
by Semantic Features? Using
Discourse and Clustering Analysis to
Identify Lecture-Style of Instructors
in MO

# Extract Concepts from Text
Use spaCy to identify and extract key concepts (noun phrases and named entities) from the extracted text.

In [57]:
# Extract concepts using spaCy with improved filtering for high-level concepts
import string
from collections import Counter
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# Exclude these entity types
EXCLUDE_ENT_TYPES = {"PERSON", "DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"}

# Helper function to check if a phrase is a high-level concept
def is_high_level(phrase, ent_type=None):
    words = [w for w in phrase.split() if w.lower() not in STOP_WORDS and w not in string.punctuation]
    # At least 2 words, not all stopwords, not too short
    if len(words) < 2 or len(phrase) <= 6:
        return False
    if ent_type and ent_type in EXCLUDE_ENT_TYPES:
        return False
    return True

# Extract noun phrases and named entities (excluding irrelevant types)
candidates = []
for np in doc.noun_chunks:
    if is_high_level(np.text.strip()):
        candidates.append(np.text.strip())
for ent in doc.ents:
    if is_high_level(ent.text.strip(), ent.label_):
        candidates.append(ent.text.strip())

# Count frequency and keep only those that appear more than once
concept_counts = Counter(candidates)
concepts = set([c for c, count in concept_counts.items() if count > 1])

print(f"Extracted {len(concepts)} high-level concepts. Sample:", list(concepts)[:10])

Extracted 87 high-level concepts. Sample: ['deep learning models', 'C Shan Xu', 'Rand Init', 'Matija Marolt', 'transfer learning', 'Song Pang', 'an additional test', 'Toronto General\nResearch Institute', 'Automatic segmentation', 'Constantin Pape']


# Build Knowledge Graph
Construct a knowledge graph where nodes are concepts and edges represent co-occurrence within the same sentence.

In [58]:
import itertools
from collections import defaultdict

graph = nx.Graph()
graph.add_nodes_from(concepts)

edge_contexts = defaultdict(list)
for sent in doc.sents:
    sent_concepts = set()
    for np in sent.noun_chunks:
        if np.text.strip() in concepts:
            sent_concepts.add(np.text.strip())
    for ent in sent.ents:
        if ent.text.strip() in concepts:
            sent_concepts.add(ent.text.strip())
    # Add edges between all pairs of concepts in the sentence
    for c1, c2 in itertools.combinations(sent_concepts, 2):
        graph.add_edge(c1, c2)
        edge_contexts[(c1, c2)].append(sent.text.strip())
# Add context as edge attribute (join multiple sentences if needed)
for (c1, c2), contexts in edge_contexts.items():
    if graph.has_edge(c1, c2):
        graph[c1][c2]['context'] = '\n---\n'.join(contexts)

print(f"Graph has {graph.number_of_nodes()} nodes and {graph.number_of_edges()} edges.")

Graph has 87 nodes and 118 edges.


# Visualize Knowledge Graph
Visualize the constructed knowledge graph using NetworkX and Matplotlib (non-interactive visualization).

In [59]:
# Visualize the knowledge graph interactively with PyVis (save HTML only, improved readability)
from pyvis.network import Network

# Create a PyVis network with larger node and font sizes
net = Network(height='800px', width='100%', notebook=True, bgcolor='#ffffff', font_color='black')
net.barnes_hut()

# Add nodes and edges with larger size and font
for node in graph.nodes():
    net.add_node(node, label=node, size=30, font={"size": 28})
# Add edges with context as tooltip
for source, target, data in graph.edges(data=True):
    tooltip = data.get('context', '')
    net.add_edge(source, target, title=tooltip)

# Set physics options for a tighter layout and better initial zoom
net.set_options('''
var options = {
  "nodes": {
    "font": {"size": 28},
    "size": 30
  },
  "edges": {
    "color": {"inherit": true},
    "smooth": false
  },
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -20000,
      "centralGravity": 0.5,
      "springLength": 150,
      "springConstant": 0.04,
      "damping": 0.09,
      "avoidOverlap": 1
    },
    "minVelocity": 0.75
  }
}
''')

# Save to HTML file (you can open this file outside Jupyter)
output_path = 'knowledge_graph.html'
net.show(output_path)
print(f"Interactive knowledge graph saved to {output_path}")

knowledge_graph.html
Interactive knowledge graph saved to knowledge_graph.html


# LLM-Based Concept and Relationship Extraction for Knowledge Graphs
This workflow uses the OpenAI API to extract high-level concepts and their relationships from research text, then builds and visualizes a labeled knowledge graph.

In [60]:
# Load OpenAI API key from .env
load_dotenv()
OPENAI_KEY = os.getenv("OPENAI_KEY")

# Use the new OpenAI API client (openai>=1.0.0)
client = openai.Client(api_key=OPENAI_KEY)

# Function to chunk text into manageable pieces
def chunk_text(text, max_tokens=1500):
    paragraphs = text.split('\n')
    chunks = []
    current = ''
    for para in paragraphs:
        if len(current) + len(para) < max_tokens:
            current += para + '\n'
        else:
            chunks.append(current)
            current = para + '\n'
    if current:
        chunks.append(current)
    return chunks

# Prompt for extracting concept triples
PROMPT = (
    "Extract a list of the most important technical concepts and their relationships from the following text. "
    "Return as a list of triples in the format (concept1, relationship, concept2). "
    "Focus on high-level, domain-relevant concepts and meaningful relationships.\n\nText:\n"
)

def extract_triples_from_text(text):
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": PROMPT + text}],
        max_tokens=800,
        temperature=0.2
    )
    content = response.choices[0].message.content
    # Extract triples using regex
    triples = re.findall(r'\(([^,]+),\s*([^,]+),\s*([^\)]+)\)', content)
    return [tuple(map(str.strip, t)) for t in triples]

# Chunk the text and extract triples
chunks = chunk_text(text, max_tokens=1500)
all_triples = []
for chunk in chunks:
    triples = extract_triples_from_text(chunk)
    all_triples.extend(triples)

print(f"Extracted {len(all_triples)} concept-relationship triples. Sample:", all_triples[:5])

Extracted 283 concept-relationship triples. Sample: [("'Volumetric electron microscopy'", "'enables'", "'nanoscale resolution three-dimensional imaging'"), ("'nanoscale resolution three-dimensional imaging'", "'used for'", "'biological samples'"), ("'Identification and labeling'", "'required for'", "'image interpretation'"), ("'manual labeling'", "'is'", "'time-consuming'"), ("'deep learning segmentation algorithms'", "'automate'", "'labeling'")]


In [61]:
# Build a labeled knowledge graph from triples
G = nx.DiGraph()
for c1, rel, c2 in all_triples:
    G.add_node(c1)
    G.add_node(c2)
    G.add_edge(c1, c2, label=rel)
print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

Graph has 435 nodes and 280 edges.


In [62]:
# Cluster nodes using Louvain community detection (on undirected version)
import community as community_louvain
partition = community_louvain.best_partition(G.to_undirected()) if G.number_of_edges() > 0 else {}
for node, cluster in partition.items():
    G.nodes[node]['cluster'] = cluster

AttributeError: module 'community' has no attribute 'best_partition'

In [None]:
# Visualize the labeled knowledge graph interactively with PyVis (edge labels, clusters)
from pyvis.network import Network
import random

net = Network(height='900px', width='100%', notebook=True, bgcolor='#ffffff', font_color='black', directed=True)
net.barnes_hut()

cluster_colors = {}
for node in G.nodes():
    cluster = G.nodes[node].get('cluster', 0)
    if cluster not in cluster_colors:
        cluster_colors[cluster] = f"#{random.randint(0, 0xFFFFFF):06x}"
    net.add_node(node, label=node, size=30, font={"size": 28}, color=cluster_colors[cluster])
for source, target, data in G.edges(data=True):
    label = data.get('label', '')
    net.add_edge(source, target, title=label, label=label)

net.set_options('''
var options = {
  "nodes": {
    "font": {"size": 28},
    "size": 30
  },
  "edges": {
    "color": {"inherit": true},
    "smooth": false,
    "arrows": {"to": {"enabled": true}}
  },
  "physics": {
    "barnesHut": {
      "gravitationalConstant": -20000,
      "centralGravity": 0.5,
      "springLength": 150,
      "springConstant": 0.04,
      "damping": 0.09,
      "avoidOverlap": 1
    },
    "minVelocity": 0.75
  }
}
''')

output_path = 'knowledge_graph.html'
net.show(output_path)
print(f"Interactive knowledge graph saved to {output_path}")