# Knowledge Graph RAG System

This notebook implements a Retrieval-Augmented Generation (RAG) system using a pre-built knowledge graph to answer questions about programming languages, frameworks, and technology concepts.

## Import Required Libraries



In [None]:
import json
import pickle
import re
from pathlib import Path
from typing import Dict, List

import networkx as nx
import openai
from dotenv import load_dotenv
import os

## Configure Environment Variables

In [None]:
# Read settings from a local .env file. override=True lets values from the
# file replace variables that are already set in the process environment.
load_dotenv(override=True)

# OpenAI SDK client pointed at the GitHub Models inference endpoint.
# GITHUB_TOKEN is mandatory: a missing variable raises KeyError right here.
client = openai.OpenAI(
    api_key=os.environ["GITHUB_TOKEN"],
    base_url="https://models.github.ai/inference",
)

# Model identifier; falls back to GPT-4o when GITHUB_MODEL is not set.
MODEL_NAME = os.environ.get("GITHUB_MODEL", "openai/gpt-4o")

print(f"✓ Model configured: {MODEL_NAME}")

## Load Serialized Knowledge Graph

Load the pre-built knowledge graph from disk for instant access. The graph contains entities and relationships about technology concepts.

In [None]:
# Location of the pickled graph, relative to the working directory.
graph_pickle_path = Path("data/knowledge_graph.pkl")

print("Loading serialized graph...")
# NOTE(security): unpickling can execute arbitrary code, so only load a
# pickle file that comes from a trusted source.
with graph_pickle_path.open('rb') as f:
    graph = pickle.load(f)

# The entity dictionary is stored in the graph-level attribute dict.
entities = graph.graph['entities']

# Print a short summary of what was loaded.
print(f"✓ Graph loaded:")
print(f"  - Entities: {graph.number_of_nodes()}")
print(f"  - Relationships: {graph.number_of_edges()}")
entity_types = {data['type'] for _, data in graph.nodes(data=True)}
print(f"  - Entity types: {entity_types}")

## Define Search and Context Functions

Implement entity search with scoring based on name, type, and properties. Also define functions to retrieve an entity's context (its incoming and outgoing relationships) and to format that context as text for the LLM.

In [None]:
def search_entities(query: str, top_k: int = 5) -> List[Dict]:
    """Rank entities by how many query tokens appear in their fields.

    The query is lowercased and split on non-word characters. Every token is
    tested as a substring of the lowercased entity fields and scored:

    * 10 points for each token found in the entity name,
    * 5 points for each token found in the entity type,
    * 3 points for each (token, property) pair where the token is found in
      the property's key or in the string form of its value.

    Reads the module-level ``entities`` mapping.

    Args:
        query: Free-text search string.
        top_k: Maximum number of entities to return.

    Returns:
        Up to ``top_k`` entity dicts with a positive score, highest score
        first. Entities with equal scores keep the iteration order of
        ``entities``. The list is empty when the query contains no tokens.
    """
    tokens = [piece for piece in re.split(r"\W+", query.lower()) if piece]
    if not tokens:
        return []

    def hits(text: str) -> int:
        """Count the tokens that occur as substrings of ``text``."""
        return sum(1 for token in tokens if token in text)

    scored = []
    for candidate in entities.values():
        total = 10 * hits(candidate['name'].lower())
        total += 5 * hits(candidate['type'].lower())

        for prop_key, prop_value in candidate.get('properties', {}).items():
            key_text = str(prop_key).lower()
            value_text = str(prop_value).lower()
            # A token counts once per property, whether it matched the key,
            # the value, or both.
            total += 3 * sum(
                1 for token in tokens
                if token in key_text or token in value_text
            )

        if total > 0:
            scored.append((total, candidate))

    # sorted() is stable, so ties stay in their original relative order.
    ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)
    return [candidate for _, candidate in ranked[:top_k]]


def get_entity_context(entity_id: str) -> Dict:
    """Collect an entity together with its direct graph neighbours.

    Reads the module-level ``entities`` mapping and ``graph`` object.

    Args:
        entity_id: Key of the entity in ``entities``; also used as the node
            identifier when querying ``graph``.

    Returns:
        ``None`` when ``entity_id`` is not a key of ``entities``. Otherwise a
        dict with two keys:

        * ``'entity'`` - the entity record from ``entities``.
        * ``'relationships'`` - a list holding all outgoing edges first,
          then all incoming edges. Each item has ``'type'`` (the edge's
          ``'rel_type'``), ``'direction'`` (``'outgoing'`` or
          ``'incoming'``), the neighbouring entity record under ``'target'``
          (outgoing) or ``'source'`` (incoming), and ``'properties'`` (the
          edge's ``'properties'``, or ``{}`` when absent).
    """
    if entity_id not in entities:
        return None

    def describe(edge, direction, endpoint_key, neighbor_id):
        """Build one relationship record from an edge attribute dict."""
        return {
            'type': edge['rel_type'],
            'direction': direction,
            endpoint_key: entities[neighbor_id],
            'properties': edge.get('properties', {}),
        }

    # NOTE(review): assumes get_edge_data returns a single attribute dict per
    # node pair (plain directed graph, not a multigraph) - confirm.
    relationships = [
        describe(graph.get_edge_data(entity_id, dst), 'outgoing', 'target', dst)
        for dst in graph.successors(entity_id)
    ]
    relationships.extend(
        describe(graph.get_edge_data(src, entity_id), 'incoming', 'source', src)
        for src in graph.predecessors(entity_id)
    )

    return {'entity': entities[entity_id], 'relationships': relationships}


def format_context_for_llm(contexts: List[Dict]) -> str:
    """Render entity contexts as an indented plain-text report.

    Args:
        contexts: Context dicts, each with an ``'entity'`` record (``'name'``,
            ``'type'`` and optional ``'properties'``) and a
            ``'relationships'`` list whose items carry ``'type'``,
            ``'direction'``, a ``'target'`` or ``'source'`` entity record,
            and optional ``'properties'``.

    Returns:
        A fixed "nothing found" sentence when ``contexts`` is empty.
        Otherwise a header followed by one numbered section per entity that
        lists its properties and relationships; every section ends with a
        blank line.
    """
    if not contexts:
        return "No relevant information found in the knowledge graph."

    # Collect the fragments and join once at the end.
    parts = ["Knowledge Graph Information:\n\n"]

    for index, context in enumerate(contexts, 1):
        subject = context['entity']
        parts.append(f"Entity #{index}: {subject['name']} ({subject['type']})\n")

        # Entity properties (section omitted when there are none).
        if subject.get('properties'):
            parts.append("  Properties:\n")
            parts.extend(
                f"    - {name}: {val}\n"
                for name, val in subject['properties'].items()
            )

        # Relationships (section omitted when there are none).
        if context['relationships']:
            parts.append("  Relationships:\n")
            for relation in context['relationships']:
                if relation['direction'] != 'outgoing':
                    # Anything that is not outgoing is rendered as incoming.
                    parts.append(
                        f"    - {relation['type']} <- {relation['source']['name']} ({relation['source']['type']})\n"
                    )
                else:
                    parts.append(
                        f"    - {relation['type']} -> {relation['target']['name']} ({relation['target']['type']})\n"
                    )

                # Edge properties are indented one level below the edge line.
                edge_props = relation.get('properties')
                if edge_props:
                    parts.extend(
                        f"      {name}: {val}\n"
                        for name, val in edge_props.items()
                    )

        parts.append("\n")

    return "".join(parts)

# Confirmation message shown once this cell's helper functions have been defined.
print("✓ Search functions defined")

## Define System Messages for LLM

In [None]:
# System prompt for the query-rewrite step: tells the model to reduce the
# user's question to a short keyword query for searching the knowledge graph.
QUERY_REWRITE_SYSTEM_MESSAGE = """
You are a helpful assistant that rewrites user questions into keyword queries
for searching a knowledge graph about technology, programming languages, frameworks, and concepts.

Extract the key entities, concepts, or topics the user is asking about.
Focus on specific names of languages, frameworks, libraries, organizations, or technical concepts.

Respond with ONLY the keyword query (2-6 words).
"""

# System prompt for the answer step: tells the model to base its answer on
# the knowledge-graph data supplied as context and to say so when the
# information is missing.
SYSTEM_MESSAGE = """
You are a helpful assistant that answers questions using a knowledge graph about
technology, programming languages, frameworks, libraries, and related concepts.

You must base your answers on the knowledge graph data provided in the context.
If the information is not in the knowledge graph, say so clearly.
Use the relationships and properties to provide comprehensive and accurate answers.
"""

## Question Answering Function

Define a function to process individual questions through the RAG pipeline: query rewriting, entity search, context retrieval, and answer generation.

In [None]:
def ask_question(question: str) -> str:
    """Answer one question through the knowledge-graph RAG pipeline.

    Steps:
        1. The model rewrites the question into a short keyword query.
        2. ``search_entities`` looks up the best-matching entities.
        3. ``get_entity_context`` expands each match with its relationships
           and ``format_context_for_llm`` renders the result as text.
        4. The model answers the question with that text appended as context.

    The rewritten query and the rendered context are printed for inspection.

    Args:
        question: The user's natural-language question.

    Returns:
        The content of the model's answer message.
    """
    # Step 1: rewrite the question into a keyword query.
    rewrite = client.chat.completions.create(
        model=MODEL_NAME,
        temperature=0.05,
        messages=[
            {"role": "system", "content": QUERY_REWRITE_SYSTEM_MESSAGE},
            {"role": "user", "content": f"New user question: {question}"},
        ],
    )
    # The message content can be None; fall back to an empty query so the
    # search below returns no matches instead of failing on None.
    search_query = (rewrite.choices[0].message.content or "").strip()

    # Steps 2-3: look up entities and expand each one exactly once.
    contexts = []
    for entity in search_entities(search_query, top_k=5):
        context = get_entity_context(entity['id'])
        if context is not None:
            contexts.append(context)

    # format_context_for_llm already produces the "nothing found" sentence
    # for an empty list, so it is the single source of the fallback text.
    graph_context = format_context_for_llm(contexts)

    print(f"\n[Search query]:\n{search_query}")
    print(f"\n[Context]:\n{graph_context}")

    # Step 4: answer the question using the graph context.
    answer = client.chat.completions.create(
        model=MODEL_NAME,
        temperature=0.3,
        messages=[
            {"role": "system", "content": SYSTEM_MESSAGE},
            {"role": "user", "content": f"{question}\n\nContext:\n{graph_context}"},
        ],
    )

    return answer.choices[0].message.content

## Example Prompts



In [None]:
# Example usage: Ask about Python's use cases and print the returned answer
answer = ask_question("What is Python used for?")
print(f"\n[Answer]:\n{answer}")

In [None]:
# Example usage: Ask for general information about GPT
answer = ask_question("Tell me about GPT")
print(f"\n[Answer]:\n{answer}")

In [None]:
# Example usage: Two-part question about frameworks built with JavaScript and what they are used for
answer = ask_question("What frameworks are built with JavaScript and what are they used for?")
print(f"\n[Answer]:\n{answer}")