In [1]:
!pip install -q streamlit pyngrok PyPDF2 pyyaml networkx pandas numpy

print("✅ Packages installed!")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Packages installed!


In [3]:
from pyngrok import ngrok
import getpass

# Get your ngrok token from: https://dashboard.ngrok.com/get-started/your-authtoken
# It's free! Just sign up and copy your token
ngrok_token = getpass.getpass("Enter your ngrok auth token: ")
ngrok.set_auth_token(ngrok_token)

print("✅ ngrok configured!")

Enter your ngrok auth token: ··········
✅ ngrok configured!


In [4]:
import os

# Create app directory
APP_DIR = "streamlit_app"
os.makedirs(APP_DIR, exist_ok=True)

# Write the complete app.py file
app_code = '''import streamlit as st
import time
import json
from pathlib import Path

st.set_page_config(page_title="Graph-RAG Demo", layout="wide")
st.title("📚 Graph-RAG Question Answering System")
st.markdown("Upload a PDF or use existing data to answer questions with citations and evidence.")

def extract_text_from_pdf(pdf_file):
    """Extract text from uploaded PDF file"""
    try:
        import PyPDF2
        from io import BytesIO

        pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_file.read()))
        documents = []

        for i, page in enumerate(pdf_reader.pages):
            text = page.extract_text()
            if text.strip():
                documents.append({
                    "id": f"page_{i+1}",
                    "text": text,
                    "page_num": i+1
                })

        return documents
    except Exception as e:
        st.error(f"Error reading PDF: {str(e)}")
        return []

def extract_entities_from_text(documents):
    """Simple entity extraction from text"""
    entities = []
    entity_types = {
        "dataset": ["dataset", "corpus", "benchmark"],
        "method": ["method", "algorithm", "model", "approach"],
        "metric": ["accuracy", "f1", "precision", "recall", "score"],
        "paper": ["paper", "study", "article", "work"]
    }

    for doc in documents:
        text_lower = doc["text"].lower()
        words = text_lower.split()

        for i, word in enumerate(words):
            for ent_type, keywords in entity_types.items():
                if word in keywords and i + 1 < len(words):
                    entity_name = " ".join(words[i:min(i+3, len(words))])
                    entities.append({
                        "name": entity_name.title(),
                        "type": ent_type,
                        "source": doc["id"]
                    })

    unique_entities = []
    seen = set()
    for ent in entities:
        if ent["name"] not in seen:
            seen.add(ent["name"])
            unique_entities.append(ent)

    return unique_entities[:50]

def extract_relations_from_documents(documents, entities):
    """Simple relation extraction based on co-occurrence"""
    relations = []
    entity_names = [e["name"].lower() for e in entities]

    for doc in documents:
        text_lower = doc["text"].lower()
        appearing_entities = [e for e in entity_names if e in text_lower]

        for i, ent1 in enumerate(appearing_entities):
            for ent2 in appearing_entities[i+1:]:
                relations.append({
                    "head": ent1.title(),
                    "relation": "co_occurs_with",
                    "tail": ent2.title(),
                    "source": doc["id"]
                })

    return relations[:100]

@st.cache_data
def process_pdf_data(_pdf_file):
    """Process uploaded PDF and extract structured data"""
    documents = extract_text_from_pdf(_pdf_file)
    if not documents:
        return None
    entities = extract_entities_from_text(documents)
    relations = extract_relations_from_documents(documents, entities)
    return {"documents": documents, "entities": entities, "relations": relations}

def simple_entity_extraction(query, entities):
    """Extract entities mentioned in the query"""
    found_entities = []
    query_lower = query.lower()
    for entity in entities:
        if entity["name"].lower() in query_lower:
            found_entities.append(entity["name"])

    if not found_entities:
        for entity in entities:
            entity_words = entity["name"].lower().split()
            query_words = query_lower.split()
            if any(word in query_words for word in entity_words):
                found_entities.append(entity["name"])

    return found_entities[:5]

def find_neighbors(entity_name, relations):
    """Find related entities in the knowledge graph"""
    neighbors = []
    entity_lower = entity_name.lower()

    for rel in relations:
        if rel["head"].lower() == entity_lower:
            neighbors.append((rel["head"], rel["relation"], rel["tail"]))
        elif rel["tail"].lower() == entity_lower:
            neighbors.append((rel["tail"], rel["relation"] + "_inv", rel["head"]))

    return neighbors[:10]

def retrieve_evidence(entities, documents, query, top_k=3):
    """Retrieve relevant document spans"""
    evidence = []
    query_words = set(query.lower().split())

    for doc in documents:
        doc_text_lower = doc["text"].lower()
        relevance_score = 0

        for entity in entities:
            if entity.lower() in doc_text_lower:
                relevance_score += 2

        doc_words = set(doc_text_lower.split())
        overlap = len(query_words & doc_words)
        relevance_score += overlap

        if relevance_score > 0:
            snippet = doc["text"][:500] + "..." if len(doc["text"]) > 500 else doc["text"]
            evidence.append({
                "doc_id": doc["id"],
                "text": snippet,
                "full_text": doc["text"],
                "relevance": relevance_score / (len(query_words) + len(entities) + 1),
                "page_num": doc.get("page_num", "N/A")
            })

    evidence.sort(key=lambda x: x["relevance"], reverse=True)
    return evidence[:top_k]

def graph_rag_query(query, data, top_k=3):
    """Main Graph-RAG function"""
    start_time = time.time()
    seed_entities = simple_entity_extraction(query, data["entities"])

    if not seed_entities:
        seed_entities = [data["entities"][0]["name"]] if data["entities"] else []

    all_neighbors = []
    expanded_entities = set(seed_entities)
    for entity in seed_entities:
        neighbors = find_neighbors(entity, data["relations"])
        all_neighbors.extend(neighbors)
        for _, _, tail in neighbors:
            expanded_entities.add(tail)

    evidence = retrieve_evidence(list(expanded_entities), data["documents"], query, top_k)

    if evidence:
        answer = f"Based on the documents, here's what I found about your query: "
        if seed_entities:
            answer += f"{seed_entities[0]} appears in {len(evidence)} document(s). "
        if all_neighbors:
            related = [tail for _, _, tail in all_neighbors[:3]]
            answer += f"Related concepts include: {', '.join(related)}."
        else:
            answer += "See the evidence below for more details."
    else:
        answer = "I couldn't find specific information about this query in the documents."

    latency = time.time() - start_time

    return {
        "answer": answer,
        "evidence": evidence,
        "seed_entities": seed_entities,
        "neighbors": all_neighbors,
        "trace": [
            ("seed_extraction", f"{len(seed_entities)} entities"),
            ("graph_expansion", f"{len(all_neighbors)} neighbors"),
            ("retrieval", f"{len(evidence)} documents"),
            ("synthesis", "completed")
        ],
        "latency": latency,
        "tokens_in": len(query.split()) * 1.3,
        "tokens_out": len(answer.split()) * 1.3
    }

if 'data' not in st.session_state:
    st.session_state.data = None

with st.sidebar:
    st.header("⚙️ Settings")
    st.subheader("📄 Data Source")
    data_source = st.radio("Choose data source:", ["Upload PDF", "Use Demo Data"])

    if data_source == "Upload PDF":
        uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])
        if uploaded_file is not None:
            if st.button("Process PDF"):
                with st.spinner("Processing PDF..."):
                    st.session_state.data = process_pdf_data(uploaded_file)
                    if st.session_state.data:
                        st.success("✅ PDF processed successfully!")
                    else:
                        st.error("❌ Failed to process PDF")
    else:
        if st.session_state.data is None:
            demo_data = {
                "documents": [
                    {"id": "doc1", "text": "Method X was compared on Dataset D1 achieving an F1 score of 0.78.", "page_num": 1},
                    {"id": "doc2", "text": "Dataset D1 contains 10,000 samples for classification tasks.", "page_num": 2},
                    {"id": "doc3", "text": "The baseline method achieved 0.65 F1 on Dataset D1.", "page_num": 3},
                    {"id": "doc4", "text": "Paper P3 introduces Method X and applies it to Dataset D2 with accuracy of 0.82.", "page_num": 4}
                ],
                "entities": [
                    {"name": "Method X", "type": "method"},
                    {"name": "Dataset D1", "type": "dataset"},
                    {"name": "Dataset D2", "type": "dataset"},
                    {"name": "Paper P3", "type": "paper"}
                ],
                "relations": [
                    {"head": "Method X", "relation": "evaluated_on", "tail": "Dataset D1"},
                    {"head": "Method X", "relation": "evaluated_on", "tail": "Dataset D2"},
                    {"head": "Method X", "relation": "reported_in", "tail": "Paper P3"}
                ]
            }
            st.session_state.data = demo_data

    st.markdown("---")

    if st.session_state.data:
        st.subheader("Retrieval Parameters")
        top_k = st.slider("Top-K passages", min_value=1, max_value=10, value=3)
        st.subheader("Visualization")
        show_graph = st.checkbox("Show Knowledge Graph Neighbors", value=True)
        show_trace = st.checkbox("Show Reasoning Trace", value=True)
        show_stats = st.checkbox("Show Performance Stats", value=True)
        st.markdown("---")
        st.caption("📊 Dataset Info")
        st.caption(f"Documents: {len(st.session_state.data['documents'])}")
        st.caption(f"Entities: {len(st.session_state.data['entities'])}")
        st.caption(f"Relations: {len(st.session_state.data['relations'])}")

if st.session_state.data is None:
    st.info("👈 Please upload a PDF or select demo data from the sidebar to get started.")
else:
    st.subheader("🔍 Ask Your Question")
    query = st.text_area("Enter your question:", value="What methods were evaluated?", height=100)

    if st.button("🚀 Run Graph-RAG", type="primary"):
        if query.strip():
            with st.spinner("Processing query..."):
                result = graph_rag_query(query, st.session_state.data, top_k=top_k)

            st.markdown("---")
            st.subheader("💡 Answer")
            st.success(result["answer"])

            st.subheader("📄 Supporting Evidence")
            if result["evidence"]:
                for i, ev in enumerate(result["evidence"], 1):
                    with st.expander(f"Evidence {i}: {ev['doc_id']} (Page {ev['page_num']})", expanded=True):
                        st.markdown(f"**Text:** {ev['text']}")
                        st.caption(f"Relevance: {ev['relevance']:.2f}")
            else:
                st.info("No evidence found for this query.")

            if show_graph and result["neighbors"]:
                st.subheader("🕸️ Knowledge Graph Neighbors")
                for head, rel, tail in result["neighbors"][:10]:
                    st.markdown(f"- `{head}` **{rel}** → `{tail}`")

            if show_trace:
                st.subheader("🔄 Reasoning Trace")
                for step, info in result["trace"]:
                    st.markdown(f"- **{step}**: {info}")

            if show_stats:
                st.subheader("📊 Performance Statistics")
                col1, col2, col3 = st.columns(3)
                with col1:
                    st.metric("Latency", f"{result['latency']:.3f}s")
                with col2:
                    st.metric("Tokens In", f"{int(result['tokens_in'])}")
                with col3:
                    st.metric("Tokens Out", f"{int(result['tokens_out'])}")
        else:
            st.warning("Please enter a question first!")

st.markdown("---")
st.caption("Built with Streamlit • Graph-RAG Implementation Demo")
'''

# Write app.py
with open(os.path.join(APP_DIR, "app.py"), "w", encoding="utf-8") as f:
    f.write(app_code)

print(f"✅ App file created in {APP_DIR}/")
print(f"📁 File location: {os.path.abspath(APP_DIR)}/app.py")

✅ App file created in streamlit_app/
📁 File location: /content/streamlit_app/app.py


In [5]:
import subprocess
import threading

def run_streamlit():
    """Run streamlit in the background"""
    subprocess.run([
        "streamlit", "run",
        f"{APP_DIR}/app.py",
        "--server.port", "8501",
        "--server.headless", "true",
        "--server.enableXsrfProtection", "false",
        "--server.enableCORS", "false"
    ])

# Start streamlit in background thread
thread = threading.Thread(target=run_streamlit, daemon=True)
thread.start()

# Wait for streamlit to start
import time
time.sleep(5)

# Create public tunnel
public_url = ngrok.connect(8501, "http")
print("\n" + "="*60)
print("🚀 YOUR STREAMLIT APP IS RUNNING!")
print("="*60)
print(f"\n🌐 Public URL: {public_url}")
print(f"\n👆 Click the link above to access your app!")
print("\n" + "="*60)
print("\n⚠️  IMPORTANT: Keep this cell running!")
print("   If you stop it, the app will stop too.")
print("="*60)

# Keep the tunnel alive
try:
    import time
    while True:
        time.sleep(1)
except KeyboardInterrupt:
    print("\n\n🛑 Stopping app...")
    ngrok.disconnect(public_url)


🚀 YOUR STREAMLIT APP IS RUNNING!

🌐 Public URL: NgrokTunnel: "https://invertible-apsidally-pamella.ngrok-free.dev" -> "http://localhost:8501"

👆 Click the link above to access your app!


⚠️  IMPORTANT: Keep this cell running!
   If you stop it, the app will stop too.






🛑 Stopping app...


PyngrokNgrokURLError: ngrok client exception, URLError: [Errno 111] Connection refused

In [6]:
# Save environment to JSON file
import subprocess
import json
import sys

def save_environment_to_json(filename="env_rag_adv.json"):
    result = subprocess.run([sys.executable, '-m', 'pip', 'list', '--format=json'],
                          capture_output=True, text=True, check=True)
    packages = json.loads(result.stdout)

    env_info = {
        "python_version": sys.version,
        "platform": sys.platform,
        "packages": packages
    }

    with open(filename, 'w') as f:
        json.dump(env_info, f, indent=2)

    print(f"Environment saved to {filename}")

save_environment_to_json("env_rag_adv.json")

Environment saved to env_rag_adv.json
