In [1]:
import json
import os
from pathlib import Path

In [2]:
from config import Config, IS_CONFIG_VALID
from core.knowledge_graph import KnowledgeGraphBuilder
from core.rag_engine import RAGEngine
from ingestion.document_processor import DocumentProcessor
from ingestion.web_crawler import WebCrawler
from logger import logger

2025-10-23 21:47:58,773 - RAG_App - INFO - Loading configuration...
2025-10-23 21:47:58,775 - RAG_App - INFO - Configuration validated successfully.
2025-10-23 21:48:00,694 - faiss.loader - INFO - Loading faiss with AVX512 support.
2025-10-23 21:48:00,695 - faiss.loader - INFO - Could not load library with AVX512 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx512'")
2025-10-23 21:48:00,695 - faiss.loader - INFO - Loading faiss with AVX2 support.
2025-10-23 21:48:00,857 - faiss.loader - INFO - Successfully loaded faiss with AVX2 support.


  from .autonotebook import tqdm as notebook_tqdm


2025-10-23 21:48:34,132 - datasets - INFO - TensorFlow version 2.20.0 available.
2025-10-23 21:48:34,134 - datasets - INFO - JAX version 0.7.2 available.



In [3]:
def pjson(obj):
    """Pretty-print JSON to stdout."""
    print(json.dumps(obj, indent=2, ensure_ascii=False))

In [4]:
if not IS_CONFIG_VALID:
    logger.error("CRITICAL: .env file is not configured correctly. Please check it.")
    raise RuntimeError("Invalid configuration")

logger.info("Configuration is valid.")
print(f"LLM Provider : {Config.LLM_PROVIDER}")
print(f"LLM Model    : {Config.LLM_MODEL}")
print(f"Vector Store : {Config.VECTOR_STORE_TYPE}")

rag = RAGEngine()

2025-10-23 21:48:49,790 - RAG_App - INFO - Configuration is valid.
LLM Provider : featherless-ai
LLM Model    : inclusionAI/Ling-1T
Vector Store : chroma
2025-10-23 21:48:49,791 - RAG_App - INFO - Initializing RAGEngine...
2025-10-23 21:48:49,792 - RAG_App - INFO - Initializing ChromaVectorStore at ./chroma_db_store
2025-10-23 21:48:49,820 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


2025-10-23 21:48:50.177 
  command:

    streamlit run C:\Users\Wizard\AppData\Roaming\Python\Python311\site-packages\ipykernel_launcher.py [ARGUMENTS]


2025-10-23 21:48:50,180 - RAG_App - INFO - Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
2025-10-23 21:48:50,199 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device_name: cuda:0
2025-10-23 21:48:50,200 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2




2025-10-23 21:48:56,384 - RAG_App - INFO - ChromaVectorStore loaded on initialization.
2025-10-23 21:48:56,385 - RAG_App - INFO - Vector store loaded successfully.
2025-10-23 21:48:56,386 - RAG_App - INFO - RAGEngine initialized with provider featherless-ai and model inclusionAI/Ling-1T


In [5]:
logger.warning("Clearing vector store for a clean test…")
rag.clear_vector_store()
print("Vector store cleared.\n")

2025-10-23 21:49:10,030 - RAG_App - ERROR - Error resetting chroma database: Reset is disabled by config
2025-10-23 21:49:10,031 - RAG_App - INFO - Vector store cleared and re-initialized.
Vector store cleared.



In [7]:
mock_files = [
    {
        "name": "test_paris.txt",
        "type": "text/plain",
        "data": b"The capital of France is Paris. Paris is known for the Eiffel Tower, "
                b"the Louvre Museum, and its beautiful cafes. It is a major center for "
                b"art and culture.",
    },
    {
        "name": "test_berlin.txt",
        "type": "text/plain",
        "data": b"Berlin is the capital of Germany. It is famous for the Brandenburg Gate "
                b"and the remains of the Berlin Wall. It has a vibrant nightlife and tech scene.",
    },
]

processor = DocumentProcessor()
docs = processor.process_uploaded_files(mock_files)
rag.add_documents(docs)
print(f"DocumentProcessor created {len(docs)} chunks from {len(mock_files)} files.\n")

stats = rag.get_vector_store_stats()
print("--- Vector-store stats after document ingestion ---")
pjson(stats)

2025-10-23 21:50:10,324 - RAG_App - INFO - DocumentProcessor initialized.
2025-10-23 21:50:10,324 - RAG_App - INFO - Processing file: test_paris.txt
2025-10-23 21:50:10,325 - RAG_App - INFO - Successfully processed test_paris.txt, created 1 chunks.
2025-10-23 21:50:10,325 - RAG_App - INFO - Processing file: test_berlin.txt
2025-10-23 21:50:10,326 - RAG_App - INFO - Successfully processed test_berlin.txt, created 1 chunks.
2025-10-23 21:50:10,326 - RAG_App - INFO - Adding 2 documents to Chroma...


Batches: 100%|██████████| 1/1 [00:00<00:00,  1.81it/s]

2025-10-23 21:50:10,953 - RAG_App - INFO - Document addition to Chroma complete.
2025-10-23 21:50:10,954 - RAG_App - INFO - ChromaVectorStore is persistent. No explicit save needed.
DocumentProcessor created 2 chunks from 2 files.

--- Vector-store stats after document ingestion ---
{
  "total_documents": 6,
  "index_size": 6,
  "dimension": 384,
  "model": "sentence-transformers/all-MiniLM-L6-v2",
  "type": "Chroma"
}





In [10]:
crawler = WebCrawler()
seed_urls = ["https://en.wikipedia.org/wiki/Bread"]
crawled = crawler.crawl_root_urls(
    seed_urls,
    context="baking, history, flour",
    max_depth=1,
    max_pages_per_url=2,   # <-- added
)

print(f"\nWebCrawler found {len(crawled)} relevant pages.")
if crawled:
    rag.add_documents(crawled)
    stats = rag.get_vector_store_stats()
    print("--- Vector-store stats after web crawl ---")
    pjson(stats)

2025-10-23 21:51:39,949 - RAG_App - INFO - Starting crawl 1/1 for root URL: https://en.wikipedia.org/wiki/Bread
2025-10-23 21:51:39,950 - RAG_App - INFO - Crawling (Depth 0): https://en.wikipedia.org/wiki/Bread
2025-10-23 21:51:42,484 - RAG_App - INFO - Crawling (Depth 1): https://en.wikipedia.org/wiki/American_Jewish_cuisine
2025-10-23 21:51:43,246 - RAG_App - INFO - Crawl complete. Fetched 2 total pages.

WebCrawler found 2 relevant pages.
2025-10-23 21:51:43,246 - RAG_App - INFO - Adding 2 documents to Chroma...


Batches: 100%|██████████| 1/1 [00:00<00:00, 38.87it/s]

2025-10-23 21:51:43,288 - RAG_App - INFO - Document addition to Chroma complete.
2025-10-23 21:51:43,289 - RAG_App - INFO - ChromaVectorStore is persistent. No explicit save needed.
--- Vector-store stats after web crawl ---
{
  "total_documents": 8,
  "index_size": 8,
  "dimension": 384,
  "model": "sentence-transformers/all-MiniLM-L6-v2",
  "type": "Chroma"
}





In [11]:
query = "What is the capital of France?"
retrieved = rag.retrieve_relevant_documents(query, k=3)
print(f"\nRetrieval test for: '{query}'")
pjson(retrieved)

Batches: 100%|██████████| 1/1 [00:00<00:00, 23.40it/s]


Retrieval test for: 'What is the capital of France?'
[
  {
    "document": "The capital of France is Paris. Paris is known for the Eiffel Tower, the Louvre Museum, and its beautiful cafes. It is a major center for art and culture.",
    "metadata": {
      "filename": "test_paris.txt",
      "source": "upload",
      "chunk_id": 0,
      "file_type": "text/plain"
    },
    "score": 0.689079999923706
  },
  {
    "document": "The capital of France is Paris. Paris is known for the Eiffel Tower, the Louvre Museum, and its beautiful cafes. It is a major center for art and culture.",
    "metadata": {
      "source": "upload",
      "filename": "test_paris.txt",
      "chunk_id": 0,
      "file_type": "text/plain"
    },
    "score": 0.689079999923706
  },
  {
    "document": "Berlin is the capital of Germany. It is famous for the Brandenburg Gate and the remains of the Berlin Wall. It has a vibrant nightlife and tech scene.",
    "metadata": {
      "file_type": "text/plain",
      "sour




In [12]:
print("\n=== Generation tests ===")

q1 = "What is Paris known for?"
print(f"\nQ: {q1}")
pjson(rag.generate_response(q1))

q2 = "What is bread?"
print(f"\nQ: {q2}")
pjson(rag.generate_response(q2))


=== Generation tests ===

Q: What is Paris known for?


Batches: 100%|██████████| 1/1 [00:00<00:00, 109.60it/s]

2025-10-23 23:18:41,023 - RAG_App - INFO - Generating LLM chat completion for 1 messages...





2025-10-23 23:18:46,158 - RAG_App - INFO - LLM response received.
{
  "answer": "Paris is known for the Eiffel Tower, the Louvre Museum, and its beautiful cafes. It is also a major center for art and culture.",
  "sources": [
    {
      "filename": "test_paris.txt",
      "source": "upload",
      "chunk_id": 0,
      "file_type": "text/plain"
    },
    {
      "file_type": "text/plain",
      "source": "upload",
      "chunk_id": 0,
      "filename": "test_paris.txt"
    },
    {
      "chunk_id": 0,
      "filename": "test_berlin.txt",
      "source": "upload",
      "file_type": "text/plain"
    }
  ],
  "confidence": 0.4087715148925781,
  "context_used": "The capital of France is Paris. Paris is known for the Eiffel Tower, the Louvre Museum, and its beautiful cafes. It is a major center for art and culture.\n\n---\n\nThe capital of France is Paris. Paris is known for the Eiffel Tower, the Louvre Museum, and its beautiful cafes. It is a major center for art and culture.\n\n---\n\n

Batches: 100%|██████████| 1/1 [00:00<00:00, 99.88it/s]

2025-10-23 23:18:46,176 - RAG_App - INFO - Generating LLM chat completion for 1 messages...





2025-10-23 23:18:49,717 - RAG_App - INFO - LLM response received.
{
  "answer": "Bread is a baked food product made from water, flour, and often yeast. It has been an important part of many cultures' diets throughout history.",
  "sources": [
    {
      "source": "web_crawl",
      "context": "baking, history, flour",
      "url": "https://en.wikipedia.org/wiki/Bread",
      "description": "",
      "title": "Bread - Wikipedia"
    },
    {
      "source": "web_crawl",
      "title": "Bread - Wikipedia",
      "url": "https://en.wikipedia.org/wiki/Bread",
      "description": "",
      "context": "baking, history, flour"
    },
    {
      "title": "American Jewish cuisine - Wikipedia",
      "description": "",
      "source": "web_crawl",
      "context": "baking, history, flour",
      "url": "https://en.wikipedia.org/wiki/American_Jewish_cuisine"
    }
  ],
  "confidence": 0.39863656759262084,
  "context_used": "Bread - Wikipedia Jump to content From Wikipedia, the free encyclopedi

In [13]:
print("\n=== Multi-turn chat ===")
history = []

turns = [
    "What is the capital of Germany?",
    "How many people live there?",
    "What is the Eiffel Tower?",
]

for turn in turns:
    print(f"\nHuman   : {turn}")
    reply = rag.chat_mode(turn, history)
    print(f"Assistant: {reply['answer']}")
    history.append({"human": turn, "assistant": reply["answer"]})


=== Multi-turn chat ===

Human   : What is the capital of Germany?


Batches: 100%|██████████| 1/1 [00:00<00:00, 124.66it/s]

2025-10-23 23:18:57,453 - RAG_App - INFO - Generating LLM chat completion for 1 messages...





2025-10-23 23:18:59,427 - RAG_App - INFO - LLM response received.
Assistant: Berlin

Human   : How many people live there?


Batches: 100%|██████████| 1/1 [00:00<00:00, 124.72it/s]

2025-10-23 23:18:59,442 - RAG_App - INFO - Generating LLM chat completion for 3 messages...





2025-10-23 23:19:01,767 - RAG_App - INFO - LLM response received.
Assistant: I do not have that information in my documents.

Human   : What is the Eiffel Tower?


Batches: 100%|██████████| 1/1 [00:00<00:00, 111.74it/s]

2025-10-23 23:19:01,782 - RAG_App - INFO - Generating LLM chat completion for 5 messages...





2025-10-23 23:19:05,060 - RAG_App - INFO - LLM response received.
Assistant: The Eiffel Tower is a landmark in Paris, the capital of France, known for its iconic iron lattice structure.


In [14]:
print("\n=== Knowledge Graph ===")
kg_builder = KnowledgeGraphBuilder()

all_docs = rag.get_all_documents_for_kg()
print(f"Building KG from {len(all_docs)} chunks…")

kg_stats = kg_builder.extract_entities_and_relationships(all_docs)
print("\n--- KG stats ---")
pjson(kg_stats)

if kg_stats.get("graph_nodes", 0):
    fig = kg_builder.visualize_graph_plotly()
    # Save interactive plot instead of trying to display in terminal
    out_file = Path("knowledge_graph.html")
    fig.write_html(out_file)
    print(f"Interactive graph saved → {out_file.resolve()}")
else:
    print("No nodes found for KG visualisation.")


=== Knowledge Graph ===
2025-10-23 23:19:31,800 - RAG_App - INFO - Retrieving all documents for KG build...
Building KG from 8 chunks…
2025-10-23 23:19:31,812 - RAG_App - INFO - Starting KG extraction from 8 documents...
2025-10-23 23:19:31,813 - RAG_App - INFO - Processing document 0/8 for KG...
2025-10-23 23:19:33,218 - RAG_App - INFO - KG build complete. Nodes: 165, Edges: 1

--- KG stats ---
{
  "entities_count": 165,
  "relationships_count": 2,
  "graph_nodes": 165,
  "graph_edges": 1
}
Interactive graph saved → C:\Users\Wizard\Documents\RAG\knowledge_graph.html


In [1]:
print("\nTest complete. "
      "Delete 'chroma_db_store' and 'logs' folders if you want a fresh start next run.")


Test complete. Delete 'chroma_db_store' and 'logs' folders if you want a fresh start next run.


In [None]:

import os
from huggingface_hub import InferenceClient


api_key = os.getenv("HF_API_TOKEN")
if not api_key:
    raise RuntimeError("Export HF_TOKEN=<your-hugging-face-token> first")


s= "explain yourself"

client = InferenceClient(
    provider="featherless-ai",
    api_key=api_key,
)


completion = client.chat.completions.create(
    model="inclusionAI/Ling-1T",
    messages=[{"role": "user", "content": s}],
    max_tokens=250,
    temperature=0.7,
)


answer = completion.choices[0].message.content
print(answer)

  from .autonotebook import tqdm as notebook_tqdm


I am an AI language model developed by Ant Group, named Bailing. Ant Group is a world-leading open internet platform that, through technological innovation, assists partners in providing inclusive and convenient digital life and digital financial services to consumers and small and micro enterprises]]: enterprises. It continuously opens its products and technologies to help enterprises with digital upgrades and collaboration.

**Key points about me:**

1.  **Purpose:** My primary function is to assist you by providing information, answering questions, generating text, translating languages, summarizing content, and engaging in helpful, respectful conversation.
2.  **Knowledge Base:** I was trained on a massive dataset of text and code from the internet, books, articles, and other sources, up until my last update (currently around July 2024). This allows me to understand and generate human-like text across a wide range of topics and domains.
3.  **Capabilities:**
    *   **Answering Que