# Data Ingestion

In [1]:
import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import DirectoryLoader, TextLoader, WebBaseLoader

In [2]:
from dotenv import load_dotenv
import os
# Copy variables from a local .env file into the process environment, then
# read the User-Agent string that the scraping cell sends with its request.
# NOTE(review): langchain_community is imported in an earlier cell; if it reads
# USER_AGENT at import time, loading .env here may be too late for it - confirm.
load_dotenv()
USER_AGENT = os.environ.get("USER_AGENT")

## Rust Language Book

In [3]:
# Load every Markdown file found under training_data/book/src.
book_loader = DirectoryLoader(
    path="training_data/book/src",
    glob="**/*.md",
    loader_cls=TextLoader,
    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # so the same files load identically on every operating system.
    loader_kwargs={"encoding": "utf-8"},
)
# Sort by source path so the document order is deterministic across runs.
rust_lang_book = sorted(book_loader.load(), key=lambda x: x.metadata["source"])

## Rust By Example Book

In [4]:
# Load the files found under training_data/rust-by-example/.
# NOTE(review): the glob below matches every file, not only Markdown -
# confirm that the directory holds no binary or otherwise unwanted assets.
rbe_loader = DirectoryLoader(
    path="training_data/rust-by-example/",
    glob="**/*",
    loader_cls=TextLoader,
    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # so the same files load identically on every operating system.
    loader_kwargs={"encoding": "utf-8"},
)
# Sort by source path so the document order is deterministic across runs.
rbe_book = sorted(rbe_loader.load(), key=lambda x: x.metadata["source"])

## Cosmos SDK Docs

In [5]:
# Load every Markdown file found under training_data/cosmos-sdk/docs.
cosmossdk_loader = DirectoryLoader(
    path="training_data/cosmos-sdk/docs",
    glob="**/*.md",
    loader_cls=TextLoader,
    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # so the same files load identically on every operating system.
    loader_kwargs={"encoding": "utf-8"},
)
# Sort by source path so the document order is deterministic across runs.
cosmos_sdk_docs = sorted(cosmossdk_loader.load(), key=lambda x: x.metadata["source"])

## CosmWasm Docs

In [6]:
# Step 1: Get all doc links
from urllib.parse import urldefrag

base_url = "https://cosmwasm.cosmos.network"
docs_url = "https://cosmwasm.cosmos.network/core"

# A timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() prevents silently scraping links out of an error page.
response = requests.get(docs_url, headers={"User-Agent": USER_AGENT}, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

links = []          # unique page URLs, in first-seen order
seen_urls = set()   # constant-time membership check for de-duplication
for a in soup.find_all("a", href=True):
    # Drop any "#section" fragment: anchors into a page would otherwise be
    # collected as separate URLs and the same page would be loaded repeatedly.
    href, _fragment = urldefrag(a["href"])
    if href.startswith("/core") and href != "/core":
        full_url = base_url + href
        if full_url not in seen_urls:
            seen_urls.add(full_url)
            links.append(full_url)

# Step 2: Load all pages
pages = []
for link in links:
    cosmwasm_loader = WebBaseLoader(link)
    pages.extend(cosmwasm_loader.load())

# Step 3: Sort by source for consistency
cosmwasm_docs = sorted(pages, key=lambda x: x.metadata["source"])
# The misspelled name is kept as an alias because the splitting cell reads it.
cosmwasmm_docs = cosmwasm_docs

## cw-tpl Osmosis

In [7]:
# Load source and documentation files found under training_data/cw-tpl-osmosis.
# NOTE(review): the character classes in the glob match any two-letter
# extension built from [rsm] + [sdm] (".rs" and ".md", but also e.g. ".rd" or
# ".mm") - presumably only ".rs" and ".md" are intended; confirm.
osmosis_loader = DirectoryLoader(
    path="training_data/cw-tpl-osmosis",
    glob="**/*.[rsm][sdm]",
    loader_cls=TextLoader,
    # Decode explicitly as UTF-8 rather than the platform default encoding,
    # so the same files load identically on every operating system.
    loader_kwargs={"encoding": "utf-8"},
)
# Sort by source path so the document order is deterministic across runs.
cwtpl_osmosis = sorted(osmosis_loader.load(), key=lambda x: x.metadata["source"])

## Splitting into Chunks

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured for 1000-character chunks with a 200-character overlap,
# measuring length with the built-in len().
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)

# Every corpus loaded by the ingestion cells, in the order they are chunked.
corpora = (rust_lang_book, rbe_book, cosmos_sdk_docs, cosmwasmm_docs, cwtpl_osmosis)

# Flatten the per-corpus chunk lists into one list, rebuilt from scratch on
# every run of this cell.
all_docs = [
    chunk
    for corpus in corpora
    for chunk in splitter.split_documents(corpus)
]

In [9]:
# Report how many chunks the splitting cell produced.
total_chunks = len(all_docs)
print(f"Total split docs: {total_chunks}")

Total split docs: 6061


# Embedding, Indexing & Model Setup

In [10]:
# Read the OpenAI API key from the environment; the value is None when the
# variable is not set.
openai_key = os.environ.get("OPENAI_API_KEY")

In [11]:
# Create Embeddings
from langchain_openai import OpenAIEmbeddings

# Name of the OpenAI embedding model used for the embeddings object below.
EMBEDDING_MODEL = "text-embedding-3-small"

embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL, openai_api_key=openai_key)

In [12]:
# Store Embeddings in Vector Store
from langchain_chroma import Chroma

# 1. Open (or create) the persistent vector store in ./chroma_db
vector_store = Chroma(
    persist_directory="chroma_db",
    embedding_function=embeddings
)

# 2. Add documents in batches, but only when the store is still empty.
#    The store persists on disk, so ingesting unconditionally would re-embed
#    and append every chunk again each time this cell is re-run.
#    To rebuild after the corpus changes, delete the chroma_db directory first.
batch_size = 200  # presumably sized to keep each request small - confirm
store_is_empty = not vector_store.get(limit=1)["ids"]
if store_is_empty:
    for i in range(0, len(all_docs), batch_size):
        batch = all_docs[i:i+batch_size]
        vector_store.add_documents(batch)

In [13]:
# Load Vector Store
# Opens the persisted "chroma_db" directory with the same embedding function;
# the retriever cell is built from this handle.
vectorstore = Chroma(embedding_function=embeddings, persist_directory="chroma_db")

In [14]:
# Initialize LLM
from langchain_openai import ChatOpenAI

# Chat model shared by the query rewriter and the answering chain;
# temperature is fixed at 0.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-4o-mini",
)

# Prompt Engineering

In [15]:
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
# Query rewriter (for history-aware retrieval)

CONDENSE_QUESTION_SYSTEM = """\
You are a query rewriting assistant for a RAG chatbot about Rust, Cosmos SDK, CosmWasm, and Osmosis.
Rewrite the user's latest question into a **standalone** search query, incorporating relevant details from the chat history.
Do NOT answer; only output the rewritten query.
"""

# Message order: system instructions, then prior turns, then the new question.
condense_messages = [
    ("system", CONDENSE_QUESTION_SYSTEM),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
]
condense_prompt = ChatPromptTemplate.from_messages(condense_messages)

In [16]:
# Answering prompt (stuff docs chain)
# System message: the assistant's role and its answering rules.
ANSWER_SYSTEM = """\
You are a precise assistant that explains blockchain theory, Rust, Cosmos SDK, and CosmWasm concepts and code.

Rules:
- Use only the provided CONTEXT. If not found, say so.
- For CosmWasm or Rust code, explain step by step with small code excerpts in triple backticks.
- Cite sources (metadata like file path, URL, or source+page) as markdown footnotes.
- If helpful, suggest "Next steps".
"""

# Human message: the {input} question followed by the {context} placeholder.
ANSWER_HUMAN = """\
Question: {input}

CONTEXT:
{context}

(You may also use chat history above.)
"""

# Message order: system rules, then prior turns, then question plus context.
qa_messages = [
    ("system", ANSWER_SYSTEM),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", ANSWER_HUMAN),
]
qa_prompt = ChatPromptTemplate.from_messages(qa_messages)

# Retrieval & Document QA

In [17]:
# Document QA chain (stuff docs)
from langchain_classic.chains.combine_documents import create_stuff_documents_chain

# Combine the chat model with the answering prompt into a single chain.
doc_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

In [18]:
# Retriever
# Options for the "mmr" search type.
# NOTE(review): presumably k is the number of documents returned and fetch_k
# the size of the candidate pool they are chosen from - confirm against the
# LangChain retriever documentation.
mmr_search_kwargs = {"k": 6, "fetch_k": 30}

retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs=mmr_search_kwargs)

In [19]:
# History-aware retriever
from langchain_classic.chains import create_history_aware_retriever

# Combine the chat model, the base retriever and the query-rewriting prompt.
history_aware_retriever = create_history_aware_retriever(llm, retriever, condense_prompt)

In [20]:
# Full RAG chain (retrieval + doc QA)
from langchain_classic.chains import create_retrieval_chain

# Chain the history-aware retriever into the document QA chain.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=doc_chain,
)

# Persistent Memory

In [21]:
from pathlib import Path
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import FileChatMessageHistory

# Persistent chat history

# Make sure the directory holding the per-session JSON files exists.
history_dir = Path("chat_histories")
history_dir.mkdir(parents=True, exist_ok=True)

session_id = "demo-session"
# NOTE(review): this object is not referenced by the other visible cells; the
# wrapper below builds its own history object through the factory function.
chat_history = FileChatMessageHistory(file_path=f"chat_histories/{session_id}.json")


def _history_for_session(session_id):
    """Return the file-backed message history for the given session id."""
    return FileChatMessageHistory(
        file_path=f"chat_histories/{session_id}.json"
    )


# Wrap the RAG chain with the history factory and the keys it reads and writes:
# "input" for the question, "chat_history" for prior turns, "answer" for output.
rag_with_history = RunnableWithMessageHistory(
    rag_chain,
    _history_for_session,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

## Example Usage

In [22]:
# Ask one question in the demo session and print the "answer" entry of the result.
session_config = {"configurable": {"session_id": session_id}}
question = {"input": "Explain CosmWasm instantiate vs execute"}

response1 = rag_with_history.invoke(question, config=session_config)
print(response1["answer"])

In CosmWasm, the concepts of **instantiate** and **execute** are fundamental to how smart contracts operate. Here's a breakdown of each:

### Instantiate

The **instantiate** function is used to create a new instance of a smart contract. This is where the contract is initialized with its state and any necessary parameters. During instantiation, you typically provide an initialization message that sets up the contract's initial state.

In the provided code excerpt, the `proper_instantiate` function demonstrates how to instantiate a contract:

```rust
fn proper_instantiate() -> (App, CwTemplateContract) {
    let mut app = mock_app();
    let cw_template_id = app.store_code(contract_template());

    let msg = InstantiateMsg { count: 1i32 };
    let cw_template_contract_addr = app
        .instantiate_contract(
            cw_template_id,
            Addr::unchecked(ADMIN),
            &msg,
            &[],
            "test",
            None,
        )
        .unwrap();

    let cw_t