In [None]:
# |default_exp local_rag_ds_r1

Please refer to [this blog post](https://nbdev.fast.ai/blog/posts/2022-11-07-spaces) for how to use this notebook.

## Install dependencies

## Make an app with Gradio

In [1]:
# |export
import gradio as gr
from dotenv import load_dotenv
import os
import sys
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#| export
import ollama
import re
from concurrent.futures import ThreadPoolExecutor
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import OllamaEmbeddings
from chromadb.config import Settings
from chromadb import Client
# from langchain.vectorstores import Chroma
from langchain_chroma import Chroma
import getpass

In [3]:
# |export
# Load variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Use the local proxy only as a fallback: setdefault leaves untouched any
# proxy already configured by the shell or by the .env file loaded above,
# whereas a plain assignment would silently overwrite it.
os.environ.setdefault('HTTP_PROXY', 'http://127.0.0.1:20171')
os.environ.setdefault('HTTPS_PROXY', 'http://127.0.0.1:20171')
# Hosts that must bypass the proxy. No space after the comma, because not
# every HTTP client strips whitespace from NO_PROXY entries.
os.environ.setdefault('NO_PROXY', 'localhost,127.0.0.1')

# Show the effective proxy settings for this session.
print(os.environ.get('HTTP_PROXY'))
print(os.environ.get('HTTPS_PROXY'))


http://127.0.0.1:20171
http://127.0.0.1:20171


In [5]:
#| export
# Source PDF for the document-QA part of the notebook.
# Alternative input kept for reference: "../res/employee_manual.pdf"
file_path = "../res/foundatiuons_of_llm_zhu.pdf"
# PyPDFLoader.load() returns a list of documents; the printed metadata carries
# 'page' / 'total_pages' keys, so there appears to be one document per page.
loader = PyPDFLoader(file_path)
documents = loader.load()
# Sanity checks: number of loaded documents and the metadata of the first one.
print(len(documents))
# print(documents[0].page_content[0:1000])
print(documents[0].metadata)

231
{'producer': 'GPL Ghostscript 10.01.2', 'creator': 'LaTeX with hyperref', 'creationdate': '2025-01-16T20:13:48-05:00', 'moddate': '2025-01-16T20:13:48-05:00', 'title': '', 'subject': '', 'author': '', 'keywords': '', 'source': '../res/foundatiuons_of_llm_zhu.pdf', 'total_pages': 231, 'page': 0, 'page_label': '1'}


In [6]:
#| export
# Split the loaded documents into chunks of up to 1000 characters with a
# 200-character overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

In [7]:
#| export
# Ollama embeddings backed by the DeepSeek-R1 14B model. The same object is
# passed to the Chroma retriever later so queries are embedded the same way.
embedding_function = OllamaEmbeddings(model="deepseek-r1:14b")


def generate_embeddings(chunk):
    """Return the embedding vector for a single chunk.

    Args:
        chunk: A document chunk exposing ``page_content``; a plain string is
            accepted as well.

    Returns:
        The embedding produced by ``embedding_function`` for the chunk text.
    """
    # embed_documents expects a list of strings. Passing the Document object
    # itself would embed its string representation (metadata included)
    # instead of the text that is stored in the collection.
    text = getattr(chunk, "page_content", chunk)
    return embedding_function.embed_documents([text])[0]


# Embed the chunks concurrently; executor.map preserves the input order, so
# embeddings[i] corresponds to chunks[i].
with ThreadPoolExecutor() as executor:
    embeddings = list(executor.map(generate_embeddings, chunks))


  embedding_function = OllamaEmbeddings(model="deepseek-r1:14b")


In [8]:
#| export
# Initialize Chroma client and create/reset the collection
# Chroma client with default settings.
client = Client(Settings())
# get_or_create_collection + upsert keep this cell re-runnable in the same
# kernel: create_collection fails once the collection exists, and upsert
# overwrites an existing id instead of rejecting it.
collection = client.get_or_create_collection(name="foundations_of_llm")
# Store each chunk's text together with its precomputed embedding; the
# chunk's position is used both as the record id and as metadata.
for idx, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
    collection.upsert(
        ids=[str(idx)],
        documents=[chunk.page_content],
        metadatas=[{'id': idx}],
        embeddings=[embedding],
    )

In [9]:
#| export
# Initialize retriever using Ollama embeddings for queries
# Wrap the already-populated "foundations_of_llm" collection (same client) as
# a LangChain retriever; embedding_function is supplied for embedding queries.
retriever = Chroma(collection_name="foundations_of_llm",client=client,embedding_function=embedding_function).as_retriever()

In [11]:
#| export
def retrieve_context(question):
    """Build the context string for ``question``.

    Queries the module-level ``retriever`` and joins the ``page_content`` of
    every returned document, separated by blank lines.
    """
    matched_docs = retriever.invoke(question)
    passages = [matched.page_content for matched in matched_docs]
    return '\n\n'.join(passages)


In [12]:
#| export
# Query DeepSeek-R1 model for contextual answers
def query_deepseek(question, context):
    """Send ``question`` plus ``context`` to DeepSeek-R1 and return the reply.

    The question and context are combined into a single user message for
    ``ollama.chat``. Any ``<think>...</think>`` span in the reply is removed
    and surrounding whitespace is stripped before returning.
    """
    user_message = {
        'role': 'user',
        'content': f"Question: {question}\n\nContext: {context}",
    }
    reply = ollama.chat(model="deepseek-r1:14b", messages=[user_message])
    raw_text = reply['message']['content']
    # DOTALL lets '.' match newlines, so multi-line <think> spans are removed.
    visible_text = re.sub(r'<think>.*?</think>', '', raw_text, flags=re.DOTALL)
    return visible_text.strip()

In [13]:
#| export
def ask_question(question):
    """Answer ``question`` by retrieving context and querying DeepSeek-R1.

    Returns the string produced by ``query_deepseek`` for the question and
    the context obtained from ``retrieve_context``.
    """
    return query_deepseek(question, retrieve_context(question))

In [14]:
# Smoke test of the retrieval + generation pipeline (not exported).
ask_question("What is the main idea of the document?")

"### 1. Main Idea of the Document:  \nThe main idea of the document is about **efficient variants of BERT models** and methods to improve their performance while reducing computational costs. It discusses techniques like knowledge distillation, parameter-efficient fine-tuning (e.g., prefix tuning), and learning soft prompts for adapting large language models (LLMs) to various tasks.\n\n---\n\n### 2. Extracted Financial Figures:  \n- **Revenue**: $10 million  \n- **Profit Margin**: 15%  \n\n---\n\n### 3. Spam Classification:  \nThe email is classified as **spam** because it contains a common phishing/scheme pattern, such as announcing an unexpected prize (a gift card) and prompting the recipient to click on a link to claim it.\n\n---\n\n### 4. Technical Issue Solution:  \nTo address the issue of a slow computer that often freezes, follow these steps:\n\n1. **Close Unnecessary Background Processes**:  \n   - Restart your computer and avoid running too many applications at once. Use Task 

In [23]:

# Minimal text-in / text-out Gradio UI around ask_question (not exported).
interface = gr.Interface(
    fn=ask_question,
    inputs="text", #gr.Textbox(label="Ask a question about the document"),
    outputs="text", #gr.Textbox(label="Answer"),
    title="RAG with DeepSeek-R1: Foundations of LLM",
    description="RAG with DeepSeek-R1: Foundations of LLM",
)
# NOTE(review): server_name="0.0.0.0" presumably makes the app reachable from
# other machines on the network - confirm this is intended for a local demo.
interface.launch(server_name="0.0.0.0", server_port=7860, share=False)

* Running on local URL:  http://0.0.0.0:7860

To create a public link, set `share=True` in `launch()`.




In [24]:
# Shut the demo server down so the port is free for later launches.
interface.close()

Closing server running on port: 7860


In [None]:
#| export
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatOllama

# Chat model shared by the chain below and by the graph / chat handler further
# down. No other cell of this notebook defines `llm`, so on a fresh kernel the
# code using it would fail with a NameError. The model matches query_deepseek.
llm = ChatOllama(model="deepseek-r1:14b")

# System message for the QA chain; "{context}" is the slot the
# stuff-documents chain fills with the retrieved documents.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

# retriever -> stuff the retrieved documents into the prompt -> llm
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Example query; the answer text is available as results['answer'].
results = rag_chain.invoke({"input": "How does the distilled model perform in the evaluation comparing to other language models?"})

{'input': 'How does the distilled model perform in the evaluation comparing to other language models?', 'context': [Document(id='af74dba9-f877-492a-b88a-0c5d89dea874', metadata={'author': '', 'creationdate': '2025-01-23T01:45:31+00:00', 'creator': 'LaTeX with hyperref', 'keywords': '', 'moddate': '2025-01-23T01:45:31+00:00', 'page': 13, 'page_label': '14', 'producer': 'pdfTeX-1.40.25', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'source': '../res/DeepSeek_R1.pdf', 'subject': '', 'title': '', 'total_pages': 22, 'trapped': '/False'}, page_content='R1-7B (i.e., DeepSeek-R1-Distill-Qwen-7B, abbreviated similarly below) to outperform non-\nreasoning models like GPT-4o-0513 across the board. DeepSeek-R1-14B surpasses QwQ-32B-\nPreview on all evaluation metrics, while DeepSeek-R1-32B and DeepSeek-R1-70B significantly\nexceed o1-mini on most benchmarks. These results demonstrate the strong potential of distilla-\ntion. Additional

In [None]:
# Show only the generated answer from the chain result.
print(results['answer'])

The distilled model, specifically DeepSeek-R1-Distill-Qwen-32B, significantly outperforms the benchmarks compared to other language models like QwQ-32B-Preview on reasoning-related benchmarks. It achieves higher pass rates on benchmarks such as AIME 2024 and MATH-500. Overall, distillation has proven to be a more effective and economical strategy compared to relying solely on large-scale reinforcement learning.


In [None]:
#| export
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
import validators

In [None]:
#| export
# Load and chunk contents of the blog
# Load the blog post, keeping only the elements whose CSS class marks the
# post title, header or body.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

# Vector store for web content. It has to be named `vector_store` because
# that is the name `retrieve` and `answer` look up; the previous code indexed
# into an undefined `vectorstore`. It uses a separate collection from the PDF
# one so the two sources do not mix.
vector_store = Chroma(
    collection_name="web_pages",
    client=client,
    embedding_function=embedding_function,
)

# Index chunks
_ = vector_store.add_documents(documents=all_splits)

# Prompt for question answering. This rebinds `prompt`; `generate` invokes it
# with "question" and "context" keys.
prompt = hub.pull("rlm/rag-prompt")

NameError: name 'WebBaseLoader' is not defined

In [None]:
#| export
# Define state for application
class State(TypedDict):
    """State dictionary passed between the graph steps."""
    # Question to answer; read by both `retrieve` and `generate`.
    question: str
    # Documents written by `retrieve` and consumed by `generate`.
    context: List[Document]
    # Final answer text written by `generate`.
    answer: str


# Define application steps
def retrieve(state: State):
    """Graph step: search ``vector_store`` for the question in ``state``.

    Returns a partial state update holding the matching documents under
    ``"context"``.
    """
    question = state["question"]
    return {"context": vector_store.similarity_search(question)}


def generate(state: State):
    """Graph step: produce the answer from the question and retrieved context.

    The text of the context documents is joined with blank lines, rendered
    through the module-level ``prompt`` and sent to ``llm``. Returns a partial
    state update holding the reply's ``content`` under ``"answer"``.
    """
    joined_context = "\n\n".join([item.page_content for item in state["context"]])
    prompt_value = prompt.invoke({"question": state["question"], "context": joined_context})
    reply = llm.invoke(prompt_value)
    return {"answer": reply.content}


# Compile application and test
# Wire the two steps in order (retrieve, then generate), make "retrieve" the
# entry point, and compile the runnable graph.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [None]:
# Smoke test of the compiled graph (not exported).
response = graph.invoke({"question": "What is Task Decomposition?"})
print(response["answer"])

Task decomposition is the process of breaking down a complex task into smaller, manageable steps or subgoals. This can be achieved through techniques like Chain of Thought (CoT), which encourages the model to think step-by-step, or by creating a Tree of Thoughts that explores multiple reasoning possibilities for each step. It can be initiated through simple prompts, specific instructions, or human inputs.


In [None]:
#| export
def answer(message, history, system_prompt, tokens):
    """Stream a RAG answer about a web page or an uploaded PDF.

    Args:
        message: Multimodal chat message, a dict whose ``"text"`` entry is
            checked for a URL and whose ``"files"`` entry lists uploaded
            file paths.
        history: Earlier chat messages. User entries whose ``content`` is a
            tuple are treated as uploads, the first element being the path.
        system_prompt: Despite its name, this carries the user's question
            (the first additional input of the chat UI).
        tokens: Upper bound on the number of characters streamed back.

    Yields:
        Growing prefixes of the answer, one more character per step.
    """
    # Keep the question under its own name so the prompt template built in the
    # PDF branch cannot overwrite it before it is sent to the chain.
    question = system_prompt
    text = (message.get("text") or "").strip()

    # Uploads from earlier turns first, then those attached to this message,
    # so the last element is the most recent upload.
    files = [
        msg["content"][0]
        for msg in history
        if msg["role"] == "user" and isinstance(msg["content"], tuple)
    ]
    files.extend(message.get("files") or [])

    if text and validators.url(text):
        loader = WebBaseLoader(
            web_paths=(text,),
            bs_kwargs=dict(
                parse_only=bs4.SoupStrainer(
                    class_=("post-content", "post-title", "post-header")
                )
            ),
        )
        docs = loader.load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        all_splits = text_splitter.split_documents(docs)
        # The class filter above may match nothing on an arbitrary page;
        # skip indexing rather than pass an empty batch to the store.
        if all_splits:
            vector_store.add_documents(documents=all_splits)
        # The compiled graph retrieves from the same module-level vector_store.
        response_text = graph.invoke({"question": question})["answer"]
    elif files:
        docs = PyPDFLoader(files[-1]).load()
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_documents(docs)
        # Use the notebook's own embedding function; OpenAIEmbeddings is never
        # imported or configured in this notebook.
        pdf_store = Chroma.from_documents(documents=splits, embedding=embedding_function)
        pdf_retriever = pdf_store.as_retriever()

        qa_system_prompt = (
            "You are an assistant for question-answering tasks. "
            "Use the following pieces of retrieved context to answer "
            "the question. If you don't know the answer, say that you "
            "don't know. Use three sentences maximum and keep the "
            "answer concise."
            "\n\n"
            "{context}"
        )
        qa_prompt = ChatPromptTemplate.from_messages(
            [
                ("system", qa_system_prompt),
                ("human", "{input}"),
            ]
        )
        qa_chain = create_retrieval_chain(
            pdf_retriever, create_stuff_documents_chain(llm, qa_prompt)
        )
        response_text = qa_chain.invoke({"input": question})["answer"]
    else:
        # Neither a valid URL nor a file: reply with a hint instead of failing
        # on an unbound response variable.
        response_text = "请输入有效的网址，或上传一个 PDF 文件。"

    # Simulated streaming: reveal one more character per step, capped at
    # `tokens` characters.
    limit = min(len(response_text), int(tokens))
    for end in range(1, limit + 1):
        time.sleep(0.05)
        yield response_text[:end]

In [None]:
# |export
# Chat UI around `answer`. The two additional inputs are passed to it, in
# order, as its `system_prompt` (the question) and `tokens` (answer length)
# arguments.
demo = gr.ChatInterface(
    answer,
    type="messages",
    title="智能问答RAG",
    description="输入一个网址，查询或询问其中的内容。",
    textbox=gr.MultimodalTextbox(
        value="https://lilianweng.github.io/posts/2023-06-23-agent/",
        file_count="multiple",
        # `answer` loads every upload with PyPDFLoader, so only PDF uploads
        # are offered; images, text files and microphone recordings could
        # not be processed.
        file_types=[".pdf"],
        sources=["upload"],
    ),
    additional_inputs=[
        gr.Textbox("What is Task Decomposition?", label="你的问题在此输入！"),
        gr.Slider(10,400,value=300,label="回答长度")
    ],
    multimodal=True,
)
demo.launch(share=False)

* Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.




In [None]:
# Stop the chat demo server. This is only necessary in a notebook and must
# run after the cell that creates `demo`.
demo.close()

NameError: name 'demo' is not defined

## Create a `requirements.txt` file

In [None]:
%%writefile ../requirements.txt
# Third-party packages imported by the exported app.
fastcore
gradio
python-dotenv
ollama
langchain
langchain-community
langchain-chroma
langchain-text-splitters
langgraph
chromadb
pypdf
beautifulsoup4
validators
typing_extensions

Writing ../requirements.txt


## Convert this notebook into a Gradio app

In [None]:
# from nbdev.export import nb_export
# nb_export('01_gradio.ipynb', lib_path='.', name='gradio')

In [None]:
# | hide
# Export the cells marked for export to the Python module; presumably the
# target is the module named by the `default_exp` directive at the top.
import nbdev

nbdev.nbdev_export()

<div>
<link rel="stylesheet" href="https://gradio.s3-us-west-2.amazonaws.com/2.6.5/static/bundle.css">
<div id="target"></div>
<script src="https://gradio.s3-us-west-2.amazonaws.com/2.6.5/static/bundle.js"></script>
<script>
launchGradioFromSpaces("abidlabs/question-answering", "#target")
</script>
</div>