In [1]:
import gradio as gr

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams
from transformers import AutoTokenizer, AutoModel
import hashlib
import pymongo
from urllib.parse import urljoin, urlparse

Project structure:

    -gradio.ipynb(gradio app)

    -ETL.ipynb(Data extraction and saving)
    
    -Instructgen-openai.ipynb(use openai API to generate the dataset for finetuning)

    -finetune.ipynb(finetuning using unsloth, runs on colab with another env)

    -mongodb_data.json(raw data for mongo migration)

    -docker-compose.yml docker-compose-clearml.yml docker-compose-UI.yml

Github and huggingface links:

    github repo:
        
    hugging face:
        https://huggingface.co/1312354o/llama-tune(the model)

        https://huggingface.co/datasets/1312354o/llama-ros2 (dataset for fine-tuning, generated by GPT-4o mini)

Other files:

    pyproject.toml
    
    record.mp4(the video is on google drive:https://drive.google.com/file/d/1zO800gpaXqSyWnMJkRD6FwWxSKfpEITt/view?usp=drive_link)


Start the Docker services: `docker-compose -f docker-compose.yml -f docker-compose-clearml.yml -f docker-compose-UI.yml up -d`

The running containers include rag_qdrant, ragmongo, and ollama (where the model runs). The other containers, whose names start with clearml or async_delete, are ClearML infrastructure pulled from the ClearML Docker Hub repository: https://hub.docker.com/r/allegroai/clearml

Inside the ollama container, run `ollama pull hf.co/1312354o/llama-tune`. Ollama is used both to pull the model and to serve it. Open WebUI is included only for debugging and is not used by this app.

![docker page](screenshot-docker.png)

![ollama](ollma.png)

In [11]:
## Before starting the Gradio app: connect to the backing services and load
## the embedding model. Endpoints and model id are named once here.
MONGO_URI = "mongodb://localhost:27018/"
QDRANT_URL = "http://localhost:6333"
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# MongoDB: the collection that `retrive_context` looks documents up in by `_id`.
mongo_client = MongoClient(MONGO_URI)
mongo_db = mongo_client["ros2_database"]
mongo_collection = mongo_db["ros2_documents"]

# Qdrant: client plus the name of the collection queried for the nearest vector.
qdrant_client = QdrantClient(url=QDRANT_URL)
qdrant_collection_name = "ros2_vectors"

# Tokenizer and encoder used by `vectorize_text` to embed questions.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

In [13]:
from transformers import AutoTokenizer, AutoModel

def vectorize_text(text):
    """Embed text with the module-level `tokenizer` / `model` using mean pooling.

    Args:
        text: A string (or a list of strings) handed straight to the tokenizer.

    Returns:
        For a single string, a flat list of floats: the average of the token
        vectors in `last_hidden_state`. For a batch, a list of such lists
        (the result is squeezed before conversion).
    """
    # Local import: this notebook has no top-level `import torch`, and the
    # tokenizer is already asked for PyTorch tensors (return_tensors="pt").
    import torch

    tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    # Inference only: no_grad() avoids building an autograd graph per query.
    with torch.no_grad():
        hidden = model(**tokens).last_hidden_state

    # Average over real tokens only. With a single input and padding=True there
    # is nothing to pad, so this matches a plain mean; for a padded batch it
    # keeps pad positions out of the average.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze().tolist()

In [31]:
# Ollama text-generation endpoint used by default. The chat endpoint is kept
# below, commented out, as an alternative.
api_url = "http://10.255.255.254:11434/api/generate"
##api_url = "http://localhost:11434/api/chat"


def generate_response(question, context, api_url=api_url,
                      model_name="hf.co/1312354o/llama-tune:latest",
                      timeout=(10, 600)):
    """Ask the Ollama HTTP API to answer `question`, using `context` as support.

    Args:
        question: The user's question, inserted into the prompt.
        context: Retrieved text, inserted into the prompt as supplementary
            information.
        api_url: Ollama endpoint to POST to; defaults to the module-level
            `api_url` as it was when this function was defined.
        model_name: Value sent as the "model" field of the request.
        timeout: Passed to `requests.post`. The default gives 10 s to connect
            and 600 s to read, so an unreachable host cannot block the UI
            forever. Pass `None` to wait indefinitely.

    Returns:
        The "response" field of the JSON reply, "no response" when that field
        is absent, or an "ollama fault：..." message when the request fails or
        the body is not valid JSON. This function does not raise for HTTP or
        network errors.
    """
    # Build the request payload.
    prompt = f"Suppose you are an assistant,generate answer to the question and use the context as supplementary information：\ncontext：{context}\nquestion：{question}"
    payload = {
        "model": model_name,
        "prompt": prompt,
        "stream": False,  # ask for one complete JSON reply, not a stream
    }

    try:
        # Send the request and fail on HTTP error status codes.
        response = requests.post(api_url, json=payload, timeout=timeout)
        response.raise_for_status()
        result = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers invalid-JSON bodies on `requests` versions whose
        # JSON decode error is not a RequestException.
        return f"ollama fault：{e}"

    if not isinstance(result, dict):
        return "no response"
    return result.get("response", "no response")
    
def retrive_context(question):
    """Fetch the stored document whose vector is closest to `question`.

    The question is embedded with `vectorize_text`, the single nearest point is
    requested from Qdrant, and the matching document is read from MongoDB.

    Args:
        question: The user's question.

    Returns:
        The `content` field of the matching MongoDB document. If anything goes
        wrong (search failure, no hits, document missing), a fixed fallback
        sentence is returned instead so the caller can still build a prompt.
        This function does not raise; failures are logged.
    """
    import logging

    fallback = "Context retrive error, please ignore this context when generating"
    try:
        vectorized_question = vectorize_text(question)
        hits = qdrant_client.search(
            collection_name=qdrant_collection_name,
            query_vector=vectorized_question,
            limit=1,
        )
        if not hits:
            logging.warning("retrive_context: vector search returned no hits")
            return fallback

        # The Mongo `_id` is looked up as the Qdrant point id with dashes
        # removed. NOTE(review): presumably the ETL step stores ids as
        # dash-free hex strings; confirm against ETL.ipynb.
        document_id = str(hits[0].id).replace("-", "")
        document = mongo_collection.find_one({"_id": document_id})
        if document is None or "content" not in document:
            logging.warning(
                "retrive_context: no usable document for id %r", document_id
            )
            return fallback
        return document["content"]
    except Exception:
        # Best effort by design: keep the UI responsive, but leave a trace
        # instead of hiding the failure.
        logging.exception("retrive_context: retrieval failed")
        return fallback




def rag_pipeline(question):
    """Retrieve context for `question`, then generate an answer with it.

    Returns:
        A `(context, response)` tuple: the text from `retrive_context` and the
        text from `generate_response`, in that order.
    """
    retrieved = retrive_context(question)
    return retrieved, generate_response(question, retrieved)

# Gradio front end

# Example questions offered in the UI dropdown.
preset_questions = [
    "Introduce ROS2",
    "How to install ROS2",
    "How to create a ROS2 workspace",
]

def populate_input(selected_option, manual_input):
    """Choose the question to ask: typed text wins over the dropdown choice.

    Args:
        selected_option: The value picked in the dropdown (may be None).
        manual_input: The text typed by the user (may be None or empty).

    Returns:
        `manual_input` with surrounding whitespace removed when it contains
        any non-whitespace text; otherwise `selected_option`.
    """
    # Blank or whitespace-only text counts as "nothing typed", so the
    # dropdown selection is used instead of sending an empty question.
    if isinstance(manual_input, str):
        manual_input = manual_input.strip()
    return manual_input or selected_option

# Gradio Blocks UI: pick a preset question or type one, then show the
# retrieved context and the generated answer.
def _answer_question(question):
    """Run the RAG pipeline for a non-blank question; otherwise clear both outputs."""
    if not question or not question.strip():
        return "", ""
    return rag_pipeline(question)


with gr.Blocks() as demo:
    gr.Markdown("### RAG-based QA System with Dropdown and Manual Input")

    # Question sources: dropdown of presets and a free-text box.
    dropdown = gr.Dropdown(choices=preset_questions, label="pre-defined questions", value=None)
    manual_input = gr.Textbox(label="manual input", placeholder="input")

    # Submit button.
    submit_btn = gr.Button("submit")

    # Read-only box showing the question that is actually sent.
    final_question = gr.Textbox(label="question", interactive=False)

    # RAG pipeline outputs.
    augment_output = gr.Textbox(label="Context")
    answer_output = gr.Textbox(label="Answer")

    # On click: first resolve the question, then run the pipeline on it.
    # Chaining with .then() (rather than listening to final_question.change)
    # means submitting the same question again, e.g. to retry after an Ollama
    # error, still re-runs the pipeline.
    submit_btn.click(
        populate_input,
        inputs=[dropdown, manual_input],
        outputs=final_question,
    ).then(
        fn=_answer_question,
        inputs=final_question,
        outputs=[augment_output, answer_output],
    )

# launch
demo.launch()




* Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




Conclusion and potential further improvements:

To be honest, my RAG system achieves little improvement over the original Llama agent. This is mainly because the data-crawling pipeline needs more fine-grained data processing and data from a wider range of sources. The RAG system from the book (the LLM handbook) aims at the simpler goal of answering questions, while this RAG system aims at the much more complex goal of producing useful code. Clearly, more data and a larger model are needed to handle this.

If this system is improved in the future, the first and most important step is to modify and extend the data-crawling pipeline to collect data from more sources. A larger model with more parameters is also needed as the new baseline. Since even a small model already uses all of my local memory, a cloud deployment would be required.

This is the proof-of-concept RAG system.