In [None]:
# |default_exp rag

## Install dependencies

## Make an app with Gradio

In [None]:
# |export
import csv
import os
import re
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import chromadb
import chromadb.utils.embedding_functions as embedding_functions
import gradio as gr
import ollama
import openai
from chromadb import Client
from chromadb.config import Settings
from dotenv import load_dotenv
from fastcore.net import urljson, HTTPError
from google import genai
from google.genai import types
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_ollama import OllamaEmbeddings
from openai import api_key
from openai import OpenAI

In [None]:
# |export
import textwrap
import numpy as np
from datetime import datetime
import pandas as pd
from chromadb import Documents, EmbeddingFunction, Embeddings
from IPython.display import Markdown
import langdetect
# import chromadb.utils.embedding_functions as embedding_functions
# from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

In [None]:
load_dotenv()
# deepseek_key = os.getenv('DEEPSEEK_R1_bAPI_KEY')
gemini_key = os.getenv('GEMINI_API_KEY')

In [None]:
print(os.environ.get('HTTPS_PROXY'))

In [None]:
import requests
# Connectivity smoke test. The timeout keeps the cell from blocking the kernel
# indefinitely when the network or proxy does not answer.
requests.get("https://google.com", timeout=10)

In [None]:
client = genai.Client(api_key=gemini_key,
                      http_options={'api_version': 'v1beta'})

In [None]:
# Print the name of every model that supports the `embedContent` action.
all_models = client.models.list()
# Iterate the pager itself rather than `.page`, which only holds the current
# page of results, so models on later pages are not missed.
for m in all_models:
    # `supported_actions` can be unset; treat that as "no actions".
    if 'embedContent' in (m.supported_actions or []):
        print(m.name)

In [None]:
#| export
class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that embeds documents with the Gemini API.

    Reads the module-level ``client`` at call time and uses its
    ``models.embed_content`` method.

    NOTE(review): a later cell rebinds ``client`` to a
    ``chromadb.PersistentClient``; calls made after that cell has run will no
    longer reach the Gemini client.
    """

    def __call__(self, input: Documents) -> Embeddings:
        """Embed every document in ``input``.

        Args:
            input: The documents (strings) to embed.

        Returns:
            One embedding vector per embedding returned by the API.
        """
        model = "models/text-embedding-004"
        # Alternatives tried previously:
        # model = "models/gemini-embedding-exp-03-07"
        # model = "models/text-embedding-001"
        # model = "text-multilingual-embedding-002"
        result = client.models.embed_content(
            model=model,
            contents=input,
            config=types.EmbedContentConfig(task_type='SEMANTIC_SIMILARITY'),
            # config=types.EmbedContentConfig(task_type='RETRIEVAL_DOCUMENT'),
        )
        # Return every vector, not just the first: returning only
        # ``embeddings[0].values`` dropped all but the first document of a batch.
        return [embedding.values for embedding in result.embeddings]

In [None]:
'p'+str(1)

In [None]:
#| export
def create_chroma_db(documents, name, language='en'):
    """Create (or reopen) a persistent Chroma collection and add documents to it.

    Args:
        documents: Iterable of objects exposing ``page_content`` and ``metadata``.
        name: Collection name; also stored as the collection description.
        language: When ``'en'`` the collection is created with
            ``GeminiEmbeddingFunction``; otherwise no embedding function is
            passed and Chroma's default is used.

    Returns:
        The collection. Documents are added one at a time with ids ``"1"``,
        ``"2"``, ...; a failure on one document is printed and the loop continues.
    """
    store = chromadb.PersistentClient(path="../db")

    collection_kwargs = {
        "name": name,
        "metadata": {
            "description": name,
            "created_by": "binjian",
            "created": str(datetime.now()),
        },
    }
    if language == 'en':
        collection_kwargs["embedding_function"] = GeminiEmbeddingFunction()
    coll = store.get_or_create_collection(**collection_kwargs)

    for position, doc in enumerate(documents, start=1):
        try:
            coll.add(
                documents=doc.page_content,
                metadatas=doc.metadata,
                ids=str(position),
            )
            print(f"Added document {position}")
        except Exception as err:
            # Best effort: report the failing document and keep going.
            print(f"{position},{err}")

    return coll

In [None]:
#| export
# Load the PDF: `documents` holds the loader output, `docs` only the text of each entry.
loader = PyMuPDFLoader("../res/DeepSeek_R1.pdf")
# loader = PyMuPDFLoader("../res/employee_manual.pdf")
documents = loader.load()
docs = [d.page_content for d in documents]

In [None]:
#| export
def select_embedding_model(input_text):
    """Detect the language of ``input_text`` with ``langdetect``.

    Despite the name, the return value is what ``langdetect.detect`` reports
    (printed as a side effect), not a model; it is ``None`` when detection
    raises ``LangDetectException``.
    """
    language = None
    try:
        language = langdetect.detect(input_text)
        print(language)
    except langdetect.LangDetectException:
        print("Language detection failed. Please use default model!")
    return language

In [None]:
lang = select_embedding_model(docs[-1])

In [None]:
client

In [None]:
# Embed the first loaded document, choosing the model from the detected language.
result = client.models.embed_content(
    model=("models/gemini-embedding-exp-03-07" if lang == 'en' else "models/embedding-001"),
    contents=documents[0].page_content,
    config=types.EmbedContentConfig(task_type='SEMANTIC_SIMILARITY'),
)
if lang != 'en':
    print("select Multilingual")



In [None]:
result.embeddings[0].values

In [None]:
lang

In [None]:
#|export
# db = create_chroma_db(documents, "employee_manual", language=lang)
db = create_chroma_db(documents, "deepseek_r1")
# chroma_client = chromadb.PersistentClient(path="../db")
# db = chroma_client.get_or_create_collection('deepseek_r1')


In [None]:

chroma_client = chromadb.PersistentClient(path="../db")
# chroma_client.delete_collection(name='deepseek_r1')
chroma_client.list_collections()


In [None]:
# chroma_client = chromadb.Client()
recs = db.peek(5)
# df = pd.DataFrame(recs)

In [None]:
def get_relevant_passage(query, db):
    """Return the text of the top hit for ``query`` in collection ``db``.

    Queries with a single query text and ``n_results=1``, then takes the first
    document of the first result list.
    """
    hits = db.query(query_texts=[query], n_results=1)
    documents_per_query = hits['documents']
    return documents_per_query[0][0]

In [None]:
# Perform embedding search
passage = get_relevant_passage("How does the distilled models perform in evaluation comparing to other models?", db)
# passage = get_relevant_passage("休假规定", db)
Markdown(passage)

In [None]:
def make_prompt(query, relevant_passage):
    """Build the answer prompt for ``query`` from ``relevant_passage``.

    Single and double quotes are removed from the passage and newlines are
    replaced by spaces before it is inserted into the template.
    """
    # One pass: drop both quote characters and flatten newlines to spaces.
    cleanup = str.maketrans({"'": None, '"': None, "\n": " "})
    flattened = relevant_passage.translate(cleanup)
    template = """You are a helpful and informative bot that answers questions using text from the reference passage included below. \
    Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
    However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
    strike a friendly and converstional tone. \
    If the passage is irrelevant to the answer, you may ignore it. \
    Please answer in Chinese.
    QUESTION: '{query}'
    PASSAGE: '{relevant_passage}'

        ANSWER:
    """
    return template.format(query=query, relevant_passage=flattened)

In [None]:
# query ="How does the distilled models perform in evaluation comparing to other models?"
query ="这里的蒸馏模型在评估中的表现和其他模型比较的结果如何?"
prompt = make_prompt(query, passage)
Markdown(prompt)

In [None]:
# client = genai.Client(api_key=gemini_key)
# response = client.models.generate_content(model="gemini-2.0-flash", contents=prompt)
response = client.models.generate_content(model="gemini-2.5-pro-exp-03-25", contents=prompt)
Markdown(response.text)

In [None]:
Markdown(response.text)

In [None]:
# Split the document into smaller chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

In [None]:
# embedding = client.embeddings.create(
#         model="deepseek/deepseek-r1:free",
#         input='Your text string goes here'
#     )
# embedding

In [None]:
# Parallelize embedding generation
def generate_embedding(chunk):
    """Embed the text of one chunk via ``google_ef.embed_query``.

    NOTE(review): ``google_ef`` is only assigned in a later cell, so on a fresh
    kernel this fails unless that cell has been run first.
    """
    text = chunk.page_content
    return google_ef.embed_query(text)

# Embed all chunks on a thread pool and collect the results into a list.
with ThreadPoolExecutor() as executor:
    embeddings = [vector for vector in executor.map(generate_embedding, chunks)]

In [None]:

# NOTE(review): `client` was bound to the Gemini `genai.Client` in an earlier cell,
# while `chat.completions.create` with `extra_headers`/`extra_body` is the OpenAI SDK
# call shape (the headers target openrouter.ai) — confirm which client this cell expects.
completion = client.chat.completions.create(
  extra_headers={
    "HTTP-Referer": "binjian.github.io", # Optional. Site URL for rankings on openrouter.ai.
    "X-Title": "binjian's digital garden", # Optional. Site title for rankings on openrouter.ai.
  },
  extra_body={},
  model="deepseek/deepseek-r1:free",
  messages=[
    # One dict per message. A single dict with repeated "role"/"content" keys keeps
    # only the last pair, which silently dropped the system message.
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What's Anthropic's Model context protocol?"},
  ]
)
print(completion.choices[0].message.content)

In [None]:
# Chroma embedding function built from the Gemini API key; other cells reference it
# as `google_ef` (the chunk-embedding helper and the Q&A collections).
google_ef = embedding_functions.GoogleGenerativeAiEmbeddingFunction(api_key=gemini_key)

In [None]:
def convert_qa_to_csv(input_file, output_file):
    """
    Convert a text file with Q/A format to a CSV file.

    The input is split on the marker ``Q`` followed by a newline. Text before
    the first marker is discarded. Within each block the first line is the
    question and all remaining lines form the answer. Blocks that are empty
    after stripping (e.g. two consecutive markers) are skipped instead of being
    written as blank rows.

    Args:
        input_file: Path to the input text file
        output_file: Path to the output CSV file
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # The piece before the first marker is not a Q&A block (it is the empty
    # string when the file starts with the marker).
    raw_blocks = content.split('Q\n')[1:]

    qa_pairs = []
    for raw in raw_blocks:
        block = raw.strip()
        if not block:
            # ''.split('\n') is [''], which is truthy, so the old `if lines:`
            # guard never filtered empty blocks.
            continue
        # First line is the question, everything after it is the answer.
        question, _, answer = block.partition('\n')
        qa_pairs.append([question, answer])

    # newline='' lets the csv module control line endings, so multi-line
    # answers survive a round trip.
    with open(output_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Question', 'Answer'])
        writer.writerows(qa_pairs)
    print(f"Conversion complete. CSV file saved to {output_file}")

In [None]:

# Convert each Q/A text file into the CSV file at the same position in `output_files`.
input_files = ["../res/qa_service.txt", "../res/qa_technology.txt"]
output_files = ["../res/qa_service.csv", "../res/qa_technology.csv"]
for in_f, ot_f in zip(input_files, output_files):
    convert_qa_to_csv(in_f, ot_f)
    

In [None]:
# |export
# NOTE(review): this rebinds `client`, which an earlier cell bound to the Gemini
# `genai.Client`. `GeminiEmbeddingFunction.__call__` reads the global `client`, so it
# cannot reach the Gemini API once this cell has run — consider a distinct name.
client = chromadb.PersistentClient(path="../vdb")
# collections = [client.get_or_create_collection(name="siasun_qa_service",embedding_function=deepseek_ef),
#                 client.get_or_create_collection(name="siasun_qa_technology",embedding_function=deepseek_ef)]
# One collection per Q&A source; the list order matches `output_files` (service, technology).
collections = [client.get_or_create_collection(name="siasun_qa_service", embedding_function=google_ef),
                client.get_or_create_collection(name="siasun_qa_technology", embedding_function=google_ef)]

In [None]:
i=2
f'q{i}'

In [None]:
# |export
# Index every Q&A row: the question and the answer become two documents whose ids
# share the row number (`<collection>_q<i>` / `<collection>_a<i>`).
for csv_file,collection in zip(output_files,collections):
    # Read with the same UTF-8 encoding `convert_qa_to_csv` writes with; the platform
    # default encoding may differ.
    with open(csv_file, newline='', encoding='utf-8') as f:
        reader = csv.reader(f)
        # Skip the 'Question','Answer' header row so it is not indexed as a Q&A pair.
        next(reader, None)
        # start=1 keeps the ids of the data rows the same as when the header held index 0.
        for i, row in enumerate(reader, start=1):
            collection.add(
                documents = row,
                metadatas = [{"source": "question"}, {"source": "answer"}],
                ids = [f"{collection.name}_q{i}", f"{collection.name}_a{i}"]
            )

In [None]:
#|export
queries=["你们的产品需要多久维护一次?","我怎么设置机器人的安全工作区域?"]
queries[0]

In [None]:
results = collections[0].query(
    query_texts=queries,
    n_results=4
)

In [None]:
results

In [None]:
# results['metadatas'][0] #[0]['source']
results['documents'][0][0]

In [None]:
colls = client.list_collections()
colls[0]

In [None]:
queries1 = ['你们的产品需要多久维护一次?','宏作业有什么用?']

In [None]:

results = collections[0].query(
    query_texts=queries1,
    n_results=4
)
results


In [None]:

# question = "你们的产品需要多久维护一次?"
# question = "你们在售前评估上，如何帮助到我们?"
# Exploratory version of the lookup done in `qa`: for each collection, keep the hits
# of the first query whose metadata marks them as questions.
answers = []
for collection in collections:
    results = collection.query(
        query_texts=queries,
        n_results=4
    )
    docs = []
    # Index 0 selects the result list of the first query text only.
    for i,metadata in enumerate(results['metadatas'][0]):
        if metadata['source'] == 'question':
            docs.append({'id': results['ids'][0][i],
                         'document': results['documents'][0][i],
                         'distance':results['distances'][0][i]})
    df = pd.DataFrame(docs)
    answers.append(df)
# One frame holding the question hits of all collections.
df_answers = pd.concat(answers, axis=0,ignore_index=True)
# df_answers = pd.stack(answers, axis=2)

In [None]:
df_answers.loc[df_answers['distance'].idxmin()]

In [None]:

id_q = df_answers.loc[df_answers['distance'].idxmin()]['id']
id_q

In [None]:

id_a_list = id_q.split('_')
id_a_list[-1] = id_a_list[-1].replace('q','a')
id_a_list

In [None]:
id_a = '_'.join(id_a_list)
id_a

In [None]:
coll_idx = 0 if id_a_list[-2] == 'service' else 1
coll_idx

In [None]:
answer = collections[coll_idx].get(id_a)
answer['documents']

In [None]:
best_answer = df_answers.loc[df_answers['distance'].idxmin()]

In [None]:
def qa(questions:list[str], collections:list[chromadb.Collection]=collections):
    """Answer a question by returning the stored answer paired with the closest stored question.

    Each collection is queried for its 4 nearest documents. Among the hits of
    the first query text whose metadata ``source`` is ``'question'``, the one
    with the smallest distance wins, and the answer stored under the matching
    ``..._a<n>`` id is fetched from the same collection.

    Args:
        questions: The query text(s) handed to ``collection.query``. Callers in
            this notebook pass a single string. Only the results for the first
            query text are used.
        collections: Collections to search. Ids are expected to follow the
            ``<collection name>_q<n>`` / ``<collection name>_a<n>`` scheme.

    Returns:
        The answer text of the best-matching question.

    Raises:
        KeyError: If no collection returned a hit tagged as a question.
    """
    # (distance, question id, collection that produced the hit)
    candidates = []
    for collection in collections:
        results = collection.query(
            query_texts=questions,
            n_results=4
        )
        hits = zip(results['ids'][0], results['metadatas'][0], results['distances'][0])
        for hit_id, metadata, distance in hits:
            if metadata['source'] == 'question':
                candidates.append((distance, hit_id, collection))

    if not candidates:
        raise KeyError("no stored question matched the query")

    # min() keeps the first of equally distant candidates.
    _, best_match_q_id, best_collection = min(candidates, key=lambda cand: cand[0])

    # Turn the trailing `q<n>` segment into `a<n>` to get the paired answer id.
    id_parts = best_match_q_id.split('_')
    id_parts[-1] = id_parts[-1].replace('q', 'a')
    id_a = '_'.join(id_parts)

    # Fetch from the collection that produced the match instead of guessing it from
    # the id, so this works for any collections list, not just service/technology.
    best_answer = best_collection.get(id_a)['documents']
    return best_answer[0]

In [None]:

question = "你们的产品需要多久维护一次?"
res = qa(question)
res


In [None]:
# `qa` returns the answer text itself, so indexing it would print only its first character.
print(res)

In [None]:
# |export
# Wire `qa` to a text input (pre-filled with a sample question) and a text output,
# then start the app.
iface = gr.Interface(fn=qa, inputs=gr.Text(value="多久维护一次产品?"), outputs="text")
iface.launch(width=500,share=False)

In [None]:
# this is only necessary in a notebook
iface.close()

## Create a `requirements.txt` file

In [None]:
%%writefile ../requirements.txt
fastcore

In [None]:
# | hide
import nbdev

nbdev.nbdev_export()

In [None]:
# |default_exp data_preprocessing

## Convert this notebook into a Gradio app

In [None]:
# from nbdev.export import nb_export
# nb_export('01_gradio.ipynb', lib_path='.', name='gradio')

In [None]:
# | hide
import nbdev

nbdev.nbdev_export()