In [None]:
# |default_exp chroma_db

## Install dependencies

## Make an app with Gradio

In [2]:
# |export
import csv
import re
import chromadb
from chromadb import Settings
import pandas as pd
from dotenv import load_dotenv
import os

import gradio as gr
from fastcore.net import urljson, HTTPError

In [3]:
load_dotenv()
oai_key = os.getenv('GEMINI_API_KEY')

In [4]:
def convert_qa_to_csv(input_file, output_file):
    """
    Convert a text file with Q/A format to a CSV file.

    Args:
        input_file: Path to the input text file
        output_file: Path to the output CSV file
    """
    # Read the content of the file
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split the content by 'Q' marker
    qa_blocks = content.split('Q\n')

    qa_blks = [block.strip() for block in qa_blocks][1:]
    # Remove empty blocks (like the first one if file starts with 'Q')
    # qa_blocks = [[line for line in block.split('\n') ] for block in qa_blks if block.strip()]
    # Remove empty blocks (like the first one if file starts with 'Q')
    # qa_blocks = [blk for block in qa_blocks if block.strip() for blk in block.strip()]

    # Process each Q&A block
    qa_pairs = []
    for block in qa_blks:
        # Split the block into lines
        lines = block.strip().split('\n')

        if lines:
            # First line is the question
            question = lines[0]
            # The rest are the answer
            answer = '\n'.join(lines[1:])

            # Add the pair to our list
            qa_pairs.append([question, answer])

    # Write to CSV
    with open(output_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        # Write header
        writer.writerow(['Question', 'Answer'])
        # Write Q&A pairs
        for pair in qa_pairs:
            writer.writerow(pair)
    print(f"Conversion complete. CSV file saved to {output_file}")

In [5]:

input_files = ["../res/qa_service.txt", "../res/qa_technology.txt"]
output_files = ["../res/qa_service.csv", "../res/qa_technology.csv"]
for in_f, ot_f in zip(input_files, output_files):
    convert_qa_to_csv(in_f, ot_f)
    

Conversion complete. CSV file saved to ../res/qa_service.csv
Conversion complete. CSV file saved to ../res/qa_technology.csv


In [6]:
# |export
client = chromadb.PersistentClient(path="../db")
# collections = [client.create_collection(name="siasun_qa_service"),
#                 client.create_collection(name="siasun_qa_technology")]
collections = [client.get_or_create_collection(name="siasun_qa_service"),
                client.get_or_create_collection(name="siasun_qa_technology")]

In [7]:
i=2
f'q{i}'

'q2'

In [8]:
# |export

for csv_file,collection in zip(output_files,collections):
    with open(csv_file, newline='') as f:
        reader = csv.reader(f)
        for i, row in enumerate(reader):
            collection.add(
                documents = row,
                metadatas = [{"source": "question"}, {"source": "answer"}],
                ids = [f"{collection.name}_q{i}", f"{collection.name}_a{i}"]
            )

Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of existing embedding ID: a
Add of existing embedding ID: q
Add of e

In [17]:
#|export
queries=["你们的产品需要多久维护一次?","我怎么设置机器人的安全工作区域?"]
queries[0]

'你们的产品需要多久维护一次?'

In [20]:
results = collections[0].query(
    query_texts=queries,
    n_results=4
)

In [19]:
results

{'ids': [['siasun_qa_service_q2',
   'siasun_qa_service_a9',
   'siasun_qa_service_q21',
   'siasun_qa_service_a10']],
 'embeddings': None,
 'documents': [['你们的产品，多久需要维护一次？维护保养内容有哪些？',
   '作业作为一个重要的单元，所以不能进行批量删除与添加，防止误操作造成损失。',
   '宏作业是干什么用的？',
   '新松有标准的视觉通讯协议，视觉厂家可以按照此协议进行开发适配。当前适配过的品牌有，沈阳自动化所，欧姆龙、海康、梅卡曼德、视比特、基恩士。']],
 'uris': None,
 'data': None,
 'metadatas': [[{'source': 'question'},
   {'source': 'answer'},
   {'source': 'question'},
   {'source': 'answer'}]],
 'distances': [[0.3583492934703827,
   0.802642822265625,
   0.8398691415786743,
   0.8906089067459106]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [21]:
# results['metadatas'][0] #[0]['source']
results['documents'][0][0]

'你们的产品，多久需要维护一次？维护保养内容有哪些？'

In [22]:
colls = client.list_collections()
colls[0]

'siasun_qa_technology'

In [71]:

# question = "你们的产品需要多久维护一次?"
question = "你们在售前评估上，如何帮助到我们?"
answers = []
for collection in collections:
    results = collection.query(
        query_texts=[question],
        n_results=4
    )
    docs = []
    for i,metadata in enumerate(results['metadatas'][0]):
        if metadata['source'] == 'question':
            docs.append({'id': results['ids'][0][i],
                         'document': results['documents'][0][i],
                         'distance':results['distances'][0][i]})
    df = pd.DataFrame(docs)
    answers.append(df)
df_answers = pd.concat(answers, axis=0,ignore_index=True)
# df_answers = pd.stack(answers, axis=2)

In [72]:
df_answers.loc[df_answers['distance'].idxmin()]

id          siasun_qa_technology_q10
document         你们在售前评估上，能提供什么样的帮助？
distance                    0.376306
Name: 2, dtype: object

In [75]:

id_q = df_answers.loc[df_answers['distance'].idxmin()]['id']
id_q

'siasun_qa_technology_q10'

In [83]:

id_a_list = id_q.split('_')
id_a_list[-1] = id_a_list[-1].replace('q','a')
id_a_list

['siasun', 'qa', 'technology', 'a10']

In [84]:
id_a = '_'.join(id_a_list)
id_a

'siasun_qa_technology_a10'

In [87]:
coll_idx = 0 if id_a_list[-2] == 'service' else 1
coll_idx

1

In [88]:
answer = collections[coll_idx].get(id_a)
answer['documents']

['部分情况可提供现场技术指导，提供成功应用案例经验支持，仿真模拟场景，评估负载等风险，提供机械、电气、软件接口对接。']

In [27]:
best_answer = df_answers.loc[df_answers['distance'].idxmin()]

In [89]:
def qa(question:str, collections:list[chromadb.Collection]=collections):
    matched_questions = []
    for collection in collections:
        results = collection.query(
            query_texts=[question],
            n_results=4
        )
        docs = []
        for i,metadata in enumerate(results['metadatas'][0]):
            if metadata['source'] == 'question':
                docs.append({'id': results['ids'][0][i],
                                'document': results['documents'][0][i], 
                                'distance':results['distances'][0][i]})
        df = pd.DataFrame(docs)
        matched_questions.append(df)

    df_matched_questions = pd.concat(matched_questions,axis=0,ignore_index=True)
    best_match_q_id = df_matched_questions.loc[df_matched_questions['distance'].idxmin()]['id']
    id_a_list = best_match_q_id.split('_')
    id_a_list[-1] = id_a_list[-1].replace('q','a')
    id_a = '_'.join(id_a_list)
    coll_idx = 0 if id_a_list[-2] == 'service' else 1
    best_answer = collections[coll_idx].get(id_a)['documents']
    return best_answer
    # question =
    # return answers

In [90]:

question = "你们的产品需要多久维护一次?"
qa(question)


['根据机器人的型号和实际使用情况，制定机器人的保养计划,一般分为日常、3 个月、6 个月、1 年期的维护保养。\n需要对机器人进行日常点检和定期维护保养，点检工作主要检查设备是否存在漏油、异响、异常震动、异常报警；定期维护保养主要对油脂、线束护套、风扇、电机接头等易损位置进行检查，并定期更换润滑油。具体检验项目及维护周期详见安装维护手册。']

In [None]:
# |export
iface = gr.Interface(fn=qa, inputs=gr.Text(value="多久维护一次产品?"), outputs="text")
iface.launch(width=500,share=True)

* Running on local URL:  http://127.0.0.1:7861


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


* Running on public URL: https://1cd211027624cee8c5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [92]:
# this is only necessary in a notebook
iface.close()

Closing server running on port: 7861


## Create a `requirements.txt` file

In [None]:
%%writefile ../requirements.txt
fastcore

In [None]:
# | hide
import nbdev

nbdev.nbdev_export()

In [1]:
# |default_exp data_preprocessing

## Convert this notebook into a Gradio app

In [9]:
# from nbdev.export import nb_export
# nb_export('01_gradio.ipynb', lib_path='.', name='gradio')

In [None]:
# | hide
import nbdev

nbdev.nbdev_export()