In [None]:
# |default_exp chroma_db

## Install dependencies

## Make an app with Gradio

In [14]:
# |export
import csv
import re
import chromadb
from chromadb import Settings
import pandas as pd
from dotenv import load_dotenv
import os

import gradio as gr
from fastcore.net import urljson, HTTPError

In [2]:
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()
# NOTE(review): the variable name suggests an OpenAI key, but the value read is
# GEMINI_API_KEY — confirm which provider is intended. os.getenv returns None
# when the variable is not set.
oai_key = os.getenv('GEMINI_API_KEY')

In [3]:
def convert_qa_to_csv(input_file, output_file):
    """
    Convert a text file with Q/A format to a CSV file.

    The input text is split on every occurrence of ``'Q'`` immediately
    followed by a newline. Whatever precedes the first marker is discarded.
    Inside each block the first line is the question and all remaining lines
    (newlines preserved) form the answer. Blocks that are empty after
    stripping surrounding whitespace are skipped, so they do not end up as
    blank rows in the CSV.

    Args:
        input_file: Path to the input text file (read as UTF-8)
        output_file: Path to the output CSV file (written as UTF-8, with a
            ``Question,Answer`` header row followed by one row per pair)
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # The first piece is the text before the first 'Q' marker (an empty
    # string when the file starts with the marker), never a Q/A block.
    raw_blocks = content.split('Q\n')[1:]

    qa_pairs = []
    for raw_block in raw_blocks:
        block = raw_block.strip()
        # ''.split('\n') is [''], which is truthy, so emptiness must be
        # tested on the block itself rather than on its list of lines.
        if not block:
            continue
        # First line is the question; everything after the first newline is
        # the answer (empty when the block has a single line).
        question, _, answer = block.partition('\n')
        qa_pairs.append([question, answer])

    # newline='' lets the csv module control line endings, which keeps
    # multi-line answers intact inside quoted fields.
    with open(output_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Question', 'Answer'])
        writer.writerows(qa_pairs)
    print(f"Conversion complete. CSV file saved to {output_file}")

In [4]:

# Source Q/A text files and, position for position, the CSV file each one is
# converted into.
input_files = ["../res/qa_service.txt", "../res/qa_technology.txt"]
output_files = ["../res/qa_service.csv", "../res/qa_technology.csv"]
# Convert every text file to its CSV counterpart.
for in_f, ot_f in zip(input_files, output_files):
    convert_qa_to_csv(in_f, ot_f)
    

Conversion complete. CSV file saved to ../res/qa_service.csv
Conversion complete. CSV file saved to ../res/qa_technology.csv


In [8]:
# |export
# Chroma client backed by the on-disk database directory ../db.
client = chromadb.PersistentClient(path="../db")

# One collection per Q/A topic, in the same order as the CSV files.
# get_or_create_collection is used (rather than create_collection) so that
# re-running the cell can pick up a collection that already exists.
collections = [
    client.get_or_create_collection(name=collection_name)
    for collection_name in ("siasun_qa_service", "siasun_qa_technology")
]

In [6]:
# Scratch check of the f-string pattern used for the "q<index>" part of document ids.
i=2
f'q{i}'

'q2'

In [None]:
# |export

# Load every Q/A CSV into its collection: each CSV row contributes two
# documents (the question and the answer) that share the row index in their ids.
# NOTE(review): `output_files` is assigned in a cell that has no export
# directive — confirm the exported module can resolve it.
for csv_file, collection in zip(output_files, collections):
    # The CSVs are written as UTF-8, so read them back with the same encoding
    # instead of the platform default (which may not be UTF-8).
    with open(csv_file, encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        for i, row in enumerate(reader):
            # The first row is the "Question,Answer" header, not a Q/A pair;
            # storing it would add two junk documents to the search index.
            # Index 0 stays reserved for it so data rows keep their ids.
            if i == 0 and row == ['Question', 'Answer']:
                continue
            # Blank lines / all-empty records carry nothing worth embedding.
            if not any(field.strip() for field in row):
                continue
            collection.add(
                documents=row,
                # Tag each document so queries can tell questions from answers.
                metadatas=[{"source": "question"}, {"source": "answer"}],
                ids=[f"{collection.name}_q{i}", f"{collection.name}_a{i}"]
            )

In [17]:
#|export
# Sample query against the first collection (siasun_qa_service): request the
# 4 closest stored documents for one question text.
# NOTE(review): this cell carries an export directive, so the query would also
# run when the exported module is imported — confirm that is intended.
results = collections[0].query(
    query_texts=["你们的产品需要多久维护一次?"],
    n_results=4
)

In [18]:
# Display the raw query response (ids, documents, metadatas, distances).
results

{'ids': [['siasun_qa_service_q2',
   'siasun_qa_service_a9',
   'siasun_qa_service_q21',
   'siasun_qa_service_a10']],
 'embeddings': None,
 'documents': [['你们的产品，多久需要维护一次？维护保养内容有哪些？',
   '作业作为一个重要的单元，所以不能进行批量删除与添加，防止误操作造成损失。',
   '宏作业是干什么用的？',
   '新松有标准的视觉通讯协议，视觉厂家可以按照此协议进行开发适配。当前适配过的品牌有，沈阳自动化所，欧姆龙、海康、梅卡曼德、视比特、基恩士。']],
 'uris': None,
 'data': None,
 'metadatas': [[{'source': 'question'},
   {'source': 'answer'},
   {'source': 'question'},
   {'source': 'answer'}]],
 'distances': [[0.35835238473584624,
   0.8026765812953612,
   0.8398664268040978,
   0.8905950951966901]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

In [13]:
# The response holds one inner list per query text; pick the first returned
# document for the first (and only) query.
results['documents'][0][0]

'你们的产品，多久需要维护一次？维护保养内容有哪些？'

In [23]:
# List the collections known to the client and show the first entry
# (the recorded output below is the collection name as a string).
colls = client.list_collections()
colls[0]

'siasun_qa_service'

In [25]:

# Query every collection with the same question and gather the matching
# stored *questions* (answer documents are filtered out) into one DataFrame.
question = "你们的产品需要多久维护一次?"
answers = []
for collection in collections:
    results = collection.query(
        query_texts=[question],
        n_results=4
    )
    # Index [0] selects the hits for the single query text; the four lists
    # are parallel, so zip walks them hit by hit.
    docs = [
        {'id': hit_id, 'document': document, 'distance': distance}
        for hit_id, document, metadata, distance in zip(
            results['ids'][0],
            results['documents'][0],
            results['metadatas'][0],
            results['distances'][0],
        )
        if metadata['source'] == 'question'
    ]
    # Explicit columns keep the frame's shape stable even when a collection
    # returns no question hits.
    df = pd.DataFrame(docs, columns=['id', 'document', 'distance'])
    answers.append(df)
# Stack the per-collection frames row-wise; the original `axis=` had no value
# and was a SyntaxError. ignore_index gives one continuous row index.
df_answers = pd.concat(answers, axis=0, ignore_index=True)

In [None]:
# |export
def qa(question:str, collections:"list[chromadb.Collection]", n_results:int=4):
    """
    Find stored questions similar to `question` in each collection.

    Args:
        question: The user's question text.
        collections: Collections to search; each must provide
            ``query(query_texts=..., n_results=...)`` returning parallel
            ``ids``, ``documents``, ``metadatas`` and ``distances`` lists.
        n_results: Number of hits requested from each collection (default 4).

    Returns:
        A list with one entry per collection, in the given order. Each entry
        is a list of dicts with keys ``id``, ``document`` and ``distance``
        for the hits whose metadata ``source`` is ``'question'``; hits tagged
        as answers are dropped.
    """
    answers = []
    for collection in collections:
        results = collection.query(
            query_texts=[question],
            n_results=n_results
        )
        # Index [0] selects the hits for the single query text; the four
        # lists are parallel, so zip walks them hit by hit.
        docs = [
            {'id': hit_id, 'document': document, 'distance': distance}
            for hit_id, document, metadata, distance in zip(
                results['ids'][0],
                results['documents'][0],
                results['metadatas'][0],
                results['distances'][0],
            )
            if metadata['source'] == 'question'
        ]
        answers.append(docs)
    # Previously the collected matches were built and then discarded.
    return answers

In [None]:
# |export
# Gradio calls `fn` with one argument per input component. The interface has
# a single text input, so it can only supply the question; passing `qa`
# directly would fail because its `collections` argument is never provided.
def _answer_question(question: str):
    """Forward the UI question to qa() together with the module-level collections."""
    return qa(question, collections)

iface = gr.Interface(fn=_answer_question, inputs=gr.Text(value="多久维护一次产品?"), outputs="text")
iface.launch(width=500)

In [None]:
# Close the interface that iface.launch() started above; this is only
# necessary when running interactively in a notebook.
iface.close()

## Create a `requirements.txt` file

In [None]:
%%writefile ../requirements.txt
fastcore

In [None]:
# | hide
import nbdev

# Run nbdev's export step, which writes the cells marked for export into the
# library module.
nbdev.nbdev_export()

In [1]:
# |default_exp data_preprocessing

## Convert this notebook into a Gradio app

In [9]:
# from nbdev.export import nb_export
# nb_export('01_gradio.ipynb', lib_path='.', name='gradio')

In [None]:
# | hide
# NOTE(review): this repeats the export cell that appears earlier in the
# notebook, and the notebook declares `default_exp` twice (chroma_db and
# data_preprocessing) — confirm which module name is intended and drop the
# duplicate.
import nbdev

nbdev.nbdev_export()