In [3]:
!pip install langchain_community gradio openai chromadb tiktoken

Collecting langchain_community
  Downloading langchain_community-0.2.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)
Collecting langchain<0.3.0,>=0.2.0 (from langchain_community)
  Downloading langchain-0.2.0-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.7/973.7 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.3.0,>=0.2.0 (from langchain_community)
  Downloading langchain_core-0.2.1-py3-none-any.whl (308 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m308.5/308.5 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langsmith<0.2.0,>=0.1.0 (from langchain_community)
  Downloading langsmith-0.1.62-py3-none-any.whl (122 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

In [7]:
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_text_splitters.base import Language
from langchain_community.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
import gradio as gr
import os
import warnings
import json

# Suppress every warning category for the rest of the session so that
# library deprecation notices do not clutter the cell output.
warnings.simplefilter('ignore')

In [8]:
class ChromaEmbedding:
    """Small wrapper around a Chroma vector store persisted in a folder."""

    def __init__(self, directory, embedding):
        """
        :param directory: folder that holds the vector database
        :param embedding: model used to compute the embeddings
        """
        self.directory = directory
        self.chromaDb = Chroma(persist_directory=self.directory, embedding_function=embedding)

    def addJSONL(self, jsonl_file):
        """
        Load a JSONL file of question/answer records and embed them.

        Every non-blank line is parsed as a JSON object. Its "question" and
        "answer" fields (empty string when absent) are rendered as
        "Question: ...\\nAnswer: ..." text, split into chunks and added to the
        vector store. Blank lines are skipped, and a leading UTF-8 BOM is
        tolerated. When the file yields no records, nothing is written.

        :param jsonl_file: path of the JSONL file
        :return: None
        :raises json.JSONDecodeError: if a non-blank line is not valid JSON
        """
        documents = []
        # 'utf-8-sig' reads plain UTF-8 as well as files saved with a BOM,
        # which would otherwise make json.loads fail on the first line.
        with open(jsonl_file, 'r', encoding='utf-8-sig') as file:
            for line in file:
                record = line.strip()
                if not record:
                    # A trailing or separating blank line is not a record.
                    continue
                data = json.loads(record)
                question = data.get("question", "")
                answer = data.get("answer", "")
                documents.append(f"Question: {question}\nAnswer: {answer}")

        if not documents:
            # Nothing to embed; avoid handing the vector store an empty batch.
            return

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=900,
            chunk_overlap=0,
            length_function=len,
        )

        # Split the texts into chunks of the configured size.
        docs = text_splitter.create_documents(documents)

        # Store the chunks in the embedding database.
        self.chromaDb.add_documents(docs)
        self.chromaDb.persist()

In [18]:
# Vector database folder
INDEX = "/content/drive/MyDrive/AI-modeling/law_RAG app/app_QA_index"

# Default question/answer dataset (JSONL) that gets embedded into INDEX
QA_JSONL = "/content/drive/MyDrive/AI-modeling/law_RAG app/data/law_qa_sample.jsonl"

def buildIndex(jsonl_file=None, index_dir=None):
    """
    Embed a JSONL question/answer file with the OpenAI embedding model.

    :param jsonl_file: path of the JSONL file to embed; defaults to QA_JSONL
    :param index_dir: vector database folder; defaults to INDEX
    :return: None
    """
    # Defaults are resolved at call time so that reassigning the module-level
    # constants in a later cell is honoured.
    if index_dir is None:
        index_dir = INDEX
    if jsonl_file is None:
        jsonl_file = QA_JSONL

    chroma = ChromaEmbedding(index_dir, OpenAIEmbeddings())
    chroma.addJSONL(jsonl_file)
    print("임베딩 완료!")

In [26]:
def _formatSourceDocuments(source_documents):
    """Render retrieved documents as Markdown, one fenced section per document."""
    return "".join(
        f"## 검색문서 {ix+1}\n```\n{doc.page_content}\n```\n\n"
        for ix, doc in enumerate(source_documents)
    )


def runApplication():
    """
    Launch the RAG application.

    Opens the vector store at INDEX, builds a RetrievalQA chain on top of it
    and serves a Gradio page with a question box, an answer tab and a tab
    listing the retrieved source documents.

    :return: None
    """
    chroma = ChromaEmbedding(INDEX, OpenAIEmbeddings())
    retriever = chroma.chromaDb.as_retriever(
        search_type="similarity",
        search_kwargs={
            'k': 2,  # number of documents to return
        }
    )

    # Question-answering chain that also returns the documents it used.
    qa_interface = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
        chain_type="stuff",
        retriever=retriever,
        chain_type_kwargs={
            "verbose": True,
        },
        return_source_documents=True,
    )

    # Build the web page.
    with gr.Blocks() as rag_tester:
        gr.HTML("<h2>법률 사례기반 챗봇</h2>")
        with gr.Row():
            with gr.Column(scale=1):
                # The hint is a placeholder, not the initial value, so it is
                # never submitted as if it were the user's question.
                query = gr.Textbox(label="질문", placeholder="법률 관련 질문을 입력하세요.", lines=3)
                with gr.Row():
                    clear = gr.Button("Clear")
                    submit = gr.Button("Submit", variant="primary")
            with gr.Column(scale=1):
                with gr.Tab("답변"):
                    answer_box = gr.Textbox(label="", lines=6)
                with gr.Tab("검색 문서"):
                    sources_md = gr.Markdown()

        def submitHandler(input_text):
            """Answer the question; returns (answer text, sources Markdown)."""
            if not input_text or not input_text.strip():
                # Nothing was asked: skip the retrieval and the LLM call.
                return "", ""

            # Chain.__call__ is deprecated; invoke() with the chain's "query"
            # input key is the supported equivalent.
            qa_result = qa_interface.invoke({"query": input_text})
            return qa_result["result"], _formatSourceDocuments(qa_result['source_documents'])

        def clearHandler():
            """Reset the question, the answer and the sources view."""
            return "", "", ""

        submit.click(submitHandler, inputs=[query], outputs=[answer_box, sources_md])
        clear.click(clearHandler, outputs=[query, answer_box, sources_md])

    rag_tester.launch()

if __name__ == '__main__':
    # Embed only when the vector database folder is missing or empty.
    # Calling buildIndex() on every run would add the same question/answer
    # chunks to the persisted store again: extra embedding API calls, and
    # duplicate chunks competing for the k=2 retrieval slots.
    if not os.path.isdir(INDEX) or not os.listdir(INDEX):
        buildIndex()
    runApplication()

임베딩 완료!
Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://e4c63cdb22598f2652.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
