In [1]:
import torch

torch.cuda.is_available()

True

In [2]:
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install transformers[torch] -U

!pip install datasets
!pip install langchain
!pip install langchain_community
!pip install PyMuPDF
!pip install sentence-transformers
!pip install faiss-cpu

Looking in indexes: https://pypi.org/simple/


In [3]:
!pip install peft



In [4]:
!pip install pickle

[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pickle[0m[31m
[0m

In [5]:
import os
import unicodedata

import torch
import pandas as pd
from tqdm import tqdm
import fitz  # PyMuPDF

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig,
    Gemma2ForCausalLM
)
from accelerate import Accelerator

# Langchain 관련
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

from peft import PeftModel
import faiss
import pickle

In [6]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
torch.cuda.empty_cache()


In [7]:
!nvidia-smi

Wed Oct 30 21:12:33 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX 6000...  Off  | 00000000:17:00.0 Off |                  Off |
| 30%   37C    P8    26W / 300W |  12922MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
def process_pdf(file_path, chunk_size=512, chunk_overlap=32):
    """PDF 텍스트 추출 후 chunk 단위로 나누기"""
    # PDF 파일 열기
    doc = fitz.open(file_path)
    text = ''
    # 모든 페이지의 텍스트 추출
    for page in doc:
        breakpoint()
        text += page.get_text()
    # 텍스트를 chunk로 분할
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunk_temp = splitter.split_text(text)
    # Document 객체 리스트 생성
    chunks = [Document(page_content=t) for t in chunk_temp]
    return chunks


def create_vector_db(chunks, model_path="intfloat/multilingual-e5-base"):
    """
    Faiss DB: 
    Embedding 작업(인코딩을 통해 vector들을 하나의 vector space에 투영)이 끝난 뒤,
    vector space 내의 벡터끼리 유사도 기반 검색 서비스를 더 빠르게 지원
    벡터의 유사도 측정, 클러스터링에 효율적인 라이브러리

    전통적인 코사인 유사도 < Faiss DB(gpu 지원)
    """
    
    """FAISS DB 생성"""
    # 임베딩 모델 설정
    model_kwargs = {'device': 'cuda'}
    encode_kwargs = {'normalize_embeddings': True}
    embeddings = HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
    # FAISS DB 생성 및 반환
    db = FAISS.from_documents(chunks, embedding=embeddings)
    return db




def normalize_path(path):
    """경로 유니코드 정규화"""
    return unicodedata.normalize('NFC', path)


def process_pdfs_from_dataframe(base_directory):
    """딕셔너리에 pdf명을 키로해서 DB, retriever 저장"""
    pdf_databases = {}
    pdf_files = [file for file in os.listdir(base_directory) if file.endswith('.pdf')]

    
    for path in tqdm(pdf_files, desc="Processing PDFs"):
        # 경로 정규화 및 절대 경로 생성
        normalized_path = normalize_path(path)
        full_path = os.path.normpath(os.path.join(base_directory, normalized_path.lstrip('./'))) if not os.path.isabs(normalized_path) else normalized_path

        
        pdf_title = os.path.splitext(os.path.basename(full_path))[0]
        print(f"Processing {pdf_title}...")
        
        # PDF 처리 및 벡터 DB 생성
        chunks = process_pdf(full_path)
        
        # 빈 chunks 처리
        if not chunks:
            print(f"Skipping {pdf_title} due to empty content.")
            continue
        
        db = create_vector_db(chunks)
        
        # Retriever 생성
        retriever = db.as_retriever(search_type="mmr", 
                                    search_kwargs={'k': 3, 'fetch_k': 8})
        

        faiss_directory = "./faiss/" + base_directory.split('/')[-1] + "/"
        faiss.write_index(db.index, faiss_directory + pdf_title + "_faiss_db.index")

        # 필요한 매핑 객체를 추출하여 저장
        with open(faiss_directory + pdf_title + "_index_to_docstore_id.pkl", "wb") as f:
            pickle.dump(db.index_to_docstore_id, f)

        with open(faiss_directory + pdf_title + "_docstore.pkl", "wb") as f:
            pickle.dump(db.docstore, f)
        
        # 결과 저장
        pdf_databases[pdf_title] = {
                'db': db,
                'retriever': retriever
        }
    return pdf_databases



In [9]:
fish_data_directory = './data/fish' # Your Base Directory
# df = pd.read_csv(data_directory + '/test.csv')
fish_pdf_databases = process_pdfs_from_dataframe(fish_data_directory)

Processing PDFs:   0%|          | 0/11 [00:00<?, ?it/s]

Processing 숭어...


  embeddings = HuggingFaceEmbeddings(
Processing PDFs:   9%|▉         | 1/11 [00:06<01:01,  6.17s/it]

Processing 돔류...


Processing PDFs:  18%|█▊        | 2/11 [00:10<00:47,  5.33s/it]

Processing 조피볼락...


Processing PDFs:  27%|██▋       | 3/11 [00:16<00:44,  5.53s/it]

Processing 비단잉어...


Processing PDFs:  36%|███▋      | 4/11 [00:20<00:33,  4.75s/it]

Processing 무지개송어...


Processing PDFs:  45%|████▌     | 5/11 [00:23<00:26,  4.34s/it]

Processing 향어...


Processing PDFs:  55%|█████▍    | 6/11 [00:28<00:22,  4.57s/it]

Processing 넙치...


Processing PDFs:  64%|██████▎   | 7/11 [00:33<00:18,  4.67s/it]

Processing 황복...


Processing PDFs:  73%|███████▎  | 8/11 [00:38<00:13,  4.64s/it]

Processing 메기...


Processing PDFs:  82%|████████▏ | 9/11 [00:42<00:09,  4.61s/it]

Processing 강도다리...


Processing PDFs:  91%|█████████ | 10/11 [00:46<00:04,  4.45s/it]

Processing 뱀장어...


Processing PDFs: 100%|██████████| 11/11 [00:52<00:00,  4.80s/it]


In [10]:
shellfish_data_directory = './data/shellfish' # Your Base Directory
# df = pd.read_csv(data_directory + '/test.csv')
shellfish_pdf_databases = process_pdfs_from_dataframe(shellfish_data_directory)
shellfish_pdf_databases

Processing PDFs:   0%|          | 0/3 [00:00<?, ?it/s]

Processing 가리비...


Processing PDFs:  33%|███▎      | 1/3 [00:04<00:09,  4.98s/it]

Processing 전복...


Processing PDFs:  67%|██████▋   | 2/3 [00:08<00:04,  4.28s/it]

Processing 참굴...


Processing PDFs: 100%|██████████| 3/3 [00:13<00:00,  4.33s/it]


{'가리비': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4b3e9d9100>,
  'retriever': VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f4b3e9d9100>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 8})},
 '전복': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4b3e699880>,
  'retriever': VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f4b3e699880>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 8})},
 '참굴': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4add122070>,
  'retriever': VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f4add122070>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 8})}}

In [11]:
seaweed_data_directory = './data/seaweed' # Your Base Directory
# df = pd.read_csv(data_directory + '/test.csv')
seaweed_pdf_databases = process_pdfs_from_dataframe(seaweed_data_directory)
seaweed_pdf_databases

Processing PDFs:   0%|          | 0/6 [00:00<?, ?it/s]

Processing 미역...


Processing PDFs:  17%|█▋        | 1/6 [00:04<00:21,  4.36s/it]

Processing 김...


Processing PDFs:  33%|███▎      | 2/6 [00:10<00:21,  5.28s/it]

Processing 곰피...


Processing PDFs:  50%|█████     | 3/6 [00:13<00:13,  4.52s/it]

Processing 모자반...


Processing PDFs:  67%|██████▋   | 4/6 [00:19<00:09,  4.83s/it]

Processing 청각...


Processing PDFs:  83%|████████▎ | 5/6 [00:22<00:04,  4.25s/it]

Processing 넓미역...


Processing PDFs: 100%|██████████| 6/6 [00:25<00:00,  4.32s/it]


{'미역': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4b3e85e100>,
  'retriever': VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f4b3e85e100>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 8})},
 '김': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4c6d6530a0>,
  'retriever': VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f4c6d6530a0>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 8})},
 '곰피': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4adcc0d130>,
  'retriever': VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f4adcc0d130>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 8})},
 '모자반': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4b3e5b8070>,
  'retriever': Ve

In [12]:
etc_data_directory = './data/etc' # Your Base Directory
# df = pd.read_csv(data_directory + '/test.csv')
etc_pdf_databases = process_pdfs_from_dataframe(etc_data_directory)
etc_pdf_databases

Processing PDFs:   0%|          | 0/4 [00:00<?, ?it/s]

Processing 해삼...


Processing PDFs:  25%|██▌       | 1/4 [00:03<00:09,  3.29s/it]

Processing 큰징거미새우...


Processing PDFs:  50%|█████     | 2/4 [00:06<00:06,  3.32s/it]

Processing 멍게...


Processing PDFs:  75%|███████▌  | 3/4 [00:10<00:03,  3.56s/it]

Processing 흰다리새우...


Processing PDFs: 100%|██████████| 4/4 [00:15<00:00,  3.94s/it]


{'해삼': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4b3e3f21f0>,
  'retriever': VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f4b3e3f21f0>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 8})},
 '큰징거미새우': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4b3e36f0d0>,
  'retriever': VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f4b3e36f0d0>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 8})},
 '멍게': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4adcc080d0>,
  'retriever': VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f4adcc080d0>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 8})},
 '흰다리새우': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4adc091130>,
  'retriev

In [13]:
fish_pdf_databases

{'숭어': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4b4dff4610>,
  'retriever': VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f4b4dff4610>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 8})},
 '돔류': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4b4dff4fd0>,
  'retriever': VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f4b4dff4fd0>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 8})},
 '조피볼락': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4b4838a160>,
  'retriever': VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7f4b4838a160>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 8})},
 '비단잉어': {'db': <langchain_community.vectorstores.faiss.FAISS at 0x7f4b1a309160>,
  'retriever'

In [12]:
def setup_llm_pipeline():
    # 4비트 양자화 설정
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    # 모델 ID 
    model_id = "rtzr/ko-gemma-2-9b-it"

    # 토크나이저 로드 및 설정
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = False
    

    # 모델 로드 및 양자화 설정 적용
    model = Gemma2ForCausalLM.from_pretrained(
        model_id,
        #quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True )

#     model = PeftModel.from_pretrained(model, "./persona/checkpoint-200",is_trainable=True)

    # HuggingFacePipeline 객체 생성
    text_generation_pipeline = pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        #temperature=0.2,
        return_full_text=False,
        max_new_tokens=450,
    )

    hf = HuggingFacePipeline(pipeline=text_generation_pipeline)

    return hf

In [13]:
# LLM 파이프라인
llm = setup_llm_pipeline()

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
  hf = HuggingFacePipeline(pipeline=text_generation_pipeline)


In [15]:
def normalize_string(s):
    """유니코드 정규화"""
    return unicodedata.normalize('NFC', s)

def format_docs(docs):
    """검색된 문서들을 하나의 문자열로 포맷팅"""
    context = ""
    for doc in docs:
        context += doc.page_content
        context += '\n'
    return context

# 테스트용으로 첫 번째 질문만 선택
# row = df.iloc[0]

# 소스 문자열 정규화
source = normalize_string('숭어')
question = '숭어의 서식지는 어떻게 돼?'

# 정규화된 키로 데이터베이스 검색
normalized_keys = {normalize_string(k): v for k, v in fish_pdf_databases.items()}
retriever = normalized_keys[source]['retriever']

# RAG 체인 구성
template = """
다음 정보를 바탕으로 질문에 답하세요:
{context}

질문: {question}

주어진 질문에만 답변하세요. 문장으로 답변해주세요. 답변할 때 질문의 주어를 써주세요.
답변:
"""
prompt = PromptTemplate.from_template(template)

# RAG 체인 정의
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# 답변 추론
print(f"Question: {question}")
full_response = rag_chain.invoke(question)

print(f"Answer: {full_response}\n")

# 결과 저장
# results = [{
#     "Source": row['Source'],
#     "Source_path": row['Source_path'],
#     "Question": question,
#     "Answer": full_response
# }]


Question: 숭어의 서식지는 어떻게 돼?
Answer: 숭어는 적도 주변의 열대 해역을 포함한 북위 42°부터 남위 42°까지 광범위한 위도대에서 서식 가능한 광온성 어류이자 바다에서 담수까지 살 수 있는 광염성 어류이다. 






In [22]:
from langchain.docstore import InMemoryDocstore

# 임베딩 설정
model_path = "intfloat/multilingual-e5-base"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceEmbeddings(
    model_name=model_path,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# 빈 docstore와 index_to_docstore_id 생성
docstore = InMemoryDocstore({})
index_to_docstore_id = {}


index = faiss.read_index( "./faiss/fish/숭어_faiss_db.index")
db = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id
)

In [24]:
db.as_retriever(search_type="mmr", search_kwargs={'k': 3, 'fetch_k': 8})

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7fcce3939f40>, search_type='mmr', search_kwargs={'k': 3, 'fetch_k': 8})

In [None]:
def normalize_string(s):
    """유니코드 정규화"""
    return unicodedata.normalize('NFC', s)

def format_docs(docs):
    """검색된 문서들을 하나의 문자열로 포맷팅"""
    context = ""
    for doc in docs:
        context += doc.page_content
        context += '\n'
    return context

# 결과를 저장할 리스트 초기화
results = []

# DataFrame의 각 행에 대해 처리
for _, row in tqdm(df.iterrows(), total=len(df), desc="Answering Questions"):
    # 소스 문자열 정규화
    source = normalize_string(row['Source'])
    question = row['Question']

    # 정규화된 키로 데이터베이스 검색
    normalized_keys = {normalize_string(k): v for k, v in pdf_databases.items()}
    retriever = normalized_keys[source]['retriever']

    # RAG 체인 구성
    template = """
    다음 정보를 바탕으로 질문에 답하세요:
    {context}

    질문: {question}
    
    주어진 질문에만 답변하세요. 문장으로 답변해주세요. 답변할 때 질문의 주어를 써주세요.
    답변:
    """
    prompt = PromptTemplate.from_template(template)

    # RAG 체인 정의
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    # 답변 추론
    print(f"Question: {question}")
    full_response = rag_chain.invoke(question)

    print(f"Answer: {full_response}\n")

    # 결과 저장
    results.append({
        "Source": row['Source'],
        "Source_path": row['Source_path'],
        "Question": question,
        "Answer": full_response
    })