In [1]:
from dotenv import load_dotenv

# Populate os.environ from a dotenv file before anything reads the environment;
# OPENAI_API_KEY and LANGCHAIN_TRACING_V2 are looked up from os.environ further down.
load_dotenv()

True

In [2]:
import os

# Confirm that the credentials were loaded WITHOUT echoing the secret itself: cell
# outputs are stored inside the notebook file, so printing the raw key leaks it to
# everyone who can read the notebook. Indexing os.environ (instead of .get) still
# fails fast with a KeyError when a variable is missing.
print(f"[API KEY]\n<set, {len(os.environ['OPENAI_API_KEY'])} characters>")
print(os.environ["LANGCHAIN_TRACING_V2"])

[API KEY]
[REDACTED - an OpenAI API key was printed here; revoke and rotate that key]
true


In [3]:
import torch
import os
import uuid

from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts.chat import PromptTemplate, ChatPromptTemplate

import numpy as np

from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline
import transformers

from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from transformers import pipeline, AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from langchain_community.chat_models import ChatOllama
from langchain_core.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_core.callbacks.manager import CallbackManager

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Step 1: load the dataset

import json

# Location of the JSON dump that holds the serialized Q&A documents
QNA_JSON_PATH = "/media/choi/HDD1/mmaction2/data/Korea_construction_standard/LHCS_qna_id.json"

# Read the raw records, then rebuild one Document object per record
with open(QNA_JSON_PATH, "r", encoding="utf-8") as f:
    documents_data = json.load(f)

documents_512 = [
    Document(page_content=record["page_content"], metadata=record["metadata"])
    for record in documents_data
]

print(f"문서의 페이지수 : {len(documents_512)}")

문서의 페이지수 : 1255


In [5]:
# 단계 2 : 임베딩 모델 로드
from langchain_google_vertexai import VertexAIEmbeddings
# from FlagEmbedding import FlagModel
from transformers import AutoModel


# Checkpoint directories on the local disk
KLUE_NLI_CHECKPOINT = '/home/choi/Git/ConSRoBERTa/output/2024_2/klue_nli_top_2e_55.24/kor_multi_klue-2024-09-19_15-26-29e1e2'
MNRL_MRL_CHECKPOINT = '/home/choi/Git/RAG_con_doc/langchain/FT_model/MRL_MNRL_NLI-2024-10-17_14-16-30/checkpoint/checkpoint-322_best'

# One embedding wrapper per candidate model; the instantiation order is unchanged.
EM_klue_nli = HuggingFaceEmbeddings(model_name=KLUE_NLI_CHECKPOINT)
EM_open_3 = OpenAIEmbeddings(model='text-embedding-3-large')  # paid OpenAI embedding model
EM_klue_vanilla = HuggingFaceEmbeddings(model_name='klue/roberta-base')
EM_MNRL_MRL = HuggingFaceEmbeddings(model_name=MNRL_MRL_CHECKPOINT)
bge_m3 = HuggingFaceEmbeddings(model_name='BAAI/bge-m3')
e5_large = HuggingFaceEmbeddings(model_name='intfloat/multilingual-e5-large-instruct')

# Subset of the models above collected in a single list
embedding_models = [EM_open_3, bge_m3, e5_large]

  EM_klue_nli = HuggingFaceEmbeddings(model_name = '/home/choi/Git/ConSRoBERTa/output/2024_2/klue_nli_top_2e_55.24/kor_multi_klue-2024-09-19_15-26-29e1e2')
No sentence-transformers model found with name klue/roberta-base. Creating a new one with mean pooling.
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Generation setup: besides the top-5 retrieval results, a generated answer is stored too.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0.0)

# Prompt text (Korean). It tells the model that the question after <Question> refers to
# the document after <Text>, and asks for an accurate, concise answer written in Korean.
ANSWER_TEMPLATE = """
- <Question> 이후에 오는 질문은 <Text> 이후에 오는 문서와 관련된 질문이야.
- <Text> 이후의 문서를 참고해서 <Question> 이후에 오는 질문에 대한 답변을 생성해줘.
- 정확하고 간결하게 답변해줘
- 답변은 한글로 출력해줘


<Question> : {question}
<Text>: {input}
----
<Answer>:
    """

answer_prompt = ChatPromptTemplate.from_template(ANSWER_TEMPLATE)

class AnswerParser(StrOutputParser):
    """String output parser that drops an echoed ``Answer>:`` header from the reply.

    The answer prompt ends with ``<Answer>:``; when the model repeats that marker in
    its output, only the text that follows it is kept.
    """

    def parse(self, response):
        """Return the answer text contained in ``response``.

        Args:
            response: Raw text produced by the model.

        Returns:
            Everything after the first "Answer>:" marker with surrounding whitespace
            removed, or the whole stripped response when the marker is absent.
        """
        # partition() cuts at the FIRST marker only, so an answer that happens to
        # contain "Answer>:" again is no longer truncated at the second occurrence
        # (split(...)[1] returned just the segment between the two markers).
        _, marker, answer = response.partition("Answer>:")
        return answer.strip() if marker else response.strip()


# Generation chain: fill the prompt -> call the LLM -> clean up the text output.
A_chain = answer_prompt | llm | AnswerParser()


In [7]:
# Retrieval check: top-1 hit rate of an embedding model
def Accuracy_test(documents, embed_model):
    """Print the top-1 retrieval accuracy of ``embed_model`` over ``documents``.

    A FAISS index is built from ``documents``; each document's own question
    (``metadata['question']``) is then used as the query, and a hit is counted when
    the best match carries the same ``metadata['id']``.

    Args:
        documents: Iterable of documents exposing ``metadata['question']`` and
            ``metadata['id']``. These are both the indexed corpus and the query set.
        embed_model: Embedding object handed to ``FAISS.from_documents``.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    # Work on the argument itself. The previous version iterated and counted the
    # notebook-global `documents_512`, so any other corpus passed in was indexed
    # but never actually evaluated.
    documents = list(documents)
    if not documents:
        raise ValueError("documents must contain at least one Document")

    vectorstore = FAISS.from_documents(documents=documents, embedding=embed_model)
    print("FAISS 인덱스 생성 완료")

    # Only the best match is compared, so a single result per query is enough.
    retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

    correct = 0
    for doc in documents:
        results = retriever.invoke(doc.metadata['question'])
        # An empty result list counts as a miss instead of raising IndexError.
        if results and results[0].metadata['id'] == doc.metadata['id']:
            correct += 1

    # Accuracy as a percentage of all evaluated questions
    accuracy = (correct / len(documents)) * 100
    print(f"검색 정확도: {accuracy}%")

In [8]:
# NDCG와 MRR 점수 비교하고 결과 top5 랑 답변 결과까지 저장하기
import json
from sklearn.metrics import ndcg_score

# Retrieve the top 5 documents per question with FAISS, generate an answer from the
# best hit, and record NDCG@5 / MRR@5 for every question.
def evaluate_and_save_results(embedding_model, documents, output_file, A_chain):
    """Evaluate retrieval quality of ``embedding_model`` and dump per-question results.

    For every document, its ``metadata['question']`` is searched against a FAISS
    index built from ``documents``. The top hit is passed to ``A_chain`` to generate
    an answer, and NDCG@5 / MRR@5 are computed from the position of the document's
    own ``metadata['id']`` among the retrieved ids. All per-question records are
    written to ``output_file`` as JSON and the metric averages are printed.

    Args:
        embedding_model: Embedding object handed to ``FAISS.from_documents``.
        documents: Iterable of documents exposing ``page_content`` and the metadata
            keys ``'question'``, ``'id'`` and ``'answer'``.
        output_file: Path of the JSON file to write.
        A_chain: Object whose ``invoke`` accepts ``{'question': ..., 'input': ...}``
            and returns the generated answer.

    Raises:
        ValueError: If ``documents`` is empty.
    """
    documents = list(documents)
    if not documents:
        # Avoids an obscure failure while indexing / a division by zero at the end.
        raise ValueError("documents must contain at least one Document")

    vectorstore = FAISS.from_documents(documents=documents, embedding=embedding_model)

    results = []
    ndcg_scores = []
    mrr_scores = []

    for doc in documents:
        question = doc.metadata['question']
        correct_id = doc.metadata['id']
        correct_answer = doc.metadata['answer']

        # Top-5 search; fewer hits come back when the corpus has fewer than 5 documents.
        retrieved_results = vectorstore.similarity_search(query=question, k=5)
        retrieved_ids = [result.metadata['id'] for result in retrieved_results]

        # Generate the answer from the best-ranked document only.
        generated_answer = A_chain.invoke({
            'question': question,
            'input': retrieved_results[0].page_content
        })

        # Binary relevance: 1 where the retrieved id is the expected one.
        y_true = [1 if retrieved_id == correct_id else 0 for retrieved_id in retrieved_ids]
        # The scores only encode rank order. They are sized to the real number of
        # hits: a fixed [5, 4, 3, 2, 1] made ndcg_score fail on mismatched shapes
        # whenever fewer than 5 results were returned.
        y_score = list(range(len(retrieved_ids), 0, -1))

        if len(retrieved_ids) > 1:
            ndcg = float(ndcg_score([y_true], [y_score], k=5))
        else:
            # NOTE(review): recent scikit-learn releases reject single-document
            # rankings; with one result NDCG reduces to hit (1.0) or miss (0.0).
            ndcg = float(y_true[0])
        ndcg_scores.append(ndcg)

        # MRR@5: reciprocal rank of the first correct hit, 0 when it is not retrieved.
        mrr = 1 / (y_true.index(1) + 1) if 1 in y_true else 0
        mrr_scores.append(mrr)

        results.append({
            'question': question,
            'correct_id': correct_id,
            'retrieved_ids': retrieved_ids,
            'correct_answer': correct_answer,
            'generated_answer': generated_answer,
            'ndcg@5': ndcg,
            'mrr@5': mrr
        })

    # Persist the per-question records (ensure_ascii=False keeps Korean text readable).
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    # Report the averages over all questions.
    print(f"Average NDCG@5: {sum(ndcg_scores) / len(ndcg_scores)}")
    print(f"Average MRR@5: {sum(mrr_scores) / len(mrr_scores)}")

In [12]:
# 간단한 테스트
from langchain.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
# 임베딩 모델 로드 (파인튜닝된 모델)

# Accuracy_test(documents_512, EM_klue_nli)
# Accuracy_test(documents_512, EM_open)
# Accuracy_test(documents_512, EM_klue_vanilla)
# output_path = '/home/choi/Git/RAG_con_doc/langchain/MNRL_MRL.json'
# evaluate_and_save_results(embed_model, documents_512_sample, output_path, A_chain)
# (label, embedding model) pairs to evaluate; the label becomes part of the result
# file name. Entries are toggled by (un)commenting them.
# NOTE(review): 'EM_open' is not defined in the cells shown here (only EM_open_3 is) —
# confirm before re-enabling that entry.
embedding_models2 = [
    ('EM_klue_nli', EM_klue_nli),
    # ('EM_MNRL_MRL', EM_MNRL_MRL),
    # ('EM_open', EM_open),
    # ('EM_klue_vanilla', EM_klue_vanilla),
    # ('EM_open_3', EM_open_3),
    # ('bge_m3', bge_m3),
    # ('e5_large', e5_large),
    
]

# Run the full evaluation for every enabled model and write one JSON file per model.
for model_name, embed_model in embedding_models2:
    output_path = f'/home/choi/Git/RAG_con_doc/langchain/{model_name}_results.json'
    print(f"Evaluating model: {model_name}")
    evaluate_and_save_results(embed_model, documents_512, output_path, A_chain)

Evaluating model: EM_klue_nli
Average NDCG@5: 0.6903726108736176
Average MRR@5: 0.6620717131474109


In [15]:
# NOTE: the code from here on is only for quick sanity checks and may be deleted.
def calculate_accuracy(file_path):
    """Print the top-1 accuracy recorded in a results JSON file.

    Args:
        file_path: Path to a JSON list of entries, each holding ``'correct_id'`` and
            ``'retrieved_ids'``.

    Raises:
        ValueError: If the file contains no entries.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if not data:
        # Previously this surfaced as an unexplained ZeroDivisionError.
        raise ValueError(f"No evaluation entries found in {file_path}")

    # An entry is correct when its first retrieved id is the expected one. Comparing
    # the one-element slice treats an empty 'retrieved_ids' list as a miss rather
    # than raising IndexError.
    correct_count = sum(
        1 for entry in data if entry["retrieved_ids"][:1] == [entry["correct_id"]]
    )

    accuracy = correct_count / len(data) * 100
    print(f"Accuracy: {accuracy:.2f}%")
    
def calculate_accuracy_and_average_scores(file_path):
    """Print top-1 accuracy and the mean NDCG@5 / MRR@5 of a results JSON file.

    Args:
        file_path: Path to a JSON list of entries, each holding ``'correct_id'`` and
            ``'retrieved_ids'`` and optionally ``'ndcg@5'`` / ``'mrr@5'`` (a missing
            score counts as 0).

    Raises:
        ValueError: If the file contains no entries.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    total_count = len(data)
    if total_count == 0:
        # Previously this surfaced as an unexplained ZeroDivisionError.
        raise ValueError(f"No evaluation entries found in {file_path}")

    # Top-1 hit count; the one-element slice makes an empty 'retrieved_ids' list a
    # miss rather than an IndexError.
    correct_count = sum(
        1 for entry in data if entry["retrieved_ids"][:1] == [entry["correct_id"]]
    )
    ndcg_sum = sum(entry.get("ndcg@5", 0) for entry in data)
    mrr_sum = sum(entry.get("mrr@5", 0) for entry in data)

    accuracy = (correct_count / total_count) * 100
    avg_ndcg = ndcg_sum / total_count
    avg_mrr = mrr_sum / total_count

    print(f"Accuracy: {accuracy:.2f}%")
    print(f"Average NDCG@5: {avg_ndcg:.4f}")
    print(f"Average MRR@5: {avg_mrr:.4f}")


In [16]:

# Re-read each enabled model's saved result file and print accuracy, NDCG@5 and MRR@5.
# The path pattern must stay identical to the one used when the files were written.
for model_name, embed_model in embedding_models2:
    file_path = f'/home/choi/Git/RAG_con_doc/langchain/{model_name}_results.json'
    print(f"Evaluating model: {model_name}")
    calculate_accuracy_and_average_scores(file_path)

Evaluating model: EM_klue_nli
Accuracy: 58.65%
Average NDCG@5: 0.6904
Average MRR@5: 0.6621


In [17]:
# Score the EM_MNRL_MRL result file directly. NOTE(review): that model is commented out
# in embedding_models2, so this file was presumably produced by an earlier run — confirm.
calculate_accuracy_and_average_scores('/home/choi/Git/RAG_con_doc/langchain/EM_MNRL_MRL_results.json')

Accuracy: 59.60%
Average NDCG@5: 0.7454
Average MRR@5: 0.7047
