In [None]:
import pandas as pd
from faiss_module import load_and_vectorize,load_chunks_make_docdb
from model import setup_llm_pipeline
from fewshot_module import fewshot_ex
from save_module import save
from seed_module import seed_everything
from utils_module import make_dict, extract_answer, format_docs
from run import run
seed_everything(52)
from sklearn.model_selection import KFold

In [None]:
# Cross-validation driver: split the training records into `k_folds` folds and
# call `run` once per fold, passing that fold's validation subset.
k_folds = 4
# Shuffled split with a fixed random_state so the fold assignment is repeatable.
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
# presumably an indexable sequence of records (it is indexed by integer
# position in the loop below) -- TODO confirm against utils_module.make_dict
train_dict = make_dict('train.csv')
# Used when RAG is also applied to the training data.
train_db = load_chunks_make_docdb('./train_source', './train_faiss_db')
train_retriever = train_db.as_retriever(search_kwargs={'k': 1})

# NOTE(review): this retriever is also built from train_db (k=3 instead of
# k=1), and the name is spelled `test_retriver`; the same spelling is used in
# the run(...) call below -- confirm both are intended.
test_retriver = train_db.as_retriever(search_kwargs={'k': 3})

# NOTE(review): built once from the whole of train.csv, outside the fold loop,
# so it is not restricted to each fold's training split -- confirm intended.
fewshot_db = load_and_vectorize('train.csv', './fewshot_faiss_db')
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# The LLM pipeline is created once and shared by every fold.
llm = setup_llm_pipeline(model_id)
for fold, (train_index, val_index) in enumerate(kf.split(train_dict)):
    print(f"\nFold {fold + 1}/{k_folds}")
    # NOTE(review): train_set is built here but is not passed to run(...).
    train_set = [train_dict[i] for i in train_index]
    val_set = [train_dict[i] for i in val_index]
    # Only the validation subset of the current fold is handed to run.
    run(train_retriever,
        test_retriver,
        fewshot_db, 
        val_set ,
        llm)

        # NOTE(review): this fragment is indented deeper than the statement that
        # precedes it and ends with an `else:` whose opening `if` is not visible
        # here; it looks like a pasted body of another construct -- confirm where
        # it belongs. It also uses tqdm, PromptTemplate, RunnablePassthrough and
        # StrOutputParser, none of which appear in the visible import cell.
        #
        # Split the data for K-fold cross-validation.
        kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
        
        # One inner list of result dicts per fold.
        fold_results = []
        
        for fold, (train_index, val_index) in enumerate(kf.split(train_dict)):
            print(f"\nFold {fold + 1}/{k_folds}")
            
            # Build the training and validation subsets for the current fold.
            train_set = [train_dict[i] for i in train_index]
            val_set = [train_dict[i] for i in val_index]
            
            # Set up the document database for the current fold.
            # NOTE(review): the paths passed here do not depend on the fold.
            train_db = load_chunks_make_docdb('./train_source', './train_faiss_db')
            train_retriever = train_db.as_retriever(search_kwargs={'k': 1})
            
            # Set up the validation database.
            val_db = load_chunks_make_docdb('./val_source', './val_faiss_db')
            val_retriever = val_db.as_retriever(search_kwargs={'k': 5})
            
            # The few-shot store and the LLM pipeline are set up again on every fold.
            fewshot_db = load_and_vectorize('train.csv', './fewshot_faiss_db')
            llm = setup_llm_pipeline(model_id)
            
            # Start a fresh result list for this fold.
            fold_results.append([])
            
            for item in tqdm(val_set):
                fewshot_str = fewshot_ex(fewshot_db, item, train_retriever=train_retriever, fewshot_num=7)
                
                # fewshot_str is spliced in by string concatenation, so it becomes
                # part of the template text parsed by PromptTemplate.from_template;
                # {context} and {input} are the placeholders written in the literal.
                full_template = """system
Today Date: 8 Aug 2024
1,000,000 원= 100 만원
10 백만원 = 10,000,000 원
100 백만원 = 100,000,000 원
You are the financial expert who helps me with my financial information Q&As.
You earn 10 points when you answer me and follow the rules and lose 7 points when you don't.

Here are some rules you should follow.
- Please use contexts to answer the question.
- Please your answers should be concise.
- Please answers must be written in Korean.
- Please answer the question in 1-3 sentences.

Please answer like the example below.
""" + f"{fewshot_str}" + """system
Now Do it for me.
Given the following contexts about Question:
{context}user
{input}
assistant\n\n
"""
                prompt = PromptTemplate.from_template(full_template)
                # Chain: the question is sent to val_retriever | format_docs to
                # fill "context" and passed through unchanged as "input"; the
                # filled prompt goes to the LLM and then to StrOutputParser.
                qa_chain = (
                {
                    "context": val_retriever | format_docs,
                    "input": RunnablePassthrough(),
                }
                | prompt
                | llm
                | StrOutputParser()
                )
                
                answer = qa_chain.invoke(item['Question'])
                # Post-process the raw chain output with extract_answer.
                answer = extract_answer(answer)
                result = {
                    "Question": item['Question'],
                    "Answer": answer,
                    "Source": item['Source']
                }
                fold_results[-1].append(result)
                print(f"Question: {item['Question']}\nAnswer: {result['Answer']}")
            
            # Save the results for the current fold.
            fold_save_path = f"fold_{fold + 1}_results.json"
            save(fold_results[-1], path=fold_save_path)
        
        # Aggregate the results of every fold and compute metrics.
        all_results = [result for fold_result in fold_results for result in fold_result]
        
        # Accuracy is shown as an example; adjust as needed.
        # Define y_true and y_pred to compute the accuracy.
        # accuracy = accuracy_score(y_true, y_pred)  # example
        
        # Save the aggregated results.
        aggregate_save_path = "aggregate_results.json"
        save(all_results, path=aggregate_save_path)
        
        print("K-fold 교차 검증이 완료되었습니다.")
    else:
        # When validation is not performed.
        print("검증을 수행하지 않습니다.")
        # Call the basic run function to execute over the whole dataset.
        # NOTE(review): run is called here with a single model_id keyword,
        # unlike the five positional arguments used earlier in the file --
        # confirm against run's signature.
        run(model_id=model_id)