In [1]:
# Model
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from langchain_huggingface import HuggingFacePipeline
import torch
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import PromptTemplate
# Vector stores
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever, KNNRetriever
from langchain.retrievers import EnsembleRetriever
# etc
import os
import pandas as pd
from tqdm import tqdm
import unicodedata

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

def get_embedding():
    embeddings = HuggingFaceEmbeddings(
        model_name='intfloat/multilingual-e5-small',
        model_kwargs={'device': 'cuda'},
        encode_kwargs={'normalize_embeddings': True})
    return embeddings

def normalize_string(s):
    return unicodedata.normalize('NFC', s)

def format_docs(docs):
    """검색된 문서들을 하나의 문자열로 포맷팅"""
    context = ""
    for i, doc in enumerate(docs):
        #context += f"Document {i+1}\n"
        context += doc.page_content
        context += '\n\n'
    return context

In [None]:

def fewshot_db(df):
    df = df.drop('SAMPLE_ID', axis=1)
    df = df.drop('Source_path', axis=1)
    df = df.to_dict(orient='records')
    print("Loaded Fewshot Set:", df[:1])
    to_vectorize = ["\n\n".join(normalize_string(value) for value in example.values()) for example in df]
    
    faiss = FAISS.from_texts(to_vectorize, embedding=get_embedding())
    bm = BM25Retriever.from_texts(to_vectorize)
    knn = KNNRetriever.from_texts(to_vectorize, embeddings=get_embedding())
    return faiss, bm, knn
    
def make_db(df):
    # Create a new FAISS database
    # pdf reader
    documents = []
    pdf_files = df['Source_path'].unique()
    for pdf_file in pdf_files:
        pdf_loader = PyPDFLoader(pdf_file)
        pdf_documents = pdf_loader.load()
        for pdf_document in pdf_documents:
            pdf_document.page_content = pdf_document.page_content.replace("\x07","")
        documents.extend(pdf_documents)
    # 정규화
    # for doc in documents:
    #     doc.page_content = normalize_string(doc.page_content)
    chunk_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=60)
    chunks = chunk_splitter.split_documents(documents)
    print(f"Total number of chunks: {len(chunks)}")
    # FAISS DB 만들기
    faiss = FAISS.from_documents(chunks, embedding=get_embedding())
    bm =  BM25Retriever.from_documents(chunks)
    knn = KNNRetriever.from_documents(chunks, embeddings=get_embedding())
    return faiss, bm, knn


In [None]:

train_faiss_db, train_bm_retrievier, knn_retriever = make_db(train_df) 
test_faiss_db, test_bm_retrievier, test_knn_retriever = make_db(test_df)


Total number of chunks: 2419


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Total number of chunks: 261


In [None]:

fewshot_faiss_db, fewshot_bm_retrievier, fewshot_knn_retriever = fewshot_db(train_df)


Loaded Fewshot Set: [{'Source': '1-1 2024 주요 재정통계 1권', 'Question': '2024년 중앙정부 재정체계는 어떻게 구성되어 있나요?', 'Answer': '2024년 중앙정부 재정체계는 예산(일반·특별회계)과 기금으로 구분되며, 2024년 기준으로 일반회계 1개, 특별회계 21개, 기금 68개로 구성되어 있습니다.'}]


In [None]:
train_k = 1
train_bm_retrievier.k = train_k
knn_retriever.k = train_k
faiss_retriever = train_faiss_db.as_retriever(search_kwargs={'k':train_k} )
train_ensemble_retriever = EnsembleRetriever(
    retrievers=[train_bm_retrievier,knn_retriever, faiss_retriever], weights=[0.25,0.25, 0.5]
)

test_k = 3
test_bm_retrievier.k = test_k
test_knn_retriever.k = test_k
test_faiss_retriever = test_faiss_db.as_retriever(search_kwargs={'k':test_k} )
test_ensemble_retriever = EnsembleRetriever(
    retrievers=[test_bm_retrievier,test_knn_retriever, test_faiss_retriever], weights=[0.25,0.25, 0.5]
)

fewshot_k = 3
fewshot_bm_retrievier.k = fewshot_k
fewshot_knn_retriever.k = fewshot_k
fewshot_faiss_retriever = fewshot_faiss_db.as_retriever(search_kwargs={'k':fewshot_k} )
fewshot_ensemble_retriever = EnsembleRetriever(
    retrievers=[fewshot_bm_retrievier,fewshot_knn_retriever, fewshot_faiss_retriever], weights=[0.25,0.25, 0.5]
)


In [None]:
def setup_llm_pipeline(model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"):
    # 토크나이저 로드 및 설정
        # 양자화 설정 적용
    bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_use_double_quant=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=torch.bfloat16
    )
    model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config,low_cpu_mem_usage=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    text_generation_pipeline = pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        temperature=0.4,
        do_sample=True,
        top_p = 0.6,
        repetition_penalty=1.1,
        return_full_text=False,
        max_new_tokens=512,
        eos_token_id = terminators,
        pad_token_id = tokenizer.eos_token_id
    )

    llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

    return llm
llm = setup_llm_pipeline()

Loading checkpoint shards: 100%|██████████| 4/4 [00:22<00:00,  5.66s/it]


In [None]:
def extract_answer(response):
    # AI: 로 시작하는 줄을 찾아 그 이후의 텍스트만 추출
    lines = response.split('\n')
    for line in lines:
        line = line.replace('**', '')
        if line.startswith('Answer:'):
            return line.replace('Answer:', '').strip()
        if line.startswith('assistant:'):
            return line.replace('assistant:', '').strip()
    return response.strip()  # AI: 를 찾지 못한 경우 전체 응답을 정리해서 반환

def fewshot_ex(fewshot_retriever, train_retriever, query):
    fewshot_results = fewshot_retriever.invoke(query) #Document(page_content='중소벤처기업부_창업사업화지원\n\n창업사업화지원의 사업목적은 무엇인가?\n\n창업사업화지원의 사업목적은 창업기업의 성장단계별, 초격차 분야별, 글로벌화 지원체계를 구축‧운영하여 혁신 기술창업을 활성화하고 창업기업 성장 및 생존율 제고하는 것이다.')
    fewshot_str = ""
    for result in fewshot_results:
        result = result.page_content.split('\n\n')
        buff_str = "<|start_header_id|>user<|end_header_id|>\n"
        question = result[1]
        buff_str += f"Question\n{question}\n\n"
        if train_retriever is not None:
            buff_str += f"Context\n"
            docs = train_retriever.invoke(question)
            
            print(len(docs),end='|')
            
            buff_str += format_docs(docs)
            buff_str += "<eot_id>"
        buff_str += f"<|start_header_id|>assistant<|end_header_id>\n{result[2]}<|eot_id|>"
        fewshot_str += buff_str
    return fewshot_str

def run (train,test,fewshot,dataset,llm,verbose=False):
    results = []
    for i, row in tqdm(dataset.iterrows()):
        full_template = "<|begin_of_text|>"
        full_template += """<|start_header_id|>system<|end_header_id|>
You are the financial expert who helps me with my financial information Q&As.
You earn 10 points when you answer me and follow the rules and lose 7 points when you don't.
Here are some rules you should follow.
- Please use contexts to answer the question.
- Please your answers should be concise.
- Please answers must be written in Korean.
- Please answer the question in 1-3 sentences.

- Use the three examples below to learn how to follow the rules and reference information in context.<|eot_id|>
"""
        question = row['Question']
        fewshot_str = fewshot_ex(fewshot, train, question)
        full_template += fewshot_str
        full_template += "\n\n"
        contexts = test.invoke(question)
        contexts = format_docs(contexts)
        full_template += """<|start_header_id|>user<|end_header_id|>\nQuestion\n{input}\n\n"""
        full_template += f"""Context\n{contexts}<|eot_id|>"""
        full_template += """<|start_header_id|>assistant<|end_header_id>\n"""
        
        prompt = PromptTemplate.from_template(full_template)
        qa_chain = (
        {
            "input": RunnablePassthrough(),
        }
        | prompt
        | llm
        | StrOutputParser()
        )
        if verbose:
            print("Question: ", question)
        answer = qa_chain.invoke(input=question)
        answer = extract_answer(answer)
        results.append({
            "Question": question,
            "Answer": answer,
            "Source": row['Source']
        })
        if verbose:
            print("Answer: ", results[-1]['Answer'])
    return results

In [None]:
from collections import Counter
def calculate_f1_score(true_sentence, predicted_sentence, sum_mode=True):

    #공백 제거
    true_sentence = ''.join(true_sentence.split())
    predicted_sentence = ''.join(predicted_sentence.split())
    
    true_counter = Counter(true_sentence)
    predicted_counter = Counter(predicted_sentence)

    #문자가 등장한 개수도 고려
    if sum_mode:
        true_positive = sum((true_counter & predicted_counter).values())
        predicted_positive = sum(predicted_counter.values())
        actual_positive = sum(true_counter.values())

    #문자 자체가 있는 것에 focus를 맞춤
    else:
        true_positive = len((true_counter & predicted_counter).values())
        predicted_positive = len(predicted_counter.values())
        actual_positive = len(true_counter.values())

    #f1 score 계산
    precision = true_positive / predicted_positive if predicted_positive > 0 else 0
    recall = true_positive / actual_positive if actual_positive > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1_score

def calculate_average_f1_score(true_sentences, predicted_sentences):
    
    total_precision = 0
    total_recall = 0
    total_f1_score = 0
    
    for true_sentence, predicted_sentence in zip(true_sentences, predicted_sentences):
        precision, recall, f1_score = calculate_f1_score(true_sentence, predicted_sentence)
        total_precision += precision
        total_recall += recall
        total_f1_score += f1_score
    
    avg_precision = total_precision / len(true_sentences)
    avg_recall = total_recall / len(true_sentences)
    avg_f1_score = total_f1_score / len(true_sentences)
    
    return {
        'average_precision': avg_precision,
        'average_recall': avg_recall,
        'average_f1_score': avg_f1_score
    }

In [None]:
from sklearn.model_selection import KFold
k_folds = 4
fold_results = []
kf = KFold(n_splits=k_folds, shuffle=True, random_state=52)
for fold, (train_index, val_index) in enumerate(kf.split(train_df)):
    fold_result = []
    train_set = train_df.iloc[train_index]
    val_set = train_df.iloc[val_index]
    
    
    fewshot_faiss_db, fewshot_bm_retrievier, fewshot_knn_retriever = fewshot_db(train_set)

    fewshot_k = 2
    fewshot_bm_retrievier.k = fewshot_k
    fewshot_knn_retriever.k = fewshot_k
    fewshot_faiss_retriever = fewshot_faiss_db.as_retriever(search_kwargs={'k':fewshot_k} )
    fewshot_ensemble_retriever = EnsembleRetriever(
        retrievers=[fewshot_bm_retrievier,fewshot_knn_retriever, fewshot_faiss_retriever], weights=[0.25,0.25, 0.5]
    )
    pred = run(train_ensemble_retriever, train_ensemble_retriever, fewshot_ensemble_retriever, val_set, llm, verbose=False)
    result = pd.DataFrame()
    result['pred'] = [result['Answer'] for result in pred]
    val_set.index = range(len(val_set))
    result['gt'] = val_set['Answer']
        
    result = calculate_average_f1_score(result['gt'], result['pred'])
    print(result)
    fold_results.append(result)
    break

Loaded Fewshot Set: [{'Source': '1-1 2024 주요 재정통계 1권', 'Question': '2024년 중앙정부 재정체계는 어떻게 구성되어 있나요?', 'Answer': '2024년 중앙정부 재정체계는 예산(일반·특별회계)과 기금으로 구분되며, 2024년 기준으로 일반회계 1개, 특별회계 21개, 기금 68개로 구성되어 있습니다.'}]


0it [00:00, ?it/s]

1|1|2|

1it [00:24, 24.94s/it]

2|2|2|2|

2it [01:07, 35.51s/it]

2|1|2|1|

3it [01:46, 37.09s/it]

1|2|2|2|

4it [02:30, 39.70s/it]

2|2|2|

5it [03:05, 38.06s/it]

2|2|2|2|

6it [03:39, 36.68s/it]

1|2|2|2|

7it [04:24, 39.46s/it]

2|2|2|1|

8it [05:05, 39.84s/it]

2|2|1|1|

9it [05:45, 40.01s/it]

2|2|2|

10it [06:34, 42.62s/it]

2|1|2|

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


2|

11it [07:17, 42.78s/it]

2|1|2|

12it [07:57, 42.04s/it]

2|2|2|2|

13it [08:41, 42.41s/it]

2|2|1|1|

14it [09:12, 39.23s/it]

2|2|2|2|

15it [09:56, 40.57s/it]

2|2|2|2|

In [None]:
from save_module import save


fewshot_faiss_db, fewshot_bm_retrievier, fewshot_knn_retriever = fewshot_db(train_set)
fewshot_k = 2
fewshot_bm_retrievier.k = fewshot_k
fewshot_knn_retriever.k = fewshot_k
fewshot_faiss_retriever = fewshot_faiss_db.as_retriever(search_kwargs={'k':fewshot_k} )
fewshot_ensemble_retriever = EnsembleRetriever(
        retrievers=[fewshot_bm_retrievier,fewshot_knn_retriever, fewshot_faiss_retriever], weights=[0.25,0.25, 0.5]
    )

results = run(train_ensemble_retriever, test_ensemble_retriever, fewshot_ensemble_retriever, test_df, llm, verbose=True)
save(results)

0it [00:00, ?it/s]

Question
2022년 혁신창업사업화자금(융자)의 예산은 얼마인가요?

Question:  2022년 혁신창업사업화자금(융자)의 예산은 얼마인가요?


1it [01:25, 85.76s/it]

Answer:  2022년 혁신창업사업화자금(융자의) 예산은 2,300백만원입니다.
Question
중소벤처기업부의 혁신창업사업화자금(융자) 사업목적은 무엇인가요?

Question:  중소벤처기업부의 혁신창업사업화자금(융자) 사업목적은 무엇인가요?


2it [02:50, 85.16s/it]

Answer:  중소벤처기업부의 혁신창업사업화자금(융자)의 사업목적은 기술력과 사업성이 우수하고 미래 성장 가능성이 높은 중소벤처기업의 창업을 활성화하고 고용 창출을 도모하는 것입니다.
Question
중소벤처기업부의 혁신창업사업화자금(융자) 사업근거는 어떤 법률에 근거하고 있나요?

Question:  중소벤처기업부의 혁신창업사업화자금(융자) 사업근거는 어떤 법률에 근거하고 있나요?


3it [04:09, 82.21s/it]

Answer:  중소기업진흥에 관한 법률 제66조, 제67조, 제74조
Question
2010년에 신규 지원된 혁신창업사업화자금은 무엇인가요?

Question:  2010년에 신규 지원된 혁신창업사업화자금은 무엇인가요?


4it [05:45, 87.58s/it]

Answer:  2010년에 신규 지원된 혁신창업사업화자금은 재창업자금(실패 경영인에 대한 재기지원)을 신규 지원했습니다.
Question
혁신창업사업화자금 중 2020년에 신규 지원된 자금은 무엇인가요?

Question:  혁신창업사업화자금 중 2020년에 신규 지원된 자금은 무엇인가요?


5it [07:07, 85.80s/it]

Answer:  2020년 신규 지원된 혁신창업사업화자금은 2,300,000백만원입니다.
Question
재창업자금이 재도약지원자금으로 이관된 연도는 언제인가요?

Question:  재창업자금이 재도약지원자금으로 이관된 연도는 언제인가요?


: 