In [None]:
!pip3 install -q -U torch==2.3.1
!pip3 install -q -U transformers==4.44.0
!pip3 install -q -U datasets==2.18.0
!pip3 install -q -U bitsandbytes==0.42.0
!pip3 install -q -U peft==0.9.0
!pip3 install -q -U accelerate==0.33.0

!pip install langchain
!pip install langchain_community
!pip install langchain-teddynote
!pip install sentence-transformers
!pip install faiss-gpu
!pip install pymupdf
!pip install pymupdf4llm
!pip install pdf2docx
!pip install pdfplumber

!pip install FlagEmbedding

In [None]:
import torch
from datasets import Dataset, load_dataset, DatasetDict
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          pipeline,
                          set_seed,
)
import transformers
from peft import LoraConfig, PeftModel
from accelerate.utils import is_mlu_available

import json
import random
import os
import unicodedata
import pandas as pd
import numpy as np
from tqdm import tqdm
from accelerate import Accelerator
from FlagEmbedding import FlagReranker
import re
import pymupdf
import pymupdf4llm
import pdfplumber
from tabulate import tabulate
from collections import Counter

# Langchain 관련
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter, MarkdownHeaderTextSplitter
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain.schema import Document
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_teddynote.retrievers import KiwiBM25Retriever
from langchain.retrievers import EnsembleRetriever, MultiQueryRetriever
from langchain.document_loaders import PDFPlumberLoader, PyMuPDFLoader, PyPDFLoader, UnstructuredPDFLoader

In [None]:
import os

import huggingface_hub

# Credentials are read from the environment rather than written into the
# notebook, so they cannot leak through the saved file or version control.
# Set HF_TOKEN / LLAMA_CLOUD_API_KEY before starting the kernel.
# NOTE(review): when HF_TOKEN is unset, None is passed and login() is expected
# to fall back to its interactive prompt — confirm for the installed version.
huggingface_hub.login(os.environ.get("HF_TOKEN"))
LLAMA_CLOUD_API_KEY = os.environ.get("LLAMA_CLOUD_API_KEY", "")

In [None]:
# Hugging Face Hub id of the checkpoint; the same generation pipeline is used
# for table summarisation and for answering questions.
model = "sh2orc/Llama-3.1-Korean-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model)

# Build the pipeline through the `transformers` module instead of the bare
# `pipeline` name: this cell rebinds `pipeline` to the resulting object, so a
# plain `pipeline(...)` call here would hit that object (not the factory) as
# soon as the cell is executed a second time.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

In [None]:
def extract_table_summarys(load_path):
    """Summarise the tables of a PDF in natural language and chunk the result.

    Every page is passed through pdfplumber's ``page.extract_table()``; pages
    for which it returns ``None`` are skipped. The extracted table is embedded
    in a Korean instruction prompt, formatted with the module-level
    ``tokenizer``'s chat template and sent to the module-level ``pipeline``.

    Args:
        load_path: Path of the PDF file to read.

    Returns:
        list[str]: The generated summaries, joined with newlines and split by
        ``RecursiveCharacterTextSplitter`` (chunk_size=512, chunk_overlap=32).
        Empty when no page yields a table.

    NOTE(review): ``extract_table`` (singular) is used here while
    ``extract_tables_from_pdf`` uses ``extract_tables`` — confirm that one
    table per page is the intended behaviour.
    """
    # Loop-invariant pieces are built once instead of once per page. The
    # template is spelled out line by line so its whitespace (including the
    # indentation in front of the placeholder) does not depend on how this
    # function is indented.
    template = (
        "다음 리스트들은 2차원 표 형태의 데이터입니다. 테이블 형태의 정보를 자연어로 표현해주세요. 누락되는 정보가 없도록 답변하세요.\n"
        "\n"
        "          {}"
    )
    # Generation stops on either the tokenizer's EOS token or "<|eot_id|>".
    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=512,
        chunk_overlap=32,
    )

    summaries = []
    with pdfplumber.open(load_path) as pdf:
        for page in pdf.pages:
            table = page.extract_table()
            print(page)
            if table is None:  # identity check; `== None` can be overridden
                continue

            content = template.format(str(table))
            messages = [{"role": "user", "content": content}]
            prompt = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )

            print(f"Question: {content}")
            outputs = pipeline(
                prompt,
                max_new_tokens=500,
                do_sample=True,
                temperature=0.2,
                top_k=3,
                top_p=0.95,
                eos_token_id=terminators,
            )
            # Drop the first len(prompt) characters so only the text generated
            # after the prompt is kept.
            summary = outputs[0]["generated_text"][len(prompt):]
            print(summary)
            summaries.append(summary)

    # Separate the per-table summaries with a newline so the end of one
    # summary does not run straight into the start of the next.
    return splitter.split_text("\n".join(summaries))


In [None]:
def refine_text(text):
    """Clean extracted text by applying a fixed, ordered list of regex rewrites.

    Each rule is a ``(pattern, replacement)`` pair fed to ``re.sub``. The rules
    run top to bottom, so later rules see the output of earlier ones.

    Args:
        text: Raw text to clean.

    Returns:
        str: The text after all substitutions.
    """
    rewrite_rules = (
        # Runs of five or more hyphens become a line break.
        (r"-{5,}", "\n"),
        # Delete leftover hyphens, asterisks, control characters and bullet marks.
        (r"[-*\x07\u20E7\u20E9\u0003\u00B7]", ""),
        (r'[\u2027\u25CB\u3139]', ''),
        (r"\u318d", ""),
        (r"\u25A1", ""),
        # Two or more consecutive pipes are removed; a single pipe is kept.
        (r"\u007C{2,}", ""),
        (r"\u25FE", " "),
        (r"\u3147", " "),
        # Lenticular brackets and curly single quotes map to ASCII.
        (r"[\u3010]", "["),
        (r"[\u3011]", "]"),
        (r"\u2018", "'"),
        (r"\u2019", "'"),
        # Tabs and runs of three or more spaces become line breaks.
        (r"\t", "\n"),
        (r" {3,}", "\n"),
        # Trim spaces around line breaks, then collapse a doubled line break.
        (r"\n +", "\n"),
        (r" \n ", "\n"),
        (r"\n\n", "\n"),
        # A single character alone on its own line is merged into its neighbours.
        (r'\n(.{1})\n', r'\1'),
    )

    cleaned = text
    for pattern, replacement in rewrite_rules:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [None]:
def extract_tables_from_pdf(pdf_path):
    """Collect every table pdfplumber extracts from a PDF, page by page.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        list[dict]: One entry per extracted table with the keys ``'page'``
        (1-based page number) and ``'table'`` (the table after
        ``format_table``).
    """
    # The docstring and body now share one indentation level; the previous
    # mix of 2 and 4 spaces made the definition fail with IndentationError.
    tables = []

    with pdfplumber.open(pdf_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            for table in page.extract_tables():
                tables.append({
                    'page': page_num,
                    'table': format_table(table)
                })

    return tables

def format_table(table):
    """Return a copy of ``table`` with every cell cleaned by ``refine_text``.

    Args:
        table: Iterable of rows, each an iterable of cells. A cell that is
            itself a list is treated as a nested table and formatted
            recursively.

    Returns:
        list[list]: Rows of cleaned cell strings (or nested formatted tables).

    NOTE(review): cells are converted with ``str(cell)``, so a ``None`` cell
    becomes the text "None" — confirm whether empty cells should map to "".
    """
    formatted = []
    for row in table:
        formatted_row = []
        for cell in row:
            if isinstance(cell, list):
                # Nested table: recurse with the single argument the function
                # accepts (the previous two-argument call raised TypeError).
                formatted_row.append(format_table(cell))
            else:
                formatted_row.append(refine_text(str(cell)))
        formatted.append(formatted_row)
    return formatted

In [None]:
def remove_tables(load_path, save_path):
    """Wrap every detected table of a PDF in a redaction annotation and save a copy.

    Args:
        load_path: Path of the source PDF.
        save_path: Path the annotated copy is written to.

    NOTE(review): only ``add_redact_annot`` is called; the redactions are never
    applied (``page.apply_redactions(...)`` was commented out in the original),
    so the table text presumably remains in the saved file. Confirm whether
    applying the redactions is intended before relying on this to strip tables.
    """
    # The docstring and body now share one indentation level (the previous
    # 2/4-space mix raised IndentationError), and the document is opened as a
    # context manager so it is closed even if find_tables or save fails.
    with pymupdf.open(load_path) as doc:
        for page in doc:
            for tab in page.find_tables():
                # Mark the table's bounding box for redaction.
                page.add_redact_annot(tab.bbox)
        doc.save(save_path)
    # do text searches and text extractions here

In [None]:
def get_text_refined(load_path):
    """Convert the PDF at ``load_path`` to Markdown and clean it with ``refine_text``.

    Args:
        load_path: Path handed to ``pymupdf4llm.to_markdown``.

    Returns:
        The Markdown text after ``refine_text`` has been applied.
    """
    return refine_text(pymupdf4llm.to_markdown(load_path))

In [None]:
def get_preprocessed_text(load_path, save_path):
    """Produce the cleaned body text and the table summaries for one PDF.

    Steps, in order: summarise the tables of ``load_path``, write a
    table-annotated copy to ``save_path`` via ``remove_tables``, then extract
    and clean the text of that copy.

    Args:
        load_path: Path of the original PDF.
        save_path: Path where the intermediate PDF is written and re-read.

    Returns:
        tuple: ``(text, table_summaries)``.
    """
    summaries = extract_table_summarys(load_path)
    remove_tables(load_path, save_path)
    return get_text_refined(save_path), summaries

In [None]:
def process_pdf(file_path, chunk_size=512, chunk_overlap=32):
    """Turn one PDF into a list of ``Document`` chunks (body text plus table summaries).

    The body text is first split on Markdown headers (levels 1-3, headers kept
    in the text) and then re-split by size. Each table summary is appended as
    its own ``Document``. Every chunk is printed, followed by a line of 150
    asterisks. The intermediate PDF is written to ``check.pdf``.

    Args:
        file_path: Path of the PDF to process.
        chunk_size: ``chunk_size`` for ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` for ``RecursiveCharacterTextSplitter``.

    Returns:
        list: The body chunks followed by the table-summary documents.
    """
    body_text, table_summaries = get_preprocessed_text(file_path, "check.pdf")

    # Stage 1: split on Markdown headers.
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ],
        strip_headers=False,
    )
    header_sections = header_splitter.split_text(body_text)

    # Stage 2: enforce the size limit within each header section.
    size_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = size_splitter.split_documents(header_sections)

    # Table summaries are appended after the body chunks.
    chunks.extend(Document(page_content=summary) for summary in table_summaries)

    divider = "*" * 150
    for chunk in chunks:
        print(chunk.page_content)
        print(divider)
    return chunks


def create_vector_db(chunks, embeddings):
    """Build and return a FAISS vector store from ``chunks``.

    Args:
        chunks: Documents passed to ``FAISS.from_documents``.
        embeddings: Embedding object passed as the ``embedding`` argument.
    """
    return FAISS.from_documents(chunks, embedding=embeddings)

def normalize_path(path):
    """Return ``path`` converted to Unicode normalization form NFC."""
    normalization_form = 'NFC'
    return unicodedata.normalize(normalization_form, path)


def process_pdfs_from_dataframe(df, base_directory, model_path="intfloat/multilingual-e5-large"):
    """Build a FAISS store and a hybrid retriever for every distinct PDF in ``df``.

    Args:
        df: DataFrame with a ``Source_path`` column holding the PDF paths.
        base_directory: Directory that relative ``Source_path`` values are
            resolved against; absolute paths are used unchanged.
        model_path: Model name handed to ``HuggingFaceEmbeddings``.

    Returns:
        dict: Maps each PDF's file name without extension to
        ``{'db': <FAISS store>, 'retriever': <EnsembleRetriever>}``.
    """
    # One embedding model is shared by all PDFs. It runs on CPU and returns
    # normalised vectors.
    embeddings = HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True},
    )

    pdf_databases = {}
    for path in tqdm(df['Source_path'].unique(), desc="Processing PDFs"):
        normalized_path = normalize_path(path)
        if os.path.isabs(normalized_path):
            full_path = normalized_path
        else:
            # normpath already collapses a leading "./". The previous
            # lstrip('./') removed *any* leading run of '.' and '/', which
            # turned "../x.pdf" into "x.pdf" and ".hidden.pdf" into
            # "hidden.pdf".
            full_path = os.path.normpath(os.path.join(base_directory, normalized_path))

        pdf_title = os.path.splitext(os.path.basename(full_path))[0]
        print(f"Processing {pdf_title}...")

        chunks = process_pdf(full_path)
        db = create_vector_db(chunks, embeddings)

        # Dense retriever. The search type is a vector-store retriever option,
        # so "mmr" is set here; EnsembleRetriever declares no such field and
        # the value had no effect when it was passed there.
        base_retriever = db.as_retriever(search_type="mmr")

        # Sparse (BM25) retriever over the same chunks.
        kiwi_bm25_retriever = KiwiBM25Retriever.from_documents(chunks)

        # Hybrid retriever weighting sparse and dense results equally.
        retriever = EnsembleRetriever(
            retrievers=[kiwi_bm25_retriever, base_retriever],
            weights=[0.5, 0.5],
        )

        pdf_databases[pdf_title] = {
            'db': db,
            'retriever': retriever
        }
    return pdf_databases


In [None]:
# Directory that relative `Source_path` values are resolved against.
base_directory = './data' # Your Base Directory
# Question table; the `Source`, `Source_path` and `Question` columns are read later on.
df = pd.read_csv('./test.csv')
# Per-PDF vector store and retriever, keyed by PDF file name without extension.
pdf_databases = process_pdfs_from_dataframe(df, base_directory)

In [None]:
# langchain 을 이용한 추론 

In [None]:
# Release cached GPU memory before the question-answering loop starts.
torch.cuda.empty_cache()

In [None]:
def find_top_n_indices(numbers, n):
    """Return the positions of the ``n`` largest values, largest first.

    Equal values keep their original relative order because the sort is
    stable. If ``n`` exceeds the number of values, all positions are returned.

    Args:
        numbers: Iterable of mutually comparable values.
        n: Number of positions to keep (applied as a slice ``[:n]``).

    Returns:
        list[int]: Indices into ``numbers``.
    """
    values = list(numbers)
    ranked_positions = sorted(range(len(values)), key=values.__getitem__, reverse=True)
    return ranked_positions[:n]

def normalize_string(s):
    """Return ``s`` converted to Unicode normalization form NFC."""
    composed = unicodedata.normalize('NFC', s)
    return composed

# Reranker instance created once at module level and reused by format_docs.
flagreranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)

def format_docs(question, docs, top_n=5):
    """Rerank retrieved documents against the question and join the best ones.

    Args:
        question: Query text scored against each document.
        docs: Sequence of retrieved documents exposing ``page_content``.
        top_n: Maximum number of documents to keep. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        str: ``page_content`` of the selected documents, highest score first,
        each followed by a newline. An empty string when ``docs`` is empty.
    """
    # Score the document text itself. str(doc) renders the whole Document
    # object rather than only its text, which is not what should be compared
    # with the question.
    scores = [
        flagreranker.compute_score([question, doc.page_content], normalize=True)
        for doc in docs
    ]

    # Rank once, after all scores are known. Previously the ranking ran inside
    # the scoring loop and was never assigned when `docs` was empty, so the
    # following loop raised NameError.
    keep = max(0, min(top_n, len(docs)))
    best_indices = find_top_n_indices(scores, keep)

    return "".join(docs[index].page_content + "\n" for index in best_indices)



# Answers are collected here, one dict per question, in DataFrame row order.
results = []

# Everything below is identical for every question, so it is built once
# instead of once per row.

# Prompt template: first slot is the retrieved context, second the question.
template = """정보의 기준 연도에 유의하여 답변하세요.
숫자 단위에 유의하여 답변하세요.
정확한 정보에 기반하여 답변하세요.
다음 정보를 바탕으로 질문에 답하세요:
{}

### 질문:
{}

질문의 핵심만 파악하여 간결하게 1-2문장으로 답변하고, 불필요한 설명은 피하며 요구된 정보만 제공하세요.

### 답변:
"""

# Look-up table from NFC-normalised PDF title to its {'db', 'retriever'} entry.
normalized_keys = {normalize_string(k): v for k, v in pdf_databases.items()}

# Generation stops on either the tokenizer's EOS token or "<|eot_id|>".
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

for _, row in tqdm(df.iterrows(), total=len(df), desc="Answering Questions"):
    # Normalise the source name so it matches the normalised dictionary keys.
    source = normalize_string(row['Source'])
    question = row['Question']
    retriever = normalized_keys[source]['retriever']

    # The retrieve -> rerank -> prompt -> generate steps are run by hand
    # rather than through a LangChain chain so that the reranker can be used.
    docs = retriever.invoke(question)
    context = format_docs(question, docs)
    content = template.format(context, question)
    messages = [{"role": "user", "content": content}]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    print(f"Question: {question}")
    outputs = pipeline(
        prompt,
        max_new_tokens=400,
        do_sample=True,
        temperature=0.01,
        top_k=4,
        top_p=0.95,
        eos_token_id=terminators,
    )
    # Drop the first len(prompt) characters so only the generated answer remains.
    full_response = outputs[0]["generated_text"][len(prompt):]
    print(f"Answer: {full_response}\n")

    results.append({
        "Source": row['Source'],
        "Source_path": row['Source_path'],
        "Question": question,
        "Answer": full_response
    })

In [None]:
# Load the sample submission file that defines the expected rows and columns.
submit_df = pd.read_csv("./sample_submission.csv")

# Attach the generated answers; `results` holds one entry per row, in order.
submit_df['Answer'] = [item['Answer'] for item in results]

# Blank answers can break scoring. The answers are strings, so fillna() alone
# never fires; an empty string is written as an empty CSV field, which reads
# back as NaN. Missing, empty and whitespace-only answers are therefore all
# replaced with the placeholder.
answers = submit_df['Answer']
is_blank = answers.isna() | (answers.astype(str).str.strip() == "")
submit_df.loc[is_blank, 'Answer'] = "데이콘"

# Save the submission as CSV.
submit_df.to_csv("/content/drive/MyDrive/Dacon/Dacon_Financial_NLP/llama3.1-ko5.csv", encoding='UTF-8-sig', index=False)

In [None]:
# Free memory. Run the garbage collector first so that unreachable objects are
# destroyed, then release the cached CUDA blocks; in the opposite order, memory
# freed by the collector would stay in the cache until a later empty_cache().
import gc
gc.collect()
torch.cuda.empty_cache()