In [1]:
import os

import pdfplumber, re, redis, numpy as np
from sentence_transformers import SentenceTransformer
from redis.commands.search.field import TextField, VectorField
from redis.commands.search.index_definition import IndexDefinition, IndexType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ------------------------------------------------------------
# 1️⃣ Redis 연결 + 모델 로드
# ------------------------------------------------------------
# decode_responses=False: replies are returned as raw bytes rather than decoded str.
r = redis.Redis(host="localhost", port=6379, decode_responses=False)
model = SentenceTransformer("jhgan/ko-sroberta-multitask")

RAG_INDEX = "qa2_index"  # name of the new index

# The source PDF can be relocated without editing the notebook by setting the
# RAG_PDF_PATH environment variable; when it is unset the original path is used.
PDF_PATH = os.environ.get(
    "RAG_PDF_PATH",
    "/Users/yoodongseok/Desktop/rag_project/data/2024 관세행정 민원상담 사례집.pdf",
)

In [3]:
# ------------------------------------------------------------
# 2️⃣ 임베딩 함수
# ------------------------------------------------------------
def embed(text: str):
    """Encode ``text`` with the module-level ``model`` and return it as float32 bytes.

    The encoder is called with ``normalize_embeddings=False``; the original
    author's note marks this as compatible with the COSINE distance metric.

    Args:
        text: The string to embed.

    Returns:
        The embedding converted to a ``float32`` NumPy array and serialized
        with ``tobytes()``.
    """
    vector = np.array(
        model.encode(text, normalize_embeddings=False),
        dtype=np.float32,
    )
    return vector.tobytes()

In [4]:
# ------------------------------------------------------------
# 3️⃣ 새 인덱스 초기화
# ------------------------------------------------------------
def init_rag_index():
    """Drop the ``RAG_INDEX`` search index if present, then create it again.

    The drop is issued with ``delete_documents=True``. Afterwards the index is
    created over hashes whose keys start with ``doc2:`` with three fields:

    * ``embedding`` - FLAT / FLOAT32 vector field using the COSINE metric; its
      dimension is measured by encoding a short probe string with ``model``.
    * ``question`` and ``answer`` - text fields.

    Raises:
        Any exception other than ``redis.exceptions.ResponseError`` raised by
        the drop step (for example a connection failure) now propagates
        instead of being silently ignored. Errors from ``create_index``
        propagate as before.
    """
    try:
        r.ft(RAG_INDEX).dropindex(delete_documents=True)
        print(f"🗑️ 기존 {RAG_INDEX} 삭제 완료")
    except redis.exceptions.ResponseError:
        # The server rejected the drop command. On a first run this is the
        # expected reply because the index does not exist yet, so creation
        # simply proceeds. Only this error type is tolerated; anything else
        # indicates a real problem and must not be hidden.
        pass

    # Measure the embedding width from the model itself rather than hard-coding it.
    dim = len(model.encode("차원 확인", normalize_embeddings=False))
    r.ft(RAG_INDEX).create_index(
        fields=[
            VectorField("embedding", "FLAT", {
                "TYPE": "FLOAT32",
                "DIM": dim,
                "DISTANCE_METRIC": "COSINE"
            }),
            TextField("question"),
            TextField("answer")
        ],
        definition=IndexDefinition(prefix=["doc2:"], index_type=IndexType.HASH)
    )
    print(f"✅ {RAG_INDEX} 인덱스 생성 완료")

In [5]:
# ------------------------------------------------------------
# 4️⃣ PDF Q–A 추출 (본문 + 표 포함)
# ------------------------------------------------------------
# A line opens a new question when it starts with "?" or "관세법", or when it
# contains a "?" anywhere (".*\?" is anchored at the start by match()).
_QUESTION_LINE = re.compile(r"^(\?|관세법|.*\?)")


def _page_text_with_tables(page):
    """Return the page text followed by its tables rendered as ``a | b`` rows."""
    text = page.extract_text() or ""
    table_texts = [
        # Empty/None cells become "" so the join never sees a non-string.
        "\n".join(" | ".join(cell if cell else "" for cell in row) for row in table)
        for table in page.extract_tables()
    ]
    if not table_texts:
        return text
    return text + "\n\n[표 데이터]\n" + "\n\n".join(table_texts)


def _append_pair(qa_pairs, question, answer_lines):
    """Append a QA dict unless there is no question or the answer is blank."""
    answer = "\n".join(answer_lines).strip()
    if question and answer:
        qa_pairs.append({"question": question, "answer": answer})


def extract_qa_pairs(pdf_path):
    """Extract question/answer pairs from every page of a PDF.

    For each page the plain text and the text of any tables (appended under a
    ``[표 데이터]`` marker) are scanned line by line. A line matching
    ``_QUESTION_LINE`` starts a new question; the following non-question lines
    are collected as its answer until the next question or the end of the page.

    Parsing state is reset at every page, so lines at the top of a page that
    precede the first question are ignored, and a question whose answer is
    empty or whitespace-only is dropped rather than stored with ``""``.

    Args:
        pdf_path: Path passed to ``pdfplumber.open``.

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts in document order.
    """
    qa_pairs = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            question, answer_lines = None, []
            for raw_line in _page_text_with_tables(page).split("\n"):
                line = raw_line.strip()
                if _QUESTION_LINE.match(line):
                    # Close the previous pair before starting the next question.
                    _append_pair(qa_pairs, question, answer_lines)
                    question, answer_lines = line, []
                elif question:
                    answer_lines.append(line)

            # Flush the last open question of the page.
            _append_pair(qa_pairs, question, answer_lines)
    return qa_pairs


In [6]:
# ------------------------------------------------------------
# 5️⃣ Redis에 저장 (doc2: prefix)
# ------------------------------------------------------------
def save_to_rag_index(pdf_path, batch_size: int = 500):
    """Extract QA pairs from ``pdf_path`` and store them as ``doc2:<i>`` hashes.

    Each hash holds the embedding of the question (``embedding``) together
    with the ``question`` and ``answer`` strings. Writes are queued on a
    non-transactional pipeline and sent every ``batch_size`` documents, so the
    load costs one round trip per batch instead of one per document.

    Args:
        pdf_path: Path of the PDF handed to ``extract_qa_pairs``.
        batch_size: Number of ``HSET`` commands buffered before they are sent.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    qa_list = extract_qa_pairs(pdf_path)
    print(f"📘 PDF에서 {len(qa_list)}개의 QA 추출 완료")

    pipe = r.pipeline(transaction=False)
    pending = 0
    for i, qa in enumerate(qa_list):
        pipe.hset(f"doc2:{i}", mapping={
            "embedding": embed(qa["question"]),
            "question": qa["question"],
            "answer": qa["answer"]
        })
        pending += 1
        if pending >= batch_size:
            pipe.execute()
            pending = 0

    # Send whatever is left over from the last, partially filled batch.
    if pending:
        pipe.execute()

    print(f"💾 Redis에 {len(qa_list)}개의 QA 저장 완료 ({RAG_INDEX})")

In [7]:
# ------------------------------------------------------------
# 6️⃣ 실행
# ------------------------------------------------------------
if __name__ == "__main__":
    # Order matters: init_rag_index() drops the existing index with
    # delete_documents=True before recreating it, so it has to run before
    # save_to_rag_index() writes the new doc2:* hashes.
    init_rag_index()
    save_to_rag_index(PDF_PATH)

✅ qa2_index 인덱스 생성 완료
📘 PDF에서 1000개의 QA 추출 완료
💾 Redis에 1000개의 QA 저장 완료 (qa2_index)
