In [11]:
import os, json, pandas as pd
from pathlib import Path
from typing import List, Dict, Any

from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.embeddings import BaseEmbedding

import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
from pydantic import PrivateAttr
from IPython.display import display
from tqdm.notebook import tqdm

# Local Hugging Face embedding class (subclasses BaseEmbedding, keeps the model in a PrivateAttr)
class LocalHFEmbedding(BaseEmbedding):
    """Embedding model backed by a locally loaded SentenceTransformer.

    Every hook returns plain Python lists obtained from
    ``SentenceTransformer.encode(..., convert_to_numpy=True).tolist()``.
    """

    # Assigned in __init__; declared as a pydantic private attribute.
    _model: SentenceTransformer = PrivateAttr()

    def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2", **kwargs):
        """Load the SentenceTransformer identified by *model_name*.

        Args:
            model_name: Name handed to ``SentenceTransformer``.
            **kwargs: Forwarded unchanged to ``BaseEmbedding.__init__``.
        """
        super().__init__(**kwargs)
        self._model = SentenceTransformer(model_name)

    # Synchronous methods
    def _get_query_embedding(self, query: str) -> list[float]:
        """Return the embedding of a single query string."""
        return self._model.encode(query, convert_to_numpy=True).tolist()

    def _get_text_embedding(self, text: str) -> list[float]:
        """Return the embedding of a single text string."""
        return self._model.encode(text, convert_to_numpy=True).tolist()

    def _get_text_embeddings(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding per entry of *texts*, in the same order.

        The whole list goes through a single ``encode`` call so the model can
        batch the inputs instead of being invoked once per text.
        """
        if not texts:
            # Nothing to encode; skip the model call entirely.
            return []
        return self._model.encode(texts, convert_to_numpy=True).tolist()

    # Asynchronous methods: each one simply calls its synchronous counterpart
    # directly, so the encoding runs inside the calling coroutine.
    async def _aget_query_embedding(self, query: str) -> list[float]:
        """Async variant of ``_get_query_embedding``."""
        return self._get_query_embedding(query)

    async def _aget_text_embedding(self, text: str) -> list[float]:
        """Async variant of ``_get_text_embedding``."""
        return self._get_text_embedding(text)

    async def _aget_text_embeddings(self, texts: list[str]) -> list[list[float]]:
        """Async variant of ``_get_text_embeddings``."""
        return self._get_text_embeddings(texts)

# Global settings (use the local embedding model)
Settings.embed_model = LocalHFEmbedding()
splitter = SentenceSplitter(chunk_size=1200, chunk_overlap=200)

# Data paths
DATA_DIR = Path("./data")
DATA_DIR.mkdir(exist_ok=True)

JSONL_PATH = DATA_DIR / "JaGovFaqs-22k.jsonl"
PDF_PATH   = Path("~/Desktop/00zentai.pdf").expanduser()  # check where the PDF is actually saved

# Report up front whether each input file is present.
print("JSONL exists?:", JSONL_PATH.exists(), "| PDF exists?:", PDF_PATH.exists())

# Load the JSONL file
def load_jsonl_as_documents(jsonl_path: Path) -> "list[Document]":
    """Read a JSON Lines file and wrap each record in a ``Document``.

    For every record the question is taken from the ``question`` or ``Q``
    key and the answer from ``answer``, ``A`` or ``answer_text``; the first
    truthy value wins and an empty string is used when none is present.
    The document text is a ``Q: <question>`` line followed by an
    ``A: <answer>`` line.

    Args:
        jsonl_path: Path of the JSON Lines file to read (UTF-8).

    Returns:
        One ``Document`` per non-blank line, in file order. An empty list
        when *jsonl_path* does not exist.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not jsonl_path.exists():
        return []

    docs = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # are not records; json.loads would raise on them.
            if not line.strip():
                continue
            obj = json.loads(line)
            q = obj.get("question") or obj.get("Q") or ""
            a = obj.get("answer") or obj.get("A") or obj.get("answer_text") or ""
            docs.append(Document(text=f"Q: {q}\nA: {a}"))
    return docs

jsonl_docs = load_jsonl_as_documents(JSONL_PATH)

# Load the PDF (PyMuPDF, with a progress bar)
pdf_docs = []
if PDF_PATH.exists():
    # Open the PDF as a context manager so its file handle is released once
    # extraction is finished, even if a page fails to parse.
    with fitz.open(PDF_PATH) as doc:
        for page_no, page in enumerate(tqdm(doc, desc="Loading PDF"), start=1):
            text = page.get_text("text")
            if not text.strip():
                # Skip pages that yield no extractable text.
                continue
            # Split the page text and re-wrap every chunk as its own Document
            # tagged with the 1-based page number it came from.
            for node in splitter.get_nodes_from_documents([Document(text=text)]):
                pdf_docs.append(Document(
                    text=node.get_content(),
                    metadata={"source": f"soumu_whitepaper_r06_page_{page_no}"},
                ))
        # len(doc) must be read while the document is still open.
        print("Total pages:", len(doc), "| Extracted chunks:", len(pdf_docs))
else:
    print("PDF file not found:", PDF_PATH)

print("Loaded docs -> JSONL:", len(jsonl_docs), "| PDF:", len(pdf_docs))

# Build the index over the JSONL records followed by the PDF chunks
all_docs = [*jsonl_docs, *pdf_docs]
if not all_docs:
    raise RuntimeError("No documents to index. Please check data files.")

# NOTE(review): `splitter` is not passed to from_documents, so chunking at
# index time presumably falls back to the library defaults - confirm intended.
index = VectorStoreIndex.from_documents(all_docs)

# Retriever that returns the 3 most similar nodes per query.
retriever = VectorIndexRetriever(index=index, similarity_top_k=3)

# Query helper (returns the retrieval results)
def ask(question: str, *, preview_chars: int = 200) -> Dict[str, Any]:
    """Retrieve nodes for *question* and return truncated previews of them.

    Args:
        question: Query passed to the module-level ``retriever``.
        preview_chars: Number of leading characters kept from each retrieved
            node's content. Defaults to 200, the previously hard-coded
            preview length.

    Returns:
        A dict with ``"question"`` (the input string) and ``"retrieved"``
        (a list of preview strings, one per retrieved node, in the order
        the retriever returned them).
    """
    nodes = retriever.retrieve(question)
    # Only a preview of each node is kept.
    previews = [n.node.get_content()[:preview_chars] for n in nodes]
    return {"question": question, "retrieved": previews}

# Run the five assignment questions
questions = [
    "Summarize the overview of the My Number system.",
    "What are the key points of the Digital Garden City Nation concept in the 2024 White Paper?",
    "Briefly explain the security measures for the My Number card.",
    "If there are KPIs or progress related to local government DX, please cite them.",
    "Summarize the status of the spread of online application procedures based on the White Paper.",
]

results = list(map(ask, questions))

# Save the results (JSON + CSV), then show them
json_path = DATA_DIR / "io_examples_for_pr.json"
csv_path  = DATA_DIR / "io_examples_for_pr.csv"

# JSON keeps non-ASCII text readable (ensure_ascii=False).
json_path.write_text(
    json.dumps(results, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

# One DataFrame serves both the CSV export and the notebook display.
df = pd.DataFrame(results)
df.to_csv(csv_path, index=False, encoding="utf-8-sig")

print("Done! Results saved.")
print("JSON path:", json_path)
print("CSV path:", csv_path)

display(df)

JSONL exists?: True | PDF exists?: True


Loading PDF:   0%|          | 0/320 [00:00<?, ?it/s]

Total pages: 320 | Extracted chunks: 550
Loaded docs -> JSONL: 22794 | PDF: 550
Done! Results saved.
JSON path: data/io_examples_for_pr.json
CSV path: data/io_examples_for_pr.csv


Unnamed: 0,question,retrieved
0,Summarize the overview of the My Number system.,"[804\n15,109\n15,252\n15,273\n15,759\n16,123\n..."
1,What are the key points of the Digital Garden ...,[また、5GをはじめとするICTインフラ整備支援策と5G利活用促進\n策を一体的かつ効果的に...
2,Briefly explain the security measures for the ...,[soumu.go.jp/main_sosiki/kenkyu/cybersecurity_...
3,If there are KPIs or progress related to local...,[年度情報通信メディアの利用時間と情報行動に関する調査」\n総務省「令和5 年度 テレワーク...
4,Summarize the status of the spread of online a...,"[＊17 European Commission, “The Digital Service..."
