In [1]:
# API 키를 환경변수로 관리하기 위한 설정 파일
from dotenv import load_dotenv
import os

from typing_extensions import override

# Load API-key settings from the .env file; override=True is passed so that
# python-dotenv lets values from the file replace already-set variables.
load_dotenv(override=True)

# Echo the Google Cloud settings that were picked up, as a quick sanity check.
for _label, _env_name in (
    ("🔍 PROJECT:", "GOOGLE_CLOUD_PROJECT"),
    ("🔍 CREDENTIAL FILE:", "GOOGLE_APPLICATION_CREDENTIALS"),
):
    print(_label, os.environ.get(_env_name))
# NOTE(review): the original comment here announced an added vertexai
# initialization (forcing the project), but no such code exists in this cell.

🔍 PROJECT: knu-ema
🔍 CREDENTIAL FILE: C:/Users/SAMSUNG/AppData/Roaming/gcloud/knu-ema-af3cd6fa4532.json


In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader

# 1. Point a PDF loader at the textbook file (path is relative to the working directory).
loader = PyPDFLoader(
    "James Stewart - Calculus, Early Transcendentals, International Metric Edition-CENGAGE Learning (2016).pdf"
)

# 2. Configure the splitter: chunks of size 1000 with an overlap of 150
#    between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# 3. Read the PDF into documents, then cut those documents into chunks.
docs = loader.load()
split_docs = text_splitter.split_documents(docs)


In [12]:
# 4. 배치 임베딩용 래퍼 클래스 정의
import time
from typing import List, Sequence
from langchain_google_genai import GoogleGenerativeAIEmbeddings

class BatchedEmbeddings:
    """Wrapper that sends documents to an embedding client in fixed-size batches.

    Between consecutive batches the wrapper pauses for ``delay_seconds`` so a
    large corpus can be embedded without firing all requests back to back.
    Query embedding is delegated to the wrapped client unchanged.

    The class is duck-typed: it does not inherit from LangChain's
    ``Embeddings`` base class. The FAISS vector store logs a warning in that
    case saying it expects an ``Embeddings`` object and is accepting a
    function instead, so instances are made callable (see ``__call__``) to
    keep that function-style usage working.
    NOTE(review): subclassing LangChain's ``Embeddings`` base class would also
    silence that warning — confirm the import path for the installed version.

    Args:
        base: The embedding client to delegate to. It must provide
            ``embed_documents(list_of_str)`` and ``embed_query(str)``.
        batch_size: Maximum number of texts sent per ``embed_documents`` call.
        delay_seconds: Pause between consecutive batches, in seconds. Use 0 to
            disable the pause (and its progress message).

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 or ``delay_seconds``
            is negative.
    """

    def __init__(
        self,
        base: "GoogleGenerativeAIEmbeddings",
        batch_size: int = 16,
        delay_seconds: float = 32,
    ):
        # A zero step would crash range() later and a negative step would
        # silently produce no embeddings at all, so reject both up front.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
        if delay_seconds < 0:
            raise ValueError(f"delay_seconds must be non-negative, got {delay_seconds!r}")
        self.base = base
        self.batch_size = batch_size
        self.delay_seconds = delay_seconds

    def embed_documents(self, texts: Sequence[str]) -> List[List[float]]:
        """Embed ``texts`` batch by batch and return one vector per text, in order.

        Args:
            texts: The texts to embed. Any iterable of strings is accepted.

        Returns:
            The concatenated embeddings returned by the wrapped client; an
            empty list when ``texts`` is empty.
        """
        # Materialise once so len()/slicing work for any iterable and the
        # wrapped client always receives a plain list.
        items = list(texts)
        total = len(items)
        all_embeddings: List[List[float]] = []
        for i in range(0, total, self.batch_size):
            batch = items[i : i + self.batch_size]
            all_embeddings.extend(self.base.embed_documents(batch))
            # Pause only when another batch follows; never after the last one.
            if i + self.batch_size < total and self.delay_seconds > 0:
                print("sleep ", i)
                time.sleep(self.delay_seconds)
        return all_embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string; no batching is needed, so delegate directly."""
        return self.base.embed_query(text)

    def __call__(self, text: str) -> List[float]:
        """Allow the instance to be used as a plain ``text -> vector`` function."""
        return self.embed_query(text)

# 5. Create the Gemini embedding client and wrap it so documents are sent in batches.
base_embeddings = GoogleGenerativeAIEmbeddings(model="models/gemini-embedding-exp-03-07")
embeddings = BatchedEmbeddings(base=base_embeddings, batch_size=16)  # adjust batch_size as needed

# 6. Build the FAISS vector store; the chunks are embedded through the batching
#    wrapper, which is where the "sleep" progress lines in the output come from.
vectorstore = FAISS.from_documents(documents=split_docs, embedding=embeddings)

# 7. Expose the store as a retriever, then report the chunk count and a preview
#    of the first 300 characters of the first chunk.
calculus_retriever = vectorstore.as_retriever()
print(f"총 청크 수: {len(split_docs)}")
print(f"첫 청크 미리보기:\n{split_docs[0].page_content[:300]}")

sleep  0
sleep  16
sleep  32
sleep  48
sleep  64
sleep  80
sleep  96
sleep  112
sleep  128
sleep  144
sleep  160
sleep  176
sleep  192
sleep  208
sleep  224
sleep  240
sleep  256
sleep  272
sleep  288
sleep  304
sleep  320
sleep  336
sleep  352
sleep  368
sleep  384
sleep  400
sleep  416
sleep  432
sleep  448
sleep  464
sleep  480
sleep  496
sleep  512
sleep  528
sleep  544
sleep  560
sleep  576
sleep  592
sleep  608
sleep  624
sleep  640
sleep  656
sleep  672
sleep  688
sleep  704
sleep  720
sleep  736
sleep  752
sleep  768
sleep  784
sleep  800
sleep  816
sleep  832
sleep  848
sleep  864
sleep  880
sleep  896
sleep  912
sleep  928
sleep  944
sleep  960
sleep  976
sleep  992
sleep  1008
sleep  1024
sleep  1040
sleep  1056
sleep  1072
sleep  1088
sleep  1104
sleep  1120
sleep  1136
sleep  1152
sleep  1168
sleep  1184
sleep  1200
sleep  1216
sleep  1232
sleep  1248
sleep  1264
sleep  1280
sleep  1296
sleep  1312
sleep  1328
sleep  1344
sleep  1360
sleep  1376
sleep  1392
sleep  1408
sle

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


총 청크 수: 5028
첫 청크 미리보기:
calculus
Early Transc EndEnTals
Eigh Th EdiTion
mETric v Ersion
Jam Es sTE war T
McMaster University  
and  
University of toronto
Australia • Brazil • Mexico • Singapore • United Kingdom • United States
Copyright 2016 Cengage Learning. All Rights Reserved. May not be copied, scanned, or duplicated,


In [14]:
# Persist the FAISS index to the local "vectorstore" folder so the embeddings
# do not have to be recomputed in a later session.
vectorstore.save_local(folder_path="vectorstore")