In [15]:
# API KEY를 환경변수로 관리하기 위한 설정 파일
from dotenv import load_dotenv
from langchain_upstage import UpstageDocumentParseLoader
from langchain_teddynote import logging
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import CharacterTextSplitter
from langchain_upstage import UpstageEmbeddings
import numpy as np
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
import faiss
from langchain.vectorstores.faiss import FAISS
from langchain.schema import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [6]:

# Load API key information (calls load_dotenv from python-dotenv).
load_dotenv()

# Configure LangSmith tracing. https://smith.langchain.com


# Pass the LangSmith project name.
logging.langsmith("CH07-DocumentLoader")


LangSmith 추적을 시작합니다.
[프로젝트명]
CH07-DocumentLoader


In [7]:

# Path to the source PDF.
# Forward slashes are used on purpose: the previous ".\data\..." literal relied on
# invalid escape sequences ("\d", "\중") being passed through unchanged, which emits
# a warning on recent Python versions and only resolves as a path on Windows.
file_path = "./data/중소벤처기업 지원사업 2권_유관기관_250114.pdf"

# Parse the PDF with Upstage Document Parse, producing one text Document per page.
layzer = UpstageDocumentParseLoader(
    file_path,
    split='page',
    output_format='text',
)

docs = layzer.load()

# Preview the first three pages. The loop variable `doc` stays bound afterwards
# and is read by the next cell, so it is intentionally not renamed.
for doc in docs[:3]:
    print(doc)
    

page_content='발 간 등 록 번 호
 11-1421000-100007-10 Ⅱ 유관기관 편' metadata={'page': 1, 'coordinates': [[{'x': 0.115, 'y': 0.0743}, {'x': 0.3158, 'y': 0.0743}, {'x': 0.3158, 'y': 0.1139}, {'x': 0.115, 'y': 0.1139}], [{'x': 0.3778, 'y': 0.5478}, {'x': 0.594, 'y': 0.5478}, {'x': 0.594, 'y': 0.572}, {'x': 0.3778, 'y': 0.572}]]}
page_content='제1편❘유관기관(금융기관) 제1부❘중소벤처기업진흥공단 1 ∙ 정책자금 지원시 공통적으로 적용되는 사항 ·················10
1-1① ∙ 혁신창업사업화자금(융자) ······································20
1-1② ∙ 신성장기반자금(융자) ·············································25
1-1③ ∙ 재도약지원자금 ······················································30
1-1④ ∙ 긴급경영안정자금(융자) ··········································34
1-1⑤ ∙ 신시장진출지원자금(융자) ······································39
1-1⑥ ∙ 밸류체인안정화자금(융자) ·····································42
1-1⑦ ∙ 정책자금 이차보전 지원 ········································45
1-2 ∙ 수출바우처 ································································48
1-3 ∙ 글로벌비즈니스센터(GBC) ·······································

In [None]:
# `doc` is the loop variable left bound by the earlier `for doc in docs[:3]` loop,
# so this displays the text of the last page that loop printed.
doc.page_content

'1-15 ∙ 글로벌창업사관학교 ·················································99\n1-16 ∙ 스타트업 AI 기술인력 양성 ··································103\n1-17 ∙ 해외인력 취업매칭 지원 ·······································107\n1-18 ∙ 레저장비산업개발지원 ···········································110\n1-19 ∙ 중소기업 탄소중립 설비투자 지원 ·······················113\n1-20 ∙ 중소기업 CBAM대응 인프라구축 ························116\n1-21 ∙ 중소기업 기후공시, 공급망 실사대응 기반구축 ···120\n1-22 ∙ 탄소중립 사업화 지원 ···········································123\n1-23 ∙ 중소기업 혁신바우처 사업 ····································126\n1-24 ∙ 구조혁신지원사업 ··················································130\n1-25 ∙ 선제적 자율구조개선 프로그램 ·····························133\n1-26 ∙ 성실경영평가 제도 ················································137\n1-27 ∙ 진로제시 컨설팅 ····················································140\n1-28 ∙ 회생 컨설팅 ···························································142\n1-29 ∙ 내일채움공제 ·························································145\n1-30 ∙ 중소기업 재직자 우대 저축공제 ············

## RecursiveCharacterTextSplitter

In [9]:
# text_splitter = RecursiveCharacterTextSplitter(
#     # Chunk size used for this example.
#     chunk_size=1000,
#     # Number of characters that overlap between consecutive chunks.
#     chunk_overlap=100,
#     # Function used to measure the length of a string.
#     length_function=len,
#     # Whether the separators are interpreted as regular expressions.
#     is_separator_regex=False,
# )


# Create a character-based CharacterTextSplitter with chunk_size=500 and chunk_overlap=50.
# NOTE(review): no separator is passed, so the splitter's default separator applies;
# presumably pages that do not contain it can yield chunks longer than chunk_size — confirm.
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)

In [10]:
# # Split the file text into documents using text_splitter.
# texts = text_splitter.create_documents([doc.page_content for doc in docs])
# print(texts[0])  # Print the first split document.
# print("===" * 20)
# print(texts[1000])  # Print the split document at index 1000.

# Split the loaded page documents into chunks.
split_docs = text_splitter.split_documents(docs)

In [11]:
split_docs

# Embedding client for the document side; uses the Upstage passage model.
doc_embedder = UpstageEmbeddings(model="solar-embedding-1-large-passage")

# Embed every split chunk and index the vectors in a FAISS vector store.
db = FAISS.from_documents(documents=split_docs, embedding=doc_embedder)


In [12]:
# Embedding client for the query side; uses the Upstage query model.
query_embedder = UpstageEmbeddings(model="solar-embedding-1-large-query")

# Turn the question into an embedding vector.
query_vector = query_embedder.embed_query("청년창업에 제일 유리한 제도는 무엇인가요?")

# Look up the 2 stored chunks whose vectors are most similar to the query vector.
db.similarity_search_by_vector(embedding=query_vector, k=2)

[Document(id='80efbf7c-f653-41a4-9e95-ac10f4e58bf3', metadata={'page': 112, 'coordinates': [[{'x': 0.1811, 'y': 0.0974}, {'x': 0.6409, 'y': 0.0974}, {'x': 0.6409, 'y': 0.1186}, {'x': 0.1811, 'y': 0.1186}], [{'x': 0.1815, 'y': 0.1681}, {'x': 0.4156, 'y': 0.1681}, {'x': 0.4156, 'y': 0.196}, {'x': 0.1815, 'y': 0.196}], [{'x': 0.1889, 'y': 0.2997}, {'x': 0.2873, 'y': 0.2997}, {'x': 0.2873, 'y': 0.3212}, {'x': 0.1889, 'y': 0.3212}], [{'x': 0.1721, 'y': 0.3426}, {'x': 0.8269, 'y': 0.3426}, {'x': 0.8269, 'y': 0.3892}, {'x': 0.1721, 'y': 0.3892}], [{'x': 0.1738, 'y': 0.4264}, {'x': 0.4443, 'y': 0.4264}, {'x': 0.4443, 'y': 0.448}, {'x': 0.1738, 'y': 0.448}], [{'x': 0.1759, 'y': 0.4602}, {'x': 0.822, 'y': 0.4602}, {'x': 0.822, 'y': 0.5578}, {'x': 0.1759, 'y': 0.5578}], [{'x': 0.1868, 'y': 0.6035}, {'x': 0.2887, 'y': 0.6035}, {'x': 0.2887, 'y': 0.6259}, {'x': 0.1868, 'y': 0.6259}], [{'x': 0.1712, 'y': 0.6469}, {'x': 0.828, 'y': 0.6469}, {'x': 0.828, 'y': 0.7535}, {'x': 0.1712, 'y': 0.7535}], [{'x

In [13]:
# Expose the FAISS store as a retriever (no search options are overridden).
retriever = db.as_retriever()

# RAG prompt with two input variables: {context} (retrieved text) and {question}.
# The role sentence previously read "propessional finance export"; the typos are
# fixed because they change the meaning of the instruction given to the model.
prompt = PromptTemplate.from_template(
    """You are a professional finance expert. Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Answer in Korean.
    
    #Context:
    {context}
    
    #Question:
    {question}
    
    #Answer:
    """
)

In [16]:
# Create the language model; temperature=0 keeps the sampling as non-random as possible.
llm = ChatOpenAI(model_name='gpt-4o', temperature=0)


def format_docs(retrieved_docs):
    """Join the text of the retrieved documents into one context string.

    Only ``page_content`` is kept. Without this step the prompt receives the
    string form of the whole Document list, including the large ``coordinates``
    metadata that the parser attaches to every page.

    Args:
        retrieved_docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents separated by blank lines ("" for an empty iterable).
    """
    return "\n\n".join(item.page_content for item in retrieved_docs)


# Build the RAG chain: retrieve -> format context -> fill prompt -> LLM -> plain string.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [17]:
# Send one question through the chain and print the string it returns.
question = "광물 수입을 원하는 기업에게 가장 유리한 제도와 수행기관를 설명하고 우대지원을 요약해줘?"
response = chain.invoke(question)
print(response)

광물 수입을 원하는 기업에게 가장 유리한 제도는 "수입보험"입니다. 이 제도는 장기적으로 안정적인 확보가 필요하거나 적기에 수입이 필요한 원유, 원목, 광물, LNG 등 주요 자원과 시설재 및 첨단제품 등의 수입거래를 지원합니다. 

수행기관은 한국무역보험공사이며, 수입자용으로는 국내 기업이 선급금 지급 조건 수입거래에서 비상위험 또는 신용위험으로 인해 선급금을 회수할 수 없게 되는 경우에 발생하는 손실을 보상합니다. 금융기관용/글로벌공급망으로는 금융기관이 수입에 필요한 자금을 수입 기업에 대출(지급보증)한 후에 대출금을 회수할 수 없게 되는 경우에 발생하는 손실을 보상합니다.

우대지원으로는 주요 자원(철, 동, 아연, 석탄, 원유) 및 시설재 등이 포함되며, 품목 해당 여부는 한국무역보험공사 홈페이지를 통해 확인할 수 있습니다.


## HuggingFace

In [22]:
import os
import numpy as np
import pandas as pd
from datasets import Dataset
from huggingface_hub import create_repo

# Extract the text of every split chunk.
texts = [doc.page_content for doc in split_docs]

# Compute one embedding per chunk with the document (passage) embedder.
embeddings_list = doc_embedder.embed_documents(texts)
assert len(embeddings_list) == len(texts), "expected exactly one embedding per chunk"

# Core columns of the exported table.
core_columns = {
    "id":        [f"doc_{i}" for i in range(len(texts))],
    "text":      texts,
    "embedding": embeddings_list,
}

# Collect metadata keys from every chunk (first-seen order) instead of only the
# first chunk, so an empty `split_docs` does not raise and no key is missed.
metadata_keys = list(dict.fromkeys(key for chunk in split_docs for key in chunk.metadata))

# One column per metadata key (e.g. page). A key that would shadow a core column
# is prefixed with "metadata_" rather than silently overwriting that column.
metadata_columns = {
    (f"metadata_{key}" if key in core_columns else key): [
        chunk.metadata.get(key) for chunk in split_docs
    ]
    for key in metadata_keys
}

data = {**core_columns, **metadata_columns}
df = pd.DataFrame(data)

# Convert the DataFrame into a Hugging Face Dataset.
ds = Dataset.from_pandas(df)


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode the access token in the notebook. Read it from the HF_TOKEN
# environment variable (it can live in the same .env file that load_dotenv() reads)
# and fall back to an interactive prompt that does not echo the value.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

# Upload to the Hub.
# push_to_hub writes to a *dataset* repository, so the private repo must be created
# with repo_type="dataset". Without it, create_repo makes a repo of the default type
# and the private flag never applies to the dataset repo that is actually pushed to.
repo_id = "yong05/your-embeddings-repo"
create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True)
ds.push_to_hub(repo_id, commit_message="upload embeddings")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to C:\Users\user\.cache\huggingface\token
Login successful


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/yong05/your-embeddings-repo/commit/104637300b07ccdffd44957eb7c11ab16a92798c', commit_message='upload embeddings', commit_description='', oid='104637300b07ccdffd44957eb7c11ab16a92798c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/yong05/your-embeddings-repo', endpoint='https://huggingface.co', repo_type='dataset', repo_id='yong05/your-embeddings-repo'), pr_revision=None, pr_num=None)

In [21]:
# Preview only the first few chunks; displaying the whole list floods the output.
texts[:3]

['발 간 등 록 번 호\n 11-1421000-100007-10 Ⅱ 유관기관 편',
 '제1편❘유관기관(금융기관) 제1부❘중소벤처기업진흥공단 1 ∙ 정책자금 지원시 공통적으로 적용되는 사항 ·················10\n1-1① ∙ 혁신창업사업화자금(융자) ······································20\n1-1② ∙ 신성장기반자금(융자) ·············································25\n1-1③ ∙ 재도약지원자금 ······················································30\n1-1④ ∙ 긴급경영안정자금(융자) ··········································34\n1-1⑤ ∙ 신시장진출지원자금(융자) ······································39\n1-1⑥ ∙ 밸류체인안정화자금(융자) ·····································42\n1-1⑦ ∙ 정책자금 이차보전 지원 ········································45\n1-2 ∙ 수출바우처 ································································48\n1-3 ∙ 글로벌비즈니스센터(GBC) ·········································55\n1-4 ∙ K-스타트업센터(KSC) ···············································59\n1-5 ∙ 전자상거래수출 시장진출 ··········································63\n1-6 ∙ 온라인수출플랫폼 ·······················································66\n1-7 ∙ 기업인력애로센터 활용 취업지원 ······························69\n