# Embedding API

## 1) Data Preprocessing

In [6]:
from langchain_community import embeddings
from langchain_ollama import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts  import ChatPromptTemplate

In [7]:
model_local = ChatOllama(model='phi4')

In [8]:
import json
from pprint import pprint

In [4]:
file1 = "./data/20250201/parsed_peoples_20250201.json"

with open(file1, "r", encoding="utf-8") as file:
    data = json.load(file)


In [5]:
documents = []
for party in data:
    for member in data[party]:
        page_content = {
            "소속": party,
            "이름": member["이름"],
            "직책": member["직책"],
            "슬로건": member["슬로건"]
        }
        metadata = member["sns"]
        metadata["이미지"] = member["이미지"]
        
        newDocument = {
            "page_content": json.dumps(page_content, ensure_ascii=False),
            "metadata": metadata
        }
        documents.append(newDocument)
    

In [6]:
len(documents)

263

## 2) Data Insert

In [10]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [11]:
import time
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

index_name = "test2"
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1024,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

In [12]:
index = pc.Index(index_name)
vector_store = PineconeVectorStore(index=index, embedding=embeddings.OllamaEmbeddings(model="bge-m3"))

  vector_store = PineconeVectorStore(index=index, embedding=embeddings.OllamaEmbeddings(model="bge-m3"))


In [10]:
from langchain_core.documents import Document

vectorDocuments = []
for document in documents:
    vectorDocument = Document(page_content=document["page_content"], metadata=document["metadata"])
    vectorDocuments.append(vectorDocument)

In [11]:
ids = [str(i) for i in range(1, len(vectorDocuments)+1)]
vector_store.add_documents(documents=vectorDocuments, ids=ids)

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138',
 '13

## 3) Chat Completion Test

In [13]:
results = vector_store.similarity_search(query="이재성",k=3)
print(len(results))
for doc in results:
    pprint(f"* {doc.page_content} [{doc.metadata}]")


3
('* {"소속": "시도당", "이름": "이재성", "직책": "부산시당위원장", "슬로건": ""} [{\'twitter\': '
 "'https://x.com/saha_jaesung', '네이버블로그': "
 "'https://blog.naver.com/leejaesungtv', '유튜브': "
 "'https://www.youtube.com/channel/UC_Cbr6xHddChyPdx812y67Q', '이미지': "
 "'https://theminjoo.kr/people/connect/people/01876148/profile.jpg', '인스타그램': "
 "'https://www.instagram.com/e_sport_jaesung'}]")
('* {"소속": "중앙당", "이름": "이재명", "직책": "당대표", "슬로건": "이기는 민주당! 이재명은 합니다!"} '
 "[{'국회': 'https://www.assembly.go.kr/members/21st/LEEJAEMYUNG', '네이버블로그': "
 "'https://blog.naver.com/jaemyunglee', '유튜브': "
 "'https://www.youtube.com/watch?v=ZfEeptqZN1M', '이미지': "
 "'https://theminjoo.kr/people/connect/people/185/profile.jpg', '인스타그램': "
 "'https://www.instagram.com/2_jaemyung/'}]")
('* {"소속": "국회의원", "이름": "이재관", "직책": "국회의원(충남, 천안시을)", "슬로건": ""} '
 "[{'네이버블로그': 'https://blog.naver.com/panmotvictory0601', '유튜브': "
 "'https://www.youtube.com/channel/UCGjrFUi6yOEftCoUmsr_rNQ', '이미지': "
 "'https://theminjoo.kr/people/connect/p

In [17]:
retriever = vector_store.as_retriever()

after_rag_template = """Refer to the content below and respond in Korean. Ensure that your response sounds natural, as if you are directly providing the answer yourself.:
{context}
Question: {question}
"""
after_rag_prompt = ChatPromptTemplate.from_template(after_rag_template)
after_rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | after_rag_prompt
    | model_local
    | StrOutputParser()
)

In [19]:
print(after_rag_chain.invoke("이재관 유튜브?"))


저의 정보에 따르면, 이재관 님의 유튜브 채널은 'https://www.youtube.com/channel/UCGjrFUi6yOEftCoUmsr_rNQ'입니다. 더 많은 내용을 알고 싶으시다면 방문해 보세요!
