In [34]:
import os
import glob
from tqdm import tqdm
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from langchain_openai import ChatOpenAI


# Load variables from a .env file (if one is present) so the os.getenv
# lookups below can see them.
load_dotenv()

# Chat model that produces the final answer.
chat = ChatOpenAI(
    model="gpt-3.5-turbo",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)

# Pinecone credentials (API keys are issued at app.pinecone.io).
api_key = os.getenv("PINECONE_API_KEY")

# Client handle, plus the serverless deployment target passed to create_index.
pc = Pinecone(api_key=api_key)
spec = ServerlessSpec(region="us-east-1", cloud="aws")

In [5]:
import time

index_name = 'the-democraticparty'

# Names of every index currently visible to this Pinecone client.
existing_indexes = [entry["name"] for entry in pc.list_indexes()]

# Create the index only when it is missing, so re-running this cell is harmless.
if index_name not in existing_indexes:
    pc.create_index(
        index_name,
        spec=spec,
        metric='dotproduct',
        dimension=1536,  # dimensionality of ada 002 embeddings
    )
    # Poll once per second until Pinecone reports the new index as ready.
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# Open a handle to the index, pause briefly, then show its stats as the
# cell output.
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [6]:
# Import from langchain_openai (the package this notebook already uses for
# ChatOpenAI) rather than the legacy langchain.embeddings.openai path, which
# is deprecated in recent LangChain releases and emits a warning on use.
from langchain_openai import OpenAIEmbeddings

# Embedding model used for both documents and queries.
embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")

  embed_model = OpenAIEmbeddings(model="text-embedding-ada-002")


In [19]:
import pandas as pd

# Spreadsheet holding the knowledge-base rows; the path is relative to the
# notebook's working directory.
file_path = "./data/infos.xlsx"

# Read the sheet into a DataFrame.
data = pd.read_excel(io=file_path)


In [20]:
# Preview only the first rows instead of dumping the entire DataFrame, so the
# cell output stays small however large the spreadsheet grows.
data.head()

Unnamed: 0,source,title,text
0,https://theminjoo.kr/main/sub/introduce/team.php,더불어 민주당 중앙당 사람들,"당대표: 이재명, 원내대표: 박찬대, 최고의원: 김민석"


In [25]:
import datetime
import pytz

In [27]:
from tqdm.auto import tqdm  # for progress bar

batch_size = 100

# One KST timestamp per run, used as the prefix of every vector id.
kst_now_str = datetime.datetime.now(pytz.timezone("Asia/Seoul")).strftime("%Y%m%d%H%M%S")

for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i + batch_size)
    # get batch of data
    batch = data.iloc[i:i_end]
    # Build one unique id per row: "<timestamp>-<row position>".
    # The ids MUST be a list with one entry per row. Zipping a bare timestamp
    # string would iterate over its characters, producing one-character ids
    # that collide and silently truncating every batch to 14 vectors.
    ids = [f"{kst_now_str}-{pos}" for pos in range(i, i_end)]
    # get text to embed
    texts = [row['text'] for _, row in batch.iterrows()]
    # embed text
    embeds = embed_model.embed_documents(texts)
    # Metadata stored next to each vector. The loop variable is `_` so the
    # outer batch offset `i` is not rebound.
    metadata = [
        {'text': row['text'],
         'source': row['source'],
         'title': row['title']} for _, row in batch.iterrows()
    ]
    # Fail loudly rather than let zip() drop rows if the lengths ever diverge.
    assert len(ids) == len(embeds) == len(metadata), "batch length mismatch"
    # add to Pinecone as (id, values, metadata) tuples
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

100%|██████████| 1/1 [00:02<00:00,  2.06s/it]


In [13]:
# Alias the LangChain wrapper on import. A bare `Pinecone` here would rebind
# the name of the Pinecone client class imported from the `pinecone` package
# at the top of the notebook, which breaks out-of-order cell re-runs.
from langchain.vectorstores import Pinecone as PineconeVectorStore

text_field = "text"  # the metadata field that contains our text

# Wrap the Pinecone index so it can be searched with natural-language queries;
# embed_model.embed_query is passed as the function that embeds each query.
vectorstore = PineconeVectorStore(
    index, embed_model.embed_query, text_field
)

  vectorstore = Pinecone(


In [45]:
# Sample question asked against the knowledge base.
query = "더불어민주당 중앙당 최고의원은?"

# Show the three closest documents as the cell output.
vectorstore.similarity_search(query=query, k=3)

[Document(metadata={'source': 'https://theminjoo.kr/main/sub/introduce/team.php', 'title': '더불어 민주당 중앙당 사람들'}, page_content='당대표: 이재명, 원내대표: 박찬대, 최고의원: 김민석')]

In [46]:
def augment_prompt(query: str):
    """Build a retrieval-augmented prompt for ``query``.

    Fetches the three most similar documents from the module-level
    ``vectorstore``, joins their ``page_content`` with newlines, and embeds
    that text together with the query in an instruction asking for an answer
    in Korean.

    Args:
        query: The user's question.

    Returns:
        The prompt string: instruction line, a contexts section, and the query.
    """
    hits = vectorstore.similarity_search(query, k=3)
    passages = [doc.page_content for doc in hits]
    source_knowledge = "\n".join(passages)
    # The indentation inside the literal is part of the prompt text.
    return f"""Using the contexts below, answer the query in korean.

    Contexts:
    {source_knowledge}

    Query: {query}"""

In [47]:
from langchain.schema import SystemMessage, HumanMessage, AIMessage

# Start a fresh conversation with a single retrieval-augmented user turn.
messages = []

prompt = HumanMessage(
    content=augment_prompt(query)
)
# add to messages
messages.append(prompt)

# Use .invoke(): calling the chat model object directly (`chat(messages)`)
# is deprecated in LangChain.
res = chat.invoke(messages)

print(res.content)

김민석입니다.
