In [None]:
from langchain_core.documents import Document
import os
from dotenv import load_dotenv
from gen_ai_hub.proxy.langchain.openai import ChatOpenAI
from gen_ai_hub.proxy.langchain.openai import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores.hanavector import HanaDB
from hdbcli import dbapi
# import os
import configparser
import pandas as pd


In [None]:

# Credentials for connecting to SAP AI Core.
# NOTE(review): the values below are placeholders. Replace them with the real
# service-key values, ideally loaded from a .env file or a secret store instead
# of being committed inside the notebook.
os.environ["AICORE_CLIENT_ID"] = "client_id"
os.environ["AICORE_CLIENT_SECRET"] = "client_secret"
os.environ["AICORE_AUTH_URL"] = "auth_url"
os.environ["AICORE_BASE_URL"] = "auth_base_url"
os.environ["AICORE_RESOURCE_GROUP"]  = "dev"

# Echo the base URL as a quick check that the environment was populated.
print(os.getenv("AICORE_BASE_URL"))
config = configparser.ConfigParser()

# Open the SAP HANA connection that backs the vector store.
# Every keyword argument must be comma-separated; the port is passed as an integer.
connection = dbapi.connect(
    address='hand_db_url',
    port=443,
    user='hand_db_user',  # config.get('hana', 'user'),
    password='hand_db_password',
    autocommit=True,
    # NOTE(review): TLS certificate validation is switched off here; re-enable
    # it for anything beyond local experiments.
    sslValidateCertificate=False,
)

# Deployment IDs of the embedding model and the LLM (placeholders).
EMBEDDING_DEPLOYMENT_ID = 'embedding_model_instance_in_sap_ai_core_id'
LLM_DEPLOYMENT_ID = 'llm_model_instance_in_sap_ai_core_id'

# Define which model to use
chat_llm = ChatOpenAI(deployment_id=LLM_DEPLOYMENT_ID)

# Embedding model plus the HANA table in which the vectors are stored.
embeddings = OpenAIEmbeddings(deployment_id=EMBEDDING_DEPLOYMENT_ID)
db = HanaDB(
    embedding=embeddings, connection=connection, table_name="VECTOR_TABLE_NAME_IN_HANA_DB"
)

In [None]:
# Read the pre-processed workbook into a DataFrame.
df_pd = pd.read_excel(io="PRE_PROCESSED_DATA.xlsx")

# Show the first five rows as the cell output.
df_pd.head(n=5)

In [21]:
def convert_to_textloader(df, text_column, metadata_column, chunk_size=500, chunk_overlap=100):
    """Turn the rows of a DataFrame into chunked ``Document`` objects.

    Each row's text is split with a ``CharacterTextSplitter`` configured with
    ``chunk_size`` and ``chunk_overlap``; every resulting chunk becomes one
    ``Document`` whose metadata is ``{"metadata": <row's metadata value>}``.

    Args:
        df: DataFrame holding the source rows.
        text_column: Name of the column that contains the text to split.
        metadata_column: Name of the column whose value is attached to every
            chunk produced from the same row.
        chunk_size: Passed through to ``CharacterTextSplitter``.
        chunk_overlap: Passed through to ``CharacterTextSplitter``.

    Returns:
        A list of ``Document`` objects in row order, then chunk order. Rows
        whose text is missing (``None``/NaN/``pd.NA``) contribute no documents.
    """
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = []

    # Walk the two columns in lock-step; the row index is not needed.
    for text, metadata in zip(df[text_column], df[metadata_column]):
        if not isinstance(text, str):
            # Blank spreadsheet cells are read as NaN; the splitter cannot
            # handle them, so skip such rows instead of failing.
            if pd.isna(text):
                continue
            # Other scalar cells (e.g. numbers) are split as their string form.
            text = str(text)

        # One Document per chunk, all sharing the row's metadata value.
        for chunk in text_splitter.split_text(text):
            documents.append(Document(page_content=chunk, metadata={"metadata": metadata}))

    return documents

In [None]:
# Chunk the 'Details' text of every row, tagging each chunk with the row's 'Metadata' value.
documents = convert_to_textloader(
    df_pd,
    text_column='Details',
    metadata_column='Metadata',
)

In [None]:
# Clear out documents already stored in the vector table before re-loading.
# An empty filter is passed, i.e. no metadata condition restricts the delete.
db.delete(
    filter={},
)


[]

In [None]:
# Embed the chunked documents and store them in the vector DB table.
db.add_documents(documents=documents)