# AI Cyoda configuration Q&A with RAG and LangChain

This is a playground for experimenting with mapping generation.

Install requirements

In [None]:
pip install -r ../requirements.txt

### Load environment variables

In [None]:
from dotenv import load_dotenv
import os

# Populate os.environ from a local .env file (if one exists) before reading settings.
load_dotenv()

# Every setting below is required: a missing variable raises KeyError right here,
# rather than failing later in an unrelated cell. Each variable is read exactly once.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
WORK_DIR = os.environ["WORK_DIR"]
API_KEY = os.environ["CYODA_API_KEY"]
API_SECRET = os.environ["CYODA_API_SECRET"]
API_URL = os.environ["CYODA_API_URL"] + "/api"
GRPC_ADDRESS = os.environ["GRPC_ADDRESS"]
# Starts empty; no cell visible in this notebook assigns it afterwards.
TOKEN = ""

In [None]:
import logging

# Configure the root logger to emit INFO and above for the whole notebook session.
logging.basicConfig(level=logging.INFO)
# Logger named after this module; the file-reading helper reports errors through it.
logger = logging.getLogger(__name__)

In [None]:
%%script echo skipping
##for google colab (optional)
# This cell is optional and can be skipped
from google.colab import userdata
API_KEY = userdata.get('OPENAI_API_KEY')
WORK_DIR = userdata.get('WORK_DIR')

### Handle unsupported version of sqlite3 (optional)

In [None]:
pip install pysqlite3-binary

In [None]:
import sys

# Import pysqlite3 purely for its side effect: it gets registered in sys.modules.
__import__("pysqlite3")
# Alias it under the name "sqlite3" so any later `import sqlite3` resolves to
# pysqlite3 instead of the interpreter's bundled (unsupported) sqlite3.
sys.modules["sqlite3"] = sys.modules["pysqlite3"]

### Initialize ChatOpenAI

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import GitLoader, DirectoryLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.schema import HumanMessage

In [None]:
# Chat model used both for question rewriting and for answer generation below.
llm = ChatOpenAI(
    model="gpt-4o-mini",
    openai_api_key=OPENAI_API_KEY,
    temperature=0.45,
    max_tokens=4000,
)

### Load instructions and entities from the official cyoda repository

In [None]:
%%script echo skipping
# Disabled alternative to the directory loader: the %%script magic above hands this
# cell body to `echo skipping` instead of running it as Python.
# When enabled, it loads files from the "develop" branch of the repository at WORK_DIR,
# keeping only paths under data/rag/v1/connections/templates.
loader = GitLoader(
    repo_path=WORK_DIR,
    branch="develop",
    file_filter=lambda file_path: file_path.startswith(f"{WORK_DIR}/data/rag/v1/connections/templates"),
)
docs = loader.load()
print(f"Number of documents loaded: {len(docs)}")

In [None]:
# Load the files under the mappings directory as plain-text documents for the RAG index.
loader = DirectoryLoader(f"{WORK_DIR}/data/rag/v1/mappings", loader_cls=TextLoader)
docs = loader.load()
print(f"Number of documents loaded: {len(docs)}")

### Split documents and create vectorstore

In [None]:
# Split the loaded documents into chunks of up to 10000 characters with a
# 200-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
# Hand the key to the embeddings explicitly, exactly as the ChatOpenAI cell does,
# so both OpenAI clients use the same credential instead of one relying on the
# ambient environment variable.
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
)
# Retrieve the 10 most similar chunks for each query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

In [None]:
# Sanity check: number of entries in the underlying Chroma collection.
# NOTE: `_collection` is a private attribute of the vector store wrapper.
count = vectorstore._collection.count()
print(count)

In [None]:
# Smoke-test retrieval with an arbitrary query and print whatever documents match.
res = vectorstore.similarity_search("Get some document")
print(res)

### Define prompts for contextualizing the question and answering it

In [None]:
# System instruction asking the model to rewrite the latest user question so it
# stands on its own, without answering it. Built from adjacent literals; the
# resulting text is a single line.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

# Prompt layout: system instruction, then the prior conversation, then the new question.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

In [None]:
# Combine the LLM, the retriever and the contextualizing prompt into a retriever that
# takes the chat history into account when fetching documents.
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

### Answer question

In [None]:
# System prompt for the answering step. `{context}` is the slot where the retrieved
# documents are inserted. Fixed the duplicated word "assistant assistant" and
# removed stray trailing spaces before the context slot.
qa_system_prompt = """You are a mapping generation code assistant. \
You are an expert in Javascript Nashorn and understand how it is different from Java and javascript.
You will be asked to generate Nashorn javascript code to map input to entity. \
First, analyse the input and the entity and fill in Mapping Questionnaire.
Then do your best to do code assistance for mapping the input to the entity.
{context}"""

# Prompt layout: system instruction (with context), prior conversation, new question.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
# Chain that feeds the retrieved documents into the prompt and calls the LLM.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

### Create retrieval chain

In [None]:
# Full pipeline: history-aware retrieval followed by the question-answering chain.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

### Initialize chat history and relevant functions

In [None]:
# Maps a chat-session id to the list of messages exchanged in that session.
chat_history = {}

In [None]:
def add_to_chat_history(id, question, message):
    """Append one question/answer turn to the history of chat session ``id``.

    The question is wrapped in a ``HumanMessage``; ``message`` is stored as given.
    A history list is created on first use of a session id.
    """
    turn = [HumanMessage(content=question), message]
    chat_history.setdefault(id, []).extend(turn)

In [None]:
def clear_chat_history(id):
    """Forget the history of chat session ``id``; unknown ids are ignored."""
    chat_history.pop(id, None)

In [None]:
def ask_question(id, question):
    """Ask the RAG chain a question within chat session ``id`` and return the answer text.

    The question and the answer are appended to the session history so that
    follow-up questions can refer back to them.
    """
    # Local import keeps this cell self-contained; langchain_core is already a
    # dependency of the notebook.
    from langchain_core.messages import AIMessage

    response = rag_chain.invoke(
        {"input": question, "chat_history": chat_history.get(id, [])}
    )
    answer = response["answer"]
    # Record the answer as an AI message. A bare string in the history is coerced
    # by LangChain into a *human* message, which would make the model's own
    # answers look like user turns on the next call.
    add_to_chat_history(id, question, AIMessage(content=answer))
    return answer

### Start a chat session

In [None]:
import uuid

# Generate a unique ID for the chat session
# NOTE(review): the name `id` shadows the built-in id() for the rest of the notebook;
# it is kept because later cells refer to it by this name.
id = uuid.uuid1()

In [None]:
# Clear chat history if necessary (drops the stored messages for the current session id;
# does nothing when the session has no history yet).
clear_chat_history(id)

In [None]:
def parse_json(result):
    """Extract the JSON text from an LLM reply that may wrap it in a Markdown code fence.

    Handles three shapes of reply:

    * the reply is itself a fenced block (any language tag on the opening fence);
    * the reply contains a ``json``-tagged fenced block somewhere in surrounding prose;
    * anything else, which is returned unchanged.

    Surrounding whitespace around the reply or the fenced content does not affect
    the result, and an unterminated fence yields everything after the opening
    fence line instead of silently dropping the last character.

    Args:
        result: Raw text returned by the model.

    Returns:
        The fenced content with surrounding whitespace removed, or ``result``
        unchanged when no usable fence is found.
    """
    text = result.strip()
    if text.startswith("```"):
        fence_start = 0
    elif text.startswith("{"):
        # Already bare JSON.
        return result
    else:
        fence_start = text.find("```json")
        if fence_start == -1:
            return result

    # The content begins on the line after the opening fence (which may carry a
    # language tag such as "json").
    line_end = text.find("\n", fence_start)
    if line_end == -1:
        return result
    body_start = line_end + 1

    fence_end = text.find("```", body_start)
    body = text[body_start:] if fence_end == -1 else text[body_start:fence_end]
    return body.strip()

In [None]:
def generate_uuid() -> uuid.UUID:
    """Return a new version-1 UUID.

    The return annotation is ``uuid.UUID`` (the class); the previous annotation
    named the ``uuid`` module itself, which is not a type.
    """
    return uuid.uuid1()


generate_uuid()

In [None]:
import json
import jsonschema
from jsonschema import validate

# connections_file_path = f'{WORK_DIR}/data/v1/connections/connection_json_schema.json'
# endpoints_file_path = f'{WORK_DIR}/data/v1/connections/endpoint_json_schema.json'


def validate_result(parsed_result: str, file_path: str) -> bool:
    """Check a JSON document against a JSON Schema stored on disk.

    Args:
        parsed_result: The candidate JSON document as text.
        file_path: Path to the JSON Schema file.

    Returns:
        True when ``parsed_result`` is well-formed JSON that satisfies the schema;
        False when it is malformed or fails validation. The reason is printed
        in both failure cases.

    Raises:
        OSError: If the schema file cannot be opened.
        json.JSONDecodeError: If the schema file itself is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as schema_file:
        schema = json.load(schema_file)

    # Malformed candidate text is reported as invalid rather than escaping as an
    # exception, matching the boolean contract of this function.
    try:
        json_data = json.loads(parsed_result)
    except json.JSONDecodeError as err:
        print("JSON is invalid:", err.msg)
        return False

    try:
        validate(instance=json_data, schema=schema)
    except jsonschema.exceptions.ValidationError as err:
        print("JSON is invalid:", err.message)
        return False

    print("JSON is valid.")
    return True

In [None]:
def get_input(file_path):
    """Read a text file and return its content.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).

    Returns:
        The file content as a string, or ``None`` when the file cannot be read
        or decoded; the failure is logged.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file; ValueError covers decoding errors
        # and malformed paths. Other exceptions indicate a programming error
        # and are allowed to surface.
        logger.error("Failed to read JSON file: %s", e)
        return None


# Load one of the sample inputs and show its raw text (prints None if the read failed).
data = get_input(f"{WORK_DIR}/data/test-inputs/v1/mappings/tender_level_0.json")
print(data)

In [None]:
# NOTE: `input` shadows the built-in input(); the name is kept because later cells read it.
input = get_input(f"{WORK_DIR}/data/test-inputs/v1/mappings/tender_level_2.json")
entity_name = "Tender Entity f2c3867f-6ddc-4a48-a47a-03ea7ac6b306"
question = f"Get {entity_name} entity json schema from the context. If you don't have it - return that you do not have data for {entity_name} entity and stop execution. Fill in Mappings Questionnaire json based on the input: {input}. Return the resulting Questionnaire json."
questionnaire_result = ask_question(id, question)
print(questionnaire_result)

# Bind the name before the try block so the except branch can always print it:
# if parse_json itself fails, the name would otherwise be unbound (NameError hiding
# the real error) or hold a stale value from an earlier run of this cell.
parsed_questionnaire_result = None
try:
    parsed_questionnaire_result = parse_json(questionnaire_result)
    print(parsed_questionnaire_result)
    parsed_questionnaire_result_json = json.loads(parsed_questionnaire_result)
except Exception:
    print("error")
    print(parsed_questionnaire_result)
    # Bare raise re-raises the active exception with its original traceback.
    raise

In [None]:
# Follow-up in the same chat session: ask for the mapping code itself.
# NOTE: the answer is stored in `questionnaire_result`, overwriting the questionnaire
# reply from the previous cell.
question = f"Write code to map {entity_name} to the provided input. Base your answer on the available list_of_input_to_entity_properties"
questionnaire_result = ask_question(id, question)
print(questionnaire_result)

In [None]:
# Follow-up in the same chat session: ask for a notice-filtering snippet only.
# Plain literal: the text has no placeholders, so no f-string is needed.
question = "Write code to filter notices by noticeTitle. Exclude all notices that start with P. Return only this piece of code"
questionnaire_result = ask_question(id, question)
print(questionnaire_result)

In [None]:
def generate_paths(data, current_path=""):
    """List the slash-separated paths of every scalar leaf in nested dicts/lists.

    Dict keys become path segments and every list element contributes the
    wildcard segment ``*``, so sibling list elements yield repeated paths.
    Empty containers and a non-container ``data`` produce no paths.

    Args:
        data: Parsed JSON-like structure to walk.
        current_path: Path prefix of ``data`` within the overall structure.

    Returns:
        The leaf paths in traversal order, duplicates included.
    """
    if isinstance(data, dict):
        children = (
            (f"{current_path}/{key}" if current_path else key, value)
            for key, value in data.items()
        )
    elif isinstance(data, list):
        wildcard = f"{current_path}/*"
        children = ((wildcard, element) for element in data)
    else:
        return []

    collected = []
    for child_path, child in children:
        if isinstance(child, (dict, list)):
            collected.extend(generate_paths(child, child_path))
        else:
            collected.append(child_path)
    return collected

In [None]:
# `input` holds the raw text read from the test-input file; parse it and list the
# path of every leaf value in the resulting structure.
print(type(input))
input_json = json.loads(input)
print(type(input_json))
paths = generate_paths(input_json)
print(paths)

In [None]:
# Assemble the script payload from the parsed questionnaire text and the input paths.
# NOTE(review): str(paths) stores the Python repr of the list (single-quoted items),
# not a list or JSON array — confirm this is what the consumer expects.
script = {"script": {"body": str(parsed_questionnaire_result), "inputSrcPaths": str(paths)}}

In [None]:
# Inspect the assembled script payload.
print(script)

In [None]:
# Inspect the stored messages of every chat session.
print(chat_history)

In [None]:
# Clear chat history if necessary (drops the stored messages for the current session id).
clear_chat_history(id)