In [6]:
import os
import requests
from datetime import datetime
from langchain.document_loaders import DirectoryLoader, TextLoader, WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from dotenv import load_dotenv, find_dotenv 
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

from download_cards import download_model_cards
from utils import *
from huggingface_hub import login

from retriever import Retriever
from generator import Generator

from opik import configure 
from opik.integrations.langchain import OpikTracer 

# Load the variables from the .env file located by find_dotenv(), then
# authenticate with the Hugging Face Hub using the HUGGINGFACE_API_KEY value.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)
login(token=os.environ.get("HUGGINGFACE_API_KEY"))

In [7]:
# Opik project under which the LangChain traces of this notebook are recorded.
OPIK_PROJECT_NAME = "ai_engineers_agents_project"
opik_tracer = OpikTracer(project_name=OPIK_PROJECT_NAME)

def pretty_print_docs(docs):
    """Print each document's content and metadata, separated by a dashed rule.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``.
    """
    divider = "\n" + "-" * 100 + "\n"
    rendered = []
    for position, doc in enumerate(docs, start=1):
        rendered.append(
            f"Document {position}:\n\n{doc.page_content}\nMetadata: {doc.metadata}"
        )
    print(divider.join(rendered))

# Indexing 

In [15]:
# Repositories that can feed the retrieval corpus. Each one is extracted into a
# local folder named after the last path segment of its URL.
repo_url_evidently = "https://github.com/evidentlyai/evidently"
repo_url_dh = "https://github.com/scc-digitalhub/digitalhub-tutorials/"
# https://github.com/scc-digitalhub/digitalhub-tutorials/tree/main

repo_name_evidently = repo_url_evidently.rstrip('/').rsplit('/', 1)[-1]
repo_name_dh = repo_url_dh.rstrip('/').rsplit('/', 1)[-1]

extract_dir_evidently = "./" + repo_name_evidently
extract_dir_dh = "./" + repo_name_dh

# Flip to False to reuse an already-extracted copy instead of downloading again.
download_repo = True
if download_repo:
    #download_and_extract_repo(repo_url_evidently, extract_dir_evidently)
    download_and_extract_repo(repo_url_dh, extract_dir_dh)

#py_files = get_py_files(extract_dir_evidently)

In [26]:
# Step 1.1: load documents
def split_into_chunks(repo_folder, glob="**/*.py", chunk_size=500, chunk_overlap=50):
    """Load the files matching ``glob`` under ``repo_folder`` and split them into chunks.

    The defaults reproduce the original behaviour (Python files, 500/50
    chunking). Other corpora can reuse the function, e.g.
    ``split_into_chunks("model_cards", glob="**/*.md")``.

    Args:
        repo_folder: Directory handed to ``DirectoryLoader`` (a trailing ``/`` is appended).
        glob: File pattern forwarded to ``DirectoryLoader``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    loader = DirectoryLoader(f"{repo_folder}/", glob=glob, loader_cls=TextLoader)
    documents = loader.load()
    # Step 1.2: split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)

def split_into_chunks_documentation(web_url, chunk_size=500, chunk_overlap=50):
    """Load the page(s) at ``web_url`` with ``WebBaseLoader`` and split them into chunks.

    The defaults reproduce the original hard-coded 500/50 chunking.

    Args:
        web_url: URL (or whatever ``WebBaseLoader`` accepts) to load.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    loader = WebBaseLoader(web_url)
    documents = loader.load()
    # Step 1.2: split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)

# Chunk the digitalhub tutorials repository; the evidently corpus is currently disabled.
#chunks_evidently = split_into_chunks(extract_dir_evidently)
chunks_dh = split_into_chunks(repo_folder=extract_dir_dh)

In [27]:
# Embedding model configuration: run on CPU and leave the vectors unnormalised.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device='cpu')
encode_kwargs = dict(normalize_embeddings=False)
embeddings = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# Step 1.3: encode chunks into vectors and store in a vector database
# Alternatives kept for reference: load a previously saved index instead of
# rebuilding it, or build/save the evidently index.
#vectordb = FAISS.load_local(
#    "../vectorstore_evidently.db", embeddings, allow_dangerous_deserialization=True
#)
#vectordb = FAISS.load_local(
#    "../vectorstore_dh.db", embeddings, allow_dangerous_deserialization=True
#)
#vectordb_evidently = FAISS.from_documents(chunks_evidently, embeddings)
#vectordb_evidently.save_local("vectorstore_evidently.db")

# Build the digitalhub index from the chunks and persist it next to the notebook.
vectordb_dh = FAISS.from_documents(chunks_dh, embeddings)
vectordb_dh.save_local("vectorstore_dh.db")

# Merge vector stores
#vectordb_evidently.merge_from(vectordb_dh)


INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2


# Query

In [43]:
#query = "Please generate a new python script that detects data drift for tabular data using your context?"
# Two adjacent literals are concatenated by the parser, producing one single-line string.
query = (
    "Please generate two functions that transform the following AI operation function into the proper syntax within the AI governance platform using your context. "
    "The first function serves as the handler and the second one as the deployer of the handler"
)

# Retrieval

In [44]:
# Step 2: Retrieval — fetch the chunks from the digitalhub index that are most
# similar to the query, then print them for inspection.
retriever = vectordb_dh.as_retriever()
docs = retriever.invoke(query)
pretty_print_docs(docs=docs)

Document 1:

f"--logging_steps={logging_steps}",
        f"--learning_rate={learning_rate}",
        f"--warmup_steps={warmup_steps}",
        f"--eval_strategy=steps",
        f"--eval_steps={eval_steps}",
        f"--save_strategy=steps",
        f"--save_steps={save_steps}",
        f"--save_total_limit=1",
        f"--generation_max_length={max_sequence_length}",
        f"--preprocessing_num_workers=16",
        f"--max_duration_in_seconds=30",
        f"--text_column_name=sentence",
Metadata: {'source': 'digitalhub-tutorials/digitalhub-tutorials-main/s8-whisper-fine-tuning/src/fine_tuning_seq2seq.py'}
----------------------------------------------------------------------------------------------------
Document 2:

def pipeline():
    with Workflow(entrypoint="dag", arguments=Parameter(name="employees")) as w:
        with DAG(name="dag"):
            A = step(
                template={
                    "action": "transform",
                    "inputs": {"employees": "{{workf

## Reranker

In [45]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_compressors import FlashrankRerank

# Wrap the base retriever so its results pass through the Flashrank compressor
# before being returned, then print the resulting documents.
compressor = FlashrankRerank()
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)
compressed_docs = compression_retriever.invoke(query)
pretty_print_docs(docs=compressed_docs)


Document 1:

def pipeline():
    with Workflow(entrypoint="dag", arguments=Parameter(name="employees")) as w:
        with DAG(name="dag"):
            A = step(
                template={
                    "action": "transform",
                    "inputs": {"employees": "{{workflow.parameters.employees}}"},
                    "outputs": {"output_table": "department-50"},
                },
                function="transform-employees",
            )
    return w
Metadata: {'id': 1, 'relevance_score': np.float32(0.00085208216), 'source': 'digitalhub-tutorials/digitalhub-tutorials-main/s2-dbt/src/pipeline.py'}
----------------------------------------------------------------------------------------------------
Document 2:

from digitalhub_runtime_hera.dsl import step
from hera.workflows import Steps, Workflow


def pipeline():
    with Workflow(entrypoint="dag") as w:
        with Steps(name="dag"):
            A = step(
                template={"action": "job"},
                f

# Generation

In [46]:
# Build the code-generation prompt template from the deployer system/user prompt files.
generator = Generator(model="gpt-4o")
system_prompt_path = "prompts/deployer_prompts/system_prompt_codegen.txt"
user_prompt_path = "prompts/deployer_prompts/user_prompt_codegen.txt"
template = generator.format_prompt_codegen(
    system_prompt_path=system_prompt_path,
    user_prompt_path=user_prompt_path,
)
template

'You are a helpful assistant that generates Python code based on the provided context.\nConsider the following definitions:\n    1) The AI governance platform serves as the environment that converts a Python script into an executable function, integrating it into a larger system that orchestrates the tools and processes required to run, manage, and deploy the various components of an AI product.\n    2) AI operation is a phase whithin the AI product lifecycle — covering activities related to the data preparation, model training or model serving and monitoring.\n    3) The Deployer function converts the AI operation function into the syntax required by the AI governance platform.\nIMPORTANT: If you don\'t know the answer, just say that you don\'t know. \n\n \n Question: {question} \n\ndef bias_mitigation_pre_reweighing(data: Data, config: Configuration) -> Data:\n    """Reweighing is a pre-processing bias mitigation technique that amends the dataset to achieve statistical parity. \n    

In [47]:
# Turn the raw template string into a chat prompt and print it for inspection.
prompt = ChatPromptTemplate.from_template(template=template)
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='You are a helpful assistant that generates Python code based on the provided context.\nConsider the following definitions:\n    1) The AI governance platform serves as the environment that converts a Python script into an executable function, integrating it into a larger system that orchestrates the tools and processes required to run, manage, and deploy the various components of an AI product.\n    2) AI operation is a phase whithin the AI product lifecycle — covering activities related to the data preparation, model training or model serving and monitoring.\n    3) The Deployer function converts the AI operation function into the syntax required by the AI governance platform.\nIMPORTANT: If you don\'t know the answer, just say that you don\'t know. \n\n \n Questio

In [48]:
# Step 3: Generation: input the original question and the retrieved chunks together into LLM to generate the final answer.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0.5, callbacks=[opik_tracer])

# The reranking retriever fills {context}; the raw query is passed through as {question}.
rag_chain = (
    {"context": compression_retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# NOTE(review): Runnable.invoke normally receives callbacks through
# `config={"callbacks": [...]}`; confirm that the bare `callbacks=` keyword
# below is picked up. Tracing is also attached to `llm` above.
result = rag_chain.invoke(query, callbacks=[opik_tracer])

# Persist the generated code. str(datetime.now()) contains a space and colons,
# which are invalid in Windows file names, so a compact timestamp is used.
# The output folder is created on demand so a fresh checkout does not fail.
current_time = datetime.now()
os.makedirs("results", exist_ok=True)
output_path = os.path.join(
    "results", f"generated_{current_time:%Y%m%d_%H%M%S_%f}.py"
)
with open(output_path, "w", encoding="utf-8") as file:
    file.write(result)

INFO:httpx:HTTP Request: POST http://localhost:5173/api/v1/private/traces/batch "HTTP/1.1 204 No Content"
INFO:httpx:HTTP Request: POST http://localhost:5173/api/v1/private/spans/batch "HTTP/1.1 204 No Content"
INFO:openai._base_client:Retrying request to /chat/completions in 0.440328 seconds
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


INFO:httpx:HTTP Request: POST http://localhost:5173/api/v1/private/traces/batch "HTTP/1.1 204 No Content"
INFO:httpx:HTTP Request: POST http://localhost:5173/api/v1/private/spans/batch "HTTP/1.1 204 No Content"


# Evaluation