In [1]:
import os
import requests
from datetime import datetime
from langchain.document_loaders import DirectoryLoader, TextLoader, WebBaseLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from dotenv import load_dotenv, find_dotenv 
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

from download_cards import download_model_cards
from utils import *
from huggingface_hub import login

from retriever import Retriever
from generator import Generator

# Load environment variables from the .env file located by find_dotenv(), then
# authenticate with the Hugging Face Hub using the HUGGINGFACE_API_KEY variable.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)
login(token=os.environ.get("HUGGINGFACE_API_KEY"))

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
def pretty_print_docs(docs):
    """Print every document's text and metadata, separated by a dashed rule.

    Args:
        docs: Iterable of objects exposing ``page_content`` and ``metadata``
            attributes.

    Returns:
        None. The formatted listing is written to standard output; documents
        are numbered starting at 1 and separated by a line of 100 dashes.
    """
    divider = "\n" + "-" * 100 + "\n"
    rendered = []
    for position, doc in enumerate(docs, start=1):
        rendered.append(
            f"Document {position}:\n\n{doc.page_content}\nMetadata: {doc.metadata}"
        )
    print(divider.join(rendered))

# Indexing 

In [3]:
# Source locations for the two corpora used in this notebook.
repo_url_evidently = "https://github.com/evidentlyai/evidently"
# The last path segment of the repository URL doubles as the local folder name.
repo_name_evidently = repo_url_evidently.rstrip('/').rsplit('/', 1)[-1]
extract_dir_evidently = f"./{repo_name_evidently}"

# DigitalHub documentation site.
# Related repository: https://github.com/scc-digitalhub/digitalhub-tutorials/tree/main
repo_url_dh = "https://scc-digitalhub.github.io/docs/"

# Set to False to skip the download step below.
download_repo = True
if download_repo:
    download_and_extract_repo(repo_url_evidently, extract_dir_evidently)

In [4]:
# Step 1.1 / 1.2: load documents and split them into chunks.
def _split_documents(documents, chunk_size, chunk_overlap):
    """Split already-loaded documents with a RecursiveCharacterTextSplitter.

    Args:
        documents: Documents returned by a loader's ``load()`` call.
        chunk_size: ``chunk_size`` forwarded to the splitter.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.

    Returns:
        The result of ``split_documents(documents)``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)


def split_into_chunks(repo_folder, glob_pattern="**/*.py", chunk_size=500, chunk_overlap=50):
    """Load files from a local folder and split them into chunks.

    Args:
        repo_folder: Folder to scan; a trailing ``/`` is appended before it is
            handed to ``DirectoryLoader``.
        glob_pattern: Glob selecting the files to load. Defaults to every
            ``.py`` file under the folder, matching the previous hard-coded value.
        chunk_size: Splitter chunk size (default 500, as before).
        chunk_overlap: Splitter chunk overlap (default 50, as before).

    Returns:
        The chunked documents produced by the splitter.
    """
    # Each matching file is read with TextLoader.
    loader = DirectoryLoader(f"{repo_folder}/", glob=glob_pattern, loader_cls=TextLoader)
    return _split_documents(loader.load(), chunk_size, chunk_overlap)


def split_into_chunks_documentation(web_url, chunk_size=500, chunk_overlap=50):
    """Load a web page and split it into chunks.

    Args:
        web_url: URL handed to ``WebBaseLoader``.
        chunk_size: Splitter chunk size (default 500, as before).
        chunk_overlap: Splitter chunk overlap (default 50, as before).

    Returns:
        The chunked documents produced by the splitter.
    """
    # NOTE(review): only `web_url` itself is given to the loader; linked pages
    # are presumably not crawled -- confirm this covers the intended docs.
    loader = WebBaseLoader(web_url)
    return _split_documents(loader.load(), chunk_size, chunk_overlap)

# Build the chunk lists for both corpora: the local Evidently checkout and the
# DigitalHub documentation URL. In the cells visible here, chunks_dh is only
# referenced from commented-out code.
chunks_evidently = split_into_chunks(extract_dir_evidently)
chunks_dh = split_into_chunks_documentation(repo_url_dh)

In [5]:
# Step 1.3: encode chunks into vectors and store in a vector database
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Reuse the saved FAISS index when it exists; otherwise build it from the
# chunks and persist it, so the cell also works on a fresh checkout
# (Restart Kernel -> Run All) without manually toggling commented-out code.
vectorstore_path_evidently = "vectorstore_evidently.db"
if os.path.isdir(vectorstore_path_evidently):
    # SECURITY: allow_dangerous_deserialization=True lets load_local unpickle
    # the stored data. Only load an index that this notebook produced itself.
    vectordb_evidently = FAISS.load_local(
        vectorstore_path_evidently, embeddings, allow_dangerous_deserialization=True
    )
else:
    vectordb_evidently = FAISS.from_documents(chunks_evidently, embeddings)
    vectordb_evidently.save_local(vectorstore_path_evidently)

# Optional: index the DigitalHub documentation and merge it into the main store.
#vectordb_dh = FAISS.from_documents(chunks_dh, embeddings)
#vectordb_dh.save_local("vectorstore_dh.db")

# Merge vector stores
#vectordb_evidently.merge_from(vectordb_dh)


# Query

In [6]:
# Natural-language request reused below for retrieval, re-ranking and the RAG chain.
query = "Please generate a new python script that detects data drift for tabular data using your context?"

# Retrieval

In [7]:
# Step 2: Retrieval: retrieve the Top k chunks most relevant to the question based on semantic similarity.
# No search arguments are passed, so the number of returned chunks is whatever
# the vector store's retriever uses by default.
retriever = vectordb_evidently.as_retriever()
docs = retriever.invoke(query)
# Print the retrieved chunks together with their source metadata for manual inspection.
pretty_print_docs(docs)

Document 1:

"""Methods and types for data drift calculations."""

from dataclasses import dataclass
from typing import Dict
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union

import numpy as np
import pandas as pd
Metadata: {'source': 'evidently/evidently-main/src/evidently/legacy/calculations/data_drift.py'}
----------------------------------------------------------------------------------------------------
Document 2:

features: Dict[str, ColumnDriftParameter]

    @classmethod
    def from_data_drift_table(cls, table: DataDriftTableResults, condition: TestValueCondition):
        return ColumnsDriftParameters(
            features={
                feature: ColumnDriftParameter.from_metric(data) for feature, data in table.drift_by_columns.items()
            },
            condition=condition,
        )
Metadata: {'source': 'evidently/evidently-main/src/evidently/legacy/tests/data_drift_tests.py'}
-----

## Reranker

In [8]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.document_compressors import FlashrankRerank

# Re-rank the base retriever's results with FlashRank, constructed with its defaults.
# The first run downloads the ranking model (see the log output of this cell),
# so network access is required.
compressor = FlashrankRerank()
# Wrap the vector-store retriever so its results are passed through the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)
# Run the same query through the re-ranking retriever and print the result;
# the printed metadata includes a relevance_score per document.
compressed_docs = compression_retriever.invoke(query)
pretty_print_docs(compressed_docs)


INFO:flashrank.Ranker:Downloading ms-marco-MultiBERT-L-12...
ms-marco-MultiBERT-L-12.zip: 100%|██████████| 98.7M/98.7M [00:15<00:00, 6.71MiB/s]


Document 1:

features: Dict[str, ColumnDriftParameter]

    @classmethod
    def from_data_drift_table(cls, table: DataDriftTableResults, condition: TestValueCondition):
        return ColumnsDriftParameters(
            features={
                feature: ColumnDriftParameter.from_metric(data) for feature, data in table.drift_by_columns.items()
            },
            condition=condition,
        )
Metadata: {'id': 1, 'relevance_score': np.float32(0.9961698), 'source': 'evidently/evidently-main/src/evidently/legacy/tests/data_drift_tests.py'}
----------------------------------------------------------------------------------------------------
Document 2:

"""Methods and types for data drift calculations."""

from dataclasses import dataclass
from typing import Dict
from typing import List
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union

import numpy as np
import pandas as pd
Metadata: {'id': 0, 'relevance_score': np.float32(0

# Generation

In [9]:
# Build the prompt template text from the system and user prompt files.
# Generator comes from the project's `generator` module; how it uses `model`
# is not visible in this notebook.
generator = Generator(model="gpt-4o")
template = generator.format_prompt_codegen(
    system_prompt_path="prompts/system_prompt_codegen.txt", 
    user_prompt_path="prompts/user_prompt_codegen.txt")
# Bare expression so the notebook displays the resulting template string.
template

"You are a helpful assistant that generates Python code based on the provided context.\nConsider the following definitions:\n    1) Data validation is an operation in the AI system lifecycle that validates the quality of training data to identify bias issues that could impact model performance across groups of sensitive features.\n    2) Data preprocessing is an operation in the AI system lifecycle that applies data cleaning procedure, data augmentation, type conversion. This operation can analyze data distribution, evaluate bias or discrimination issues on data and transform data in order to mitigate potential risks deriving from low data quality.\n    3) Fairness means ensuring equity in the decision-making process of a machine learning algorithm across individuals and groups. Group fairness split a population into groups defined by protected attributes (e.g. gender, race) and seeks for some measure to be as equal as possible across groups. Some fairness metrics for measuring group f

In [10]:
# Wrap the template text in a chat prompt. The printed representation shows
# `context` and `question` as its input variables.
prompt = ChatPromptTemplate.from_template(template)
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are a helpful assistant that generates Python code based on the provided context.\nConsider the following definitions:\n    1) Data validation is an operation in the AI system lifecycle that validates the quality of training data to identify bias issues that could impact model performance across groups of sensitive features.\n    2) Data preprocessing is an operation in the AI system lifecycle that applies data cleaning procedure, data augmentation, type conversion. This operation can analyze data distribution, evaluate bias or discrimination issues on data and transform data in order to mitigate potential risks deriving from low data quality.\n    3) Fairness means ensuring equity in the decision-making process of a machine learning algorithm across individuals

In [11]:
# Step 3: Generation: input the original question and the retrieved chunks together into LLM to generate the final answer.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0.5)

# The re-ranking retriever fills `context`; the raw query is passed through
# unchanged as `question`. The model reply is reduced to a plain string.
rag_chain = (
    {"context": compression_retriever,  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

result = rag_chain.invoke(query)

# Persist the generated script under results/.
# - str(datetime.now()) contains a space and colons, which are invalid in
#   Windows file names, so a compact strftime stamp is used instead
#   (microseconds keep names unique across quick re-runs).
# - The directory is created on demand so a fresh checkout does not fail.
# - UTF-8 is set explicitly so non-ASCII model output is written consistently.
# NOTE(review): `result` is written verbatim; if the model wraps its answer in
# Markdown code fences the file will not be directly runnable -- confirm.
current_time = datetime.now()
os.makedirs("results", exist_ok=True)
timestamp = current_time.strftime("%Y%m%d_%H%M%S_%f")
with open(f"results/generated_{timestamp}.py", "w", encoding="utf-8") as file:
    file.write(result)

  llm = ChatOpenAI(model_name="gpt-4o", temperature=0.5)
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


# RAG Evaluation

In [None]:
# Import necessary libraries
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Instantiate the models
# NOTE(review): ChatOpenAI here is the class imported from langchain_openai in
# this cell, which replaces the langchain.chat_models import from the first cell.
generator_llm = ChatOpenAI(model="gpt-4o-mini")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
# NOTE(review): rebinds `embeddings`, which an earlier cell set to the
# HuggingFaceEmbeddings instance used for the FAISS store.
embeddings = OpenAIEmbeddings()

# Create the TestsetGenerator
# NOTE(review): rebinds `generator`, which an earlier cell set to the project's
# Generator instance.
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    embeddings
)

# Call the generator
# NOTE(review): `data_transformed` is not defined in any cell visible in this
# notebook, and this cell has no execution count; as written it would raise a
# NameError on a fresh kernel. Presumably it should be a list of LangChain
# documents -- confirm which one is intended.
testset = generator.generate_with_langchain_docs(
data_transformed, 
test_size=20, 
distributions={ 
simple: 0.5, 
reasoning: 0.25, 
multi_context: 0.25}
)

## LLM-as-a-judge