In [1]:
import os

from langchain_community.document_loaders import DirectoryLoader
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings

from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset import TestsetGenerator

from ai_assistant.utils.global_utils import load_conf, load_env

In [2]:
# Project helper from ai_assistant.utils.global_utils; presumably populates
# os.environ (e.g. from a .env file) -- confirm in that module.
load_env()

# ROOT is read without a fallback, so project_root is None when the variable is unset.
project_root = os.getenv("ROOT")

# Project configuration; used further down as a nested mapping (cfg["azure"][...]).
cfg = load_conf(project_root)

In [16]:
# Connection settings shared by the Azure OpenAI chat and embedding clients.
# The endpoint comes from the environment; deployment names and the API version
# come from the project config.
_llm_cfg = cfg["azure"]["llm"]
_embedding_cfg = cfg["azure"]["embedding"]

azure_configs = dict(
    base_url=os.getenv("AZURE_OPENAI_ENDPOINT"),
    model_deployment=_llm_cfg["model"],
    # NOTE(review): hard-coded while every other value is config-driven --
    # confirm it matches the model behind `model_deployment`.
    model_name="gpt-4o",
    openai_api_version=_llm_cfg["version"],
    # The same config entry is used for both the embedding deployment and model name.
    embedding_deployment=_embedding_cfg["model"],
    embedding_name=_embedding_cfg["model"],
)

In [17]:
# Chat model handed to the RAGAS TestsetGenerator. The LangChain client is built
# first and then wrapped in the RAGAS adapter.
azure_chat_model = AzureChatOpenAI(
    model=azure_configs["model_name"],
    azure_deployment=azure_configs["model_deployment"],
    azure_endpoint=azure_configs["base_url"],
    openai_api_version=azure_configs["openai_api_version"],
    validate_base_url=False,
)
generator_llm = LangchainLLMWrapper(azure_chat_model)

# Embedding model handed to the RAGAS TestsetGenerator.
# It reuses the API version configured for the LLM (azure_configs has a single
# "openai_api_version" entry).
azure_embedding_model = AzureOpenAIEmbeddings(
    model=azure_configs["embedding_name"],
    azure_deployment=azure_configs["embedding_deployment"],
    azure_endpoint=azure_configs["base_url"],
    openai_api_version=azure_configs["openai_api_version"],
)
generator_embeddings = LangchainEmbeddingsWrapper(azure_embedding_model)

In [12]:
# Load every markdown file (recursively) under the sample-docs folder as
# LangChain documents. The path is relative, so it resolves against the
# notebook's current working directory.
path = "../downloads/Sample_Docs_Markdown/"
loader = DirectoryLoader(path, glob="**/*.md")
docs = loader.load()

# A glob that matches nothing yields an empty list without any error; stop here
# with a clear message instead of failing later inside the LLM-backed testset
# generation.
if not docs:
    raise FileNotFoundError(
        f"No '*.md' documents found under {os.path.abspath(path)!r}; "
        "check the working directory and the sample-docs path."
    )

In [18]:
# Build the RAGAS testset generator from the wrapped Azure chat and embedding models.
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embeddings,
)

# Generate a testset from the loaded markdown documents; the requested size is 10.
# This step calls the LLM and embedding endpoints.
dataset = generator.generate_with_langchain_docs(
    docs,
    testset_size=10,
)

Applying HeadlinesExtractor:   0%|          | 0/5 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/10 [00:00<?, ?it/s]

unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node


Applying SummaryExtractor:   0%|          | 0/7 [00:00<?, ?it/s]

Property 'summary' already exists in node '395843'. Skipping!
Property 'summary' already exists in node '9189ff'. Skipping!


Applying CustomNodeFilter:   0%|          | 0/9 [00:00<?, ?it/s]

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/25 [00:00<?, ?it/s]

Property 'summary_embedding' already exists in node '9189ff'. Skipping!
Property 'summary_embedding' already exists in node '395843'. Skipping!


Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/12 [00:00<?, ?it/s]

In [20]:
# Persist the generated testset as an Excel sheet.
# NOTE(review): the file name is spelled "datatset"; kept unchanged because
# other code may read this exact path -- rename deliberately if desired.
output_path = "../downloads/Generated_GTs/md_datatset.xlsx"

# Create the target folder first: to_excel raises if it is missing, which would
# otherwise only surface after the expensive generation run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

dataset.to_pandas().to_excel(output_path)