In [1]:
import os
import asyncio
from openai import AsyncOpenAI
from ragas.llms import llm_factory
from ragas.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import DirectoryLoader
from ragas.testset import TestsetGenerator


In [None]:
import logging
from ragas.testset.transforms.extractors.llm_based import HeadlinesExtractor, SummaryExtractor, KeyphrasesExtractor
from ragas.testset.transforms.splitters import HeadlineSplitter

logging.getLogger("ragas.prompt").setLevel(logging.DEBUG)
logging.getLogger("ragas.llms").setLevel(logging.DEBUG)

logging.basicConfig(level=logging.INFO)
async def generate_testset():
    # Ollama's OpenAI-compatible endpoint 
    async_client = AsyncOpenAI(
        base_url="http://localhost:11434/v1",
        api_key="ollama"  # dummy api key
    )

    generator_llm = llm_factory("qwen2.5:3b", client=async_client, provider="openai")

    generator_embeddings = HuggingFaceEmbeddings(model="BAAI/bge-small-en-v1.5")

    loader = DirectoryLoader("../../Documents/Ragas", glob="**/*.pdf")    
    documents = loader.load()
    
    for doc in documents:
        doc.metadata['filename'] = doc.metadata.get('source', 'unknown')

    generator = TestsetGenerator(
        llm=generator_llm,
        embedding_model=generator_embeddings
    )
    my_splitter = HeadlineSplitter(min_tokens=100, max_tokens=500)
    my_headlines = HeadlinesExtractor(llm=generator_llm, max_token_limit=300)
    my_summary = SummaryExtractor(llm=generator_llm, max_token_limit=300)
    my_keys = KeyphrasesExtractor(llm=generator_llm, max_token_limit=2000)

    testset = generator.generate_with_langchain_docs(
        documents=documents,
        testset_size=8,
        transforms=[my_splitter,my_headlines, my_summary, my_keys] # Include all three
    )

    df = testset.to_pandas()
    print(df.head())
    df.to_csv("testset_qwen.csv", index=False)

await generate_testset()


INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5




Applying HeadlineSplitter:   0%|          | 0/1 [00:00<?, ?it/s]

ERROR:ragas.async_utils:Task failed with ValueError: 'headlines' property not found in this node


ValueError: 'headlines' property not found in this node