In [2]:
import os
import dotenv
# Set library switches through the environment before any later cell imports
# the packages that read them.
# NOTE(review): presumably consumed by the document-loading and tokenizer
# dependencies — confirm which package reads each variable.
os.environ.update(
    {
        "AUTO_DOWNLOAD_NLTK": "false",
        "TIKTOKEN_CACHE_DIR": "/tmp/tiktoken_cache/",
    }
)

# Kept as the last expression so the cell still displays the call's return value.
dotenv.load_dotenv()

True

In [3]:
from langchain_community.document_loaders import DirectoryLoader

# Directory holding the source documents (relative path).
path = "test_docs/"
# The "**/*.md" glob selects Markdown files in `path` and in nested sub-directories.
loader = DirectoryLoader(path, glob="**/*.md")
# `docs` is reused by later cells, which read `page_content` and `metadata` from each item.
docs = loader.load()

  from .autonotebook import tqdm as notebook_tqdm
  return torch._C._cuda_getDeviceCount() > 0


In [4]:
from openai import OpenAI
from ragas.llms import llm_factory
from ragas.embeddings import OpenAIEmbeddings

# OpenAI-compatible client; both the key and the endpoint come from the environment.
# NOTE(review): os.getenv returns None when a variable is unset — confirm the
# client tolerates that, or provide the variables via the environment/.env file.
client = OpenAI(
    api_key=os.getenv('OPENAI_API_KEY'),  # a real key is not required
    base_url=os.getenv('OPENAI_API_BASE'),
)
# The same client backs both the generator LLM and the embedding model.
generator_llm = llm_factory("qwen235-thinking", provider="openai", client=client)
generator_embeddings = OpenAIEmbeddings(
    client=client,
    model="hosted_vllm/BAAI/bge-m3",  
)

In [5]:
from ragas.testset.graph import Node, NodeType
from ragas.testset.graph import KnowledgeGraph

# Start from an empty knowledge graph and add one DOCUMENT node per loaded
# document, storing its text and its metadata as node properties.
kg = KnowledgeGraph()
kg.nodes.extend(
    Node(
        type=NodeType.DOCUMENT,
        properties={
            "page_content": doc.page_content,
            "document_metadata": doc.metadata,
        },
    )
    for doc in docs
)

In [7]:
from ragas.testset.transforms import default_transforms, apply_transforms

# Reuse the generator's LLM and embedding model for the transform pipeline.
# `embedding_model` is also read by the later TestsetGenerator cell.
transformer_llm = generator_llm
embedding_model = generator_embeddings

# Build the default transform pipeline for these documents and run it against `kg`.
# The return value is not captured; the graph summary displayed afterwards
# reports more nodes than input documents, plus relationships.
trans = default_transforms(documents=docs, llm=transformer_llm, embedding_model=embedding_model)
apply_transforms(kg, trans)

Applying HeadlinesExtractor:   0%|          | 0/3 [00:00<?, ?it/s]

Applying HeadlinesExtractor: 100%|██████████| 3/3 [00:09<00:00,  3.12s/it]
Applying HeadlineSplitter: 100%|██████████| 3/3 [00:00<00:00, 111.92it/s]
Applying SummaryExtractor: 100%|██████████| 3/3 [00:22<00:00,  7.57s/it]
Applying CustomNodeFilter: 100%|██████████| 23/23 [00:53<00:00,  2.31s/it]
Applying EmbeddingExtractor: 100%|██████████| 3/3 [00:01<00:00,  2.27it/s]
Applying ThemesExtractor: 100%|██████████| 18/18 [01:02<00:00,  3.47s/it]
Applying NERExtractor: 100%|██████████| 18/18 [00:59<00:00,  3.28s/it]
Applying CosineSimilarityBuilder: 100%|██████████| 1/1 [00:00<00:00, 1250.54it/s]
Applying OverlapScoreBuilder: 100%|██████████| 1/1 [00:00<00:00, 198.46it/s]


In [8]:
# Display the graph summary (node and relationship counts) after the transforms.
kg

KnowledgeGraph(nodes: 21, relationships: 75)

In [9]:
# Persist the graph to disk and read it back. The recorded output shows the
# reloaded graph reporting the same node and relationship counts as `kg`.
kg.save("knowledge_graph.json")
loaded_kg = KnowledgeGraph.load("knowledge_graph.json")
loaded_kg

KnowledgeGraph(nodes: 21, relationships: 75)

In [10]:
from ragas.testset import TestsetGenerator

# Test-set generator wired to the generator LLM, the embedding model and the
# knowledge graph that was reloaded from disk.
generator = TestsetGenerator(llm=generator_llm, embedding_model=embedding_model, knowledge_graph=loaded_kg)

In [29]:
from ragas.testset.synthesizers import (
    SingleHopSpecificQuerySynthesizer,
    MultiHopAbstractQuerySynthesizer,
    MultiHopSpecificQuerySynthesizer,
)

def make_dist(generator_llm, llm_context: str | None = None):
    single = SingleHopSpecificQuerySynthesizer(llm=generator_llm, llm_context=llm_context)
    mh_abs = MultiHopAbstractQuerySynthesizer(llm=generator_llm, llm_context=llm_context)
    mh_spec = MultiHopSpecificQuerySynthesizer(llm=generator_llm, llm_context=llm_context)

    print(single.get_prompts().keys())

    instruction = """Generate ONLY Yes/No queries. Answer must be strictly faithful to context..."""
    p = single.get_prompts()["query_answer_generation_prompt"]
    p.instruction = instruction
    single.set_prompts(query_answer_generation_prompt=p)

    return [
        (single, 0.6),
        (mh_abs, 0.2),
        (mh_spec, 0.2),
    ]





In [1]:
# NOTE(review): scratch cell. Its execution count is In [1], lower than that of the
# cell importing the synthesizers, and the recorded result is a NameError —
# presumably it was run on a fresh kernel before the imports. Run the import
# cell first, or drop this cell before sharing the notebook.
MultiHopAbstractQuerySynthesizer(llm=generator_llm)

NameError: name 'MultiHopAbstractQuerySynthesizer' is not defined

In [30]:
# Echo the class to confirm the import resolved (the output shows its full module path).
SingleHopSpecificQuerySynthesizer

ragas.testset.synthesizers.single_hop.specific.SingleHopSpecificQuerySynthesizer

In [24]:
# Build the weighted synthesizer mix; `llm_context` is left at its default (None).
query_distribution = make_dist(generator_llm)

# NOTE(review): the progress log recorded for this cell shows the extractor,
# splitter and similarity-builder steps running again on the documents before
# any scenarios are generated, although a knowledge graph was already built and
# passed to `generator`. Calling `generator.generate(...)` (as a later cell
# does) may avoid the repeated work — confirm before switching.
testset = generator.generate_with_langchain_docs(
    docs,
    testset_size=100,
    query_distribution=query_distribution,
)

dict_keys(['query_answer_generation_prompt', 'themes_personas_matching_prompt'])


Applying HeadlinesExtractor:   0%|          | 0/3 [00:00<?, ?it/s]

Applying HeadlinesExtractor: 100%|██████████| 3/3 [00:09<00:00,  3.21s/it]
Applying HeadlineSplitter: 100%|██████████| 3/3 [00:00<00:00, 109.75it/s]
Applying SummaryExtractor: 100%|██████████| 3/3 [00:21<00:00,  7.12s/it]
Applying CustomNodeFilter: 100%|██████████| 23/23 [00:52<00:00,  2.30s/it]
  property_name, property_value = await self.extract(node)
Applying EmbeddingExtractor: 100%|██████████| 3/3 [00:01<00:00,  2.51it/s]
Applying ThemesExtractor: 100%|██████████| 17/17 [00:58<00:00,  3.46s/it]
Applying NERExtractor: 100%|██████████| 17/17 [00:57<00:00,  3.36s/it]
Applying CosineSimilarityBuilder: 100%|██████████| 1/1 [00:00<00:00, 1152.91it/s]
Applying OverlapScoreBuilder: 100%|██████████| 1/1 [00:00<00:00, 195.54it/s]
Generating Scenarios: 100%|██████████| 3/3 [02:01<00:00, 40.35s/it] 
Generating Samples: 100%|██████████| 100/100 [06:19<00:00,  3.80s/it]  


In [27]:
# Display the generated samples as a table for inspection.
testset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,persona_name,query_style,query_length,synthesizer_name
0,is oleg zaikin involved in the research on dia...,[Enumerating the Transversals for Diagonal Lat...,Yes,Dr. Elena Vasiliev,POOR_GRAMMAR,SHORT,single_hop_specific_query_synthesizer
1,Is Stepan Kochemazov a researcher involved in ...,[Enumerating the Transversals for Diagonal Lat...,Yes,Dr. Elena Vasiliev,PERFECT_GRAMMAR,SHORT,single_hop_specific_query_synthesizer
2,Is Stepan Kochemazov a researcher involved in ...,[Enumerating the Transversals for Diagonal Lat...,Yes,Dr. Elena Vasiliev,PERFECT_GRAMMAR,LONG,single_hop_specific_query_synthesizer
3,Is Oleg Zaikin involved in the enumeration of ...,[Enumerating the Transversals for Diagonal Lat...,Yes,Dr. Elena Vasiliev,MISSPELLED,MEDIUM,single_hop_specific_query_synthesizer
4,Is N the order of a Latin square in the given ...,[2 Preliminaries A Latin square (LS) of order ...,Yes,Dr. Elena Vasiliev,PERFECT_GRAMMAR,MEDIUM,single_hop_specific_query_synthesizer
...,...,...,...,...,...,...,...
95,how did volunter computing help in the enumear...,"[<1-hop>\n\n5 Related Work Papers [BR75, MR95,...",In the volunteer computing project Gerasim@hom...,,,,multi_hop_specific_query_synthesizer
96,how many transversals in a latin square?,[<1-hop>\n\n3 Algorithm for Enumerating Transv...,The number of transversals in a Latin square v...,,,,multi_hop_specific_query_synthesizer
97,how mant diagonal latn squres isotopy clsses f...,[<1-hop>\n\nEnumeration of Isotopy Classes of ...,The isotopy classes of diagonal Latin squares ...,,,,multi_hop_specific_query_synthesizer
98,What is the significance of the discovery of t...,[<1-hop>\n\ncalculation process on a client co...,The first pairs of orthogonal diagonal Latin s...,,,,multi_hop_specific_query_synthesizer


In [26]:
# Generate a smaller test set directly from the generator's knowledge graph.
# NOTE(review): on completion this rebinds `testset`, replacing the result of the
# earlier generation cell; the recorded run was stopped by a KeyboardInterrupt
# during scenario generation, so no result is shown for this cell.
testset = generator.generate(testset_size=10, query_distribution=query_distribution)
testset.to_pandas()


Generating Scenarios:   0%|          | 0/3 [00:14<?, ?it/s]


KeyboardInterrupt: 