In [31]:
def load_documents(url: str, base_dir: str = "../app/data/documents"):
    """Load a pickled document collection from disk.

    Args:
        url: Identifier of the collection. It is converted with ``str()`` and
            used as the file stem, so ``"27"`` resolves to
            ``<base_dir>/27.pkl``.
        base_dir: Directory that holds the ``.pkl`` files. Defaults to the
            location that used to be hard-coded, so existing calls behave
            exactly as before.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        FileNotFoundError: If ``<base_dir>/<url>.pkl`` does not exist.
    """
    import pickle

    pickle_path = f"{base_dir}/{url!s}.pkl"
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # were produced by a trusted source.
    with open(pickle_path, "rb") as f:
        return pickle.load(f)

# Load document set "27", i.e. ../app/data/documents/27.pkl relative to the
# notebook's working directory.
documents = load_documents("27")

In [19]:
# Install the notebook's extra dependencies into the kernel's environment.
# NOTE(review): versions are unpinned; the recorded output shows rapidfuzz
# resolved to 3.13.0. Consider pinning for reproducible runs.
# NOTE(review): ipywidgets, widgetsnbextension and pandas-profiling are not
# referenced by any visible cell — confirm they are still needed.
%pip install ipywidgets widgetsnbextension pandas-profiling llama-index-llms-ollama rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.13.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (12 kB)
Downloading rapidfuzz-3.13.0-cp310-cp310-macosx_11_0_arm64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.13.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [32]:
from llama_index.llms.ollama import Ollama
from ragas.llms.base import LlamaIndexLLMWrapper

# Generator LLM: the deepseek-r1:32b model served through Ollama.
# request_timeout is presumably in seconds — confirm against the client docs.
llm_ollama = Ollama(
    model="deepseek-r1:32b",
    request_timeout=120.0,
)

# Wrap the LlamaIndex client so it can be handed to the ragas calls below.
llm = LlamaIndexLLMWrapper(llm_ollama)

In [33]:
from ragas.embeddings.base import LangchainEmbeddingsWrapper
from langchain_ollama import OllamaEmbeddings

# Embedding backend: the nomic-embed-text model served through Ollama, wrapped
# so it can be handed to the ragas calls below.
ollama_embeddings = OllamaEmbeddings(model="nomic-embed-text:latest")
embedding_model = LangchainEmbeddingsWrapper(ollama_embeddings)

In [34]:
from ragas.testset.transforms import Transforms, apply_transforms, default_transforms

# Build the default ragas transform pipeline for the loaded documents, using
# the LLM and embedding model configured above.
transforms = default_transforms(
    documents=list(documents),
    llm=llm,
    embedding_model=embedding_model,
)

In [35]:
from ragas.testset.graph import KnowledgeGraph, Node, NodeType

# Wrap every loaded document in a DOCUMENT node that carries its text and
# metadata, then seed the knowledge graph with those nodes.
nodes = [
    Node(
        type=NodeType.DOCUMENT,
        properties={
            "page_content": source_doc.page_content,
            "document_metadata": source_doc.metadata,
        },
    )
    for source_doc in documents
]

kg = KnowledgeGraph(nodes=nodes)

In [36]:
from ragas.testset.transforms import apply_transforms

# Run the transform pipeline over the knowledge graph. The result is not
# assigned and kg is saved afterwards, so this presumably enriches kg in
# place — confirm.
# NOTE(review): the recorded output shows repeated
# "unable to apply transformation:" messages with an empty reason during
# SummaryExtractor, at tens of seconds per item. Possibly LLM timeouts —
# investigate before trusting the saved graph.
apply_transforms(kg, transforms)

Applying SummaryExtractor:  17%|█▋        | 20/116 [18:51<1:22:09, 51.35s/it]unable to apply transformation: 
Applying SummaryExtractor:  18%|█▊        | 21/116 [21:13<2:04:31, 78.65s/it]unable to apply transformation: 
Applying SummaryExtractor:  19%|█▉        | 22/116 [21:36<1:37:10, 62.02s/it]unable to apply transformation: 
Applying SummaryExtractor:  20%|█▉        | 23/116 [21:49<1:13:12, 47.24s/it]unable to apply transformation: 
Applying SummaryExtractor:  21%|██        | 24/116 [22:02<56:48, 37.04s/it]  unable to apply transformation: 
Applying SummaryExtractor:  22%|██▏       | 25/116 [22:06<41:11, 27.16s/it]unable to apply transformation: 
Applying SummaryExtractor:  22%|██▏       | 26/116 [22:17<33:17, 22.20s/it]unable to apply transformation: 
Applying SummaryExtractor:  23%|██▎       | 27/116 [22:29<28:29, 19.21s/it]unable to apply transformation: 
Applying SummaryExtractor:  24%|██▍       | 28/116 [22:52<29:38, 20.21s/it]unable to apply transformation: 
Applying SummaryEx

In [37]:
# Persist the knowledge graph to the working directory. The file name
# presumably encodes the document set ("27") and the model size ("32b").
kg.save("kg-27-32b.json")

In [38]:
from ragas.testset.persona import generate_personas_from_kg

# Ask the LLM to derive 5 personas from the knowledge graph.
# NOTE(review): the recorded output contains the name
# 'Data Protection Officer' twice, so persona names are not unique —
# deduplicate if anything downstream keys on the name.
personas = generate_personas_from_kg(kg=kg, llm=llm, num_personas=5)

Generating personas: 100%|██████████| 5/5 [01:51<00:00, 22.25s/it]


In [39]:
# Display the generated personas for a quick visual check.
personas

[Persona(name='Data Protection Officer', role_description="Responsible for informing visitors about how their personal data is collected and processed on the school's website or services, ensuring technical and organizational measures are in place to protect user data."),
 Persona(name='Web Analytics Manager', role_description='Monitors and analyzes website traffic to improve user experience while ensuring compliance with privacy regulations.'),
 Persona(name='Data Privacy Officer', role_description='Ensures compliance with GDPR regulations regarding cookies and manages user consent for data collection.'),
 Persona(name='Data Protection Officer', role_description='Ensures compliance with GDPR regulations for data processing activities.'),
 Persona(name='Data Compliance Officer', role_description='Oversees compliance with contractual obligations for data sharing, ensuring proper monitoring of third-party transfers.')]

In [40]:
# Serialize each persona to its own JSON string via model_dump_json().
# NOTE(review): the elements are JSON *strings*, not dicts, so passing this
# list to json.dump produces a JSON array of strings that a reader has to
# parse a second time. Confirm the consumer expects that before changing it.
persona_json = [persona.model_dump_json() for persona in personas]

persona_json

['{"name":"Data Protection Officer","role_description":"Responsible for informing visitors about how their personal data is collected and processed on the school\'s website or services, ensuring technical and organizational measures are in place to protect user data."}',
 '{"name":"Web Analytics Manager","role_description":"Monitors and analyzes website traffic to improve user experience while ensuring compliance with privacy regulations."}',
 '{"name":"Data Privacy Officer","role_description":"Ensures compliance with GDPR regulations regarding cookies and manages user consent for data collection."}',
 '{"name":"Data Protection Officer","role_description":"Ensures compliance with GDPR regulations for data processing activities."}',
 '{"name":"Data Compliance Officer","role_description":"Oversees compliance with contractual obligations for data sharing, ensuring proper monitoring of third-party transfers."}']

In [41]:
import json

# Write the serialized personas to personas-32b.json in the working directory.
# persona_json holds already-encoded JSON strings, so the file contains a
# JSON array of strings.
with open("personas-32b.json", "w") as out_file:
    out_file.write(json.dumps(persona_json))
