In [1]:
from transformers import pipeline
generator = pipeline(model='PORTULAN/gervasio-8b-portuguese-ptpt-decoder')
generator("A comida portuguesa é", max_new_tokens=10)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:  37%|###7      | 2.98G/7.98G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:  40%|####      | 3.32G/8.30G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:  30%|##9       | 2.09G/7.00G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Device set to use cuda:0


[{'generated_text': 'A comida portuguesa é uma das mais ricas e variadas do mundo'}]

In [2]:
generator("A comida portuguesa é")

[{'generated_text': 'A comida portuguesa é uma das mais ricas e variadas do mundo, com uma rica herança gastronómica que reflete a influência de diferentes culturas. A culinária portuguesa é conhecida por seus pratos típicos, como o bacalhau à brás, o frango grelhado e o arroz de tamboril, que são apreciados em todo o mundo.\nA comida portuguesa é uma combinação de sabores, texturas e temperos, com uma ênfase na utilização de ingredientes frescos e locais. Os pratos portugueses são frequentemente servidos em conjunto, com uma variedade de acompanhamentos, como saladas, legumes e panes.\nA culinária portuguesa é também conhecida por suas sobremesas, como o pastel de nata, que é uma especialidade portuguesa feita com ovos, açúcar e leite. Outra sobremesa popular é o arroz doce, que é uma mistura de arroz, açúcar e frutas.\nAlém disso, a culinária portuguesa é influenciada pela sua história'}]

In [None]:
from minirag.utils import calculate_similarity_score  # legacy helper (returns indices) – not used now
from nltk.metrics import edit_distance
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
import numpy as np

# (Optionally create and reuse these globally to avoid re-loading each call)
_BERT_MODEL = None
_ROUGE = None
_SMOOTH = SmoothingFunction().method1

def calculate_best_similarity(sentences: list[str], target: str, method="levenshtein", n=1):
    """
    Returns the highest similarity score (float) between any sentence in `sentences` and `target`.
    Methods: jaccard | levenshtein | rouge | bert | overlap | bleu
    For rouge, n=1 or 2 selects rouge-1 or rouge-2 F.
    """
    if not sentences:
        return 0.0
    tgt_tokens = target.lower().split()
    scores = []

    if method == "jaccard":
        tgt_set = set(tgt_tokens)
        for s in sentences:
            s_tokens = set(s.lower().split())
            inter = set(s_tokens).intersection(set(tgt_set))
            union = set(s_tokens).union(set(tgt_set))
            scores.append(len(inter) / len(union) if union else 0.0)

    elif method == "levenshtein":
        tgt_len = max(len(tgt_tokens), 1)
        for s in sentences:
            dist = edit_distance(tgt_tokens, s.lower().split())
            norm = max(tgt_len, len(s.split()))
            scores.append(1 - dist / norm if norm else 0.0)

    elif method == "rouge":
        global _ROUGE
        if _ROUGE is None:
            _ROUGE = Rouge()
        key = f"rouge-{n}"
        for s in sentences:
            r = _ROUGE.get_scores(s, target)
            scores.append(r[0].get(key, {}).get("f", 0.0))

    elif method == "bert":
        global _BERT_MODEL
        if _BERT_MODEL is None:
            _BERT_MODEL = SentenceTransformer("all-MiniLM-L6-v2")
        embeddings = _BERT_MODEL.encode(sentences + [target])
        tgt_vec = embeddings[-1]
        tgt_norm = np.linalg.norm(tgt_vec)
        for i in range(len(sentences)):
            v = embeddings[i]
            denom = (np.linalg.norm(v) * tgt_norm)
            scores.append(float(np.dot(v, tgt_vec) / denom) if denom else 0.0)

    elif method == "overlap":
        tgt_set = set(tgt_tokens)
        for s in sentences:
            s_set = set(s.lower().split())
            inter = s_set & tgt_set
            denom = min(len(s_set), len(tgt_set))
            scores.append(len(inter) / denom if denom else 0.0)

    elif method == "bleu":
        tgt_bleu = word_tokenize(target.lower())
        for s in sentences:
            s_bleu = word_tokenize(s.lower())
            scores.append(sentence_bleu([tgt_bleu], s_bleu, smoothing_function=_SMOOTH))
    else:
        raise ValueError("Unsupported method.")

    return max(scores) if scores else 0.0

answer = calculate_best_similarity(["quick fox"], "the quick brown fox", method="jaccard")

print(answer)

0.5


In [27]:
import os, random, string, time
from csv import DictWriter
from datetime import datetime

def rand_text(n=8):
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(n))

def make_random_row(i):
    return {
        "question": f"Q{i} {rand_text(5)}?",
        "gold": f"Gold answer {rand_text(6)}",
        "answer": f"Model answer {rand_text(7)}",
        "latency_s": round(random.uniform(0.01, 0.5), 4),
        "exact": random.choice([0,1]),
        "substring": random.choice([0,1]),
        "token_recall": round(random.random(), 3),
        "jaccard": round(random.random(), 3),
        "levenshtein": round(random.random(), 3),
        "rouge1_f": round(random.random(), 3),
        "rouge2_f": round(random.random(), 3),
        "overlap": round(random.random(), 3),
        "bleu": round(random.random(), 3),
        "bert_cos": round(random.uniform(0.3, 0.95), 3),
    }

# create N random rows
N = 5
rows = [make_random_row(i+1) for i in range(N)]
OUTPUT_CSV_PATH = r"C:\Users\Francisco Azeredo\OneDrive\Documents\tecnico\5 ano\tese\Código\MiniRAG\notebooks"  # set to None to skip saving
mode = "test"
os.makedirs(OUTPUT_CSV_PATH, exist_ok=True)
OUTPUT_CSV = os.path.join(OUTPUT_CSV_PATH, f"results_{mode}.csv")
# Optional CSV
if OUTPUT_CSV and rows:
    write_header = not os.path.exists(OUTPUT_CSV)
    with open(OUTPUT_CSV, 'x', encoding='utf-8', newline='') as f:
        writer = DictWriter(f, fieldnames=list(rows[0].keys()))
        if write_header: writer.writeheader()
        writer.writerows(rows)
    print(f"Saved results to {OUTPUT_CSV}")

Saved results to C:\Users\Francisco Azeredo\OneDrive\Documents\tecnico\5 ano\tese\Código\MiniRAG\notebooks\results_test.csv


In [None]:
from minirag.llm.openai import openai_complete
import os
import dotenv

dotenv.load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY not set in environment. Set it before running this cell.")

# Correct usage: pass api_key directly (don't wrap inside a kwargs dict)
# Choose a currently supported model name. (gpt-4o-mini shown; adjust if needed.)
response = await openai_complete(
    "Hello, world!",
    model_name="gpt-3.5-turbo",
    api_key=api_key,
)
print(response)

Hello! How can I assist you today?


: 

In [4]:
# Cell 0: RAG Initialization (Run First)
# -------------------------------------
# Loads embedding model, builds embedding_func, and instantiates a MiniRAG object.
# Does NOT ingest documents. Use the next cell to index.

import os, torch, sys
import minirag
from transformers import AutoTokenizer, AutoModel
from minirag.llm.hf import hf_embed
from minirag.utils import EmbeddingFunc
from minirag.llm import ollama
from minirag.llm.openai import openai_complete
from minirag import MiniRAG
from tqdm.auto import tqdm
import dotenv

dotenv.load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY not set in environment. Set it before running this cell.")

sys.path.append(r'c:\Users\Francisco Azeredo\OneDrive\Documents\tecnico\5 ano\tese\Código\Chatbot\lightrag')

# Core configuration (shared by later cells)
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
WORKING_DIR = r"C:\\Users\\Francisco Azeredo\\OneDrive\\Documents\\tecnico\\5 ano\\tese\\Código\\MiniRAG\\notebooks\\storage"
LLM_MODEL_NAME = "qwen2m:latest"  # set to None if no local Ollama model
LOG_LEVEL = "INFO"

os.makedirs(WORKING_DIR, exist_ok=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Init device:", device)

print("Loading embedding tokenizer/model...")
_tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL)
_embed_model = AutoModel.from_pretrained(EMBEDDING_MODEL).to(device)
_embed_model.eval()

async def _embed_batch(texts: list[str]):
    return await hf_embed(texts, tokenizer=_tokenizer, embed_model=_embed_model)

async def _embed_dispatch(input_text):
    if isinstance(input_text, str):
        return (await _embed_batch([input_text]))[0]


        
    if isinstance(input_text, (list, tuple)) and all(isinstance(t, str) for t in input_text):
        return await _embed_batch(list(input_text))
    raise TypeError(f"Unsupported input type for embedding_func: {type(input_text)}")

_embedding_func = EmbeddingFunc(
    embedding_dim=_embed_model.config.hidden_size,
    max_token_size=_tokenizer.model_max_length,
    func = lambda texts: hf_embed(texts, tokenizer=_tokenizer, embed_model=_embed_model),
)
rag = minirag.MiniRAG(
    working_dir=WORKING_DIR,
    llm_model_func=ollama.ollama_model_complete if LLM_MODEL_NAME else None,
    llm_model_name=LLM_MODEL_NAME,
    embedding_func=_embedding_func,
    log_level=LOG_LEVEL,
    suppress_httpx_logging=True
)
print("RAG initialized.")

Init device: cuda
Loading embedding tokenizer/model...


INFO:minirag:Logger initialized for working directory: C:\\Users\\Francisco Azeredo\\OneDrive\\Documents\\tecnico\\5 ano\\tese\\Código\\MiniRAG\\notebooks\\storage
INFO:minirag:Load KV json_doc_status_storage with 0 data
INFO:minirag:Load KV llm_response_cache with 0 data
INFO:minirag:Load KV full_docs with 28 data
INFO:minirag:Load KV text_chunks with 34 data
INFO:minirag:Load KV json_doc_status_storage with 0 data
INFO:minirag:Load KV llm_response_cache with 0 data
INFO:minirag:Load KV full_docs with 28 data
INFO:minirag:Load KV text_chunks with 34 data
INFO:minirag:Loaded graph from C:\\Users\\Francisco Azeredo\\OneDrive\\Documents\\tecnico\\5 ano\\tese\\Código\\MiniRAG\\notebooks\\storage\graph_chunk_entity_relation.graphml with 600 nodes, 718 edges
INFO:nano-vectordb:Load (591, 384) data
INFO:nano-vectordb:Init {'embedding_dim': 384, 'metric': 'cosine', 'storage_file': 'C:\\\\Users\\\\Francisco Azeredo\\\\OneDrive\\\\Documents\\\\tecnico\\\\5 ano\\\\tese\\\\Código\\\\MiniRAG\\\\no

RAG initialized.


In [None]:
from minirag import MiniRAG
from PyPDF2 import PdfReader
import os
from io import BytesIO

# NOTE:
# The previous version treated the *string path* as if it were an async file object (await file.read()).
# A str has no .read(), hence AttributeError. We don't need async here; just open the path.

def extract_pdf_text(file):
    """Return full extracted text from a PDF.

    Accepts either:
      - a path string / PathLike pointing to a PDF file
      - a file-like object with .read() returning bytes or str
    """
    # Case 1: path provided
    if isinstance(file, (str, os.PathLike)):
        with open(file, 'rb') as f:
            reader = PdfReader(f)
            return "\n".join((page.extract_text() or "") for page in reader.pages)

    # Case 2: file-like object
    if hasattr(file, 'read'):
        data = file.read()
        if isinstance(data, str):  # if someone passed text, re-encode
            data = data.encode('utf-8')
        reader = PdfReader(BytesIO(data))
        return "\n".join((page.extract_text() or "") for page in reader.pages)

    raise TypeError(f"Unsupported file parameter type: {type(file)}. Provide a path or file-like object.")


async def index_pdf(file, rag: MiniRAG):
    text = extract_pdf_text(file)
    rag.insert(text)

file = r"C:\\Users\\Francisco Azeredo\\OneDrive\\Documents\\tecnico\\5 ano\\tese\\Código\\MiniRAG\\notebooks\\O Acesso a documentos administrativos.pdf"

# Run indexing (sync now; no await needed)
await index_pdf(file, rag)
print("PDF indexed into RAG.")

RuntimeError: This event loop is already running

: 