# QA comparativo (DOCX + PDF) — v2 (Relatórios claros)

Mesma lógica da v2 (top_k=5, contexto=1600, pesos 0.6/0.4), com **CSVs e relatório mais legíveis**.

## 1) Instalação

In [1]:

!pip -q install transformers>=4.41.0 sentence-transformers>=2.7.0 torch python-docx pypdf pdfplumber pandas numpy scikit-learn tqdm unidecode nltk rapidfuzz
import nltk
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True)


## 2) Imports e diretórios

In [2]:

import os, re, time, json, math, io, random
import numpy as np
import pandas as pd
from typing import List, Dict

from docx import Document
from pypdf import PdfReader
import pdfplumber

from transformers import pipeline, AutoTokenizer
import torch

from sentence_transformers import SentenceTransformer
from rapidfuzz.fuzz import token_set_ratio
from unidecode import unidecode

BASE_DIR = '/content' if os.path.exists('/content') else '/mnt/data'
DATASETS_DIR = os.path.join(BASE_DIR, 'datasets')
OUTPUTS_DIR  = os.path.join(BASE_DIR, 'outputs')
for d in [DATASETS_DIR, OUTPUTS_DIR]:
    os.makedirs(d, exist_ok=True)

print('BASE_DIR:', BASE_DIR)
print('DATASETS_DIR:', DATASETS_DIR)
print('OUTPUTS_DIR:', OUTPUTS_DIR)

def set_seeds(seed=42):
    random.seed(seed)
    np.random.seed(seed)
set_seeds()


BASE_DIR: /content
DATASETS_DIR: /content/datasets
OUTPUTS_DIR: /content/outputs


## 3) Upload dos arquivos (se necessário)

In [3]:

try:
    from google.colab import files
    IN_COLAB = True
except:
    IN_COLAB = False

needed = {
    'docx': os.path.join(DATASETS_DIR, 'DICIONARIO_DE_DADOS.docx'),
    'pdf':  os.path.join(DATASETS_DIR, 'doencas-respiratorias.pdf')
}
for k,p in needed.items():
    print(f'{k}:', 'OK' if os.path.exists(p) else 'FALTA')

if IN_COLAB:
    to_upload = [k for k,p in needed.items() if not os.path.exists(p)]
    if to_upload:
        print('\nEnvie os arquivos requeridos (DOCX e PDF)...')
        up = files.upload()
        import shutil
        for name in up.keys():
            src = name
            dst = os.path.join(DATASETS_DIR, os.path.basename(name))
            if src != dst:
                shutil.move(src, dst)
        print('Upload concluído.')
        for k,p in needed.items():
            print(f'{k}:', 'OK' if os.path.exists(p) else 'FALTA')
else:
    print("Execução local detectada. Coloque os arquivos em datasets/.")


docx: FALTA
pdf: OK

Envie os arquivos requeridos (DOCX e PDF)...


Saving DICIONARIO_DE_DADOS.docx to DICIONARIO_DE_DADOS.docx
Upload concluído.
docx: OK
pdf: OK


## 4) Leitura e pré-processamento

In [4]:

def clean_text(s: str) -> str:
    if not s:
        return ""
    s = re.sub(r'-\n', '', s)
    s = re.sub(r'(?<=[a-z])\n(?=[a-z])', '', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def load_docx(path: str) -> str:
    doc = Document(path)
    paras = []
    for p in doc.paragraphs:
        t = (p.text or '').strip()
        if t:
            paras.append(t)
    for table in doc.tables:
        for row in table.rows:
            cells = [c.text.strip() for c in row.cells]
            if any(cells):
                paras.append(' | '.join(cells))
    return '\n'.join(paras)

def load_pdf(path: str) -> str:
    text_pages = []
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                t = page.extract_text() or ''
                t = t.strip()
                if t:
                    text_pages.append(t)
        if text_pages:
            return '\n'.join(text_pages)
    except Exception:
        pass
    try:
        reader = PdfReader(path)
        for page in reader.pages:
            t = page.extract_text() or ''
            t = t.strip()
            if t:
                text_pages.append(t)
    except Exception:
        pass
    return '\n'.join(text_pages)

DOCX_PATH = os.path.join(DATASETS_DIR, 'DICIONARIO_DE_DADOS.docx')
PDF_PATH  = os.path.join(DATASETS_DIR, 'doencas-respiratorias.pdf')
docx_text = clean_text(load_docx(DOCX_PATH)) if os.path.exists(DOCX_PATH) else ''
pdf_text  = clean_text(load_pdf(PDF_PATH)) if os.path.exists(PDF_PATH) else ''

print('DOCX chars:', len(docx_text))
print('PDF  chars:', len(pdf_text))


DOCX chars: 205617
PDF  chars: 311006


## 5) Chunking

In [5]:

from transformers import AutoTokenizer

def chunk_text(text: str, max_tokens: int = 210, overlap_tokens: int = 35, model_name: str = 'distilbert-base-uncased') -> List[str]:
    if not text:
        return []
    tok = AutoTokenizer.from_pretrained(model_name)
    words = text.split()
    chunks, start = [], 0
    while start < len(words):
        piece_count, end = 0, start
        while end < len(words):
            piece_count += len(tok.tokenize(words[end]))
            if piece_count > max_tokens:
                break
            end += 1
        chunk = " ".join(words[start:end]).strip()
        if chunk:
            chunks.append(chunk)
        if end >= len(words):
            break
        back = end - start
        overlap = min(overlap_tokens, back // 2 if back > 0 else 0)
        start = max(0, end - overlap)
        if start == 0 and chunks:
            start = end
    return chunks

tok_ref = 'distilbert-base-uncased'
chunk_docx = chunk_text(docx_text, max_tokens=210, overlap_tokens=35, model_name=tok_ref) if docx_text else []
chunk_pdf  = chunk_text(pdf_text,  max_tokens=210, overlap_tokens=35, model_name=tok_ref) if pdf_text else []
print(f'Chunks DOCX: {len(chunk_docx)} | Chunks PDF: {len(chunk_pdf)}')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Chunks DOCX: 589 | Chunks PDF: 924


## 6) Embeddings e retrieval

In [6]:

from sentence_transformers import SentenceTransformer

def normalize_for_match(s: str) -> str:
    if not s:
        return ""
    s = s.lower()
    s = unidecode(s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def get_embedding_model(name: str = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    return SentenceTransformer(name, device=device)

def build_embeddings(chunks: List[str], model: SentenceTransformer, batch_size: int = 32) -> np.ndarray:
    if not chunks:
        return np.zeros((0, model.get_sentence_embedding_dimension()))
    return model.encode(chunks, batch_size=batch_size, convert_to_numpy=True, show_progress_bar=False)

def encode_texts(texts: List[str], model: SentenceTransformer) -> np.ndarray:
    if not texts:
        return np.zeros((0, 1))
    return model.encode(texts, convert_to_numpy=True, show_progress_bar=False)

def retrieve(question: str, emb_matrix: np.ndarray, chunks: List[str], emb_model, top_k: int = 5) -> List[Dict]:
    if emb_matrix is None or len(emb_matrix) == 0 or not chunks:
        return []
    q_emb = encode_texts([question], emb_model)[0]
    sims = emb_matrix @ q_emb / (np.linalg.norm(emb_matrix, axis=1) * (np.linalg.norm(q_emb) + 1e-9) + 1e-9)
    idxs = np.argsort(-sims)[:top_k]
    return [{"index": int(i), "similarity": float(sims[i]), "text": chunks[i]} for i in idxs]

emb_model = get_embedding_model('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')
emb_docx = build_embeddings(chunk_docx, emb_model)
emb_pdf  = build_embeddings(chunk_pdf,  emb_model)
print('Embeddings docx:', emb_docx.shape, '| pdf:', emb_pdf.shape)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings docx: (589, 384) | pdf: (924, 384)


## 7) Modelos HF (3)

In [7]:

def _device():
    return 0 if torch.cuda.is_available() else -1

models = {}
errs = {}
def add_model(key, task, model_name, tok_name=None):
    try:
        pipe = pipeline(task, model=model_name, tokenizer=tok_name or model_name, device=_device())
        models[key] = pipe
        print(f'[OK] {key}: {model_name}')
    except Exception as e:
        models[key] = {'error': str(e)}
        errs[key] = str(e)
        print(f'[ERRO] {key}:', e)

add_model('pierreguillou', 'question-answering', 'pierreguillou/bert-base-cased-squad-v1.1-portuguese')
add_model('mrm8488',      'question-answering', 'mrm8488/bert-base-portuguese-cased-finetuned-squad-v1-pt')
add_model('timpal0l',     'question-answering', 'timpal0l/mdeberta-v3-base-squad2')
print('Modelos:', list(models.keys()), '| Erros:', errs if errs else 'nenhum')


config.json:   0%|          | 0.00/862 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/494 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


[OK] pierreguillou: pierreguillou/bert-base-cased-squad-v1.1-portuguese


config.json:   0%|          | 0.00/691 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at mrm8488/bert-base-portuguese-cased-finetuned-squad-v1-pt were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


[OK] mrm8488: mrm8488/bert-base-portuguese-cased-finetuned-squad-v1-pt


config.json:   0%|          | 0.00/879 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Device set to use cpu


[OK] timpal0l: timpal0l/mdeberta-v3-base-squad2
Modelos: ['pierreguillou', 'mrm8488', 'timpal0l'] | Erros: nenhum


## 8) Perguntas e conceitos

In [8]:

QUESTIONS = {
    'Q1': {'question': 'Qual o nome do campo que armazena o nome do profissional na tabela LFCES018?', 'document': 'docx', 'concepts_key': 'Q1'},
    'Q2': {'question': 'Qual tabela local equivale à tabela RL_ESTAB_EQP_UNID_APOIO no banco de produção Federal?', 'document': 'docx', 'concepts_key': 'Q2'},
    'Q3': {'question': 'Tomando como base a tabela RL_ESTAB_SAMU, qual o campo equivalente à CO_DESATIVACAO no banco local?', 'document': 'docx', 'concepts_key': 'Q3'},
    'Q4': {'question': 'Quais os procedimentos para o tratamento de crise de asma na sala de emergência?', 'document': 'pdf', 'concepts_key': 'Q4'},
    'Q5': {'question': 'Quais são os benefícios notáveis após a cessação do Tabagismo?', 'document': 'pdf', 'concepts_key': 'Q5'},
    'Q6': {'question': 'Quais são os fatores de risco da ASMA?', 'document': 'pdf', 'concepts_key': 'Q6'}
}

CONCEPTS = {
  "Q1": ["nome_prof", "no_profissional"],
  "Q2": ["lfces095"],
  "Q3": ["cod_desativ"],
  "Q4": ["oxigênio", "broncodilatador", "beta2", "ipratropio", "corticoide", "sulfato magnésio", "monitoramento", "reavaliação", "crise moderada", "crise grave", "internação", "alta"],
  "Q5": ["função pulmonar", "sintomas respiratórios", "redução risco câncer", "dpoc", "doenças cardiovasculares", "infecções respiratórias"],
  "Q6": ["genéticos", "obesidade", "sexo masculino", "poeira", "baratas", "vírus sincicial", "rinovírus"]
}


## 9) Métricas e pesos (0.6/0.4)

In [9]:

def key_concepts_recall(answer: str, concepts: List[str], thresh: float = 0.66) -> float:
    if not answer or not concepts:
        return 0.0
    ans = normalize_for_match(answer)
    hits = sum(1 for c in concepts if (token_set_ratio(ans, normalize_for_match(c)) / 100.0) >= thresh)
    return hits / max(1, len(concepts))

def encode_texts(texts: List[str], model: SentenceTransformer) -> np.ndarray:
    if not texts:
        return np.zeros((0, 1))
    return model.encode(texts, convert_to_numpy=True, show_progress_bar=False)

def calc_similarity_embedding(answer: str, reference_text: str, emb_model) -> float:
    if not answer or not reference_text:
        return 0.0
    vecs = encode_texts([answer, reference_text], emb_model)
    a, b = vecs[0], vecs[1]
    num = float(np.dot(a, b))
    den = float((np.linalg.norm(a) * np.linalg.norm(b)) + 1e-9)
    return num / den

WEIGHTS = {
    'embedding_answer_vs_ref': 0.6,
    'key_concepts_recall': 0.4
}


## 10) Execução e coleta de métricas

In [10]:

def run_all(questions_dict: Dict[str, Dict], concepts: Dict[str, List[str]], models: Dict[str, any],
            emb_model, chunk_map: Dict[str, List[str]], emb_map: Dict[str, np.ndarray],
            top_k: int = 5, context_char_limit: int = 1600) -> pd.DataFrame:
    rows, row_id = [], 0
    for qid, meta in questions_dict.items():
        q = meta['question']
        doc_key = meta['document']
        cpts = concepts.get(meta['concepts_key'], [])
        chunks = chunk_map.get(doc_key, [])
        embs   = emb_map.get(doc_key, np.zeros((0,1)))
        t0 = time.time()
        # retrieval
        def retrieve(question: str, emb_matrix: np.ndarray, chunks: List[str], emb_model, top_k: int = top_k) -> List[Dict]:
            if emb_matrix is None or len(emb_matrix) == 0 or not chunks:
                return []
            q_emb = encode_texts([question], emb_model)[0]
            sims = emb_matrix @ q_emb / (np.linalg.norm(emb_matrix, axis=1) * (np.linalg.norm(q_emb) + 1e-9) + 1e-9)
            idxs = np.argsort(-sims)[:top_k]
            return [{"index": int(i), "similarity": float(sims[i]), "text": chunks[i]} for i in idxs]

        retrieved = retrieve(q, embs, chunks, emb_model, top_k=top_k)
        t_retr = time.time() - t0
        if not retrieved:
            continue
        # contexto maior
        ctx_parts, tot = [], 0
        for r in retrieved:
            t = r['text']
            if tot + len(t) > context_char_limit:
                break
            ctx_parts.append(t)
            tot += len(t)
        context = "\n".join(ctx_parts)
        top_sim = retrieved[0]['similarity']
        for mname, qa in models.items():
            try:
                t1 = time.time()
                out = qa(question=q, context=context)
                tqa = time.time() - t1
                ans = out.get('answer', '')
                # MAX semântico no top-k + evidência
                if ans:
                    sims = [calc_similarity_embedding(ans, r['text'], emb_model) for r in retrieved]
                    emb_sim = max(sims) if sims else 0.0
                    best_idx = int(np.argmax(sims)) if sims else 0
                    evidence = retrieved[best_idx]['text']
                else:
                    emb_sim, evidence = 0.0, ''
                # cobertura de conceitos
                hits = 0
                for c in cpts:
                    score = token_set_ratio(normalize_for_match(ans), normalize_for_match(c)) / 100.0
                    if score >= 0.66:
                        hits += 1
                k_recall = (hits / max(1, len(cpts))) if cpts else 0.0
                final_score = WEIGHTS['embedding_answer_vs_ref'] * emb_sim + WEIGHTS['key_concepts_recall'] * k_recall
                rows.append({
                    'document': doc_key, 'question_id': qid, 'question': q, 'model': mname,
                    'answer': ans, 'evidence_ref': evidence[:350],
                    'embedding_answer_vs_ref': emb_sim, 'key_concepts_recall': k_recall,
                    'coverage_count': hits, 'coverage_total': len(cpts),
                    'final_composite_score': final_score,
                    'time_seconds': t_retr + tqa,
                    'answer_len': len(ans), 'context_len': len(context)
                })
            except Exception as e:
                rows.append({
                    'document': doc_key, 'question_id': qid, 'question': q, 'model': mname,
                    'answer': '', 'evidence_ref': '', 'embedding_answer_vs_ref': 0.0,
                    'key_concepts_recall': 0.0, 'coverage_count': 0, 'coverage_total': len(cpts),
                    'final_composite_score': 0.0, 'time_seconds': t_retr, 'answer_len': 0, 'context_len': len(context),
                })
    return pd.DataFrame(rows)

# build maps and run
from sentence_transformers import SentenceTransformer
emb_base = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1', device=('cuda' if torch.cuda.is_available() else 'cpu'))
emb_map = {'docx': emb_base.encode(chunk_docx, convert_to_numpy=True) if chunk_docx else np.zeros((0,1)),
           'pdf': emb_base.encode(chunk_pdf, convert_to_numpy=True) if chunk_pdf else np.zeros((0,1))}
chunk_map = {'docx': chunk_docx, 'pdf': chunk_pdf}

df = run_all(QUESTIONS, CONCEPTS, models, emb_base, chunk_map, emb_map, top_k=5, context_char_limit=1600)
print('Linhas geradas:', len(df))
df.head(10)


Linhas geradas: 18


Unnamed: 0,document,question_id,question,model,answer,evidence_ref,embedding_answer_vs_ref,key_concepts_recall,coverage_count,coverage_total,final_composite_score,time_seconds,answer_len,context_len
0,docx,Q1,Qual o nome do campo que armazena o nome do pr...,pierreguillou,CEPS,Código do Módulo Assistencial (Conforme o plan...,0.181689,0.0,0,2,0.109013,2.910311,4,1172
1,docx,Q1,Qual o nome do campo que armazena o nome do pr...,mrm8488,VARCHAR(6) | | CO_CBO,NOME DO CAMPO | TIPO | FOREIGN KEY | PRIMARY K...,0.459394,0.0,0,2,0.275636,2.314133,21,1172
2,docx,Q1,Qual o nome do campo que armazena o nome do pr...,timpal0l,TABELA DE PROFISSIONAIS DA COMISSÃO,PROFISSIONAL NO SITE DO CNES) TABELA DE HORÁRI...,0.722194,0.0,0,2,0.433316,3.455239,36,1172
3,docx,Q2,Qual tabela local equivale à tabela RL_ESTAB_E...,pierreguillou,LFCES005,OBRIGATÓRIAS TABELA DE ATIVIDADES SECUNDÁRIAS ...,0.263306,1.0,1,1,0.557983,3.758609,8,1249
4,docx,Q2,Qual tabela local equivale à tabela RL_ESTAB_E...,mrm8488,TABELA DE TIPO DE INSTALAÇÃO FÍSICA PARA ASSIS...,OBRIGATÓRIAS TABELA DE ATIVIDADES SECUNDÁRIAS ...,0.662706,0.0,0,1,0.397624,1.450118,52,1249
5,docx,Q2,Qual tabela local equivale à tabela RL_ESTAB_E...,timpal0l,(BANCO LOCAL),OBRIGATÓRIAS TABELA DE ATIVIDADES SECUNDÁRIAS ...,0.5172,0.0,0,1,0.31032,3.135135,14,1249
6,docx,Q3,"Tomando como base a tabela RL_ESTAB_SAMU, qual...",pierreguillou,CONTA CORRENTE DO ESTAB,DO ESTABELECIMENTO TABELA DE CNES VÁLIDOS TABE...,0.459132,0.0,0,1,0.275479,3.198143,23,1266
7,docx,Q3,"Tomando como base a tabela RL_ESTAB_SAMU, qual...",mrm8488,ESF,DO ESTABELECIMENTO TABELA DE CNES VÁLIDOS TABE...,0.18371,0.0,0,1,0.110226,1.161774,3,1266
8,docx,Q3,"Tomando como base a tabela RL_ESTAB_SAMU, qual...",timpal0l,TABELA DE MOTIVO DE DESATIVAÇÃO,DE TIPO LOGRADOURO TABELA DE RAÇA TABELA DE CB...,0.751663,0.0,0,1,0.450998,3.526808,32,1266
9,pdf,Q4,Quais os procedimentos para o tratamento de cr...,pierreguillou,utilizando-se a menor dose que possa controlar...,devem serrealizadas em todos os casos de asma....,0.496128,0.0,0,12,0.297677,0.938082,58,1236


## 11) Relatórios claros (CSVs e Markdown)

In [12]:

def to_pct(x, nd=1):
    try:
        return f"{float(x)*100:.{nd}f}%"
    except:
        return ""

# Arredondar e produzir versões pretty
for c in ["embedding_answer_vs_ref","key_concepts_recall","final_composite_score"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce").round(4)

df["coverage_pct"] = (df["coverage_count"] / df["coverage_total"].replace(0, np.nan)).fillna(0.0)

# Detalhado
detalhado = df[["document","question_id","question","model","answer","evidence_ref",
                "embedding_answer_vs_ref","key_concepts_recall","coverage_count","coverage_total","coverage_pct",
                "final_composite_score","time_seconds","answer_len","context_len"]].copy()

detalhado_pretty = detalhado.copy()
for c in ["embedding_answer_vs_ref","key_concepts_recall","coverage_pct","final_composite_score"]:
    detalhado_pretty[c+"_pct"] = detalhado_pretty[c].apply(lambda v: to_pct(v, 1))

# Resumo comparativo
group_cols = ['document','question_id','model']
metrics = ['embedding_answer_vs_ref','key_concepts_recall','final_composite_score']
resumo = df.groupby(group_cols, dropna=False)[metrics].mean().reset_index()
resumo["rank_within_q"] = resumo.groupby("question_id")["final_composite_score"].rank(ascending=False, method="dense")

resumo_pretty = resumo.copy()
for c in ["embedding_answer_vs_ref","key_concepts_recall","final_composite_score"]:
    resumo_pretty[c+"_pct"] = resumo_pretty[c].apply(lambda v: to_pct(v, 1))

pivot = resumo.pivot_table(index="question_id", columns="model", values="final_composite_score", aggfunc="mean").reset_index()
winners = resumo.sort_values(["question_id","final_composite_score"], ascending=[True,False]).groupby("question_id").head(1)
winners = winners[["question_id","document","model","final_composite_score"]].rename(columns={"final_composite_score":"best_score"})
winners["best_score_pct"] = winners["best_score"].apply(lambda v: to_pct(v, 1))

# Salvar
det_num = os.path.join(OUTPUTS_DIR, "detalhado_resultados_v2.csv")
det_pre = os.path.join(OUTPUTS_DIR, "detalhado_resultados_v2_pretty.csv")
res_num = os.path.join(OUTPUTS_DIR, "resumo_comparativo_v2.csv")
res_pre = os.path.join(OUTPUTS_DIR, "resumo_comparativo_v2_pretty.csv")
piv_out = os.path.join(OUTPUTS_DIR, "pivot_por_pergunta_v2.csv")
win_out = os.path.join(OUTPUTS_DIR, "winners_v2.csv")
md_path = os.path.join(OUTPUTS_DIR, "relatorio_comparativo_v2.md")

detalhado.to_csv(det_num, index=False)
detalhado_pretty.to_csv(det_pre, index=False)
resumo.to_csv(res_num, index=False)
resumo_pretty.to_csv(res_pre, index=False)
pivot.to_csv(piv_out, index=False)
winners.to_csv(win_out, index=False)

with open(md_path, "w", encoding="utf-8") as f:
    f.write("# Relatório comparativo (v2)\n\n")
    f.write("**Pesos:** 0.6 `embedding_answer_vs_ref` + 0.4 `key_concepts_recall`.\n\n")
    f.write("**Interpretação:** 100% = alinhamento perfeito (semântica + cobertura de conceitos-chave).\n\n")
    f.write("## Melhores por pergunta\n\n")
    f.write(winners.to_markdown(index=False))
    f.write("\n\n## Resumo comparativo (média por pergunta/modelo)\n\n")
    f.write(resumo_pretty.sort_values(['question_id','rank_within_q']).to_markdown(index=False))
    f.write("\n\n## Observações\n")
    f.write("- `evidence_ref`: melhor trecho de evidência (top-k) usado para comparar a resposta.\n")
    f.write("- `coverage_pct`: quantos conceitos-chave foram cobertos na resposta.\n")
    f.write("- `final_composite_score`: combinação 0.6*semântica + 0.4*recall dos conceitos.\n")


print("Arquivos salvos em /content/outputs ou /mnt/data/outputs:")
for p in [det_num, det_pre, res_num, res_pre, piv_out, win_out, md_path]:
    print(" -", p)


Arquivos salvos em /content/outputs ou /mnt/data/outputs:
 - /content/outputs/detalhado_resultados_v2.csv
 - /content/outputs/detalhado_resultados_v2_pretty.csv
 - /content/outputs/resumo_comparativo_v2.csv
 - /content/outputs/resumo_comparativo_v2_pretty.csv
 - /content/outputs/pivot_por_pergunta_v2.csv
 - /content/outputs/winners_v2.csv
 - /content/outputs/relatorio_comparativo_v2.md
