In [33]:
from sentence_transformers.cross_encoder import CrossEncoder, CrossEncoderTrainer, losses
from datasets import Dataset

# Load the base cross-encoder from the `models/MiniLM-ms-marco-L2-v2` path.
model = CrossEncoder("models/MiniLM-ms-marco-L2-v2")

# Toy listwise training set, one record per query:
#   (query text, candidate documents, per-document labels)
# The label list is parallel to the document list; presumably 1 marks a
# relevant document and 0 an irrelevant one -- confirm against the loss docs.
training_examples = [
    (
        "What are pandas?",
        ["Pandas are a kind of bear.", "Pandas are kind of like fish."],
        [1, 0],
    ),
    (
        "What is the capital of France?",
        ["The capital of France is Paris.", "Paris is the capital of France.", "Paris is quite large."],
        [1, 1, 0],
    ),
]

# Pivot the records into the column layout: "query", "docs", "labels".
train_dataset = Dataset.from_dict(
    {
        "query": [query_text for query_text, _, _ in training_examples],
        "docs": [candidates for _, candidates, _ in training_examples],
        "labels": [relevance for _, _, relevance in training_examples],
    }
)

# RankNet loss bound to the model being fine-tuned.
loss = losses.RankNetLoss(model)

# No explicit training arguments are passed, so the trainer runs with its defaults.
trainer = CrossEncoderTrainer(model=model, train_dataset=train_dataset, loss=loss)

# Kept as the final expression so the notebook displays the returned TrainOutput.
trainer.train()




Step,Training Loss


TrainOutput(global_step=3, training_loss=0.6599229176839193, metrics={'train_runtime': 1.2103, 'train_samples_per_second': 4.957, 'train_steps_per_second': 2.479, 'total_flos': 0.0, 'train_loss': 0.6599229176839193, 'epoch': 3.0})

In [34]:
# Persist the fine-tuned model; the comparison cells below reload it from this path.
finetuned_model_dir = "models/ce-minilm-l2-ranknet-listwise"
model.save(finetuned_model_dir)


In [37]:
from sentence_transformers.cross_encoder import CrossEncoder

# Example data (same listwise format as the training set)
query = "What is the capital of France?"
docs = [
    "Paris is the capital of France.",
    "Marseille is a city in France.",
    "Berlin is the capital of Germany."
]

# The cross-encoder scores (query, document) pairs, one pair per candidate.
pairs = [(query, doc) for doc in docs]

# Load the pretrained (baseline) model and score the pairs
model_pretrained = CrossEncoder("models/MiniLM-ms-marco-L2-v2")
scores_pretrained = model_pretrained.predict(pairs)

# Load the fine-tuned model and score the same pairs
model_finetuned = CrossEncoder("models/ce-minilm-l2-ranknet-listwise")
scores_finetuned = model_finetuned.predict(pairs)

# Show the results side by side.
# Column widths are derived from the header labels so that every score cell is
# exactly as wide as its header; previously the rows padded the first score
# column to 17 characters while its header is 18 characters wide, which shifted
# the last column one character to the left of its heading.
doc_width = 40
pre_header = "Score (pretrained)"
fine_header = "Score (finetuned)"

print("Documento".ljust(doc_width), pre_header, fine_header)
for doc, score_pre, score_fine in zip(docs, scores_pretrained, scores_finetuned):
    # Right-align the numbers so signs and decimal points line up vertically.
    pre_cell = f"{score_pre:.4f}".rjust(len(pre_header))
    fine_cell = f"{score_fine:.4f}".rjust(len(fine_header))
    print(doc.ljust(doc_width), pre_cell, fine_cell)


Documento                                Score (pretrained) Score (finetuned)
Paris is the capital of France.          8.2788            8.3409
Marseille is a city in France.           -3.5645           -3.8661
Berlin is the capital of Germany.        -2.9833           -2.9342


In [40]:
from sentence_transformers.cross_encoder import CrossEncoder

# 1) Define the test queries and their candidate documents
test_data = {
    "What are pandas?": [
        "Pandas are a kind of bear.",
        "Pandas are kind of like fish."
    ],
    "What is the capital of France?": [
        "Paris is the capital of France.",
        "Marseille is a city in France.",
        "Berlin is the capital of Germany."
    ]
}

# 2) Flatten into (query, doc) pairs, in dict order, so that both models score
#    exactly the same sequence of pairs.
pairs_list = [(query, doc) for query, docs in test_data.items() for doc in docs]

# 3) Load the base model and the fine-tuned model (both on CPU)
model_pre = CrossEncoder("models/MiniLM-ms-marco-L2-v2", device="cpu")
model_fine = CrossEncoder("models/ce-minilm-l2-ranknet-listwise", device="cpu")

# 4) Compute the scores
scores_pre = model_pre.predict(pairs_list)
scores_fine = model_fine.predict(pairs_list)

# 5) Show the results grouped by query.
# The scores are in the same order as `pairs_list`, which was built by walking
# `test_data` in the same order as the loop below, so one shared iterator hands
# out the right score pair for each document.
doc_width = 50
pre_label = "Pretrained"
fine_label = "Finetuned"
score_rows = iter(zip(scores_pre, scores_fine))

for query, docs in test_data.items():
    heading = f"Query: {query}"
    print(f"\n{heading}")
    # Underline spans the full heading (the old `6 + len(query)` was one dash short).
    print("-" * len(heading))
    print(f"{'Document'.ljust(doc_width)}  {pre_label}    {fine_label}")
    for doc in docs:
        score_pre, score_fine = next(score_rows)
        # Fixed-width, right-aligned cells: a leading minus sign no longer
        # pushes the "Finetuned" column out of alignment.
        pre_cell = f"{score_pre:.4f}".rjust(len(pre_label))
        fine_cell = f"{score_fine:.4f}".rjust(len(fine_label))
        print(f"{doc.ljust(doc_width)}  {pre_cell}    {fine_cell}")



Query: What are pandas?
----------------------
Document                                            Pretrained    Finetuned
Pandas are a kind of bear.                          8.1072        8.1414
Pandas are kind of like fish.                       7.6291        6.9827

Query: What is the capital of France?
------------------------------------
Document                                            Pretrained    Finetuned
Paris is the capital of France.                     8.2788        8.3409
Marseille is a city in France.                      -3.5645        -3.8661
Berlin is the capital of Germany.                   -2.9833        -2.9342
