In [None]:
import os
# Set both runtime knobs in one call, before any ML library is imported:
# tokenizers parallelism is switched off and OpenMP is limited to one thread.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "OMP_NUM_THREADS": "1",
})

In [None]:
import pandas as pd
import json
import os
import faiss
import logging
import numpy as np 
import torch
import gc
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Callable, Tuple

from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from ragas import evaluate
from ragas.metrics import context_precision, context_recall
from langchain_ollama import ChatOllama, OllamaEmbeddings
from ragas.run_config import RunConfig

# Logging setup: INFO level, "HH:MM:SS - LEVEL - message" lines.
# force=True replaces any handlers already attached to the root logger,
# so re-running the cell does not stack handlers.
logging.basicConfig(
    force=True,
    datefmt='%H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger("RAGEval_20_20")

# Keep tokenizers parallelism disabled for this cell as well.
os.environ.update(TOKENIZERS_PARALLELISM="false")

@dataclass
class EvalConfig:
    """Central settings for the evaluation campaign: dataset paths, Ollama judge and run parameters."""
    # Data paths
    path_ir_dataset: str = "bercy_test_10.jsonl"  # JSONL file read line by line for the IR phase
    path_ragas_dataset: str = "bercy_golden_dataset.jsonl"  # JSONL file loaded with pandas for the RAGAS phase
    output_dir: str = "./results_final_20_20"  # directory created at startup
    
    # LLM judge (Ollama)
    llm_judge: str = "llama3"  # chat model name passed to ChatOllama
    embedding_judge: str = "nomic-embed-text"  # embedding model name passed to OllamaEmbeddings
    ollama_url: str = "http://localhost:11434"  # base URL shared by both Ollama clients
    
    # Execution parameters
    batch_size: int = 1      # number of questions sent per RAGAS evaluate() call
    top_k: int = 5  # number of contexts retrieved per question from the FAISS index
    
    # Default instruction (single central reference), used in instruct mode when no explicit instruction is given
    default_instruction: str = "Retrieve the definition of an administrative acronym or term."

# Build the shared configuration with its default values and make sure the
# results directory exists (exist_ok=True keeps the cell safe to re-run).
config = EvalConfig()
os.makedirs(config.output_dir, exist_ok=True)
print("Configuration charg√©e.")

✅ Cellule 1 : Configuration chargée.


  from ragas.metrics import context_precision, context_recall
  from ragas.metrics import context_precision, context_recall


In [None]:
class RAGEvaluator:
    """Evaluate sentence-embedding models on retrieval quality.

    Each model goes through two phases:
      1. IR metrics (MRR@10 / NDCG@10) computed with InformationRetrievalEvaluator
         on the anchor/positive pairs of ``cfg.path_ir_dataset``.
      2. RAGAS context precision / recall, scored by an Ollama LLM judge on the
         top-k passages retrieved through a FAISS inner-product index.
    """

    def __init__(self, config: EvalConfig):
        """Create the Ollama judge clients and load the RAGAS dataset.

        Args:
            config: Evaluation settings (dataset paths, judge model names,
                Ollama URL, batch size, top-k, default instruction).

        Raises:
            Exception: whatever ``_load_ragas_data`` raises when the RAGAS
                dataset cannot be read.
        """
        self.cfg = config
        self.judge_llm = ChatOllama(model=self.cfg.llm_judge, temperature=0, base_url=self.cfg.ollama_url)
        self.eval_embeddings = OllamaEmbeddings(model=self.cfg.embedding_judge, base_url=self.cfg.ollama_url)
        # Single worker and a long timeout; this object is forwarded to every
        # ragas.evaluate() call below so that the settings actually apply.
        self.ragas_run_config = RunConfig(max_workers=1, timeout=360)
        self.ragas_data = self._load_ragas_data()

    def _load_ragas_data(self):
        """Read the RAGAS JSONL file and derive the de-duplicated passage corpus.

        Returns:
            dict with ``"df"`` (the raw DataFrame) and ``"corpus"`` (unique
            ground-truth passages, in first-seen order).

        Raises:
            Exception: re-raised unchanged after being logged.
        """
        logger.info("Chargement des donn√©es Ragas...")
        try:
            df = pd.read_json(self.cfg.path_ragas_dataset, lines=True)
            # A ground truth may be stored as a list; only its first element is used.
            passages = [c[0] if isinstance(c, list) else c for c in df['ground_truth']]
            # dict.fromkeys de-duplicates while keeping first-seen order, so the
            # corpus (and therefore FAISS ids and tie-breaking) is reproducible
            # from one run to the next, unlike set().
            corpus = list(dict.fromkeys(passages))
            return {"df": df, "corpus": corpus}
        except Exception as e:
            logger.critical(f"Erreur lecture dataset Ragas: {e}")
            raise

    def sanity_check(self, model_path: str, local_only: bool) -> bool:
        """Check that the embedding model loads and that the Ollama judge answers.

        Args:
            model_path: Local directory or hub identifier of the model.
            local_only: Passed as ``local_files_only`` to SentenceTransformer.

        Returns:
            True when both the test encoding and a one-row RAGAS evaluation
            succeed, False otherwise (the error is logged, not raised).
        """
        logger.info("Sanity Check...")
        try:
            # Model test
            model = SentenceTransformer(model_path, device="cpu", local_files_only=local_only)
            model.encode(["test"], convert_to_numpy=True)
            # Ollama test: raise_exceptions=True so a judge failure surfaces here
            ds = Dataset.from_dict({'question': ["t"], 'ground_truth': ["t"], 'contexts': [["t"]]})
            evaluate(
                ds, metrics=[context_precision], llm=self.judge_llm,
                embeddings=self.eval_embeddings, run_config=self.ragas_run_config,
                raise_exceptions=True,
            )
            logger.info("Syst√®me Sain.")
            return True
        except Exception as e:
            logger.critical(f" √âCHEC SANITY CHECK : {e}")
            return False

    def get_formatting_functions(self, model_alias: str, instruction: Optional[str] = None) -> Tuple[Callable, Callable, bool]:
        """Pick the query/document prefixing scheme from the model alias.

        The instruct branch is tested first, so an alias containing both
        "base" and "instruct" is treated as an instruct model. Matching is by
        plain substring on the lower-cased alias.

        Args:
            model_alias: Display name of the model.
            instruction: Explicit task description; when falsy,
                ``cfg.default_instruction`` is used in instruct mode.

        Returns:
            ``(format_query, format_document, normalize)`` where the first two
            are one-argument callables returning the text to encode and
            ``normalize`` tells whether embeddings should be L2-normalized.
        """
        alias = model_alias.lower()

        if instruction or "config" in alias or "ft" in alias or "instruct" in alias:
            task_desc = instruction if instruction else self.cfg.default_instruction
            prompt = f"Instruct: {task_desc}\nQuery: "
            logger.info(f"Mode INSTRUCT appliqu√©.\n  Prompt: '{prompt.strip()}...'")
            # Only queries receive the instruction; documents are encoded as-is.
            return (lambda q: f"{prompt}{q}"), (lambda d: d), True

        elif "e5" in alias and "base" in alias:
            logger.info("Mode E5 STANDARD (query:/passage:) appliqu√©.")
            return (lambda q: f"query: {q}"), (lambda d: f"passage: {d}"), True

        else:
            logger.warning("Mode BRUT (Pas de pr√©fixe).")
            return (lambda q: q), (lambda d: d), False

    def _run_ir_phase(self, model, alias: str, prefix_q: Callable, prefix_d: Callable) -> Dict[str, float]:
        """Compute MRR@10 and NDCG@10 on the IR dataset; both are 0.0 on failure."""
        logger.info("Phase IR (Retrieval metrics)...")
        # Both keys are always present so result rows share the same columns.
        scores = {"MRR@10": 0.0, "NDCG@10": 0.0}
        try:
            # Load the dataset; blank lines (e.g. a trailing newline) are skipped
            # instead of making json.loads fail for the whole phase.
            with open(self.cfg.path_ir_dataset, 'r', encoding='utf-8') as f:
                ir_data = [json.loads(line) for line in f if line.strip()]

            # One query and one relevant document per row.
            queries, corpus, rel_docs = {}, {}, {}
            for idx, row in enumerate(ir_data):
                qid, docid = str(idx), f"doc_{idx}"
                queries[qid] = prefix_q(row['anchor'])
                corpus[docid] = prefix_d(row['positive'])
                rel_docs[qid] = {docid}

            ir_eval = InformationRetrievalEvaluator(
                queries, corpus, rel_docs, name=alias[:10],
                show_progress_bar=False, mrr_at_k=[10], ndcg_at_k=[10]
            )
            ir_res = ir_eval(model)

            # Result keys carry a name/score-function prefix, hence the substring lookup.
            scores['MRR@10'] = next((v for k, v in ir_res.items() if 'mrr@10' in k.lower()), 0.0)
            scores['NDCG@10'] = next((v for k, v in ir_res.items() if 'ndcg@10' in k.lower()), 0.0)
            logger.info(f"IR Score: MRR@10 = {scores['MRR@10']:.4f}")

        except Exception as e:
            logger.error(f" Erreur Phase IR: {e}")
            scores = {"MRR@10": 0.0, "NDCG@10": 0.0}
        return scores

    def _run_ragas_phase(self, model, prefix_q: Callable, prefix_d: Callable, do_norm: bool) -> Dict[str, float]:
        """Retrieve top-k contexts with FAISS and score them with the RAGAS judge."""
        logger.info(" Phase RAGAS (LLM Judge)...")
        scores = {"Context Precision": 0.0, "Context Recall": 0.0}

        corpus = self.ragas_data['corpus']
        questions = self.ragas_data['df']['question'].tolist()
        ground_truths = self.ragas_data['df']['ground_truth'].tolist()
        if not corpus or not questions:
            # Nothing to index or nothing to ask: keep the 0.0 defaults.
            logger.warning("Corpus ou questions RAGAS vides, phase RAGAS non executee.")
            return scores

        # Corpus encoding
        logger.info("Indexation du corpus...")
        corpus_emb = model.encode(
            [prefix_d(d) for d in corpus], normalize_embeddings=do_norm,
            batch_size=8, show_progress_bar=True, convert_to_numpy=True
        )
        # FAISS only accepts contiguous float32 matrices.
        corpus_emb = np.ascontiguousarray(corpus_emb, dtype=np.float32)
        index = faiss.IndexFlatIP(corpus_emb.shape[1])
        index.add(corpus_emb)

        # Asking for more neighbours than indexed vectors makes FAISS pad the
        # result with -1, which would silently map to corpus[-1].
        top_k = min(self.cfg.top_k, len(corpus))
        # A non-positive step would make range() raise.
        batch_size = max(1, self.cfg.batch_size)
        ragas_batches = []

        for i in range(0, len(questions), batch_size):
            gc.collect()

            batch_q = questions[i : i + batch_size]
            batch_gt = ground_truths[i : i + batch_size]

            try:
                # Retrieval
                q_fmt = [prefix_q(q) for q in batch_q]
                q_emb = model.encode(q_fmt, normalize_embeddings=do_norm, convert_to_numpy=True)
                q_emb = np.ascontiguousarray(q_emb, dtype=np.float32)
                _, indices = index.search(q_emb, top_k)
                batch_ctx = [[corpus[idx] for idx in row if idx >= 0] for row in indices]

                # Judge evaluation; per-row failures become NaN rather than raising.
                ds = Dataset.from_dict({'question': batch_q, 'ground_truth': batch_gt, 'contexts': batch_ctx})
                res = evaluate(
                    ds, metrics=[context_precision, context_recall], llm=self.judge_llm,
                    embeddings=self.eval_embeddings, run_config=self.ragas_run_config,
                    raise_exceptions=False,
                )
                ragas_batches.append(res.to_pandas())

                if (i // batch_size) % 5 == 0:
                    logger.info(f"Progress: {i}/{len(questions)} queries")

            except Exception as e:
                # Best effort: a failing batch is logged and skipped.
                logger.error(f"Erreur Batch {i}: {e}")
                continue

        if ragas_batches:
            full_df = pd.concat(ragas_batches, ignore_index=True)
            # pandas mean() ignores NaN rows produced by failed judgements.
            scores['Context Precision'] = full_df['context_precision'].mean()
            scores['Context Recall'] = full_df['context_recall'].mean()
        return scores

    def evaluate_model(self, model_config: Dict):
        """Run the full evaluation (sanity check, IR phase, RAGAS phase) for one model.

        Args:
            model_config: dict with ``"path"`` and ``"alias"`` (required),
                ``"local_only"`` (optional, defaults to False) and
                ``"instruction"`` (optional).

        Returns:
            dict keyed by "Mod√®le" plus either the four metric columns
            ("MRR@10", "NDCG@10", "Context Precision", "Context Recall") or an
            "Error" entry when the sanity check or model loading failed.
        """
        path = model_config["path"]
        alias = model_config["alias"]
        local_only = model_config.get("local_only", False)
        custom_instruct = model_config.get("instruction", None)
        logger.info(f"\n{'='*60}\n √âVALUATION : {alias}\n{'='*60}")

        if not self.sanity_check(path, local_only):
            return {"Mod√®le": alias, "Error": "Sanity Check Failed"}

        # Formatting configuration
        prefix_q, prefix_d, do_norm = self.get_formatting_functions(alias, custom_instruct)

        try:
            model = SentenceTransformer(path, device="cpu", local_files_only=local_only)
        except Exception as e:
            logger.error(f"Erreur chargement mod√®le local: {e}")
            logger.info("Tentative de chargement sans restriction locale...")
            try:
                model = SentenceTransformer(path, device="cpu")
            except Exception as e2:
                return {"Mod√®le": alias, "Error": f"Load Error: {e2}"}

        metrics = {"Mod√®le": alias}
        try:
            # Information Retrieval (MRR/NDCG)
            metrics.update(self._run_ir_phase(model, alias, prefix_q, prefix_d))
            # RAGAS (Precision/Recall)
            metrics.update(self._run_ragas_phase(model, prefix_q, prefix_d, do_norm))
            logger.info(f"R√âSULTATS : {metrics}")
        finally:
            # Cleanup runs even when a phase raises, so a failed model does not
            # stay in memory while the next one is loaded.
            del model
            gc.collect()
            if torch.backends.mps.is_available():
                torch.mps.empty_cache()

        return metrics

# Global instance shared by the following cells; constructing it loads the
# RAGAS dataset and creates the Ollama judge clients.
evaluator = RAGEvaluator(config)
print("Moteur d'√©valuation charg√© (Correction NameError appliqu√©e).")

02:13:18 - INFO - 📂 Chargement des données Ragas...


✅ Cellule 2 : Moteur d'évaluation chargé (Correction NameError appliquée).


In [None]:


# Task instruction prepended to every query for the fine-tuned checkpoints.
INSTRUCTION_BERCY = "Retrieve the definition of an administrative acronym or term."

# Models evaluated by the campaign, in execution order. Each entry provides:
#   alias       - display name (also drives the prefixing mode)
#   path        - local directory or hub identifier
#   local_only  - True: load from disk only, skipped when the folder is missing
#   instruction - explicit task instruction, or None for the configured default
models_campaign = [
    # Fine-tuned, config 1
    {
        "alias": "E5-FT-Config1",
        "path": "./e5_large_finetuned_config1_merged",
        "local_only": True,
        "instruction": INSTRUCTION_BERCY
    },
    # Fine-tuned, config 2
    {
        "alias": "E5-FT-Config2",
        "path": "./e5_large_finetuned_config2_merged_v2",
        "local_only": True,
        "instruction": INSTRUCTION_BERCY
    },
    # Fine-tuned, config 3
    {
        "alias": "E5-FT-Config3",
        "path": "./e5_large_finetuned_config3_merged",
        "local_only": True,
        "instruction": INSTRUCTION_BERCY
    },
    # Fine-tuned, config 4
    {
        "alias": "E5-FT-Config4",
        "path": "./e5_large_finetuned_config4_merged_v2",
        "local_only": True,
        "instruction": INSTRUCTION_BERCY
    },
    # Base model (reference)
    {
        "alias": "E5-Base-Instruct",
        "path": "intfloat/multilingual-e5-large-instruct",
        "local_only": False,
        "instruction": None
    }
]
print(f"Campagne configur√©e avec {len(models_campaign)} mod√®les.")

✅ Cellule 3 : Campagne configurée avec 5 modèles.


In [None]:

# Accumulates one result dict per evaluated model; reset on every run of this cell.
all_results = []

print(f"Lancement sur {len(models_campaign)} mod√®les...\n")

for model_cfg in models_campaign:
    # Local-only checkpoints whose folder is absent are reported and skipped.
    folder_missing = model_cfg["local_only"] and not os.path.exists(model_cfg["path"])
    if folder_missing:
        print(f"Dossier introuvable pour {model_cfg['alias']} ({model_cfg['path']}). On passe.")
    else:
        res = evaluator.evaluate_model(model_cfg)
        all_results.append(res)
        # Checkpoint after each model so an interrupted campaign keeps its partial results.
        progress_table = pd.DataFrame(all_results)
        progress_table.to_csv("resultats_progressifs_20_20.csv", index=False)

print("\n Calculs termin√©s.")

12:52:19 - INFO - 
üöÄ √âVALUATION : E5-FT-Config1
12:52:19 - INFO - ü©∫ Sanity Check...
12:52:19 - INFO - Load pretrained SentenceTransformer: ./e5_large_finetuned_config1_merged


üöÄ Lancement de la campagne sur 5 mod√®les...



The tokenizer you are loading from './e5_large_finetuned_config1_merged' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:01<00:00,  1.20s/it]
Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]12:52:24 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:04<00:00,  4.56s/it]
12:52:26 - INFO - ‚úÖ Syst√®me Sain.
12:52:26 - INFO -    ‚öôÔ∏è  Mode INSTRUCT appliqu√©.
       üëâ Prompt: 'Instruct: Retrieve the definition of an administrative acronym or term.
Query:...'
12:52:26 - INFO - Load pretrained SentenceTransformer: ./e5_large_finetuned_config1_merged
The tokenizer you are loading from './e5_large_finetuned_config1_merged

KeyboardInterrupt: 

In [None]:
# Final report: show the results table and save it as CSV.

if not all_results:
    print("Aucun r√©sultat n'a √©t√© g√©n√©r√©.")
else:
    df_final = pd.DataFrame(all_results)

    # Known metric columns come first, in a fixed order, for readability;
    # any other column (e.g. "Error") follows in its original position.
    desired_order = ['Mod√®le', 'MRR@10', 'NDCG@10', 'Context Precision', 'Context Recall']
    cols = [
        *(c for c in desired_order if c in df_final.columns),
        *(c for c in df_final.columns if c not in desired_order),
    ]

    print("\nCLASSEMENT FINAL")
    display(df_final[cols])
    # The CSV keeps the DataFrame's own column order, not the display order.
    df_final.to_csv("resultats_finaux_grand_chelem.csv", index=False)
    print("Sauvegard√© dans 'resultats_finaux_grand_chelem.csv'")


🏆 CLASSEMENT FINAL 🏆


Unnamed: 0,Mod√®le,MRR@10,NDCG@10,Context Precision,Context Recall
0,E5-FT-Config1,0.138189,0.169041,0.625874,0.903098
1,E5-FT-Config2,0.509758,0.564977,0.957778,0.987037
2,E5-FT-Config3,0.522828,0.579678,0.958599,0.980835
3,E5-Base-Instruct,0.513838,0.572278,0.955579,0.971663


Sauvegard√© dans 'resultats_finaux_grand_chelem.csv'


In [None]:


# Catch-up run for a single checkpoint that was skipped during the campaign.
missing_model = {
    "alias": "E5-FT-Config4-v2",
    "path": "./e5_large_finetuned_config4_merged_v2",
    "local_only": True,
    "instruction": INSTRUCTION_BERCY
}

print(f"\nLancement du rattrapage pour : {missing_model['alias']}")

if os.path.exists(missing_model["path"]):
    res_missing = evaluator.evaluate_model(missing_model)
    # Drop any earlier row for this alias first, so re-running the cell
    # replaces the entry instead of appending a duplicate.
    all_results[:] = [r for r in all_results if r.get("Mod√®le") != missing_model["alias"]]
    all_results.append(res_missing)
    pd.DataFrame(all_results).to_csv("resultats_rattrapage.csv", index=False)
    if "Error" in res_missing:
        # evaluate_model reports failures through an "Error" key, not an exception.
        print(f" ERREUR : {res_missing['Error']}")
    else:
        print(f" {missing_model['alias']} ajout√© avec succ√®s !")
else:
    print(f" ERREUR : Le dossier '{missing_model['path']}' est introuvable.")
    print(" V√©rifie le nom exact du dossier dans la barre de fichiers √† gauche.")

# Display the complete, updated table
print("\n CLASSEMENT FINAL (MIS √Ä JOUR)")
if all_results:
    df_final = pd.DataFrame(all_results)

    # Metric columns first, then any remaining column (e.g. "Error") so that
    # failed runs stay visible, as in the main report cell.
    cols = ['Mod√®le', 'MRR@10', 'NDCG@10', 'Context Precision', 'Context Recall']
    cols = [c for c in cols if c in df_final.columns]
    cols += [c for c in df_final.columns if c not in cols]

    display(df_final[cols])
else:
    print("Aucun r√©sultat en m√©moire.")

12:53:11 - INFO - 
üöÄ √âVALUATION : E5-FT-Config4-v2
12:53:11 - INFO - ü©∫ Sanity Check...
12:53:11 - INFO - Load pretrained SentenceTransformer: ./e5_large_finetuned_config4_merged_v2



üöÄ Lancement du rattrapage pour : E5-FT-Config4-v2


The tokenizer you are loading from './e5_large_finetuned_config4_merged_v2' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:01<00:00,  1.23s/it]
Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]12:53:13 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:01<00:00,  1.83s/it]
12:53:16 - INFO - ‚úÖ Syst√®me Sain.
12:53:16 - INFO -    ‚öôÔ∏è  Mode INSTRUCT appliqu√©.
       üëâ Prompt: 'Instruct: Retrieve the definition of an administrative acronym or term.
Query:...'
12:53:16 - INFO - Load pretrained SentenceTransformer: ./e5_large_finetuned_config4_merged_v2
The tokenizer you are loading from './e5_large_finetuned_config4_

✅ E5-FT-Config4-v2 ajouté avec succès !

🏆 CLASSEMENT FINAL (MIS À JOUR) 🏆


Unnamed: 0,Mod√®le,MRR@10,NDCG@10,Context Precision,Context Recall
0,E5-FT-Config4-v2,0.516592,0.575295,0.95787,0.987892
