In [9]:
import os
# Set before any heavy library is imported: disables tokenizers' internal
# parallelism and restricts OpenMP to a single thread.
os.environ.update(
    TOKENIZERS_PARALLELISM="false",
    OMP_NUM_THREADS="1",
)

In [10]:
import pandas as pd
import json
import os
import faiss
import logging
import numpy as np 
import torch
import gc
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Callable, Tuple

from datasets import Dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from ragas import evaluate
from ragas.metrics import context_precision, context_recall
from langchain_ollama import ChatOllama, OllamaEmbeddings
from ragas.run_config import RunConfig

# CONFIGURATION

# force=True drops handlers installed by a previous run of this cell, so
# re-executing the cell does not duplicate every log line.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
    force=True,
)
logger = logging.getLogger("RAGEval_20_20")

# Repeated here so this cell also works when run without the first one.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

@dataclass
class EvalConfig:
    """Settings shared by every evaluation run in this notebook.

    All fields have defaults, so ``EvalConfig()`` yields a ready-to-use
    configuration; override individual fields by keyword.
    """

    # --- Data locations ---
    # JSONL file read line by line for the IR phase (rows expose 'anchor' and 'positive').
    path_ir_dataset: str = "bercy_test_10.jsonl"
    # JSONL file read for the RAGAS phase (columns 'question' and 'ground_truth').
    path_ragas_dataset: str = "bercy_golden_dataset.jsonl"
    # Directory created right after the config is instantiated.
    output_dir: str = "./results_final_20_20"

    # --- LLM judge (served by Ollama) ---
    # Chat model name handed to ChatOllama.
    llm_judge: str = "llama3"
    # Embedding model name handed to OllamaEmbeddings.
    embedding_judge: str = "nomic-embed-text"
    # Base URL of the Ollama server used for both of the above.
    ollama_url: str = "http://localhost:11434"

    # --- Execution parameters ---
    # Number of questions sent to each RAGAS `evaluate` call.
    batch_size: int = 1
    # Number of passages retrieved per question.
    top_k: int = 5

    # Instruction text; not referenced by the evaluator code visible in this notebook.
    default_instruction: str = "Retrieve the definition of an administrative acronym or term."

# Instantiate the shared settings once and make sure the output directory
# exists (exist_ok=True keeps this cell safe to re-run).
config = EvalConfig()
os.makedirs(config.output_dir, exist_ok=True)
print("Cellule 1 : Configuration charg√©e.")

Cellule 1 : Configuration charg√©e.


  from ragas.metrics import context_precision, context_recall
  from ragas.metrics import context_precision, context_recall


### **Retrieval & RAGAS**

In [11]:
class RAGEvaluator:
    """Benchmarks sentence-embedding models as retrievers.

    For each model two measurements are taken:

    1. Information-retrieval metrics (MRR@10 / NDCG@10) computed by
       ``InformationRetrievalEvaluator`` on the anchor/positive pairs file.
    2. RAGAS context precision / recall, judged by an Ollama-served LLM, on
       contexts retrieved from a FAISS inner-product index built over the
       de-duplicated ground-truth passages.
    """

    def __init__(self, config: "EvalConfig"):
        """Create the Ollama judge clients and load the RAGAS dataset once.

        Raises:
            Exception: whatever ``_load_ragas_data`` re-raises when the RAGAS
                dataset cannot be read.
        """
        self.cfg = config
        self.judge_llm = ChatOllama(model=self.cfg.llm_judge, temperature=0, base_url=self.cfg.ollama_url)
        self.eval_embeddings = OllamaEmbeddings(model=self.cfg.embedding_judge, base_url=self.cfg.ollama_url)
        # Single worker + long timeout; handed to every `evaluate` call below.
        self.ragas_run_config = RunConfig(max_workers=1, timeout=360)
        self.ragas_data = self._load_ragas_data()

    def _load_ragas_data(self):
        """Load the RAGAS dataset and derive the retrieval corpus from it.

        Returns:
            dict with ``"df"`` (the full DataFrame) and ``"corpus"`` (the
            unique ground-truth passages, in order of first appearance).

        Raises:
            Exception: any read/parse error is logged and re-raised.
        """
        logger.info("Chargement des donn√©es Ragas...")
        try:
            df = pd.read_json(self.cfg.path_ragas_dataset, lines=True)
            # De-duplicate while keeping first-seen order. A plain `set` would
            # reorder the corpus from one kernel to the next (string hashing is
            # randomised), which makes index ids and tie-breaking irreproducible.
            corpus = list(dict.fromkeys(
                c[0] if isinstance(c, list) else c for c in df['ground_truth']
            ))
            return {"df": df, "corpus": corpus}
        except Exception as e:
            logger.critical(f"Erreur lecture dataset Ragas: {e}")
            raise

    def sanity_check(self, model_path: str, local_only: bool) -> bool:
        """Check that the model loads/encodes and that the Ollama judge answers.

        Args:
            model_path: path or hub id given to ``SentenceTransformer``.
            local_only: forwarded as ``local_files_only``.

        Returns:
            True when both checks pass, False otherwise (the error is logged).
        """
        logger.info("ü©∫ Sanity Check...")
        try:
            # Model check
            model = SentenceTransformer(model_path, device="cpu", local_files_only=local_only)
            model.encode(["test"], convert_to_numpy=True)
            # The caller loads its own copy; release this one before the LLM call.
            del model
            # Ollama check
            ds = Dataset.from_dict({'question': ["t"], 'ground_truth': ["t"], 'contexts': [["t"]]})
            evaluate(
                ds, metrics=[context_precision], llm=self.judge_llm, embeddings=self.eval_embeddings,
                run_config=self.ragas_run_config, raise_exceptions=True
            )
            logger.info("Syst√®me Sain.")
            return True
        except Exception as e:
            logger.critical(f"√âCHEC SANITY CHECK : {e}")
            return False

    def get_formatting_functions(self, model_alias: str, instruction: Optional[str] = None) -> Tuple[Callable, Callable, bool]:
        """Return the text formatters matching the model family.

        Args:
            model_alias: model alias; matched case-insensitively.
            instruction: accepted for interface compatibility; not used by the
                currently supported model families.

        Returns:
            ``(format_query, format_passage, normalize_embeddings)``.
        """
        alias = model_alias.lower()

        # SOLON: queries are prefixed with "query : " (with spaces), passages are left as-is.
        if "solon" in alias:
            logger.info("Mode SOLON (query : ) appliqu√©.")
            return (lambda q: f"query : {q}"), (lambda d: d), True

        logger.warning("Mode BRUT (Pas de pr√©fixe).")
        return (lambda q: q), (lambda d: d), False

    def evaluate_model(self, model_config: Dict):
        """Run the full evaluation (IR metrics, then RAGAS) for one model.

        Args:
            model_config: dict with keys ``"path"``, ``"alias"``,
                ``"local_only"`` and optionally ``"instruction"``.

        Returns:
            A metrics dict keyed by model alias plus ``MRR@10``, ``NDCG@10``,
            ``Context Precision`` and ``Context Recall``; or a dict with an
            ``"Error"`` entry when the sanity check or the model load fails.
        """
        path = model_config["path"]
        alias = model_config["alias"]
        local_only = model_config["local_only"]
        custom_instruct = model_config.get("instruction", None)

        logger.info(f"\n{'='*60}\n√âVALUATION : {alias}\n{'='*60}")

        if not self.sanity_check(path, local_only):
            return {"Mod√®le": alias, "Error": "Sanity Check Failed"}

        prefix_q, prefix_d, do_norm = self.get_formatting_functions(alias, custom_instruct)

        try:
            model = SentenceTransformer(path, device="cpu", local_files_only=local_only)
        except Exception as e:
            logger.error(f"Erreur chargement mod√®le local: {e}")
            logger.info("Tentative de chargement sans restriction locale...")
            try:
                # Fallback when the local-only load fails
                model = SentenceTransformer(path, device="cpu")
            except Exception as e2:
                return {"Mod√®le": alias, "Error": f"Load Error: {e2}"}

        metrics = {"Mod√®le": alias}
        try:
            metrics.update(self._run_ir_phase(model, alias, prefix_q, prefix_d))
            metrics.update(self._run_ragas_phase(model, prefix_q, prefix_d, do_norm))
            logger.info(f"R√âSULTATS : {metrics}")
        finally:
            # Release memory even if a phase raised, so the next model of the
            # campaign starts from a clean state.
            del model
            gc.collect()
            if torch.backends.mps.is_available():
                torch.mps.empty_cache()

        return metrics

    @staticmethod
    def _build_ir_dataset(rows: List[dict], prefix_q: Callable, prefix_d: Callable) -> Tuple[dict, dict, dict]:
        """Turn anchor/positive rows into (queries, corpus, relevant_docs).

        Identical passages share a single document id. Giving each row its own
        id would put several copies of the same text in the corpus with only
        one of them marked relevant, which artificially lowers MRR/NDCG.
        """
        queries, corpus, rel_docs = {}, {}, {}
        doc_id_by_text = {}
        for idx, row in enumerate(rows):
            passage = prefix_d(row['positive'])
            docid = doc_id_by_text.get(passage)
            if docid is None:
                docid = f"doc_{idx}"
                doc_id_by_text[passage] = docid
                corpus[docid] = passage
            qid = str(idx)
            queries[qid] = prefix_q(row['anchor'])
            rel_docs[qid] = {docid}
        return queries, corpus, rel_docs

    def _run_ir_phase(self, model, alias: str, prefix_q: Callable, prefix_d: Callable) -> Dict[str, float]:
        """Phase 1: compute MRR@10 / NDCG@10; both are 0.0 if the phase fails."""
        logger.info("Phase IR (Retrieval metrics)...")
        try:
            with open(self.cfg.path_ir_dataset, 'r', encoding='utf-8') as f:
                # Skip blank lines (e.g. a trailing newline) instead of failing the phase.
                ir_data = [json.loads(line) for line in f if line.strip()]

            queries, corpus, rel_docs = self._build_ir_dataset(ir_data, prefix_q, prefix_d)

            ir_eval = InformationRetrievalEvaluator(
                queries, corpus, rel_docs, name=alias[:10],
                show_progress_bar=False, mrr_at_k=[10], ndcg_at_k=[10]
            )
            ir_res = ir_eval(model)

            # Result keys carry evaluator-specific prefixes; take the first match.
            scores = {
                'MRR@10': next((v for k, v in ir_res.items() if 'mrr@10' in k.lower()), 0),
                'NDCG@10': next((v for k, v in ir_res.items() if 'ndcg@10' in k.lower()), 0),
            }
            logger.info(f"IR Score: MRR@10 = {scores['MRR@10']:.4f}")
            return scores
        except Exception as e:
            logger.error(f"Erreur Phase IR: {e}")
            # Report both metrics so every result row has the same columns.
            return {'MRR@10': 0.0, 'NDCG@10': 0.0}

    def _run_ragas_phase(self, model, prefix_q: Callable, prefix_d: Callable, do_norm: bool) -> Dict[str, float]:
        """Phase 2: retrieve contexts with FAISS and score them with RAGAS."""
        logger.info("Phase RAGAS (LLM Judge)...")

        logger.info("Indexation du corpus...")
        corpus = self.ragas_data['corpus']
        corpus_txt = [prefix_d(d) for d in corpus]
        corpus_emb = model.encode(
            corpus_txt, normalize_embeddings=do_norm,
            batch_size=8, show_progress_bar=True, convert_to_numpy=True
        )
        # FAISS expects C-contiguous float32 input.
        corpus_emb = np.ascontiguousarray(corpus_emb, dtype=np.float32)
        index = faiss.IndexFlatIP(corpus_emb.shape[1])
        index.add(corpus_emb)

        # Asking FAISS for more neighbours than there are documents yields -1
        # ids, and corpus[-1] would silently return the last passage.
        top_k = min(self.cfg.top_k, len(corpus))

        questions = self.ragas_data['df']['question'].tolist()
        # NOTE(review): passed to RAGAS as stored; if the file holds list-valued
        # ground truths, confirm the installed RAGAS version accepts them.
        ground_truths = self.ragas_data['df']['ground_truth'].tolist()
        ragas_batches = []

        for i in range(0, len(questions), self.cfg.batch_size):
            gc.collect()

            batch_q = questions[i : i + self.cfg.batch_size]
            batch_gt = ground_truths[i : i + self.cfg.batch_size]

            try:
                # Retrieval
                q_fmt = [prefix_q(q) for q in batch_q]
                q_emb = model.encode(q_fmt, normalize_embeddings=do_norm, convert_to_numpy=True)
                q_emb = np.ascontiguousarray(q_emb, dtype=np.float32)
                _, indices = index.search(q_emb, top_k)
                batch_ctx = [[corpus[idx] for idx in row if idx >= 0] for row in indices]

                # Judging
                ds = Dataset.from_dict({'question': batch_q, 'ground_truth': batch_gt, 'contexts': batch_ctx})
                res = evaluate(
                    ds, metrics=[context_precision, context_recall], llm=self.judge_llm,
                    embeddings=self.eval_embeddings, run_config=self.ragas_run_config,
                    raise_exceptions=False
                )
                ragas_batches.append(res.to_pandas())

                if (i // self.cfg.batch_size) % 5 == 0:
                    logger.info(f"Progress: {i}/{len(questions)} queries")

            except Exception as e:
                # A failed batch is logged and skipped; the others still count.
                logger.error(f"Erreur Batch {i}: {e}")
                continue

        if ragas_batches:
            full_df = pd.concat(ragas_batches)
            return {
                'Context Precision': full_df['context_precision'].mean(),
                'Context Recall': full_df['context_recall'].mean(),
            }
        return {'Context Precision': 0.0, 'Context Recall': 0.0}

# Build the evaluator from the shared config. The constructor also reads the
# RAGAS dataset file, so a missing or invalid file surfaces in this cell.
evaluator = RAGEvaluator(config)
print("Cellule 2 : Moteur d'√©valuation charg√© (Correction NameError appliqu√©e).")

19:03:52 - INFO - Chargement des donn√©es Ragas...


Cellule 2 : Moteur d'√©valuation charg√© (Correction NameError appliqu√©e).


### **Liste des modèles Solon à tester (baseline vs fine-tunés)**

In [None]:
INSTRUCTION_BERCY = "Retrieve the definition of an administrative acronym or term."

# One dict per model to benchmark:
#   alias       - display name (also drives the query-prefix selection)
#   path        - hub id or local directory given to SentenceTransformer
#   local_only  - True when the weights must already be on disk
#   instruction - optional custom instruction
models_campaign = [
    # SOLON BASELINE
    {
        "alias": "Solon-Base-Large",
        "path": "OrdalieTech/SOLON-embeddings-large-0.1",
        "local_only": False,
        "instruction": None,
    },
    # SOLON FT: configs 1-4, each followed by its "v2" variant
    *(
        {
            "alias": f"Solon-FT-Config{config_no}{alias_suffix}",
            "path": f"./final_models/solon_large_finetuned_config{config_no}_merged{path_suffix}",
            "local_only": True,
            "instruction": None,
        }
        for config_no in range(1, 5)
        for alias_suffix, path_suffix in (("", ""), ("-v2", "_v2"))
    ),
]

print(f"Cellule 3 : Campagne configur√©e avec {len(models_campaign)} mod√®les.")

SyntaxError: invalid syntax. Perhaps you forgot a comma? (4133324039.py, line 7)

### **Exécution des modèles**

In [14]:
all_results = []

print(f"Lancement de la campagne sur {len(models_campaign)} mod√®les...\n")

for model_cfg in models_campaign:
    # Hub models are always attempted; local-only models need their folder on disk.
    if not model_cfg["local_only"] or os.path.exists(model_cfg["path"]):
        res = evaluator.evaluate_model(model_cfg)
        all_results.append(res)

        # Checkpoint after every model so a crash keeps the results so far.
        pd.DataFrame(all_results).to_csv("resultats_progressifs_20_20.csv", index=False)
    else:
        print(f"Dossier introuvable pour {model_cfg['alias']} ({model_cfg['path']}). On passe.")

print("\nCellule 4 : Calculs termin√©s.")

19:05:21 - INFO - 
√âVALUATION : Solon-FT-Config4
19:05:21 - INFO - ü©∫ Sanity Check...
19:05:21 - INFO - Load pretrained SentenceTransformer: ./final_models/solon_large_finetuned_config4_merged


Lancement de la campagne sur 2 mod√®les...



Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 391/391 [00:00<00:00, 781.45it/s, Materializing param=pooler.dense.weight]                               
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:03<00:00,  3.08s/it]
Evaluating:   0%|          | 0/1 [00:00<?, ?it/s]19:05:28 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
Evaluating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:02<00:00,  2.68s/it]
19:05:31 - INFO - Syst√®me Sain.
19:05:31 - INFO - Mode SOLON (query : ) appliqu√©.
19:05:31 - INFO - Load pretrained SentenceTransformer: ./final_models/solon_large_finetuned_config4_merged
Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 391/391 [00:00<00:00, 724.54it/s, Materializing param=pooler.dense.weight]                               
19:05:34 - INFO - Phase IR (Retrieval metrics)...
19:05:34 - INFO - Information Retrieval Evaluation of the model on the Solon-FT-C dataset:
19:09:28 - INFO - Queries: 468
19:09:28 - INFO - Corpus: 46

KeyboardInterrupt: 

### **Récupération d'un fichier CSV pour les résultats**

In [None]:
if not all_results:
    print("Aucun r√©sultat n'a √©t√© g√©n√©r√©.")
else:
    df_final = pd.DataFrame(all_results)

    # Known metric columns first, in a fixed order; any extra column
    # (for instance "Error") is appended after them.
    desired_order = ['Mod√®le', 'MRR@10', 'NDCG@10', 'Context Precision', 'Context Recall']
    cols = [c for c in desired_order if c in df_final.columns] + [
        c for c in df_final.columns if c not in desired_order
    ]

    print("\nCLASSEMENT FINAL")
    display(df_final[cols])

    # Final save
    df_final.to_csv("resultats_finaux.csv", index=False)
    print("Sauvegard√© dans 'resultats_finaux.csv'")