In [1]:
import sys
import os

# Make the parent of the current working directory importable (needed for `import config`).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

import config

In [72]:
import random
import json
from pathlib import Path
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
import re, math, unicodedata

In [112]:
# Shared clients and settings used by the cells below.
# OpenAI client: used for the chat-completion calls (query generation and LLM judge).
client = OpenAI(api_key=config.Config.OPENAI_API_KEY)
# Qdrant client: used to scroll the collection and to run vector queries.
qdrant_client = QdrantClient(url='localhost')
# Sentence-embedding model used to encode the evaluation queries.
embedding_model = SentenceTransformer("BAAI/bge-m3")
collection_name = 'ragscout_players'
# JSON file holding the generated evaluation queries (read, then rewritten, by a later cell).
path = Path("../../data/player_queries.json")
# Seed the global `random` generator so the shuffle used for sampling is repeatable.
random.seed(10)

# Default number of players drawn by sample_players_from_qdrant.
sample_size = 100

# Graded relevance levels used as nDCG gains, from exact player match down to weak profile overlap.
GAIN_EXACT_PLAYER = 4
GAIN_PROFIL_EQUAL = 3
GAIN_PROFIL_STRONG_OVERLAP = 2
GAIN_PROFIL_WEAK_OVERLAP = 1

In [123]:
def sample_players_from_qdrant(qdrant_client, collection_name, n=sample_size):
    """Return up to ``n`` randomly ordered, distinct player payloads from a Qdrant collection.

    The whole collection is scrolled (payloads only, no vectors), shuffled with the
    module-level ``random`` generator, and filtered so that every returned payload
    has a non-empty ``"player"`` and ``"summary"`` and no player name appears twice.

    Args:
        qdrant_client: client exposing ``scroll(collection_name=..., with_payload=...,
            with_vectors=..., limit=..., offset=...)`` returning ``(points, next_offset)``.
        collection_name: name of the collection to read.
        n: maximum number of payloads to return; ``n <= 0`` yields an empty list.

    Returns:
        A list of payload dicts, at most ``n`` long (shorter if the collection
        does not contain enough eligible players).
    """
    if n <= 0:
        # The limit used to be checked only after appending, so n <= 0 returned one player.
        return []

    all_points = []
    next_page = None
    while True:
        points, next_page = qdrant_client.scroll(
            collection_name=collection_name,
            with_payload=True, with_vectors=False,
            limit=256, offset=next_page
        )
        all_points.extend(points)
        if not next_page:
            break

    random.shuffle(all_points)

    players, seen = [], set()
    for point in all_points:
        payload = point.payload or {}
        name = payload.get("player")
        # Skip points without a name or summary, and players already picked.
        if not name or not payload.get("summary") or name in seen:
            continue
        seen.add(name)
        players.append(payload)
        if len(players) >= n:
            break

    return players

# Draw the evaluation sample, then display its size, the first player's name
# and the first 120 characters of that player's summary.
players_sample = sample_players_from_qdrant(qdrant_client, collection_name, n=sample_size)
len(players_sample), players_sample[0].get("player"), players_sample[0].get("summary")[:120]

(100,
 'Nadiem Amiri',
 'Le joueur évolue principalement au poste de milieu de terrain, avec un sous-rôle qui pourrait être défini comme milieu c')

In [None]:
def extract_profil_type(summary: str) -> str | None:
    if not summary:
        return None
    
    m = re.search(r"Profil-type\s*:\s*(.+)", summary, flags=re.IGNORECASE)
    return m.group(1).strip() if m else None

def strip_accents(s: str) -> str:
    """Remove diacritics: decompose to NFD, then drop every character of category ``Mn``."""
    decomposed = unicodedata.normalize("NFD", s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)

def normalize_text(s: str) -> str:
    """Lower-case, de-accent and simplify a profile string for comparison.

    Steps, in order: lower-case and strip accents; drop the connectors
    ``' et '`` and ``' de '``; replace every character that is not a word
    character, whitespace, ``-`` or ``+`` with a space; collapse whitespace
    runs to one space and trim both ends.
    """
    text = strip_accents(s.lower())
    for connector in (' et ', ' de '):
        text = text.replace(connector, ' ')
    # Keep words, whitespace, hyphens and '+'; anything else becomes a space.
    text = re.sub(r"[^\w\s\-+]", " ", text)
    return re.sub(r"\s+", " ", text).strip()

# Optional token-level synonym table: normalized token -> canonical token.
# Empty by default, in which case every token passes through unchanged.
CANON_MAP = {}

def canonical_tokens(s: str) -> set:
    """Return the set of canonical tokens of a profile string.

    The string is normalized with ``normalize_text``, two multi-word
    expressions are collapsed into single tokens, the result is split on
    whitespace and each token is mapped through ``CANON_MAP`` (tokens absent
    from the map are kept as they are).
    """
    text = normalize_text(s)
    # Collapse multi-word expressions so they count as one shared token.
    text = text.replace("duels aeriens", "aerien").replace("jeu entre les lignes", "entre-lignes")
    return {CANON_MAP.get(token, token) for token in text.split()}

def soft_label(ref_profil: str | None, cand_profil: str | None) -> int:
    """Grade how closely a candidate profile matches a reference profile.

    Returns:
        ``GAIN_PROFIL_EQUAL`` when both profiles are identical after ``normalize_text``;
        ``GAIN_PROFIL_STRONG_OVERLAP`` when they share at least 3 canonical tokens;
        ``GAIN_PROFIL_WEAK_OVERLAP`` when they share at least 2 canonical tokens;
        ``0`` otherwise, or when either profile is missing/empty.
    """
    if not ref_profil or not cand_profil:
        return 0
    if normalize_text(ref_profil) == normalize_text(cand_profil):
        return GAIN_PROFIL_EQUAL

    # Tokenize once and reuse the overlap size for both thresholds.
    shared = len(canonical_tokens(ref_profil) & canonical_tokens(cand_profil))
    if shared >= 3:
        return GAIN_PROFIL_STRONG_OVERLAP
    if shared >= 2:
        return GAIN_PROFIL_WEAK_OVERLAP
    return 0

def ndcg_at_k(gains: list[int], k: int) -> float:
    """Compute nDCG over the first ``k`` gains of a ranked list.

    DCG uses the ``gain / log2(rank + 1)`` discount with 1-based ranks. The
    ideal ranking is obtained by sorting the *same* truncated gains in
    descending order, so the score reflects only how well the retrieved items
    are ordered, not whether better items were missed.

    Args:
        gains: relevance gains in ranked order.
        k: cut-off; values ``<= 0`` yield ``0.0``.

    Returns:
        The nDCG in ``[0, 1]`` for non-negative gains, or ``0.0`` when the
        ideal DCG is zero.
    """
    if k <= 0:
        # A negative k would make gains[:k] drop trailing items instead of truncating.
        return 0.0

    def _dcg(values):
        return sum(g / math.log2(i + 2) for i, g in enumerate(values))

    top = gains[:k]
    dcg = _dcg(top)
    idcg = _dcg(sorted(top, reverse=True))
    return (dcg / idcg) if idcg > 0 else 0.0


# Sanity check on the first sampled player: raw profile type as extracted from the summary.
print(f"""
    Player: {players_sample[0].get('player')}
    Profil : {extract_profil_type(players_sample[0].get('summary'))}
""")

# Same profile after normalization.
# NOTE: normalize_text calls .lower() on its argument, so this raises if the
# summary has no "Profil-type" line (extract_profil_type then returns None).
print(f"""
    Player: {players_sample[0].get('player')}
    Profil : {normalize_text(extract_profil_type(players_sample[0].get('summary')))}
""")

# Same profile as a set of canonical tokens.
print(f"""
    Player: {players_sample[0].get('player')}
    Profil : {canonical_tokens(extract_profil_type(players_sample[0].get('summary')))}
""")

# Compare two sampled players: show their token sets and the resulting soft label.
player_1_index = 0
player_2_index = 3
profil_player_1 = extract_profil_type(players_sample[player_1_index].get('summary'))
profil_player_2 = extract_profil_type(players_sample[player_2_index].get('summary'))
token_player_1 = canonical_tokens(extract_profil_type(players_sample[player_1_index].get('summary')))
token_player_2 = canonical_tokens(extract_profil_type(players_sample[player_2_index].get('summary')))
print(f"""
    Players: {players_sample[player_1_index].get('player')} - {players_sample[player_2_index].get('player')}
    Tokens: {token_player_1} - {token_player_2}
    Score : {soft_label(profil_player_1, profil_player_2)}
""")


    Player: Nadiem Amiri
    Profil : Milieu central polyvalent et créatif.


    Player: Nadiem Amiri
    Profil : milieu central polyvalent creatif


    Player: Nadiem Amiri
    Profil : {'milieu', 'central', 'polyvalent', 'creatif'}


    Players: Nadiem Amiri - Danny da Costa
    Tokens: {'milieu', 'central', 'polyvalent', 'creatif'} - {'defenseur', 'implique', 'central', 'solide'}
    Score : 0



In [None]:
# Prompt (in French) asking the model to turn a scouting summary into one short
# recruiter-style search query. It is filled with str.format, so `{summary}` is
# the only placeholder and any other literal brace would have to be doubled.
EVAL_PROMPT_TEMPLATE = """
Tu es recruteur dans un club professionnel.

Tu viens de lire un rapport de scouting décrivant le style de jeu, les qualités et les axes d’amélioration d’un joueur.

Formule une requête courte, naturelle et spécifique que taperait un recruteur pour retrouver ce profil :
- une seule phrase fluide
- poste (ou sous-rôle) + 2 ou 3 caractéristiques différenciantes
- termes précis : “pressing intense”, “relance propre”, “jeu entre les lignes”, “présence aérienne”, “percussion”, etc.
- ne sois pas vague (“bon techniquement” est à éviter)
- pas de nom, club, nationalité ou ligue

Résumé :
\"\"\"{summary}\"\"\"

Donne uniquement la requête, sans guillemets, sans commentaire.
"""

def generate_query_from_summary(summary: str, model: str):
    """Ask the chat model for a short recruiter-style query describing ``summary``.

    Args:
        summary: scouting summary inserted into ``EVAL_PROMPT_TEMPLATE``.
        model: name of the chat-completion model to call.

    Returns:
        The generated query with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    prompt = EVAL_PROMPT_TEMPLATE.format(summary=summary)
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Tu es un assistant concis et précis."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3,
        max_tokens=60
    )
    # The message content may be None (e.g. a refusal); do not call .strip() on None.
    content = resp.choices[0].message.content
    return (content or "").strip()


# Load the queries generated on previous runs so the LLM is only called for new players.
try:
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)
        eval_queries = data if isinstance(data, list) else []
except FileNotFoundError:
    print("Le fichier n'existe pas")
    eval_queries = []
except json.JSONDecodeError:
    print('Le fichier est vide ou JSON corrompu ')
    eval_queries = []

# Set lookup instead of rescanning the whole query list for every sampled player.
known_players = {
    d.get("expected_player")
    for d in eval_queries
    if isinstance(d, dict) and isinstance(d.get("expected_player"), str)
}

for item in players_sample:
    if item.get('player') in known_players:
        continue

    q = generate_query_from_summary(item["summary"], model=config.Config.OPENAI_MODEL)
    if not q:
        # An empty query is useless for evaluation; leave the player for a later run.
        print("Requête vide ignorée pour :", item["player"])
        continue
    eval_queries.append({
        "query": q,
        "expected_player": item["player"]
    })
    known_players.add(item["player"])


print("Exemples de requêtes générées :")
for e in eval_queries[:3]:
    print("•", e["query"], "→ attendu :", e["expected_player"])

# Write back to the same `path` that was read above rather than a second hard-coded copy.
with path.open("w", encoding="utf-8") as f:
    json.dump(eval_queries, f, ensure_ascii=False, indent=2)


Exemples de requêtes générées :
• défenseur central solide avec pressing intense, relance propre et bonne présence aérienne → attendu : William Saliba
• milieu de terrain central avec pressing intense, relance propre et présence aérienne → attendu : Nicolás González
• milieu central polyvalent avec pressing intense, relance propre et capacité à jouer entre les lignes → attendu : Bruno Guimarães


In [None]:
def build_profil_type() -> dict[str, str]:
    """Map every player of the collection to a normalized profile type.

    Scrolls the whole ``collection_name`` collection (payloads only). For each
    point with a ``"player"`` name, the profile is taken from the payload's
    ``"profil_type"`` field when present, otherwise extracted from its
    ``"summary"``; players without any profile are left out.

    Returns:
        A dict ``{player name: normalize_text(profile)}``. When a name occurs
        on several points, the last one scrolled wins.
    """
    profiles: dict[str, str] = {}
    next_page = None
    while True:
        points, next_page = qdrant_client.scroll(
            collection_name=collection_name,
            with_payload=True, with_vectors=False,
            limit=256, offset=next_page
        )
        for p in points:
            pl = p.payload or {}
            name = pl.get("player")
            if not name:
                continue
            profil = pl.get("profil_type") or extract_profil_type(pl.get("summary", ""))
            if profil:
                profiles[name] = normalize_text(profil)
        if not next_page:
            break
    return profiles

# The function takes no argument: it reads the whole collection, not only the sample.
profile_player = build_profil_type()
profile_player

{'Ben White': 'defenseur central solide implique',
 'Bukayo Saka': 'createur offensif polyvalent',
 'David Raya': 'gardien but relanceur solide',
 'Declan Rice': 'milieu creatif polyvalent',
 'Ethan Nwaneri': 'attaquant creatif polyvalent',
 'Gabriel Magalhães': 'defenseur central engage precis',
 'Gabriel Martinelli': 'attaquant mobile creatif',
 'Jakub Kiwior': 'defenseur central solide precis',
 'Jurriën Timber': 'defenseur lateral offensif solide',
 'Kai Havertz': 'attaquant polyvalent creatif',
 'Leandro Trossard': 'attaquant creatif polyvalent',
 'Martin Ødegaard': 'milieu creatif polyvalent',
 'Mikel Merino': 'milieu offensif polyvalent creatif',
 'Myles Lewis-Skelly': 'defenseur central solide fiable',
 'Riccardo Calafiori': 'defenseur lateral offensif solide',
 'Thomas Partey': 'milieu terrain polyvalent engage',
 'William Saliba': 'defenseur central solide fiable',
 'Amadou Onana': 'milieu central recuperateur dynamique',
 'Boubacar Kamara': 'milieu terrain recuperateur distr

In [155]:
def eval_ndcg_profil(eval_queries, k, gain_exact=GAIN_EXACT_PLAYER) -> dict:
    """Evaluate retrieval with profile-based graded relevance and nDCG@k.

    For each query, the top ``k`` points are retrieved from Qdrant. A candidate
    that is the expected player gets ``gain_exact``; any other candidate gets
    ``soft_label(reference, candidate profile)``, where the reference is the
    expected player's profile from ``profile_player`` (or the raw query when
    that profile is an empty string). Queries whose expected player has no
    entry in ``profile_player`` are printed and skipped.

    Args:
        eval_queries: iterable of dicts with ``"query"`` and ``"expected_player"``.
        k: number of candidates retrieved and scored per query.
        gain_exact: gain awarded when the expected player itself is retrieved.

    Returns:
        ``{"nDCG@k": mean score rounded to 3 decimals (0.0 if nothing was
        scored), "details": per-query records}``.
    """
    scores, details = [], []
    for sample in eval_queries:
        query = sample['query']
        expected = sample['expected_player']

        expected_profile = profile_player.get(expected)
        if expected_profile is None:
            # No reference profile: the query cannot be graded, so report and skip it.
            print(expected)
            continue

        # Reference text for the soft match; fall back to the query if the profile is empty.
        reference = expected_profile if expected_profile else query

        query_vector = embedding_model.encode(query).tolist()
        results = qdrant_client.query_points(collection_name=collection_name, query=query_vector, limit=k)

        gains, cand_list = [], []
        for point in results.points:
            payload = point.payload or {}
            cand_name = payload.get('player')
            # A summary without a "Profil-type" line yields None; normalize_text cannot take None.
            raw_profile = extract_profil_type(payload.get('summary'))
            cand_profile = normalize_text(raw_profile) if raw_profile else None

            cand_list.append({'player': cand_name, 'profil_type': cand_profile})

            if cand_name == expected:
                gains.append(gain_exact)
            else:
                gains.append(soft_label(reference, cand_profile))

        ndcg = ndcg_at_k(gains, k)
        scores.append(ndcg)
        details.append({
            "query": query,
            "expected_player": sample["expected_player"],
            "ref_profil": expected_profile,
            "candidates": cand_list,
            "gains": gains,
            f"nDCG@{k}": round(ndcg, 3)
        })

    # Avoid ZeroDivisionError when every query was skipped or the input is empty.
    mean_ndcg = round(sum(scores) / len(scores), 3) if scores else 0.0
    return {f"nDCG@{k}": mean_ndcg, "details": details}

# Run the profile-based evaluation on the top 3 results of each query.
k = 3
# Key under which eval_ndcg_profil stores the mean score.
name = f"nDCG@{k}"
report = eval_ndcg_profil(eval_queries, k)
print(f"{name} = {report[name]}")


  return forward_call(*args, **kwargs)


nDCG@3 = 0.93


In [142]:
from collections import Counter

# Distribution of the individual candidate gains over all evaluated queries.
all_gains = [gain for detail in report["details"] for gain in detail["gains"]]
print(Counter(all_gains))

Counter({2: 150, 3: 91, 1: 49, 0: 26, 4: 5})


In [158]:
## LLM as Judge
K = 5  # default number of candidates retrieved per query

def retrieve_topk(query: str, k: int = K):
    """Retrieve the ``k`` nearest players for ``query``.

    The query is embedded with ``embedding_model`` and searched in the
    ``collection_name`` collection. Returns a list of dicts with the keys
    ``"player"``, ``"profil_type"`` (extracted from the summary) and
    ``"summary"``, in the order returned by Qdrant.
    """
    response = qdrant_client.query_points(
        collection_name=collection_name,
        query=embedding_model.encode(query).tolist(),
        limit=k,
    )
    payloads = (hit.payload or {} for hit in response.points)
    return [
        {
            "player": payload.get("player"),
            "profil_type": extract_profil_type(payload.get("summary")),
            "summary": payload.get("summary", ""),
        }
        for payload in payloads
    ]


def clip(txt: str, n: int = 500) -> str:
    """Strip ``txt`` and, if it is longer than ``n`` characters, cut it to ``n`` and append an ellipsis."""
    stripped = txt.strip()
    if len(stripped) <= n:
        return stripped
    return stripped[:n] + "…"

# Instructions (in French) for the LLM judge: grade each candidate from 0 to 3
# and answer with a bare JSON list of integers.
# This text is concatenated into the prompt, not passed through str.format, so
# the literal braces in "{0,1,2,3}" are safe and "K" is sent as a plain letter.
JUDGE_PROMPT = """
    Tu es évaluateur de pertinence pour le recrutement football.

    Consigne : pour une requête donnée, on te fournit une liste de candidats (profil-type + extrait de résumé).
    Attribue une note de pertinence entière à chacun des candidats, dans {0,1,2,3} :
    - 0 = hors sujet
    - 1 = partiellement pertinent (une seule caractéristique correspond)
    - 2 = pertinent (bon match global)
    - 3 = très pertinent (profil idéal pour la requête)

    Règles :
    - Juge uniquement d’après la requête et la description fournie.
    - Pas de suppositions : si c’est ambigu, note plus bas.
    - Réponse attendue : UNIQUEMENT une liste JSON d’entiers de longueur K, par ex: [2,1,0,3,2]
"""

def _coerce_judge_scores(text: str, expected_len: int) -> list[int]:
    """Parse the judge's reply into exactly ``expected_len`` integer grades in [0, 3]."""
    try:
        parsed = json.loads(text)
    except ValueError:  # includes json.JSONDecodeError
        parsed = None

    if not isinstance(parsed, list):
        # Fallback: take the first bracketed list of single digits found in the reply.
        m = re.search(r"\[(\s*\d\s*(?:,\s*\d\s*)*)\]", text)
        parsed = [int(x) for x in m.group(1).split(",")] if m else []

    scores = []
    for value in parsed[:expected_len]:
        try:
            grade = int(value)
        except (TypeError, ValueError, OverflowError):
            grade = 0  # an unreadable grade counts as "not relevant"
        scores.append(max(0, min(3, grade)))

    # Missing grades are padded with 0 so the result always lines up with the candidates.
    scores.extend([0] * (expected_len - len(scores)))
    return scores


def judge_relevance_llm(query: str, candidates: list[dict], model: str) -> list[int]:
    """Ask the LLM judge to grade each candidate's relevance to ``query``.

    Args:
        query: the search query being evaluated.
        candidates: dicts with optional ``"profil_type"`` and ``"summary"`` keys;
            summaries are clipped to 450 characters in the prompt.
        model: name of the chat-completion model used as judge.

    Returns:
        One integer in ``[0, 3]`` per candidate, in candidate order. Grades the
        reply does not provide, or that cannot be read as integers, are 0.
    """
    if not candidates:
        # Nothing to grade: skip the API call.
        return []

    lines = [f"Requête: {query}", "", "Candidats:"]
    for i, c in enumerate(candidates, start=1):
        ptype = c.get("profil_type") or ""
        # `or ""` also covers a summary key that is present but None.
        summ = clip(c.get("summary") or "", 450)
        lines.append(f"{i}. Profil-type: {ptype if ptype else '(non précisé)'}")
        lines.append(f" Résumé: {summ}")
    lines.append("")
    lines.append("Rends uniquement la liste JSON des notes, rien d’autre.")

    prompt = JUDGE_PROMPT + "\n" + "\n".join(lines)

    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Tu es strict, cohérent et concis."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.0,
        max_tokens=50
    )
    # The message content may be None; treat that as an empty reply.
    text = (resp.choices[0].message.content or "").strip()

    return _coerce_judge_scores(text, len(candidates))

# NOTE(review): ndcg_at_k is also defined in an earlier cell of this notebook;
# this definition shadows it once the cell runs. Keep both in sync or drop one.
def ndcg_at_k(gains: list[int], k: int) -> float:
    """Compute nDCG over the first ``k`` gains of a ranked list.

    DCG uses the ``gain / log2(rank + 1)`` discount with 1-based ranks. The
    ideal ranking is obtained by sorting the *same* truncated gains in
    descending order, so the score reflects only how well the retrieved items
    are ordered, not whether better items were missed.

    Args:
        gains: relevance gains in ranked order.
        k: cut-off; values ``<= 0`` yield ``0.0``.

    Returns:
        The nDCG in ``[0, 1]`` for non-negative gains, or ``0.0`` when the
        ideal DCG is zero.
    """
    if k <= 0:
        # A negative k would make gains[:k] drop trailing items instead of truncating.
        return 0.0

    def _dcg(values):
        return sum(g / math.log2(i + 2) for i, g in enumerate(values))

    top = gains[:k]
    dcg = _dcg(top)
    idcg = _dcg(sorted(top, reverse=True))
    return (dcg / idcg) if idcg > 0 else 0.0

def eval_llm_judge(eval_queries: list[dict], judge_model: str, k: int = K, max_queries: int = 10):
    """Score retrieval quality with an LLM judge and nDCG@k.

    Only the first ``max_queries`` entries of ``eval_queries`` are used. For
    each one, the top ``k`` candidates are retrieved, graded by
    ``judge_relevance_llm`` and turned into an nDCG value.

    Returns:
        ``{"nDCG@k": mean score rounded to 3 decimals (0.0 when no query was
        judged), "details": per-query records}``.
    """
    metric_key = f"nDCG@{k}"
    per_query_ndcg = []
    judged_details = []

    # Start with a small subset (10 queries by default).
    for sample in eval_queries[:max_queries]:
        query = sample["query"]
        candidates = retrieve_topk(query, k=k)
        gains = judge_relevance_llm(query, candidates, model=judge_model)

        score = ndcg_at_k(gains, k)
        per_query_ndcg.append(score)

        judged_details.append({
            "query": query,
            "candidates": [{"player": c["player"], "profil_type": c["profil_type"]} for c in candidates],
            "gains": gains,
            metric_key: round(score, 3),
        })

    if per_query_ndcg:
        mean_ndcg = round(sum(per_query_ndcg) / len(per_query_ndcg), 3)
    else:
        mean_ndcg = 0.0

    return {metric_key: mean_ndcg, "details": judged_details}

In [159]:
# Run the LLM-judge evaluation on the first 10 queries with 5 candidates each.
report_judge = eval_llm_judge(
    eval_queries=eval_queries,
    judge_model=config.Config.OPENAI_MODEL,
    k=5,
    max_queries=10
)

# The "nDCG@5" key must match the k=5 passed above: eval_llm_judge builds the key as f"nDCG@{k}".
print("LLM-judge nDCG@5:", report_judge["nDCG@5"])
# Show the query, the judge's grades and the retrieved players for the first three queries.
for d in report_judge["details"][:3]:
    print("\nQ:", d["query"])
    print("Gains:", d["gains"])
    print("TopK:", [(c["player"], c["profil_type"]) for c in d["candidates"]])

  return forward_call(*args, **kwargs)


LLM-judge nDCG@5: 0.918

Q: défenseur central solide avec pressing intense, relance propre et bonne présence aérienne
Gains: [2, 2, 1, 1, 2]
TopK: [('Marquinhos', 'Défenseur central solide et fiable.'), ('Cameron Burgess', 'Défenseur central solide et relanceur.'), ('Taylor Harwood-Bellis', 'Défenseur central solide et fiable.'), ('Adam Masina', 'Défenseur central solide et fiable.'), ('Abdukodir Khusanov', 'Défenseur central solide et engagé.')]

Q: milieu de terrain central avec pressing intense, relance propre et présence aérienne
Gains: [2, 2, 1, 2, 3]
TopK: [('Elliot Anderson', 'Milieu central polyvalent et engagé.'), ('Seydouba Cissé', 'Milieu central polyvalent et discipliné.'), ('Arne Maier', 'Milieu central sobre et travailleur.'), ('Youssouf Fofana', 'Milieu central créatif et engagé.'), ('Ondrej Duda', 'Milieu central créatif et engagé.')]

Q: milieu central polyvalent avec pressing intense, relance propre et capacité à jouer entre les lignes
Gains: [2, 2, 2, 3, 1]
TopK: [('