In [1]:
import sys
import os

parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import config

In [30]:
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, PayloadSchemaType, Filter, MatchAny, FieldCondition
import pandas as pd
import json
from tqdm import tqdm

In [3]:
with open("../../data/player_summaries.json", "r", encoding="utf-8") as f:
    players_summaries = json.load(f)
    
df_summaries = pd.DataFrame([
    {'player': player.split('(')[0].strip(), 'team': player.split('(')[1].replace(')', '').strip(), 'summary': summary} for player, summary in players_summaries.items()
])
df_summaries = df_summaries.reset_index() 
df_summaries.head()

Unnamed: 0,index,player,team,summary
0,0,Ben White,Arsenal,Le joueur évolue principalement en tant que dé...
1,1,Bukayo Saka,Arsenal,Le joueur évolue principalement en tant qu'att...
2,2,David Raya,Arsenal,Le joueur évolue principalement au poste de ga...
3,3,Declan Rice,Arsenal,Le joueur évolue principalement au poste de mi...
4,4,Ethan Nwaneri,Arsenal,Le joueur évolue principalement en tant qu'att...


In [4]:
df_players = pd.read_csv('../../data/players_stats.csv')
df_players = df_players.reset_index(drop=True)
df_players.head()

Unnamed: 0,index,league,season,team,player,nation__standard,pos__standard,age__standard,born__standard,Playing Time_MP_standard,...,Performance_Crs_misc,Performance_Int_misc,Performance_TklW_misc,Performance_PKwon_misc,Performance_PKcon_misc,Performance_OG_misc,Performance_Recov_misc,Aerial Duels_Won_misc,Aerial Duels_Lost_misc,Aerial Duels_Won%_misc
0,0,ENG-Premier League,2425,Arsenal,Ben White,ENG,DF,26,1997,17,...,17,16,14,0,0,0,37,18,9,66.7
1,1,ENG-Premier League,2425,Arsenal,Bukayo Saka,ENG,"FW,MF",22,2001,25,...,117,3,15,1,0,0,70,10,20,33.3
2,2,ENG-Premier League,2425,Arsenal,David Raya,ESP,GK,28,1995,38,...,0,2,0,0,1,0,41,13,0,100.0
3,3,ENG-Premier League,2425,Arsenal,Declan Rice,ENG,MF,25,1999,35,...,164,25,25,0,0,0,156,37,25,59.7
4,4,ENG-Premier League,2425,Arsenal,Ethan Nwaneri,ENG,"FW,MF",17,2007,26,...,54,2,7,0,0,0,34,4,8,33.3


In [5]:
df_players_summaries = df_summaries.merge(df_players, how='left', on=['player', 'team'])
df_players_summaries = df_players_summaries = df_players_summaries[[
    'league',
    'season',
    'player',
    'team',
    'pos__standard',
    'summary'
]]

df_players_summaries.rename(columns={'pos__standard': 'position'}, inplace=True)
df_players_summaries.head()

Unnamed: 0,league,season,player,team,position,summary
0,ENG-Premier League,2425,Ben White,Arsenal,DF,Le joueur évolue principalement en tant que dé...
1,ENG-Premier League,2425,Bukayo Saka,Arsenal,"FW,MF",Le joueur évolue principalement en tant qu'att...
2,ENG-Premier League,2425,David Raya,Arsenal,GK,Le joueur évolue principalement au poste de ga...
3,ENG-Premier League,2425,Declan Rice,Arsenal,MF,Le joueur évolue principalement au poste de mi...
4,ENG-Premier League,2425,Ethan Nwaneri,Arsenal,"FW,MF",Le joueur évolue principalement en tant qu'att...


In [6]:
embedding_model = SentenceTransformer("BAAI/bge-m3")
qdrant_client = QdrantClient(
    host='localhost', 
    port=6333,
    timeout=30.0
)

In [7]:
collection_name = 'ragscout_players'

if not qdrant_client.collection_exists(collection_name):
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=1024,
            distance=Distance.COSINE
        )
    )

In [22]:
FBREF_TO_STD = {
    "GK": "GK",
    "DF": "DF",
    "DF,MF": "DM",
    "MF,DF": "DM",
    "MF": "CM",
    "MF,FW": "AM",
    'FW,MF': 'AM',
    "FW": "ST",
    'DF,FW': 'DF',
    'FW,DF': 'DF'
}

def normalize_position(raw_pos: str) -> str:
    if not raw_pos:
        return "UNK"
    raw = raw_pos.strip().upper()
    return FBREF_TO_STD.get(raw, raw[:2])

def age_bucket(age: int) -> str:
    if age is None:
        return "unknown"
    if age <= 21: return "U21"
    if age <= 23: return "U23"
    if age <= 25: return "U25"
    if age <= 28: return "U28"
    if age <= 32: return "U32"
    return "32+"

points = []
for idx, row in tqdm(df_players_summaries.iterrows()):
    embedding = embedding_model.encode(
        row['summary'],
        normalize_embeddings = True
    ).tolist()
    
    pos_std = normalize_position(getattr(row, "position", ""))
    nat = getattr(row, "nationality", "")
    age = getattr(row, "age", None)
    try:
        age = int(age) if age is not None and str(age).isdigit() else None
    except:
        age = None
    
    metadata = {
        'season': row['season'],
        'player': row['player'],
        'position_std': pos_std,
        'age': age,
        'age_bucket': age_bucket(age),
        'nationality': nat,
        'league': row['league'],
        'team': row['team'],
        'position': row['position'],
        'summary': row['summary'],
    }
    
    point = PointStruct(
        id=idx,
        vector=embedding,
        payload=metadata
    )
    
    points.append(point)

  return forward_call(*args, **kwargs)
1735it [04:13,  6.84it/s]


In [24]:
BATCH_SIZE = 100

for i in range(0, len(points), BATCH_SIZE):
    batch = points[i:i + BATCH_SIZE]
    qdrant_client.upsert(
        collection_name=collection_name, 
        points=batch
    )
    
    print(f"✅ {min(i + BATCH_SIZE, len(points))} / {len(points)} envoyé")

print(f"✅ {len(points)} joueurs insérés dans Qdrant avec succès !")

✅ 100 / 1735 envoyé
✅ 200 / 1735 envoyé
✅ 300 / 1735 envoyé
✅ 400 / 1735 envoyé
✅ 500 / 1735 envoyé
✅ 600 / 1735 envoyé
✅ 700 / 1735 envoyé
✅ 800 / 1735 envoyé
✅ 900 / 1735 envoyé
✅ 1000 / 1735 envoyé
✅ 1100 / 1735 envoyé
✅ 1200 / 1735 envoyé
✅ 1300 / 1735 envoyé
✅ 1400 / 1735 envoyé
✅ 1500 / 1735 envoyé
✅ 1600 / 1735 envoyé
✅ 1700 / 1735 envoyé
✅ 1735 / 1735 envoyé
✅ 1735 joueurs insérés dans Qdrant avec succès !


In [25]:
qdrant_client.create_payload_index(
    collection_name="ragscout_players",
    field_name="position_std",
    field_schema=PayloadSchemaType.KEYWORD,
)
qdrant_client.create_payload_index(
    collection_name="ragscout_players",
    field_name="league",
    field_schema=PayloadSchemaType.KEYWORD,
)
qdrant_client.create_payload_index(
    collection_name="ragscout_players",
    field_name="season",
    field_schema=PayloadSchemaType.INTEGER,
)
qdrant_client.create_payload_index(
    collection_name="ragscout_players",
    field_name="age",
    field_schema=PayloadSchemaType.INTEGER,
)
qdrant_client.create_payload_index(
    collection_name="ragscout_players",
    field_name="age_bucket",
    field_schema=PayloadSchemaType.KEYWORD,
)

UpdateResult(operation_id=63, status=<UpdateStatus.COMPLETED: 'completed'>)

In [26]:
import re
import numpy as np
from rank_bm25 import BM25Okapi  # pip install rank-bm25

_WORD_RE = re.compile(r"\w+", re.UNICODE)

def _tok(s: str):
    return _WORD_RE.findall((s or "").lower())

def _normalize_0_1(arr):
    arr = np.asarray(arr, dtype=float)
    if arr.size == 0:
        return arr
    mn, mx = float(np.min(arr)), float(np.max(arr))
    if mx - mn < 1e-12:
        return np.zeros_like(arr)  # tous égaux => neutre
    return (arr - mn) / (mx - mn + 1e-9)

def _bm25_scores(query: str, docs: list[str]) -> np.ndarray:
    corpus_tokens = [_tok(d) for d in docs]
    bm25 = BM25Okapi(corpus_tokens)
    return np.array(bm25.get_scores(_tok(query)))

def extract_profil_type(summary: str) -> str | None:
    if not summary:
        return None
    
    m = re.search(r"Profil-type\s*:\s*(.+)", summary, flags=re.IGNORECASE)
    return m.group(1).strip() if m else None

In [32]:
# Détection simple FR/EN -> position_std
POS_PATTERNS = [
    (r"\b(gardien|goalkeeper|keeper|gb)\b", "GK"),
    (r"\b(défenseur central|central defender|centre[- ]back|dc)\b", "DF"),
    (r"\b(lat[eé]ral|full[- ]?back|back)\b", "DF"),
    (r"\b(milieu d[eé]fensif|6\b|defensive midfielder|dm)\b", "DM"),
    (r"\b(milieu (central|relayeur)|8\b|central midfielder|cm)\b", "CM"),
    (r"\b(meneur|num[eé]ro 10|playmaker|am)\b", "AM"),
    (r"\b(ailier|wing(er)?|wide)\b", "AM"),
    (r"\b(avant[- ]centre|but(e)ur|striker|9\b|st)\b", "ST"),
]

LEAGUE_MAP = {
    "premier league": "Premier League",
    "ligue 1": "Ligue 1",
    "la liga": "La Liga",
    "bundesliga": "Bundesliga",
    "serie a": "Serie A",
}

def infer_intent_from_query(query: str):
    q = query.lower()
    # position
    pos = None
    for pat, code in POS_PATTERNS:
        if re.search(pat, q):
            pos = code
            break

    # side (gauche/droite) -> seulement boost, à moins que tu aies un champ 'side'
    side = "L" if re.search(r"\b(gauche|left)\b", q) else ("R" if re.search(r"\b(droite|right)\b", q) else None)

    # footed
    footed = "L" if re.search(r"\b(gaucher|left[- ]?foot(ed)?)\b", q) else ("R" if re.search(r"\b(droitier|right[- ]?foot(ed)?)\b", q) else None)

    # âge: U23, U21, "moins de 25", "<= 25"
    age_max = None
    m = re.search(r"u(\d{2})", q)
    if m:
        age_max = int(m.group(1))
    else:
        m = re.search(r"(?:moins de|under|<=)\s*(\d{2})", q)
        if m:
            age_max = int(m.group(1))

    # ligue
    league = None
    for k, v in LEAGUE_MAP.items():
        if k in q:
            league = v
            break

    return {
        "position_std": pos,
        "side": side,
        "footed": footed,
        "age_max": age_max,
        "league": league,
    }

def make_qdrant_filter(intent: dict) -> Filter | None:
    must = []
    # On filtre uniquement sur position si détectée (filtre léger mais efficace)
    if intent.get("position_std"):
        must.append(
            FieldCondition(
                key="position_std",
                match=MatchAny(any=[intent["position_std"]])
            )
        )
    
    if not must:
        return None
    return Filter(must=must)

In [35]:
query = 'Ailier dynamique, des buts et passe décisives de son equipe, fort à la finition. Rapide, dribbleur, avec pas mal de passes clés'
intent = infer_intent_from_query(query)
qdrant_filter = make_qdrant_filter(intent)

query_vector = embedding_model.encode(
    query,
    normalize_embeddings = True
).tolist()

dense_top_n = min(100, max(5 * 5, 50))

results = qdrant_client.query_points(
    collection_name=collection_name,
    query=query_vector,
    limit=dense_top_n,
    with_payload=True,
    query_filter=qdrant_filter
)

# 3) Préparer les candidats
candidates = []
for point in results.points:
    payload = point.payload or {}
    name = payload.get("player", "Nom inconnu")
    summary = payload.get("summary", "Aucune description disponible")
    profil_type = extract_profil_type(summary) or ""

    # point.score = similarité (Cosine) si ta collection est en distance 'Cosine'
    candidates.append({
        "name": name,
        "profil_type": profil_type,
        "summary": summary,
        "full_summary": summary,
        "similarity_score_raw": float(point.score),
        "position_std": payload.get("position_std", "UNK"),
        "league": payload.get("league"),
        "age": payload.get("age"),
        "age_bucket": payload.get("age_bucket")
    })

# 4) Construire les documents pour BM25 (profil_type + summary)
bm25_docs = [
    f"{c['profil_type']} {c['summary']}".strip() for c in candidates
]
bm25_scores = _bm25_scores(query, bm25_docs)

# 5) Normaliser les deux signaux pour fusion
dense_scores = np.array([c["similarity_score_raw"] for c in candidates], dtype=float)
dense_norm = _normalize_0_1(dense_scores)
bm25_norm  = _normalize_0_1(bm25_scores)

# 6) Fusion (réglable) — commence avec alpha=0.75
alpha = 0.75
fused = alpha * dense_norm + (1.0 - alpha) * bm25_norm

boosts = np.zeros_like(fused)
for i, c in enumerate(candidates):
    b = 0.0
    # petit bonus si position_std matche (utile si pas filtré, ou en plus du filtre)
    if intent.get("position_std") and c["position_std"] == intent["position_std"]:
        b += 0.03
    # bonus si ligue mentionnée
    if intent.get("league") and c["league"] == intent["league"]:
        b += 0.02
    # bonus si âge <= age_max
    if intent.get("age_max") and c.get("age") is not None and c["age"] <= intent["age_max"]:
        b += 0.02
    boosts[i] = b

fused = fused + boosts

# 7) Réordonner par score fusionné décroissant
order = np.argsort(-fused)
ranked_players = []
for i in order[:5]:
    c = candidates[i]
    ranked_players.append({
        "name": c["name"],
        "profil_type": c["profil_type"],
        "position_std": c.get("position_std", "UNK"),
        "league": c.get("league"),
        "age": c.get("age"),
        "age_bucket": c.get("age_bucket"),
        "summary": c["summary"],        # tronquage si tu veux
        "full_summary": c["full_summary"],
        # on expose les scores pour transparence / debug UI
        "score_fused": float(fused[i]),
        "score_dense": float(dense_norm[i]),
        "score_bm25":  float(bm25_norm[i]),
        "similarity_score": float(c["similarity_score_raw"]),  # brut Qdrant
    })

for ranked_player in ranked_players:
    print(f"🔹 {ranked_player['name']}")
    print(f"Résumé : {ranked_player['summary']}")
    print(f"Score : {ranked_player['similarity_score']:.3f}")
    print(f"Score fused : {ranked_player['score_fused']:.3f}")
    print(f"Score dense : {ranked_player['score_dense']:.3f}")
    print(f"Score BM25 : {ranked_player['score_bm25']:.3f}")
    print("---")

  return forward_call(*args, **kwargs)


🔹 Evann Guessand
Résumé : Le joueur évolue principalement en tant qu'attaquant, avec une capacité à jouer également au milieu de terrain. Son style de jeu se caractérise par une forte implication offensive, avec une tendance à se projeter vers l'avant et à participer activement aux phases de construction. Il montre une bonne capacité à conserver le ballon et à effectuer des appels dans la profondeur, tout en étant impliqué dans le pressing défensif.

Parmi ses principales qualités, on note un bon volume de jeu, avec une présence notable dans les duels et une mobilité qui lui permet de se démarquer. Sur le plan technique, il se distingue par sa capacité à réaliser des passes précises, notamment dans les zones clés, et par sa créativité dans le jeu. Sa finition est également un atout, avec un nombre significatif de buts inscrits. Mentalement, il fait preuve d'une bonne concentration et d'une discipline tactique, ce qui lui permet de s'adapter aux exigences du jeu.

Cependant, il pourrait