<a href="https://colab.research.google.com/github/Amik24/semantic-analysis-project/blob/Ikram_notebooks/scoring_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# code/scoring.py
# ---------------------------------------------
# Rôle : fonctions de scoring pour compétences et métiers.

from typing import Dict, List, Literal
import numpy as np
import pandas as pd
from sentence_transformers import util
import torch

# ---------- COMPETENCE SCORING (MAX vs AVG) ----------

def compute_comp_scores(
    user_emb: torch.Tensor,
    comp_emb: torch.Tensor,
    comp_ids: List[str],
    comp_texts: List[str],
    cid2block: Dict[str, str],
    mode: Literal["max", "avg"] = "avg"
) -> pd.DataFrame:
    """
    Compute one cosine-similarity score per competency.

    Args:
        user_emb: Embeddings of the user's answers, one row per answer
            (n_inputs x d). In 'avg' mode a single 1-D embedding (d,) is
            also accepted and treated as one answer.
        comp_emb: Embeddings of the competencies (n_comp x d), in the same
            order as ``comp_ids`` and ``comp_texts``.
        comp_ids: Competency identifiers, one per row of ``comp_emb``.
        comp_texts: Competency descriptions, aligned with ``comp_ids``.
        cid2block: Mapping from competency id to its block name. Every id
            in ``comp_ids`` must be present.
        mode: 'max' keeps, for each competency, the best similarity over
            all answers (very sensitive); 'avg' first averages the answers
            into a single profile vector and scores that (more stable).

    Returns:
        DataFrame with columns 'CompetencyID', 'CompetencyText',
        'BlockName' and 'Score', sorted by 'Score' in descending order with
        a fresh 0..n-1 index.

    Raises:
        ValueError: If ``mode`` is neither 'max' nor 'avg'.
        KeyError: If an id from ``comp_ids`` is missing from ``cid2block``.
    """
    # Validate first so a bad mode is reported before any tensor work.
    if mode not in ("max", "avg"):
        raise ValueError("mode must be 'max' or 'avg'")

    if mode == "max":
        sim = util.cos_sim(user_emb, comp_emb)          # (n_inputs x n_comp)
        per_comp = sim.max(dim=0).values
    else:
        # A lone answer may arrive as a 1-D vector; without promoting it to
        # a single-row matrix, mean(dim=0) would collapse the embedding
        # itself to one number instead of averaging over answers.
        answers = user_emb.unsqueeze(0) if user_emb.dim() == 1 else user_emb
        profile = answers.mean(dim=0, keepdim=True)     # (1 x d)
        sim = util.cos_sim(profile, comp_emb)           # (1 x n_comp)
        per_comp = sim.squeeze(0)

    # detach() lets the conversion succeed even when the embeddings are
    # still attached to an autograd graph; it is a no-op otherwise.
    scores = per_comp.detach().cpu().numpy()

    comp_df = pd.DataFrame({
        "CompetencyID":   comp_ids,
        "CompetencyText": comp_texts,
        "BlockName":      [cid2block[cid] for cid in comp_ids],
        "Score":          scores,
    })
    return comp_df.sort_values("Score", ascending=False).reset_index(drop=True)

def block_coverage(comp_df: pd.DataFrame) -> pd.Series:
    """
    Average competency score per block.

    Groups ``comp_df`` by its 'BlockName' column, takes the mean of the
    'Score' column within each group, and returns the result as a Series
    indexed by block name, sorted from highest to lowest mean.
    """
    scores_by_block = comp_df.groupby("BlockName")["Score"]
    block_means = scores_by_block.mean()
    return block_means.sort_values(ascending=False)

# ---------- JOB SCORING (Top-K + Mean fallback) ----------

def score_job_topk(required_ids: List[str], score_map: Dict[str, float], k: int = 3) -> float:
    """
    Mean of the K best competency scores among those a job requires.

    Rewards strong points, which helps when the user gave few answers.

    Args:
        required_ids: Competency ids required by the job.
        score_map: Mapping from competency id to its score.
        k: Number of top scores to average; must be at least 1. If fewer
            than ``k`` required competencies have a score, all available
            scores are averaged.

    Returns:
        The mean of the top-``k`` scores, or 0.0 when none of the required
        ids appears in ``score_map``. Ids absent from ``score_map`` are
        ignored (they are not counted as zero).

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # k == 0 would average an empty slice (NaN) and a negative k would
    # silently drop the *lowest* scores via slicing, so reject both.
    if k < 1:
        raise ValueError("k must be a positive integer")

    known = [score_map[cid] for cid in required_ids if cid in score_map]
    if not known:
        return 0.0

    # Slicing already copes with len(known) < k, no special case needed.
    best = sorted(known, reverse=True)[:k]
    return float(np.mean(best))

def score_job_mean(required_ids: List[str], score_map: Dict[str, float]) -> float:
    """
    Baseline scorer: plain mean over the job's required competencies.

    Only ids present in ``score_map`` contribute; unknown ids are skipped.
    Returns 0.0 when no required id has a score.
    """
    known_scores = [score_map[cid] for cid in required_ids if cid in score_map]
    if not known_scores:
        return 0.0
    return float(np.mean(known_scores))

def rank_jobs(
    jobs_df: pd.DataFrame,
    comp_df: pd.DataFrame,
    method: Literal["topk", "mean"] = "topk",
    top_k: int = 3
) -> pd.DataFrame:
    """
    Score every job and return the jobs ordered from best to worst match.

    A copy of ``jobs_df`` gets a 'JobScore' column, computed from each
    row's 'RequiredCompetencies' value, and is then sorted by that column
    in descending order with a fresh index. ``jobs_df`` itself is not
    modified.

    Expected columns:
        jobs_df: ['JobID', 'JobTitle', 'RequiredCompetencies']
        comp_df: ['CompetencyID', 'Score']

    ``method`` selects the scorer: 'topk' uses score_job_topk with
    k=``top_k``; 'mean' uses score_job_mean. Any other value raises
    ValueError.
    """
    # Lookup table: competency id -> score.
    score_map = dict(zip(comp_df["CompetencyID"], comp_df["Score"]))

    if method not in ("topk", "mean"):
        raise ValueError("method must be 'topk' or 'mean'")

    def job_score(ids):
        # Dispatch to the scorer chosen by ``method``.
        if method == "topk":
            return score_job_topk(ids, score_map, k=top_k)
        return score_job_mean(ids, score_map)

    scored = jobs_df.copy()
    scored["JobScore"] = scored["RequiredCompetencies"].apply(job_score)
    by_score = scored.sort_values("JobScore", ascending=False)
    return by_score.reset_index(drop=True)
