In [1]:
# -*- coding: utf-8 -*-
"""
Assignment pipeline using min-cost flow.

Features:
- Ordered and weighted preference modes.
- Input validation and deterministic behavior.
- Optional interpretation of weighted prefs as "higher is better".
- Optional inclusion of an "__NA__" bucket when capacity is insufficient.
- CSV outputs plus quick satisfaction statistics.
- IPython display fallback to plain text when not available.
"""
from __future__ import annotations

import math
import warnings
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Tuple

import networkx as nx
import pandas as pd

# Global pandas display configuration, applied as an import-time side effect:
# print DataFrames without truncating rows, columns or cell contents.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# NOTE(review): a width of 0 presumably makes pandas fall back to the detected
# terminal width - confirm against the pandas options documentation.
pd.set_option("display.width", 0)
pd.set_option("display.max_colwidth", None)


def resolve_data_dir() -> Path:
    """
    Locate the project's ``data`` directory, creating it when missing.

    - When ``__file__`` is defined (regular script/module), the directory is
      ``<parent of the script's folder>/data``.
    - When ``__file__`` is undefined (notebook or REPL), the directory is
      ``<cwd>/data``, unless the current directory is named ``src`` and a
      sibling ``../data`` already exists, in which case ``../data`` is used.

    Returns
    -------
    Path
        Absolute path of the (now existing) data directory.
    """
    try:
        script_path = Path(__file__)
    except NameError:
        # Notebook / REPL session: fall back to the working directory.
        anchor = Path.cwd()
        inside_src = anchor.name == "src"
        if inside_src and (anchor.parent / "data").exists():
            anchor = anchor.parent
    else:
        # Script run: go one level above the folder containing the script.
        anchor = script_path.resolve().parent.parent

    target = anchor.joinpath("data").resolve()
    target.mkdir(parents=True, exist_ok=True)
    return target


def load_projects_df(path: Path) -> pd.DataFrame:
    """
    Load projects.csv and normalise it to the columns id, label, capacity.

    Expected columns:
      - id (required, unique once surrounding whitespace is stripped)
      - label (optional; a missing column or a blank cell defaults to the id)
      - capacity (optional; a missing column, a blank cell or a non-numeric
        value defaults to 1; the value is cast to int and must be >= 0)

    Rows whose id is blank after stripping (e.g. a trailing ",," line left by
    a spreadsheet export) are dropped with a RuntimeWarning. Keeping them would
    create a real project named "" that could receive students.

    Parameters
    ----------
    path : Path
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        Columns ``id`` (str), ``label`` (str), ``capacity`` (int), with a
        fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the 'id' column is missing, if ids are duplicated, or if any
        capacity is negative.
    """
    df = pd.read_csv(path, dtype=str).fillna("")
    if "id" not in df.columns:
        raise ValueError("projects.csv must contain column 'id'.")
    if "label" not in df.columns:
        df["label"] = df["id"]
    if "capacity" not in df.columns:
        df["capacity"] = "1"

    df["id"] = df["id"].str.strip()
    df["label"] = df["label"].str.strip()

    # Discard rows without an identifier instead of turning them into a
    # phantom project with id "".
    blank_id = df["id"] == ""
    if blank_id.any():
        warnings.warn(
            f"projects.csv: {int(blank_id.sum())} row(s) with an empty 'id' ignored.",
            RuntimeWarning,
        )
        df = df[~blank_id].copy()

    # Honour the documented default: a blank label falls back to the id.
    df["label"] = df["label"].where(df["label"] != "", df["id"])

    df["capacity"] = (
        pd.to_numeric(df["capacity"].replace("", "1"), errors="coerce")
        .fillna(1)
        .astype(int)
    )

    # Validations
    duplicated = df["id"].duplicated()
    if duplicated.any():
        dups = df.loc[duplicated, "id"].tolist()
        raise ValueError(f"projects.csv: 'id' must be unique. Duplicates: {dups}")
    negative = df["capacity"] < 0
    if negative.any():
        negs = df.loc[negative, "id"].tolist()
        raise ValueError(
            f"projects.csv: 'capacity' must be >= 0. Offending projects: {negs}"
        )

    return df[["id", "label", "capacity"]].reset_index(drop=True)


def _split_semicolon(s: str) -> List[str]:
    """Split *s* on ';' and return the stripped, non-empty pieces in order.

    A falsy *s* (empty string or None) yields an empty list.
    """
    pieces: List[str] = []
    for chunk in (s or "").split(";"):
        cleaned = chunk.strip()
        if cleaned:
            pieces.append(cleaned)
    return pieces


def _parse_weighted_prefs(raw: str, valid: set[str]) -> Dict[str, float]:
    """
    Parse weighted preferences in format 'p1:1.5;p2:3' -> {'p1': 1.5, 'p2': 3.0}

    Tokens are separated by ';' and split on their first ':'. A token is
    skipped with a RuntimeWarning when it:
      - has no ':' separator,
      - names a project that is not in *valid*,
      - carries a weight that cannot be parsed as a float, or
      - carries a non-finite weight ('nan', 'inf', '-inf'). ``float()``
        accepts those spellings, but they are unusable as edge costs.

    If a project appears several times, the last accepted weight wins.

    Parameters
    ----------
    raw : str
        Raw preference string; a falsy value yields an empty result.
    valid : set[str]
        Known project ids.

    Returns
    -------
    Dict[str, float]
        Mapping project id -> finite weight.
    """
    result: Dict[str, float] = {}
    # Same tokenisation as _split_semicolon: split on ';', strip, drop blanks.
    for tok in (part.strip() for part in (raw or "").split(";")):
        if not tok:
            continue
        pid, sep, w = tok.partition(":")
        if not sep:
            warnings.warn(
                f"Malformed weighted preference (missing ':') ignored: '{tok}'",
                RuntimeWarning,
            )
            continue
        pid = pid.strip()
        if pid not in valid:
            warnings.warn(
                f"Unknown project in weighted prefs ignored: '{pid}'",
                RuntimeWarning,
            )
            continue
        try:
            value = float(w.strip())
        except ValueError:
            warnings.warn(
                f"Non-numeric weight ignored for '{pid}': '{w}'", RuntimeWarning
            )
            continue
        if not math.isfinite(value):
            warnings.warn(
                f"Non-finite weight ignored for '{pid}': '{w}'", RuntimeWarning
            )
            continue
        result[pid] = value
    return result


def load_choices_df(path: Path, valid_projects: Iterable[str]) -> pd.DataFrame:
    """
    Load student-choices.csv and normalise every row into one entry.

    Expected columns:
      - student (required); rows whose value is blank after stripping are
        dropped
      - prefs (required), e.g. "p1;p2;p3" or "p1:0;p2:1.5"
      - weight (optional -> default 1; non-numeric -> 1; values < 1 become 1)
      - names  (optional); if provided, must contain exactly `weight` names
        separated by ';'. Otherwise (or when the count does not match, with
        a RuntimeWarning) names are auto-generated from 'student'.

    Mode detection, per row:
      - 'weighted' when at least one token contains ':' AND at least one
        'project:weight' token is valid. ``prefs_ordered`` is then the
        projects sorted by (weight, project id).
      - 'ordered' otherwise. ``prefs_ordered`` keeps the known projects in
        the order given, de-duplicated on first occurrence. Downstream
        consumers derive a rank from the list position with last-wins
        semantics, so a project listed twice would otherwise be charged its
        worst rank.

    Parameters
    ----------
    path : Path
        Location of the CSV file.
    valid_projects : Iterable[str]
        Known project ids; preferences for other ids are ignored.

    Returns
    -------
    pd.DataFrame
        Columns: key, weight, mode, prefs_ordered, prefs_weighted, names.

    Raises
    ------
    ValueError
        If the 'student' or 'prefs' column is missing.
    """
    df = pd.read_csv(path, dtype=str).fillna("")
    required = {"student", "prefs"}
    if not required.issubset(df.columns):
        raise ValueError(
            "student-choices.csv must contain at least 'student' and 'prefs'."
        )

    df["student"] = df["student"].str.strip()
    df = df[df["student"] != ""].copy()

    if "weight" not in df.columns:
        df["weight"] = "1"
    df["weight"] = pd.to_numeric(df["weight"], errors="coerce").fillna(1).astype(int)
    df.loc[df["weight"] < 1, "weight"] = 1

    valid = set(valid_projects)

    modes: List[str] = []
    prefs_ordered_col: List[List[str]] = []
    prefs_weighted_col: List[Dict[str, float]] = []
    names_col: List[List[str]] = []

    for _, row in df.iterrows():
        key = row["student"]
        w = int(row["weight"])
        raw_prefs = str(row["prefs"])
        tokens = _split_semicolon(raw_prefs)

        # Only attempt the weighted parse when some token looks like 'p:w'.
        weights_map: Dict[str, float] = {}
        if any(":" in tok for tok in tokens):
            weights_map = _parse_weighted_prefs(raw_prefs, valid)

        if weights_map:
            modes.append("weighted")
            # Deterministic ordering by (weight, pid)
            ordered_pairs = sorted(weights_map.items(), key=lambda kv: (kv[1], kv[0]))
            ordered = [pid for pid, _ in ordered_pairs]
        else:
            # Plain ids only (also the fallback when no 'p:w' token was valid).
            modes.append("ordered")
            known = (p for p in tokens if ":" not in p and p in valid)
            # dict.fromkeys keeps the first occurrence and the original order.
            ordered = list(dict.fromkeys(known))
            if not ordered:
                warnings.warn(
                    f"No valid preference found for '{key}'.",
                    RuntimeWarning,
                )
        prefs_ordered_col.append(ordered)
        prefs_weighted_col.append(weights_map)

        raw_names = _split_semicolon(row.get("names", ""))
        if raw_names and len(raw_names) != w:
            warnings.warn(
                f"'names' provided for '{key}' but length != weight "
                f"({len(raw_names)} != {w}). Ignoring provided names.",
                RuntimeWarning,
            )
            raw_names = []
        if not raw_names:
            # Single-person entries keep the bare key; groups get key#1, key#2...
            raw_names = [f"{key}#{i + 1}" if w > 1 else key for i in range(w)]
        names_col.append(raw_names)

    df = df.rename(columns={"student": "key"})
    df["mode"] = modes
    df["prefs_ordered"] = prefs_ordered_col
    df["prefs_weighted"] = prefs_weighted_col
    df["names"] = names_col
    return df[
        ["key", "weight", "mode", "prefs_ordered", "prefs_weighted", "names"]
    ].reset_index(drop=True)


def _common_graph_skeleton(
    projects_df: pd.DataFrame,
    total_students: int,
    unassigned_label: str,
) -> Tuple[nx.DiGraph, str, str, Dict[str, int], List[str], int]:
    """
    Build the sink side of the flow network (projects -> t), and add source/sink
    nodes with correct global demand.

    When the summed project capacity is smaller than *total_students*, an
    extra bucket named *unassigned_label* is appended with exactly the
    missing capacity, so the requested flow is always routable on the sink
    side.

    Parameters
    ----------
    projects_df : pd.DataFrame
        Must provide the columns ``id`` and ``capacity``.
    total_students : int
        Number of individual people to route.
    unassigned_label : str
        Node name of the overflow bucket.

    Returns
    -------
    tuple
        ``(graph, source, sink, cap, proj_ids, flow_target)`` where ``cap``
        maps every sink-side node (overflow bucket included) to its capacity
        and ``proj_ids`` lists those nodes in projects_df order, with the
        overflow bucket last.

    Raises
    ------
    ValueError
        If a project id equals a reserved node name ('_s' / '_t'), or if the
        overflow bucket is needed and its label clashes with a project id or
        a reserved name. Such clashes would silently merge distinct nodes.
        Entry nodes added later by the callers are not checked here.
    """
    s, t = "_s", "_t"
    proj_ids = projects_df["id"].tolist()
    cap = {p: int(c) for p, c in zip(projects_df["id"], projects_df["capacity"])}

    reserved_clashes = [p for p in proj_ids if p in (s, t)]
    if reserved_clashes:
        raise ValueError(
            f"Project ids clash with reserved node names: {reserved_clashes}"
        )

    total_cap = sum(cap.values())
    if total_cap < total_students:
        if unassigned_label in cap or unassigned_label in (s, t):
            raise ValueError(
                f"Unassigned label '{unassigned_label}' clashes with an "
                "existing node name."
            )
        proj_ids.append(unassigned_label)
        cap[unassigned_label] = total_students - total_cap

    g = nx.DiGraph()
    flow_target = min(total_students, sum(cap.values()))
    # NetworkX convention: negative demand = supply (source), positive = sink.
    g.add_node(s, demand=-flow_target)
    g.add_node(t, demand=flow_target)
    for p in proj_ids:
        g.add_node(p, demand=0)
        g.add_edge(p, t, capacity=cap[p], weight=0)
    return g, s, t, cap, proj_ids, flow_target


def build_graph_unweighted(
    entries_df: pd.DataFrame,
    projects_df: pd.DataFrame,
    rank_cost: Optional[List[float]],
    penalty: Optional[float],
    unassigned_label: str = "__NA__",
) -> Tuple[nx.DiGraph, Dict]:
    """
    Build graph for 'ordered' mode.

    Network shape: source -> one node per entry ("e0", "e1", ...) -> every
    sink-side node (projects plus the optional overflow bucket) -> sink.
    Entry edges carry the entry's weight as capacity.

    - rank_cost: list of costs per rank (default: 0 for best, 1 for next, ...).
      If the list is too short, extrapolate linearly with slope=1. An empty
      list is treated as [0] for that extrapolation (it used to raise
      IndexError as soon as an entry had a preference).
    - penalty: cost for any non-listed project, overflow bucket included
      (default = max(rank_cost) + 5, or 10 when rank_cost is empty).

    Returns
    -------
    Tuple[nx.DiGraph, Dict]
        The graph and a metadata dict with keys 's', 't', 'entries', 'cap',
        'flow_target' and 'unassigned'.
    """
    n_people = int(entries_df["weight"].sum())
    if rank_cost is None:
        max_len = int(entries_df["prefs_ordered"].map(len).max()) if len(entries_df) else 1
        rank_cost = list(range(max(1, max_len)))  # [0, 1, 2, ..., L-1]
    if penalty is None:
        penalty = (max(rank_cost) if rank_cost else 5) + 5
    if len(rank_cost) == 0:
        # Keep the extrapolation below well-defined: rank r then costs r.
        rank_cost = [0]

    g, s, t, cap, proj_ids, flow_target = _common_graph_skeleton(
        projects_df, n_people, unassigned_label
    )

    last_rank = len(rank_cost) - 1
    for i, row in entries_df.reset_index(drop=True).iterrows():
        u = f"e{i}"
        group_size = int(row["weight"])
        g.add_node(u, demand=0)
        g.add_edge(s, u, capacity=group_size, weight=0)

        prefs: List[str] = row["prefs_ordered"]
        rank_map = {p: r for r, p in enumerate(prefs)}
        for p in proj_ids:
            r = rank_map.get(p)
            if r is None:
                cost = penalty
            elif r <= last_rank:
                cost = rank_cost[r]
            else:
                # Linear extrapolation past provided rank_cost
                cost = rank_cost[-1] + (r - last_rank)
            g.add_edge(u, p, capacity=group_size, weight=float(cost))

    meta = {
        "s": s,
        "t": t,
        "entries": entries_df,
        "cap": cap,
        "flow_target": flow_target,
        "unassigned": unassigned_label,
    }
    return g, meta


def build_graph_weighted(
    entries_df: pd.DataFrame,
    projects_df: pd.DataFrame,
    penalty: Optional[float],
    unassigned_label: str = "__NA__",
    higher_is_better: bool = False,
) -> Tuple[nx.DiGraph, Dict]:
    """
    Build the flow network for 'weighted' mode.

    Each entry's ``prefs_weighted`` mapping is used directly as edge cost
    (lower is better). With ``higher_is_better=True`` the values are treated
    as scores and converted per entry to costs via ``max_score - score``.

    Any sink-side node missing from an entry's mapping (overflow bucket
    included) costs ``penalty``, which defaults to 10.0 when None.

    Returns
    -------
    Tuple[nx.DiGraph, Dict]
        The graph and a metadata dict with keys 's', 't', 'entries', 'cap',
        'flow_target' and 'unassigned'.
    """
    fallback_cost = 10.0 if penalty is None else penalty
    headcount = int(entries_df["weight"].sum())

    g, s, t, cap, proj_ids, flow_target = _common_graph_skeleton(
        projects_df, headcount, unassigned_label
    )

    for idx, entry in entries_df.reset_index(drop=True).iterrows():
        node = f"e{idx}"
        size = int(entry["weight"])
        g.add_node(node, demand=0)
        g.add_edge(s, node, capacity=size, weight=0)

        costs: Dict[str, float] = dict(entry["prefs_weighted"])
        if higher_is_better and costs:
            # Best score becomes cost 0; every other cost stays positive.
            best = max(costs.values())
            costs = {pid: best - score for pid, score in costs.items()}

        for pid in proj_ids:
            edge_cost = float(costs.get(pid, fallback_cost))
            g.add_edge(node, pid, capacity=size, weight=edge_cost)

    meta = dict(
        s=s,
        t=t,
        entries=entries_df,
        cap=cap,
        flow_target=flow_target,
        unassigned=unassigned_label,
    )
    return g, meta


def solve_min_cost(g: nx.DiGraph) -> Tuple[Dict, float]:
    """
    Solve min-cost flow. Returns (flow_dict, total_cost).

    NetworkX documents that its min-cost-flow routines are not guaranteed to
    work with floating-point weights (round-off can cause problems) and
    recommends scaling them to integers. When the graph carries finite,
    fractional weights, the solve therefore runs on a copy whose weights are
    multiplied by 10**6 and rounded to int. Graphs whose weights are already
    integral are solved unchanged, and graphs containing non-finite weights
    are passed through untouched so NetworkX reports them itself.

    The returned cost is always evaluated on the original graph *g*, i.e. in
    the caller's own cost units. *g* is never modified.

    Parameters
    ----------
    g : nx.DiGraph
        Flow network with 'demand' node attributes and 'capacity'/'weight'
        edge attributes.

    Returns
    -------
    Tuple[Dict, float]
        The flow dictionary ``flow[u][v]`` and the total cost of that flow.
    """
    scale = 10 ** 6  # keeps six decimal places of the original weights

    weights = [float(data.get("weight", 0)) for _, _, data in g.edges(data=True)]
    all_finite = all(math.isfinite(w) for w in weights)
    has_fraction = any(not w.is_integer() for w in weights)

    solver_graph = g
    if all_finite and has_fraction:
        # copy() creates fresh edge-attribute dicts, so g keeps its weights.
        solver_graph = g.copy()
        for _, _, data in solver_graph.edges(data=True):
            data["weight"] = int(round(float(data.get("weight", 0)) * scale))

    flow = nx.min_cost_flow(
        solver_graph, demand="demand", capacity="capacity", weight="weight"
    )
    cost = nx.cost_of_flow(g, flow, weight="weight")
    return flow, float(cost)


def expand_to_individual_rows(flow: Dict, meta: Dict) -> pd.DataFrame:
    """
    Turn the per-entry flow into one row per person.

    For entry number i the outgoing flow of node "e<i>" is read; each unit of
    positive flow consumes the next name from the entry's ``names`` list
    (falling back to "<key>#?" when the list runs out).

    Columns: student, project_id, choice_rank, choice_weight
      - project_id is None for people routed to the unassigned bucket
      - choice_rank is the 1-based position in ``prefs_ordered`` (None if the
        project was not listed or the person is unassigned)
      - choice_weight is the value from ``prefs_weighted`` (None likewise)
    """
    entries_df: pd.DataFrame = meta["entries"]
    na_label = meta["unassigned"]
    records: List[Tuple[str, Optional[str], Optional[int], Optional[float]]] = []

    for idx, entry in entries_df.reset_index(drop=True).iterrows():
        pending = list(entry["names"])
        key = entry["key"]
        ordered = entry.get("prefs_ordered", []) or []
        ranks = {pid: pos for pos, pid in enumerate(ordered, start=1)}
        weights: Dict[str, float] = entry.get("prefs_weighted", {}) or {}

        for target, units in flow[f"e{idx}"].items():
            if not units > 0:
                continue
            assigned = target != na_label
            for _ in range(int(units)):
                person = pending.pop(0) if pending else f"{key}#?"
                if assigned:
                    records.append(
                        (person, target, ranks.get(target), weights.get(target))
                    )
                else:
                    records.append((person, None, None, None))

    return pd.DataFrame(
        records, columns=["student", "project_id", "choice_rank", "choice_weight"]
    )


def build_student_df(
    assign_df: pd.DataFrame, projects_df: pd.DataFrame, mode: str
) -> pd.DataFrame:
    """
    Add a 'project_label' and an 'initial_choice' column to the assignment.

    - project_label: label looked up by project id; falls back to the id
      itself, or to "" when the id is missing.
    - initial_choice:
        * mode == "weighted": "<rank>:<weight>" with the weight shown to
          three decimals; a missing part is left blank ("2:", ":0.500"),
          and the whole string is "" when both are missing.
        * any other mode: the rank as an integer string, or "" if missing.

    The result is sorted by 'student' and re-indexed from 0.
    """
    label_by_id = dict(zip(projects_df["id"], projects_df["label"]))
    out = assign_df.copy()
    id_or_blank = out["project_id"].fillna("")
    out["project_label"] = out["project_id"].map(label_by_id).fillna(id_or_blank)

    weighted = mode == "weighted"

    def _describe(rec: pd.Series) -> str:
        rank = rec.get("choice_rank")
        rank_txt = str(int(rank)) if pd.notna(rank) else ""
        if not weighted:
            return rank_txt
        weight = rec.get("choice_weight")
        weight_txt = f"{weight:.3f}" if pd.notna(weight) else ""
        if rank_txt or weight_txt:
            return f"{rank_txt}:{weight_txt}"
        return ""

    out["initial_choice"] = out.apply(_describe, axis=1)
    return out.sort_values("student").reset_index(drop=True)


def build_project_df(
    assign_df: pd.DataFrame,
    projects_df: pd.DataFrame,
    include_unassigned: str | bool = "auto",
    unassigned_label: str = "__NA__",
) -> pd.DataFrame:
    """
    Aggregate the assignment by project.

    One row is produced per project, in projects_df order, with its label,
    id, headcount ('effectif') and the ';'-joined, alphabetically sorted
    student names. Projects that received nobody get a headcount of 0.

    Students whose ``project_id`` is missing (None/NaN) are the unassigned
    ones and are collected under *unassigned_label*. A pandas groupby would
    drop those missing keys, so the grouping is done explicitly here.

    include_unassigned:
      - True  : always add the 'unassigned_label' row
      - False : never add it
      - 'auto': add only if unassigned exist

    Returns
    -------
    pd.DataFrame
        Columns: project_label, project_id, effectif, students.
    """
    order = projects_df["id"].tolist()
    labels = dict(zip(projects_df["id"], projects_df["label"]))

    grouped: dict = {}
    for pid, student in zip(assign_df["project_id"], assign_df["student"]):
        bucket = unassigned_label if pd.isna(pid) else pid
        grouped.setdefault(bucket, []).append(student)

    rows = []
    for pid in order:
        students = sorted(grouped.get(pid, []))
        rows.append([labels.get(pid, pid), pid, len(students), ";".join(students)])

    add_unassigned = (include_unassigned is True) or (
        include_unassigned == "auto" and unassigned_label in grouped
    )
    if add_unassigned:
        students = sorted(grouped.get(unassigned_label, []))
        rows.append(
            [unassigned_label, unassigned_label, len(students), ";".join(students)]
        )

    return pd.DataFrame(
        rows, columns=["project_label", "project_id", "effectif", "students"]
    )


def write_students_csv(df_students: pd.DataFrame, path: Path) -> None:
    """Write the per-student table to *path* as UTF-8 CSV, without the index.

    Only the columns student, project_id, project_label and initial_choice
    are exported, in that order.
    """
    exported = ["student", "project_id", "project_label", "initial_choice"]
    subset = df_students[exported]
    subset.to_csv(path, encoding="utf-8", index=False)


def write_projects_csv(df_projects: pd.DataFrame, path: Path) -> None:
    """Write the per-project table to *path* as UTF-8 CSV, without the index."""
    csv_options = {"index": False, "encoding": "utf-8"}
    df_projects.to_csv(path, **csv_options)


def satisfaction_stats(df_students: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise how well students were served, as a one-row DataFrame.

    The rank is read from 'initial_choice' (the part before ':' if any);
    values that are not numeric count as "no rank".

    Columns:
      - n           : number of rows
      - assigned    : rows with a project_id
      - unassigned  : rows without a project_id
      - median_rank : median of the available ranks
      - p_top1      : share of ALL rows with rank == 1
      - p_top3      : share of ALL rows with rank <= 3
    The last three are NaN when the input has no rows.
    """
    ranks = pd.to_numeric(
        df_students["initial_choice"].str.split(":").str.get(0),
        errors="coerce",
    )
    total = len(df_students)
    has_project = df_students["project_id"].notna()
    nan = float("nan")

    if total:
        median_rank = float(ranks.median())
        share_first = float((ranks == 1).mean())
        share_top3 = float((ranks <= 3).mean())
    else:
        median_rank = share_first = share_top3 = nan

    summary = {
        "n": total,
        "assigned": int(has_project.sum()),
        "unassigned": int((~has_project).sum()),
        "median_rank": median_rank,
        "p_top1": share_first,
        "p_top3": share_top3,
    }
    return pd.DataFrame([summary])


def run_pipeline_unweighted(
    data_dir: Path,
    rank_cost: Optional[List[float]] = None,
    penalty: Optional[float] = None,
    unassigned_label: str = "__NA__",
    write_outputs: bool = True,
) -> Tuple[pd.DataFrame, pd.DataFrame, float, Dict[str, Path]]:
    """
    Run the ordered-preferences pipeline end-to-end.

    Steps: load projects.csv and student-choices.csv from *data_dir*, build
    the ordered-mode flow network, solve it, expand the flow to one row per
    student, build the student and project tables and, when *write_outputs*
    is true, write both tables as CSV into *data_dir*.

    Returns
    -------
    tuple
        ``(df_students, df_projects, total_cost, paths)`` where ``paths``
        maps 'projects', 'choices', 'student_out' and 'project_out' to the
        file locations used.
    """
    file_names = (
        ("projects", "projects.csv"),
        ("choices", "student-choices.csv"),
        ("student_out", "assignment_student_unweighted.csv"),
        ("project_out", "assignment_project_unweighted.csv"),
    )
    paths = {role: data_dir / file_name for role, file_name in file_names}

    # Inputs
    projects_df = load_projects_df(paths["projects"])
    known_ids = projects_df["id"].tolist()
    choices_df = load_choices_df(paths["choices"], known_ids)

    # Model + solve
    graph, meta = build_graph_unweighted(
        choices_df, projects_df, rank_cost, penalty, unassigned_label
    )
    flow, cost = solve_min_cost(graph)

    # Reporting tables
    assign_df = expand_to_individual_rows(flow, meta)
    df_students = build_student_df(assign_df, projects_df, mode="unweighted")
    df_projects = build_project_df(
        assign_df,
        projects_df,
        include_unassigned="auto",
        unassigned_label=unassigned_label,
    )

    if write_outputs:
        write_students_csv(df_students, paths["student_out"])
        write_projects_csv(df_projects, paths["project_out"])
    return df_students, df_projects, cost, paths


def run_pipeline_weighted(
    data_dir: Path,
    penalty: Optional[float] = None,
    unassigned_label: str = "__NA__",
    write_outputs: bool = True,
    higher_is_better: bool = False,
) -> Tuple[pd.DataFrame, pd.DataFrame, float, Dict[str, Path]]:
    """
    Run the weighted-preferences pipeline end-to-end.

    Steps: load projects.csv and student-choices.csv from *data_dir*, build
    the weighted-mode flow network (optionally reading weights as scores via
    *higher_is_better*), solve it, expand the flow to one row per student,
    build the student and project tables and, when *write_outputs* is true,
    write both tables as CSV into *data_dir*.

    Returns
    -------
    tuple
        ``(df_students, df_projects, total_cost, paths)`` where ``paths``
        maps 'projects', 'choices', 'student_out' and 'project_out' to the
        file locations used.
    """
    file_names = (
        ("projects", "projects.csv"),
        ("choices", "student-choices.csv"),
        ("student_out", "assignment_student_weighted.csv"),
        ("project_out", "assignment_project_weighted.csv"),
    )
    paths = {role: data_dir / file_name for role, file_name in file_names}

    # Inputs
    projects_df = load_projects_df(paths["projects"])
    known_ids = projects_df["id"].tolist()
    choices_df = load_choices_df(paths["choices"], known_ids)

    # Model + solve
    graph, meta = build_graph_weighted(
        choices_df,
        projects_df,
        penalty,
        unassigned_label,
        higher_is_better=higher_is_better,
    )
    flow, cost = solve_min_cost(graph)

    # Reporting tables
    assign_df = expand_to_individual_rows(flow, meta)
    df_students = build_student_df(assign_df, projects_df, mode="weighted")
    df_projects = build_project_df(
        assign_df,
        projects_df,
        include_unassigned="auto",
        unassigned_label=unassigned_label,
    )

    if write_outputs:
        write_students_csv(df_students, paths["student_out"])
        write_projects_csv(df_projects, paths["project_out"])
    return df_students, df_projects, cost, paths


def run_both(
    data_dir: Optional[Path] = None,
    rank_cost: Optional[List[float]] = None,
    unweighted_penalty: Optional[float] = None,
    weighted_penalty: Optional[float] = None,
    unassigned_label: str = "__NA__",
    write_outputs: bool = True,
    higher_is_better: bool = False,
) -> Dict[str, object]:
    """
    Run the ordered pipeline, then the weighted one, on the same data folder.

    When *data_dir* is not given it is obtained from resolve_data_dir().

    Returns
    -------
    Dict[str, object]
        For each variant ('unweighted', 'weighted') the keys
        'students_<variant>', 'projects_<variant>', 'cost_<variant>' and
        'paths_<variant>'.
    """
    if not data_dir:
        data_dir = resolve_data_dir()

    # Options shared by both pipelines.
    shared = {
        "data_dir": data_dir,
        "unassigned_label": unassigned_label,
        "write_outputs": write_outputs,
    }

    unweighted = run_pipeline_unweighted(
        rank_cost=rank_cost,
        penalty=unweighted_penalty,
        **shared,
    )
    weighted = run_pipeline_weighted(
        penalty=weighted_penalty,
        higher_is_better=higher_is_better,
        **shared,
    )

    stu_unw, prj_unw, cost_unw, paths_unw = unweighted
    stu_w, prj_w, cost_w, paths_w = weighted
    return {
        "students_unweighted": stu_unw,
        "projects_unweighted": prj_unw,
        "cost_unweighted": cost_unw,
        "paths_unweighted": paths_unw,
        "students_weighted": stu_w,
        "projects_weighted": prj_w,
        "cost_weighted": cost_w,
        "paths_weighted": paths_w,
    }


if __name__ == "__main__":
    # Script entry point: run both pipelines on the resolved data folder,
    # then print costs, written file names, result tables and quick stats.
    DATA_DIR = resolve_data_dir()
    results = run_both(
        data_dir=DATA_DIR,
        rank_cost=None,
        unweighted_penalty=None,
        weighted_penalty=10.0,
        unassigned_label="__NA__",
        write_outputs=True,
        higher_is_better=False,  # set True if weights mean "preference" (higher = better)
    )
    print("Cost (unweighted):", results["cost_unweighted"])
    print("Cost (weighted):  ", results["cost_weighted"])

    written = [
        results[group][kind].name
        for group in ("paths_unweighted", "paths_weighted")
        for kind in ("student_out", "project_out")
    ]
    print("Written:", " , ".join(written))

    # IPython is optional; fallback to plain text if not available
    try:
        from IPython.display import display  # type: ignore
    except Exception:
        display = None  # type: ignore

    def _show(df: pd.DataFrame, title: str) -> None:
        """Print a titled table, using IPython's rich display when present."""
        print(f"\n=== {title} ===")
        if not display:
            print(df.to_string(index=False))
            return
        display(df)  # type: ignore

    tables = (
        ("students_unweighted", "Unweighted — Students"),
        ("projects_unweighted", "Unweighted — Projects"),
        ("students_weighted", "Weighted — Students"),
        ("projects_weighted", "Weighted — Projects"),
    )
    for result_key, heading in tables:
        _show(results[result_key], heading)

    # Quick satisfaction stats (best effort: a failure only triggers a warning)
    try:
        for result_key, heading in (
            ("students_unweighted", "Stats — Unweighted"),
            ("students_weighted", "Stats — Weighted"),
        ):
            print(f"\n=== {heading} ===")
            print(satisfaction_stats(results[result_key]).to_string(index=False))
    except Exception as exc:
        warnings.warn(f"Could not compute stats: {exc}", RuntimeWarning)


Cost (unweighted): 14.0
Cost (weighted):   5.563000000000001
Written: assignment_student_unweighted.csv , assignment_project_unweighted.csv , assignment_student_weighted.csv , assignment_project_weighted.csv

=== Unweighted — Students ===


Unnamed: 0,student,project_id,choice_rank,choice_weight,project_label,initial_choice
0,Aaron HUMBERT,P1,1,0.1,Data Cleaning,1
1,Adam FOURNIER,P5,1,0.1,Time Series Forecasting,1
2,Agathe DUPUIS,P10,1,0.1,Computer Vision,1
3,Alice MARTIN,P1,2,0.133,Data Cleaning,2
4,Amandine RENARD,P9,1,0.1,Web Analytics,1
5,Ambre FABRE,P4,1,0.1,NLP Chatbot,1
6,Anaïs LÉFÈVRE,P4,1,0.1,NLP Chatbot,1
7,Antoine PERROT,P2,1,0.1,Recommender System,1
8,Arthur DAVID,P7,1,0.1,Reinforcement Learning,1
9,Aya MARTINEZ,P10,1,0.1,Computer Vision,1



=== Unweighted — Projects ===


Unnamed: 0,project_label,project_id,effectif,students
0,Data Cleaning,P1,6,Aaron HUMBERT;Alice MARTIN;Eliott CHARPENTIER;Lina MOREL;Lucas BERNARD;Lucie NOËL
1,Recommender System,P2,6,Antoine PERROT;Emma DUBOIS;Ethan GIRARD;Hugo THOMAS;Marius MATHIEU;Nina BARBIER
2,Image Classification,P3,6,Iris BLANCHARD;Louis DURAND;Léa RICHARD;Manon LEROY;Maël ANDRÉ;Valentin CHARLES
3,NLP Chatbot,P4,6,Ambre FABRE;Anaïs LÉFÈVRE;Chloé ROBERT;Gabriel PETIT;Maxime MULLER;Romane GONZALEZ
4,Time Series Forecasting,P5,6,Adam FOURNIER;Camille SIMON;Jeanne BERTRAND;Nathan MOREAU;Noé ROUX;Zoé VINCENT
5,Anomaly Detection,P6,6,Mila DUPONT;Nino OLIVIER;Pauline GAUTHIER;Raphaël MERCIER;Salomé COUSIN;Timéo PIRES
6,Reinforcement Learning,P7,6,Arthur DAVID;Inès GARCIA;Jules LAURENT;Sarah LEFEBVRE;Tom MICHEL;Émile REY
7,Optimization Engine,P8,5,Justine LOPES;Marion DUMAS;Victor LAMBERT;Yanis RENAUD;Élise BONNET
8,Web Analytics,P9,3,Amandine RENARD;Noa NAVARRO;Sacha FRANÇOIS
9,Computer Vision,P10,3,Agathe DUPUIS;Aya MARTINEZ;Oscar BOYER



=== Weighted — Students ===


Unnamed: 0,student,project_id,choice_rank,choice_weight,project_label,initial_choice
0,Adam FOURNIER,P5,1,0.1,Time Series Forecasting,1:0.100
1,Agathe DUPUIS,P10,1,0.1,Computer Vision,1:0.100
2,Alice MARTIN,P1,2,0.133,Data Cleaning,2:0.133
3,Amandine RENARD,P9,1,0.1,Web Analytics,1:0.100
4,Ambre FABRE,P4,1,0.1,NLP Chatbot,1:0.100
5,Anaïs LÉFÈVRE,P4,1,0.1,NLP Chatbot,1:0.100
6,Arthur DAVID,P7,1,0.1,Reinforcement Learning,1:0.100
7,Aya MARTINEZ,P10,1,0.1,Computer Vision,1:0.100
8,Baptiste LEGRAND,P11,1,0.1,Graph Mining,1:0.100
9,Camille SIMON,P5,2,0.133,Time Series Forecasting,2:0.133



=== Weighted — Projects ===


Unnamed: 0,project_label,project_id,effectif,students
0,Data Cleaning,P1,4,Alice MARTIN;Lina MOREL;Lucas BERNARD;Lucie NOËL
1,Recommender System,P2,1,Emma DUBOIS
2,Image Classification,P3,3,Louis DURAND;Léa RICHARD;Manon LEROY
3,NLP Chatbot,P4,6,Ambre FABRE;Anaïs LÉFÈVRE;Hugo THOMAS;Maxime MULLER;Maël ANDRÉ;Valentin CHARLES
4,Time Series Forecasting,P5,6,Adam FOURNIER;Camille SIMON;Jeanne BERTRAND;Nathan MOREAU;Noé ROUX;Zoé VINCENT
5,Anomaly Detection,P6,9,Chloé ROBERT;Ethan GIRARD;Gabriel PETIT;Mila DUPONT;Nino OLIVIER;Pauline GAUTHIER;Raphaël MERCIER;Salomé COUSIN;Timéo PIRES
6,Reinforcement Learning,P7,6,Arthur DAVID;Inès GARCIA;Jules LAURENT;Marion DUMAS;Sarah LEFEBVRE;Tom MICHEL
7,Optimization Engine,P8,5,Justine LOPES;Victor LAMBERT;Yanis RENAUD;Élise BONNET;Émile REY
8,Web Analytics,P9,3,Amandine RENARD;Noa NAVARRO;Sacha FRANÇOIS
9,Computer Vision,P10,3,Agathe DUPUIS;Aya MARTINEZ;Oscar BOYER



=== Stats — Unweighted ===
 n  assigned  unassigned  median_rank   p_top1  p_top3
62        62           0          1.0 0.806452     1.0

=== Stats — Weighted ===
 n  assigned  unassigned  median_rank   p_top1   p_top3
55        55           0          1.0 0.727273 0.945455


In [2]:
# -*- coding: utf-8 -*-
"""
Export and visualization utilities for assignment graphs.

Includes:
- Graph export to GraphML, GEXF, GPickle (compatible), JSON (node-link),
  and a CSV of edges with positive flow.
- A simple bipartite visualization of the assignment (entries -> projects).

All comments, messages, and output labels are in English.
PEP 8 compliant.
"""
from __future__ import annotations

import csv
import json
import pickle
from pathlib import Path
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import networkx as nx
from networkx.readwrite import json_graph


# ================================
# 8) Export and visualization
# ================================


def _ensure_dir(path: Path) -> None:
    """Create *path* as a directory, with any missing parents.

    Nothing happens if the directory already exists.
    """
    path.mkdir(exist_ok=True, parents=True)


def export_graph_models(
    graph: nx.DiGraph,
    meta: Dict,  # kept for API symmetry
    flow: Dict,
    export_dir: Path,
    prefix: str = "model",
) -> Dict[str, Path]:
    """
    Export the given network with flow attributes in multiple formats:
      - GraphML (.graphml)
      - GEXF (.gexf)
      - GPickle (.gpickle) : robust across nx versions (fallback to pickle)
      - JSON node-link (.json) : edges stored under the "links" key on every
        NetworkX version (falls back to the legacy call when the `edges`
        keyword is not supported)
      - CSV of edges with positive flow (_flow_edges.csv)

    The input graph is not modified: flows are written as an integer 'flow'
    attribute on a copy, for every (u, v) of *flow* that is an edge of it.

    Parameters
    ----------
    graph : nx.DiGraph
        The full flow network.
    meta : Dict
        Metadata dict returned by build_graph_* (unused here).
    flow : Dict
        Flow dictionary as returned by solve_min_cost.
    export_dir : Path
        Destination directory (created if missing).
    prefix : str
        Filename prefix.

    Returns
    -------
    Dict[str, Path]
        Mapping of format name -> written file path.
    """
    _ensure_dir(export_dir)

    graph_copy = graph.copy()
    for u, outs in flow.items():
        for v, fval in outs.items():
            if graph_copy.has_edge(u, v):
                graph_copy[u][v]["flow"] = int(fval)

    paths = {
        "graphml": export_dir / f"{prefix}.graphml",
        "gexf": export_dir / f"{prefix}.gexf",
        "gpickle": export_dir / f"{prefix}.gpickle",
        "json": export_dir / f"{prefix}.json",
        "csv": export_dir / f"{prefix}_flow_edges.csv",
    }

    # GraphML / GEXF
    nx.write_graphml(graph_copy, paths["graphml"])
    nx.write_gexf(graph_copy, paths["gexf"])

    # --- GPickle with compatibility across nx versions ---
    try:
        # Older NetworkX releases ship a dedicated gpickle writer; newer ones
        # may not provide this module at all.
        from networkx.readwrite.gpickle import (  # type: ignore
            write_gpickle as _write_gpickle,
        )

        _write_gpickle(graph_copy, paths["gpickle"])
    except Exception:
        # Robust fallback: Python pickle
        with open(paths["gpickle"], "wb") as fh:
            pickle.dump(graph_copy, fh, protocol=pickle.HIGHEST_PROTOCOL)

    # JSON node-link
    # Passing edges="links" pins the current JSON schema and silences the
    # FutureWarning about the changing default in recent NetworkX releases.
    try:
        data = json_graph.node_link_data(graph_copy, edges="links")
    except TypeError:
        # NetworkX releases without the `edges` keyword reject it; their
        # default already stores edges under "links".
        data = json_graph.node_link_data(graph_copy)
    paths["json"].write_text(
        json.dumps(data, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    # CSV of positive-flow edges (capacity/weight read from the input graph)
    with paths["csv"].open("w", newline="", encoding="utf-8") as fh:
        writer = csv.writer(fh)
        writer.writerow(["u", "v", "flow", "capacity", "weight"])
        for u, outs in flow.items():
            for v, fval in outs.items():
                if fval > 0 and graph.has_edge(u, v):
                    attrs = graph[u][v]
                    writer.writerow(
                        [
                            u,
                            v,
                            int(fval),
                            attrs.get("capacity", ""),
                            attrs.get("weight", ""),
                        ]
                    )

    return paths


def visualize_assignment_graph(
    flow: Dict,
    meta: Dict,
    export_path: Path,
    title: str = "Assignment",
    max_labels: int = 60,
) -> Path:
    """
    Create a simple bipartite plot of the assignment graph
    (entries -> projects) and save it as an image.

    Entry nodes (squares, named e0, e1, ...) are stacked on the left,
    the targets that receive positive flow (circles) on the right, and
    each edge is drawn with a width that grows with its flow.

    Parameters
    ----------
    flow : Dict
        Flow dictionary as returned by solve_min_cost
        (``flow[u][v]`` is the flow on edge u -> v).
    meta : Dict
        Metadata dict produced by build_graph_*; only ``meta['entries']``
        is read, and only its length is used (one left node per entry).
    export_path : Path
        Where to save the figure. Missing parent folders are created.
    title : str
        Figure title.
    max_labels : int
        Maximum number of node/edge labels to draw (to avoid clutter).
        Left labels, right labels and edge labels are each checked
        against this limit independently.

    Returns
    -------
    Path
        The saved image path (``export_path``).

    Raises
    ------
    KeyError
        If ``meta`` has no 'entries' key.
    """
    entries = meta["entries"]

    # Left nodes = "group" nodes e0, e1, ... (one per entry, in order).
    # Only the count matters, so there is no need to iterate over rows.
    left_nodes: List[str] = [f"e{i}" for i in range(len(entries))]
    left_lookup = set(left_nodes)

    h = nx.DiGraph()
    h.add_nodes_from(left_nodes, bipartite=0)

    # Keep only edges group -> project (ignore _s, _t). Sources are matched
    # against the known entry nodes rather than by an "e" prefix: a prefix
    # test also accepts unrelated nodes whose id starts with "e", and those
    # have no layout position, which makes the edge drawing fail.
    edgelist: List[Tuple[str, str]] = []
    right_nodes_set = set()
    for u, outs in flow.items():
        if u not in left_lookup:
            continue
        for v, fval in outs.items():
            if fval > 0 and v not in ("_s", "_t"):
                h.add_node(v, bipartite=1)
                h.add_edge(u, v, weight=int(fval))
                edgelist.append((u, v))
                right_nodes_set.add(v)

    # Bipartite layout positions: two columns, filled from top to bottom.
    right_nodes = sorted(right_nodes_set)
    pos: Dict[str, Tuple[float, float]] = {
        u: (0.0, -idx) for idx, u in enumerate(left_nodes)
    }
    pos.update((v, (1.0, -idx)) for idx, v in enumerate(right_nodes))

    # Figure size scales with node counts (basic heuristic)
    fig_w = max(8.0, len(right_nodes) * 0.25 + 6.0)
    fig_h = max(6.0, len(left_nodes) * 0.12 + 4.0)

    fig = plt.figure(figsize=(fig_w, fig_h))
    try:
        nx.draw_networkx_nodes(
            h,
            pos,
            nodelist=left_nodes,
            node_shape="s",
            node_size=200,
            alpha=0.85,
        )
        nx.draw_networkx_nodes(
            h,
            pos,
            nodelist=right_nodes,
            node_shape="o",
            node_size=300,
            alpha=0.9,
        )

        if edgelist:
            # Edge widths proportional to assigned flow
            widths = [1 + 2 * h[u][v]["weight"] for (u, v) in edgelist]
            nx.draw_networkx_edges(
                h,
                pos,
                edgelist=edgelist,
                width=widths,
                arrows=False,
                alpha=0.5,
            )

        if len(left_nodes) <= max_labels:
            nx.draw_networkx_labels(
                h,
                pos,
                labels={u: u for u in left_nodes},
                font_size=8,
            )
        if len(right_nodes) <= max_labels:
            nx.draw_networkx_labels(
                h,
                pos,
                labels={v: v for v in right_nodes},
                font_size=9,
            )

        if edgelist and len(edgelist) <= max_labels:
            nx.draw_networkx_edge_labels(
                h,
                pos,
                edge_labels={
                    (u, v): h[u][v]["weight"] for (u, v) in edgelist
                },
                font_size=7,
            )

        plt.title(title)
        plt.axis("off")
        export_path.parent.mkdir(parents=True, exist_ok=True)
        plt.tight_layout()
        fig.savefig(export_path, dpi=200)
    finally:
        # Always release this figure, even when drawing or saving fails;
        # otherwise failed calls pile up open figures in pyplot's registry.
        plt.close(fig)
    return export_path


# ================================
# 9) Exports from an existing main
# ================================
if __name__ == "__main__":
    # Script entry point: rebuild the unweighted and the weighted model from
    # the default data directory, export each graph, render each assignment
    # figure, then list what was written.
    #
    # These names must be defined or imported in the current module:
    # resolve_data_dir, load_projects_df, load_choices_df,
    # build_graph_unweighted, build_graph_weighted, solve_min_cost.

    def _pretty_path(path: Path, base: Path) -> str:
        """
        Render 'path' relative to 'base' when it lies under it; fall back to
        the bare file name so absolute directories never reach the console.
        """
        try:
            shown = path.relative_to(base)
        except Exception:
            return path.name
        return str(shown)

    # The data directory is resolved here instead of being taken from a
    # global owned by another module.
    DATA_DIR = resolve_data_dir()
    EXPORT_DIR = (DATA_DIR / "exports").resolve()

    # Inputs shared by both variants.
    prj_df = load_projects_df(DATA_DIR / "projects.csv")
    ch_df = load_choices_df(
        DATA_DIR / "student-choices.csv", prj_df["id"].tolist()
    )

    # --- Unweighted: graph -> flow -> model files -> figure ---
    g_unw, meta_unw = build_graph_unweighted(
        ch_df, prj_df, rank_cost=None, penalty=None, unassigned_label="__NA__"
    )
    flow_unw, _cost_unw = solve_min_cost(g_unw)
    paths_unw = export_graph_models(
        g_unw, meta_unw, flow_unw, EXPORT_DIR, prefix="unweighted_model"
    )
    vis_unw = visualize_assignment_graph(
        flow_unw,
        meta_unw,
        EXPORT_DIR / "unweighted_assignment.png",
        title="Assignment (Unweighted)",
    )

    # --- Weighted: graph -> flow -> model files -> figure ---
    g_w, meta_w = build_graph_weighted(
        ch_df, prj_df, penalty=10.0, unassigned_label="__NA__"
    )
    flow_w, _cost_w = solve_min_cost(g_w)
    paths_w = export_graph_models(
        g_w, meta_w, flow_w, EXPORT_DIR, prefix="weighted_model"
    )
    vis_w = visualize_assignment_graph(
        flow_w,
        meta_w,
        EXPORT_DIR / "weighted_assignment.png",
        title="Assignment (Weighted)",
    )

    # Console summary, one section per variant.
    for heading, written, figure in (
        ("\nExports (unweighted):", paths_unw, vis_unw),
        ("\nExports (weighted):", paths_w, vis_w),
    ):
        print(heading)
        for key, path in written.items():
            print(f"  {key}: {_pretty_path(path, DATA_DIR)}")
        print("  viz:", _pretty_path(figure, DATA_DIR))



Exports (unweighted):
  graphml: exports\unweighted_model.graphml
  gexf: exports\unweighted_model.gexf
  gpickle: exports\unweighted_model.gpickle
  json: exports\unweighted_model.json
  csv: exports\unweighted_model_flow_edges.csv
  viz: exports\unweighted_assignment.png

Exports (weighted):
  graphml: exports\weighted_model.graphml
  gexf: exports\weighted_model.gexf
  gpickle: exports\weighted_model.gpickle
  json: exports\weighted_model.json
  csv: exports\weighted_model_flow_edges.csv
  viz: exports\weighted_assignment.png


In [3]:
# -*- coding: utf-8 -*-
"""
Minimal batch for the 3 projects / 3 students sample.

- Reads ONLY: data/3_sample/3_projects.csv and
              data/3_sample/3_student-choices.csv
- Runs both variants (unweighted + weighted) WITHOUT renaming/copying files
- Writes outputs into data/3_sample/
- Exports models and visualizations into data/3_sample/exports
"""


def run_sample_3x3() -> None:
    """
    Execute the 3 projects / 3 students sample from input CSVs to exports.

    Steps, in order:
      1. Locate ``<data>/3_sample`` and require ``3_projects.csv`` and
         ``3_student-choices.csv`` inside it.
      2. Solve the unweighted (ranked) model and write its student and
         project CSVs into the sample folder; then do the same for the
         weighted model (penalty 10.0).
      3. Export both graph models and both assignment figures into
         ``<data>/3_sample/exports``.
      4. Print both costs and the written files, relative to the sample
         folder.

    Raises
    ------
    FileNotFoundError
        If either input CSV is missing from the sample folder.
    """
    na_label = "__NA__"

    def _shown(path, base):
        # Relative to 'base' when possible, bare name otherwise, so the
        # console output never exposes absolute directories.
        try:
            relative = path.relative_to(base)
        except Exception:
            return path.name
        return str(relative)

    def _write_tables(flow, meta, projects, mode, student_csv, project_csv):
        # Expand a solved flow into per-student rows, derive the student and
        # project tables, and write both CSV files.
        rows = expand_to_individual_rows(flow, meta)
        student_table = build_student_df(rows, projects, mode=mode)
        project_table = build_project_df(
            rows,
            projects,
            include_unassigned=True,
            unassigned_label=na_label,
        )
        write_students_csv(student_table, student_csv)
        write_projects_csv(project_table, project_csv)

    def _export(graph, meta, flow, target_dir, prefix, image_name, title):
        # Write the model files, then the assignment figure, for one variant.
        export_graph_models(graph, meta, flow, target_dir, prefix=prefix)
        visualize_assignment_graph(
            flow, meta, target_dir / image_name, title=title
        )

    sample_dir = (resolve_data_dir() / "3_sample").resolve()
    sample_dir.mkdir(parents=True, exist_ok=True)

    # --- Strict inputs: both files must already be present ---
    src_projects = sample_dir / "3_projects.csv"
    src_choices = sample_dir / "3_student-choices.csv"
    have_projects = src_projects.exists()
    have_choices = src_choices.exists()
    if not (have_projects and have_choices):
        projects_status = "OK" if have_projects else "MISSING: 3_projects.csv"
        choices_status = (
            "OK" if have_choices else "MISSING: 3_student-choices.csv"
        )
        raise FileNotFoundError(
            f"Missing files in {sample_dir}: "
            f"{projects_status}, {choices_status}"
        )

    # --- Load input data ---
    prj_df = load_projects_df(src_projects)
    ch_df = load_choices_df(src_choices, prj_df["id"].tolist())

    # === UNWEIGHTED variant (ordered / ranks) ===
    g_unw, meta_unw = build_graph_unweighted(
        entries_df=ch_df,
        projects_df=prj_df,
        rank_cost=None,
        penalty=None,
        unassigned_label=na_label,
    )
    flow_unw, cost_unw = solve_min_cost(g_unw)
    out_student_unw = sample_dir / "assignment_student_unweighted.csv"
    out_project_unw = sample_dir / "assignment_project_unweighted.csv"
    _write_tables(
        flow_unw,
        meta_unw,
        prj_df,
        "unweighted",
        out_student_unw,
        out_project_unw,
    )

    # === WEIGHTED variant (explicit weights) ===
    g_w, meta_w = build_graph_weighted(
        entries_df=ch_df,
        projects_df=prj_df,
        penalty=10.0,
        unassigned_label=na_label,
    )
    flow_w, cost_w = solve_min_cost(g_w)
    out_student_w = sample_dir / "assignment_student_weighted.csv"
    out_project_w = sample_dir / "assignment_project_weighted.csv"
    _write_tables(
        flow_w, meta_w, prj_df, "weighted", out_student_w, out_project_w
    )

    # --- Graph/model exports (only after every CSV has been written) ---
    export_dir = sample_dir / "exports"
    export_dir.mkdir(parents=True, exist_ok=True)
    _export(
        g_unw,
        meta_unw,
        flow_unw,
        export_dir,
        "unweighted_model",
        "unweighted_assignment.png",
        "Assignment (Unweighted)",
    )
    _export(
        g_w,
        meta_w,
        flow_w,
        export_dir,
        "weighted_model",
        "weighted_assignment.png",
        "Assignment (Weighted)",
    )

    # --- Console summary ---
    print("\n[3x3] Done.")
    print(f" - Cost (unweighted): {cost_unw}")
    print(f" - Cost (weighted)  : {cost_w}")
    print(" - Files written:")
    for written in (
        out_student_unw,
        out_project_unw,
        out_student_w,
        out_project_w,
    ):
        print(f"   • {_shown(written, sample_dir)}")
    print(f"   • Exports in: {_shown(export_dir, sample_dir)}")


if __name__ == "__main__":
    # Run the 3x3 sample batch only when this code executes as the main
    # program (__name__ is "__main__"), not when it is imported as a module.
    run_sample_3x3()



[3x3] Done.
 - Cost (unweighted): 0.0
 - Cost (weighted)  : 0.8999999999999999
 - Files written:
   • assignment_student_unweighted.csv
   • assignment_project_unweighted.csv
   • assignment_student_weighted.csv
   • assignment_project_weighted.csv
   • Exports in: exports
