In [38]:
from pathlib import Path
import pandas as pd, numpy as np, matplotlib.pyplot as plt, unicodedata, re
from datetime import date

# Files: input workbook and output directory.
# Both paths are relative, so they resolve against the current working directory.
DATA = Path("Etat_De_Lart.xlsx")
# Create the export directory if needed; exist_ok=True keeps re-runs from failing.
EXPORTS = Path("exports_stats"); EXPORTS.mkdir(exist_ok=True)

# Load the workbook and rename its columns to normalised snake_case names
# used by the rest of the analysis.
# NOTE(review): the keys presumably mirror the spreadsheet headers exactly
# (including the "Ouput Modalities" spelling) - confirm against the workbook;
# rename() ignores keys that are not present by default.
df = pd.read_excel(DATA).rename(columns={
    "Nom du système": "system_name",
    "Input Modalities": "input_modality",
    "Ouput Modalities": "output_modality",
    "Delivery Technology": "delivery_tech",
    "Reception technology": "reception_tech",
    "Object": "object_type",
    "Date": "date",
    "Titre de l'article": "paper_title",
})

# Generic separator between the criteria listed in one cell:
# ";", ",", "/", "|", "+" (ASCII or full-width), or the words "et" / "and"
# surrounded by whitespace.
SPLIT_REGEX = r"[;,/|＋\+]|(?:\s+et\s+)|(?:\s+and\s+)"

# Phrases that must not be cut by SPLIT_REGEX (e.g. "Microcontrollers,
# Sensors, and Actuators"), grouped by column name.
# Each entry is (compiled pattern, canonical label): text matching the
# pattern is kept as one token and reported under the canonical label.
PROTECTED_PATTERNS_BY_COL = {
    "reception_tech": [
        (
            re.compile(
                r"\bmicrocontrollers?\s*,?\s*sensors?\s*,?\s*(?:and|et|&)\s+actuators?\b",
                flags=re.IGNORECASE
            ),
            "Microcontrollers, Sensors, and Actuators"
        )
    ]
}

def tokenize(val, col_name=None):
    """Split one spreadsheet cell into a list of cleaned tokens.

    The value is NFKC-normalised, protected phrases registered for
    ``col_name`` in ``PROTECTED_PATTERNS_BY_COL`` are shielded from the
    split, the text is cut on ``SPLIT_REGEX``, and every piece is passed
    through ``norm`` (defined elsewhere in the notebook) and stripped of
    surrounding spaces, quotes and brackets.

    Args:
        val: Raw cell value. Missing values (``pd.isna``) yield ``[]``.
        col_name: Name of the column the value comes from; selects the
            protected patterns to apply. ``None`` applies none.

    Returns:
        list: Tokens in order of appearance, with empty pieces dropped.
        Protected phrases appear as their canonical label.
    """
    if pd.isna(val):
        return []
    s = unicodedata.normalize("NFKC", str(val)).strip()

    # Swap each protected phrase for a marker so that the commas / "and"
    # inside it are not treated as separators.
    placeholders = {}
    for idx, (pat, canonical) in enumerate(PROTECTED_PATTERNS_BY_COL.get(col_name, ())):
        ph = f"__PROT{idx}__"
        s, n = pat.subn(ph, s)
        if n > 0:
            placeholders[ph] = canonical

    strip_chars = ' "“”‘’[]()'
    out = []
    for p in re.split(SPLIT_REGEX, s, flags=re.IGNORECASE):
        # Resolve markers BEFORE normalisation: norm() is defined outside this
        # block, and if it changes case or characters the marker would no
        # longer match and would leak into the output as a bogus token.
        marker = p.strip().strip(strip_chars)
        if marker in placeholders:
            out.append(placeholders[marker])
            continue
        t = norm(p)
        if isinstance(t, str) and t:
            t = t.strip(strip_chars)
            if not t:
                continue
            # Fallback lookup kept for pieces that only become a bare marker
            # after normalisation.
            out.append(placeholders.get(t, t))
    return out

def explode_and_canon(df, col_name, token_col):
    """Build a long (system, token) table from one multi-valued column.

    Each cell of ``df[col_name]`` is split with ``tokenize``; when a global
    ``CANON_FUN`` mapping exists and has an entry for ``col_name``, that
    function is applied to every token. Tokens that end up falsy are dropped.

    Args:
        df: Source table with a ``system_name`` column and ``col_name``.
        col_name: Column holding the multi-valued cells.
        token_col: Name of the token column in the result.

    Returns:
        pandas.DataFrame: Columns ``system_name`` and ``token_col``, one row
        per distinct (system, token) pair. The two columns are present even
        when no token was produced, so callers can always select them.
    """
    # CANON_FUN is optional and lives outside this block; look it up once
    # instead of on every token.
    canon_map = globals().get("CANON_FUN")
    canon = canon_map[col_name] if canon_map is not None and col_name in canon_map else None

    rows = []
    # itertuples keeps each column's own dtype (iterrows coerces every row
    # to a single common dtype).
    for system, cell in df[["system_name", col_name]].itertuples(index=False, name=None):
        for t in tokenize(cell, col_name):
            ct = canon(t) if canon is not None else t
            if ct:
                rows.append({"system_name": system, token_col: ct})

    # Explicit columns: an empty result still has the expected schema.
    out = pd.DataFrame(rows, columns=["system_name", token_col])
    return out.drop_duplicates(["system_name", token_col])

# Token tables: one long (system_name, token) table per criterion, built
# once here so that every later step can reuse them.
input_tok     = explode_and_canon(df, "input_modality", "input_token")
delivery_tok  = explode_and_canon(df, "delivery_tech", "delivery_token")
output_tok    = explode_and_canon(df, "output_modality", "output_token")
reception_tok = explode_and_canon(df, "reception_tech", "reception_token")
object_tok    = explode_and_canon(df, "object_type", "object_token")

# Axis labels: y-axis title per source column, plus the two x-axis titles.
# NOTE(review): none of these three names is referenced in the visible part
# of this notebook (the plotting calls spell the labels out) - confirm
# whether they are still needed.
AXIS_Y = {
    "input_modality":  "Input modalities",
    "object_type":     "Object Characteristics",
    "reception_tech":  "Reception Technologies",
    "delivery_tech":   "Delivery Technologies",
    "output_modality": "Output Modalities",
}
X_LABEL_COUNT   = "Number of systems"
X_LABEL_PERCENT = "Percent of systems (%)"

# Draw a horizontal bar chart
def _barh(series, ylabel, xlabel, out_name, xlim=None, step=None):
    """Save *series* as a horizontal bar chart under ``EXPORTS / out_name``.

    Bars are sorted by value. The figure grows with the number of bars and
    with the longest index label. When ``xlim`` is given the x axis runs from
    0 to ``xlim``, with a tick every ``step`` if ``step`` is truthy. A ``None``
    or empty series produces a blank image so the file still exists.
    """
    if series is None or series.empty:
        blank_fig, blank_ax = plt.subplots(figsize=(4, 3))
        blank_ax.axis("off")
        plt.savefig(EXPORTS / out_name, dpi=200, bbox_inches="tight")
        plt.close(blank_fig)
        return

    ordered = series.sort_values()
    tick_labels = list(map(str, ordered.index))
    longest = max(map(len, tick_labels), default=8)

    # Size the canvas from the label length (width) and bar count (height).
    width = max(7.5, 7.0 + 0.06 * longest)
    height = max(3.5, 0.35 * len(tick_labels) + 1.2)
    fig, ax = plt.subplots(figsize=(width, height))

    ordered.plot(kind="barh", ax=ax)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)

    if xlim is not None:
        ax.set_xlim(0, xlim)
        if step:
            ax.set_xticks(range(0, xlim + 1, step))

    # Reserve more left margin for long labels, capped at half the figure.
    left_margin = min(0.5, 0.12 + 0.012 * longest)
    fig.subplots_adjust(left=left_margin, right=0.98, top=0.98, bottom=0.14)
    plt.savefig(EXPORTS / out_name, dpi=200, bbox_inches="tight")
    plt.close(fig)

# For each criterion, export its token totals as counts and as percentages
def plot_counts_and_percents(dims):
    """Write a count chart and a percent chart for every entry of *dims*.

    Each entry is a dict with keys ``"df"`` (token table or ``None``),
    ``"col"`` (token column), ``"y"`` (y-axis label) and ``"stub"`` (file
    name prefix). All count charts share one x range: the largest count
    rounded up to a multiple of 50.

    NOTE(review): the percentages are each token's share of the summed
    counts (``s / s.sum()``), while the axis label reads "Percent of
    systems (%)" - confirm which of the two is intended.
    """
    per_dim = []
    for d in dims:
        frame = d["df"]
        if frame is None or frame.empty:
            s = pd.Series(dtype=int)
        else:
            s = frame[d["col"]].value_counts()
        per_dim.append((d, s))

    # Largest bar over all criteria (0 when everything is empty).
    xmax = max([0, *(int(s.max()) for _, s in per_dim if not s.empty)])
    common_x = int(np.ceil(xmax / 50.0) * 50) if xmax > 0 else 50

    for d, s in per_dim:
        _barh(s, d["y"], "Number of systems", f"{d['stub']}_tokens_count.png", xlim=common_x, step=50)
        p = s if s.empty else (s / s.sum() * 100).round(1)
        _barh(p, d["y"], "Percent of systems (%)", f"{d['stub']}_tokens_percent.png", xlim=100, step=10)


# Heatmap helper (no longer really needed)
def plot_heatmap(dfA, colA, dfB, colB, xlabel, ylabel, out_name, annotate=False):
    """Save a co-occurrence heatmap of two token tables as PNG and CSV.

    The tables are inner-joined on ``system_name`` and cross-tabulated
    (``colA`` values as rows, ``colB`` values as columns). The image goes to
    ``EXPORTS / out_name`` and the cross-table to a CSV with the same stem.
    If either table is ``None``/empty, or the join has no rows, a blank image
    is written and no CSV is produced. ``annotate=True`` prints the count in
    every cell.
    """
    def _write_blank():
        blank_fig, blank_ax = plt.subplots(figsize=(4, 3))
        blank_ax.axis("off")
        plt.savefig(EXPORTS / out_name, dpi=200, bbox_inches="tight")
        plt.close(blank_fig)

    if dfA is None or dfA.empty or dfB is None or dfB.empty:
        _write_blank()
        return

    side_a = dfA[["system_name", colA]]
    side_b = dfB[["system_name", colB]]
    pairs = side_a.merge(side_b, on="system_name", how="inner")
    if pairs.empty:
        _write_blank()
        return

    pivot = pd.crosstab(pairs[colA], pairs[colB])
    n_rows, n_cols = pivot.shape

    # Roughly 1.2 x 0.9 inch per cell, with a minimum canvas size.
    fig_w = max(6.5, 1.2 * n_cols + 2.5)
    fig_h = max(4.5, 0.9 * n_rows + 2.5)
    fig, ax = plt.subplots(figsize=(fig_w, fig_h))
    im = ax.imshow(pivot.values, aspect="auto", interpolation="nearest")

    ax.set_xticks(range(n_cols))
    ax.set_yticks(range(n_rows))
    ax.set_xticklabels(pivot.columns, rotation=30, ha="right")
    ax.set_yticklabels(pivot.index)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.tick_params(labelsize=10)

    if annotate:
        for i, row in enumerate(pivot.values):
            for j, cell in enumerate(row):
                ax.text(j, i, str(int(cell)), ha="center", va="center", fontsize=9)

    # Margins grow with the longest tick label on each axis, capped at 0.6.
    longest_x = max((len(str(c)) for c in pivot.columns), default=5)
    longest_y = max((len(str(r)) for r in pivot.index), default=5)
    bottom = min(0.6, 0.16 + 0.010 * longest_x)
    left = min(0.6, 0.12 + 0.012 * longest_y)
    fig.subplots_adjust(left=left, right=0.98, top=0.98, bottom=bottom)
    fig.colorbar(im, ax=ax, fraction=0.035, pad=0.04)

    plt.savefig(EXPORTS / out_name, dpi=200, bbox_inches="tight")
    plt.close(fig)
    pivot.to_csv(EXPORTS / (Path(out_name).stem + ".csv"))


# One entry per criterion for plot_counts_and_percents:
#   stub -> output file name prefix, y -> y-axis label,
#   df   -> token table,             col -> token column in that table.
dims = [
    {"stub":"input",          "y":"Input modalities",      "df": input_tok,     "col":"input_token"},
    {"stub":"delivery_tech",  "y":"Delivery Technologies", "df": delivery_tok,  "col":"delivery_token"},
    {"stub":"output_modality","y":"Output Modalities",     "df": output_tok,    "col":"output_token"},
    {"stub":"reception_tech", "y":"Reception Technologies","df": reception_tok, "col":"reception_token"},
    {"stub":"object_type",    "y":"Object Characteristics","df": object_tok,    "col":"object_token"},
]
plot_counts_and_percents(dims)

# Heatmaps: input tokens (rows) against reception / delivery tokens (columns)
plot_heatmap(input_tok, "input_token", reception_tok, "reception_token",
             xlabel="Reception Technologies", ylabel="Input modalities",
             out_name="input_tokens_x_reception_tokens.png", annotate=False)

plot_heatmap(input_tok, "input_token", delivery_tok, "delivery_token",
             xlabel="Delivery Technologies", ylabel="Input modalities",
             out_name="input_tokens_x_delivery_tokens.png", annotate=False)


# Report the absolute path of the export directory.
print("c'est bien exporté ici", EXPORTS.resolve())


Exports écrits dans: C:\Users\rodrig296u\Documents\Prog\Jupyter\exports_stats


In [39]:
import numpy as np
import matplotlib.ticker as mticker


TOP_N = 5  # default number of series drawn by plot_trends_lines


def plot_trends_lines(pivot: pd.DataFrame, outfile_png: str, top_n: int = TOP_N, y_step: int = 5):
    """Draw one line per column for the ``top_n`` largest columns of *pivot*.

    Columns are ranked by their total over all rows. The index of *pivot*
    supplies the x tick labels (the axis is titled "Year"). The figure is
    saved as ``EXPORTS / outfile_png``; a ``None`` or empty *pivot* produces
    a blank image instead. The y axis starts at 0 and ends at the data
    maximum rounded up to a multiple of ``y_step``.
    """
    if pivot is None or pivot.empty:
        blank_fig, blank_ax = plt.subplots(figsize=(6, 3))
        blank_ax.axis("off")
        plt.savefig(EXPORTS / outfile_png, dpi=200, bbox_inches="tight")
        plt.close(blank_fig)
        return

    # Keep the columns with the largest overall totals.
    ranked = pivot.sum(axis=0).sort_values(ascending=False)
    keep = min(top_n, len(ranked))
    cols = ranked.index[:keep].tolist()
    data = pivot[cols].copy()

    years = data.index.tolist()
    n_years = len(years)
    x = np.arange(n_years)
    fig, ax = plt.subplots(figsize=(max(10, 0.6 * n_years), 6))

    for series_name in cols:
        ax.plot(x, data[series_name].values, label=series_name, linewidth=2)

    ax.set_xlabel("Year")
    ax.set_ylabel("Number of systems")
    ax.set_xlim(-0.2, n_years - 0.8)

    # Label every position up to 12 of them, every second one beyond that.
    stride = 2 if n_years > 12 else 1
    ax.set_xticks(x[::stride])
    ax.set_xticklabels([str(y) for y in years][::stride])

    # Round the top of the y axis up to a multiple of y_step (at least one step).
    peak = float(np.nanmax(data.values)) if data.size else 0.0
    ymax = int(np.ceil(peak / y_step) * y_step) or y_step
    ax.set_ylim(0, ymax)
    ax.set_yticks(np.arange(0, ymax + 1, y_step))
    ax.yaxis.set_major_formatter(mticker.FormatStrFormatter('%d'))

    # Legend sits outside the axes, so leave room on the right.
    ax.legend(loc="center left", bbox_to_anchor=(1.02, 0.5), borderaxespad=0.)
    fig.subplots_adjust(left=0.10, right=0.80, top=0.98, bottom=0.16)

    plt.savefig(EXPORTS / outfile_png, dpi=200, bbox_inches="tight")
    plt.close(fig)

# Generate the trend curves for each criterion (no longer very useful).
# NOTE(review): trends_by_year, systems_year and YEARS are not defined in the
# visible part of this notebook - they are presumably created in another
# cell; confirm it runs before this one.
for name, dft, tok_col in [
    ("input_modality",  input_tok,     "input_token"),
    ("object_type",     object_tok,    "object_token"),
    ("reception_tech",  reception_tok, "reception_token"),
    ("delivery_tech",   delivery_tok,  "delivery_token"),
    ("output_modality", output_tok,    "output_token"),
]:
    pivot = trends_by_year(dft, tok_col, systems_year, YEARS)
    plot_trends_lines(pivot, f"trend_{name}_lines.png", top_n=TOP_N, y_step=5)

# Time trend: number of systems per year

def plot_total_systems_per_year(systems_year_df, years, outfile_png="trend_total_systems_per_year.png", y_step=5):
    """Plot and export the number of distinct systems per year.

    Args:
        systems_year_df: Table with ``year`` and ``system_name`` columns.
            ``None`` or an empty table produces a blank image and no CSV.
        years: Ordered year values for the x axis; years absent from the
            data are counted as 0.
        outfile_png: PNG file name, relative to ``EXPORTS``. The per-year
            counts are also written to a CSV with the same name and a
            ``.csv`` suffix, so different PNG names no longer overwrite one
            shared CSV.
        y_step: Spacing of the y ticks; the top of the axis is the maximum
            count rounded up to a multiple of it.
    """
    png_path = EXPORTS / outfile_png

    if systems_year_df is None or systems_year_df.empty:
        fig, ax = plt.subplots(figsize=(6, 3))
        ax.axis("off")
        # Export a blank PNG so the expected file still exists.
        plt.savefig(png_path, dpi=200, bbox_inches="tight")
        plt.close(fig)
        return

    # Distinct systems per year, aligned on the requested years.
    ts = (systems_year_df.groupby("year")["system_name"]
          .nunique()
          .reindex(years, fill_value=0))
    # Export CSV next to the PNG (same stem).
    ts.rename("count").to_csv(png_path.with_suffix(".csv"), header=True)

    x = np.arange(len(years))
    fig_w = max(10, 0.6 * len(years))
    fig, ax = plt.subplots(figsize=(fig_w, 6))

    ax.plot(x, ts.values, linewidth=2, label="Total systems")

    ax.set_xlabel("Year")
    ax.set_ylabel("Number of systems")
    ax.set_xlim(-0.2, len(years) - 0.8)

    # Label every year up to 12 of them, every second year beyond that.
    step_x = 1 if len(years) <= 12 else 2
    ax.set_xticks(x[::step_x])
    ax.set_xticklabels([str(y) for y in years][::step_x])

    # Round the top of the y axis up to a multiple of y_step (at least one step).
    ymax_data = float(np.nanmax(ts.values)) if ts.size else 0.0
    ymax = int(np.ceil(ymax_data / y_step) * y_step) or y_step
    ax.set_ylim(0, ymax)
    ax.set_yticks(np.arange(0, ymax + 1, y_step))
    ax.yaxis.set_major_formatter(mticker.FormatStrFormatter('%d'))

    ax.legend(loc="upper left")
    fig.subplots_adjust(left=0.10, right=0.98, top=0.98, bottom=0.16)
    # Export PNG
    plt.savefig(png_path, dpi=200, bbox_inches="tight")
    plt.close(fig)
# Export the overall systems-per-year trend.
# NOTE(review): systems_year and YEARS are not defined in the visible part of
# this notebook - confirm the cell that creates them runs first.
plot_total_systems_per_year(systems_year, YEARS, outfile_png="trend_total_systems_per_year.png", y_step=5)

# Report the absolute path of the export directory.
print("c'est bien exporté ici", EXPORTS.resolve())


In [40]:
# Sankey (Input -> Object -> Reception -> Delivery -> Output)
try:
    import plotly.graph_objects as go
except ImportError:
    # plotly is optional: without it only the Sankey export is skipped.
    go = None


def build_sankey_from_layers(layers, html_path):
    """Write a Sankey diagram linking consecutive token layers.

    Args:
        layers: Ordered sequence of ``(title, token_table, token_column)``
            tuples. Every table needs a ``system_name`` column; ``None`` or
            empty tables contribute no nodes and no links.
        html_path: Destination of the HTML file.

    Returns:
        ``html_path`` once the file is written, or ``None`` when there is no
        link to draw (nothing is written in that case).

    Raises:
        ImportError: If there are links to draw but plotly is not installed.
    """
    # One node per (layer position, token value); the same text in two
    # layers gives two distinct nodes.
    labels = []
    node_index = {}
    for i, (_, df_layer, col) in enumerate(layers):
        if df_layer is None or df_layer.empty:
            continue
        for v in sorted(df_layer[col].dropna().unique().tolist()):
            node_index[(i, v)] = len(labels)
            labels.append(str(v))

    # A link between adjacent layers counts the systems carrying both tokens.
    sources, targets, values = [], [], []
    for i, ((_, df_a, col_a), (_, df_b, col_b)) in enumerate(zip(layers, layers[1:])):
        if df_a is None or df_b is None or df_a.empty or df_b.empty:
            continue
        a = df_a[["system_name", col_a]].drop_duplicates()
        b = df_b[["system_name", col_b]].drop_duplicates()
        m = a.merge(b, on="system_name", how="inner")
        if m.empty:
            continue
        ct = m.groupby([col_a, col_b]).size().reset_index(name="value")
        for va, vb, vv in zip(ct[col_a], ct[col_b], ct["value"]):
            if (i, va) in node_index and (i + 1, vb) in node_index:
                sources.append(node_index[(i, va)])
                targets.append(node_index[(i + 1, vb)])
                values.append(int(vv))

    if not values:
        return None
    if go is None:
        raise ImportError("plotly is required to render the Sankey diagram")

    fig = go.Figure(data=[go.Sankey(node=dict(label=labels, pad=10, thickness=15),
                                    link=dict(source=sources, target=targets, value=values))])
    fig.write_html(html_path)
    return html_path


# Best-effort export: a failure must not stop the notebook, but it is
# reported instead of being silently swallowed.
try:
    build_sankey_from_layers(
        layers=[("Input", input_tok, "input_token"),
                ("Object", object_tok, "object_token"),
                ("Reception", reception_tok, "reception_token"),
                ("Delivery", delivery_tok, "delivery_token"),
                ("Output", output_tok, "output_token")],
        html_path=str(EXPORTS / "sankey_tokens_input_object_reception_delivery_output.html"),
    )
except ImportError as exc:
    print(f"Sankey export skipped: {exc}")
except Exception as exc:
    print(f"Sankey export failed: {exc!r}")

# Report the absolute path of the export directory; this line runs whether or
# not the Sankey export above produced a file.
print("c'est bien exporté ici", EXPORTS.resolve())
