<a href="https://colab.research.google.com/github/Demiurgy/Wines/blob/main/wines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# --- 0) БАЗА: импорты ---
import os
import pandas as pd

# Если у тебя parquet — часто нужен движок:
try:
    import pyarrow  # noqa: F401
except Exception:
    pass


In [4]:
# --- 1) Монтируем Google Drive ---
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# --- 2) Укажи путь к файлу ---
FILE_PATH = "/content/drive/MyDrive/wines/wine_work.csv"

assert os.path.exists(FILE_PATH), f"Файл не найден: {FILE_PATH}"
print("OK, file exists:", FILE_PATH)


OK, file exists: /content/drive/MyDrive/wines/wine_work.csv


In [6]:
del df


NameError: name 'df' is not defined

In [7]:
import pandas as pd
import numpy as np
import ast
import re

# --- 1) загрузка (подставь свой способ) ---
df = pd.read_csv("/content/drive/MyDrive/wines/wine_work.csv")  # или read_excel / из gdrive и т.д.

# --- 2) парсер nearests ---
def parse_nearests(x):
    """
    nearests: обычно строка вида "[{'match_percent': 86.35, 'filepath': 'IMG_1e.jpg', 'image_id': 'IMG1'}, ...]"
    Это НЕ JSON, это repr Python-объекта -> парсим через ast.literal_eval.
    Возвращаем list[dict] или [].
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return []
    if isinstance(x, list):
        return x

    s = str(x).strip()
    if s == "" or s.lower() in {"nan", "none"}:
        return []

    # Иногда встречаются странные пробелы/переносы или лишний текст — минимально чистим.
    # (Не пытаемся "умно чинить" слишком много, иначе можно повредить данные.)
    s = re.sub(r"\s+", " ", s)

    try:
        obj = ast.literal_eval(s)
        if isinstance(obj, list):
            # гарантируем словари нужной формы
            out = []
            for it in obj:
                if isinstance(it, dict):
                    out.append(it)
            return out
        return []
    except Exception:
        # если сломалось — возвращаем пусто, но можно логировать
        return []

df["nearests_parsed"] = df["nearests"].apply(parse_nearests)

# --- 3) "edges" таблица: одна строка = одна пара (source -> neighbor) ---
edges = (
    df[["image_id", "filepath", "nearests_parsed"]]
    .explode("nearests_parsed", ignore_index=True)
)

# если nearests_parsed был пустым списком, explode даст NaN
edges = edges.dropna(subset=["nearests_parsed"]).copy()

# вытаскиваем поля из dict
edges["neighbor_image_id"] = edges["nearests_parsed"].apply(lambda d: d.get("image_id"))
edges["neighbor_filepath"] = edges["nearests_parsed"].apply(lambda d: d.get("filepath"))
edges["match_percent"] = edges["nearests_parsed"].apply(lambda d: d.get("match_percent"))

# нормализуем типы
edges["match_percent"] = pd.to_numeric(edges["match_percent"], errors="coerce")

# --- 4) анти-паранойя: уберём самоссылки (часто там 100% на самого себя) ---
edges = edges[edges["neighbor_image_id"].notna()].copy()
edges = edges[edges["image_id"] != edges["neighbor_image_id"]].copy()

# --- 5) быстрые sanity checks ---
print("Rows in df:", len(df))
print("Parsed nearests non-empty:", (df["nearests_parsed"].apply(len) > 0).sum())
print("Edges count (without self-links):", len(edges))
print("Match percent stats:")
print(edges["match_percent"].describe())

# --- 6) топ-1 сосед для каждой картинки (удобно для дедупа/семейств) ---
# Берём максимальный match_percent по source image_id
nearest_top1 = (
    edges.sort_values(["image_id", "match_percent"], ascending=[True, False])
         .groupby("image_id", as_index=False)
         .first()
         .rename(columns={
             "neighbor_image_id": "top1_neighbor_image_id",
             "neighbor_filepath": "top1_neighbor_filepath",
             "match_percent": "top1_match_percent"
         })
)

df2 = df.merge(nearest_top1[["image_id", "top1_neighbor_image_id", "top1_neighbor_filepath", "top1_match_percent"]],
               on="image_id", how="left")

display(df2.head(10))
display(edges.head(20))


Rows in df: 1070
Parsed nearests non-empty: 1069
Edges count (without self-links): 4233
Match percent stats:
count    4233.000000
mean       46.661439
std        30.933542
min         5.010000
25%        15.450000
50%        44.940000
75%        74.030000
max       100.000000
Name: match_percent, dtype: float64


Unnamed: 0,image,image_id,filepath,color_type,vintage_year,wineID,manufacturer,color,sparkling,sugar,...,file_bottle_name,file_label_name,notes,sortnum,nearests,search_json,nearests_parsed,top1_neighbor_image_id,top1_neighbor_filepath,top1_match_percent
0,1-41/Массандра_мускат белый южнобережный 2.jpg,IMG1,IMG_1e.jpg,Белое,ЗГУ Крым,1,Массандра,Белое,Креплёное,Сладкое,...,Массандра_Мускат_белый_южнобережный.jpg,Массандра_мускат белый южнобережный 2.jpg,,1,"[{'match_percent': 86.35, 'filepath': 'IMG_1e....","[{'filepath': 'IMG_1e.jpg', 'score': 65.03, 's...","[{'match_percent': 86.35, 'filepath': 'IMG_1e....",IMG5,IMG_5e.jpg,69.87
1,1-41/Массандра портвейн белый южнобережный 2.jpg,IMG2,IMG_2e.jpg,Белое,ЗГУ Крым,2,Массандра,Белое,Креплёное,Сладкое,...,Массандра портвейн белый южнобережный 1.jpg,Массандра портвейн белый южнобережный 2.jpg,,2,"[{'match_percent': 100.0, 'filepath': 'IMG_2e....","[{'filepath': 'IMG_2e.jpg', 'score': 85.22, 's...","[{'match_percent': 100.0, 'filepath': 'IMG_2e....",IMG949,IMG_949e.jpg,91.18
2,1-41/Массандра портвейн красный 2.jpg,IMG3,IMG_3e.jpg,Красное,ЗГУ Крым,3,Массандра,Красное,Креплёное,Сладкое,...,Массандра портвейн красный 1.jpg,Массандра портвейн красный 2.jpg,,3,"[{'match_percent': 94.8, 'filepath': 'IMG_3e.j...","[{'filepath': 'IMG_3e.jpg', 'score': 67.06, 's...","[{'match_percent': 94.8, 'filepath': 'IMG_3e.j...",IMG6,IMG_6e.jpg,68.61
3,1-41/Массандра портвейн красный ливадия 2.jpg,IMG4,IMG_4e.jpg,Красное,ЗГУ Крым,4,Массандра,Красное,Креплёное,Сладкое,...,Массандра портвейн красный ливадия 1.jpg,Массандра портвейн красный ливадия 2.jpg,,4,"[{'match_percent': 92.89, 'filepath': 'IMG_4e....","[{'filepath': 'IMG_4e.jpg', 'score': 65.2, 'sc...","[{'match_percent': 92.89, 'filepath': 'IMG_4e....",IMG3,IMG_3e.jpg,72.26
4,1-41/Массандра седьмое небо князя голицына 2.jpg,IMG5,IMG_5e.jpg,Белое,ЗГУ Крым,5,Массандра,Белое,Креплёное,Сладкое,...,Массандра седьмое небо князя голицына 1.jpg,Массандра седьмое небо князя голицына 2.jpg,,5,"[{'match_percent': 100.0, 'filepath': 'IMG_5e....","[{'filepath': 'IMG_5e.jpg', 'score': 69.36, 's...","[{'match_percent': 100.0, 'filepath': 'IMG_5e....",IMG1,IMG_1e.jpg,73.84
5,1-41/Массандра портвей белый поручик Голицын 2...,IMG6,IMG_6e.jpg,Белое,ЗГУ Крым,6,Массандра,Белое,Креплёное,Сладкое,...,Массандра портвей белый поручик Голицын 1.jpg,Массандра портвей белый поручик Голицын 2.jpg,,6,"[{'match_percent': 91.94, 'filepath': 'IMG_6e....","[{'filepath': 'IMG_6e.jpg', 'score': 71.43, 's...","[{'match_percent': 91.94, 'filepath': 'IMG_6e....",IMG4,IMG_4e.jpg,83.76
6,1-41/Массандра херес 2.jpg,IMG7,IMG_7e.jpg,Белое,ЗГУ Крым,7,Массандра,Белое,Креплёное,Сладкое,...,Массандра херес 1.jpg,Массандра херес 2.jpg,,7,"[{'match_percent': 90.56, 'filepath': 'IMG_7e....","[{'filepath': 'IMG_7e.jpg', 'score': 73.28, 's...","[{'match_percent': 90.56, 'filepath': 'IMG_7e....",IMG29,IMG_29e.jpg,80.97
7,1-41/Массандра портвейн красный крымский 2.jpg,IMG8,IMG_8e.jpg,Красное,ЗГУ Крым,8,Массандра,Красное,Креплёное,Сладкое,...,Массандра портвейн красный крымский 1.jpg,Массандра портвейн красный крымский 2.jpg,,8,"[{'match_percent': 97.34, 'filepath': 'IMG_8e....","[{'filepath': 'IMG_8e.jpg', 'score': 68.18, 's...","[{'match_percent': 97.34, 'filepath': 'IMG_8e....",IMG9,IMG_9e.jpg,84.39
8,1-41/Массандра портвейн белый крымский 2.jpg,IMG9,IMG_9e.jpg,Белое,ЗГУ Крым,9,Массандра,Белое,Креплёное,Сладкое,...,Массандра портвейн белый крымский 1.jpg,Массандра портвейн белый крымский 2.jpg,,9,"[{'match_percent': 94.41, 'filepath': 'IMG_9e....","[{'filepath': 'IMG_9e.jpg', 'score': 66.95, 's...","[{'match_percent': 94.41, 'filepath': 'IMG_9e....",IMG8,IMG_8e.jpg,89.27
9,1-41/Массандра портвейн юелый сурож 2.jpg,IMG10,IMG_10e.jpg,Белое,ЗГУ Крым,10,Массандра,Белое,Креплёное,Сладкое,...,Массандра портвейн юелый сурож 1.jpg,Массандра портвейн юелый сурож 2.jpg,,10,"[{'match_percent': 98.19, 'filepath': 'IMG_10e...","[{'filepath': 'IMG_10e.jpg', 'score': 68.08, '...","[{'match_percent': 98.19, 'filepath': 'IMG_10e...",IMG8,IMG_8e.jpg,80.19


Unnamed: 0,image_id,filepath,nearests_parsed,neighbor_image_id,neighbor_filepath,match_percent
1,IMG1,IMG_1e.jpg,"{'match_percent': 62.31, 'filepath': 'IMG_31e....",IMG31,IMG_31e.jpg,62.31
2,IMG1,IMG_1e.jpg,"{'match_percent': 64.77, 'filepath': 'IMG_6e.j...",IMG6,IMG_6e.jpg,64.77
3,IMG1,IMG_1e.jpg,"{'match_percent': 60.86, 'filepath': 'IMG_2e.j...",IMG2,IMG_2e.jpg,60.86
4,IMG1,IMG_1e.jpg,"{'match_percent': 69.87, 'filepath': 'IMG_5e.j...",IMG5,IMG_5e.jpg,69.87
5,IMG1,IMG_1e.jpg,"{'match_percent': 58.79, 'filepath': 'IMG_29e....",IMG29,IMG_29e.jpg,58.79
6,IMG1,IMG_1e.jpg,"{'match_percent': 63.75, 'filepath': 'IMG_631e...",IMG631,IMG_631e.jpg,63.75
7,IMG1,IMG_1e.jpg,"{'match_percent': 59.06, 'filepath': 'IMG_3e.j...",IMG3,IMG_3e.jpg,59.06
8,IMG1,IMG_1e.jpg,"{'match_percent': 56.88, 'filepath': 'IMG_7e.j...",IMG7,IMG_7e.jpg,56.88
9,IMG1,IMG_1e.jpg,"{'match_percent': 42.88, 'filepath': 'IMG_4e.j...",IMG4,IMG_4e.jpg,42.88
11,IMG2,IMG_2e.jpg,"{'match_percent': 72.54, 'filepath': 'IMG_6e.j...",IMG6,IMG_6e.jpg,72.54


In [8]:
lens = df["nearests_parsed"].apply(len)
print(lens.describe())
print(lens.value_counts().head(20))

# сколько ребер получится, если просто убрать self-ссылку "в лоб"
approx_edges = (lens - 1).clip(lower=0).sum()
print("Approx edges if each list has exactly one self-link:", int(approx_edges))


count    1070.000000
mean        4.955140
std         3.284434
min         0.000000
25%         2.000000
50%         4.000000
75%         8.000000
max        10.000000
Name: nearests_parsed, dtype: float64
nearests_parsed
1     199
10    195
2     148
3     126
6      89
4      83
7      67
5      65
8      54
9      43
0       1
Name: count, dtype: int64
Approx edges if each list has exactly one self-link: 4233


In [9]:
# Найдем строки, где парс дал не список dict'ов "как надо"
bad = df[df["nearests_parsed"].apply(lambda lst: any(not isinstance(x, dict) for x in lst))]
print("Rows with non-dict items inside nearests_parsed:", len(bad))

# Найдем строки, где в dict нет image_id или match_percent
bad2 = df[df["nearests_parsed"].apply(lambda lst: any(("image_id" not in d or "match_percent" not in d) for d in lst if isinstance(d, dict)))]
print("Rows with missing keys in nearest dicts:", len(bad2))


Rows with non-dict items inside nearests_parsed: 0
Rows with missing keys in nearest dicts: 0


In [10]:
# создаем "ключ пары" без направления
a = edges[["image_id", "neighbor_image_id", "match_percent"]].copy()
a["u"] = a[["image_id","neighbor_image_id"]].min(axis=1)
a["v"] = a[["image_id","neighbor_image_id"]].max(axis=1)

und = (a.groupby(["u","v"], as_index=False)["match_percent"].max()
         .rename(columns={"u":"node1","v":"node2","match_percent":"w"}))

display(und.sort_values("w", ascending=False).head(20))


Unnamed: 0,node1,node2,w
2325,IMG937,IMG938,100.0
2305,IMG91,IMG93,100.0
2209,IMG86,IMG89,100.0
2267,IMG87,IMG89,100.0
2268,IMG87,IMG90,100.0
2285,IMG89,IMG90,100.0
2180,IMG85,IMG86,100.0
2138,IMG82,IMG83,100.0
2162,IMG84,IMG89,100.0
2177,IMG847,IMG849,100.0


In [11]:
pairs100 = und[und["w"] >= 99.9].copy()  # und из твоего кода, с node1/node2/w
ids = pd.unique(pairs100[["node1","node2"]].values.ravel("K"))

meta = df[df["image_id"].isin(ids)][
    ["image_id","manufacturer","folder_name","name","image","filepath","file_label_name","file_bottle_name"]
].sort_values(["manufacturer","folder_name","name","image_id"])

display(meta)
print("Manufacturers in 100%-pairs:", meta["manufacturer"].nunique())


Unnamed: 0,image_id,manufacturer,folder_name,name,image,filepath,file_label_name,file_bottle_name
1056,IMG1083,Alma Valley,1070-1096,Шираз,1070-1096/альма вэлли шираз (1).jpg,IMG_1083e.jpg,альма вэлли шираз (1).jpg,альма вэлли шираз (2).jpg
373,IMG383,Alma Valley,316-384,Rieslaner,316-384/alma valley рисланер 2.jpg,IMG_383e.jpg,alma valley рисланер 2.jpg,alma valley рисланер.jpg
790,IMG802,Alma Valley,799-826,Траминер,799-826/альма вэлли траминер (2).jpg,IMG_802e.jpg,альма вэлли траминер (2).jpg,альма вэлли траминер (1).jpg
109,IMG110,Alma Valley,96-151,Caberne Sauvignon,96-151/alma valley каберне совиньон (2).jpg,IMG_110e.jpg,alma valley каберне совиньон (2).jpg,alma valley каберне совиньон.jpg
110,IMG111,Alma Valley,96-151,Cabernet Franc,96-151/alma valley каберне фран (2).jpg,IMG_111e.jpg,alma valley каберне фран (2).jpg,alma valley каберне фран.jpg
...,...,...,...,...,...,...,...,...
724,IMG736,Фанагория,727-798,Саперави,727-798/фанагория крю лермонт саперави 2.jpg,IMG_736e.jpg,фанагория крю лермонт саперави 2.jpg,фанагория крю лермонт саперави.jpg
924,IMG938,Фанагория,915-953,Фанагория Брют,915-953/фанагория брют (1).jpg,IMG_938e.jpg,фанагория брют (1).jpg,фанагория брют (2).jpg
923,IMG937,Фанагория,915-953,Фанагория Полусладкое,915-953/фанагория полусладкое (1).jpg,IMG_937e.jpg,фанагория полусладкое (1).jpg,фанагория полусладкое (2).jpg
774,IMG786,Шато Тамань,727-798,Селект Руж,727-798/шато тамань селект руж 2.jpg,IMG_786e.jpg,шато тамань селект руж 2.jpg,шато тамань селект руж.jpg


Manufacturers in 100%-pairs: 26


In [12]:
# und: node1,node2,w  (симметризованный max)
pairs = und.copy()

# добавим мету к обоим концам пары
meta_cols = ["image_id","manufacturer","name","color","color_type","sparkling","sugar","vintage_year","folder_name","image","file_label_name"]
m = df[meta_cols].copy()

pairs = pairs.merge(m, left_on="node1", right_on="image_id", how="left").drop(columns=["image_id"])
pairs = pairs.merge(m, left_on="node2", right_on="image_id", how="left", suffixes=("_a","_b")).drop(columns=["image_id"])

# флаги различий по consumer-critical
def neq(a,b):
    return (a.fillna("∅") != b.fillna("∅"))

pairs["diff_color"] = neq(pairs["color_a"], pairs["color_b"])
pairs["diff_sugar"] = neq(pairs["sugar_a"], pairs["sugar_b"])
pairs["diff_sparkling"] = neq(pairs["sparkling_a"], pairs["sparkling_b"])
pairs["diff_vintage"] = neq(pairs["vintage_year_a"].astype(str), pairs["vintage_year_b"].astype(str))

pairs["diff_critical_cnt"] = pairs[["diff_color","diff_sugar","diff_sparkling"]].sum(axis=1)

# 1) внутри одного производителя
danger_inbrand = pairs[
    (pairs["manufacturer_a"] == pairs["manufacturer_b"]) &
    (pairs["w"] >= 90) &
    (pairs["diff_critical_cnt"] >= 1)
].sort_values(["diff_critical_cnt","w"], ascending=[False, False])

display(danger_inbrand.head(50)[[
    "w",
    "manufacturer_a",
    "name_a","color_a","sparkling_a","sugar_a",
    "name_b","color_b","sparkling_b","sugar_b",
    "node1","node2",
    "image_a","image_b"
]])

print("Danger pairs (in-brand):", len(danger_inbrand))


Unnamed: 0,w,manufacturer_a,name_a,color_a,sparkling_a,sugar_a,name_b,color_b,sparkling_b,sugar_b,node1,node2,image_a,image_b
691,100.0,Усадьба Дивноморское,Grand Rose,Розе,Игристое,Экстра Брют,Гранд Кюве,Белое,Игристое,Брют,IMG253,IMG990,237-315/усадьба дивноморское гранд розе 2.jpg,979-1009/усадьба дивноморское гранд кюве (1).jpg
2076,100.0,Инкерман,Winemaker's selection Мускат,Белое,Тихое,Полусладкое,Winemaker's Rose Каберне Совиньон,Розовое,Тихое,Сухое,IMG78,IMG80,77-95/inkerman muskat 2.jpg,77-95/inkerman rose 2.jpg
2185,100.0,ЗМВ Коктебель,Кокур,Белое,Игристое,Брют,Пино Нуар,Розовое,Игристое,Экстра Брют,IMG851,IMG852,827-877/коктебель игристое кокур (1).jpg,827-877/коктебель игристое ПН (1).jpg
2341,94.52,Усадьба Саркел,Петнат Сибирьковый,Белое,Игристое,Экстра Брют,Денисовский,Розе,Игристое,Сухое,IMG954,IMG979,954-978/усадьба саркел петнат сибирьковый (1).jpg,979-1009/усадьба саркел денисовский розе (1).jpg
1180,94.26,Domaine Lipko,White Blend 2021,Белое,Тихое,Полусухое,Blaufrankisch,Красное,Тихое,Сухое,IMG448,IMG451,385-456/липко белый бленд 2.jpg,385-456/липко блауфранкиш 2.jpg
351,94.09,Alma Valley,Colombard,Белое,Тихое,Полусухое,Pinot Noir,Красное,Тихое,Сухое,IMG116,IMG118,96-151/alma valley коломбард (2).jpg,96-151/alma valley пино нуар (2).jpg
2088,92.9,Шато Тамань,Селект Руж,Красное,Тихое,Полусладкое,Селект Розе,Розовое,Тихое,Сухое,IMG784,IMG787,727-798/шато тамань селект руж полусладкое 2.jpg,727-798/шато тамань селект розе 2.jpg
360,92.82,Alma Valley,Pinot Noir,Красное,Тихое,Сухое,Rieslaner,Белое,Тихое,Полусладкое,IMG118,IMG383,96-151/alma valley пино нуар (2).jpg,316-384/alma valley рисланер 2.jpg
337,91.28,Alma Valley,Pinot Gris,Белое,Тихое,Полусухое,Pinot Noir,Красное,Тихое,Сухое,IMG114,IMG118,96-151/alma valley пино гри (2).jpg,96-151/alma valley пино нуар (2).jpg
2078,90.31,Инкерман,Winemaker's selection Мускат,Белое,Тихое,Полусладкое,Winemaker's selection Саперави,Красное,Тихое,Сухое,IMG78,IMG82,77-95/inkerman muskat 2.jpg,77-95/inkerman саперави (2).jpg


Danger pairs (in-brand): 157


In [13]:
TH = 95.0

meta_cols = ["image_id","manufacturer","name","color","sparkling","sugar","vintage_year","image","filepath","file_label_name","folder_name"]
m = df[meta_cols].copy()

pairs = und[und["w"] >= TH].copy()
pairs = pairs.merge(m, left_on="node1", right_on="image_id", how="left").drop(columns=["image_id"])
pairs = pairs.merge(m, left_on="node2", right_on="image_id", how="left", suffixes=("_a","_b")).drop(columns=["image_id"])

# только внутри производителя
pairs = pairs[pairs["manufacturer_a"] == pairs["manufacturer_b"]].copy()

def neq(a, b):
    return a.fillna("∅") != b.fillna("∅")

pairs["diff_color"] = neq(pairs["color_a"], pairs["color_b"])
pairs["diff_sparkling"] = neq(pairs["sparkling_a"], pairs["sparkling_b"])
pairs["diff_sugar"] = neq(pairs["sugar_a"], pairs["sugar_b"])

# "почему опасно"
def why(row):
    bits = []
    if row["diff_color"]: bits.append("color")
    if row["diff_sparkling"]: bits.append("sparkling")
    if row["diff_sugar"]: bits.append("sugar")
    return ", ".join(bits) if bits else ""

pairs["why_diff"] = pairs.apply(why, axis=1)

# компактные подписи как на полке
pairs["sku_a"] = pairs["name_a"].fillna("") + " | " + pairs["color_a"].fillna("") + " | " + pairs["sparkling_a"].fillna("") + " | " + pairs["sugar_a"].fillna("")
pairs["sku_b"] = pairs["name_b"].fillna("") + " | " + pairs["color_b"].fillna("") + " | " + pairs["sparkling_b"].fillna("") + " | " + pairs["sugar_b"].fillna("")

# итоговая табличка
tbl = pairs[[
    "manufacturer_a","w","why_diff",
    "node1","sku_a","image_a","file_label_name_a","filepath_a",
    "node2","sku_b","image_b","file_label_name_b","filepath_b",
]].rename(columns={
    "manufacturer_a":"manufacturer",
    "w":"match_percent",
    "node1":"image_id_a",
    "node2":"image_id_b",
    "image_a":"path_image_a",
    "image_b":"path_image_b",
    "filepath_a":"embed_path_a",
    "filepath_b":"embed_path_b",
})

# сортировка: сначала самые опасные — у кого больше отличий + выше сходство
tbl["diff_cnt"] = tbl["why_diff"].apply(lambda s: 0 if s=="" else len(s.split(", ")))
tbl = tbl.sort_values(["diff_cnt","match_percent"], ascending=[False, False]).drop(columns=["diff_cnt"])

display(tbl.head(200))


Unnamed: 0,manufacturer,match_percent,why_diff,image_id_a,sku_a,path_image_a,file_label_name_a,embed_path_a,image_id_b,sku_b,path_image_b,file_label_name_b,embed_path_b
49,Усадьба Дивноморское,100.00,"color, sugar",IMG253,Grand Rose | Розе | Игристое | Экстра Брют,237-315/усадьба дивноморское гранд розе 2.jpg,усадьба дивноморское гранд розе 2.jpg,IMG_253e.jpg,IMG990,Гранд Кюве | Белое | Игристое | Брют,979-1009/усадьба дивноморское гранд кюве (1).jpg,усадьба дивноморское гранд кюве (1).jpg,IMG_990e.jpg
159,Инкерман,100.00,"color, sugar",IMG78,Winemaker's selection Мускат | Белое | Тихое |...,77-95/inkerman muskat 2.jpg,inkerman muskat 2.jpg,IMG_78e.jpg,IMG80,Winemaker's Rose Каберне Совиньон | Розовое | ...,77-95/inkerman rose 2.jpg,inkerman rose 2.jpg,IMG_80e.jpg
173,ЗМВ Коктебель,100.00,"color, sugar",IMG851,Кокур | Белое | Игристое | Брют,827-877/коктебель игристое кокур (1).jpg,коктебель игристое кокур (1).jpg,IMG_851e.jpg,IMG852,Пино Нуар | Розовое | Игристое | Экстра Брют,827-877/коктебель игристое ПН (1).jpg,коктебель игристое ПН (1).jpg,IMG_852e.jpg
1,Валерий Захарьин,100.00,color,IMG1033,Пино Нуар | Розе | Игристое | Брют,1010-1069/валерий захарьин ПН (1).jpg,валерий захарьин ПН (1).jpg,IMG_1033e.jpg,IMG1036,Рислинг | Белое | Игристое | Брют,1010-1069/валерий захарьин рислинг (1).jpg,валерий захарьин рислинг (1).jpg,IMG_1036e.jpg
3,Валерий Захарьин,100.00,color,IMG1034,Пино Гри | Розе | Игристое | Брют,1010-1069/валерий захарьин ПГ (1).jpg,валерий захарьин ПГ (1).jpg,IMG_1034e.jpg,IMG1035,Шардоне | Белое | Игристое | Брют,1010-1069/валерий захарьин шардоне (1).jpg,валерий захарьин шардоне (1).jpg,IMG_1035e.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,Галицкий и Галицкий,95.34,,IMG463,Рислинг Красная горка | Белое | Тихое | Сухое,457-520/галицкий рислинг 2.jpg,галицкий рислинг 2.jpg,IMG_463e.jpg,IMG468,Шардоне Акация Красная горка | Белое | Тихое |...,457-520/галицкий шардоне акация 2.jpg,галицкий шардоне акация 2.jpg,IMG_468e.jpg
167,"ООО ВК ""САТЕРА""",95.33,,IMG841,Esse Брют Кюве престиж | Белое | Игристое | Брют,827-877/эссе брют кюве престиж (1).jpg,эссе брют кюве престиж (1).jpg,IMG_841e.jpg,IMG842,Esse Брют Блан де Нуар | Белое | Игристое | Брют,827-877/эссе брют блан де нуар (1).jpg,эссе брют блан де нуар (1).jpg,IMG_842e.jpg
44,Andryus Yutsis,95.24,,IMG215,Шенен Блан | Белое | Тихое | Сухое,152-236/Andryus Yutsis шенен блан 2.jpg,Andryus Yutsis шенен блан 2.jpg,IMG_215e.jpg,IMG216,Мускат Блан Вионье | Белое | Тихое | Сухое,152-236/Andryus Yutsis мускат блан вионье 2.jpg,Andryus Yutsis мускат блан вионье 2.jpg,IMG_216e.jpg
91,"ООО ВК ""САТЕРА""",95.21,,IMG418,Esse Мускат | Белое | Тихое | Сухое,385-456/эссе стоунс мускат 2.jpg,эссе стоунс мускат 2.jpg,IMG_418e.jpg,IMG520,Esse Совиньон Блан | Белое | Тихое | Сухое,457-520/эссе стоунс СБ 2.jpg,эссе стоунс СБ 2.jpg,IMG_520e.jpg


In [14]:
out_path = "in_brand_pairs.csv"
tbl.to_csv(out_path, index=False, encoding="utf-8-sig")
print("Saved:", out_path, "rows:", len(tbl))


Saved: in_brand_pairs.csv rows: 192


In [15]:
import os
print("CWD:", os.getcwd())
!ls -lah


CWD: /content
total 100K
drwxr-xr-x 1 root root 4.0K Feb 13 20:07 .
drwxr-xr-x 1 root root 4.0K Feb 13 19:44 ..
drwxr-xr-x 4 root root 4.0K Jan 16 14:24 .config
drwx------ 6 root root 4.0K Feb 13 19:45 drive
-rw-r--r-- 1 root root  78K Feb 13 20:07 in_brand_pairs.csv
drwxr-xr-x 1 root root 4.0K Jan 16 14:24 sample_data


In [16]:
from google.colab import files
files.download("in_brand_pairs.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [16]:
!pip -q install networkx pyvis

import pandas as pd
import numpy as np
import networkx as nx
from pyvis.network import Network

TH = 95.0

# --- 1) симметризованные пары должны быть в und: node1,node2,w ---
# und = ...

# --- 2) мета-таблица ---
meta_cols = ["image_id","manufacturer","name","color","sparkling","sugar","image"]
m = df[meta_cols].copy()

# --- 3) отфильтруем пары >=TH и добавим метаданные ---
pairs = und[und["w"] >= TH].copy()

pairs = pairs.merge(m, left_on="node1", right_on="image_id", how="left").drop(columns=["image_id"])
pairs = pairs.merge(m, left_on="node2", right_on="image_id", how="left", suffixes=("_a","_b")).drop(columns=["image_id"])

# только внутри одного производителя
pairs_in = pairs[pairs["manufacturer_a"] == pairs["manufacturer_b"]].copy()

def neq(a, b):
    return a.fillna("∅") != b.fillna("∅")

pairs_in["danger"] = (
    neq(pairs_in["sparkling_a"], pairs_in["sparkling_b"]) |
    neq(pairs_in["sugar_a"], pairs_in["sugar_b"]) |
    neq(pairs_in["color_a"], pairs_in["color_b"])
)

# --- 4) выберем производителя с наибольшим числом пар (можешь заменить на нужного) ---
if len(pairs_in) == 0:
    raise RuntimeError(f"Нет in-brand пар с порогом w>={TH}. Попробуй снизить TH.")

top_manu = pairs_in["manufacturer_a"].value_counts().index[0]
sub = pairs_in[pairs_in["manufacturer_a"] == top_manu].copy()

print("Manufacturer:", top_manu)
print("Pairs:", len(sub))

# --- 5) построим networkx граф G ---
G = nx.Graph()
for r in sub.itertuples(index=False):
    G.add_edge(r.node1, r.node2, w=float(r.w), danger=bool(r.danger))

# --- 6) интерактивный PyVis ---
net = Network(height="750px", width="100%", notebook=True, cdn_resources="in_line", directed=False)

# добавим узлы с подсказками
node_meta = df.set_index("image_id")[["manufacturer","name","color","sparkling","sugar"]].to_dict(orient="index")

for n in G.nodes():
    md = node_meta.get(n, {})
    label = (md.get("name") or n)[:24]
    title = (
        f"<b>{md.get('manufacturer','')}</b><br>"
        f"{md.get('name','')}<br>"
        f"{md.get('color','')} | {md.get('sparkling','')} | {md.get('sugar','')}"
    )
    net.add_node(n, label=label, title=title)

for u, v, d in G.edges(data=True):
    w = d.get("w", 0.0)
    danger = d.get("danger", False)
    width = 7 if danger else 2
    net.add_edge(u, v, value=w, width=width, title=f"w={w:.2f} | danger={danger}")

from IPython.display import HTML, display

out = "inbrand_graph.html"
net.write_html(out, open_browser=False)
display(HTML(open(out, "r", encoding="utf-8").read()))




[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/756.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m604.2/756.0 kB[0m [31m17.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[?25hManufacturer: Alma Valley
Pairs: 27


In [33]:
TH = 10.0  # попробуй меньше если пар мало

meta_cols = ["image_id","manufacturer","name","color","sparkling","sugar","vintage_year","image","filepath","file_label_name","folder_name"]
m = df[meta_cols].copy()

pairs = und[und["w"] >= TH].copy()

pairs = pairs.merge(m, left_on="node1", right_on="image_id", how="left").drop(columns=["image_id"])
pairs = pairs.merge(m, left_on="node2", right_on="image_id", how="left", suffixes=("_a","_b")).drop(columns=["image_id"])

# --- CROSS-BRAND ---
pairs_x = pairs[pairs["manufacturer_a"] != pairs["manufacturer_b"]].copy()

def neq(a, b):
    return a.fillna("∅") != b.fillna("∅")

pairs_x["diff_color"] = neq(pairs_x["color_a"], pairs_x["color_b"])
pairs_x["diff_sparkling"] = neq(pairs_x["sparkling_a"], pairs_x["sparkling_b"])
pairs_x["diff_sugar"] = neq(pairs_x["sugar_a"], pairs_x["sugar_b"])

def why(row):
    bits = []
    if row["diff_color"]: bits.append("color")
    if row["diff_sparkling"]: bits.append("sparkling")
    if row["diff_sugar"]: bits.append("sugar")
    return ", ".join(bits)

pairs_x["why_diff"] = pairs_x.apply(why, axis=1)

# Полочный “ярлык”
pairs_x["sku_a"] = pairs_x["name_a"].fillna("") + " | " + pairs_x["color_a"].fillna("") + " | " + pairs_x["sparkling_a"].fillna("") + " | " + pairs_x["sugar_a"].fillna("")
pairs_x["sku_b"] = pairs_x["name_b"].fillna("") + " | " + pairs_x["color_b"].fillna("") + " | " + pairs_x["sparkling_b"].fillna("") + " | " + pairs_x["sugar_b"].fillna("")

# Сколько критических различий
pairs_x["diff_cnt"] = pairs_x[["diff_color","diff_sparkling","diff_sugar"]].sum(axis=1)

# Итоговая таблица
tbl_x = pairs_x[[
    "w",
    "manufacturer_a","sku_a","image_a","file_label_name_a","filepath_a","node1",
    "manufacturer_b","sku_b","image_b","file_label_name_b","filepath_b","node2",
    "why_diff","diff_cnt"
]].rename(columns={
    "w":"match_percent",
    "node1":"image_id_a",
    "node2":"image_id_b",
    "image_a":"path_image_a",
    "image_b":"path_image_b",
    "filepath_a":"embed_path_a",
    "filepath_b":"embed_path_b",
})

# Сортировка: сначала “самые похожие” и/или “самые опасные”
tbl_x = tbl_x.sort_values(["match_percent","diff_cnt"], ascending=[False, False]).drop(columns=["diff_cnt"])

# Красивый заголовок пары
tbl_x.insert(0, "scope", "cross_brand")
tbl_x.insert(1, "pair_title",
             tbl_x["manufacturer_a"] + ": " + tbl_x["sku_a"].str.slice(0, 35) +
             "  ↔  " +
             tbl_x["manufacturer_b"] + ": " + tbl_x["sku_b"].str.slice(0, 35)
            )

display(tbl_x.head(50))
print("Cross-brand pairs:", len(tbl_x))
print("Distinct manufacturers involved:", pd.concat([tbl_x["manufacturer_a"], tbl_x["manufacturer_b"]]).nunique())


Unnamed: 0,scope,pair_title,match_percent,manufacturer_a,sku_a,path_image_a,file_label_name_a,embed_path_a,image_id_a,manufacturer_b,sku_b,path_image_b,file_label_name_b,embed_path_b,image_id_b,why_diff
223,cross_brand,Галицкий и Галицкий: Ballet Blanc Красная горк...,67.24,Галицкий и Галицкий,Ballet Blanc Красная горка | Белое | Тихое | С...,1070-1096/галицкий балет блан магнум (1).jpg,галицкий балет блан магнум (1).jpg,IMG_1084e.jpg,IMG1084,Усадьба Дивноморское,Солист | Розовое | Тихое | Сухое,457-520/усадьба дивноморское солист 2.jpg,усадьба дивноморское солист 2.jpg,IMG_511e.jpg,IMG511,color
1066,cross_brand,Николаев и сыновья: Riesling | Белое | Тихое |...,49.77,Николаев и сыновья,Riesling | Белое | Тихое | Сухое,457-520/николаев и сыновья рислинг 2.jpg,николаев и сыновья рислинг 2.jpg,IMG_473e.jpg,IMG473,Галицкий и Галицкий,Cosaque | Красное | Тихое | Сухое,979-1009/галицкий казак магнум (1).jpg,галицкий казак магнум (1).jpg,IMG_984e.jpg,IMG984,color
960,cross_brand,Cellar Master: Мальвазия село Яркое | Белое | ...,48.04,Cellar Master,Мальвазия село Яркое | Белое | Тихое | Сухое,385-456/целар мастерс мальвазия 2.jpg,целар мастерс мальвазия 2.jpg,IMG_415e.jpg,IMG415,Шато Тамань,Delicate | Розовое | Тихое | Полусухое,727-798/шато тамань деликат розе 2.jpg,шато тамань деликат розе 2.jpg,IMG_793e.jpg,IMG793,"color, sugar"
1072,cross_brand,Николаев и сыновья: Sauvignon Blanc | Белое | ...,22.36,Николаев и сыновья,Sauvignon Blanc | Белое | Тихое | Сухое,457-520/николаев и сыновья СБ (2).jpg,николаев и сыновья СБ (2).jpg,IMG_475e.jpg,IMG475,Инкерман,Winemaker's selection Кокур | Белое | Тихое | ...,77-95/inkerman kokur blan 2.jpg,inkerman kokur blan 2.jpg,IMG_79e.jpg,IMG79,
1413,cross_brand,Николаев и сыновья: Пти Мансан | Белое | Тихое...,19.5,Николаев и сыновья,Пти Мансан | Белое | Тихое | Сладкое,631-683/николаев и сыновья пти мансан 2.jpg,николаев и сыновья пти мансан 2.jpg,IMG_637e.jpg,IMG637,Инкерман,Winemaker's selection Пино Нуар | Красное | Ти...,77-95/inkerman пино нуар (2).jpg,inkerman пино нуар (2).jpg,IMG_83e.jpg,IMG83,"color, sugar"
1073,cross_brand,Николаев и сыновья: Sauvignon Blanc | Белое | ...,17.55,Николаев и сыновья,Sauvignon Blanc | Белое | Тихое | Сухое,457-520/николаев и сыновья СБ (2).jpg,николаев и сыновья СБ (2).jpg,IMG_475e.jpg,IMG475,Инкерман,Winemaker's selection Шардоне | Белое | Тихое ...,77-95/inkerman шардоне (2).jpg,inkerman шардоне (2).jpg,IMG_81e.jpg,IMG81,
1412,cross_brand,Николаев и сыновья: Пти Мансан | Белое | Тихое...,16.29,Николаев и сыновья,Пти Мансан | Белое | Тихое | Сладкое,631-683/николаев и сыновья пти мансан 2.jpg,николаев и сыновья пти мансан 2.jpg,IMG_637e.jpg,IMG637,Инкерман,Winemaker's selection Шардоне | Белое | Тихое ...,77-95/inkerman шардоне (2).jpg,inkerman шардоне (2).jpg,IMG_81e.jpg,IMG81,sugar
1008,cross_brand,Domaine Lipko: White Blend 2021 | Белое | Тихо...,10.83,Domaine Lipko,White Blend 2021 | Белое | Тихое | Полусухое,385-456/липко белый бленд 2.jpg,липко белый бленд 2.jpg,IMG_448e.jpg,IMG448,Галицкий и Галицкий,Рислинг Красная горка | Белое | Тихое | Сухое,457-520/галицкий рислинг 2.jpg,галицкий рислинг 2.jpg,IMG_463e.jpg,IMG463,sugar
1032,cross_brand,Галицкий и Галицкий: Рислинг Красная горка | Б...,10.21,Галицкий и Галицкий,Рислинг Красная горка | Белое | Тихое | Сухое,457-520/галицкий рислинг 2.jpg,галицкий рислинг 2.jpg,IMG_463e.jpg,IMG463,Имение Сикоры,Рислинг | Белое | Тихое | Сухое,575-630/сикоры семейный резерв рислинг 2.jpg,сикоры семейный резерв рислинг 2.jpg,IMG_581e.jpg,IMG581,


Cross-brand pairs: 9
Distinct manufacturers involved: 8


In [34]:
def cross_brand_count(th):
    px = pairs[pairs["w"] >= th].copy()
    px = px[px["manufacturer_a"] != px["manufacturer_b"]]
    return len(px), px["manufacturer_a"].nunique(), px["manufacturer_b"].nunique()

for th in [95, 93, 90, 88, 85, 80, 75, 40, 30, 20]:
    cnt, na, nb = cross_brand_count(th)
    print(th, "pairs:", cnt, "| brands A:", na, "| brands B:", nb)


95 pairs: 0 | brands A: 0 | brands B: 0
93 pairs: 0 | brands A: 0 | brands B: 0
90 pairs: 0 | brands A: 0 | brands B: 0
88 pairs: 0 | brands A: 0 | brands B: 0
85 pairs: 0 | brands A: 0 | brands B: 0
80 pairs: 0 | brands A: 0 | brands B: 0
75 pairs: 0 | brands A: 0 | brands B: 0
40 pairs: 3 | brands A: 3 | brands B: 3
30 pairs: 3 | brands A: 3 | brands B: 3
20 pairs: 4 | brands A: 3 | brands B: 4


In [35]:
# edges: (image_id -> neighbor_image_id, match_percent) у тебя уже есть
# добавим производителей источника и соседа
src = df.set_index("image_id")["manufacturer"]
edges2 = edges.copy()
edges2["manu_a"] = edges2["image_id"].map(src)
edges2["manu_b"] = edges2["neighbor_image_id"].map(src)

cross_edges = edges2[edges2["manu_a"] != edges2["manu_b"]].copy()

print("Directed cross-brand edges:", len(cross_edges))
print(cross_edges["match_percent"].describe())
display(cross_edges.sort_values("match_percent", ascending=False).head(30)[
    ["match_percent","manu_a","image_id","manu_b","neighbor_image_id","neighbor_filepath"]
])


Directed cross-brand edges: 17
count    17.000000
mean     18.654118
std      18.457853
min       5.680000
25%       7.250000
50%      10.210000
75%      19.500000
max      67.240000
Name: match_percent, dtype: float64


Unnamed: 0,match_percent,manu_a,image_id,manu_b,neighbor_image_id,neighbor_filepath
2573,67.24,Усадьба Дивноморское,IMG511,Галицкий и Галицкий,IMG1084,IMG_1084e.jpg
4895,49.77,Галицкий и Галицкий,IMG984,Николаев и сыновья,IMG473,IMG_473e.jpg
4151,48.04,Шато Тамань,IMG793,Cellar Master,IMG415,IMG_415e.jpg
617,22.36,Инкерман,IMG79,Николаев и сыновья,IMG475,IMG_475e.jpg
3251,19.5,Николаев и сыновья,IMG637,Инкерман,IMG83,IMG_83e.jpg
2413,17.55,Николаев и сыновья,IMG475,Инкерман,IMG81,IMG_81e.jpg
3252,16.29,Николаев и сыновья,IMG637,Инкерман,IMG81,IMG_81e.jpg
2253,10.83,Domaine Lipko,IMG448,Галицкий и Галицкий,IMG463,IMG_463e.jpg
2917,10.21,Имение Сикоры,IMG581,Галицкий и Галицкий,IMG463,IMG_463e.jpg
3232,7.39,Усадьба Перовских,IMG634,Имение Сикоры,IMG598,IMG_598e.jpg


In [36]:
top_cross = cross_edges.sort_values("match_percent", ascending=False).head(50).copy()

# добавим названия/атрибуты
meta = df.set_index("image_id")[["manufacturer","name","color","sparkling","sugar","image"]]

top_cross["name_a"] = top_cross["image_id"].map(meta["name"])
top_cross["name_b"] = top_cross["neighbor_image_id"].map(meta["name"])
top_cross["color_a"] = top_cross["image_id"].map(meta["color"])
top_cross["color_b"] = top_cross["neighbor_image_id"].map(meta["color"])
top_cross["sparkling_a"] = top_cross["image_id"].map(meta["sparkling"])
top_cross["sparkling_b"] = top_cross["neighbor_image_id"].map(meta["sparkling"])
top_cross["sugar_a"] = top_cross["image_id"].map(meta["sugar"])
top_cross["sugar_b"] = top_cross["neighbor_image_id"].map(meta["sugar"])
top_cross["path_a"] = top_cross["image_id"].map(meta["image"])
top_cross["path_b"] = top_cross["neighbor_image_id"].map(meta["image"])

display(top_cross[[
    "match_percent","manu_a","name_a","color_a","sparkling_a","sugar_a","path_a",
    "manu_b","name_b","color_b","sparkling_b","sugar_b","path_b"
]])


Unnamed: 0,match_percent,manu_a,name_a,color_a,sparkling_a,sugar_a,path_a,manu_b,name_b,color_b,sparkling_b,sugar_b,path_b
2573,67.24,Усадьба Дивноморское,Солист,Розовое,Тихое,Сухое,457-520/усадьба дивноморское солист 2.jpg,Галицкий и Галицкий,Ballet Blanc Красная горка,Белое,Тихое,Сухое,1070-1096/галицкий балет блан магнум (1).jpg
4895,49.77,Галицкий и Галицкий,Cosaque,Красное,Тихое,Сухое,979-1009/галицкий казак магнум (1).jpg,Николаев и сыновья,Riesling,Белое,Тихое,Сухое,457-520/николаев и сыновья рислинг 2.jpg
4151,48.04,Шато Тамань,Delicate,Розовое,Тихое,Полусухое,727-798/шато тамань деликат розе 2.jpg,Cellar Master,Мальвазия село Яркое,Белое,Тихое,Сухое,385-456/целар мастерс мальвазия 2.jpg
617,22.36,Инкерман,Winemaker's selection Кокур,Белое,Тихое,Сухое,77-95/inkerman kokur blan 2.jpg,Николаев и сыновья,Sauvignon Blanc,Белое,Тихое,Сухое,457-520/николаев и сыновья СБ (2).jpg
3251,19.5,Николаев и сыновья,Пти Мансан,Белое,Тихое,Сладкое,631-683/николаев и сыновья пти мансан 2.jpg,Инкерман,Winemaker's selection Пино Нуар,Красное,Тихое,Полусухое,77-95/inkerman пино нуар (2).jpg
2413,17.55,Николаев и сыновья,Sauvignon Blanc,Белое,Тихое,Сухое,457-520/николаев и сыновья СБ (2).jpg,Инкерман,Winemaker's selection Шардоне,Белое,Тихое,Сухое,77-95/inkerman шардоне (2).jpg
3252,16.29,Николаев и сыновья,Пти Мансан,Белое,Тихое,Сладкое,631-683/николаев и сыновья пти мансан 2.jpg,Инкерман,Winemaker's selection Шардоне,Белое,Тихое,Сухое,77-95/inkerman шардоне (2).jpg
2253,10.83,Domaine Lipko,White Blend 2021,Белое,Тихое,Полусухое,385-456/липко белый бленд 2.jpg,Галицкий и Галицкий,Рислинг Красная горка,Белое,Тихое,Сухое,457-520/галицкий рислинг 2.jpg
2917,10.21,Имение Сикоры,Рислинг,Белое,Тихое,Сухое,575-630/сикоры семейный резерв рислинг 2.jpg,Галицкий и Галицкий,Рислинг Красная горка,Белое,Тихое,Сухое,457-520/галицкий рислинг 2.jpg
3232,7.39,Усадьба Перовских,Рислинг,Белое,Тихое,Полусухое,631-683/усадьба перовских рислинг резерв 2.jpg,Имение Сикоры,Рислинг,Белое,Тихое,Сухое,575-630/сикоры рислинг 2.jpg


In [23]:
import requests, pandas as pd

PUBLIC_KEY = "https://disk.yandex.ru/d/XNp82frenASxWQ"

def ydisk_public_list(public_key, path="", limit=200):
    api = "https://cloud-api.yandex.net/v1/disk/public/resources"
    r = requests.get(api, params={"public_key": public_key, "path": path, "limit": limit}, timeout=60)
    # если ошибка — покажем текст Яндекса, он полезный
    if r.status_code != 200:
        print("Status:", r.status_code)
        try:
            print(r.json())
        except Exception:
            print(r.text[:500])
        r.raise_for_status()
    return r.json()

root = ydisk_public_list(PUBLIC_KEY, path="")
print("Root name:", root.get("name"))
print("Type:", root.get("type"))
items = root.get("_embedded", {}).get("items", [])
print("Items in root:", len(items))
pd.DataFrame([{"name":it["name"], "type":it["type"], "path":it["path"]} for it in items]).head(50)


Root name: our_wine
Type: dir
Items in root: 23


Unnamed: 0,name,type,path
0,1-41,dir,/1-41
1,1010-1069,dir,/1010-1069
2,1070-1096,dir,/1070-1096
3,152-236,dir,/152-236
4,237-315,dir,/237-315
5,316-384,dir,/316-384
6,385-456,dir,/385-456
7,42-76,dir,/42-76
8,457-520,dir,/457-520
9,521-574,dir,/521-574


In [24]:
import os, requests

PUBLIC_KEY = "https://disk.yandex.ru/d/XNp82frenASxWQ"

def ydisk_public_download(public_key: str, rel_path: str, out_dir="/content/cache"):
    # нормализуем путь как у Яндекса: "/316-384/....jpg"
    rel_path = str(rel_path)
    if not rel_path.startswith("/"):
        rel_path = "/" + rel_path

    api = "https://cloud-api.yandex.net/v1/disk/public/resources/download"
    r = requests.get(api, params={"public_key": public_key, "path": rel_path}, timeout=60)
    if r.status_code != 200:
        # покажем полезную ошибку, если снова не найдёт
        try:
            print("Error:", r.status_code, r.json())
        except Exception:
            print("Error:", r.status_code, r.text[:300])
        r.raise_for_status()

    href = r.json()["href"]

    # локальный путь сохраняем без ведущего /
    local_rel = rel_path.lstrip("/")
    out_path = os.path.join(out_dir, local_rel)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    with requests.get(href, stream=True, timeout=120) as resp:
        resp.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)

    return out_path


In [25]:
a_rel = "316-384/маутин игл алеатико розе 2.jpg"
b_rel = "316-384/маутин игл шардоне 2.jpg"

a_local = ydisk_public_download(PUBLIC_KEY, a_rel)
b_local = ydisk_public_download(PUBLIC_KEY, b_rel)

print(a_local)
print(b_local)


/content/cache/316-384/маутин игл алеатико розе 2.jpg
/content/cache/316-384/маутин игл шардоне 2.jpg


In [29]:
!pip -q install requests pillow


In [37]:
print("tbl rows:", len(tbl))

# tbl_x должен существовать
print("tbl_x exists:", "tbl_x" in globals())
if "tbl_x" in globals():
    print("tbl_x rows:", len(tbl_x))
    # ключевой тест: должны быть разные производители
    if "manufacturer_a" in tbl_x.columns and "manufacturer_b" in tbl_x.columns:
        print("cross-brand rows (manufacturer_a != manufacturer_b):",
              (tbl_x["manufacturer_a"] != tbl_x["manufacturer_b"]).sum())


tbl rows: 192
tbl_x exists: True
tbl_x rows: 9
cross-brand rows (manufacturer_a != manufacturer_b): 9


In [39]:
!pip -q install requests pillow

import os, re, time, html, shutil, requests
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image

PUBLIC_KEY = "https://disk.yandex.ru/d/XNp82frenASxWQ"

def ydisk_public_download(public_key: str, rel_path: str, out_dir="/content/cache", timeout=60):
    rel_path = str(rel_path)
    if not rel_path.startswith("/"):
        rel_path = "/" + rel_path

    local_rel = rel_path.lstrip("/")
    out_path = os.path.join(out_dir, local_rel)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
        return out_path

    api = "https://cloud-api.yandex.net/v1/disk/public/resources/download"
    r = requests.get(api, params={"public_key": public_key, "path": rel_path}, timeout=timeout)
    r.raise_for_status()
    href = r.json()["href"]

    with requests.get(href, stream=True, timeout=180) as resp:
        resp.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)

    return out_path

def safe_filename(s: str, max_len=140):
    s = str(s)
    s = re.sub(r"[\\/:*?\"<>|]+", "_", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s[:max_len].rstrip()

def make_thumb(src_path, dst_path, max_side=900, quality=80):
    dst_path = Path(dst_path)
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    if dst_path.exists():
        return str(dst_path)
    im = Image.open(src_path).convert("RGB")
    im.thumbnail((max_side, max_side))
    im.save(dst_path, format="JPEG", quality=quality, optimize=True)
    return str(dst_path)

def build_html(tbl, out_dir="/content/pairs_html", max_pairs=None,
               thumbs_max_side=900, workers=12, title="Pairs catalog"):
    out_dir = Path(out_dir)
    img_dir = out_dir / "img"
    img_dir.mkdir(parents=True, exist_ok=True)

    rows = tbl.to_dict("records") if max_pairs is None else tbl.head(max_pairs).to_dict("records")

    # 1) уникальные картинки
    uniq_paths = set()
    for r in rows:
        uniq_paths.add(r["path_image_a"])
        uniq_paths.add(r["path_image_b"])
    uniq_paths = sorted(uniq_paths)

    print(f"Pairs: {len(rows)} | unique images: {len(uniq_paths)} | workers: {workers}")

    # 2) скачивание параллельно
    local_map = {}
    failed = []

    def task(rel):
        try:
            p = ydisk_public_download(PUBLIC_KEY, rel)
            return rel, p, None
        except Exception as e:
            return rel, None, str(e)

    with ThreadPoolExecutor(max_workers=workers) as ex:
        futures = [ex.submit(task, rel) for rel in uniq_paths]
        done = 0
        for fut in as_completed(futures):
            rel, p, err = fut.result()
            done += 1
            if err:
                failed.append((rel, err))
            else:
                local_map[rel] = p
            if done % 25 == 0 or done == len(uniq_paths):
                print(f"Downloaded {done}/{len(uniq_paths)}")

    # 3) миниатюры
    thumb_map = {}
    for i, rel in enumerate(uniq_paths, start=1):
        if rel not in local_map:
            continue
        fname = safe_filename(Path(rel).name)
        thumb_name = f"{i:04d}_{fname}.jpg"
        thumb_path = img_dir / thumb_name
        make_thumb(local_map[rel], thumb_path, max_side=thumbs_max_side, quality=80)
        thumb_map[rel] = f"img/{thumb_name}"

    # 4) HTML
    parts = []
    parts.append("<!doctype html><html><head><meta charset='utf-8'>")
    parts.append("<meta name='viewport' content='width=device-width, initial-scale=1'>")
    parts.append("<style>")
    parts.append("""
    body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial;margin:18px; color:#111;}
    h1{font-size:20px; margin:0 0 8px;}
    .meta{color:#555; font-size:13px; margin-bottom:14px;}
    .item{border:1px solid #e6e6e6; border-radius:12px; padding:12px; margin:14px 0;}
    .hdr{display:flex; justify-content:space-between; gap:12px; flex-wrap:wrap; margin-bottom:10px;}
    .badge{display:inline-block; padding:3px 8px; border-radius:999px; font-size:12px; background:#f3f3f3;}
    .badge.danger{background:#ffe8e8;}
    .grid{display:flex; gap:14px; flex-wrap:wrap;}
    .card{flex:1 1 420px;}
    .cap{font-size:13px; margin:0 0 6px 0; line-height:1.25;}
    .cap b{font-weight:700;}
    img{max-width:100%; border:1px solid #ddd; border-radius:10px; background:#fafafa;}
    .path{font-size:11px; color:#666; margin-top:6px; word-break:break-all;}
    </style></head><body>
    """)
    parts.append(f"<h1>{html.escape(title)}</h1>")
    parts.append(f"<div class='meta'>Pairs: {len(rows)} • Unique images: {len(uniq_paths)} • Failed downloads: {len(failed)}</div>")

    missing_pairs = 0
    for idx, r in enumerate(rows, start=1):
        manu = r.get("manufacturer", "") or r.get("manufacturer_a","")
        mp = float(r.get("match_percent",0) or 0)
        why = r.get("why_diff","") or ""
        badge_cls = "badge danger" if why else "badge"
        badge_txt = f"diff: {why}" if why else "same attributes"

        a_rel = r["path_image_a"]
        b_rel = r["path_image_b"]
        a_img = thumb_map.get(a_rel)
        b_img = thumb_map.get(b_rel)
        if (a_img is None) or (b_img is None):
            missing_pairs += 1

        parts.append("<div class='item'>")
        parts.append("<div class='hdr'>")
        parts.append(f"<div><span class='badge'>{html.escape(str(manu))}</span> "
                     f"<span class='badge'>match: {mp:.2f}</span> "
                     f"<span class='{badge_cls}'>{html.escape(badge_txt)}</span></div>")
        parts.append(f"<div class='meta'>#{idx}</div>")
        parts.append("</div>")

        parts.append("<div class='grid'>")
        parts.append("<div class='card'>")
        parts.append(f"<p class='cap'><b>A:</b> {html.escape(r.get('sku_a','') or '')}</p>")
        parts.append(f"<img src='{html.escape(a_img)}'>" if a_img else "<div class='meta'>A missing</div>")
        parts.append(f"<div class='path'>{html.escape(str(a_rel))}</div>")
        parts.append("</div>")

        parts.append("<div class='card'>")
        parts.append(f"<p class='cap'><b>B:</b> {html.escape(r.get('sku_b','') or '')}</p>")
        parts.append(f"<img src='{html.escape(b_img)}'>" if b_img else "<div class='meta'>B missing</div>")
        parts.append(f"<div class='path'>{html.escape(str(b_rel))}</div>")
        parts.append("</div>")
        parts.append("</div></div>")

    parts.append(f"<div class='meta'>Missing pairs (due to failed downloads): {missing_pairs}</div>")
    parts.append("</body></html>")

    html_path = out_dir / "index.html"
    html_path.write_text("\n".join(parts), encoding="utf-8")

    return str(html_path), str(out_dir), failed


In [40]:
html_path_x, folder_path_x, failed_x = build_html(
    tbl_x,
    out_dir="/content/cross_brand_pairs_html_fast",
    max_pairs=None,
    thumbs_max_side=900,
    workers=12,
    title="Cross-brand label similarity (top pairs)"
)

print("HTML:", html_path_x)
print("Folder:", folder_path_x)
print("Failed:", len(failed_x))


Pairs: 9 | unique images: 14 | workers: 12
Downloaded 14/14
HTML: /content/cross_brand_pairs_html_fast/index.html
Folder: /content/cross_brand_pairs_html_fast
Failed: 0


In [41]:
import shutil
zip_path = shutil.make_archive("/content/cross_brand_pairs_html_fast", "zip", "/content/cross_brand_pairs_html_fast")

from google.colab import files
files.download(zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [42]:
print("df:", "df" in globals(), "shape:", getattr(df, "shape", None))
print("und:", "und" in globals(), "shape:", getattr(und, "shape", None))


df: True shape: (1070, 22)
und: True shape: (2358, 3)


In [43]:
!pip -q install pyvis networkx


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/756.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m686.1/756.0 kB[0m [31m20.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m756.0/756.0 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.6/1.6 MB[0m [31m77.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [44]:
import pandas as pd
import networkx as nx
from collections import Counter

mode = "in_brand"   # поменяй на "cross_brand" позже
TH_IN = 95.0
TH_X  = 55.0
MAX_EDGES_PER_NODE = 6 if mode=="in_brand" else 3

meta_cols = ["image_id","manufacturer","name","color","sparkling","sugar","image","filepath","file_label_name","folder_name"]
m = df[meta_cols].copy().set_index("image_id")

if mode == "in_brand":
    TH = TH_IN
    e = und[und["w"] >= TH].copy()
else:
    TH = TH_X
    e = und[und["w"] >= TH].copy()
    # оставляем только межбренд
    e = e.merge(m[["manufacturer"]], left_on="node1", right_index=True, how="left")
    e = e.merge(m[["manufacturer"]], left_on="node2", right_index=True, how="left", suffixes=("_a","_b"))
    e = e[e["manufacturer_a"] != e["manufacturer_b"]][["node1","node2","w"]]

# degree-cap: оставляем top-K ребер на узел (и по node1, и по node2)
e1 = e.sort_values("w", ascending=False).groupby("node1", as_index=False).head(MAX_EDGES_PER_NODE)
e2 = e.sort_values("w", ascending=False).groupby("node2", as_index=False).head(MAX_EDGES_PER_NODE)
e_lim = pd.concat([e1, e2], ignore_index=True).drop_duplicates(["node1","node2","w"])

print("Mode:", mode, "| TH:", TH, "| edges:", len(e), "->", len(e_lim))
print("Nodes:", pd.unique(e_lim[["node1","node2"]].values.ravel()).size)

G = nx.Graph()
for r in e_lim.itertuples(index=False):
    G.add_edge(r.node1, r.node2, weight=float(r.w))


Mode: in_brand | TH: 95.0 | edges: 192 -> 192
Nodes: 200


In [45]:
from networkx.algorithms.community import greedy_modularity_communities

# community detection
communities = list(greedy_modularity_communities(G, weight="weight"))
print("Communities found:", len(communities))

# назначаем id кластера узлам
node2cluster = {}
for cid, comm in enumerate(communities):
    for n in comm:
        node2cluster[n] = cid

nx.set_node_attributes(G, node2cluster, "cluster")


Communities found: 67


In [46]:
def cluster_summary(G, m, communities):
    rows = []
    for cid, comm in enumerate(communities):
        comm = list(comm)
        manus = [m.loc[n, "manufacturer"] for n in comm if n in m.index]
        cnt = Counter(manus)
        top_manu, top_cnt = (cnt.most_common(1)[0] if cnt else ("?", 0))
        purity = top_cnt / max(1, len(comm))

        # средний вес ребер внутри кластера
        sub = G.subgraph(comm)
        ws = [d.get("weight", 0.0) for _,_,d in sub.edges(data=True)]
        w_mean = sum(ws)/len(ws) if ws else 0.0

        rows.append({
            "cluster": cid,
            "nodes": len(comm),
            "edges_in": sub.number_of_edges(),
            "w_mean_in": round(w_mean, 2),
            "top_manufacturer": top_manu,
            "top_share": round(purity, 3),
            "manufacturers_n": len(cnt),
            "top3_manufacturers": ", ".join([f"{k}({v})" for k,v in cnt.most_common(3)])
        })
    return pd.DataFrame(rows).sort_values(["nodes","edges_in"], ascending=False)

clu = cluster_summary(G, m, communities)
display(clu.head(30))


Unnamed: 0,cluster,nodes,edges_in,w_mean_in,top_manufacturer,top_share,manufacturers_n,top3_manufacturers
1,1,9,20,99.64,Uppa Winery,1.0,1,Uppa Winery(9)
0,0,9,17,98.8,Alma Valley,1.0,1,Alma Valley(9)
2,2,7,20,99.48,Виноградники Гай-Кодзора,1.0,1,Виноградники Гай-Кодзора(7)
3,3,6,9,98.93,Инкерман,1.0,1,Инкерман(6)
4,4,5,9,100.0,Alma Valley,1.0,1,Alma Valley(5)
5,5,5,9,98.85,Domaine Lipko,1.0,1,Domaine Lipko(5)
6,6,5,5,98.49,Николаев и сыновья,1.0,1,Николаев и сыновья(5)
7,7,5,4,98.81,Валерий Захарьин,1.0,1,Валерий Захарьин(5)
8,8,4,4,100.0,Villa di Alma,1.0,1,Villa di Alma(4)
10,10,4,4,100.0,Инкерман,1.0,1,Инкерман(4)


In [47]:
from pyvis.network import Network
import hashlib

def color_for_cluster(cid):
    h = hashlib.md5(str(cid).encode("utf-8")).hexdigest()
    return "#" + h[:6]

def sku(md):
    return f"{md.get('name','') or ''} | {md.get('color','') or ''} | {md.get('sparkling','') or ''} | {md.get('sugar','') or ''}".strip(" |")

net = Network(height="820px", width="100%", bgcolor="#ffffff", font_color="#111", notebook=True)
net.force_atlas_2based(gravity=-35, central_gravity=0.01, spring_length=150, spring_strength=0.04, damping=0.6)

# узлы
deg = dict(G.degree())
for n in G.nodes():
    md = m.loc[n].to_dict() if n in m.index else {}
    cid = G.nodes[n].get("cluster", -1)
    title = (
        f"<b>{n}</b><br>"
        f"cluster: {cid}<br>"
        f"{md.get('manufacturer','')}<br>"
        f"{sku(md)}<br>"
        f"path: {md.get('image','')}"
    )
    net.add_node(
        n,
        label=n,
        title=title,
        color=color_for_cluster(cid),
        size=8 + 2*min(deg.get(n,0), 14)
    )

# ребра
for u, v, d in G.edges(data=True):
    w = d.get("weight", 0.0)
    width = 1 + (w - TH) / max(1, (100-TH)) * 6 if mode=="in_brand" else 1 + (w - TH)/max(1,(70-TH))*6
    net.add_edge(u, v, title=f"match: {w:.2f}", width=width, value=w)

net.show_buttons(filter_=["physics", "interaction", "layout"])

out_html = f"/content/graph_{mode}_clusters.html"
net.save_graph(out_html)
out_html




'/content/graph_in_brand_clusters.html'

In [48]:
from IPython.display import IFrame, display
display(IFrame(out_html, width="100%", height=860))


In [49]:
from google.colab import files
files.download(out_html)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [50]:
!pip -q install networkx pyvis


In [51]:
import pandas as pd
import networkx as nx
from collections import Counter
import hashlib

# метаданные (можешь добавить поля при желании)
meta_cols = ["image_id","manufacturer","name","color","sparkling","sugar","image","filepath","file_label_name","folder_name"]
m = df[meta_cols].copy().set_index("image_id")

TH = 95.0              # порог похожести для "почти одинаковые"
MAX_EDGES_PER_NODE = 6 # ограничитель лапши

# отфильтровали ребра
e = und[und["w"] >= TH].copy()
print("Edges >= TH:", len(e))


Edges >= TH: 192


In [52]:
# degree-cap: оставляем top-K ребер для каждого узла (и по node1 и по node2)
e1 = e.sort_values("w", ascending=False).groupby("node1", as_index=False).head(MAX_EDGES_PER_NODE)
e2 = e.sort_values("w", ascending=False).groupby("node2", as_index=False).head(MAX_EDGES_PER_NODE)
e_lim = pd.concat([e1, e2], ignore_index=True).drop_duplicates(["node1","node2","w"])

print("Edges after cap:", len(e_lim))
print("Nodes:", pd.unique(e_lim[["node1","node2"]].values.ravel()).size)

G = nx.Graph()
for r in e_lim.itertuples(index=False):
    G.add_edge(r.node1, r.node2, weight=float(r.w))

print("Graph:", G.number_of_nodes(), "nodes,", G.number_of_edges(), "edges")


Edges after cap: 192
Nodes: 200
Graph: 200 nodes, 192 edges


In [53]:
from networkx.algorithms.community import greedy_modularity_communities

communities = list(greedy_modularity_communities(G, weight="weight"))
print("Communities found:", len(communities))

node2cluster = {}
for cid, comm in enumerate(communities):
    for n in comm:
        node2cluster[n] = cid

nx.set_node_attributes(G, node2cluster, "cluster")


Communities found: 67


In [54]:
def cluster_summary(G, m, communities):
    rows = []
    for cid, comm in enumerate(communities):
        comm = list(comm)
        manus = [m.loc[n, "manufacturer"] for n in comm if n in m.index]
        cnt = Counter(manus)
        top_manu, top_cnt = (cnt.most_common(1)[0] if cnt else ("?", 0))
        purity = top_cnt / max(1, len(comm))

        sub = G.subgraph(comm)
        ws = [d.get("weight", 0.0) for _,_,d in sub.edges(data=True)]
        w_mean = sum(ws)/len(ws) if ws else 0.0

        rows.append({
            "cluster": cid,
            "nodes": len(comm),
            "edges_in": sub.number_of_edges(),
            "w_mean_in": round(w_mean, 2),
            "top_manufacturer": top_manu,
            "top_share": round(purity, 3),
            "manufacturers_n": len(cnt),
            "top3_manufacturers": ", ".join([f"{k}({v})" for k,v in cnt.most_common(3)])
        })
    return pd.DataFrame(rows).sort_values(["nodes","edges_in"], ascending=False)

clu = cluster_summary(G, m, communities)
display(clu.head(30))


Unnamed: 0,cluster,nodes,edges_in,w_mean_in,top_manufacturer,top_share,manufacturers_n,top3_manufacturers
1,1,9,20,99.64,Uppa Winery,1.0,1,Uppa Winery(9)
0,0,9,17,98.8,Alma Valley,1.0,1,Alma Valley(9)
2,2,7,20,99.48,Виноградники Гай-Кодзора,1.0,1,Виноградники Гай-Кодзора(7)
3,3,6,9,98.93,Инкерман,1.0,1,Инкерман(6)
4,4,5,9,100.0,Alma Valley,1.0,1,Alma Valley(5)
5,5,5,9,98.85,Domaine Lipko,1.0,1,Domaine Lipko(5)
6,6,5,5,98.49,Николаев и сыновья,1.0,1,Николаев и сыновья(5)
7,7,5,4,98.81,Валерий Захарьин,1.0,1,Валерий Захарьин(5)
8,8,4,4,100.0,Villa di Alma,1.0,1,Villa di Alma(4)
10,10,4,4,100.0,Инкерман,1.0,1,Инкерман(4)


In [55]:
def sku(md):
    return f"{md.get('name','') or ''} | {md.get('color','') or ''} | {md.get('sparkling','') or ''} | {md.get('sugar','') or ''}".strip(" |")

def short_label(md, max_len=18):
    s = (md.get("name") or "").strip()
    if not s:
        return md.get("image_id") or ""
    s = s.replace("\n"," ").strip()
    return (s[:max_len] + "…") if len(s) > max_len else s

def color_for_cluster(cid):
    h = hashlib.md5(str(cid).encode("utf-8")).hexdigest()
    return "#" + h[:6]


In [56]:
from pyvis.network import Network

deg = dict(G.degree())

net = Network(height="860px", width="100%", bgcolor="#ffffff", font_color="#111", notebook=True)
net.force_atlas_2based(gravity=-35, central_gravity=0.01, spring_length=150, spring_strength=0.04, damping=0.6)

for n in G.nodes():
    md = m.loc[n].to_dict() if n in m.index else {}
    md["image_id"] = n
    cid = G.nodes[n].get("cluster", -1)

    title = (
        f"<b>{n}</b><br>"
        f"cluster: {cid}<br>"
        f"{md.get('manufacturer','')}<br>"
        f"{sku(md)}<br>"
        f"path: {md.get('image','')}"
    )

    net.add_node(
        n,
        label=short_label(md, max_len=18),     # <-- видно на графе
        title=title,                           # <-- при наведении
        color=color_for_cluster(cid),          # <-- цвет кластера
        size=8 + 2*min(deg.get(n,0), 14)
    )

for u, v, d in G.edges(data=True):
    w = d.get("weight", 0.0)
    width = 1 + (w - TH) / max(1, (100-TH)) * 6
    net.add_edge(u, v, title=f"match: {w:.2f}", width=width, value=w)

net.show_buttons(filter_=["physics", "interaction", "layout"])

out_html = "/content/graph_in_brand_clusters_named.html"
net.save_graph(out_html)
print("Saved:", out_html)


Saved: /content/graph_in_brand_clusters_named.html


In [57]:
from IPython.display import IFrame, display
display(IFrame(out_html, width="100%", height=900))


In [58]:
from google.colab import files
files.download(out_html)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [59]:
import pandas as pd
import networkx as nx
from collections import Counter
import hashlib

meta_cols = ["image_id","manufacturer","name","color","sparkling","sugar","image","filepath","file_label_name","folder_name"]
m = df[meta_cols].copy().set_index("image_id")

# 1) берём все ребра и приклеиваем производителей
e = und.copy()
e = e.merge(m[["manufacturer"]], left_on="node1", right_index=True, how="left")
e = e.merge(m[["manufacturer"]], left_on="node2", right_index=True, how="left", suffixes=("_a","_b"))

# 2) оставляем только МЕЖБРЕНД
e = e[e["manufacturer_a"] != e["manufacturer_b"]][["node1","node2","w","manufacturer_a","manufacturer_b"]].copy()

print("Cross-brand edges total:", len(e))
print(e["w"].describe())


Cross-brand edges total: 16
count    16.000000
mean     19.366875
std      18.820017
min       5.680000
25%       7.225000
50%      10.520000
75%      20.215000
max      67.240000
Name: w, dtype: float64


In [60]:
TOP_EDGES = 400          # 200–800, зависит от "густоты" графа
MAX_EDGES_PER_NODE = 3   # держит лапшу в узде

# берём top-N по весу
e_top = e.sort_values("w", ascending=False).head(TOP_EDGES).copy()

# degree-cap: top-K на узел (в обе стороны)
e1 = e_top.sort_values("w", ascending=False).groupby("node1", as_index=False).head(MAX_EDGES_PER_NODE)
e2 = e_top.sort_values("w", ascending=False).groupby("node2", as_index=False).head(MAX_EDGES_PER_NODE)
e_lim = pd.concat([e1, e2], ignore_index=True).drop_duplicates(["node1","node2","w"])

print("Edges after topN+cap:", len(e_lim))
print("Nodes:", pd.unique(e_lim[["node1","node2"]].values.ravel()).size)


Edges after topN+cap: 16
Nodes: 24


In [61]:
Gx = nx.Graph()
for r in e_lim.itertuples(index=False):
    Gx.add_edge(r.node1, r.node2, weight=float(r.w))

print("Graph:", Gx.number_of_nodes(), "nodes,", Gx.number_of_edges(), "edges")


Graph: 24 nodes, 16 edges


In [62]:
from networkx.algorithms.community import greedy_modularity_communities

communities_x = list(greedy_modularity_communities(Gx, weight="weight"))
print("Communities found:", len(communities_x))

node2cluster_x = {}
for cid, comm in enumerate(communities_x):
    for n in comm:
        node2cluster_x[n] = cid

nx.set_node_attributes(Gx, node2cluster_x, "cluster")


Communities found: 8


In [63]:
def cluster_summary(G, m, communities):
    rows = []
    for cid, comm in enumerate(communities):
        comm = list(comm)
        manus = [m.loc[n, "manufacturer"] for n in comm if n in m.index]
        cnt = Counter(manus)

        top_manu, top_cnt = (cnt.most_common(1)[0] if cnt else ("?", 0))
        top_share = top_cnt / max(1, len(comm))

        sub = G.subgraph(comm)
        ws = [d.get("weight", 0.0) for _,_,d in sub.edges(data=True)]
        w_mean = sum(ws)/len(ws) if ws else 0.0

        rows.append({
            "cluster": cid,
            "nodes": len(comm),
            "edges_in": sub.number_of_edges(),
            "w_mean_in": round(w_mean, 2),
            "manufacturers_n": len(cnt),
            "top_manufacturer": top_manu,
            "top_share": round(top_share, 3),
            "top5_manufacturers": ", ".join([f"{k}({v})" for k,v in cnt.most_common(5)])
        })
    return pd.DataFrame(rows).sort_values(["manufacturers_n","nodes"], ascending=False)

clu_x = cluster_summary(Gx, m, communities_x)
display(clu_x.head(30))


Unnamed: 0,cluster,nodes,edges_in,w_mean_in,manufacturers_n,top_manufacturer,top_share,top5_manufacturers
1,1,5,4,8.89,4,Имение Сикоры,0.4,"Имение Сикоры(2), Усадьба Перовских(1), Галицк..."
0,0,6,5,16.59,3,Инкерман,0.5,"Инкерман(3), Николаев и сыновья(2), Domaine Li..."
2,2,3,2,6.63,3,Cellar Master,0.333,"Cellar Master(1), Инкерман(1), Валерий Захарьи..."
3,3,2,1,67.24,2,Галицкий и Галицкий,0.5,"Галицкий и Галицкий(1), Усадьба Дивноморское(1)"
4,4,2,1,49.77,2,Галицкий и Галицкий,0.5,"Галицкий и Галицкий(1), Николаев и сыновья(1)"
5,5,2,1,48.04,2,Cellar Master,0.5,"Cellar Master(1), Шато Тамань(1)"
6,6,2,1,7.34,2,Фанагория,0.5,"Фанагория(1), Uppa Winery(1)"
7,7,2,1,5.68,2,Temelion,0.5,"Temelion(1), Золотая балка(1)"


In [64]:
!pip -q install pyvis
from pyvis.network import Network

def sku(md):
    return f"{md.get('name','') or ''} | {md.get('color','') or ''} | {md.get('sparkling','') or ''} | {md.get('sugar','') or ''}".strip(" |")

def short_label(md, max_len=18):
    s = (md.get("name") or "").strip()
    if not s:
        return md.get("image_id") or ""
    s = s.replace("\n"," ").strip()
    return (s[:max_len] + "…") if len(s) > max_len else s

def color_for_cluster(cid):
    h = hashlib.md5(str(cid).encode("utf-8")).hexdigest()
    return "#" + h[:6]

deg = dict(Gx.degree())

net = Network(height="880px", width="100%", bgcolor="#ffffff", font_color="#111", notebook=True)
net.force_atlas_2based(gravity=-35, central_gravity=0.01, spring_length=170, spring_strength=0.04, damping=0.6)

for n in Gx.nodes():
    md = m.loc[n].to_dict() if n in m.index else {}
    md["image_id"] = n
    cid = Gx.nodes[n].get("cluster", -1)

    title = (
        f"<b>{n}</b><br>"
        f"cluster: {cid}<br>"
        f"{md.get('manufacturer','')}<br>"
        f"{sku(md)}<br>"
        f"path: {md.get('image','')}"
    )

    net.add_node(
        n,
        label=short_label(md, max_len=18),
        title=title,
        color=color_for_cluster(cid),
        size=8 + 2*min(deg.get(n,0), 14)
    )

# толщина ребра по w (в межбренде диапазон обычно 5..67, так что нормируем на max_w)
wmax = max([d.get("weight",0.0) for _,_,d in Gx.edges(data=True)] + [1.0])
wmin = min([d.get("weight",0.0) for _,_,d in Gx.edges(data=True)] + [0.0])

def edge_width_cross(w, min_w=0.8, max_w=9.0):
    t = (w - wmin) / max(1e-9, (wmax - wmin))
    t = max(0.0, min(1.0, t))
    return min_w + t*(max_w-min_w)

for u, v, d in Gx.edges(data=True):
    w = float(d.get("weight", 0.0))
    net.add_edge(u, v, title=f"match: {w:.2f}", width=edge_width_cross(w), value=w)

net.show_buttons(filter_=["physics", "interaction", "layout"])

out_html = "/content/graph_cross_brand_clusters_named.html"
net.save_graph(out_html)
print("Saved:", out_html)


Saved: /content/graph_cross_brand_clusters_named.html


In [65]:
from google.colab import files
files.download(out_html)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [66]:
# для текущего Gx/communities_x (которые ты уже построила)
pairs = []
for cid, comm in enumerate(communities_x):
    comm = list(comm)
    if len(comm) != 2:
        continue
    a, b = comm[0], comm[1]
    w = Gx[a][b]["weight"] if Gx.has_edge(a,b) else None
    ma = m.loc[a,"manufacturer"] if a in m.index else "?"
    mb = m.loc[b,"manufacturer"] if b in m.index else "?"
    na = m.loc[a,"name"] if a in m.index else ""
    nb = m.loc[b,"name"] if b in m.index else ""
    pairs.append({"cluster": cid, "w": w, "A": a, "manu_a": ma, "name_a": na,
                  "B": b, "manu_b": mb, "name_b": nb})

pairs_df = pd.DataFrame(pairs).sort_values("w", ascending=False)
display(pairs_df.head(30))


Unnamed: 0,cluster,w,A,manu_a,name_a,B,manu_b,name_b
0,3,67.24,IMG1084,Галицкий и Галицкий,Ballet Blanc Красная горка,IMG511,Усадьба Дивноморское,Солист
1,4,49.77,IMG984,Галицкий и Галицкий,Cosaque,IMG473,Николаев и сыновья,Riesling
2,5,48.04,IMG415,Cellar Master,Мальвазия село Яркое,IMG793,Шато Тамань,Delicate
3,6,7.34,IMG750,Фанагория,Декантер Саперави,IMG272,Uppa Winery,Pavel Shvets Gewurztraminer
4,7,5.68,IMG1051,Temelion,Темелион блан де блан 18,IMG991,Золотая балка,Кюве Де Витмер


In [67]:
!pip -q install pillow requests


In [68]:
import os, re, html, shutil, requests
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from PIL import Image
from collections import Counter

# должно уже быть:
# Gx (networkx Graph), communities_x (list of sets), m (df indexed by image_id), PUBLIC_KEY

def safe_filename(s: str, max_len=140):
    s = str(s)
    s = re.sub(r"[\\/:*?\"<>|]+", "_", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s[:max_len].rstrip()

def make_thumb(src_path, dst_path, max_side=900, quality=80):
    dst_path = Path(dst_path)
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    if dst_path.exists():
        return str(dst_path)
    im = Image.open(src_path).convert("RGB")
    im.thumbnail((max_side, max_side))
    im.save(dst_path, format="JPEG", quality=quality, optimize=True)
    return str(dst_path)

def ydisk_public_download(public_key: str, rel_path: str, out_dir="/content/cache", timeout=60):
    # rel_path у тебя в m["image"] вида "316-384/....jpg"
    rel_path = str(rel_path)
    if not rel_path.startswith("/"):
        rel_path = "/" + rel_path

    local_rel = rel_path.lstrip("/")
    out_path = os.path.join(out_dir, local_rel)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
        return out_path

    api = "https://cloud-api.yandex.net/v1/disk/public/resources/download"
    r = requests.get(api, params={"public_key": public_key, "path": rel_path}, timeout=timeout)
    r.raise_for_status()
    href = r.json()["href"]

    with requests.get(href, stream=True, timeout=180) as resp:
        resp.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
    return out_path

def sku_row(md):
    return f"{md.get('name','') or ''} | {md.get('color','') or ''} | {md.get('sparkling','') or ''} | {md.get('sugar','') or ''}".strip(" |")

def build_cluster_report(G, communities, m, public_key,
                         out_dir="/content/clusters_report",
                         workers=12,
                         thumbs_max_side=900,
                         min_cluster_size=2):
    out_dir = Path(out_dir)
    img_dir = out_dir / "img"
    img_dir.mkdir(parents=True, exist_ok=True)

    # ---- собираем кластера и сортируем по "интересности": сначала большие/плотные ----
    cluster_infos = []
    all_nodes = set()
    for cid, comm in enumerate(communities):
        nodes = list(comm)
        if len(nodes) < min_cluster_size:
            continue
        sub = G.subgraph(nodes)
        ws = [d.get("weight",0.0) for _,_,d in sub.edges(data=True)]
        w_mean = sum(ws)/len(ws) if ws else 0.0
        manus = [m.loc[n,"manufacturer"] for n in nodes if n in m.index]
        cnt = Counter(manus)
        cluster_infos.append({
            "cluster": cid,
            "nodes": nodes,
            "n_nodes": len(nodes),
            "n_edges": sub.number_of_edges(),
            "w_mean": w_mean,
            "manufacturers": cnt,
            "manufacturers_n": len(cnt),
        })
        all_nodes.update(nodes)

    cluster_infos.sort(key=lambda x: (x["manufacturers_n"], x["n_nodes"], x["n_edges"], x["w_mean"]), reverse=True)

    # ---- уникальные изображения для скачивания ----
    # в m у тебя есть колонка "image" = относительный путь в яндекс диске
    uniq_paths = []
    for n in all_nodes:
        if n in m.index:
            p = m.loc[n, "image"]
            if isinstance(p, str) and p.strip():
                uniq_paths.append(p.strip())
    uniq_paths = sorted(set(uniq_paths))

    print(f"Clusters: {len(cluster_infos)} | nodes total: {len(all_nodes)} | unique images: {len(uniq_paths)}")

    # ---- скачиваем параллельно ----
    local_map = {}
    failed = []

    def task(rel):
        try:
            return rel, ydisk_public_download(public_key, rel), None
        except Exception as e:
            return rel, None, str(e)

    with ThreadPoolExecutor(max_workers=workers) as ex:
        futs = [ex.submit(task, rel) for rel in uniq_paths]
        done = 0
        for fut in as_completed(futs):
            rel, loc, err = fut.result()
            done += 1
            if err:
                failed.append((rel, err))
            else:
                local_map[rel] = loc
            if done % 25 == 0 or done == len(uniq_paths):
                print(f"Downloaded {done}/{len(uniq_paths)}")

    # ---- миниатюры ----
    thumb_map = {}
    for i, rel in enumerate(uniq_paths, start=1):
        if rel not in local_map:
            continue
        fname = safe_filename(Path(rel).name)
        thumb_name = f"{i:05d}_{fname}.jpg"
        thumb_path = img_dir / thumb_name
        make_thumb(local_map[rel], thumb_path, max_side=thumbs_max_side, quality=80)
        thumb_map[rel] = f"img/{thumb_name}"

    # ---- HTML ----
    parts = []
    parts.append("<!doctype html><html><head><meta charset='utf-8'>")
    parts.append("<meta name='viewport' content='width=device-width, initial-scale=1'>")
    parts.append("<style>")
    parts.append("""
    body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial;margin:18px;color:#111;}
    h1{font-size:20px;margin:0 0 10px;}
    .meta{color:#555;font-size:13px;margin-bottom:14px;}
    .toc{display:flex;flex-wrap:wrap;gap:8px;margin:10px 0 18px;}
    .chip{display:inline-block;padding:6px 10px;border-radius:999px;background:#f3f3f3;
          font-size:12px;text-decoration:none;color:#111;border:1px solid #e6e6e6;}
    .chip:hover{background:#eaeaea;}
    .cluster{border:1px solid #e6e6e6;border-radius:14px;padding:14px;margin:14px 0;}
    .hdr{display:flex;gap:10px;flex-wrap:wrap;align-items:baseline;justify-content:space-between;}
    .title{font-weight:700;}
    .badges{display:flex;gap:8px;flex-wrap:wrap}
    .badge{display:inline-block;padding:3px 8px;border-radius:999px;font-size:12px;background:#f7f7f7;border:1px solid #eee;}
    .grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));gap:12px;margin-top:12px;}
    .card{border:1px solid #eee;border-radius:12px;padding:10px;background:#fff;}
    img{max-width:100%;border:1px solid #ddd;border-radius:10px;background:#fafafa;}
    .cap{font-size:12px;line-height:1.25;margin:8px 0 0;}
    .cap b{font-weight:700;}
    .small{font-size:11px;color:#666;word-break:break-all;margin-top:6px;}
    </style></head><body>
    """)
    parts.append("<h1>Visual clusters report (cross-brand)</h1>")
    parts.append(f"<div class='meta'>Clusters: {len(cluster_infos)} • Nodes: {len(all_nodes)} • Failed images: {len(failed)}</div>")

    # оглавление
    parts.append("<div class='toc'>")
    for info in cluster_infos:
        cid = info["cluster"]
        parts.append(f"<a class='chip' href='#c{cid}'>Cluster {cid} · {info['n_nodes']} nodes · {info['manufacturers_n']} brands</a>")
    parts.append("</div>")

    for info in cluster_infos:
        cid = info["cluster"]
        nodes = info["nodes"]
        sub = G.subgraph(nodes)

        top5 = ", ".join([f"{k}({v})" for k,v in info["manufacturers"].most_common(5)]) if info["manufacturers"] else "?"

        parts.append(f"<div class='cluster' id='c{cid}'>")
        parts.append("<div class='hdr'>")
        parts.append(f"<div class='title'>Cluster {cid}</div>")
        parts.append("<div class='badges'>")
        parts.append(f"<span class='badge'>{info['n_nodes']} nodes</span>")
        parts.append(f"<span class='badge'>{info['n_edges']} edges</span>")
        parts.append(f"<span class='badge'>w_mean_in: {info['w_mean']:.2f}</span>")
        parts.append(f"<span class='badge'>{info['manufacturers_n']} manufacturers</span>")
        parts.append("</div></div>")
        parts.append(f"<div class='meta'>Top manufacturers: {html.escape(top5)}</div>")

        # сортируем узлы внутри кластера по степени в субграфе (первое что смотришь)
        deg_sub = dict(sub.degree())
        nodes_sorted = sorted(nodes, key=lambda n: deg_sub.get(n,0), reverse=True)

        parts.append("<div class='grid'>")
        for n in nodes_sorted:
            if n not in m.index:
                continue
            md = m.loc[n].to_dict()
            rel = md.get("image","")
            img_src = thumb_map.get(rel)

            parts.append("<div class='card'>")
            if img_src:
                parts.append(f"<img src='{html.escape(img_src)}'>")
            else:
                parts.append("<div class='meta'>image missing</div>")

            parts.append(f"<div class='cap'><b>{html.escape(md.get('manufacturer',''))}</b></div>")
            parts.append(f"<div class='cap'>{html.escape(md.get('name','') or '')}</div>")
            parts.append(f"<div class='cap'>{html.escape(sku_row(md))}</div>")

            # покажем связи узла внутри кластера (быстро понять "центр")
            parts.append(f"<div class='small'>node: {html.escape(n)} · deg_in_cluster: {deg_sub.get(n,0)}</div>")
            parts.append(f"<div class='small'>path: {html.escape(str(rel))}</div>")
            parts.append("</div>")
        parts.append("</div>")  # grid

        parts.append("</div>")  # cluster

    parts.append("</body></html>")
    html_path = out_dir / "index.html"
    out_dir.mkdir(parents=True, exist_ok=True)
    html_path.write_text("\n".join(parts), encoding="utf-8")

    return str(html_path), str(out_dir), failed

html_path, folder_path, failed = build_cluster_report(
    Gx, communities_x, m, PUBLIC_KEY,
    out_dir="/content/cross_brand_clusters_report",
    workers=12,
    thumbs_max_side=900,
    min_cluster_size=2
)

print("HTML:", html_path)
print("Folder:", folder_path)
print("Failed:", len(failed))


Clusters: 8 | nodes total: 24 | unique images: 24
Downloaded 24/24
HTML: /content/cross_brand_clusters_report/index.html
Folder: /content/cross_brand_clusters_report
Failed: 0


In [69]:
import shutil
zip_path = shutil.make_archive("/content/cross_brand_clusters_report", "zip", "/content/cross_brand_clusters_report")

from google.colab import files
files.download(zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [73]:
TOP_EDGES = 1200
MAX_EDGES_PER_NODE = 3
TH_X = 2.0   # отрезаем совсем слабые

# межбрендовые ребра
e = und.copy()
e = e.merge(m[["manufacturer"]], left_on="node1", right_index=True, how="left")
e = e.merge(m[["manufacturer"]], left_on="node2", right_index=True, how="left", suffixes=("_a","_b"))
e = e[(e["manufacturer_a"] != e["manufacturer_b"]) & (e["w"] >= TH_X)][["node1","node2","w"]].copy()

# top-N + degree cap
e_top = e.sort_values("w", ascending=False).head(TOP_EDGES).copy()
e1 = e_top.sort_values("w", ascending=False).groupby("node1", as_index=False).head(MAX_EDGES_PER_NODE)
e2 = e_top.sort_values("w", ascending=False).groupby("node2", as_index=False).head(MAX_EDGES_PER_NODE)
e_lim = pd.concat([e1, e2], ignore_index=True).drop_duplicates(["node1","node2","w"])

Gx = nx.Graph()
for r in e_lim.itertuples(index=False):
    Gx.add_edge(r.node1, r.node2, weight=float(r.w))

from networkx.algorithms.community import greedy_modularity_communities
communities_x = list(greedy_modularity_communities(Gx, weight="weight"))

print("Edges:", len(e), "->", len(e_lim))
print("Graph:", Gx.number_of_nodes(), "nodes,", Gx.number_of_edges(), "edges")
print("Communities:", len(communities_x))

clu_x = cluster_summary(Gx, m, communities_x)
display(clu_x.head(20))


Edges: 16 -> 16
Graph: 24 nodes, 16 edges
Communities: 8


Unnamed: 0,cluster,nodes,edges_in,w_mean_in,manufacturers_n,top_manufacturer,top_share,top5_manufacturers
1,1,5,4,8.89,4,Имение Сикоры,0.4,"Имение Сикоры(2), Усадьба Перовских(1), Галицк..."
0,0,6,5,16.59,3,Инкерман,0.5,"Инкерман(3), Николаев и сыновья(2), Domaine Li..."
2,2,3,2,6.63,3,Cellar Master,0.333,"Cellar Master(1), Инкерман(1), Валерий Захарьи..."
3,3,2,1,67.24,2,Галицкий и Галицкий,0.5,"Галицкий и Галицкий(1), Усадьба Дивноморское(1)"
4,4,2,1,49.77,2,Галицкий и Галицкий,0.5,"Галицкий и Галицкий(1), Николаев и сыновья(1)"
5,5,2,1,48.04,2,Cellar Master,0.5,"Cellar Master(1), Шато Тамань(1)"
6,6,2,1,7.34,2,Фанагория,0.5,"Фанагория(1), Uppa Winery(1)"
7,7,2,1,5.68,2,Temelion,0.5,"Temelion(1), Золотая балка(1)"


In [74]:
import numpy as np
import pandas as pd

# узлы, которые вообще встречаются в und
nodes = pd.Index(sorted(set(und["node1"]).union(set(und["node2"]))))
n = len(nodes)
print("Nodes:", n)

# индексатор
idx = pd.Series(np.arange(n), index=nodes)

# матрица расстояний: по умолчанию 100 (очень далеко)
D = np.full((n, n), 100.0, dtype=np.float32)
np.fill_diagonal(D, 0.0)

# заполняем по und: dist = 100 - w
ii = idx.loc[und["node1"]].to_numpy()
jj = idx.loc[und["node2"]].to_numpy()
dist = (100.0 - und["w"].to_numpy(dtype=np.float32))

D[ii, jj] = np.minimum(D[ii, jj], dist)
D[jj, ii] = np.minimum(D[jj, ii], dist)  # симметрия

print("D ready:", D.shape, "min/max:", float(D.min()), float(D.max()))


Nodes: 880
D ready: (880, 880) min/max: 0.0 100.0


In [75]:
rng = np.random.default_rng(42)

def kmedoids_pam_light(D, k=12, iters=8, seed=42):
    rng = np.random.default_rng(seed)
    n = D.shape[0]

    # старт: случайные медоиды
    medoids = rng.choice(n, size=k, replace=False)

    for t in range(iters):
        # assign: ближайший медоид
        dist_to_medoids = D[:, medoids]               # (n, k)
        labels = dist_to_medoids.argmin(axis=1)       # (n,)
        best_cost = dist_to_medoids.min(axis=1).sum()

        improved = False

        # для каждого кластера попробуем выбрать "лучший" медоид внутри кластера
        for c in range(k):
            members = np.where(labels == c)[0]
            if len(members) == 0:
                # пустой кластер: перезапускаем медоид в случайную точку
                medoids[c] = rng.integers(0, n)
                improved = True
                continue

            # cost(x) = сумма расстояний от x до всех members
            # выберем x минимизирующий cost
            subD = D[np.ix_(members, members)]  # (m, m)
            costs = subD.sum(axis=1)
            new_medoid = members[costs.argmin()]

            if new_medoid != medoids[c]:
                medoids[c] = new_medoid
                improved = True

        # пересчёт стоимости после обновления
        dist_to_medoids2 = D[:, medoids]
        new_cost = dist_to_medoids2.min(axis=1).sum()

        print(f"iter {t+1}/{iters} | cost: {best_cost:.1f} -> {new_cost:.1f} | changed: {improved}")

        if not improved:
            break

    # финальные labels
    labels = D[:, medoids].argmin(axis=1)
    return medoids, labels

K = 12
medoids_idx, labels = kmedoids_pam_light(D, k=K, iters=10, seed=42)
print("Medoids:", medoids_idx[:10], "...")


iter 1/10 | cost: 83302.9 -> 82048.2 | changed: True
iter 2/10 | cost: 82048.2 -> 82048.2 | changed: False
Medoids: [137  74 681 866 570 787 113 298  84 608] ...


In [76]:
meta_cols = ["image_id","manufacturer","name","color","sparkling","sugar","image"]
m = df[meta_cols].copy().set_index("image_id")

def sku(md):
    return f"{md.get('name','') or ''} | {md.get('color','') or ''} | {md.get('sparkling','') or ''} | {md.get('sugar','') or ''}".strip(" |")

medoid_ids = nodes[medoids_idx].to_list()

rows = []
for c in range(K):
    members = nodes[np.where(labels == c)[0]]
    manus = [m.loc[x, "manufacturer"] for x in members if x in m.index]
    top = pd.Series(manus).value_counts().head(5)
    rows.append({
        "cluster": c,
        "size": len(members),
        "medoid_image_id": medoid_ids[c],
        "medoid_manufacturer": m.loc[medoid_ids[c], "manufacturer"] if medoid_ids[c] in m.index else "?",
        "medoid_name": m.loc[medoid_ids[c], "name"] if medoid_ids[c] in m.index else "",
        "brands_n": len(set(manus)),
        "top5_brands": ", ".join([f"{k}({v})" for k,v in top.items()]) if len(top) else ""
    })

arch = pd.DataFrame(rows).sort_values(["brands_n","size"], ascending=False)
display(arch)


Unnamed: 0,cluster,size,medoid_image_id,medoid_manufacturer,medoid_name,brands_n,top5_brands
0,0,797,IMG19,Массандра,Портвейн белый Алушта,86,"Фанагория(57), Массандра(45), ООО ВК ""САТЕРА""(..."
2,2,11,IMG769,Chateau Le Grand Vostock,Каберне Фран,1,Chateau Le Grand Vostock(11)
6,6,11,IMG150,Alma Valley,Kokur,1,Alma Valley(11)
9,9,10,IMG697,Собер Баш,Красностоп,1,Собер Баш(10)
10,10,10,IMG106,Усадьба Перовских,Мускат,1,Усадьба Перовских(10)
11,11,10,IMG175,Шато Ай-Даниль,Багряное солнечное,1,Шато Ай-Даниль(10)
3,3,8,IMG982,Скалистый берег,Красная Книга глава 2,1,Скалистый берег(8)
5,5,7,IMG89,Инкерман,Эритаж Пино Нуар Классический,1,Инкерман(7)
1,1,6,IMG111,Alma Valley,Cabernet Franc,1,Alma Valley(6)
8,8,4,IMG120,ЗМВ Коктебель,Магнетизм Розе Мерло,1,ЗМВ Коктебель(4)


In [77]:
TOP_PER_ARCHETYPE = 24

examples = []
for c in range(K):
    med = medoids_idx[c]
    members = np.where(labels == c)[0]
    # сортируем участников кластера по расстоянию к медоиду
    dists = D[members, med]
    order = members[np.argsort(dists)]
    pick = order[:TOP_PER_ARCHETYPE]

    for rank, j in enumerate(pick, start=1):
        img_id = nodes[j]
        md = m.loc[img_id].to_dict() if img_id in m.index else {}
        examples.append({
            "cluster": c,
            "rank": rank,
            "dist_to_medoid": float(D[j, med]),
            "match_to_medoid": float(100.0 - D[j, med]),
            "image_id": img_id,
            "manufacturer": md.get("manufacturer",""),
            "sku": sku(md),
            "path_image": md.get("image","")
        })

ex_df = pd.DataFrame(examples).sort_values(["cluster","rank"])
display(ex_df.head(40))


Unnamed: 0,cluster,rank,dist_to_medoid,match_to_medoid,image_id,manufacturer,sku,path_image
0,0,1,0.0,100.0,IMG19,Массандра,Портвейн белый Алушта | Белое | Креплёное | Сл...,1-41/Массандра портвейн белый алушта 2.jpg
1,0,2,10.080002,89.919998,IMG17,Массандра,Портвейн розовый Алушта | Розовое | Креплёное ...,1-41/Массандра портвейн розовый алушта 2.jpg
2,0,3,17.050003,82.949997,IMG16,Массандра,Портвейн красный Алушта | Красное | Креплёное ...,1-41/Массандра портвейн красный алушта 2.jpg
3,0,4,21.68,78.32,IMG15,Массандра,Портвейн белый гурзуф | Белое | Креплёное | Сл...,1-41/Массандра портвейн белый гурзуф 2.jpg
4,0,5,23.559998,76.440002,IMG10,Массандра,Портвейн белый сурож | Белое | Креплёное | Сла...,1-41/Массандра портвейн юелый сурож 2.jpg
5,0,6,28.739998,71.260002,IMG948,Массандра,Кагор Гурзуф | Красное | Креплёное | Сладкое,915-953/массандра кагор гурзуф (1).jpg
6,0,7,31.449997,68.550003,IMG8,Массандра,Портвейн красный Крымский | Красное | Креплёно...,1-41/Массандра портвейн красный крымский 2.jpg
7,0,8,31.839996,68.160004,IMG11,Массандра,Мадера Крымская | Белое | Креплёное | Сладкое,1-41/Массандра мадера крымская 2.jpg
8,0,9,32.300003,67.699997,IMG14,Массандра,Портвейн розовый гурзуф | Розовое | Креплёное ...,1-41/Массандра портвейн розовый гурзуф 2.jpg
9,0,10,32.809998,67.190002,IMG13,Массандра,Херес ореанда | Белое | Креплёное | Сухое,1-41/Массандра херес ореанда 2.jpg


In [78]:
# словарь cluster -> list of image relative paths
arch_imgs = {}
for c in range(K):
    arch_imgs[c] = ex_df[ex_df["cluster"]==c]["path_image"].dropna().tolist()

# покажем размеры
for c in range(K):
    print(c, "imgs:", len(arch_imgs[c]))


0 imgs: 24
1 imgs: 6
2 imgs: 11
3 imgs: 8
4 imgs: 3
5 imgs: 7
6 imgs: 11
7 imgs: 3
8 imgs: 4
9 imgs: 10
10 imgs: 10
11 imgs: 10


In [79]:
from pathlib import Path
import html as _html

out_dir = Path("/content/prototype_catalog")
img_dir = out_dir / "img"
img_dir.mkdir(parents=True, exist_ok=True)

# скачиваем и делаем thumbs только для уникальных картинок
uniq = sorted(set([p for lst in arch_imgs.values() for p in lst if isinstance(p,str) and p.strip()]))
print("Unique images to fetch:", len(uniq))

local_map = {}
thumb_map = {}

# параллельно можно, но для простоты — последовательно (у тебя немного)
for i, rel in enumerate(uniq, start=1):
    loc = ydisk_public_download(PUBLIC_KEY, rel)
    fname = safe_filename(Path(rel).name)
    thumb_name = f"{i:05d}_{fname}.jpg"
    thumb_path = img_dir / thumb_name
    make_thumb(loc, thumb_path, max_side=900, quality=80)
    thumb_map[rel] = f"img/{thumb_name}"

# HTML
parts = []
parts.append("<!doctype html><html><head><meta charset='utf-8'>")
parts.append("<meta name='viewport' content='width=device-width, initial-scale=1'>")
parts.append("""
<style>
body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial;margin:18px;color:#111;}
h1{font-size:20px;margin:0 0 10px;}
.meta{color:#555;font-size:13px;margin-bottom:14px;}
.toc{display:flex;flex-wrap:wrap;gap:8px;margin:10px 0 18px;}
.chip{display:inline-block;padding:6px 10px;border-radius:999px;background:#f3f3f3;
      font-size:12px;text-decoration:none;color:#111;border:1px solid #e6e6e6;}
.section{border:1px solid #e6e6e6;border-radius:14px;padding:14px;margin:14px 0;}
.grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));gap:12px;margin-top:12px;}
.card{border:1px solid #eee;border-radius:12px;padding:10px;background:#fff;}
img{max-width:100%;border:1px solid #ddd;border-radius:10px;background:#fafafa;}
.cap{font-size:12px;line-height:1.25;margin:8px 0 0;}
.small{font-size:11px;color:#666;word-break:break-all;margin-top:6px;}
</style>
</head><body>
""")
parts.append("<h1>Prototype catalog (k-medoids)</h1>")
parts.append(f"<div class='meta'>K={K} • TOP_PER_ARCHETYPE={TOP_PER_ARCHETYPE} • Unique images={len(uniq)}</div>")

# оглавление
parts.append("<div class='toc'>")
for _, r in arch.iterrows():
    c = int(r["cluster"])
    parts.append(f"<a class='chip' href='#c{c}'>Cluster {c} · size {int(r['size'])} · brands {int(r['brands_n'])}</a>")
parts.append("</div>")

for _, r in arch.iterrows():
    c = int(r["cluster"])
    med_id = r["medoid_image_id"]
    med_md = m.loc[med_id].to_dict() if med_id in m.index else {}
    med_sku = sku(med_md)

    parts.append(f"<div class='section' id='c{c}'>")
    parts.append(f"<div><b>Cluster {c}</b> · size {int(r['size'])} · brands {int(r['brands_n'])}</div>")
    parts.append(f"<div class='meta'>Medoid: {_html.escape(med_id)} — {_html.escape(med_sku)} — {_html.escape(r['medoid_manufacturer'])}</div>")
    parts.append(f"<div class='meta'>Top brands: {_html.escape(r['top5_brands'])}</div>")

    # карточки топ-рядом с медоидом
    parts.append("<div class='grid'>")
    sub = ex_df[ex_df["cluster"]==c].sort_values("rank")
    for _, ex in sub.iterrows():
        rel = ex["path_image"]
        img_src = thumb_map.get(rel)
        parts.append("<div class='card'>")
        if img_src:
            parts.append(f"<img src='{_html.escape(img_src)}'>")
        parts.append(f"<div class='cap'><b>{_html.escape(ex['manufacturer'])}</b></div>")
        parts.append(f"<div class='cap'>{_html.escape(ex['sku'])}</div>")
        parts.append(f"<div class='small'>match_to_medoid: {ex['match_to_medoid']:.2f}</div>")
        parts.append(f"<div class='small'>path: {_html.escape(str(rel))}</div>")
        parts.append("</div>")
    parts.append("</div></div>")

parts.append("</body></html>")

out_dir.mkdir(parents=True, exist_ok=True)
index_html = out_dir / "index.html"
index_html.write_text("\n".join(parts), encoding="utf-8")
print("Saved:", str(index_html))


Unique images to fetch: 107
Saved: /content/prototype_catalog/index.html


In [80]:
import shutil
zip_path = shutil.make_archive("/content/prototype_catalog", "zip", "/content/prototype_catalog")
from google.colab import files
files.download(zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [81]:
import numpy as np
import pandas as pd

meta_cols = ["image_id","manufacturer","name","color","sparkling","sugar","image"]
m = df[meta_cols].copy().set_index("image_id")

TH_X = 12.0  # минимум межбрендового сигнала, подними/опусти (10..20)
TOP_EDGES = 1500  # ограничим объём, чтобы не распухало

e = und.copy()
e = e.merge(m[["manufacturer"]], left_on="node1", right_index=True, how="left")
e = e.merge(m[["manufacturer"]], left_on="node2", right_index=True, how="left", suffixes=("_a","_b"))
e = e[(e["manufacturer_a"] != e["manufacturer_b"]) & (e["w"] >= TH_X)][["node1","node2","w"]]
e = e.sort_values("w", ascending=False).head(TOP_EDGES).copy()

nodes = pd.Index(sorted(set(e["node1"]).union(set(e["node2"]))))
print("Cross-brand edges used:", len(e))
print("Nodes participating:", len(nodes))
e["w"].describe()


Cross-brand edges used: 7
Nodes participating: 11


Unnamed: 0,w
count,7.0
mean,34.392857
std,20.329035
min,16.29
25%,18.525
50%,22.36
75%,48.905
max,67.24


In [82]:
n = len(nodes)
idx = pd.Series(np.arange(n), index=nodes)

D = np.full((n, n), 100.0, dtype=np.float32)
np.fill_diagonal(D, 0.0)

ii = idx.loc[e["node1"]].to_numpy()
jj = idx.loc[e["node2"]].to_numpy()
dist = (100.0 - e["w"].to_numpy(dtype=np.float32))

D[ii, jj] = np.minimum(D[ii, jj], dist)
D[jj, ii] = np.minimum(D[jj, ii], dist)

print("D ready:", D.shape, "min/max:", float(D.min()), float(D.max()))


D ready: (11, 11) min/max: 0.0 100.0


In [83]:
def kmedoids_pam_light(D, k=8, iters=10, seed=42):
    rng = np.random.default_rng(seed)
    n = D.shape[0]
    medoids = rng.choice(n, size=min(k,n), replace=False)

    for t in range(iters):
        dist_to = D[:, medoids]
        labels = dist_to.argmin(axis=1)
        cost0 = dist_to.min(axis=1).sum()

        improved = False
        for c in range(len(medoids)):
            members = np.where(labels == c)[0]
            if len(members) == 0:
                medoids[c] = rng.integers(0, n)
                improved = True
                continue
            subD = D[np.ix_(members, members)]
            costs = subD.sum(axis=1)
            new_med = members[costs.argmin()]
            if new_med != medoids[c]:
                medoids[c] = new_med
                improved = True

        cost1 = D[:, medoids].min(axis=1).sum()
        print(f"iter {t+1}: cost {cost0:.1f} -> {cost1:.1f} | changed: {improved}")
        if not improved:
            break

    labels = D[:, medoids].argmin(axis=1)
    return medoids, labels

K = 8
medoids_idx, labels = kmedoids_pam_light(D, k=K, iters=12, seed=42)
medoid_ids = nodes[medoids_idx].to_list()
print("Medoids:", medoid_ids)


iter 1: cost 135.0 -> 134.9 | changed: True
iter 2: cost 134.9 -> 134.9 | changed: False
Medoids: ['IMG83', 'IMG473', 'IMG81', 'IMG637', 'IMG415', 'IMG79', 'IMG1084', 'IMG475']


In [84]:
def sku(md):
    return f"{md.get('name','') or ''} | {md.get('color','') or ''} | {md.get('sparkling','') or ''} | {md.get('sugar','') or ''}".strip(" |")

rows = []
for c in range(K):
    members = nodes[np.where(labels == c)[0]]
    manus = [m.loc[x,"manufacturer"] for x in members if x in m.index]
    vc = pd.Series(manus).value_counts()
    rows.append({
        "cluster": c,
        "size": len(members),
        "medoid": medoid_ids[c],
        "medoid_brand": m.loc[medoid_ids[c],"manufacturer"] if medoid_ids[c] in m.index else "?",
        "medoid_name": m.loc[medoid_ids[c],"name"] if medoid_ids[c] in m.index else "",
        "brands_n": int(vc.size),
        "top5_brands": ", ".join([f"{k}({v})" for k,v in vc.head(5).items()])
    })

arch_x = pd.DataFrame(rows).sort_values(["brands_n","size"], ascending=False)
display(arch_x)


Unnamed: 0,cluster,size,medoid,medoid_brand,medoid_name,brands_n,top5_brands
1,1,2,IMG473,Николаев и сыновья,Riesling,2,"Николаев и сыновья(1), Галицкий и Галицкий(1)"
4,4,2,IMG415,Cellar Master,Мальвазия село Яркое,2,"Cellar Master(1), Шато Тамань(1)"
6,6,2,IMG1084,Галицкий и Галицкий,Ballet Blanc Красная горка,2,"Галицкий и Галицкий(1), Усадьба Дивноморское(1)"
0,0,1,IMG83,Инкерман,Winemaker's selection Пино Нуар,1,Инкерман(1)
2,2,1,IMG81,Инкерман,Winemaker's selection Шардоне,1,Инкерман(1)
3,3,1,IMG637,Николаев и сыновья,Пти Мансан,1,Николаев и сыновья(1)
5,5,1,IMG79,Инкерман,Winemaker's selection Кокур,1,Инкерман(1)
7,7,1,IMG475,Николаев и сыновья,Sauvignon Blanc,1,Николаев и сыновья(1)


In [85]:
import pandas as pd
import numpy as np

# если у тебя есть edges_df уже — пропусти
# иначе строим directed edges из nearests_parsed
rows = []
for r in df[["image_id","nearests_parsed"]].itertuples(index=False):
    src = r.image_id
    lst = r.nearests_parsed or []
    for d in lst:
        if not isinstance(d, dict):
            continue
        dst = d.get("image_id")
        w = d.get("match_percent")
        if dst is None or w is None:
            continue
        if src == dst:
            continue
        rows.append((src, dst, float(w)))

edges_dir = pd.DataFrame(rows, columns=["src","dst","w"])
print("Directed edges:", len(edges_dir))
edges_dir["w"].describe()


Directed edges: 4233


Unnamed: 0,w
count,4233.0
mean,46.661439
std,30.933542
min,5.01
25%,15.45
50%,44.94
75%,74.03
max,100.0


In [86]:
# делаем ключи и ищем взаимность
a = edges_dir.copy()
b = edges_dir.rename(columns={"src":"dst","dst":"src","w":"w_rev"})

ab = a.merge(b, on=["src","dst"], how="inner")  # ребра, которые есть в обе стороны
# симметризуем: src<dst
ab["u"] = ab[["src","dst"]].min(axis=1)
ab["v"] = ab[["src","dst"]].max(axis=1)

# итоговый вес можно взять как среднее или min (min более "строгий")
mk = (ab.groupby(["u","v"], as_index=False)
        .agg(w=("w","mean"), w_rev=("w_rev","mean")))
mk["w_mknn"] = (mk["w"] + mk["w_rev"]) / 2.0

print("Mutual edges:", len(mk))
mk["w_mknn"].describe()


Mutual edges: 1875


Unnamed: 0,w_mknn
count,1875.0
mean,49.456168
std,30.077007
min,5.17
25%,19.235
50%,50.065
75%,75.875
max,100.0


In [87]:
# медиана и MAD для src по направленным ребрам
stats = (edges_dir.groupby("src")["w"]
         .agg(med="median",
              mad=lambda s: float(np.median(np.abs(s - np.median(s)))) )
         .reset_index())

edges_z = edges_dir.merge(stats, left_on="src", right_on="src", how="left")
edges_z["z"] = (edges_z["w"] - edges_z["med"]) / (edges_z["mad"] + 1e-6)

# теперь опять взаимность, но уже по z
a = edges_z[["src","dst","w","z"]].copy()
b = edges_z[["src","dst","w","z"]].rename(columns={"src":"dst","dst":"src","w":"w_rev","z":"z_rev"})
abz = a.merge(b, on=["src","dst"], how="inner")

abz["u"] = abz[["src","dst"]].min(axis=1)
abz["v"] = abz[["src","dst"]].max(axis=1)

mknn_norm = (abz.groupby(["u","v"], as_index=False)
             .agg(w=("w","mean"),
                  z_u=("z","mean"),
                  z_v=("z_rev","mean")))

# общий нормированный вес: берём min (строже) и плюс сырой w
mknn_norm["z_min"] = mknn_norm[["z_u","z_v"]].min(axis=1)
print("mknn_norm:", len(mknn_norm))
mknn_norm[["w","z_min"]].describe()


mknn_norm: 1875


Unnamed: 0,w,z_min
count,1875.0,1875.0
mean,49.456168,-136542.9
std,30.077007,2083370.0
min,5.17,-41835000.0
25%,19.235,-0.8045718
50%,50.065,0.0
75%,75.875,0.9725915
max,100.0,20050000.0


In [88]:
meta_cols = ["image_id","manufacturer","name","color","sparkling","sugar","image"]
m = df[meta_cols].copy().set_index("image_id")

x = mknn_norm.merge(m[["manufacturer"]], left_on="u", right_index=True, how="left")
x = x.merge(m[["manufacturer"]], left_on="v", right_index=True, how="left", suffixes=("_a","_b"))
x = x[x["manufacturer_a"] != x["manufacturer_b"]].copy()

print("Cross-brand mutual edges:", len(x))
display(x.sort_values(["z_min","w"], ascending=False).head(20))


Cross-brand mutual edges: 1


Unnamed: 0,u,v,w,z_u,z_v,z_min,manufacturer_a,manufacturer_b
961,IMG463,IMG581,8.73,-1.506873,-1.506873,-1.506873,Галицкий и Галицкий,Имение Сикоры


In [89]:
import pandas as pd
import numpy as np

meta = df[["image_id","manufacturer"]].copy().set_index("image_id")

e = und.copy()
e = e.merge(meta, left_on="node1", right_index=True, how="left")
e = e.merge(meta, left_on="node2", right_index=True, how="left", suffixes=("_a","_b"))

# межбренд
e = e[e["manufacturer_a"] != e["manufacturer_b"]].copy()

print("Cross-brand edges:", len(e))
display(e[["node1","node2","w","manufacturer_a","manufacturer_b"]].head(5))


Cross-brand edges: 16


Unnamed: 0,node1,node2,w,manufacturer_a,manufacturer_b
182,IMG1050,IMG350,6.19,Валерий Захарьин,Cellar Master
187,IMG1051,IMG991,5.68,Temelion,Золотая балка
273,IMG1084,IMG511,67.24,Галицкий и Галицкий,Усадьба Дивноморское
754,IMG272,IMG750,7.34,Uppa Winery,Фанагория
964,IMG350,IMG90,7.08,Cellar Master,Инкерман


In [90]:
brand_sizes = meta["manufacturer"].value_counts().rename("brand_size").to_frame()

e = e.merge(brand_sizes, left_on="manufacturer_a", right_index=True, how="left")
e = e.merge(brand_sizes, left_on="manufacturer_b", right_index=True, how="left", suffixes=("_a","_b"))

e.rename(columns={"brand_size":"brand_size_a","brand_size_b":"brand_size_b"}, inplace=True)
e[["brand_size_a","brand_size_b"]].describe()


Unnamed: 0,brand_size_a,brand_size_b
count,16.0,16.0
mean,12.25,21.125
std,3.454466,13.544372
min,4.0,7.0
25%,11.75,13.0
50%,13.0,22.0
75%,15.0,22.25
max,16.0,67.0


In [91]:
K_TOP = 10     # сколько лучших совпадений учитывать
TH_CNT = 15.0  # порог "заметной" похожести для подсчёта количества (подстрой потом)

# упорядочим бренды в паре, чтобы A-B == B-A
e["brand_u"] = e[["manufacturer_a","manufacturer_b"]].min(axis=1)
e["brand_v"] = e[["manufacturer_a","manufacturer_b"]].max(axis=1)

# для topK: сортируем внутри каждой пары брендов по w
e_sorted = e.sort_values("w", ascending=False)

agg = (e_sorted.groupby(["brand_u","brand_v"], as_index=False)
       .agg(
           edges=("w","size"),
           max_w=("w","max"),
           mean_w=("w","mean"),
           # топ-K средняя
           topk_mean=("w", lambda s: float(np.mean(s.head(K_TOP)))),
           # сколько выше порога
           cnt_15=("w", lambda s: int((s >= TH_CNT).sum())),
           # бренды и размеры (берём первые попавшиеся — одинаковые внутри группы)
           brand_size_u=("brand_size_a","first"),
           brand_size_v=("brand_size_b","first"),
       ))

# нормализация: "сколько сильных ребер относительно размера брендов"
# 1) нормализованный count: cnt_15 / sqrt(|A|*|B|)
agg["cnt_norm"] = agg["cnt_15"] / np.sqrt(agg["brand_size_u"] * agg["brand_size_v"])

# 2) нормализованный "mass" по topk_mean: topk_mean * log(1+cnt_15) / sqrt(|A|*|B|)
agg["score_norm"] = (agg["topk_mean"] * np.log1p(agg["cnt_15"])) / np.sqrt(agg["brand_size_u"] * agg["brand_size_v"])

agg = agg.sort_values(["score_norm","max_w","edges"], ascending=False)

display(agg.head(30))
print("Brand pairs:", len(agg))


Unnamed: 0,brand_u,brand_v,edges,max_w,mean_w,topk_mean,cnt_15,brand_size_u,brand_size_v,cnt_norm,score_norm
9,Галицкий и Галицкий,Усадьба Дивноморское,1,67.24,67.24,67.24,1,13,24,0.056614,2.638614
2,Cellar Master,Шато Тамань,1,48.04,48.04,48.04,1,7,23,0.078811,2.624312
8,Галицкий и Галицкий,Николаев и сыновья,1,49.77,49.77,49.77,1,15,13,0.071611,2.470448
11,Инкерман,Николаев и сыновья,4,22.36,18.925,18.925,4,15,22,0.220193,1.676691
3,Domaine Lipko,Галицкий и Галицкий,1,10.83,10.83,10.83,0,12,13,0.0,0.0
7,Галицкий и Галицкий,Имение Сикоры,2,10.21,8.68,8.68,0,13,13,0.0,0.0
10,Имение Сикоры,Усадьба Перовских,1,7.39,7.39,7.39,0,13,28,0.0,0.0
6,Uppa Winery,Фанагория,1,7.34,7.34,7.34,0,11,67,0.0,0.0
4,Domaine Lipko,Николаев и сыновья,1,7.25,7.25,7.25,0,12,15,0.0,0.0
1,Cellar Master,Инкерман,1,7.08,7.08,7.08,0,7,22,0.0,0.0


Brand pairs: 12


In [92]:
TOP_EX = 3

# подготовим для join
e_sorted_small = e_sorted[["brand_u","brand_v","w","node1","node2","manufacturer_a","manufacturer_b"]].copy()

top_examples = (e_sorted_small.groupby(["brand_u","brand_v"], as_index=False)
                .head(TOP_EX)
                .sort_values(["brand_u","brand_v","w"], ascending=[True, True, False]))

# склеим в одну строку
def pack_examples(df_):
    return " || ".join([f"{r.node1}-{r.node2} ({r.w:.1f})" for r in df_.itertuples(index=False)])

ex_pack = (top_examples.groupby(["brand_u","brand_v"])
           .apply(pack_examples)
           .rename("top_examples")
           .reset_index())

agg2 = agg.merge(ex_pack, on=["brand_u","brand_v"], how="left")
display(agg2.head(20))


  .apply(pack_examples)


Unnamed: 0,brand_u,brand_v,edges,max_w,mean_w,topk_mean,cnt_15,brand_size_u,brand_size_v,cnt_norm,score_norm,top_examples
0,Галицкий и Галицкий,Усадьба Дивноморское,1,67.24,67.24,67.24,1,13,24,0.056614,2.638614,IMG1084-IMG511 (67.2)
1,Cellar Master,Шато Тамань,1,48.04,48.04,48.04,1,7,23,0.078811,2.624312,IMG415-IMG793 (48.0)
2,Галицкий и Галицкий,Николаев и сыновья,1,49.77,49.77,49.77,1,15,13,0.071611,2.470448,IMG473-IMG984 (49.8)
3,Инкерман,Николаев и сыновья,4,22.36,18.925,18.925,4,15,22,0.220193,1.676691,IMG475-IMG79 (22.4) || IMG637-IMG83 (19.5) || ...
4,Domaine Lipko,Галицкий и Галицкий,1,10.83,10.83,10.83,0,12,13,0.0,0.0,IMG448-IMG463 (10.8)
5,Галицкий и Галицкий,Имение Сикоры,2,10.21,8.68,8.68,0,13,13,0.0,0.0,IMG463-IMG581 (10.2) || IMG463-IMG598 (7.2)
6,Имение Сикоры,Усадьба Перовских,1,7.39,7.39,7.39,0,13,28,0.0,0.0,IMG598-IMG634 (7.4)
7,Uppa Winery,Фанагория,1,7.34,7.34,7.34,0,11,67,0.0,0.0,IMG272-IMG750 (7.3)
8,Domaine Lipko,Николаев и сыновья,1,7.25,7.25,7.25,0,12,15,0.0,0.0,IMG450-IMG637 (7.2)
9,Cellar Master,Инкерман,1,7.08,7.08,7.08,0,7,22,0.0,0.0,IMG350-IMG90 (7.1)


In [93]:
import networkx as nx
from pyvis.network import Network

# Сколько ребер рисуем (иначе будет паутина)
TOP_BRAND_EDGES = 60

# берём топ по нормализованному скору
gdf = agg2.head(TOP_BRAND_EDGES).copy()

Gb = nx.Graph()
for r in gdf.itertuples(index=False):
    Gb.add_edge(
        r.brand_u, r.brand_v,
        score=float(r.score_norm),
        topk_mean=float(r.topk_mean),
        max_w=float(r.max_w),
        cnt_15=int(r.cnt_15),
        edges=int(r.edges),
        examples=str(r.top_examples) if pd.notna(r.top_examples) else ""
    )

print("Brand graph:", Gb.number_of_nodes(), "nodes,", Gb.number_of_edges(), "edges")


Brand graph: 14 nodes, 12 edges


In [94]:
# нормируем толщину ребер по score
scores = [d["score"] for _,_,d in Gb.edges(data=True)]
smin, smax = (min(scores), max(scores)) if scores else (0,1)

def edge_w(s, min_w=1.0, max_w=12.0):
    t = (s - smin) / max(1e-9, (smax - smin))
    t = max(0.0, min(1.0, t))
    return min_w + t*(max_w-min_w)

net = Network(height="880px", width="100%", bgcolor="#ffffff", font_color="#111", notebook=True)
net.force_atlas_2based(gravity=-35, central_gravity=0.01, spring_length=220, spring_strength=0.04, damping=0.6)

# узлы: размер = размер бренда
brand_size_map = brand_sizes["brand_size"].to_dict()
for b in Gb.nodes():
    sz = brand_size_map.get(b, 1)
    title = f"<b>{b}</b><br>items: {sz}"
    net.add_node(b, label=b, title=title, size=8 + 2*min(sz, 20))

# ребра: толщина по score_norm, tooltip с деталями
for u, v, d in Gb.edges(data=True):
    title = (
        f"<b>{u} ↔ {v}</b><br>"
        f"score_norm: {d['score']:.4f}<br>"
        f"topk_mean: {d['topk_mean']:.2f}<br>"
        f"max_w: {d['max_w']:.2f}<br>"
        f"cnt_15: {d['cnt_15']}<br>"
        f"edges_total: {d['edges']}<br>"
        f"examples: {d['examples']}"
    )
    net.add_edge(u, v, title=title, width=edge_w(d["score"]), value=d["score"])

net.show_buttons(filter_=["physics", "interaction", "layout"])

out_html = "/content/brand_graph.html"
net.save_graph(out_html)
print("Saved:", out_html)


Saved: /content/brand_graph.html


In [95]:
from google.colab import files
files.download(out_html)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [96]:
out_csv = "/content/brand_graph_edges.csv"
agg2.to_csv(out_csv, index=False, encoding="utf-8-sig")
print("Saved:", out_csv, "rows:", len(agg2))
files.download(out_csv)


Saved: /content/brand_graph_edges.csv rows: 12


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [97]:
!pip -q install pillow requests

import os, re, html, requests, shutil
from pathlib import Path
from PIL import Image
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

def safe_filename(s: str, max_len=140):
    s = str(s)
    s = re.sub(r"[\\/:*?\"<>|]+", "_", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s[:max_len].rstrip()

def make_thumb(src_path, dst_path, max_side=900, quality=80):
    dst_path = Path(dst_path)
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    if dst_path.exists():
        return str(dst_path)
    im = Image.open(src_path).convert("RGB")
    im.thumbnail((max_side, max_side))
    im.save(dst_path, format="JPEG", quality=quality, optimize=True)
    return str(dst_path)

def ydisk_public_download(public_key: str, rel_path: str, out_dir="/content/cache", timeout=60):
    rel_path = str(rel_path)
    if not rel_path.startswith("/"):
        rel_path = "/" + rel_path

    local_rel = rel_path.lstrip("/")
    out_path = os.path.join(out_dir, local_rel)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
        return out_path

    api = "https://cloud-api.yandex.net/v1/disk/public/resources/download"
    r = requests.get(api, params={"public_key": public_key, "path": rel_path}, timeout=timeout)
    r.raise_for_status()
    href = r.json()["href"]

    with requests.get(href, stream=True, timeout=180) as resp:
        resp.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
    return out_path


In [98]:
meta_cols = ["image_id","manufacturer","name","color","sparkling","sugar","image"]
m = df[meta_cols].copy().set_index("image_id")

e = und.copy()
e = e.merge(m, left_on="node1", right_index=True, how="left")
e = e.merge(m, left_on="node2", right_index=True, how="left", suffixes=("_a","_b"))
e = e[e["manufacturer_a"] != e["manufacturer_b"]].copy()

print("Cross-brand bottle edges:", len(e))


Cross-brand bottle edges: 16


In [99]:
# Настройки отчёта
TOP_BRAND_EDGES = 40   # сколько связей брендов показать
TOP_PAIRS_PER_EDGE = 8 # сколько пар бутылок на каждую связь

# берём топ ребер брендов
brand_edges = agg2.sort_values("score_norm", ascending=False).head(TOP_BRAND_EDGES)[
    ["brand_u","brand_v","score_norm","topk_mean","max_w","cnt_15","edges"]
].copy()

print("Brand edges in report:", len(brand_edges))
display(brand_edges.head(10))


Brand edges in report: 12


Unnamed: 0,brand_u,brand_v,score_norm,topk_mean,max_w,cnt_15,edges
0,Галицкий и Галицкий,Усадьба Дивноморское,2.638614,67.24,67.24,1,1
1,Cellar Master,Шато Тамань,2.624312,48.04,48.04,1,1
2,Галицкий и Галицкий,Николаев и сыновья,2.470448,49.77,49.77,1,1
3,Инкерман,Николаев и сыновья,1.676691,18.925,22.36,4,4
4,Domaine Lipko,Галицкий и Галицкий,0.0,10.83,10.83,0,1
5,Галицкий и Галицкий,Имение Сикоры,0.0,8.68,10.21,0,2
6,Имение Сикоры,Усадьба Перовских,0.0,7.39,7.39,0,1
7,Uppa Winery,Фанагория,0.0,7.34,7.34,0,1
8,Domaine Lipko,Николаев и сыновья,0.0,7.25,7.25,0,1
9,Cellar Master,Инкерман,0.0,7.08,7.08,0,1


In [100]:
# подготовим быстрый выбор
e2 = e[["node1","node2","w","manufacturer_a","manufacturer_b","image_a","image_b",
        "name_a","name_b","color_a","color_b","sparkling_a","sparkling_b","sugar_a","sugar_b"]].copy()

# чтобы соответствовать brand_u/brand_v (отсортированные)
e2["brand_u"] = e2[["manufacturer_a","manufacturer_b"]].min(axis=1)
e2["brand_v"] = e2[["manufacturer_a","manufacturer_b"]].max(axis=1)

# достаём top пары для каждого brand-edge
pairs_rows = []
for r in brand_edges.itertuples(index=False):
    bu, bv = r.brand_u, r.brand_v
    sub = e2[(e2["brand_u"]==bu) & (e2["brand_v"]==bv)].sort_values("w", ascending=False).head(TOP_PAIRS_PER_EDGE)
    for rr in sub.itertuples(index=False):
        pairs_rows.append({
            "brand_u": bu, "brand_v": bv,
            "w": float(rr.w),
            "image_id_a": rr.node1, "image_id_b": rr.node2,
            "manufacturer_a": rr.manufacturer_a, "manufacturer_b": rr.manufacturer_b,
            "path_image_a": rr.image_a, "path_image_b": rr.image_b,
            "name_a": rr.name_a, "name_b": rr.name_b,
            "color_a": rr.color_a, "color_b": rr.color_b,
            "sparkling_a": rr.sparkling_a, "sparkling_b": rr.sparkling_b,
            "sugar_a": rr.sugar_a, "sugar_b": rr.sugar_b,
        })

pairs_df = pd.DataFrame(pairs_rows)
print("Pairs in report:", len(pairs_df))
display(pairs_df.head(10))


Pairs in report: 16


Unnamed: 0,brand_u,brand_v,w,image_id_a,image_id_b,manufacturer_a,manufacturer_b,path_image_a,path_image_b,name_a,name_b,color_a,color_b,sparkling_a,sparkling_b,sugar_a,sugar_b
0,Галицкий и Галицкий,Усадьба Дивноморское,67.24,IMG1084,IMG511,Галицкий и Галицкий,Усадьба Дивноморское,1070-1096/галицкий балет блан магнум (1).jpg,457-520/усадьба дивноморское солист 2.jpg,Ballet Blanc Красная горка,Солист,Белое,Розовое,Тихое,Тихое,Сухое,Сухое
1,Cellar Master,Шато Тамань,48.04,IMG415,IMG793,Cellar Master,Шато Тамань,385-456/целар мастерс мальвазия 2.jpg,727-798/шато тамань деликат розе 2.jpg,Мальвазия село Яркое,Delicate,Белое,Розовое,Тихое,Тихое,Сухое,Полусухое
2,Галицкий и Галицкий,Николаев и сыновья,49.77,IMG473,IMG984,Николаев и сыновья,Галицкий и Галицкий,457-520/николаев и сыновья рислинг 2.jpg,979-1009/галицкий казак магнум (1).jpg,Riesling,Cosaque,Белое,Красное,Тихое,Тихое,Сухое,Сухое
3,Инкерман,Николаев и сыновья,22.36,IMG475,IMG79,Николаев и сыновья,Инкерман,457-520/николаев и сыновья СБ (2).jpg,77-95/inkerman kokur blan 2.jpg,Sauvignon Blanc,Winemaker's selection Кокур,Белое,Белое,Тихое,Тихое,Сухое,Сухое
4,Инкерман,Николаев и сыновья,19.5,IMG637,IMG83,Николаев и сыновья,Инкерман,631-683/николаев и сыновья пти мансан 2.jpg,77-95/inkerman пино нуар (2).jpg,Пти Мансан,Winemaker's selection Пино Нуар,Белое,Красное,Тихое,Тихое,Сладкое,Полусухое
5,Инкерман,Николаев и сыновья,17.55,IMG475,IMG81,Николаев и сыновья,Инкерман,457-520/николаев и сыновья СБ (2).jpg,77-95/inkerman шардоне (2).jpg,Sauvignon Blanc,Winemaker's selection Шардоне,Белое,Белое,Тихое,Тихое,Сухое,Сухое
6,Инкерман,Николаев и сыновья,16.29,IMG637,IMG81,Николаев и сыновья,Инкерман,631-683/николаев и сыновья пти мансан 2.jpg,77-95/inkerman шардоне (2).jpg,Пти Мансан,Winemaker's selection Шардоне,Белое,Белое,Тихое,Тихое,Сладкое,Сухое
7,Domaine Lipko,Галицкий и Галицкий,10.83,IMG448,IMG463,Domaine Lipko,Галицкий и Галицкий,385-456/липко белый бленд 2.jpg,457-520/галицкий рислинг 2.jpg,White Blend 2021,Рислинг Красная горка,Белое,Белое,Тихое,Тихое,Полусухое,Сухое
8,Галицкий и Галицкий,Имение Сикоры,10.21,IMG463,IMG581,Галицкий и Галицкий,Имение Сикоры,457-520/галицкий рислинг 2.jpg,575-630/сикоры семейный резерв рислинг 2.jpg,Рислинг Красная горка,Рислинг,Белое,Белое,Тихое,Тихое,Сухое,Сухое
9,Галицкий и Галицкий,Имение Сикоры,7.15,IMG463,IMG598,Галицкий и Галицкий,Имение Сикоры,457-520/галицкий рислинг 2.jpg,575-630/сикоры рислинг 2.jpg,Рислинг Красная горка,Рислинг,Белое,Белое,Тихое,Тихое,Сухое,Сухое


In [101]:
OUT_DIR = Path("/content/brand_graph_gallery")
IMG_DIR = OUT_DIR / "img"
IMG_DIR.mkdir(parents=True, exist_ok=True)

# уникальные относительные пути
uniq_paths = sorted(set(
    [p for p in pairs_df["path_image_a"].dropna().tolist() if isinstance(p,str) and p.strip()] +
    [p for p in pairs_df["path_image_b"].dropna().tolist() if isinstance(p,str) and p.strip()]
))
print("Unique images to fetch:", len(uniq_paths))

local_map = {}
failed = []

def task(rel):
    try:
        loc = ydisk_public_download(PUBLIC_KEY, rel)
        return rel, loc, None
    except Exception as e:
        return rel, None, str(e)

with ThreadPoolExecutor(max_workers=12) as ex:
    futs = [ex.submit(task, rel) for rel in uniq_paths]
    done = 0
    for fut in as_completed(futs):
        rel, loc, err = fut.result()
        done += 1
        if err:
            failed.append((rel, err))
        else:
            local_map[rel] = loc
        if done % 25 == 0 or done == len(uniq_paths):
            print(f"Downloaded {done}/{len(uniq_paths)}")

thumb_map = {}
for i, rel in enumerate(uniq_paths, start=1):
    if rel not in local_map:
        continue
    fname = safe_filename(Path(rel).name)
    thumb_name = f"{i:05d}_{fname}.jpg"
    thumb_path = IMG_DIR / thumb_name
    make_thumb(local_map[rel], thumb_path, max_side=900, quality=80)
    thumb_map[rel] = f"img/{thumb_name}"

print("Thumbs ready:", len(thumb_map), "Failed:", len(failed))


Unique images to fetch: 24
Downloaded 24/24
Thumbs ready: 24 Failed: 0


In [102]:
def nice_sku(name, color, sparkling, sugar):
    parts = [name, color, sparkling, sugar]
    parts = [str(x) for x in parts if x is not None and str(x) != "nan" and str(x).strip() != ""]
    return " | ".join(parts)

parts = []
parts.append("<!doctype html><html><head><meta charset='utf-8'>")
parts.append("<meta name='viewport' content='width=device-width, initial-scale=1'>")
parts.append("""
<style>
body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial;margin:18px;color:#111;}
h1{font-size:20px;margin:0 0 8px;}
.meta{color:#555;font-size:13px;margin-bottom:14px;}
.toc{display:flex;flex-wrap:wrap;gap:8px;margin:10px 0 18px;}
.chip{display:inline-block;padding:6px 10px;border-radius:999px;background:#f3f3f3;
      font-size:12px;text-decoration:none;color:#111;border:1px solid #e6e6e6;}
.section{border:1px solid #e6e6e6;border-radius:14px;padding:14px;margin:14px 0;}
.hdr{display:flex;justify-content:space-between;gap:10px;flex-wrap:wrap;align-items:baseline;}
.badges{display:flex;gap:8px;flex-wrap:wrap}
.badge{display:inline-block;padding:3px 8px;border-radius:999px;font-size:12px;background:#f7f7f7;border:1px solid #eee;}
.pair{border:1px solid #eee;border-radius:14px;padding:12px;margin:12px 0;background:#fff;}
.grid2{display:grid;grid-template-columns:1fr 1fr;gap:12px;}
.card{border:1px solid #eee;border-radius:12px;padding:10px;background:#fff;}
img{max-width:100%;border:1px solid #ddd;border-radius:10px;background:#fafafa;}
.cap{font-size:12px;line-height:1.25;margin:8px 0 0;}
.small{font-size:11px;color:#666;word-break:break-word;margin-top:6px;}
@media (max-width: 900px){ .grid2{grid-template-columns:1fr;} }
</style>
</head><body>
""")

parts.append("<h1>Brand-level similarity — evidence gallery</h1>")
parts.append(f"<div class='meta'>brand edges: {len(brand_edges)} • pairs shown: {len(pairs_df)} • failed images: {len(failed)}</div>")

# оглавление
parts.append("<div class='toc'>")
for r in brand_edges.itertuples(index=False):
    key = safe_filename(f"{r.brand_u}__{r.brand_v}")
    parts.append(f"<a class='chip' href='#{key}'>{html.escape(r.brand_u)} ↔ {html.escape(r.brand_v)}</a>")
parts.append("</div>")

for r in brand_edges.itertuples(index=False):
    bu, bv = r.brand_u, r.brand_v
    key = safe_filename(f"{bu}__{bv}")
    parts.append(f"<div class='section' id='{key}'>")
    parts.append("<div class='hdr'>")
    parts.append(f"<div><b>{html.escape(bu)}</b> ↔ <b>{html.escape(bv)}</b></div>")
    parts.append("<div class='badges'>")
    parts.append(f"<span class='badge'>score_norm: {r.score_norm:.4f}</span>")
    parts.append(f"<span class='badge'>topk_mean: {r.topk_mean:.2f}</span>")
    parts.append(f"<span class='badge'>max_w: {r.max_w:.2f}</span>")
    parts.append(f"<span class='badge'>cnt_15: {int(r.cnt_15)}</span>")
    parts.append(f"<span class='badge'>edges_total: {int(r.edges)}</span>")
    parts.append("</div></div>")

    sub = pairs_df[(pairs_df["brand_u"]==bu) & (pairs_df["brand_v"]==bv)].sort_values("w", ascending=False)
    for p in sub.itertuples(index=False):
        parts.append("<div class='pair'>")
        parts.append(f"<div class='meta'><b>match:</b> {p.w:.2f}</div>")
        parts.append("<div class='grid2'>")

        # A
        img_a = thumb_map.get(p.path_image_a)
        sku_a = nice_sku(p.name_a, p.color_a, p.sparkling_a, p.sugar_a)
        parts.append("<div class='card'>")
        if img_a:
            parts.append(f"<img src='{html.escape(img_a)}'>")
        parts.append(f"<div class='cap'><b>{html.escape(str(p.manufacturer_a))}</b></div>")
        parts.append(f"<div class='cap'>{html.escape(sku_a)}</div>")
        parts.append(f"<div class='small'>id: {html.escape(str(p.image_id_a))}</div>")
        parts.append(f"<div class='small'>path: {html.escape(str(p.path_image_a))}</div>")
        parts.append("</div>")

        # B
        img_b = thumb_map.get(p.path_image_b)
        sku_b = nice_sku(p.name_b, p.color_b, p.sparkling_b, p.sugar_b)
        parts.append("<div class='card'>")
        if img_b:
            parts.append(f"<img src='{html.escape(img_b)}'>")
        parts.append(f"<div class='cap'><b>{html.escape(str(p.manufacturer_b))}</b></div>")
        parts.append(f"<div class='cap'>{html.escape(sku_b)}</div>")
        parts.append(f"<div class='small'>id: {html.escape(str(p.image_id_b))}</div>")
        parts.append(f"<div class='small'>path: {html.escape(str(p.path_image_b))}</div>")
        parts.append("</div>")

        parts.append("</div></div>")  # grid2, pair

    parts.append("</div>")  # section

parts.append("</body></html>")

OUT_DIR.mkdir(parents=True, exist_ok=True)
html_path = OUT_DIR / "index.html"
html_path.write_text("\n".join(parts), encoding="utf-8")
print("Saved HTML:", str(html_path))


Saved HTML: /content/brand_graph_gallery/index.html


In [103]:
zip_path = shutil.make_archive("/content/brand_graph_gallery", "zip", str(OUT_DIR))
from google.colab import files
files.download(zip_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [104]:
!pip -q install pillow requests opencv-python

import os, re, math, json, shutil, requests
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
from PIL import Image
from concurrent.futures import ThreadPoolExecutor, as_completed

# ====== 1) Yandex Disk download (public) ======
def ydisk_public_download(public_key: str, rel_path: str, out_dir="/content/cache", timeout=60):
    rel_path = str(rel_path)
    if not rel_path.startswith("/"):
        rel_path = "/" + rel_path

    local_rel = rel_path.lstrip("/")
    out_path = os.path.join(out_dir, local_rel)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
        return out_path

    api = "https://cloud-api.yandex.net/v1/disk/public/resources/download"
    r = requests.get(api, params={"public_key": public_key, "path": rel_path}, timeout=timeout)
    r.raise_for_status()
    href = r.json()["href"]

    with requests.get(href, stream=True, timeout=180) as resp:
        resp.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
    return out_path

# ====== 2) Heuristic: which rows are label images? ======
def is_label_row(row) -> bool:
    """
    Мы пытаемся выбрать именно этикетки.
    Логика:
    - если есть file_label_name и image совпадает с ним → этикетка
    - или file_label_name содержит " 2" / "(2)" / "_2" и т.п.
    - или image (относительный путь) заканчивается на "... 2.jpg" / "(2).jpg" / "_2.jpg"
    """
    img = str(row.get("image", "") or "")
    fln = str(row.get("file_label_name", "") or "")

    name = os.path.basename(img).lower()
    fln_l = fln.lower()

    # 1) прямое совпадение с file_label_name
    if fln and os.path.basename(img) == fln:
        return True

    # 2) частые маркеры "этикетка = вариант 2"
    pats = [
        r"\s2\.(jpg|jpeg|png)$",
        r"\(2\)\.(jpg|jpeg|png)$",
        r"_2\.(jpg|jpeg|png)$",
        r" 2\.(jpg|jpeg|png)$",
        r" 2\)$",  # на всякий
    ]
    for p in pats:
        if re.search(p, name):
            return True
        if fln and re.search(p, fln_l):
            return True

    # 3) если в file_label_name явно есть "2", а image похож на него
    if fln and (" 2." in fln_l or "(2)." in fln_l or "_2." in fln_l):
        return True

    return False

# ====== 3) Visual feature extractor ======
def compute_features(image_path: str, resize_max=900):
    """
    Возвращает dict фич.
    Работает без OCR, чисто по пикселям.
    """
    # читаем через OpenCV (быстрее)
    bgr = cv2.imdecode(np.fromfile(image_path, dtype=np.uint8), cv2.IMREAD_COLOR)
    if bgr is None:
        raise ValueError("cv2 failed to read image")

    h0, w0 = bgr.shape[:2]
    scale = 1.0
    mx = max(h0, w0)
    if mx > resize_max:
        scale = resize_max / mx
        bgr = cv2.resize(bgr, (int(w0*scale), int(h0*scale)), interpolation=cv2.INTER_AREA)

    h, w = bgr.shape[:2]

    # HSV + Gray
    hsv = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV)
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    # brightness
    brightness_mean = float(gray.mean())
    contrast_std = float(gray.std())

    # white/black share (пороги можно тюнить)
    white_share = float((gray >= 235).mean())
    black_share = float((gray <= 30).mean())

    # saturation / colorfulness proxy
    sat = hsv[:,:,1].astype(np.float32) / 255.0
    sat_mean = float(sat.mean())
    colorful_share = float((sat >= 0.25).mean())  # >=25% насыщенности

    # hue mode bin (0..179)
    hue = hsv[:,:,0].astype(np.int32)
    # учитываем hue только там где есть цвет, иначе фон зашумит
    mask_color = (sat >= 0.15)
    if mask_color.sum() > 50:
        hue_vals = hue[mask_color]
        hist = np.bincount(hue_vals, minlength=180)
        hue_mode = int(hist.argmax())
    else:
        hue_mode = -1
    # грубая бинировка на 12 корзин
    hue_mode_bin = int(hue_mode // 15) if hue_mode >= 0 else -1

    # edge density (Canny)
    # авто-подбор порогов по медиане яркости
    med = np.median(gray)
    lower = int(max(0, 0.66 * med))
    upper = int(min(255, 1.33 * med))
    edges = cv2.Canny(gray, lower, upper)
    edge_density = float((edges > 0).mean())

    # entropy of grayscale histogram
    hist_g = cv2.calcHist([gray],[0],None,[256],[0,256]).ravel().astype(np.float64)
    p = hist_g / max(1.0, hist_g.sum())
    p = p[p > 0]
    entropy = float(-(p * np.log2(p)).sum())

    return {
        "img_w": w0, "img_h": h0, "scale_used": scale,
        "brightness_mean": brightness_mean,
        "contrast_std": contrast_std,
        "white_share": white_share,
        "black_share": black_share,
        "sat_mean": sat_mean,
        "colorful_share": colorful_share,
        "hue_mode": hue_mode,
        "hue_mode_bin": hue_mode_bin,
        "edge_density": edge_density,
        "entropy": entropy,
    }

# ====== 4) Build dataset: select label rows ======
# ВАЖНО: тут я предполагаю, что df уже загружен
label_mask = df.apply(is_label_row, axis=1)
labels_df = df[label_mask].copy()

print("All rows:", len(df))
print("Label-like rows:", len(labels_df))
print(labels_df[["image_id","manufacturer","image","file_label_name","file_bottle_name"]].head(5))


All rows: 1070
Label-like rows: 1070
  image_id manufacturer                                             image  \
0     IMG1    Массандра    1-41/Массандра_мускат белый южнобережный 2.jpg   
1     IMG2    Массандра  1-41/Массандра портвейн белый южнобережный 2.jpg   
2     IMG3    Массандра             1-41/Массандра портвейн красный 2.jpg   
3     IMG4    Массандра     1-41/Массандра портвейн красный ливадия 2.jpg   
4     IMG5    Массандра  1-41/Массандра седьмое небо князя голицына 2.jpg   

                               file_label_name  \
0    Массандра_мускат белый южнобережный 2.jpg   
1  Массандра портвейн белый южнобережный 2.jpg   
2             Массандра портвейн красный 2.jpg   
3     Массандра портвейн красный ливадия 2.jpg   
4  Массандра седьмое небо князя голицына 2.jpg   

                              file_bottle_name  
0      Массандра_Мускат_белый_южнобережный.jpg  
1  Массандра портвейн белый южнобережный 1.jpg  
2             Массандра портвейн красный 1.jpg  
3  

In [105]:
PUBLIC_KEY = PUBLIC_KEY  # должен быть задан

OUT_CACHE = "/content/cache"
MAX_WORKERS = 16

# что именно скачиваем: относительный путь лежит в колонке image
paths = labels_df["image"].dropna().astype(str).tolist()
paths = [p.strip() for p in paths if p.strip()]
paths_uniq = sorted(set(paths))

print("Unique label images to fetch:", len(paths_uniq))

local_map = {}
failed = []

def fetch_one(rel):
    try:
        loc = ydisk_public_download(PUBLIC_KEY, rel, out_dir=OUT_CACHE)
        return rel, loc, None
    except Exception as e:
        return rel, None, str(e)

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    futs = [ex.submit(fetch_one, rel) for rel in paths_uniq]
    done = 0
    for fut in as_completed(futs):
        rel, loc, err = fut.result()
        done += 1
        if err:
            failed.append((rel, err))
        else:
            local_map[rel] = loc
        if done % 50 == 0 or done == len(paths_uniq):
            print(f"Downloaded {done}/{len(paths_uniq)}")

print("Downloaded ok:", len(local_map), "failed:", len(failed))

# считаем фичи
feat_rows = []
failed_feat = []

for i, row in labels_df.iterrows():
    rel = str(row["image"])
    loc = local_map.get(rel)
    if not loc:
        failed_feat.append((row["image_id"], rel, "no_local_file"))
        continue
    try:
        feats = compute_features(loc, resize_max=900)
        out = {
            "image_id": row["image_id"],
            "manufacturer": row.get("manufacturer"),
            "name": row.get("name"),
            "color": row.get("color"),
            "sparkling": row.get("sparkling"),
            "sugar": row.get("sugar"),
            "origin": row.get("origin"),
            "vintage_year": row.get("vintage_year"),
            "folder_name": row.get("folder_name"),
            "image_path_rel": rel,
            "file_label_name": row.get("file_label_name"),
        }
        out.update(feats)
        feat_rows.append(out)
    except Exception as e:
        failed_feat.append((row["image_id"], rel, str(e)))

features_df = pd.DataFrame(feat_rows)
print("Features computed:", len(features_df), "failed_feat:", len(failed_feat))

out_csv = "/content/label_visual_features.csv"
features_df.to_csv(out_csv, index=False, encoding="utf-8-sig")
print("Saved:", out_csv)


Unique label images to fetch: 1067
Downloaded 50/1067
Downloaded 100/1067
Downloaded 150/1067
Downloaded 200/1067
Downloaded 250/1067
Downloaded 300/1067
Downloaded 350/1067
Downloaded 400/1067
Downloaded 450/1067
Downloaded 500/1067
Downloaded 550/1067
Downloaded 600/1067
Downloaded 650/1067
Downloaded 700/1067
Downloaded 750/1067
Downloaded 800/1067
Downloaded 850/1067
Downloaded 900/1067
Downloaded 950/1067
Downloaded 1000/1067
Downloaded 1050/1067
Downloaded 1067/1067
Downloaded ok: 1067 failed: 0
Features computed: 1070 failed_feat: 0
Saved: /content/label_visual_features.csv


In [106]:
from scipy.stats import spearmanr, kruskal, chi2_contingency

num_cols = [
    "brightness_mean","white_share","black_share","contrast_std",
    "sat_mean","colorful_share","edge_density","entropy"
]

cat_cols = ["color","sparkling","sugar","origin","manufacturer"]

# 1) Spearman между числовыми
spearman_pairs = []
for i in range(len(num_cols)):
    for j in range(i+1, len(num_cols)):
        a = features_df[num_cols[i]]
        b = features_df[num_cols[j]]
        ok = a.notna() & b.notna()
        if ok.sum() < 30:
            continue
        rho, p = spearmanr(a[ok], b[ok])
        spearman_pairs.append((num_cols[i], num_cols[j], float(rho), float(p), int(ok.sum())))
sp = pd.DataFrame(spearman_pairs, columns=["x","y","rho","p","n"]).sort_values("rho", ascending=False)
display(sp.head(20))

# 2) Категория -> числовые: Kruskal (не требует нормальности)
cat_num = []
for cat in cat_cols:
    if cat not in features_df.columns:
        continue
    groups = features_df[cat].dropna().unique().tolist()
    if len(groups) < 2:
        continue
    for num in num_cols:
        vals = []
        for g in groups:
            v = features_df.loc[features_df[cat]==g, num].dropna()
            if len(v) >= 10:
                vals.append(v.values)
        if len(vals) < 2:
            continue
        stat, p = kruskal(*vals)
        cat_num.append((cat, num, float(stat), float(p)))
catnum = pd.DataFrame(cat_num, columns=["cat","num","stat","p"]).sort_values("p")
display(catnum.head(30))

# 3) Категория -> категория: делаем биннинг "цветная/нецветная", "светлая/тёмная"
features_df["is_bright"] = (features_df["brightness_mean"] >= features_df["brightness_mean"].median()).astype(int)
features_df["is_colorful"] = (features_df["colorful_share"] >= features_df["colorful_share"].median()).astype(int)

cat_cat = []
for cat in cat_cols:
    if cat not in features_df.columns:
        continue
    for bincol in ["is_bright","is_colorful"]:
        tab = pd.crosstab(features_df[cat], features_df[bincol])
        if tab.shape[0] < 2 or tab.shape[1] < 2:
            continue
        chi2, p, dof, exp = chi2_contingency(tab)
        cat_cat.append((cat, bincol, float(chi2), float(p)))
cc = pd.DataFrame(cat_cat, columns=["cat","bin","chi2","p"]).sort_values("p")
display(cc.head(30))


Unnamed: 0,x,y,rho,p,n
22,sat_mean,colorful_share,0.955692,0.0,1070
13,black_share,contrast_std,0.740379,1.900275e-186,1070
0,brightness_mean,white_share,0.542113,8.944479e-83,1070
24,sat_mean,entropy,0.414575,1.076511e-45,1070
27,edge_density,entropy,0.4049,1.7854010000000002e-43,1070
26,colorful_share,entropy,0.397446,8.182007e-42,1070
16,black_share,edge_density,0.316163,2.8853059999999997e-26,1070
17,black_share,entropy,0.219078,4.291369e-13,1070
21,contrast_std,entropy,0.161868,1.016302e-07,1070
20,contrast_std,edge_density,0.142509,2.868122e-06,1070


Unnamed: 0,cat,num,stat,p
4,color,sat_mean,288.627832,3.073052e-61
5,color,colorful_share,263.997579,6.273382e-56
33,manufacturer,white_share,300.049614,1.463509e-41
32,manufacturer,brightness_mean,264.109855,8.420725e-35
38,manufacturer,edge_density,262.409625,1.7447029999999999e-34
39,manufacturer,entropy,244.298259,3.88863e-31
34,manufacturer,black_share,220.549745,8.153658e-27
2,color,black_share,111.804916,2.9991770000000004e-23
3,color,contrast_std,110.789034,4.939738e-23
37,manufacturer,colorful_share,196.442626,1.591952e-22


Unnamed: 0,cat,bin,chi2,p
8,manufacturer,is_bright,395.086301,2.4090429999999998e-34
1,color,is_colorful,160.937998,6.246468000000001e-33
9,manufacturer,is_colorful,243.508197,1.845619e-12
6,origin,is_bright,150.52272,1.619396e-11
2,sparkling,is_bright,42.064662,7.341331e-10
5,sugar,is_colorful,43.773919,2.574345e-08
3,sparkling,is_colorful,31.330929,1.572444e-07
7,origin,is_colorful,121.680189,1.635707e-07
4,sugar,is_bright,30.291205,1.292428e-05
0,color,is_bright,27.810708,3.963478e-05


In [107]:
import pandas as pd
import numpy as np

feat_cols = [
    "brightness_mean","white_share","black_share","contrast_std",
    "sat_mean","colorful_share","edge_density","entropy"
]

X = features_df[feat_cols].copy()

# простая обработка пропусков: медианой
X = X.fillna(X.median(numeric_only=True))

print("X shape:", X.shape)


X shape: (1070, 8)


In [108]:
y_raw = features_df["sparkling"].astype(str).str.lower()

# подстрой под твои реальные значения, если отличаются
y = y_raw.map(lambda s: 1 if "игрист" in s else (0 if s and s!="nan" else np.nan))

mask = y.notna()
Xb = X[mask].copy()
yb = y[mask].astype(int)

print("Binary sparkling rows:", len(yb), "pos rate:", yb.mean())


Binary sparkling rows: 1070 pos rate: 0.17476635514018693


In [109]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(
    Xb, yb, test_size=0.25, random_state=42, stratify=yb
)

clf = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=2000, class_weight="balanced"))
])

clf.fit(X_train, y_train)
pred = clf.predict(X_test)
proba = clf.predict_proba(X_test)[:,1]

print(classification_report(y_test, pred, digits=3))
print("ROC AUC:", roc_auc_score(y_test, proba))

# Коэффициенты (важно: после стандартизации)
coef = clf.named_steps["lr"].coef_[0]
coef_df = pd.DataFrame({"feature": feat_cols, "coef": coef}).sort_values("coef", ascending=False)
display(coef_df)


              precision    recall  f1-score   support

           0      0.936     0.729     0.819       221
           1      0.375     0.766     0.503        47

    accuracy                          0.735       268
   macro avg      0.656     0.747     0.661       268
weighted avg      0.838     0.735     0.764       268

ROC AUC: 0.816212573409069


Unnamed: 0,feature,coef
7,entropy,1.523458
2,black_share,1.426449
5,colorful_share,1.007703
0,brightness_mean,0.680482
1,white_share,0.003602
3,contrast_std,-0.405555
4,sat_mean,-0.95714
6,edge_density,-1.222813


In [110]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=400,
    random_state=42,
    class_weight="balanced",
    min_samples_leaf=5
)
rf.fit(X_train, y_train)
pred = rf.predict(X_test)
proba = rf.predict_proba(X_test)[:,1]

print(classification_report(y_test, pred, digits=3))
print("ROC AUC:", roc_auc_score(y_test, proba))

imp = pd.DataFrame({"feature": feat_cols, "importance": rf.feature_importances_}).sort_values("importance", ascending=False)
display(imp)


              precision    recall  f1-score   support

           0      0.911     0.928     0.919       221
           1      0.628     0.574     0.600        47

    accuracy                          0.866       268
   macro avg      0.770     0.751     0.760       268
weighted avg      0.861     0.866     0.863       268

ROC AUC: 0.8910176181765669


Unnamed: 0,feature,importance
5,colorful_share,0.164572
7,entropy,0.155191
2,black_share,0.15302
6,edge_density,0.140955
1,white_share,0.11182
4,sat_mean,0.106138
3,contrast_std,0.087117
0,brightness_mean,0.081188


In [114]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

feat_cols = [
    "brightness_mean","white_share","black_share","contrast_std",
    "sat_mean","colorful_share","edge_density","entropy"
]

# y: 1 = игристое, 0 = неигристое (как ты делала)
y_raw = features_df["sparkling"].astype(str).str.lower()
y = y_raw.map(lambda s: 1 if "игрист" in s else (0 if s and s!="nan" else np.nan))

mask = y.notna()
X = features_df.loc[mask, feat_cols].copy()
X = X.fillna(X.median(numeric_only=True))
y = y[mask].astype(int)

groups = features_df.loc[mask, "manufacturer"].astype(str)

gkf = GroupKFold(n_splits=5)

aucs, f1s, precs, recs = [], [], [], []
for fold, (tr, te) in enumerate(gkf.split(X, y, groups=groups), start=1):
    rf = RandomForestClassifier(
        n_estimators=500,
        random_state=42,
        class_weight="balanced",
        min_samples_leaf=5
    )
    rf.fit(X.iloc[tr], y.iloc[tr])
    proba = rf.predict_proba(X.iloc[te])[:,1]
    pred = (proba >= 0.5).astype(int)

    aucs.append(roc_auc_score(y.iloc[te], proba))
    f1s.append(f1_score(y.iloc[te], pred))
    precs.append(precision_score(y.iloc[te], pred, zero_division=0))
    recs.append(recall_score(y.iloc[te], pred))

    print(f"fold {fold}: AUC={aucs[-1]:.3f} F1={f1s[-1]:.3f} P={precs[-1]:.3f} R={recs[-1]:.3f} | test n={len(te)}")

print("\nMEAN±STD")
print("AUC:", np.mean(aucs), "±", np.std(aucs))
print("F1 :", np.mean(f1s),  "±", np.std(f1s))
print("P  :", np.mean(precs), "±", np.std(precs))
print("R  :", np.mean(recs),  "±", np.std(recs))


fold 1: AUC=0.864 F1=0.565 P=0.667 R=0.491 | test n=214
fold 2: AUC=0.883 F1=0.517 P=0.577 R=0.469 | test n=214
fold 3: AUC=0.695 F1=0.310 P=0.306 R=0.314 | test n=214
fold 4: AUC=0.842 F1=0.522 P=0.450 R=0.621 | test n=214
fold 5: AUC=0.872 F1=0.485 P=0.571 R=0.421 | test n=214

MEAN±STD
AUC: 0.8311882466197895 ± 0.06922585062766434
F1 : 0.4797811081655075 ± 0.08872683305689952
P  : 0.514114774114774 ± 0.12498758825673134
R  : 0.4630688077545849 ± 0.09953338445494087


In [115]:
# Sparkling visual report (HTML) + FILTERS (no extra math in output)
# Assumes: features_df exists (with visual features + meta), PUBLIC_KEY exists
# Produces: /content/sparkling_visual_report/index.html + images folder, and a ZIP to download.

!pip -q install requests pillow

import os, re, html, json, shutil, requests
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image
from concurrent.futures import ThreadPoolExecutor, as_completed

# ---------- Yandex public downloader ----------
def ydisk_public_download(public_key: str, rel_path: str, out_dir="/content/cache", timeout=60):
    rel_path = str(rel_path)
    if not rel_path.startswith("/"):
        rel_path = "/" + rel_path

    local_rel = rel_path.lstrip("/")
    out_path = os.path.join(out_dir, local_rel)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
        return out_path

    api = "https://cloud-api.yandex.net/v1/disk/public/resources/download"
    r = requests.get(api, params={"public_key": public_key, "path": rel_path}, timeout=timeout)
    r.raise_for_status()
    href = r.json()["href"]

    with requests.get(href, stream=True, timeout=180) as resp:
        resp.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in resp.iter_content(chunk_size=1024*1024):
                if chunk:
                    f.write(chunk)
    return out_path

def safe_filename(s: str, max_len=140):
    s = str(s)
    s = re.sub(r"[\\/:*?\"<>|]+", "_", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s[:max_len].rstrip()

def make_thumb(src_path, dst_path, max_side=900, quality=80):
    dst_path = Path(dst_path)
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    if dst_path.exists():
        return str(dst_path)
    im = Image.open(src_path).convert("RGB")
    im.thumbnail((max_side, max_side))
    im.save(dst_path, format="JPEG", quality=quality, optimize=True)
    return str(dst_path)

# ---------- 1) Prepare dataset for scoring ----------
feat_cols = [
    "brightness_mean","white_share","black_share","contrast_std",
    "sat_mean","colorful_share","edge_density","entropy"
]

# y: 1 = игристое, 0 = неигристое
y_raw = features_df["sparkling"].astype(str).str.lower()
y = y_raw.map(lambda s: 1 if "игрист" in s else (0 if s and s!="nan" else np.nan))

mask = y.notna()
X = features_df.loc[mask, feat_cols].copy()
X = X.fillna(X.median(numeric_only=True))
y = y[mask].astype(int).reset_index(drop=True)

meta_cols = ["image_id","manufacturer","name","color","sugar","sparkling","image_path_rel"]
meta = features_df.loc[mask, meta_cols].reset_index(drop=True)

# ---------- 2) Fit RF on full data (for scoring / confidence) ----------
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=600,
    random_state=42,
    class_weight="balanced",
    min_samples_leaf=5
)
rf.fit(X, y)
proba = rf.predict_proba(X)[:,1]
pred = (proba >= 0.5).astype(int)

work = meta.copy()
work["y_true"] = y
work["p_sparkling"] = proba
work["y_pred"] = pred

# ---------- 3) Human-readable style tags (quantile-based) ----------
q = {}
for c in feat_cols:
    q[c] = features_df[c].quantile([0.25,0.5,0.75]).to_dict()

feat_by_id = features_df.set_index("image_id")[feat_cols].to_dict(orient="index")

def style_tags(img_id):
    f = feat_by_id.get(img_id, {})
    tags = []
    b = f.get("brightness_mean", None)
    if b is not None:
        if b >= q["brightness_mean"][0.75]: tags.append("светлая")
        elif b <= q["brightness_mean"][0.25]: tags.append("тёмная")
    cs = f.get("colorful_share", None)
    if cs is not None:
        if cs >= q["colorful_share"][0.75]: tags.append("цветные акценты")
        elif cs <= q["colorful_share"][0.25]: tags.append("почти монохром")
    ent = f.get("entropy", None)
    if ent is not None:
        if ent >= q["entropy"][0.75]: tags.append("богатая фактура")
        elif ent <= q["entropy"][0.25]: tags.append("минимализм")
    ed = f.get("edge_density", None)
    if ed is not None:
        if ed >= q["edge_density"][0.75]: tags.append("много линий/текста")
        elif ed <= q["edge_density"][0.25]: tags.append("мало мелких деталей")
    return ", ".join(tags)

work["tags"] = work["image_id"].apply(style_tags)

# ---------- 4) Build sections (we'll render all cards, but pre-tag section in data) ----------
def section_label(row):
    if row.y_true==1 and row.y_pred==1: return "TP: типичные игристые"
    if row.y_true==0 and row.y_pred==0: return "TN: типичные не-игристые"
    if row.y_true==0 and row.y_pred==1: return "FP: ложные тревоги"
    return "FN: пропущенные игристые"

work["section"] = work.apply(section_label, axis=1)

# how many cards to include overall (set None for all)
MAX_CARDS_TOTAL = None  # e.g., 400
if MAX_CARDS_TOTAL is not None:
    # keep a balanced-ish sample per section
    per = MAX_CARDS_TOTAL // 4
    tp = work[work.section.str.startswith("TP")].sort_values("p_sparkling", ascending=False).head(per)
    tn = work[work.section.str.startswith("TN")].sort_values("p_sparkling", ascending=True).head(per)
    fp = work[work.section.str.startswith("FP")].sort_values("p_sparkling", ascending=False).head(per)
    fn = work[work.section.str.startswith("FN")].sort_values("p_sparkling", ascending=True).head(per)
    work_view = pd.concat([tp, tn, fp, fn], ignore_index=True)
else:
    work_view = work.copy()

print("Cards in report:", len(work_view))

# ---------- 5) Download thumbs for cards ----------
OUT_DIR = Path("/content/sparkling_visual_report")
IMG_DIR = OUT_DIR / "img"
IMG_DIR.mkdir(parents=True, exist_ok=True)

need_paths = work_view["image_path_rel"].dropna().astype(str).tolist()
need_paths = [p.strip() for p in need_paths if p.strip()]
need_paths = sorted(set(need_paths))

local_map = {}
failed = []

def fetch_one(rel):
    try:
        loc = ydisk_public_download(PUBLIC_KEY, rel, out_dir="/content/cache")
        return rel, loc, None
    except Exception as e:
        return rel, None, str(e)

with ThreadPoolExecutor(max_workers=16) as ex:
    futs = [ex.submit(fetch_one, rel) for rel in need_paths]
    done = 0
    for fut in as_completed(futs):
        rel, loc, err = fut.result()
        done += 1
        if err:
            failed.append((rel, err))
        else:
            local_map[rel] = loc
        if done % 50 == 0 or done == len(need_paths):
            print(f"Downloaded {done}/{len(need_paths)}")

thumb_map = {}
for i, rel in enumerate(need_paths, start=1):
    loc = local_map.get(rel)
    if not loc:
        continue
    thumb_name = f"{i:05d}_{safe_filename(Path(rel).name)}.jpg"
    thumb_path = IMG_DIR / thumb_name
    make_thumb(loc, thumb_path, max_side=900, quality=80)
    thumb_map[rel] = f"img/{thumb_name}"

print("Thumbs:", len(thumb_map), "failed:", len(failed))

# ---------- 6) Render FILTERED HTML ----------
def nice_sku(name, color, sugar):
    parts = [name, color, sugar]
    parts = [str(x) for x in parts if x is not None and str(x)!="nan" and str(x).strip()!=""]
    return " | ".join(parts)

def esc(s): return html.escape("" if s is None else str(s))

# Build per-card JSON for client-side filtering
cards = []
for r in work_view.itertuples(index=False):
    img = thumb_map.get(r.image_path_rel, "")
    cards.append({
        "image_id": str(r.image_id),
        "manufacturer": "" if pd.isna(r.manufacturer) else str(r.manufacturer),
        "name": "" if pd.isna(r.name) else str(r.name),
        "color": "" if pd.isna(r.color) else str(r.color),
        "sugar": "" if pd.isna(r.sugar) else str(r.sugar),
        "sparkling_text": "" if pd.isna(r.sparkling) else str(r.sparkling),
        "image_path_rel": "" if pd.isna(r.image_path_rel) else str(r.image_path_rel),
        "img": img,
        "y_true": int(r.y_true),
        "y_pred": int(r.y_pred),
        "p": float(r.p_sparkling),
        "section": str(r.section),
        "tags": "" if pd.isna(r.tags) else str(r.tags),
        "sku": nice_sku(r.name, r.color, r.sugar),
    })

manufacturers = sorted(set([c["manufacturer"] for c in cards if c["manufacturer"]]))
colors = sorted(set([c["color"] for c in cards if c["color"]]))
sugars = sorted(set([c["sugar"] for c in cards if c["sugar"]]))

css = """
<style>
body{font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial;margin:18px;color:#111;background:#fff;}
h1{font-size:20px;margin:0 0 8px;}
p{margin:6px 0 14px;color:#444;max-width:1100px;}
.panel{position:sticky;top:0;background:#fff;padding:10px 0 12px;border-bottom:1px solid #e8e8e8;z-index:5;}
.controls{display:flex;flex-wrap:wrap;gap:10px;align-items:flex-end;}
.ctrl{display:flex;flex-direction:column;gap:6px;font-size:12px;color:#333;}
select,input[type="text"]{padding:8px 10px;border:1px solid #ddd;border-radius:10px;min-width:220px;}
input[type="range"]{width:220px;}
.checks{display:flex;gap:10px;flex-wrap:wrap;}
.chk{display:flex;gap:6px;align-items:center;font-size:12px;color:#333;border:1px solid #eee;border-radius:999px;padding:6px 10px;background:#fafafa;}
.kpis{font-size:12px;color:#555;margin-top:8px;}
.grid{display:grid;grid-template-columns:repeat(3, 1fr);gap:12px;margin-top:14px;}
.card{border:1px solid #e6e6e6;border-radius:14px;padding:10px;background:#fff;}
.imgwrap{border-radius:12px;overflow:hidden;border:1px solid #eee;background:#fafafa;}
img{width:100%;height:auto;display:block;}
.noimg{padding:40px;text-align:center;color:#999;}
.cap{font-size:12px;line-height:1.25;margin-top:8px;}
.meta{font-size:11px;color:#555;margin-top:6px;}
.tags{font-size:11px;color:#222;margin-top:6px;}
.path{font-size:10px;color:#777;margin-top:6px;word-break:break-word;}
.pill{display:inline-block;font-size:11px;border:1px solid #eee;border-radius:999px;padding:3px 8px;background:#f7f7f7;margin-right:6px;}
@media (max-width: 1100px){ .grid{grid-template-columns:repeat(2, 1fr);} }
@media (max-width: 700px){ .grid{grid-template-columns:1fr;} select,input[type="text"]{min-width:160px;} }
</style>
"""

js = """
<script>
const CARDS = __CARDS_JSON__;

function $(id){ return document.getElementById(id); }

function toBool(v){ return v === true || v === "true" || v === 1 || v === "1"; }

function render(cards){
  const root = $("grid");
  root.innerHTML = "";
  const frag = document.createDocumentFragment();
  for(const c of cards){
    const div = document.createElement("div");
    div.className = "card";
    div.innerHTML = `
      <div class="imgwrap">${c.img ? `<img src="${c.img}">` : `<div class="noimg">no image</div>`}</div>
      <div class="cap"><b>${escapeHtml(c.manufacturer || "")}</b></div>
      <div class="cap">${escapeHtml(c.sku || "")}</div>
      <div class="meta">
        <span class="pill">${escapeHtml(c.section)}</span>
        <span class="pill">p(игристое): <b>${c.p.toFixed(2)}</b></span>
        <span class="pill">факт: ${c.y_true ? "игристое" : "не игристое"}</span>
        <span class="pill">предсказание: ${c.y_pred ? "игристое" : "не игристое"}</span>
      </div>
      <div class="tags">${escapeHtml(c.tags || "")}</div>
      <div class="path">${escapeHtml(c.image_path_rel || "")}</div>
    `;
    frag.appendChild(div);
  }
  root.appendChild(frag);
  $("kpi").textContent = `Показано: ${cards.length} / ${CARDS.length}`;
}

function escapeHtml(str){
  return String(str)
    .replaceAll("&","&amp;")
    .replaceAll("<","&lt;")
    .replaceAll(">","&gt;")
    .replaceAll('"',"&quot;")
    .replaceAll("'","&#039;");
}

function applyFilters(){
  const q = ($("q").value || "").trim().toLowerCase();
  const manu = $("manu").value;
  const col = $("col").value;
  const sug = $("sug").value;
  const sec = $("sec").value;

  const pmin = parseFloat($("pmin").value);
  const pmax = parseFloat($("pmax").value);

  const onlyTrueSpark = $("onlyTrueSpark").checked;
  const onlyTrueStill = $("onlyTrueStill").checked;
  const onlyTP = $("onlyTP").checked;
  const onlyFP = $("onlyFP").checked;
  const onlyFN = $("onlyFN").checked;
  const onlyTN = $("onlyTN").checked;

  let activeSections = [];
  if(onlyTP) activeSections.push("TP");
  if(onlyTN) activeSections.push("TN");
  if(onlyFP) activeSections.push("FP");
  if(onlyFN) activeSections.push("FN");

  const out = [];
  for(const c of CARDS){
    if(manu && c.manufacturer !== manu) continue;
    if(col && c.color !== col) continue;
    if(sug && c.sugar !== sug) continue;

    if(sec && c.section !== sec) continue;

    if(!(c.p >= pmin && c.p <= pmax)) continue;

    if(onlyTrueSpark && c.y_true !== 1) continue;
    if(onlyTrueStill && c.y_true !== 0) continue;

    if(activeSections.length){
      const tag = c.section.slice(0,2); // "TP", "TN", ...
      if(!activeSections.includes(tag)) continue;
    }

    if(q){
      const hay = `${c.manufacturer} ${c.name} ${c.color} ${c.sugar} ${c.tags} ${c.image_id}`.toLowerCase();
      if(!hay.includes(q)) continue;
    }
    out.push(c);
  }

  // sort: by p desc by default, but invert for TN if checkbox selected
  const sortMode = $("sort").value;
  if(sortMode === "p_desc"){
    out.sort((a,b)=>b.p-a.p);
  } else if(sortMode === "p_asc"){
    out.sort((a,b)=>a.p-b.p);
  } else if(sortMode === "exceptions"){
    // prioritize FP and FN, then by confidence
    const rank = (s)=> s.startsWith("FP")||s.startsWith("FN") ? 0 : 1;
    out.sort((a,b)=>{
      const ra = rank(a.section), rb = rank(b.section);
      if(ra !== rb) return ra-rb;
      return b.p-a.p;
    });
  }

  render(out);
}

function setPLabels(){
  $("pmin_lbl").textContent = $("pmin").value;
  $("pmax_lbl").textContent = $("pmax").value;
}

function resetAll(){
  $("q").value = "";
  $("manu").value = "";
  $("col").value = "";
  $("sug").value = "";
  $("sec").value = "";
  $("pmin").value = "0.00";
  $("pmax").value = "1.00";
  $("onlyTrueSpark").checked = false;
  $("onlyTrueStill").checked = false;
  $("onlyTP").checked = false;
  $("onlyTN").checked = false;
  $("onlyFP").checked = false;
  $("onlyFN").checked = false;
  $("sort").value = "p_desc";
  setPLabels();
  applyFilters();
}

document.addEventListener("DOMContentLoaded", ()=>{
  // init p labels
  setPLabels();
  // wire events
  for(const id of ["q","manu","col","sug","sec","sort"]){
    $(id).addEventListener("input", applyFilters);
    $(id).addEventListener("change", applyFilters);
  }
  for(const id of ["pmin","pmax"]){
    $(id).addEventListener("input", ()=>{ setPLabels(); applyFilters(); });
  }
  for(const id of ["onlyTrueSpark","onlyTrueStill","onlyTP","onlyTN","onlyFP","onlyFN"]){
    $(id).addEventListener("change", applyFilters);
  }
  $("reset").addEventListener("click", resetAll);

  // First render
  applyFilters();
});
</script>
"""

cards_json = json.dumps(cards, ensure_ascii=False)

html_parts = []
html_parts.append("<!doctype html><html><head><meta charset='utf-8'>")
html_parts.append("<meta name='viewport' content='width=device-width, initial-scale=1'>")
html_parts.append(css)
html_parts.append("</head><body>")

html_parts.append("<h1>Как этикетки кодируют «игристое» — интерактивный отчёт</h1>")
html_parts.append("""
<p>
Фильтруй и смотри глазами: типичные игристые, типичные тихие, и самое интересное — исключения (FP/FN).
</p>
""")

# Controls panel
def opt_list(items):
    return "\n".join([f"<option value='{esc(x)}'>{esc(x)}</option>" for x in items])

section_opts = [
    "TP: типичные игристые",
    "TN: типичные не-игристые",
    "FP: ложные тревоги",
    "FN: пропущенные игристые",
]

html_parts.append(f"""
<div class="panel">
  <div class="controls">
    <div class="ctrl">
      <div>Поиск</div>
      <input id="q" type="text" placeholder="бренд / название / теги / image_id">
    </div>

    <div class="ctrl">
      <div>Производитель</div>
      <select id="manu">
        <option value="">(любой)</option>
        {opt_list(manufacturers)}
      </select>
    </div>

    <div class="ctrl">
      <div>Цвет вина</div>
      <select id="col">
        <option value="">(любой)</option>
        {opt_list(colors)}
      </select>
    </div>

    <div class="ctrl">
      <div>Сахар</div>
      <select id="sug">
        <option value="">(любой)</option>
        {opt_list(sugars)}
      </select>
    </div>

    <div class="ctrl">
      <div>Секция</div>
      <select id="sec">
        <option value="">(любая)</option>
        {opt_list(section_opts)}
      </select>
    </div>

    <div class="ctrl">
      <div>p(игристое) от <span id="pmin_lbl">0</span> до <span id="pmax_lbl">1</span></div>
      <div style="display:flex;gap:8px;align-items:center;">
        <input id="pmin" type="range" min="0" max="1" value="0.00" step="0.01">
        <input id="pmax" type="range" min="0" max="1" value="1.00" step="0.01">
      </div>
    </div>

    <div class="ctrl">
      <div>Сортировка</div>
      <select id="sort">
        <option value="p_desc">уверенность ↓</option>
        <option value="p_asc">уверенность ↑</option>
        <option value="exceptions">сначала исключения</option>
      </select>
    </div>

    <div class="ctrl">
      <div>Быстрые фильтры</div>
      <div class="checks">
        <label class="chk"><input type="checkbox" id="onlyTrueSpark"> только факт=игристое</label>
        <label class="chk"><input type="checkbox" id="onlyTrueStill"> только факт=не игристое</label>
        <label class="chk"><input type="checkbox" id="onlyTP"> TP</label>
        <label class="chk"><input type="checkbox" id="onlyTN"> TN</label>
        <label class="chk"><input type="checkbox" id="onlyFP"> FP</label>
        <label class="chk"><input type="checkbox" id="onlyFN"> FN</label>
        <button id="reset" style="padding:8px 12px;border:1px solid #ddd;border-radius:10px;background:#fff;cursor:pointer;">Сброс</button>
      </div>
    </div>
  </div>
  <div class="kpis" id="kpi"></div>
</div>

<div id="grid" class="grid"></div>
""")

# JS with embedded data
html_parts.append(js.replace("__CARDS_JSON__", cards_json))

html_parts.append("</body></html>")

OUT_DIR.mkdir(parents=True, exist_ok=True)
html_path = OUT_DIR / "index.html"
html_path.write_text("\n".join(html_parts), encoding="utf-8")
print("Saved:", html_path)

# Preview
from IPython.display import IFrame, display
display(IFrame(str(html_path), width="100%", height=920))

# Zip for download
zip_path = shutil.make_archive("/content/sparkling_visual_report", "zip", str(OUT_DIR))
from google.colab import files
files.download(zip_path)


Cards in report: 1070
Downloaded 50/1067
Downloaded 100/1067
Downloaded 150/1067
Downloaded 200/1067
Downloaded 250/1067
Downloaded 300/1067
Downloaded 350/1067
Downloaded 400/1067
Downloaded 450/1067
Downloaded 500/1067
Downloaded 550/1067
Downloaded 600/1067
Downloaded 650/1067
Downloaded 700/1067
Downloaded 750/1067
Downloaded 800/1067
Downloaded 850/1067
Downloaded 900/1067
Downloaded 950/1067
Downloaded 1000/1067
Downloaded 1050/1067
Downloaded 1067/1067
Thumbs: 1067 failed: 0
Saved: /content/sparkling_visual_report/index.html


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [111]:
y_color = features_df["color"].astype(str).str.strip()
# оставим только основные классы
keep = y_color.isin(["Белое","Красное","Розовое","Розе"])
Xc = X[keep].copy()
yc = y_color[keep].copy()

print("Color rows:", len(yc))
print(yc.value_counts())


Color rows: 1047
color
Белое      482
Красное    435
Розовое     82
Розе        48
Name: count, dtype: int64


In [112]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(
    Xc, yc, test_size=0.25, random_state=42, stratify=yc
)

clf = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(max_iter=3000, class_weight="balanced", multi_class="ovr"))
])

clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print(classification_report(y_test, pred, digits=3))


              precision    recall  f1-score   support

       Белое      0.695     0.339     0.456       121
     Красное      0.664     0.725     0.693       109
        Розе      0.100     0.333     0.154        12
     Розовое      0.114     0.250     0.156        20

    accuracy                          0.492       262
   macro avg      0.393     0.412     0.365       262
weighted avg      0.610     0.492     0.518       262





In [113]:
lr = clf.named_steps["lr"]
classes = lr.classes_
coefs = lr.coef_  # shape: (n_classes, n_features)

rows = []
for ci, c in enumerate(classes):
    tmp = pd.DataFrame({"class": c, "feature": feat_cols, "coef": coefs[ci]})
    tmp = tmp.sort_values("coef", ascending=False).head(5)
    rows.append(tmp)
top_coef = pd.concat(rows, ignore_index=True)
display(top_coef)


Unnamed: 0,class,feature,coef
0,Белое,sat_mean,1.392175
1,Белое,brightness_mean,0.573948
2,Белое,contrast_std,0.355716
3,Белое,black_share,0.331712
4,Белое,white_share,0.084912
5,Красное,edge_density,0.712201
6,Красное,contrast_std,0.553226
7,Красное,colorful_share,0.355163
8,Красное,white_share,-0.209344
9,Красное,entropy,-0.213914


In [1]:
import os, glob
print("cache exists:", os.path.exists("/content/cache"))
print("cache files:", len(glob.glob("/content/cache/**/*", recursive=True)))

print("report exists:", os.path.exists("/content/sparkling_visual_report/index.html"))

print("drive mounted:", os.path.exists("/content/drive/MyDrive"))


cache exists: False
cache files: 0
report exists: False
drive mounted: False


In [2]:
import os

path = "/content/label_visual_features.csv"
print("exists:", os.path.exists(path))
if os.path.exists(path):
    print("size (bytes):", os.path.getsize(path))
    # быстрый sanity-check: прочитать пару строк
    import pandas as pd
    df_check = pd.read_csv(path)
    print("rows:", len(df_check), "cols:", df_check.shape[1])
    display(df_check.head(3))


exists: False


In [3]:
!ls -lah /content | sed -n '1,200p'
!find /content -maxdepth 3 -type f -iname "*visual*features*.csv" -o -iname "label_visual_features.csv"


total 16K
drwxr-xr-x 1 root root 4.0K Jan 16 14:24 .
drwxr-xr-x 1 root root 4.0K Feb 14 08:20 ..
drwxr-xr-x 4 root root 4.0K Jan 16 14:24 .config
drwxr-xr-x 1 root root 4.0K Jan 16 14:24 sample_data
