In [1]:
import os

# Project root: the relative paths used later in this notebook (e.g. DATA_DIR)
# are resolved against the working directory set here.
# Set the MARS_PROJECT_ROOT environment variable to run on another machine
# without editing the notebook; when it is unset or empty the original
# location is used, so existing setups behave exactly as before.
PROJECT_ROOT = os.environ.get("MARS_PROJECT_ROOT") or "/home/yaroslav/FCUL/MARS_1.0"
os.chdir(PROJECT_ROOT)

# --- Google Colab alternative (use instead of the lines above) ---
# from google.colab import drive
# drive.mount('/content/drive')
# import os
# os.chdir('/content/drive/My Drive/lisboa_ciencia_de_dados/MARS_1.0')
# !pip install -qq igraph ijson

In [2]:
import gc
import pickle
import random
import traceback
from time import perf_counter

import igraph as ig
import networkx as nx
import pandas as pd

from project_cda.tag_formatter import set_log_level, log

In [3]:
# Verbosity of the project logger. The levels used with this notebook are
# 'DEBUG', 'INFO', 'WARNING' and 'ERROR'; change the constant to switch.
LOG_LEVEL = 'DEBUG'
set_log_level(LOG_LEVEL)

In [4]:
from project_cda.anime_graph_builder import AnimeGraphBuilder
from project_cda.community_tracker import CommunityTracker
from project_cda.cluster_evaluation import ClusterEvaluation
from project_cda.partition_enricher import PartitionEnricher
from project_cda.cluster_visualizer import ClusterVisualizer
from project_cda.path_manager import PathManager

In [5]:
# Input locations, all relative to the current working directory.
DATA_DIR = "data"
_ANIME_DATASET_DIR = f"{DATA_DIR}/datasets/anime_azathoth42"

USERS_CSV_PATH = f"{_ANIME_DATASET_DIR}/users_sterilized.csv"
ANIME_CSV_PATH = f"{_ANIME_DATASET_DIR}/anime_sterilized.csv"
# Filtered user dictionary (the original note on this line reads "95 percentile").
USER_DICT_PATH = f"{DATA_DIR}/helpers/user_dict_filtered.json"

In [6]:
# The builder is constructed from the three input files configured above;
# the keyword names are the ones AnimeGraphBuilder is called with in this notebook.
_builder_inputs = {
    "users_csv_path": USERS_CSV_PATH,
    "user_dict_json_path": USER_DICT_PATH,
    "anime_csv_path": ANIME_CSV_PATH,
}
graph_builder = AnimeGraphBuilder(**_builder_inputs)

[12:46:59] [DEBUG] [AGB] Anime Graph Builder initialzed for


## EDGE SETTINGS (keep **ONE** option uncommented)

In [7]:
# === EDGE SETTINGS ===
def get_edges_config():
    """Return the edge-construction config as ``{"name": ..., "kwargs": {...}}``.

    Exactly one option should be active; the alternatives stay commented out.
    A fresh dict is returned on every call.
    """
    # --- METHOD: Raw / Projected ---
    # return {
    #     "name": "raw",
    #     "kwargs": {"threshold": 0},
    # }

    # --- METHOD: Jaccard (active) ---
    jaccard_kwargs = {"threshold": 0.15}
    return {"name": "jaccard", "kwargs": jaccard_kwargs}

## SPARSIFICATION SETTINGS (keep **ONE** option uncommented)

In [8]:

def get_sparsing_config():
    """Return the sparsification config as ``{"name": ..., "kwargs": {...}}``.

    Exactly one option should be active; the alternatives stay commented out.
    A fresh dict is returned on every call.
    """
    # --- METHOD: No sparsing ---
    # return {"name": "full",
    #         "kwargs": {}}

    # --- METHOD: KNN ---
    # return {
    #     "name": "knn",
    #     "kwargs": {"k": 10},
    # }

    # --- METHOD: Backbone (active) ---
    backbone_kwargs = dict(alpha=0.05)
    return dict(name="backbone", kwargs=backbone_kwargs)

## CLUSTERING ALGORITHM SETTINGS (keep **ONE** option uncommented)

In [9]:
# === ALGORITHM SETTINGS ===
def get_algo_config():
    """Return the clustering config as ``{"name": ..., "kwargs": {...}}``.

    ``name`` selects the algorithm and ``kwargs`` are forwarded to it by the
    clustering cell. Exactly one option should be active; the alternatives
    stay commented out. A fresh dict is returned on every call.
    """
    # --- 2. LEIDEN: CPM (Constant Potts Model) ---
    # return {
    #     "name": "leiden_cpm",
    #     "kwargs": {
    #         "objective_function": "CPM",
    #         "resolution": 0.05, # 0.01, 0.05, 0.1...
    #         "n_iterations": -1
    #     }
    # }

    # --- 3. LEADING EIGENVECTOR (spectral) ---
    # return {
    #     "name": "eigenvector",
    #     "kwargs": {
    #         "clusters": None
    #     }
    # }

    # --- 4. WALKTRAP (random walks) ---
    # return {
    #     "name": "walktrap",
    #     "kwargs": {
    #         "steps": 4  # Walk length. (3-4) -> small clusters. (8-10) -> large ones.
    #     }
    # }

    # --- 5. INFOMAP (flow-based) ---
    # return {
    #     "name": "infomap",
    #     "kwargs": {
    #         "trials": 10  # Number of attempts. More -> more stable result.
    #     }
    # }

    # --- 6. LABEL PROPAGATION ---
    # return {
    #     "name": "label_propagation",
    #     "kwargs": {}
    # }

    # --- 1. LEIDEN: MODULARITY (the classic choice, active) ---
    leiden_kwargs = {}
    leiden_kwargs["objective_function"] = "modularity"
    # "Gamma". 1.0 is the standard value; larger values give smaller clusters.
    leiden_kwargs["resolution"] = 1.0
    # -1 = keep iterating until convergence (recommended).
    leiden_kwargs["n_iterations"] = -1
    return {"name": "leiden_mod", "kwargs": leiden_kwargs}

In [10]:
# def make_experiment_name(edge_conf, sparse_conf, algo_conf):
#     parts = []
#     def safetostr(obj):
#         return "none" if obj is None else str(obj)

#     # 1. Edges part (e.g., "Jac005")
#     e_name = edge_conf['name']
#     th_str = str(edge_conf['threshold']).replace('.', '')
#     parts.append(f"{e_name}{th_str}")
    
#     # 2. Spars part (e.g., "KNN20")
#     s_type = sparse_conf.get('type')
#     if s_type == 'knn':
#         k = sparse_conf.get('k')
#         parts.append(f"KNN{k}")
#     elif s_type == 'backbone':
#         a = str(sparse_conf.get('alpha')).replace('.', '')
#         parts.append(f"BB{a}")
#     else:
#         parts.append("Full")
        
#     # 3. Algo part (e.g., "LeidenM10")
#     algo = algo_conf['name']
#     if algo == 'leiden':
#         res = str(algo_conf['kwargs'].get('resolution', 1.0)).replace('.', '')
#         parts.append(f"LMod{res}")
#     elif algo == 'infomap':
#         parts.append(f"InfoT{algo_conf['kwargs'].get('trials', 1)}")
#     else:
#         parts.append(algo.capitalize())
        
#     return "_".join(parts)

In [11]:
# if not os.path.exists(DATA_DIR):
#     os.makedirs(DATA_DIR)

# GRAPH_DIR = f"{DATA_DIR}/graphs/"
# if not os.path.exists(GRAPH_DIR):
#     os.makedirs(GRAPH_DIR)

# REPORT_DIR = f"{DATA_DIR}/reports/"
# if not os.path.exists(REPORT_DIR):
#     os.makedirs(REPORT_DIR)

# PARTITION_DIR = f"{DATA_DIR}/partitions/"
# if not os.path.exists(PARTITION_DIR):
#     os.makedirs(PARTITION_DIR)

# PLOTS_DIR = f"{DATA_DIR}/plots/"
# if not os.path.exists(PLOTS_DIR):
#     os.makedirs(PLOTS_DIR)

In [12]:
# # 1. Fetch the settings
# EDGES_CONF = get_edges_config()
# SPARS_CONF = get_sparsing_config()
# ALGO_CONF = get_algo_config()

# # 2. Build the experiment name
# EXP_NAME = make_experiment_name(EDGES_CONF, SPARS_CONF, ALGO_CONF)
# CURRENT_EXP_GRAPH_DIR = os.path.join(GRAPH_DIR, EXP_NAME)
# CURRENT_EXP_REPORT_DIR = os.path.join(REPORT_DIR, EXP_NAME)
# CURRENT_EXP_PARTITION_DIR = os.path.join(PARTITION_DIR, EXP_NAME)
# CURRENT_EXP_PLOTS_DIR = os.path.join(PLOTS_DIR, EXP_NAME)

# if not os.path.exists(CURRENT_EXP_GRAPH_DIR):
#     os.makedirs(CURRENT_EXP_GRAPH_DIR)
# if not os.path.exists(CURRENT_EXP_REPORT_DIR):
#     os.makedirs(CURRENT_EXP_REPORT_DIR)
# if not os.path.exists(CURRENT_EXP_PARTITION_DIR):
#     os.makedirs(CURRENT_EXP_PARTITION_DIR)
# if not os.path.exists(CURRENT_EXP_PLOTS_DIR):
#     os.makedirs(CURRENT_EXP_PLOTS_DIR)

# print(f"EDGES CONFIG:           {EDGES_CONF}")
# print(f"SPARSING CONFIG:        {SPARS_CONF}")
# print(f"ALGORITHM CONFIG:       {ALGO_CONF}")
# print(f"EXPERIMENT:             {EXP_NAME}")
# print(f"OUTPUT GRAPH PATH:      {CURRENT_EXP_GRAPH_DIR}")
# print(f"OUTPUT PARTITIONS PATH: {CURRENT_EXP_PARTITION_DIR}")
# print(f"OUTPUT REPORT PATH:     {CURRENT_EXP_REPORT_DIR}")
# print(f"OUTPUT PLOT PATH:       {CURRENT_EXP_PLOTS_DIR}")

In [13]:
# Project root handed to PathManager. The first cell already chdir()s into the
# project root and DATA_DIR is relative to it, so reuse the working directory
# instead of repeating the machine-specific absolute path a second time.
base_path = os.getcwd()
pm = PathManager(base_path)

# Snapshot the active option from each of the three settings cells.
EDGES_CONF = get_edges_config()
SPARS_CONF = get_sparsing_config()
ALGO_CONF = get_algo_config()
GRAPH_TYPE = 'anime'  # or 'users'

# Resolve this experiment's directories and create them; the resulting dict
# provides the 'raw_dir', 'input_graph_dir', 'experiment_dir' and 'exp_id'
# entries read by the cells below.
paths = pm.get_paths(GRAPH_TYPE, EDGES_CONF, SPARS_CONF, ALGO_CONF)
pm.ensure_dirs(paths)
paths

{'graph_type': 'anime',
 'raw_dir': '/home/yaroslav/FCUL/MARS_1.0/data/graphs/anime/raw/jacth015',
 'input_graph_dir': '/home/yaroslav/FCUL/MARS_1.0/data/graphs/anime/sparse/jacth015_bba005',
 'experiment_dir': '/home/yaroslav/FCUL/MARS_1.0/data/experiments/anime/jacth015_bba005_ldnMODiter-1res1',
 'exp_id': 'jacth015_bba005_ldnMODiter-1res1'}

In [14]:
def get_or_build_raw(year, raw_dir, build_cfg, builder):
    """
    Return the raw (not yet sparsified) graph for ``year``.

    The pickle ``<raw_dir>/<year>.pkl`` is returned when it exists and loads
    cleanly. Otherwise the graph is built through ``builder``, which is given
    that same path as ``output_path``.

    Args:
        year: Year whose graph is requested; also names the cache file.
        raw_dir: Directory holding the raw-graph cache files.
        build_cfg: Edge config; its ``'kwargs'`` dict is forwarded to
            ``builder.build_edges``.
        builder: Object providing ``build_edges(year=..., **kwargs)`` that
            returns ``(edges, counts)`` and
            ``build_graph(edges, counts, output_path=...)``.

    Returns:
        The unpickled cached object, or whatever ``builder.build_graph`` returns.
    """
    cache_file = os.path.join(raw_dir, f"{year}.pkl")

    # Fast path: reuse the cached pickle when it is present and readable.
    if os.path.exists(cache_file):
        log(f"Loading RAW graph from: {cache_file}", tag="CACHE", level='DEBUG')
        try:
            with open(cache_file, "rb") as fh:
                cached_graph = pickle.load(fh)
        except Exception as err:
            # An unreadable cache is not fatal: report it and fall through to a rebuild.
            log(f"Corrupt RAW cache {year}: {err}. Rebuilding...", tag="WARN", level='WARNING')
        else:
            return cached_graph

    # Slow path: there is no usable cache, so build from scratch.
    log(f"Building RAW graph from scratch for {year}...", tag="BUILD", level='DEBUG')
    started = perf_counter()

    # Edge construction (Jaccard etc.), parameterised by the edge config.
    edge_kwargs = build_cfg['kwargs']
    edges, counts = builder.build_edges(year=year, **edge_kwargs)

    # The builder receives the cache path so it can persist the new graph there.
    raw_graph = builder.build_graph(edges, counts, output_path=cache_file)

    elapsed = perf_counter() - started
    log(f"Raw built in {elapsed:.2f}s", tag="TIME", level='DEBUG')

    # Release the intermediate edge/count structures before returning.
    del edges, counts
    gc.collect()

    return raw_graph

def get_target_graph(year, paths, build_cfg, sparse_cfg, builder):
    """
    Return the graph that should be clustered for ``year``.

    Lookup order:
      1. ``<paths['input_graph_dir']>/<year>.pkl`` when it exists and loads.
      2. Otherwise the raw graph from ``get_or_build_raw``, sparsified as
         requested by ``sparse_cfg``.

    Args:
        year: Year whose graph is requested; also names the cache file.
        paths: Dict providing ``'input_graph_dir'`` and ``'raw_dir'``.
        build_cfg: Edge config, passed through to ``get_or_build_raw``.
        sparse_cfg: Sparsification config. ``None``/empty, or a config whose
            ``'name'`` is ``'full'`` (the "No sparsing" option of
            ``get_sparsing_config``), means the raw graph is returned
            unchanged. Otherwise ``'name'`` must be ``'knn'`` or
            ``'backbone'`` and ``'kwargs'`` is forwarded to the matching
            ``builder.sparsify_*`` method.
        builder: Object providing ``sparsify_knn`` / ``sparsify_backbone``
            (and whatever ``get_or_build_raw`` needs).

    Returns:
        The cached target graph, the sparsified graph, or the raw graph.

    Raises:
        ValueError: ``sparse_cfg['name']`` is not a known method.
    """
    target_path = os.path.join(paths['input_graph_dir'], f"{year}.pkl")

    # A. A ready-made target graph on disk wins.
    if os.path.exists(target_path):
        log(f"Target graph found: {target_path}", tag="CACHE", level='DEBUG')
        try:
            # NOTE(security): pickle.load can execute arbitrary code from the
            # file; only point the graph directories at files this pipeline wrote.
            with open(target_path, "rb") as f:
                return pickle.load(f)
        except Exception as e:
            # Unreadable cache: report it and regenerate below.
            log(f"Corrupt Target cache: {e}. Will regenerate.", tag="WARN", level='WARNING')

    # B. No usable target: obtain the raw graph (loaded from cache or built).
    G = get_or_build_raw(year, paths['raw_dir'], build_cfg, builder)

    # C. No sparsification requested: the raw graph is the target.
    # "full" used to fall into the dispatch below and raise ValueError even
    # though it is the documented "No sparsing" option.
    method = sparse_cfg['name'] if sparse_cfg else None
    if method is None or method == 'full':
        return G

    # D. Sparsify the raw graph with the requested method.
    log(f"Sparsifying ({method})...", tag="SPARSE", level='DEBUG')

    # Copy so the caller's config is not mutated; a missing 'kwargs' means "no options".
    s_kwargs = dict(sparse_cfg.get('kwargs') or {})
    s_kwargs['output_path'] = target_path  # the builder is asked to save the result here

    if method == 'knn':
        return builder.sparsify_knn(G, **s_kwargs)
    if method == 'backbone':
        return builder.sparsify_backbone(G, **s_kwargs)
    raise ValueError(f"Unknown sparse method: {method}")

In [15]:
# --- Run configuration for this cell ---
# Years to process; range() excludes its upper bound, so this is 2006 and 2007.
YEARS = range(2006, 2008)
# python-igraph draws its random numbers from Python's `random` module by
# default, so reseeding it before each run makes the stochastic algorithms
# (e.g. Leiden) repeatable across Restart & Run All.
RANDOM_SEED = 42


def _cluster_graph(G, algo_name, algo_args, weights):
    """Run the community detection selected by ``algo_name`` on ``G``.

    Args:
        G: Graph to cluster (an igraph graph in this notebook).
        algo_name: One of 'leiden_mod', 'leiden_cpm', 'eigenvector',
            'walktrap', 'infomap', 'label_propagation'.
        algo_args: Extra keyword arguments forwarded to the algorithm.
        weights: Edge weights, or None for an unweighted run.

    Returns:
        The partition object produced by the algorithm; the caller reads its
        ``membership``.

    Raises:
        ValueError: ``algo_name`` is not one of the names above.
    """
    if algo_name in ('leiden_mod', 'leiden_cpm'):
        return G.community_leiden(weights=weights, **algo_args)
    if algo_name == 'eigenvector':
        return G.community_leading_eigenvector(weights=weights, **algo_args)
    if algo_name == 'walktrap':
        # Walktrap yields a dendrogram-like result; as_clustering() turns it into a partition.
        return G.community_walktrap(weights=weights, **algo_args).as_clustering()
    if algo_name == 'infomap':
        # Infomap names its weight argument edge_weights rather than weights.
        return G.community_infomap(edge_weights=weights, **algo_args)
    if algo_name == 'label_propagation':
        # kwargs are normally empty here, but forward them like every other branch.
        return G.community_label_propagation(weights=weights, **algo_args)
    raise ValueError(f"Unknown algo: {algo_name}")


partitions_by_year = {}   # year -> {vertex name: cluster id}
modularity_by_year = {}   # year -> modularity of that year's partition

raw_dir = paths['raw_dir']            # directory of the raw graphs (e.g. .../raw/jac005)
input_dir = paths['input_graph_dir']  # directory the clustered graphs are loaded from (raw_dir or .../sparse/...)

log(f"Graph Source: {input_dir}", tag="SETUP", level='INFO')
if paths['experiment_dir']:
    log(f"Experiment Output: {paths['experiment_dir']}", tag="SETUP", level='INFO')

for year in YEARS:
    log(f">>> Processing {year}...", tag="YEAR", level='INFO')

    # A failure in one year is logged and the loop moves on to the next year.
    try:
        G = get_target_graph(year, paths, EDGES_CONF, SPARS_CONF, graph_builder)

        algo_name = ALGO_CONF['name']
        algo_args = ALGO_CONF.get('kwargs', {})
        log(f"Clustering with {algo_name}...", tag="ALGO", level='DEBUG')

        if "weight" in G.edge_attributes():
            weights = G.es["weight"]
        else:
            weights = None
            log("No weights found in graph, clustering will be unweighted.", tag="WARN", level='WARNING')

        # Reseed per year so each year's partition does not depend on which
        # other years were processed before it.
        random.seed(RANDOM_SEED)
        t_c = perf_counter()
        partition = _cluster_graph(G, algo_name, algo_args, weights)
        log(f"Clustered in {perf_counter() - t_c:.2f}s", tag="TIME", level='DEBUG')

        membership = partition.membership
        partitions_by_year[year] = dict(zip(G.vs['name'], membership))

        try:
            mod_val = G.modularity(membership, weights=weights)
        except Exception as mod_err:
            # Keep the historical 0.0 fallback, but make the failure visible
            # instead of silently recording a fake modularity.
            mod_val = 0.0
            log(f"Modularity failed for {year}: {mod_err}. Recording 0.0.", tag="WARN", level='WARNING')

        modularity_by_year[year] = mod_val

        log(f"Done. Stats -> Modularity: {mod_val:.4f}, Clusters: {len(set(membership))}", tag="RESULT", level='INFO')

        # Free the graph and partition before loading the next year.
        del partition, weights, G
        gc.collect()

    except Exception as e:
        log(f"Clustering Error in {year}: {e}", tag="ERROR", level='ERROR')
        traceback.print_exc()


[12:46:59] [INFO] [SETUP] Graph Source: /home/yaroslav/FCUL/MARS_1.0/data/graphs/anime/sparse/jacth015_bba005
[12:46:59] [INFO] [SETUP] Experiment Output: /home/yaroslav/FCUL/MARS_1.0/data/experiments/anime/jacth015_bba005_ldnMODiter-1res1
[12:46:59] [INFO] [YEAR] >>> Processing 2006...
[12:46:59] [DEBUG] [CACHE] Target graph found: /home/yaroslav/FCUL/MARS_1.0/data/graphs/anime/sparse/jacth015_bba005/2006.pkl
[12:46:59] [DEBUG] [ALGO] Clustering with leiden_mod...
[12:46:59] [DEBUG] [ALGO] Clustering: leiden_mod
[12:46:59] [DEBUG] [TIME] Clustered in 0.03s


[12:46:59] [INFO] [RESULT] Done. Stats -> Modularity: 0.4576, Clusters: 10
[12:46:59] [INFO] [YEAR] >>> Processing 2007...
[12:46:59] [DEBUG] [CACHE] Target graph found: /home/yaroslav/FCUL/MARS_1.0/data/graphs/anime/sparse/jacth015_bba005/2007.pkl
[12:46:59] [DEBUG] [ALGO] Clustering with leiden_mod...
[12:46:59] [DEBUG] [ALGO] Clustering: leiden_mod
[12:46:59] [DEBUG] [TIME] Clustered in 0.03s
[12:46:59] [INFO] [RESULT] Done. Stats -> Modularity: 0.6372, Clusters: 239


In [16]:
# Tracker with the 0.1 threshold used for this experiment.
tracker = CommunityTracker(threshold=0.1)
aligned_partitions = tracker.track_communities(partitions_by_year)

# The aligned history is written into the experiment directory; the file name
# is kept in its own variable because the enrichment cell reads it too.
base_partition_filename = "partition.csv"
base_partition_path = os.path.join(paths['experiment_dir'], base_partition_filename)
CommunityTracker.save_aligned_history_to_csv(aligned_partitions, base_partition_path)

[12:46:59] [INFO] [TRACK] Year 2006: done. Unique clusters: 10
[12:46:59] [INFO] [TRACK] Year 2007: done. Unique clusters: 239
[12:46:59] [INFO] [TRACK] Saved partition detail to /home/yaroslav/FCUL/MARS_1.0/data/experiments/anime/jacth015_bba005_ldnMODiter-1res1/partition.csv
[12:46:59] [INFO] [TRACK] Saved partition stats to  /home/yaroslav/FCUL/MARS_1.0/data/experiments/anime/jacth015_bba005_ldnMODiter-1res1/partition_stats.csv


In [17]:
# Enricher over the anime metadata CSV, keyed on anime_id, with "genres" and
# "studio" passed as set_cols.
enricher = PartitionEnricher(
    metadata_path=ANIME_CSV_PATH,
    key_col="anime_id",
    set_cols=["genres", "studio"],
)
anime_meta_dict = enricher.get_metadata_dict()

# Enrich the partition CSV written by the tracking cell.
_partition_csv = f"{paths['experiment_dir']}/{base_partition_filename}"
partition_enriched = enricher.enrich_partition(_partition_csv)

In [18]:
# Leave the frame as the cell's last expression so the notebook renders it as a
# table; print() falls back to a plain-text dump that wraps wide frames.
partition_enriched.head()

   year  anime_id  cluster_id  Unnamed: 0  \
0  2006         1           0           0   
1  2006         5           0           1   
2  2006         6           0           2   
3  2006         7           0           3   
4  2006         8           1           4   

                                           image_url  score  scored_by  \
0  https://myanimelist.cdn-dena.com/images/anime/...   8.81     365465   
1  https://myanimelist.cdn-dena.com/images/anime/...   8.41     111344   
2  https://myanimelist.cdn-dena.com/images/anime/...   8.31     197843   
3  https://myanimelist.cdn-dena.com/images/anime/...   7.34      31908   
4  https://myanimelist.cdn-dena.com/images/anime/...   7.04       4760   

     rank                                      opening_theme  \
0    27.0            ['"Tank!" by The Seatbelts (eps 1-25)']   
1   157.0  ['"Ask DNA" by The Seatbelts featuring Raju Ra...   
2   234.0                       ['"H.T." by Tsuneo Imahori']   
3  2292.0                   

In [19]:
# Evaluate the aligned partitions under this experiment's id.
evaluator = ClusterEvaluation(
    paths['exp_id'],
    aligned_partitions,
    anime_info=anime_meta_dict,
    modularity_dict=modularity_by_year,
)
evaluation_df = evaluator.get_trajectory_df()

# Persist the evaluation table next to the other experiment outputs.
base_evaluation_filename = "evaluation.csv"
base_evaluation_path = os.path.join(paths['experiment_dir'], base_evaluation_filename)
evaluation_df.to_csv(base_evaluation_path, index=False, encoding='utf-8')
evaluation_df

[12:47:00] [INFO] [EVAL] Calculating trajectory for jacth015_bba005_ldnMODiter-1res1...


Unnamed: 0_level_0,Method,Gini_Spatial,Entropy_Info,Stability_AMI,Purity_Source,Purity_Genre,Modularity,N_Clusters
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2006,jacth015_bba005_ldnMODiter-1res1,0.361286,2.079632,,0.532148,0.571819,0.457586,10
2007,jacth015_bba005_ldnMODiter-1res1,0.802037,3.397568,0.170454,0.521457,0.612955,0.637179,239


In [20]:
viz = ClusterVisualizer(partition_enriched)
# All plots are written into the experiment directory.
_plot_dir = paths['experiment_dir']

# A. Sankey
viz.plot_sankey(
    filename=os.path.join(_plot_dir, "sankey.html"),
    key_col="anime_id",
    name_col="title",
    feature_cols=["genres", "studio"],
    metric_col="score",
    sort_col="members",
    age_col="year_start",
    min_link_size=5,
    title="Anime Clusters Over Time",
)

# B. Streamgraph
viz.plot_streamgraph(
    filename=os.path.join(_plot_dir, "stream.html"),
    feature_col="genres",
    title="Rise and Fall of Anime Genres",
)

# C. Bubbles (optional)
viz.plot_bubbles(
    filename=os.path.join(_plot_dir, "bubbles.html"),
    x_col="score",      # further right = higher score
    y_col="members",    # higher up = more popular
    size_col="count",   # bubble size = number of titles
    title="Anime Landscape: Quality vs Popularity",
)

# D. Sunburst (hierarchy)
# NOTE(review): the original comment described "Year -> Cluster -> Studio" and
# the title says "Source", but feature_col is "genres" — confirm which is intended.
viz.plot_sunburst(
    filename=os.path.join(_plot_dir, "sunburst.html"),
    feature_col="genres",
    title="Anime Source Hierarchy",
)

[12:47:00] [DEBUG] [PLOT] Generating Sankey diagram (Anime Clusters Over Time)...
[12:47:01] [INFO] [PLOT] Plot saved to /home/yaroslav/FCUL/MARS_1.0/data/experiments/anime/jacth015_bba005_ldnMODiter-1res1/sankey.html
[12:47:01] [DEBUG] [PLOT] Generating Streamgraph (Rise and Fall of Anime Genres)...
[12:47:04] [INFO] [PLOT] Plot saved to /home/yaroslav/FCUL/MARS_1.0/data/experiments/anime/jacth015_bba005_ldnMODiter-1res1/stream.html
[12:47:04] [DEBUG] [PLOT] Generating Bubble Chart (Anime Landscape: Quality vs Popularity)...
[12:47:04] [INFO] [PLOT] Plot saved to /home/yaroslav/FCUL/MARS_1.0/data/experiments/anime/jacth015_bba005_ldnMODiter-1res1/bubbles.html
[12:47:04] [DEBUG] [PLOT] Generating Sunburst (Anime Source Hierarchy)...
[12:47:05] [INFO] [PLOT] Plot saved to /home/yaroslav/FCUL/MARS_1.0/data/experiments/anime/jacth015_bba005_ldnMODiter-1res1/sunburst.html
