In [1]:
import os
import gc
import pickle
import networkx as nx
import igraph as ig
import plotly.graph_objects as go
from collections import defaultdict
# Switch to the project root before anything else runs; the cells below use
# relative "data/..." paths and import the project_cda package (presumably both
# are resolved against this directory — confirm).
# MARS_PROJECT_ROOT lets another machine override the original checkout location
# without editing the notebook; when unset, the original path is used.
os.chdir(os.environ.get("MARS_PROJECT_ROOT", "/home/yaroslav/FCUL/MARS_1.0"))

In [2]:
from project_cda.anime_graph_builder import AnimeGraphBuilder

In [3]:
# Instantiate the project's graph builder with the user table, the filtered
# user dictionary and the anime table (all paths are relative to the working directory).
abuilder = AnimeGraphBuilder(users_csv_path="data/datasets/anime_azathoth42/users_sterilized.csv",
                             user_dict_json_path="data/helpers/user_dict_filtered.json",   # 95 percentile
                             anime_csv_path="data/datasets/anime_azathoth42/anime_sterilized.csv")

In [None]:
# Directory prefix for the yearly graph pickles (the trailing slash is kept
# because the prefix is concatenated directly with a file name).
out_path = "data/graphs/Graphs_cleaned_95_percentile_Jaccard/"
directory = os.path.dirname(out_path)
if directory:
    # exist_ok=True already tolerates an existing directory, so a separate
    # os.path.exists() check is redundant. Dropping it also means a regular
    # file sitting at this path is reported here instead of being silently accepted.
    os.makedirs(directory, exist_ok=True)

In [4]:
# Build the edge list for every year in the range (currently just 2013).
# The graph construction below is commented out, so `edges` is computed and
# then discarded; the second return value of build_edges is ignored.
for year in range(2013, 2014):
    edges, _ = abuilder.build_edges(year=year)
    # G = abuilder.build_graph(edges, weight_threshold=0, plot = True, output_path=f"{out_path}anime_graph_{year}_jaccard.gpickle")
    # del G
    # gc.collect()

Building stats for 2013...
Users joined until 2013: 66363


KeyboardInterrupt: 

In [6]:
# for knn in [10, 20]:

# B = abuilder.sparsify_knn(gpickle_path=f"anime_graph_{year}_new.gpickle", alpha=0.05, weight_threshold=2,
#                                          save_path=f"anime_graph_{year}_backbone.gpickle"
#         )

In [7]:
# with open("data/graphs/Graphs_cleaned_95_percentile_Jaccard/anime_graph_2011_jaccard.gpickle", "rb") as f:
#     G = pickle.load(f)

In [8]:
def nx_to_igraph(G_nx):
    """Convert a NetworkX graph into an undirected igraph graph.

    Vertices are numbered in the iteration order of ``G_nx.nodes()``. Every
    node becomes a vertex, including isolated ones, so the vertex count always
    equals the node count and the ``name`` attribute lines up with the mapping.

    Args:
        G_nx: NetworkX-style graph exposing ``nodes()`` and ``edges(data=True)``.

    Returns:
        tuple: ``(g, mapping)`` where ``g`` is the ``ig.Graph`` carrying
        a ``weight`` edge attribute (1 for edges without a weight) and a
        ``name`` vertex attribute holding the original node ids, and
        ``mapping`` is ``{original_node_id: vertex_index}``.
    """
    nodes = list(G_nx.nodes())
    mapping = {node: idx for idx, node in enumerate(nodes)}

    # Collect endpoints and weights in a single pass so both lists share the
    # same edge order; edges lacking a 'weight' attribute default to 1.
    edges = []
    weights = []
    for u, v, data in G_nx.edges(data=True):
        edges.append((mapping[u], mapping[v]))
        weights.append(data.get('weight', 1))

    # Pass the vertex count explicitly: when it is inferred from the edge list
    # alone, trailing isolated nodes (or an edgeless graph) produce fewer
    # vertices than there are names to assign.
    g = ig.Graph(n=len(nodes), edges=edges, directed=False)
    g.es['weight'] = weights
    g.vs['name'] = nodes

    return g, mapping

In [9]:
# g, mapping = nx_to_igraph(G)

In [10]:
# cl = g.community_leiden(objective_function="modularity", weights="weight", resolution=1)

In [11]:
# from collections import Counter

# community_sizes = Counter(cl.membership)
# sorted_communities = sorted(community_sizes.items(), key=lambda x: x[1], reverse=True)

# for community_id, size in sorted_communities:
#     print(f"Community {community_id}: {size} nodes")

# print(cl.membership)

In [12]:
# cl = g.community_infomap(edge_weights="weight")

In [13]:
# community_sizes = Counter(cl.membership)
# sorted_communities = sorted(community_sizes.items(), key=lambda x: x[1], reverse=True)

# for community_id, size in sorted_communities:
#     print(f"Community {community_id}: {size} nodes")

In [None]:
import random

from project_cda.community_tracker import CommunityTracker

# Fixed seed for the Leiden runs. python-igraph draws its randomness from
# Python's `random` module by default, so seeding it makes the partitions
# reproducible across kernel restarts.
LEIDEN_SEED = 42

# Cluster each yearly graph and collect the per-year membership.
partitions_by_year = {}
for year in range(2006, 2012):
    print(f"handle {year}")
    # NOTE(security): pickle.load can execute arbitrary code; only load graph
    # files that were produced by this project.
    with open(f"data/graphs/Graphs_cleaned_95_percentile_Jaccard/anime_graph_{year}_jaccard.gpickle", "rb") as f:
        G = pickle.load(f)
    g, mapping = nx_to_igraph(G)
    # Release the NetworkX graph before clustering to keep peak memory down.
    del G
    gc.collect()
    print("clustering...")
    # Re-seed per year so each year's result does not depend on which other
    # years were processed before it.
    random.seed(LEIDEN_SEED)
    cl = g.community_leiden(objective_function="modularity", weights="weight", resolution=1)
    # Presumably returns {node_id: cluster_id} for this year — confirm against CommunityTracker.
    partition = CommunityTracker.get_membership(graph=g, partition=cl)
    del g
    gc.collect()
    partitions_by_year[year] = partition


handle 2006
clustering...
handle 2007
clustering...
handle 2008
clustering...
handle 2009
clustering...
handle 2010
clustering...
handle 2011
clustering...


In [None]:
# Align the per-year partitions into consistently numbered communities.
# NOTE(review): the meaning of threshold=0.2 is defined inside
# CommunityTracker.track_communities — presumably a minimum overlap for
# matching clusters across years; confirm.
print("tracking...")
tracked_communities = CommunityTracker.track_communities(partition_by_year=partitions_by_year, threshold=0.2)

tracking...
Aligning year 2006...
Aligning year 2007...
Aligning year 2008...
Aligning year 2009...
Aligning year 2010...
Aligning year 2011...


In [16]:

def plot_cluster_evolution(aligned_data: dict, title="Эволюция аниме-сообществ",
                           min_flow: int = 5,
                           output_file: str = "data/helpers/anime_evolution_sankey.html"):
    """Write an interactive Sankey diagram of cluster flows between consecutive years.

    Each (year, cluster) pair becomes one diagram node. A link between two
    nodes of adjacent years counts the items that are present in both years
    and moved from the source cluster to the target cluster.

    Args:
        aligned_data: Mapping ``{year: {node_id: consistent_cluster_id}}``
            (per the original docstring, the result of ``track_communities``).
        title: Title shown on the figure.
        min_flow: Transitions carried by fewer than this many items are left
            out of the diagram to keep it readable.
        output_file: Path of the HTML file to write; its parent directory is
            created when missing.

    Returns:
        None. With fewer than two years a message is printed and nothing is
        written.
    """
    sorted_years = sorted(aligned_data.keys())
    if len(sorted_years) < 2:
        print("Нужно минимум два года для построения потока.")
        return

    # --- 1. Registry of diagram nodes ---
    # (year, cluster_id) -> Plotly node index (0, 1, 2, ...)
    plotly_node_map = {}
    labels = []  # node captions, e.g. "2010: Cl_5"

    for year in sorted_years:
        # dict.fromkeys de-duplicates while keeping first-seen order, so node
        # indices do not depend on set iteration order.
        for c_id in dict.fromkeys(aligned_data[year].values()):
            plotly_node_map[(year, c_id)] = len(labels)
            labels.append(f"{year}: Cl_{c_id}")

    # --- 2. Flows (links) ---
    sources = []  # index of the origin node
    targets = []  # index of the destination node
    values = []   # number of items carried by the link

    # Walk over pairs of adjacent years: (2010, 2011), (2011, 2012), ...
    for year_curr, year_next in zip(sorted_years, sorted_years[1:]):
        membership_curr = aligned_data[year_curr]
        membership_next = aligned_data[year_next]

        # (source_cluster, target_cluster) -> number of items making that move
        transitions = defaultdict(int)

        # Only items present in BOTH years contribute to a flow.
        for node, src_cluster in membership_curr.items():
            if node in membership_next:
                transitions[(src_cluster, membership_next[node])] += 1

        for (src_c_id, tgt_c_id), count in transitions.items():
            # Drop very small flows so the chart stays legible.
            if count < min_flow:
                continue

            sources.append(plotly_node_map[(year_curr, src_c_id)])
            targets.append(plotly_node_map[(year_next, tgt_c_id)])
            values.append(count)

    # --- 3. Build the diagram ---
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labels,
            # A `color` list could be added here so that one consistent
            # cluster id keeps the same colour in every year.
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            # A `color` list (typically a semi-transparent source colour)
            # could be added here.
        ))])

    fig.update_layout(title_text=title, font_size=12, height=600)

    # Make sure the target directory exists before writing the HTML file.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    fig.write_html(output_file)

In [17]:
# Example call (run after track_communities has finished):
plot_cluster_evolution(tracked_communities)