In [1]:
import os
import gc
import pickle
import networkx as nx
import igraph as ig
import plotly.graph_objects as go
from collections import defaultdict
# Work from the project root so the relative "data/..." paths used below
# resolve. Set the MARS_ROOT environment variable to point at a checkout in a
# different location; the original path is kept as the default.
os.chdir(os.environ.get("MARS_ROOT", "/home/yaroslav/FCUL/MARS_1.0"))

In [2]:
from project_cda.anime_graph_builder import AnimeGraphBuilder

In [3]:
# Graph builder fed with the sterilized users/anime tables and the filtered
# user dictionary.
abuilder = AnimeGraphBuilder(
    users_csv_path="data/datasets/anime_azathoth42/users_sterilized.csv",
    user_dict_json_path="data/helpers/user_dict_filtered.json",  # 95 percentile
    anime_csv_path="data/datasets/anime_azathoth42/anime_sterilized.csv",
)

In [4]:
# Output folder for the per-year graphs. The trailing slash matters: file
# names are appended to out_path directly when the graphs are saved.
out_path = "data/graphs/Graphs_cleaned_95_percentile_Jaccard/"
directory = os.path.dirname(out_path)
if directory:
    # exist_ok=True already makes this a no-op for an existing folder, so a
    # separate (and racy) os.path.exists() pre-check is not needed. It also
    # fails loudly if the path exists but is a regular file.
    os.makedirs(directory, exist_ok=True)

In [None]:
# Build the edge list and the graph for each year, saving every graph under
# out_path as a .gpickle file.
for year in range(2006, 2008):
    edges, _ = abuilder.build_edges(year=year)
    graph_file = f"{out_path}anime_graph_{year}_jaccard.gpickle"
    G = abuilder.build_graph(
        edges,
        weight_threshold=0,
        plot=True,
        output_path=graph_file,
    )
    # G is rebound on every pass; uncomment to release it eagerly instead:
    # del G
    # gc.collect()

Building stats for 2006...
Users joined until 2006: 133
Calculating weights using method: JACCARD...
Edges built: 64848
Building stats for 2007...
Users joined until 2007: 4148


KeyboardInterrupt: 

In [6]:
# for knn in [10, 20]:

# B = abuilder.sparsify_knn(gpickle_path=f"anime_graph_{year}_new.gpickle", alpha=0.05, weight_threshold=2,
#                                          save_path=f"anime_graph_{year}_backbone.gpickle"
#         )

In [7]:
# with open("data/graphs/Graphs_cleaned_95_percentile_Jaccard/anime_graph_2011_jaccard.gpickle", "rb") as f:
#     G = pickle.load(f)

In [4]:
def nx_to_igraph(G_nx):
    """Convert a NetworkX graph into an undirected igraph graph.

    Nodes are numbered 0..n-1 in ``G_nx.nodes()`` iteration order and the
    original node labels are stored in the ``name`` vertex attribute. Every
    edge receives a ``weight`` attribute taken from the NetworkX edge data,
    falling back to 1 for edges that carry no weight.

    Args:
        G_nx: NetworkX graph to convert.

    Returns:
        tuple: ``(g, mapping)`` where ``g`` is the resulting ``ig.Graph`` and
        ``mapping`` maps each NetworkX node label to its igraph vertex index.
    """
    nodes = list(G_nx.nodes())
    mapping = {node: idx for idx, node in enumerate(nodes)}

    # Walk the edges once so endpoints and weights stay aligned; reading the
    # per-edge data dict also works when parallel edges are present.
    edges = []
    weights = []
    for u, v, data in G_nx.edges(data=True):
        edges.append((mapping[u], mapping[v]))
        weights.append(data.get('weight', 1))

    # Pass the vertex count explicitly: when built from the edge list alone,
    # igraph only creates vertices up to the largest endpoint index, so
    # trailing isolated nodes would be dropped and the name list below would
    # no longer line up with the vertices.
    g = ig.Graph(n=len(nodes), edges=edges, directed=False)
    g.es['weight'] = weights
    g.vs['name'] = nodes

    return g, mapping

In [9]:
# g, mapping = nx_to_igraph(G)

In [10]:
# cl = g.community_leiden(objective_function="modularity", weights="weight", resolution=1)

In [11]:
# from collections import Counter

# community_sizes = Counter(cl.membership)
# sorted_communities = sorted(community_sizes.items(), key=lambda x: x[1], reverse=True)

# for community_id, size in sorted_communities:
#     print(f"Community {community_id}: {size} nodes")

# print(cl.membership)

In [12]:
# cl = g.community_infomap(edge_weights="weight")

In [13]:
# community_sizes = Counter(cl.membership)
# sorted_communities = sorted(community_sizes.items(), key=lambda x: x[1], reverse=True)

# for community_id, size in sorted_communities:
#     print(f"Community {community_id}: {size} nodes")

In [5]:
import random

from project_cda.community_tracker import CommunityTracker

# Leiden is a randomized algorithm. python-igraph draws its random numbers from
# Python's `random` module by default, so seeding it here makes the partitions
# (and every metric derived from them) reproducible across runs.
LEIDEN_SEED = 42
random.seed(LEIDEN_SEED)

# year -> value returned by CommunityTracker.get_membership for that year's graph.
partitions_by_year = {}
for year in range(2006, 2008):
    print(f"handle {year}")
    # NOTE: pickle.load can execute arbitrary code contained in the file; only
    # load graph files that were produced by this project.
    with open(f"data/graphs/Graphs_cleaned_95_percentile_Jaccard/anime_graph_{year}_jaccard.gpickle", "rb") as f:
        G = pickle.load(f)
    g, mapping = nx_to_igraph(G)
    # The NetworkX graph is not needed after conversion; free it before clustering.
    del G
    gc.collect()
    print("clustering...")
    cl = g.community_leiden(objective_function="modularity", weights="weight", resolution=1)
    partition = CommunityTracker.get_membership(graph=g, partition=cl)
    # Same for the igraph graph once the membership has been extracted.
    del g
    gc.collect()
    partitions_by_year[year] = partition


handle 2006
clustering...
handle 2007
clustering...


In [6]:
# Align the per-year partitions with each other.
# NOTE(review): threshold=0.2 is presumably the minimum overlap needed to link
# two communities across years — confirm in CommunityTracker.
print("tracking...")
tracked_communities = CommunityTracker.track_communities(
    partition_by_year=partitions_by_year,
    threshold=0.2,
)

tracking...
Aligning year 2006...
Aligning year 2007...


In [7]:
# Persist the per-year partitions. Per the logged output this writes the detail
# CSV plus a companion "_stats.csv" file next to it.
CommunityTracker.save_aligned_history_to_csv(partitions_by_year, "data/partitions/_jacc_1.csv")

Saved partition detail to data/partitions/_jacc_1.csv
Saved partition stats to  data/partitions/_jacc_1_stats.csv


In [8]:
from project_cda.cluster_evaluation import ClusterEvaluation

# Evaluate the partitions under the label "TEST_METHOD". evaluate() is left as
# the last expression so that its result is shown as the cell output.
ce = ClusterEvaluation("TEST_METHOD", partitions_by_year)
ce.evaluate()

Evaluating method: TEST_METHOD...


{'Method': 'TEST_METHOD',
 'Avg_Gini': 0.4906,
 'Avg_Entropy': 1.7007,
 'Stability_AMI': 0.0902,
 'Count_Volatility': 1.0,
 'Purity_Source': 0.0,
 'Purity_Genre': 0.0}

In [9]:
from project_cda.partition_enricher import PartitionEnricher

# Enrich the saved partition CSV with columns from the anime metadata table.
# NOTE(review): presumably joined on anime_id — confirm in PartitionEnricher.
enricher = PartitionEnricher("data/datasets/anime_azathoth42/anime_sterilized.csv")
df_jaccard = enricher.enrich_partition("data/partitions/_jacc_1.csv")

In [11]:
# Leave the frame as the last expression so the notebook renders it with the
# rich table display instead of line-wrapped plain text.
df_jaccard.head()

   year  anime_id  cluster_id  Unnamed: 0  \
0  2006       457           0         387   
1  2006       558           0         469   
2  2006       565           0         475   
3  2006       846           0         650   
4  2006       853           0         656   

                                           image_url  score  scored_by  \
0  https://myanimelist.cdn-dena.com/images/anime/...   8.74     147314   
1  https://myanimelist.cdn-dena.com/images/anime/...   8.36      26374   
2  https://myanimelist.cdn-dena.com/images/anime/...   7.42      28053   
3  https://myanimelist.cdn-dena.com/images/anime/...   8.11      65095   
4  https://myanimelist.cdn-dena.com/images/anime/...   8.34     335137   

     rank                               opening_theme  \
0    38.0       ['"The Sore Feet Song" by Ally Kerr']   
1   188.0  ['"Saraba Aoki Omakage" by Road of Major']   
2  1951.0                                          []   
3   426.0  ['"Sentimental Generation" by Ami Tokito']   

In [None]:
from project_cda.cluster_visualizer import ClusterVisualizer
# Render the cluster evolution of the enriched partition table as a Sankey
# diagram; per the logged output the plot is saved to the given HTML file.
viz = ClusterVisualizer(df_jaccard)
viz.plot_evolution_sankey(
    filename="data/flow_plots/anime_evolution_new_viz.html",
    key_col="anime_id",
    name_col="title",
    feature_cols=["genres", "studio"],
    metric_col="score",
    sort_col="members",
    age_col="year_start")

Generating Sankey diagram (Evolution)...
Plot saved to data/flow_plots/anime_evolution_new_viz.html
