# Clusters

In [1]:
from typing import List, Callable
import json

from jq import jq
import tomlkit


In [2]:
# Load the graph export (a JSON object with "nodes" and "edges") that every
# clustering query below runs against. JSON text is UTF-8, so decode it
# explicitly instead of relying on the platform's default encoding.
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [3]:
# Start with an empty TOML document. Note that later cells rebind `doc`
# (first to an inline sample, then to the contents of input.toml).
doc = tomlkit.document()

In [4]:
# Save to document
def add_cluster_to_doc(cluster, doc):
    """
    Add a cluster to a toml doc.

    A new table holding a multiline ``creators`` array is assigned to
    ``doc[cluster["name"]]``.

    Args:
        cluster: mapping with a ``"name"`` key (the table name) and a
            ``"creators"`` key (an iterable of creator names).
        doc: the tomlkit document (or table) to write into; modified in place.
    """
    tab = tomlkit.table()
    # tomlkit.array() expects raw TOML text; handing it a Python list only
    # works when the list's repr() happens to be valid TOML and silently
    # mangles names containing backslashes or quotes. tomlkit.item() converts
    # the Python list itself.
    creators = tomlkit.item(list(cluster["creators"])).multiline(True)
    tab.add("creators", creators)
    doc[cluster["name"]] = tab


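## Build clusters of creators related to a given creator
Algorithms:
1. 'edge': creators reached through the given creator's largest edges
2. 'color': largest creators sharing the given creator's "Modularity Class"
3. 'xy': connected creators closest to the given creator in x/y position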

In [5]:
# edge
def create_cluster_from_large_edges(creator: str, limit=5) -> List[str]:
    """
    Up to `limit` relevant creators that are connected to `creator`.

    Looks up the node whose label equals `creator`, takes the edges whose
    source is that node, orders them by edge `size` (largest first) and
    returns the labels of the first `limit` target nodes.

    Args:
        creator: label of the node to start from.
        limit: maximum number of edges to follow.

    Returns:
        Labels of the connected creators, strongest edge first. Empty when
        no node carries the label `creator`.
    """
    # Embed the name as a JSON string literal so quotes or backslashes in a
    # label cannot break (or alter) the jq program.
    label = json.dumps(creator, ensure_ascii=False)
    edges = (
        ". as $parent"
        f" | $parent.nodes[] | select(.label=={label})"
        " | .id as $source"
        " | [$parent.edges[] | select(.source == $source)]"
        " | sort_by(.size) | reverse | map(.target)"
        f" | .[0:{limit}][] as $target"
        " | $parent.nodes[] | select(.id == $target) | .label"
    )
    creators = jq(edges).input_value(data).all()
    return creators

In [6]:
# color
def create_cluster_from_color(creator: str, limit=5) -> List[str]:
    """
    Up to `limit` relevant creators in the same group as `creator`.

    The group is the node attribute "Modularity Class". All nodes sharing
    the class of the node labelled `creator` are ordered by node `size`
    (largest first) and the labels of the first `limit` are returned.
    The query does not exclude `creator` itself, so it may appear in the
    result.

    Args:
        creator: label of the node whose group is looked up.
        limit: maximum number of labels to return.

    Returns:
        Labels of the largest nodes in the same group. Empty when no node
        carries the label `creator`.
    """
    # Embed the name as a JSON string literal so quotes or backslashes in a
    # label cannot break (or alter) the jq program.
    label = json.dumps(creator, ensure_ascii=False)
    color = (
        ". as $parent"
        f" | $parent.nodes[] | select(.label=={label})"
        ' | .attributes."Modularity Class" as $mod'
        ' | [$parent.nodes[] | select(.attributes."Modularity Class"==$mod) ]'
        " | sort_by(.size) | reverse"
        f" | .[0:{limit}][] | .label"
    )
    creators = jq(color).input_value(data).all()
    return creators

In [7]:
# xy
def create_cluster_from_xy(creator: str, limit=5) -> List[str]:
    """
    Up to `limit` relevant creators in the same area as `creator`.

    Only nodes that are the target of an edge starting at the node labelled
    `creator` are considered. They are ordered by squared distance
    ``(x - x')**2 + (y - y')**2`` to that node (nearest first) and the
    labels of the first `limit` are returned.

    Args:
        creator: label of the node to start from.
        limit: maximum number of labels to return.

    Returns:
        Labels of the nearest connected creators. Empty when no node
        carries the label `creator`.
    """
    # Embed the name as a JSON string literal so quotes or backslashes in a
    # label cannot break (or alter) the jq program.
    label = json.dumps(creator, ensure_ascii=False)
    xy = (
        "[. as $parent"
        f" | $parent.nodes[] | select(.label=={label})"
        # Plain (non-f) literal: the braces are jq object destructuring.
        " | . as {$id, $x, $y}"
        " | $parent.edges[] | select(.source == $id)"
        " | .target as $target"
        " | $parent.nodes[] | select(.id == $target)"
        " | .dy = ($y-.y)*($y-.y)"
        " | .dx = ($x-.x)*($x-.x)"
        " | .d = .dx + .dy]"
        " | sort_by(.d) | map(.label)"
        f" | .[0:{limit}][]"
    )
    creators = jq(xy).input_value(data).all()
    return creators

In [8]:
# Dispatch table: the `algo` value stored with a cluster selects the
# function used to find related creators.
string_to_func: dict[str, Callable] = dict(
    edge=create_cluster_from_large_edges,
    color=create_cluster_from_color,
    xy=create_cluster_from_xy,
)


## Load existing clusters

In [9]:
# Inline sample cluster showing the expected layout of a cluster table
# (`algo`, `max`, `creators`). The next cell rebinds `doc` to the contents
# of input.toml.
doc = tomlkit.parse("""[test]
    algo="xy"
    max=15
    creators = ["shroud", "VALORANT"]
    """)

In [10]:
# Load the existing cluster definitions. TOML files are UTF-8 by
# specification, so decode explicitly rather than with the platform default.
with open("input.toml", "r", encoding="utf-8") as f:
    doc = tomlkit.load(f)

## Clustering

In [13]:
import random

# Names of the cluster tables in `doc` that should be extended.
clusters_to_update = [
    "offlinetv",
    "orange",
    "fortnite",
]

# Extend clusters based on chosen algo
for name, info in doc.items():
    if name not in clusters_to_update:
        continue

    # Work on plain strings so set operations and sorting behave the same
    # for names read from the TOML file and names returned by jq.
    existing = [str(creator) for creator in info["creators"]]

    # Skip full clusters before running any (comparatively slow) jq query.
    n_missing = info["max"] - len(existing)
    if n_missing <= 0:
        continue

    find_related = string_to_func[info["algo"]]
    candidates = set()
    for creator in existing:
        candidates.update(find_related(creator, limit=100))
    # Creators that are already members must not use up sample slots;
    # otherwise the cluster can stay below `max` although new names exist.
    candidates.difference_update(existing)

    n_to_add = min(n_missing, len(candidates))
    print(n_to_add)
    # Sample from a sorted list so a fixed random seed gives a reproducible
    # pick (set iteration order is not stable between runs).
    new_creators = random.sample(sorted(candidates), k=n_to_add)

    # Merge, drop duplicates, and sort case-insensitively with the exact
    # spelling as tie-breaker.
    creators = sorted(
        set(existing).union(new_creators),
        key=lambda label: (label.lower(), label),
    )
    # tomlkit.item() converts the Python list itself; tomlkit.array() expects
    # raw TOML text and only accepts a list by accident of its repr().
    info["creators"] = tomlkit.item(creators).multiline(True)
    print(new_creators)

15
['Nmplol', 'xQcOW', 'HasanAbi', 'moistcr1tikal', 'EsfandTV', 'ludwig', 'GMHikaru', 'sodapoppin', 'Mizkif', 'Trainwreckstv', '39daph', 'QTCinderella', 'pokelawls', 'SmallAnt', 'forsen']
13
['ImKaiCenat', 'SypherPK', 'Clix', 'Bugha', 'AdinRoss', 'benjyfishy', 'BLOU', 'ops1x', 'yourragegaming', 'LosPollosTV', 'Fortnite', 'Tyceno', 'Mongraal']
15
['QuarterJade', 'pokimane', 'fuslie', 'ironmouse', 'lilypichu', 'Tectone', 'CDawgVA', 'boxbox', 'nyanners', 'Sykkuno', 'Natsumiii', 'veibae', 'TinaKitten', 'Scarra', 'Enviosity']
5
['TheStockGuy', 'Xaryu', 'nyanners', 'AdmiralBulldog', 'CDNThe3rd']


## Save doc to file

In [12]:
# Write the updated clusters back out. TOML must be UTF-8 encoded, so set
# the encoding explicitly instead of relying on the platform default.
with open("clusters.toml", "w", encoding="utf-8") as f:
    tomlkit.dump(doc, f)