In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import ast
import networkx as nx
import community as community_louvain
from collections import Counter, defaultdict
import netwulf as nw
import numpy as np
from Netwulf_plot_functions import netwulf_plot_communities
from community import community_louvain 
import pickle

In [19]:
nodes_df = pd.read_csv('nodes.csv')
edges_df = pd.read_csv('edges.csv')
nodes_df['genres'] = nodes_df['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])
artists_us_df = pd.read_csv("most_listened_artists_in_US_dataset.csv", index_col=0)
artists_us_name_list = (artists_us_df.index).tolist()
nodes_df = nodes_df[nodes_df["name"].isin(artists_us_name_list)]
duplicate_names = nodes_df[nodes_df.duplicated(subset="name", keep=False)].sort_values(by="name")
final_cleaned_nodes_df = nodes_df.sort_values("followers", ascending=False)
final_cleaned_nodes_df = nodes_df.drop_duplicates(subset="name", keep="first")

In [21]:
def split_artists_by_genre(nodes_df, genre_names, exclusive=False):    
    nodes_df = nodes_df.copy()

    filtered_dfs = []

    for genre in genre_names:
        if exclusive:
            other_genres = [g for g in genre_names if g != genre]
            
            df = nodes_df[
                nodes_df['genres'].apply(
                    lambda genres: genre in ' '.join(genres) and all(other not in ' '.join(genres) for other in other_genres)
                )
            ].copy()
        else:
             df = nodes_df[
                nodes_df['genres'].apply(
                    lambda genres: genre in ' '.join(genres)
                )
            ].copy()
        
        filtered_dfs.append(df)

    return filtered_dfs

def get_Graph_with_names(nodes_df, edges_df, verbose=True):
    nodes_df = nodes_df.dropna(subset=['spotify_id', 'name'])
    edges_df = edges_df.dropna(subset=['id_0', 'id_1'])

    # Build ID to name map
    id_to_name = dict(zip(nodes_df['spotify_id'], nodes_df['name']))
    allowed_ids = set(id_to_name.keys())

    # Create graph with artist names as nodes
    G = nx.Graph()

    for _, row in nodes_df.iterrows():
        artist_name = row['name']
        followers = row['followers'] if not pd.isna(row['followers']) else 0
        popularity = row['popularity'] if not pd.isna(row['popularity']) else 0
        genres = row['genres']
        chart_hits = row['chart_hits'] if not pd.isna(row['chart_hits']) else []

        G.add_node(artist_name, followers=followers, popularity=popularity,
                   genres=genres, chart_hits=chart_hits)

    for _, row in edges_df.iterrows():
        id_0, id_1 = row['id_0'], row['id_1']
        if id_0 in allowed_ids and id_1 in allowed_ids:
            name_0 = id_to_name.get(id_0)
            name_1 = id_to_name.get(id_1)
            if name_0 and name_1:
                G.add_edge(name_0, name_1)

    if verbose:
        print(f"Number of nodes: {G.number_of_nodes()}")
        print(f"Number of edges: {G.number_of_edges()}")
        print(f"Network density: {nx.density(G)}")

    return G

def get_n_largest_component(G, n):
    return sorted(nx.connected_components(G), key=len, reverse=True)[n]

def apply_louvain(G, verbose=True):
    partition = community_louvain.best_partition(G)

    nx.set_node_attributes(G, partition, 'community')
    
    if verbose:
        community_sizes = Counter(partition.values())
        sorted_communities = sorted(community_sizes.items(), key=lambda x: x[1], reverse=True)
        for community_id, size in sorted_communities:
            print(f"Community {community_id}: {size} nodes")
    
    communities = defaultdict(list)
    for node, community_id in partition.items():
        communities[community_id].append(node)
    
    sorted_communities = sorted(communities.items(), key=lambda x: len(x[1]), reverse=True)
    return sorted_communities
        
def get_community_subgraph(G, community_id):
    nodes_in_community = [n for n, attr in G.nodes(data=True) if attr.get('community') == community_id]
    
    subgraph = G.subgraph(nodes_in_community).copy()
    
    return subgraph

def get_combined_louvain_splitted_graph(G):
    _G = G.copy()
    partition = community_louvain.best_partition(_G)
    nx.set_node_attributes(_G, partition, 'community')

    combined_G = nx.Graph()
    
    combined_G.add_nodes_from(_G.nodes(data=True))

    for u, v in _G.edges():
        if partition[u] == partition[v]:
            combined_G.add_edge(u, v, **_G[u][v])

    return combined_G

def print_top_collaborators(G, n):
    top_collaborators = sorted(G.degree, key=lambda x: x[1], reverse=True)[:n]

    print(f"Top {n} artists with the most collaborations:")
    for rank, (artist, degree) in enumerate(top_collaborators, start=1):
        print(f"{rank}. {artist} — {degree} collaborations")


In [22]:
df_tes = pd.read_csv("most_listened_artists_in_US_dataset.csv")
df_us = df_tes[df_tes["Country"] == "US"]
df_us = df_us.rename(columns={"Name": "name"})
us_names = df_us["name"].tolist()

In [23]:
final_df = final_cleaned_nodes_df[final_cleaned_nodes_df["name"].isin(us_names)]

In [24]:
final_df

Unnamed: 0,spotify_id,name,followers,popularity,genres,chart_hits
15,3xs0LEzcPXtgNfMNcHzLIP,Rockwell,40344.0,58,[],"['us (1)', 'gb (1)', 'at (1)', 'be (1)', 'ca (..."
54,0LyOADBjj28cbvJWTXUEGA,Loren Allred,111839.0,58,"[deep talent show, hollywood, movie tunes]","['us (1)', 'gb (2)', 'au (2)', 'at (1)', 'cz (..."
85,7wU2WGCJ8HxkekHHE2QLul,Fuego,253571.0,67,"[latin hip hop, rap dominicano, reggaeton, reg...","['ar (3)', 'bo (2)', 'cl (2)', 'co (3)', 'cr (..."
88,2l35CQqtYRh3d8ZIiBep4v,MKTO,995449.0,66,"[dance pop, pop, pop rap, pop rock, post-teen ...","['us (3)', 'gb (2)', 'au (5)', 'at (1)', 'be (..."
152,7h1amg0X7Q5d0D5KfuyKTg,Pyrex,29130.0,57,"[italian hip hop, trap italiana]",['it (3)']
...,...,...,...,...,...,...
154021,5oFkj1qSlyBUmV5d6Edgtq,Cozz,175894.0,54,[underground hip hop],
154404,5YCU9eHY4IYTyNa8XRFuw9,Alex Hosking,3743.0,48,[],
155001,3nQ59hkeciYoR6RreQL4Rv,Darren Criss,117746.0,52,[],
155024,79QO0Xmn1dZhvaLicS2Yrs,The Night Game,37079.0,44,"[indie poptimism, modern alternative rock]",


In [61]:
final_df[final_df["name"].isin(lyrics_supported)]

Unnamed: 0,spotify_id,name,followers,popularity,genres,chart_hits
85,7wU2WGCJ8HxkekHHE2QLul,Fuego,253571.0,67,"[latin hip hop, rap dominicano, reggaeton, reg...","['ar (3)', 'bo (2)', 'cl (2)', 'co (3)', 'cr (..."
88,2l35CQqtYRh3d8ZIiBep4v,MKTO,995449.0,66,"[dance pop, pop, pop rap, pop rock, post-teen ...","['us (3)', 'gb (2)', 'au (5)', 'at (1)', 'be (..."
152,7h1amg0X7Q5d0D5KfuyKTg,Pyrex,29130.0,57,"[italian hip hop, trap italiana]",['it (3)']
171,7EK1bQADBoqbYXnT4Cqv9w,John Denver,1646622.0,67,"[classic country pop, folk, folk rock, mellow ...","['gb (1)', 'be (2)', 'ca (2)', 'ee (1)', 'hu (..."
231,2vm8GdHyrJh2O2MfbQFYG0,Ingrid Michaelson,780402.0,59,"[acoustic pop, ectofolk, lilith, neo mellow, p...","['us (1)', 'at (1)', 'ca (2)', 'cz (1)', 'ee (..."
...,...,...,...,...,...,...
150515,7ahuvq1mbb4idwG1iJbSFG,Cassadee Pope,231195.0,54,"[contemporary country, country, country pop, p...",
151120,6x9QLdzo6eBZxJ1bHsDkjg,Sisqo,806274.0,58,"[contemporary r&b, hip hop, hip pop, r&b, urba...",
152233,7krUxybhp1bUwFBxpOtmZb,La Kuppe,288590.0,53,[cumbia pop],
153396,15Dh5PvHQj909E0RgAe0aN,Nivea,645425.0,47,"[atl hip hop, contemporary r&b, hip pop, pop r...",


In [56]:
with open("pop_df.pkl", "rb") as f: 
    pop = pickle.load(f)
with open("rap_df.pkl", "rb") as f: 
    rap = pickle.load(f)
lyrics_supported = []

In [57]:
lyrics_supported += pop[~pop["lyrics"].isnull()]["name"].tolist()
lyrics_supported += rap[~rap["lyrics"].isnull()]["name"].tolist()

In [25]:
pop_df, rap_df = split_artists_by_genre(final_df, ["pop", "rap"])

print(len(pop_df), len(rap_df))
G_pop = get_Graph_with_names(pop_df, edges_df)
G_rap = get_Graph_with_names(rap_df, edges_df)

535 319
Number of nodes: 535
Number of edges: 1983
Network density: 0.01388217998529875
Number of nodes: 319
Number of edges: 2138
Network density: 0.04215216576960233


In [26]:
communities = apply_louvain(G_rap)
largest_community_id = communities[0][0]

G_largest_rap_community = get_community_subgraph(G_rap, largest_community_id)
G_largest_rap_community_splitted = get_combined_louvain_splitted_graph(G_largest_rap_community)

Community 3: 83 nodes
Community 4: 64 nodes
Community 7: 45 nodes
Community 15: 45 nodes
Community 12: 18 nodes
Community 38: 15 nodes
Community 20: 2 nodes
Community 1: 1 nodes
Community 2: 1 nodes
Community 9: 1 nodes
Community 10: 1 nodes
Community 13: 1 nodes
Community 14: 1 nodes
Community 16: 1 nodes
Community 17: 1 nodes
Community 18: 1 nodes
Community 19: 1 nodes
Community 22: 1 nodes
Community 24: 1 nodes
Community 25: 1 nodes
Community 26: 1 nodes
Community 27: 1 nodes
Community 28: 1 nodes
Community 29: 1 nodes
Community 30: 1 nodes
Community 32: 1 nodes
Community 33: 1 nodes
Community 34: 1 nodes
Community 35: 1 nodes
Community 37: 1 nodes
Community 39: 1 nodes
Community 41: 1 nodes
Community 42: 1 nodes
Community 43: 1 nodes
Community 44: 1 nodes
Community 45: 1 nodes
Community 46: 1 nodes
Community 47: 1 nodes
Community 48: 1 nodes
Community 49: 1 nodes
Community 50: 1 nodes
Community 51: 1 nodes
Community 52: 1 nodes
Community 53: 1 nodes
Community 36: 1 nodes
Community 

In [27]:
communities = community_louvain.best_partition(G_rap)

colors = ['#e57468', '#68e574', '#7468e5', '#e5d068', '#68d0e5']

netwulf_plot_communities(G_rap, communities, port=9981, color_palette=colors, path="Pop_network.pdf")

tmpgraph.json


The default value will be `edges="edges" in NetworkX 3.6.


  nx.node_link_data(G, edges="links") to preserve current behavior, or
  nx.node_link_data(G, edges="edges") for forward compatibility.


TypeError: 'NoneType' object is not subscriptable