In [1]:
import matplotlib as mpl
import pandas as pd
import matplotlib.pylab as plt
import matplotlib.dates as mdates
import numpy as np
from datetime import datetime
import networkx as nx
import netwulf as nw
from itertools import combinations
from collections import defaultdict
import random
from scipy import stats
from networkx.algorithms import community
import community
import json
  

# Seed both Python's and NumPy's global random generators with the same value
# so that the random choices made further down start from a fixed state.
seed = 1337
random.seed(seed)
np.random.seed(seed)

# Network
This section of the notebook goes through the network analysis of the data. We have used `networkx` to build the networks and `netwulf` to visualise them. In the following sections we investigate the full network of all musicians as well as subsets of it based on selected genres. The networks are studied by calculating different statistics, such as the number of nodes, number of links, density, clustering and more. In addition, we look at community detection to see how well the different genres manage to partition the networks into communities in comparison to the Louvain algorithm for community detection.

# Load data

In [2]:
# Load the song table from a pickle file and report how many rows it has.
# NOTE(review): unpickling can execute arbitrary code, so 'songData.df' must
# come from a trusted source.
song_data = pd.read_pickle('songData.df')
print(f'Number of songs: {len(song_data)}')

Number of songs: 25419


Network visualisation config.

In [3]:
# reading the data from the file
# Load the network visualisation settings. Despite the .txt extension the file
# is parsed as JSON, so let json read straight from the file handle instead of
# going through an intermediate string.
with open('network_figures/config.txt') as f:
    config = json.load(f)

## Creating the full network
Calculate all genres associated with each artist, as well as how many songs they have made in each genre.

In [4]:
# For every artist, collect the set of genres they appear under
# (artist_genres) and the number of their songs carrying each genre
# (artist_genres_count).
artist_genres = dict()
artist_genres_count = defaultdict(lambda: defaultdict(lambda: 0))
for artists, genres in zip(song_data.artists, song_data.genres):
    song_genres = set(genres)
    for artist in artists:
        # Iterate the raw genre sequence (not the set) so that the counts are
        # accumulated exactly once per listed genre entry.
        for genre in genres:
            artist_genres_count[artist][genre] += 1
        # Update the artist's genre set in place instead of rebuilding it with
        # union() on every song.
        artist_genres.setdefault(artist, set()).update(song_genres)

# Every artist seen above is a key of artist_genres. Dict keys are unique and
# keep insertion order, so this list is in first-appearance order and does not
# depend on string hashing, which keeps the seeded random steps that iterate
# over it reproducible between kernel restarts.
all_artists = list(artist_genres)
print(f'Number of unique artists: {len(all_artists)}')

Number of unique artists: 7855


Creating a genre list from which each artist can get their main genre label. In addition, a colour list to colour each node based on their main genre.

In [5]:
# Fixed list of genres from which each artist's main-genre label is chosen.
genre_list = [
    'pop', 'rock', 'rap', 'r&b', 'country', 'soul',
    'singer-songwriter', 'pop-rock', 'trap', 'ballad',
    'soul pop', 'eighties', 'seventies', 'soundtrack',
    'hip-hop', 'funk', 'dance', 'electronic', 'folk', 'cover',
    'jazz', 'blues',
]

In [6]:
# Hex colours used to colour nodes by main genre: one per entry of genre_list
# plus a final one for the fallback label 'other'.
colour_list = [
    '#E74C3C', '#9B59B6', '#3498DB', '#1ABC9C', '#27AE60', '#F4D03F',
    '#E67E22', '#EDB9B9', '#E7E9B9', '#B9EDE0', '#B9D7ED', '#DCB9ED',
    '#8F2323', '#8F6A23', '#4F8F23', '#23628F', '#6B238F', '#AED6F1',
    '#A3E4D7', '#D4AC0D', '#D7BDE2', '#F5B7B1', '#0A2ADA',
]

In [7]:
# Map every genre label (including the fallback label 'other') to its colour
# by pairing the labels with colour_list position by position.
colour_dict = dict(zip(genre_list + ['other'], colour_list))

Calculate number of songs each artist has in the data set as well as how many times they have collaborated with other artists.

In [8]:
# artist_count: number of songs each artist is credited on.
# artist_colab_count[a][b]: number of songs on which a and b are both credited.
artist_count = defaultdict(int)
artist_colab_count = defaultdict(lambda: defaultdict(int))

for artists in song_data.artists:
    for artist in artists:
        artist_count[artist] += 1
        # Everyone else credited on the same song counts as a collaborator.
        partners = (other for other in artists if other != artist)
        for partner in partners:
            artist_colab_count[artist][partner] += 1

### Add nodes
Add each artist as a node with four attributes
> *genre*: most common genre for that artist within the fixed list 'genre_list' ('other' if none of the artist's genres is in the list)

> *size*: number of times the artist has appeared on the Billboard Hot 100 (used to give each node the correct size)

> *all_genres*: all genres associated with that artist

> *group*: the colour assigned to the artist's main genre (used to colour each node)

In [9]:
# Build the graph with one node per artist. Each node carries:
#   genre      - the artist's most frequent genre among genre_list ('other'
#                when none of the artist's genres is in the list)
#   size       - number of songs the artist is credited on
#   all_genres - every genre associated with the artist
#   group      - the colour assigned to the main genre
G = nx.Graph()
for artist in all_artists:
    most_occurences = 0
    max_key = 'other'
    # Ties between equally frequent genres are broken at random by visiting
    # the genres in shuffled order. Shuffle a copy: shuffling genre_list itself
    # would permanently reorder the global list, so re-running the cell that
    # pairs genre_list with colour_list would silently change the colours.
    candidate_genres = genre_list.copy()
    random.shuffle(candidate_genres)

    genre_counts = artist_genres_count[artist]
    for genre in candidate_genres:
        # Strict '>' keeps the first genre reached for a given maximum count.
        occurences = genre_counts.get(genre, 0)
        if occurences > most_occurences:
            most_occurences = occurences
            max_key = genre

    G.add_node(artist,
               genre=max_key,
               size=artist_count[artist],
               all_genres=artist_genres[artist],
               group=colour_dict[max_key])

print(f'Number of nodes: {G.number_of_nodes()}')

Number of nodes: 7855


### Add edges
Add an edge between two artists if they have collaborated on a song, and weight the edge by the number of times they have collaborated.

In [11]:
# Collect one (artist_a, artist_b, weight) triple per pair of artists credited
# on the same song, where weight is the number of songs the two share.
# A dict is used as an insertion-ordered set: duplicates collapse, additions
# are O(1) (no set rebuilt per pair), and the resulting edge order does not
# depend on string hashing.
unique_links = {}
for artists in song_data.artists:
    # combinations() yields nothing for songs with fewer than two artists.
    for first, second in combinations(artists, 2):
        if first != second:
            unique_links[(first, second, artist_colab_count[first][second])] = None

linked_artists = list(unique_links)
G.add_weighted_edges_from(linked_artists)
print(f'Number of edges: {G.number_of_edges()}')

Number of edges: 6799


### Helper functions

In [12]:
def randomized_graph(graph, N):
    """Return a copy of ``graph`` rewired by ``N`` double edge swaps.

    Each swap picks two edges (u, v) and (x, y) uniformly at random and
    replaces them with (u, y) and (v, x), so every node keeps its degree.

    Parameters
    ----------
    graph : graph object
        The graph to randomise. It is copied; the original is left untouched.
    N : int
        Number of successful swaps to perform.

    Returns
    -------
    The rewired copy of ``graph``.

    Notes
    -----
    * A candidate pair is rejected unless its four endpoints are distinct,
      which rules out self-loops and swapping an edge with itself or with a
      neighbouring edge. It is also rejected when either replacement edge
      already exists.
    * Replacement edges are added without attributes, so attributes of the
      removed edges (e.g. weights) are not carried over.
    * The loop only ends after ``N`` successful swaps; it does not terminate
      if the graph admits no valid swap.
    """
    g = graph.copy()
    swaps = 0
    while swaps < N:
        # One snapshot per attempt; the edge set only changes after a swap.
        edges = list(g.edges())
        u, v = random.choice(edges)
        x, y = random.choice(edges)
        if len({u, v, x, y}) < 4:
            continue
        if g.has_edge(u, y) or g.has_edge(v, x):
            continue
        g.remove_edges_from([(u, v), (x, y)])
        g.add_edges_from([(u, y), (v, x)])
        swaps += 1
    return g

In [13]:
def get_network_by_genre(G, genre):
    """Return the subgraph of ``G`` made up of nodes tagged with ``genre``.

    A node is kept when ``genre`` is contained in its ``all_genres``
    attribute; the result is whatever ``G.subgraph`` returns for those nodes.
    """
    members = []
    for name, attrs in G.nodes(data=True):
        if genre in attrs['all_genres']:
            members.append(name)
    return G.subgraph(members)

In [14]:
def get_partitioning(filtered_graph):
    """Split ``filtered_graph`` into one subgraph per ``group`` value.

    The distinct values of the ``group`` node attribute are collected first;
    for each of them the nodes carrying that value are turned into a subgraph
    of ``filtered_graph``. The subgraphs are returned as a list.
    """
    group_labels = set(nx.get_node_attributes(filtered_graph, 'group').values())
    return [
        filtered_graph.subgraph(
            [
                name
                for name, attrs in filtered_graph.nodes(data=True)
                if attrs.get("group") == label
            ]
        )
        for label in group_labels
    ]

In [15]:
def modularity(graph, partitioning):
    """Compute the modularity of ``partitioning`` on ``graph``.

    Uses M = sum over communities c of  L_c / L - (k_c / (2 L))**2,  where
    L is the number of edges in ``graph``, L_c the number of edges inside
    community c, and k_c the summed degree (taken in ``graph``) of the nodes
    in c. Degrees and edge counts are unweighted.

    Parameters
    ----------
    graph : graph object
        The full graph the communities belong to.
    partitioning : iterable of graph objects
        One subgraph of ``graph`` per community.

    Returns
    -------
    The modularity value. For a graph without edges the formula is undefined
    (division by zero), so 0.0 is returned instead of raising.
    """
    L = graph.number_of_edges()
    if L == 0:
        return 0.0
    M = 0
    for subgraph in partitioning:
        Lc = subgraph.number_of_edges()
        kc = sum(graph.degree[node] for node in subgraph.nodes())
        M += Lc / L - (kc / (2 * L)) ** 2
    return M

## Analysis
The full network has now been created and we are ready to do visualisations and analysis. In the following sections we will be working with the full network and sub-networks for the genres: _pop_, _rap_, _rock_, _R&B_, _country_, _funk_, _folk_ and _blues_. For each of the networks we will be investigating the full network as well as versions of the network where singleton nodes with fewer than 5 songs are removed.

**TODO:** A good argument is still missing here for why we have chosen to remove singletons with fewer than 5 songs.

### With singletons

In [None]:
# Visualise the full graph with netwulf using the loaded config. Only the
# first return value of visualize() is kept; it is handed to draw_netwulf,
# which returns the figure and axes.
network_G, _ = nw.visualize(G, config=config, plot_in_cell_below=False)
fig, ax = nw.draw_netwulf(network_G)
# plt.savefig("network_figures/G.pdf")