## Data extraction

In [1]:
import os

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from time import sleep

### Connect to `spotipy`

In [2]:
# Credentials are read from the environment; a missing variable raises KeyError.
client_id = os.environ["SPOTIPY_CLIENT_ID"]
client_secret = os.environ["SPOTIPY_CLIENT_SECRET"]

# Build the client-credentials manager and hand it to the client used by
# every request in this notebook.
client_credentials_manager = SpotifyClientCredentials(
    client_secret=client_secret,
    client_id=client_id,
)
sp = spotipy.Spotify(
    retries=2,
    client_credentials_manager=client_credentials_manager,
)

### Fetch a list of top 500 Polish artists with IDs

In [3]:
def get_top_artists(q, max_artists=500):
    """Search Spotify for artists matching ``q`` and return them as a DataFrame.

    Results are requested page by page through the notebook-level ``sp``
    client, pausing one second after each non-empty page.

    Args:
        q (str): search query passed to ``sp.search``.
        max_artists (int): upper bound on the number of artists requested.
            The last page is shrunk so that no more than this many are asked
            for, even when the value is not a multiple of the page size.

    Returns:
        pandas.DataFrame: one row per returned artist with the columns
        ``id``, ``name``, ``genres``, ``popularity`` and ``followers``.
        The columns are present even when the search returns nothing.
    """
    page_size = 50  # number of artists asked for in every full page
    columns = ["id", "name", "genres", "popularity", "followers"]

    records = []
    for offset in range(0, max_artists, page_size):
        # Shrink the final request so that max_artists is never exceeded.
        limit = min(page_size, max_artists - offset)
        page = sp.search(q=q, type="artist", limit=limit, offset=offset)
        items = page["artists"]["items"]
        if not items:
            # An empty page means the results are exhausted; skip the
            # remaining requests instead of fetching more empty pages.
            break

        records.extend(
            {
                "id": artist["id"],
                "name": artist["name"],
                "genres": artist["genres"],
                "popularity": artist["popularity"],
                "followers": artist["followers"]["total"],
            }
            for artist in items
        )
        sleep(1)

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(records, columns=columns)

In [4]:
# Reuse the cached node list when it exists; otherwise query Spotify.
if not os.path.exists("../data/nodes.csv"):
    all_nodes = get_top_artists(max_artists=500, q='genre:"polish"')
else:
    all_nodes = pd.read_csv("../data/nodes.csv")

display(all_nodes)

Unnamed: 0,id,name,genres,popularity,followers
0,7y97mc3bZRFXzT2szRM4L4,Frédéric Chopin,"['classical', 'early romantic era', 'polish cl...",73,3217463
1,6iqDK7aHVlwGGgPmcdSK5L,Tribbs,"['polish pop', 'polish viral pop']",69,147928
2,1T4HxOYolAEb5PadIVKdWZ,Gibbs,['polish hip hop'],70,375421
3,7CJgLPEqiIRuneZSolpawQ,Taco Hemingway,"['polish hip hop', 'polish trap']",69,1453560
4,0LX2VNf5w4iOHW1yyIqb74,Bedoes 2115,"['polish hip hop', 'polish trap']",69,1500183
...,...,...,...,...,...
493,6YpCzWpIXRTdLlpPgDcMJZ,Łona,"['polish alternative rap', 'polish old school ...",40,11356
494,1H8lEfbUpbcXWFqbW9BMro,WŁODAR,['polish viral pop'],40,27681
495,1CzKORB9IN0EjPEyeKBIkf,Tides From Nebula,"['cosmic post-rock', 'instrumental post-rock',...",30,50548
496,6gG1FOW5CoylM7858JYboD,Piotr Rogucki,"['polish alternative', 'sung poetry']",40,74127


### Find collaborations for each artist

In [5]:
def _iter_pages(first_page):
    """Yield ``first_page`` and every page that follows it via its ``next`` link."""
    page = first_page
    while page:
        yield page
        if not page.get("next"):
            break
        sleep(1)  # pause between consecutive API requests
        page = sp.next(page)


def get_artist_tracks(artist_id):
    """Collect the tracks of every album returned for an artist.

    Both the album listing and each album's track listing are paged results;
    all pages are followed so that long discographies and long albums are
    not cut off after the first page.

    Args:
        artist_id (str): the Spotify ID of the artist

    Returns:
        list: the track items of all albums, in the order they were returned.
    """
    album_pages = _iter_pages(sp.artist_albums(artist_id, album_type="album"))
    album_ids = [album["id"] for page in album_pages for album in page["items"]]

    tracks = []
    for album_id in album_ids:
        for page in _iter_pages(sp.album_tracks(album_id)):
            tracks.extend(page["items"])
        sleep(1)
    return tracks


def get_artist_collaborations(artist_id, limit_to_ids=None):
    """Count how often an artist shares a track with each other artist.

    Args:
        artist_id (str): the Spotify ID of the primary artist
        limit_to_ids: optional iterable of artist IDs; when given, only
            collaborators whose ID is contained in it are counted. ``None``
            (the default) applies no restriction.

    Returns:
        pandas.DataFrame: one row per collaborator with the columns
        ``primary_artist_id``, ``featured_artist_id`` and
        ``collaboration_count`` (number of tracks shared with that artist).
    """
    # A set gives constant-time membership tests and accepts any iterable.
    allowed_ids = None if limit_to_ids is None else set(limit_to_ids)

    tracks = get_artist_tracks(artist_id)
    collaborations = [
        {"featured_artist_id": artist["id"], "primary_artist_id": artist_id}
        for track in tracks
        for artist in track["artists"]
        if artist["id"] != artist_id
        and (allowed_ids is None or artist["id"] in allowed_ids)
    ]
    # Explicit columns keep the grouping keys available when nothing matched.
    collaborations_df = pd.DataFrame(
        collaborations, columns=["primary_artist_id", "featured_artist_id"]
    )

    # One row per (primary, featured) pair with the number of shared tracks.
    collaborations_df = (
        collaborations_df.groupby(["primary_artist_id", "featured_artist_id"])
        .size()
        .reset_index(name="collaboration_count")
    )

    return collaborations_df

#### Repeat for all artists

In [6]:
from tqdm import tqdm


def get_collaborations_for_all_top_artists(artists):
    """Build the edge list of collaborations among the given artists.

    For every artist, collaborations are looked up with
    ``get_artist_collaborations`` and restricted to the other artists in
    ``artists``.

    Args:
        artists (pandas.DataFrame): frame with an ``id`` column of artist IDs

    Returns:
        pandas.DataFrame: all per-artist results stacked under a fresh
        0..n-1 index, with the columns ``primary_artist_id``,
        ``featured_artist_id`` and ``collaboration_count``. An empty frame
        with these columns is returned when ``artists`` has no rows.
    """
    artist_ids = artists["id"].tolist()

    print("Fetching collaborations for each top artist...")
    collaborations = [
        # Only collaborations with other artists from the same list are kept.
        get_artist_collaborations(artist_id, artist_ids)
        for artist_id in tqdm(artist_ids)
    ]

    if not collaborations:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(
            columns=["primary_artist_id", "featured_artist_id", "collaboration_count"]
        )

    # ignore_index avoids repeated index labels from the per-artist frames.
    return pd.concat(collaborations, ignore_index=True)

In [7]:
# Reuse the cached edge list when it exists; otherwise crawl every artist.
if os.path.exists("../data/edges.csv"):
    all_edges = pd.read_csv("../data/edges.csv")
else:
    all_edges = get_collaborations_for_all_top_artists(all_nodes)

### Summary

#### Nodes list

In [8]:
# Preview the first ten artists (nodes).
display(all_nodes.iloc[:10])

Unnamed: 0,id,name,genres,popularity,followers
0,7y97mc3bZRFXzT2szRM4L4,Frédéric Chopin,"['classical', 'early romantic era', 'polish cl...",73,3217463
1,6iqDK7aHVlwGGgPmcdSK5L,Tribbs,"['polish pop', 'polish viral pop']",69,147928
2,1T4HxOYolAEb5PadIVKdWZ,Gibbs,['polish hip hop'],70,375421
3,7CJgLPEqiIRuneZSolpawQ,Taco Hemingway,"['polish hip hop', 'polish trap']",69,1453560
4,0LX2VNf5w4iOHW1yyIqb74,Bedoes 2115,"['polish hip hop', 'polish trap']",69,1500183
5,6EB8VE9f7Ut6NOgviN6gDW,Dawid Podsiadło,"['polish pop', 'talent show']",65,2044864
6,0Wi2fADbhwXlPUWxBmzo99,Szpaku,['polish hip hop'],68,784319
7,1Kjs5u8GQf6zCFdTj6SI9E,Malik Montana,"['polish drill', 'polish hip hop', 'polish trap']",68,1085397
8,3U5Oag04Yl2WnvPULOlsMD,Kukon,"['polish hip hop', 'polish trap']",68,562717
9,4nPxrGG7k7aEKmNLsfX4cd,White 2115,"['polish hip hop', 'polish trap']",68,1210712


#### Edges list

In [9]:
# Preview the first ten collaborations (edges).
display(all_edges.iloc[:10])

Unnamed: 0,primary_artist_id,featured_artist_id,collaboration_count
0,7y97mc3bZRFXzT2szRM4L4,0HC5DGqdUzXorIXUudkeWG,3
1,7y97mc3bZRFXzT2szRM4L4,7Im00DCJCJrFrC1Ho6vjD6,6
2,6iqDK7aHVlwGGgPmcdSK5L,3ppWDN3lGw7UOGY7z2EQLB,1
3,6iqDK7aHVlwGGgPmcdSK5L,5hqRsNHDZH1jHzI9LgxFRZ,1
4,6iqDK7aHVlwGGgPmcdSK5L,67VM1TW9hWE9hlVYcmioaH,1
5,6iqDK7aHVlwGGgPmcdSK5L,6Lf4vAUaFUR2jAsybC7cGV,1
6,6iqDK7aHVlwGGgPmcdSK5L,76Uu7lnLuTOmH2eZsKZTan,2
7,1T4HxOYolAEb5PadIVKdWZ,0BBB9DjvskQV0oReJMxTP1,1
8,1T4HxOYolAEb5PadIVKdWZ,0Wi2fADbhwXlPUWxBmzo99,3
9,1T4HxOYolAEb5PadIVKdWZ,0bNZDC5iASejId9tdv3kiR,43


#### Edges list with mapped node metadata

In [10]:
# Attach node metadata to both endpoints of every edge: the first merge joins
# on the primary artist's ID, the second on the featured artist's ID, with the
# clashing node columns told apart by the "_primary" / "_featured" suffixes.
all_edges_meta = (
    all_edges.merge(
        all_nodes, left_on="primary_artist_id", right_on="id", suffixes=[None, None]
    )
    .merge(
        all_nodes,
        left_on="featured_artist_id",
        right_on="id",
        suffixes=["_primary", "_featured"],
    )
    .loc[
        :,
        [
            "collaboration_count",
            "name_primary",
            "name_featured",
            "genres_primary",
            "genres_featured",
        ],
    ]
)

# Spot-check the result for a single primary artist.
display(all_edges_meta.loc[all_edges_meta["name_primary"] == "Taco Hemingway"])

Unnamed: 0,collaboration_count,name_primary,name_featured,genres_primary,genres_featured
34,2,Taco Hemingway,Bedoes 2115,"['polish hip hop', 'polish trap']","['polish hip hop', 'polish trap']"
35,1,Taco Hemingway,Szpaku,"['polish hip hop', 'polish trap']",['polish hip hop']
36,1,Taco Hemingway,Daria Zawiałow,"['polish hip hop', 'polish trap']","['polish alternative', 'polish indie', 'polish..."
37,1,Taco Hemingway,KęKę,"['polish hip hop', 'polish trap']",['polish hip hop']
38,2,Taco Hemingway,CatchUp,"['polish hip hop', 'polish trap']","['polish alternative rap', 'polish hip hop']"
39,26,Taco Hemingway,Quebonafide,"['polish hip hop', 'polish trap']",['polish hip hop']
40,2,Taco Hemingway,Oki,"['polish hip hop', 'polish trap']",['polish hip hop']
41,1,Taco Hemingway,Kaz Bałagane,"['polish hip hop', 'polish trap']","['polish hip hop', 'polish trap']"
42,1,Taco Hemingway,Kizo,"['polish hip hop', 'polish trap']","['polish hip hop', 'polish trap']"
43,3,Taco Hemingway,schafter,"['polish hip hop', 'polish trap']","['polish hip hop', 'polish trap']"


### Save nodes and edges to file

In [11]:
# Create the output directory first so the results of a long extraction run
# are not lost to a missing folder at the final write.
os.makedirs("../data", exist_ok=True)

# index=False keeps the row index out of the files.
all_nodes.to_csv("../data/nodes.csv", index=False)
all_edges.to_csv("../data/edges.csv", index=False)