Import data

In [11]:
import requests
import csv

In [25]:
URL = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv'
response = requests.get(URL)
response.raise_for_status()

columns = ['track_id', 'track_name', 'track_artist', 'playlist_genre', 'playlist_subgenre', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']

data = response.text.splitlines()
data = csv.DictReader(data)
data = [{k: v for k, v in row.items() if k in columns} for row in data]
data[:5]

[{'track_id': '6f807x0ima9a1j3VPbc7VN',
  'track_name': "I Don't Care (with Justin Bieber) - Loud Luxury Remix",
  'track_artist': 'Ed Sheeran',
  'playlist_genre': 'pop',
  'playlist_subgenre': 'dance pop',
  'danceability': '0.748',
  'energy': '0.916',
  'key': '6',
  'loudness': '-2.634',
  'mode': '1',
  'speechiness': '0.0583',
  'acousticness': '0.102',
  'instrumentalness': '0',
  'liveness': '0.0653',
  'valence': '0.518',
  'tempo': '122.036',
  'duration_ms': '194754'},
 {'track_id': '0r7CVbZTWZgbTCYdfa2P31',
  'track_name': 'Memories - Dillon Francis Remix',
  'track_artist': 'Maroon 5',
  'playlist_genre': 'pop',
  'playlist_subgenre': 'dance pop',
  'danceability': '0.726',
  'energy': '0.815',
  'key': '11',
  'loudness': '-4.969',
  'mode': '1',
  'speechiness': '0.0373',
  'acousticness': '0.0724',
  'instrumentalness': '0.00421',
  'liveness': '0.357',
  'valence': '0.693',
  'tempo': '99.972',
  'duration_ms': '162600'},
 {'track_id': '1z1Hg7Vb0AhHDiEmnDE79l',
  'tra

Data Preprocessing

In [26]:
# create a one hot encoding for the playlist_genre and playlist_subgenre columns
playlist_genre = set(row['playlist_genre'] for row in data)
playlist_subgenre = set(row['playlist_subgenre'] for row in data)

for row in data:
    row['label'] = {}

for genre in playlist_genre:
    for subgenre in playlist_subgenre:
        column_name = f'{genre}_{subgenre}'
        for row in data:
            row['label'][column_name] = int(row['playlist_genre'] == genre and row['playlist_subgenre'] == subgenre)

# remove the original playlist_genre and playlist_subgenre columns
for row in data:
    del row['playlist_genre']
    del row['playlist_subgenre']

In [27]:
# create a one hot encoding for the key and mode columns
key = set(row['key'] for row in data)
mode = set(row['mode'] for row in data)

for k in key:
    for m in mode:
        column_name = f'keymode_{k}_{m}'
        for row in data:
            row[column_name] = int(row['key'] == k and row['mode'] == m)

# remove the original key and mode columns
for row in data:
    del row['key']
    del row['mode']

In [29]:
data[0]

{'track_id': '6f807x0ima9a1j3VPbc7VN',
 'track_name': "I Don't Care (with Justin Bieber) - Loud Luxury Remix",
 'track_artist': 'Ed Sheeran',
 'danceability': '0.748',
 'energy': '0.916',
 'loudness': '-2.634',
 'speechiness': '0.0583',
 'acousticness': '0.102',
 'instrumentalness': '0',
 'liveness': '0.0653',
 'valence': '0.518',
 'tempo': '122.036',
 'duration_ms': '194754',
 'label': {'pop_neo soul': 0,
  'pop_trap': 0,
  'pop_new jack swing': 0,
  'pop_big room': 0,
  'pop_urban contemporary': 0,
  'pop_permanent wave': 0,
  'pop_gangster rap': 0,
  'pop_southern hip hop': 0,
  'pop_classic rock': 0,
  'pop_progressive electro house': 0,
  'pop_post-teen pop': 0,
  'pop_pop edm': 0,
  'pop_indie poptimism': 0,
  'pop_latin hip hop': 0,
  'pop_electro house': 0,
  'pop_tropical': 0,
  'pop_dance pop': 1,
  'pop_hip hop': 0,
  'pop_album rock': 0,
  'pop_latin pop': 0,
  'pop_electropop': 0,
  'pop_hip pop': 0,
  'pop_hard rock': 0,
  'pop_reggaeton': 0,
  'r&b_neo soul': 0,
  'r&b_t