Import data

In [1]:
import csv
import io

import requests

In [5]:
# Download the TidyTuesday Spotify songs CSV and keep only the fields used below.
URL = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-21/spotify_songs.csv'
# Without a timeout, requests waits forever if the server never answers.
response = requests.get(URL, timeout=30)
response.raise_for_status()

columns = ['track_id', 'track_name', 'track_artist', 'playlist_genre', 'playlist_subgenre', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']

# Feed the csv parser a text stream instead of str.splitlines():
# splitlines() also breaks on Unicode separators (e.g. U+2028, U+0085) that can
# occur inside free-text fields, and it strips the newlines the parser needs to
# reassemble quoted multi-line fields. newline='' leaves line endings untouched
# so the csv module handles them itself.
wanted_columns = frozenset(columns)
reader = csv.DictReader(io.StringIO(response.text, newline=''))

# One dict per track, restricted to `columns` (CSV column order is preserved).
data = [{k: v for k, v in row.items() if k in wanted_columns} for row in reader]

Data Preprocessing

In [6]:
# One-hot encode each track's (playlist_genre, playlist_subgenre) pair into a
# nested `label` dict, then drop the two source columns.
#
# Only combinations that actually occur in the data get a column, so there is
# no need to enumerate the full genre x subgenre product over every row and
# prune the all-zero columns afterwards.
#
# Column names are sorted because iteration order of a set of strings changes
# between interpreter runs (hash randomization); sorting keeps the label column
# order reproducible across kernel restarts.
#
# NOTE: this cell mutates `data` in place and deletes its own input columns, so
# it cannot be re-run without re-running the loading cell first.
playlist_genre = set(row['playlist_genre'] for row in data)
playlist_subgenre = set(row['playlist_subgenre'] for row in data)

label_columns = sorted(
    {f"{row['playlist_genre']}_{row['playlist_subgenre']}" for row in data}
)

for row in data:
    own_column = f"{row['playlist_genre']}_{row['playlist_subgenre']}"
    # exactly one entry is 1: the column named after this track's own pair
    row['label'] = {column: int(column == own_column) for column in label_columns}

    # remove the original playlist_genre and playlist_subgenre columns
    del row['playlist_genre']
    del row['playlist_subgenre']

print(data[0])
print(f'Number of subgenres: {len(data[0]["label"])}')

{'track_id': '6f807x0ima9a1j3VPbc7VN', 'track_name': "I Don't Care (with Justin Bieber) - Loud Luxury Remix", 'track_artist': 'Ed Sheeran', 'danceability': '0.748', 'energy': '0.916', 'key': '6', 'loudness': '-2.634', 'mode': '1', 'speechiness': '0.0583', 'acousticness': '0.102', 'instrumentalness': '0', 'liveness': '0.0653', 'valence': '0.518', 'tempo': '122.036', 'duration_ms': '194754', 'label': {'latin_tropical': 0, 'latin_latin pop': 0, 'latin_reggaeton': 0, 'latin_latin hip hop': 0, 'pop_post-teen pop': 0, 'pop_electropop': 0, 'pop_indie poptimism': 0, 'pop_dance pop': 1, 'rock_hard rock': 0, 'rock_album rock': 0, 'rock_permanent wave': 0, 'rock_classic rock': 0, 'edm_electro house': 0, 'edm_big room': 0, 'edm_progressive electro house': 0, 'edm_pop edm': 0, 'rap_southern hip hop': 0, 'rap_hip hop': 0, 'rap_trap': 0, 'rap_gangster rap': 0, 'r&b_neo soul': 0, 'r&b_urban contemporary': 0, 'r&b_hip pop': 0, 'r&b_new jack swing': 0}}
Number of subgenres: 24


In [7]:
# One-hot encode the categorical `key` and `mode` columns as `key_<value>` and
# `mode_<value>` fields on each track, then drop the two source columns.
key = set(row['key'] for row in data)
mode = set(row['mode'] for row in data)

for source_column, categories in (('key', key), ('mode', mode)):
    # Iterating the raw set would give a column order that changes between
    # interpreter runs (string hash randomization); sorting makes the order
    # reproducible. key=str keeps the sort well-defined even if a value is
    # missing (None).
    for category in sorted(categories, key=str):
        column_name = f'{source_column}_{category}'
        for row in data:
            row[column_name] = int(row[source_column] == category)

# remove the original key and mode columns
for row in data:
    del row['key']
    del row['mode']

Feature engineering: standardization (zero mean, unit variance) of the continuous song features

In [8]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np

# list of continuous features in our dataset
continuous_features = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms'
]

# The CSV reader yields strings: parse every continuous feature as a float,
# except duration_ms, which is parsed as an integer.
parsers = {feat: float for feat in continuous_features}
parsers['duration_ms'] = int

for track in data:
    for feat in continuous_features:
        track[feat] = parsers[feat](track[feat])

# one row per track, one column per continuous feature
features_df = pd.DataFrame(
    [{feat: track[feat] for feat in continuous_features} for track in data]
)

# fit a standard scaler on all tracks and transform them in one step
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features_df)

# write the scaled values back onto the track dicts (rows line up with `data`,
# columns line up with `continuous_features`)
for track, scaled_row in zip(data, scaled_features):
    track.update(zip(continuous_features, scaled_row))

# for track in data:
#     track['duration_ms'] = np.log1p(track['duration_ms'])  # log(x+1) to avoid log(0)
#     track['loudness'] = np.log1p(abs(track['loudness'])) * np.sign(track['loudness'])

In [9]:
import csv

def flatten_dict(d, parent_key='', sep='_'):
    """Return a single-level copy of ``d`` with nested dicts folded into their parent.

    Keys of nested dictionaries are prefixed with the key path leading to them,
    joined by ``sep`` (e.g. ``{'a': {'b': 1}}`` becomes ``{'a_b': 1}``).

    Args:
        d: Dictionary to flatten; values that are dicts are flattened recursively.
        parent_key: Prefix prepended to every key; empty means no prefix.
        sep: Separator placed between the prefix and each key.

    Returns:
        A new flat dict. If two paths produce the same flattened key, the value
        encountered last wins.
    """
    flat = {}
    for name, value in d.items():
        path = name if not parent_key else f"{parent_key}{sep}{name}"
        if isinstance(value, dict):
            # descend, carrying the accumulated key path as the new prefix
            flat.update(flatten_dict(value, path, sep=sep))
        else:
            flat[path] = value
    return flat

# Flatten every track record so the nested `label` entries become top-level fields
flattened_data = [flatten_dict(item) for item in data]

print(type(flattened_data[0]))

# # Get the fieldnames from the keys of the first flattened dictionary
# fieldnames = flattened_data[0].keys()

# # Write the flattened data to CSV
# with open('tracks.csv', 'w', newline='', encoding='utf-8') as csvfile:
#     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
#     writer.writeheader()  # Write the header
#     writer.writerows(flattened_data)  # Write the data

# print("Data written to 'tracks.csv'")

<class 'dict'>
