# Filter non-sleep genres from sleep dataset

In [53]:
import pandas as pd
import numpy as np

In [54]:
# Load the full sleep-playlist track table (the file name indicates that duplicate rows are kept).
df = pd.read_csv('/home/th716/audio-diffusion/spotify_sleep_dataset/Sleep_FullDataset_withDuplicates.csv')

In [55]:
# Inspect which columns the loaded table provides.
df.columns

Index(['Unnamed: 0', 'TrackName', 'TrackID', 'SampleURL', 'ReleaseYear',
       'Genres', 'danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'key', 'mode', 'duration_ms', 'Popularity', 'pNum', 'playlistID',
       'label', 'userCat', 'demoCat', 'length', 'playlistTitle', 'nFoll',
       'nTracks'],
      dtype='object')

In [21]:
# Show the distinct track IDs present in the table.
df['TrackID'].unique()

array(['48mOMNLnlbok3W6anP7sTS', '77Fs2NajDBQaOOZkYDsFLE',
       '5Nusgvqw46McIdfuqrvM4c', ..., '4z2i0TktkIk8WNNC8YuDAC',
       '2FCl1hxO2u70TLtNykj9yQ', '48MaeYhgUAN64sHd9EIkPP'], dtype=object)

In [None]:
import ast
from collections import Counter

# Function to safely parse each entry
def parse_genres(genres):
    """Parse one raw ``Genres`` cell into a list of genre names.

    Args:
        genres: Normally the string form of a Python list, e.g.
            ``"['sleep', 'ambient']"``. A value that is already a
            list/tuple/set (for example because the column was parsed by an
            earlier cell) is accepted and returned as a list.

    Returns:
        list: The genre names. An empty list is returned when the value
        cannot be parsed, or when it parses to something that is not a
        collection (a number, a bare string, a dict, ...), so callers can
        always iterate over the result and use ``in`` for exact membership.
    """
    # Already parsed: hand back a list instead of failing inside literal_eval.
    if isinstance(genres, (list, tuple, set)):
        return list(genres)
    try:
        parsed = ast.literal_eval(genres)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input.
        return []
    # Guard the "always a list" contract: "5" or "'sleep'" parse fine but are
    # not genre lists (iterating a str would yield single characters).
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return []

# Flatten the per-row genre lists into one list, then count each genre.
parsed_genre_lists = df['Genres'].apply(parse_genres)
individual_genres = [name for genre_list in parsed_genre_lists for name in genre_list]
Counter(individual_genres)

In [43]:
# Keep only the parsed genre lists that carry the 'sleep' tag.
parsed_genre_lists = df['Genres'].apply(parse_genres)
paired_with_sleep = [tags for tags in parsed_genre_lists if 'sleep' in tags]

In [48]:
# Gather every genre that appears alongside 'sleep' (excluding 'sleep' itself)
# and tally how often each one occurs.
other = [
    tag
    for tag_list in paired_with_sleep
    for tag in tag_list
    if tag != 'sleep'
]
Counter(other)

Counter({'binaural': 8560,
         'brain waves': 8371,
         'pet calming': 1568,
         'environmental': 1098,
         'new age': 853,
         'meditation': 696,
         'healing': 689,
         'ambient': 473,
         'atmosphere': 473,
         'italian metal': 398,
         'music box': 280,
         'calming instrumental': 175,
         'classify': 175,
         'new age piano': 175,
         'reiki': 64,
         'chakra': 42,
         'world meditation': 40,
         'water': 22,
         'background music': 15,
         'focus': 15,
         'tone': 10,
         'classical guitar': 10,
         'sound': 9,
         'sound effects': 7,
         'kirtan': 4,
         'musica de fondo': 2,
         'pianissimo': 2,
         'relaxative': 2,
         'guided meditation': 1,
         'hypnosis': 1})

In [6]:
import os

# Collect the regular files in the cached waveform directory, then read the
# part of each file name before the first '.' as an integer id.
directory_path = '/home/th716/audio-diffusion/cache/spotify_sleep_dataset/waveform'
file_names = []
for entry in os.listdir(directory_path):
    if os.path.isfile(os.path.join(directory_path, entry)):
        file_names.append(entry)
ids = list(map(lambda name: int(name.split('.')[0]), file_names))

### Filter out non-Sleep music

In [56]:
import ast

# Define a function that tries to convert each entry with literal_eval
def safe_literal_eval(value):
    """Evaluate ``value`` as a Python literal, returning None on failure.

    Args:
        value: Usually a string holding a Python literal such as
            ``"['sleep', 'ambient']"``. A value that is already a list is
            returned unchanged, so applying this function twice to the same
            column does not turn every entry into None.

    Returns:
        The evaluated literal, the input itself when it is already a list,
        or None when the value cannot be evaluated.
    """
    if isinstance(value, list):
        return value
    try:
        return ast.literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input; e.g.
        # "{[1]: 2}" raises TypeError (unhashable key), not ValueError.
        return None

# Parse the 'Genres' strings into Python lists. Values that are already lists
# (this cell was run before on the same frame) are kept as they are, so running
# the cell a second time does not discard every row.
parsed_genres = df['Genres'].apply(
    lambda value: value if isinstance(value, list) else safe_literal_eval(value)
)

# Keep only rows whose parsed value is a real collection containing the exact
# genre 'sleep'. The isinstance check drops rows that could not be parsed
# (None) and stops a bare string from matching by substring
# ('sleep' in 'sleepy' is True for strings).
has_sleep = parsed_genres.apply(
    lambda genres: isinstance(genres, (list, tuple, set)) and 'sleep' in genres
).astype(bool)

# Build the filtered frame in one step: parsed 'Genres' column, sleep rows only.
df = df.assign(Genres=parsed_genres)[has_sleep]


In [60]:
df.iloc[0]

Unnamed: 0                                                         30
TrackName                                Sleep Music With White Noise
TrackID                                        3JEY9Vl3Ti02UWy0e60oIE
SampleURL           https://p.scdn.co/mp3-preview/e0cd5d5935bcae58...
ReleaseYear                                                2019-07-16
Genres                                                        [sleep]
danceability                                                   0.0687
energy                                                         0.0688
loudness                                                      -21.231
speechiness                                                    0.0365
acousticness                                                    0.942
instrumentalness                                                0.948
liveness                                                        0.112
valence                                                        0.0325
tempo               

In [64]:
# Persist the sleep-only subset. index=False keeps the row index out of the
# file: writing it adds an unnamed leading column that comes back as an extra
# 'Unnamed: 0*' column on the next read_csv (the table already carries one
# such column from an earlier save).
df.to_csv('/home/th716/audio-diffusion/spotify_sleep_dataset/sleep_only_dataset.csv', index=False)

# Download samples

In [68]:
# Reload the sleep-only subset from disk; after a CSV round trip the 'Genres'
# column holds strings again rather than Python lists.
df = pd.read_csv('/home/th716/audio-diffusion/spotify_sleep_dataset/sleep_only_dataset.csv')

In [62]:
import os
import requests
import pandas as pd


# Target directory for the downloaded preview clips; created on first use and
# left untouched when it already exists.
save_dir = '/home/th716/audio-diffusion/cache/spotify_sleep_dataset/waveform_sleep_only'
os.makedirs(save_dir, exist_ok=True) 

def download_audio(sample_url, track_id, timeout=30):
    """Download one preview clip and store it as ``<save_dir>/<track_id>.wav``.

    Best-effort: every failure is reported with ``print`` and swallowed so a
    single bad row does not abort a bulk download.

    Args:
        sample_url: URL of the preview clip.
        track_id: Track identifier, used as the output file's base name.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection blocks the calling loop indefinitely.

    Returns:
        None. The result is the file written to the module-level ``save_dir``
        and a status line printed to stdout.
    """
    # NOTE(review): the bytes are written verbatim under a '.wav' name. The
    # sample URL shown in this notebook points at an 'mp3-preview' endpoint, so
    # the files are presumably MP3 data; the name is kept for compatibility
    # with whatever reads this directory. Confirm before relying on it.
    file_path = os.path.join(save_dir, f"{track_id}.wav")
    try:
        # The context manager releases the connection on every path, including
        # the non-200 branch where the streamed body is never read.
        with requests.get(sample_url, stream=True, timeout=timeout) as response:
            if response.status_code == 200:
                # Read the whole body before opening the file so a transfer
                # error cannot leave a truncated file behind.
                payload = response.content
                with open(file_path, 'wb') as f:
                    f.write(payload)
                print(f"Downloaded {track_id}")
            else:
                print(f"Failed to download {track_id}: HTTP {response.status_code}")
    except Exception as e:
        print(f"Error downloading {track_id}: {e}")

# The table contains repeated TrackIDs and the output file name depends only on
# TrackID, so each duplicate row would fetch and overwrite the same file.
# keep='last' mirrors the previous "last write wins" outcome on disk.
unique_tracks = df.drop_duplicates(subset='TrackID', keep='last')

# Iterate over the two needed columns directly instead of building a Series
# per row with iterrows().
for sample_url, track_id in zip(unique_tracks['SampleURL'], unique_tracks['TrackID']):
    download_audio(sample_url, track_id)


Downloaded 3JEY9Vl3Ti02UWy0e60oIE
Downloaded 7uO3OqzBMXbZitbvIVw1BH
Downloaded 313Rc8sUpUa5CAORcxNPEp
Downloaded 5NHQ6e1JkQ5zsvzIRkqDhh
Downloaded 4wProhZjQLuEOiIpCmeLn2
Downloaded 6Qt5jKmgz5o7ysXrvH3r0t
Downloaded 6o2xFVVLruevNscPLTX3K0
Downloaded 7MO1iUivRn2ZT7gzrJpMjl
Downloaded 4NzbaK0wDdEmfr4O9AWWWn
Downloaded 1kaacK69UGqwW6bnKVMdwY
Downloaded 1GlBvRz2nqm2swUUXDSvGb
Downloaded 0tW204RLJbX8oocTStcA7U
Downloaded 1w89BKyfpMNQODRBStfE2B
Downloaded 2sPIXZlUcEOmJFtWxVkErd
Downloaded 3A9LRkenqBDRsp8hOdBTUi
Downloaded 24apu5KgCBFy3H8cd6KgoY
Downloaded 37EgnCatAjdwipGG0vOIHL
Downloaded 2jzY8CgqFHRUsYKvx2S41Q
Downloaded 2WNjbhG629uhfczBzu5exg
Downloaded 1EkP8YwyDkYApSnwV5BIbF
Downloaded 2goCfGfx8HqQGo68XKXy1B
Downloaded 3WpiHFudPhTBPYhm5iZ3Hz
Downloaded 0UiAJUM17RtmhOCSfHU0tT
Downloaded 06dRBmmhqQjKRIDiAP47xI
Downloaded 2Tb9q4EigYecyky2YYTZW2
Downloaded 3OD0Mf5GpOtZOY1aLgWhYf
Downloaded 0wS23HwicjEOqLWeQEODvq
Downloaded 3yaYOmumJtoYEbQLO1lDWW
Downloaded 6stcbLlpURUR2ARkwO93XE
Downloaded 19s

KeyboardInterrupt: 