In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import librosa
import numpy as np
import matplotlib.pyplot as plt
import requests
from io import BytesIO
from IPython.display import Audio
import pandas as pd
import csv

In [3]:
client_id = "5e014985ad3b448a9a1941678627bcb4"
client_secret = "45bfe7fe93c14e578a40ac0dad5f63df"
spotify = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret), requests_timeout=10, retries=3)
spotify.trace = True

In [4]:
df = pd.read_csv("278k_labelled_uri.csv")
df.drop(columns=["Unnamed: 0"], inplace=True)
df.rename(columns={"Unnamed: 0.1": "song_id"}, inplace=True)
df.head()

Unnamed: 0,song_id,duration (ms),danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,spec_rate,labels,uri
0,0,195000.0,0.611,0.614,-8.815,0.0672,0.0169,0.000794,0.753,0.52,128.05,3.446154e-07,2,spotify:track:3v6sBj3swihU8pXQQHhDZo
1,1,194641.0,0.638,0.781,-6.848,0.0285,0.0118,0.00953,0.349,0.25,122.985,1.464234e-07,1,spotify:track:7KCWmFdw0TzoJbKtqRRzJO
2,2,217573.0,0.56,0.81,-8.029,0.0872,0.0071,8e-06,0.241,0.247,170.044,4.00785e-07,1,spotify:track:2CY92qejUrhyPUASawNVRr
3,3,443478.0,0.525,0.699,-4.571,0.0353,0.0178,8.8e-05,0.0888,0.199,92.011,7.959809e-08,0,spotify:track:11BPfwVbB7vok7KfjBeW4k
4,4,225862.0,0.367,0.771,-5.863,0.106,0.365,1e-06,0.0965,0.163,115.917,4.693131e-07,1,spotify:track:3yUJKPsjvThlcQWTS9ttYx


In [11]:
import pandas as pd
import numpy as np
import requests
import librosa
import csv
import gc
from io import BytesIO
import time
from pathlib import Path

def process_tracks_by_label(df, spotify, tracks_per_label=100):
    dataframe_dict = {
        "song_id": [], "duration (ms)": [], "danceability": [], 
        "energy": [], "loudness": [], "speechiness": [], 
        "acousticness": [], "instrumentalness": [], "liveness": [], 
        "valence": [], "tempo": [], "spec_rate": [], "uri": [], 
        "labels": []
    }
    
    # Crear directorio para mel spectrogramas si no existe
    mel_dir = Path("mel_spectrograms")
    mel_dir.mkdir(exist_ok=True)
    
    for label, grupo in df.groupby('labels'):
        print(f"\n=== Procesando label {label} ===")
        agregados = 0
        
        # Procesar en lotes de 20 canciones para evitar rate limiting
        for i in range(0, len(grupo), 20):
            if agregados >= tracks_per_label:
                break     

            batch = grupo.iloc[i:i+20]
            print(f"Label {label}: Procesando canciones {i} a {i+20}")
            
            for _, row in batch.iterrows():
                if agregados >= tracks_per_label:
                    break
                    
                song_id = row["song_id"]
                uri = row['uri']
                track_id = uri.split(":")[-1]
                
                try:
                    for attempt in range(3):
                        try:
                            track_info = spotify.track(track_id)
                            break
                        except Exception as e:
                            if "rate limit" in str(e).lower():
                                wait_time = 5 * (attempt + 1)
                                print(f"Rate limit alcanzado. Esperando {wait_time} segundos...")
                                time.sleep(wait_time)
                            else:
                                raise e
                    
                    if "preview_url" in track_info and track_info["preview_url"]:
                        # Procesar mel spectrogram
                        mel_file = mel_dir / f"{song_id}.csv"
                        
                        if not mel_file.exists():  # Evitar reprocesar si ya existe
                            try:
                                preview_url = track_info['preview_url']
                                audio_content = requests.get(preview_url).content
                                y, sr = librosa.load(BytesIO(audio_content), sr=None, duration=5.0)
                                
                                mel_spect = librosa.feature.melspectrogram(y=y, sr=sr)
                                mel_spect_db = librosa.power_to_db(mel_spect, ref=np.max)
                                mel_vector = mel_spect_db.ravel().tolist()
                                
                                with open(mel_file, 'w', newline='') as csvfile:
                                    writer = csv.writer(csvfile)
                                    writer.writerow([song_id] + mel_vector)
                                
                                # Limpiar memoria
                                del y, mel_spect, mel_spect_db, mel_vector
                                gc.collect()
                                
                                # Agregar datos al diccionario
                                for key in dataframe_dict.keys():
                                    dataframe_dict[key].append(row[key])

                            except Exception as e:
                                print(f"Error procesando mel spectrogram para {song_id}: {e}")
                                continue

                        agregados += 1
                        print(f"Label {label}: {agregados}/{tracks_per_label} canciones procesadas")
                
                except Exception as e:
                    print(f"Error procesando track {song_id}: {e}")
                    continue
            
            # Esperar entre lotes para evitar rate limiting
            if agregados < tracks_per_label:
                print("Esperando 1 segundo entre lotes...")
                time.sleep(1)
        
        print(f"Completado label {label}: {agregados} canciones procesadas")
    
    return pd.DataFrame(dataframe_dict)

try:
    result_df = process_tracks_by_label(df, spotify, tracks_per_label=100)
    print("\nProceso completado!")
    print("Forma del DataFrame resultante:", result_df.shape)
    print("\nDistribución de labels:")
    print(result_df['labels'].value_counts())
    
    # Guardar resultado
    result_df.to_csv('processed_tracks.csv', index=False)
    print("\nResultados guardados en 'processed_tracks.csv'")
    
except Exception as e:
    print(f"Error en el proceso principal: {e}")


=== Procesando label 0 ===
Label 0: Procesando canciones 0 a 20
Label 0: 1/100 canciones procesadas
Label 0: 2/100 canciones procesadas
Label 0: 3/100 canciones procesadas
Label 0: 4/100 canciones procesadas
Label 0: 5/100 canciones procesadas
Label 0: 6/100 canciones procesadas
Label 0: 7/100 canciones procesadas
Label 0: 8/100 canciones procesadas
Label 0: 9/100 canciones procesadas
Label 0: 10/100 canciones procesadas
Label 0: 11/100 canciones procesadas
Label 0: 12/100 canciones procesadas
Label 0: 13/100 canciones procesadas
Label 0: 14/100 canciones procesadas
Label 0: 15/100 canciones procesadas
Label 0: 16/100 canciones procesadas
Label 0: 17/100 canciones procesadas
Label 0: 18/100 canciones procesadas
Esperando 1 segundo entre lotes...
Label 0: Procesando canciones 20 a 40
Label 0: 19/100 canciones procesadas
Label 0: 20/100 canciones procesadas
Label 0: 21/100 canciones procesadas
Label 0: 22/100 canciones procesadas
Label 0: 23/100 canciones procesadas
Label 0: 24/100 canc

In [13]:
import pandas as pd
import os

folder_path = "./mel_spectrograms"

headers = ["song_id"] + [f"t{t}_b{b}" for t in range(431) for b in range(128)]

mel_spectrograms_df = pd.DataFrame(columns=headers)

for (i, filename) in enumerate(os.listdir(folder_path)):
    print(f"{i}/400")
    if filename.endswith(".csv"):
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path, header=None)
        df.columns = headers
        mel_spectrograms_df = pd.concat([mel_spectrograms_df, df], ignore_index=True)

mel_spectrograms_df.to_pickle("mel_spectrograms_df.pkl")

mel_spectrograms_df.head()

0/400


  mel_spectrograms_df = pd.concat([mel_spectrograms_df, df], ignore_index=True)


1/400
2/400
3/400
4/400
5/400
6/400
7/400
8/400
9/400
10/400
11/400
12/400
13/400
14/400
15/400
16/400
17/400
18/400
19/400
20/400
21/400
22/400
23/400
24/400
25/400
26/400
27/400
28/400
29/400
30/400
31/400
32/400
33/400
34/400
35/400
36/400
37/400
38/400
39/400
40/400
41/400
42/400
43/400
44/400
45/400
46/400
47/400
48/400
49/400
50/400
51/400
52/400
53/400
54/400
55/400
56/400
57/400
58/400
59/400
60/400
61/400
62/400
63/400
64/400
65/400
66/400
67/400
68/400
69/400
70/400
71/400
72/400
73/400
74/400
75/400
76/400
77/400
78/400
79/400
80/400
81/400
82/400
83/400
84/400
85/400
86/400
87/400
88/400
89/400
90/400
91/400
92/400
93/400
94/400
95/400
96/400
97/400
98/400
99/400
100/400
101/400
102/400
103/400
104/400
105/400
106/400
107/400
108/400
109/400
110/400
111/400
112/400
113/400
114/400
115/400
116/400
117/400
118/400
119/400
120/400
121/400
122/400
123/400
124/400
125/400
126/400
127/400
128/400
129/400
130/400
131/400
132/400
133/400
134/400
135/400
136/400
137/400
138/400
139/

Unnamed: 0,song_id,t0_b0,t0_b1,t0_b2,t0_b3,t0_b4,t0_b5,t0_b6,t0_b7,t0_b8,...,t430_b118,t430_b119,t430_b120,t430_b121,t430_b122,t430_b123,t430_b124,t430_b125,t430_b126,t430_b127
0,561,-28.449587,-20.357433,-16.270924,-16.019396,-17.589495,-16.149189,-16.023872,-16.18338,-15.709162,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-70.315796
1,587,-28.257311,-28.569427,-24.319679,-25.002903,-31.400566,-31.267591,-30.247046,-29.865032,-26.255615,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-69.62278
2,137,-32.39859,-24.398623,-12.331869,-8.607775,-8.46315,-11.070618,-4.962997,-2.811954,-4.763077,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-78.528778
3,70,-19.587574,-11.403679,-10.102903,-14.255501,-16.495501,-13.532471,-14.102993,-16.697994,-20.785107,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-70.632492
4,87,-30.482355,-23.153172,-20.124125,-18.806267,-20.171616,-20.383968,-21.594028,-20.616934,-19.69105,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-68.306915


In [23]:
df = pd.read_csv("processed_tracks.csv")
df = df.set_index("song_id", drop=True)
df.head()

Unnamed: 0_level_0,duration (ms),danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,spec_rate,uri,labels
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3,443478.0,0.525,0.699,-4.571,0.0353,0.0178,8.8e-05,0.0888,0.199,92.011,7.959809e-08,spotify:track:11BPfwVbB7vok7KfjBeW4k,0
8,216187.0,0.516,0.692,-4.842,0.0279,0.0875,0.0093,0.09,0.181,83.571,1.290549e-07,spotify:track:6TwrBbgTaB5gpl06YQoRKy,0
9,232333.0,0.548,0.509,-7.937,0.0288,0.261,0.702,0.079,0.484,78.974,1.2396e-07,spotify:track:5SDEirHg6Y8fCYuKMnAaC5,0
28,197246.0,0.787,0.462,-7.985,0.0908,0.724,0.000187,0.102,0.554,86.994,4.603389e-07,spotify:track:0kJUHCpOr6CFU15kEKAgVF,0
31,253869.0,0.567,0.542,-6.64,0.0309,0.309,0.000342,0.0734,0.229,125.16,1.217163e-07,spotify:track:1zZIVe9x0JhdhpuQjLTjIM,0


In [24]:
mel_spectrograms_df = mel_spectrograms_df.set_index("song_id", drop=True)
mel_spectrograms_df.head()

Unnamed: 0_level_0,t0_b0,t0_b1,t0_b2,t0_b3,t0_b4,t0_b5,t0_b6,t0_b7,t0_b8,t0_b9,...,t430_b118,t430_b119,t430_b120,t430_b121,t430_b122,t430_b123,t430_b124,t430_b125,t430_b126,t430_b127
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
561,-28.449587,-20.357433,-16.270924,-16.019396,-17.589495,-16.149189,-16.023872,-16.18338,-15.709162,-16.888853,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-70.315796
587,-28.257311,-28.569427,-24.319679,-25.002903,-31.400566,-31.267591,-30.247046,-29.865032,-26.255615,-34.355835,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-69.62278
137,-32.39859,-24.398623,-12.331869,-8.607775,-8.46315,-11.070618,-4.962997,-2.811954,-4.763077,-11.053829,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-78.528778
70,-19.587574,-11.403679,-10.102903,-14.255501,-16.495501,-13.532471,-14.102993,-16.697994,-20.785107,-23.79105,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-70.632492
87,-30.482355,-23.153172,-20.124125,-18.806267,-20.171616,-20.383968,-21.594028,-20.616934,-19.69105,-22.387667,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-68.306915


In [25]:
df_final = pd.concat([df, mel_spectrograms_df], axis=1)
df.shape

(400, 13)

In [26]:
df_final.head()


Unnamed: 0_level_0,duration (ms),danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,t430_b118,t430_b119,t430_b120,t430_b121,t430_b122,t430_b123,t430_b124,t430_b125,t430_b126,t430_b127
song_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,443478.0,0.525,0.699,-4.571,0.0353,0.0178,8.8e-05,0.0888,0.199,92.011,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-70.186569
8,216187.0,0.516,0.692,-4.842,0.0279,0.0875,0.0093,0.09,0.181,83.571,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-62.771309
9,232333.0,0.548,0.509,-7.937,0.0288,0.261,0.702,0.079,0.484,78.974,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-78.874359
28,197246.0,0.787,0.462,-7.985,0.0908,0.724,0.000187,0.102,0.554,86.994,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-65.996414
31,253869.0,0.567,0.542,-6.64,0.0309,0.309,0.000342,0.0734,0.229,125.16,...,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-80.0,-72.233246


In [27]:
df_final.to_pickle("data_with_mel.pkl")