# Librerías y dependencias

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from ytmusicapi import YTMusic
import re
from tqdm import tqdm
import time
import random
import requests
import os
from dotenv import load_dotenv

# Autenticación

In [3]:


# Pull the variables declared in the local .env file into the process environment.
load_dotenv()

# Spotify credentials are read from the environment, never hard-coded.
CLIENT_ID = os.environ.get("CLIENT_ID")
CLIENT_SECRET = os.environ.get("CLIENT_SECRET")

# Spotify client authenticated through the client-credentials manager.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
)

# YouTube Music client created with its default (no-argument) configuration.
yt = YTMusic()

# Extracción de Características

## Función para buscar el ID

In [4]:
def obtener_spotify_id(artista, tema):
    """Look up a track's Spotify ID by searching for its artist and title.

    Args:
        artista: Artist name. None, NaN or a blank string counts as missing.
        tema: Track title. None, NaN or a blank string counts as missing.

    Returns:
        The ID of the first search hit (market "AR"), or None when either input
        is missing, the search returns no tracks, or the search call fails.
    """
    def _texto(valor):
        # None and NaN (the only value where x != x) mean "no data"; without this
        # guard they would be interpolated into the query as "None" / "nan".
        if valor is None or (isinstance(valor, float) and valor != valor):
            return ""
        return str(valor).strip()

    artista, tema = _texto(artista), _texto(tema)
    if not artista or not tema:
        # Nothing meaningful to search for; skip the API call entirely.
        return None

    query = f"track:{tema} artist:{artista}"
    try:
        result = sp.search(q=query, type="track", limit=1, market="AR")
        # Tolerate a None response or None placeholders for "tracks" / "items".
        tracks = ((result or {}).get("tracks") or {}).get("items") or []
        if tracks:
            return tracks[0]["id"]
    except Exception as exc:
        # Still best-effort (one failed lookup must not abort a long batch),
        # but report the failure instead of swallowing it silently.
        print(f"Error al buscar en Spotify ({query}): {exc!r}")
    return None


### Aplicar sobre tu dataset

In [5]:


# Load the base dataset and work on a copy with a fresh 0..n-1 index.
df_total = pd.read_csv("../data/raw/01_dataset_youtube_spotify.csv", encoding="utf-8", low_memory=False)

df = df_total.copy().reset_index(drop=True)

# Create the spotify_id column when the dataset does not have one yet.
if "spotify_id" not in df.columns:
    df["spotify_id"] = None

# Make sure the output directory exists BEFORE the slow API loop: to_csv does not
# create missing directories, so a missing folder would otherwise only fail at the
# very end and throw away every ID that was just looked up.
ruta_salida = "../data/interim/02_dataset_Y_S_con_spotify_ids.csv"
os.makedirs(os.path.dirname(ruta_salida), exist_ok=True)

# Rows that still need a lookup (no ID yet).
filas_sin_id = df[df["spotify_id"].isna()].index

print(f"üîç Canciones sin spotify_id: {len(filas_sin_id)} / {len(df)}")

# Search IDs only for the songs that lack one.
for i in tqdm(filas_sin_id):
    artista = df.loc[i, "artista"]
    tema = df.loc[i, "tema"]

    spotify_id = obtener_spotify_id(artista, tema)
    df.loc[i, "spotify_id"] = spotify_id

    # Optional random pause to stay under the rate limit (uncomment if needed):
    # time.sleep(0.05 + random.random() * 0.05)

# Save the complete result.
df.to_csv(ruta_salida, index=False, encoding="utf-8-sig")

print(f" Archivo guardado correctamente en: {ruta_salida}")
print(f" IDs totales encontrados: {df['spotify_id'].notna().sum()} / {len(df)}")
print(f"  - Desde playlists: {len(df) - len(filas_sin_id)}")
print(f"  - Mediante b√∫squeda: {df.loc[filas_sin_id, 'spotify_id'].notna().sum()}")


üîç Canciones sin spotify_id: 407 / 2564


  0%|          | 0/407 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 407/407 [02:05<00:00,  3.24it/s]

 Archivo guardado correctamente en: ../data/interim/02_dataset_Y_S_con_spotify_ids.csv
 IDs totales encontrados: 2454 / 2564
  - Desde playlists: 2157
  - Mediante b√∫squeda: 297





In [6]:
# Preview the first rows of the dataset after the Spotify ID lookup.
df.head()

Unnamed: 0,artista,tema,genero,popularidad,fuente,spotify_id,a√±o
0,Kings of Leon,To Space,rock,59.0,Spotify,356x2OD1llA9NfOMXVmGSk,2025.0
1,Teen Jesus and the Jean Teasers,WONDERFUL,rock,42.0,Spotify,2KlnNibs8vbh1ulZ7124vI,2025.0
2,Mother Mother,HOT TO GO! - Spotify Singles,rock,46.0,Spotify,5HoBujicJSH3hEIeDfmpTB,2025.0
3,Hayley Williams,Showbiz,rock,55.0,Spotify,0HA0F6W8khJgVgZwSEdTgN,2025.0
4,Jimmy Eat World,Failure,rock,52.0,Spotify,3pnfUFrTc8i1hpsORDbx1T,2025.0


## Audiofeatures con Reccobeats

In [7]:
output_path = "../data/interim/03_dataset_con_audiofeatures_reccobeats.csv"

# Detect the column that holds the Spotify IDs (first name containing both
# "spotify" and "id", case-insensitive).
# col_id must start as None: otherwise a dataset without such a column raises
# NameError at the check below (or silently reuses a stale col_id left in the
# kernel by an earlier run) instead of the intended ValueError.
col_id = None
for c in df.columns:
    if "spotify" in str(c).lower() and "id" in str(c).lower():
        col_id = c
        break

if not col_id:
    raise ValueError("No se encontr√≥ una columna que contenga 'spotify' y 'id' en su nombre.")

# Unique, non-null IDs as strings; these are the tracks to request features for.
spotify_ids = df[col_id].dropna().astype(str).unique().tolist()
print(f"‚úÖ Dataset cargado: {len(df)} filas")
print(f"üéµ Se encontraron {len(spotify_ids)} Spotify IDs v√°lidos")

# === FUNCIÓN PARA OBTENER FEATURES ===
def get_audio_features_batch(ids, timeout=30):
    """Fetch audio features for a batch of Spotify track IDs from the ReccoBeats API.

    Args:
        ids: Iterable of Spotify track IDs; falsy entries (None, "") are dropped.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The list found under the "content" key of the JSON response, or an empty
        list when there is nothing to request, the request fails, the status
        code is not 200, or the body is not the expected JSON object.
    """
    ids = [str(track_id) for track_id in ids if track_id]
    if not ids:
        # Nothing to ask for; avoid a pointless request with an empty "ids" value.
        return []

    ids_str = ",".join(ids)
    url = f"https://api.reccobeats.com/v1/audio-features?ids={ids_str}"
    headers = {"Accept": "application/json"}
    try:
        # Without a timeout a stalled connection would hang the whole download loop.
        res = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Best-effort: report the failed batch and let the caller continue.
        print(f"Error de red al consultar ReccoBeats: {exc!r}")
        return []
    if res.status_code != 200:
        print(f"‚ö†Ô∏è Error {res.status_code}: {res.text[:150]}")
        return []
    try:
        payload = res.json()
    except ValueError:
        # requests signals an undecodable body with a ValueError subclass.
        print("Respuesta de ReccoBeats no es JSON valido")
        return []
    if not isinstance(payload, dict):
        return []
    # "content" may be absent or null; callers always get a list back.
    return payload.get("content") or []

# === BATCHED DOWNLOAD ===
features = []
batch_size = 40  # the API accepts between 1 and 40 IDs per request

# Ceiling division. The previous `len(spotify_ids)//batch_size + 1` reported one
# block too many whenever the number of IDs was an exact multiple of batch_size.
total_blocks = (len(spotify_ids) + batch_size - 1) // batch_size

for i in range(0, len(spotify_ids), batch_size):
    batch = spotify_ids[i:i+batch_size]
    data = get_audio_features_batch(batch)
    if data:
        features.extend(data)
    print(f"Procesado bloque {i//batch_size + 1}/{total_blocks} ({len(data)} features)")
    time.sleep(0.3)  # pause between requests to avoid the rate limit

# === CONVERT TO DATAFRAME AND MERGE ===
if not features:
    # NOTE: df_final is not (re)defined on this path; cells below that use it
    # will fail or see a stale value from an earlier run.
    print("‚ö†Ô∏è No se obtuvieron features de ning√∫n track.")
else:
    df_features = pd.DataFrame(features)
    # The Spotify ID is recovered from the track URL in "href".
    df_features["spotify_id"] = df_features["href"].str.extract(r"track/([a-zA-Z0-9]+)")

    # Keep one feature row per real Spotify ID for the join:
    #  - pandas matches NaN keys with each other, so a feature row whose href did
    #    not yield an ID would be attached to every df row lacking a spotify_id;
    #  - a duplicated ID on the right side would multiply rows of df.
    df_features_merge = (
        df_features
        .dropna(subset=["spotify_id"])
        .drop_duplicates(subset="spotify_id")
    )
    df_final = df.merge(df_features_merge, left_on=col_id, right_on="spotify_id", how="left")

    df_final.to_csv(output_path, index=False)
    print(f"‚úÖ Archivo guardado en: {output_path}")
    print(f"üéß Features obtenidas: {df_features.shape[0]}")


‚úÖ Dataset cargado: 2564 filas
üéµ Se encontraron 2451 Spotify IDs v√°lidos
Procesado bloque 1/62 (0 features)
Procesado bloque 2/62 (1 features)
Procesado bloque 3/62 (7 features)
Procesado bloque 4/62 (27 features)
Procesado bloque 5/62 (23 features)
Procesado bloque 6/62 (4 features)
Procesado bloque 7/62 (0 features)
Procesado bloque 8/62 (1 features)
Procesado bloque 9/62 (20 features)
Procesado bloque 10/62 (23 features)
Procesado bloque 11/62 (21 features)
Procesado bloque 12/62 (22 features)
Procesado bloque 13/62 (32 features)
Procesado bloque 14/62 (36 features)
Procesado bloque 15/62 (26 features)
Procesado bloque 16/62 (27 features)
Procesado bloque 17/62 (27 features)
Procesado bloque 18/62 (25 features)
Procesado bloque 19/62 (16 features)
Procesado bloque 20/62 (5 features)
Procesado bloque 21/62 (4 features)
Procesado bloque 22/62 (14 features)
Procesado bloque 23/62 (25 features)
Procesado bloque 24/62 (31 features)
Procesado bloque 25/62 (20 features)
Procesado bloq

In [8]:
# Summarize the merged dataset: column dtypes and non-null counts per column.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2564 entries, 0 to 2563
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artista           2564 non-null   object 
 1   tema              2564 non-null   object 
 2   genero            2564 non-null   object 
 3   popularidad       2157 non-null   float64
 4   fuente            2564 non-null   object 
 5   spotify_id        2454 non-null   object 
 6   a√±o               2157 non-null   float64
 7   id                865 non-null    object 
 8   href              865 non-null    object 
 9   acousticness      864 non-null    float64
 10  danceability      864 non-null    float64
 11  energy            864 non-null    float64
 12  instrumentalness  864 non-null    float64
 13  key               864 non-null    float64
 14  liveness          864 non-null    float64
 15  loudness          864 non-null    float64
 16  mode              864 non-null    float64

In [9]:
# Count how many rows of the merged dataset fall under each value of "genero".
print(df_final["genero"].value_counts())

genero
rock           958
hiphop         277
jazz           263
hyperpop       199
kpop           194
pop            185
regueton       148
clasica         98
electronica     96
blues           76
country         70
Name: count, dtype: int64


In [10]:
# Preview the first rows of the merged dataset (base columns plus audio features).
df_final.head()

Unnamed: 0,artista,tema,genero,popularidad,fuente,spotify_id,a√±o,id,href,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence
0,Kings of Leon,To Space,rock,59.0,Spotify,356x2OD1llA9NfOMXVmGSk,2025.0,,,,,,,,,,,,,
1,Teen Jesus and the Jean Teasers,WONDERFUL,rock,42.0,Spotify,2KlnNibs8vbh1ulZ7124vI,2025.0,,,,,,,,,,,,,
2,Mother Mother,HOT TO GO! - Spotify Singles,rock,46.0,Spotify,5HoBujicJSH3hEIeDfmpTB,2025.0,,,,,,,,,,,,,
3,Hayley Williams,Showbiz,rock,55.0,Spotify,0HA0F6W8khJgVgZwSEdTgN,2025.0,,,,,,,,,,,,,
4,Jimmy Eat World,Failure,rock,52.0,Spotify,3pnfUFrTc8i1hpsORDbx1T,2025.0,,,,,,,,,,,,,
