# Librerías y dependencias

In [4]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from ytmusicapi import YTMusic
import re
from tqdm import tqdm
import time
import random
import requests
import os
from dotenv import load_dotenv

# Autenticación

In [None]:


# Pull the variables defined in a local .env file into the process environment.
load_dotenv()

# Spotify application credentials, read from the environment (None when unset).
CLIENT_ID = os.getenv("CLIENT_ID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")

# Client-credentials auth manager handed to the Spotify client below.
gestor_credenciales = SpotifyClientCredentials(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
)
sp = spotipy.Spotify(auth_manager=gestor_credenciales)

# YouTube Music client, constructed with no arguments.
yt = YTMusic()

# Creación del dataset


## Definir playlist

In [3]:
# --- SPOTIFY ---
# (genre label, playlist URL) pairs iterated by the Spotify dataset-building loop.
# A genre may appear more than once ("rock", "kpop"): each entry is a separate
# playlist whose tracks are tagged with the same label.
playlistsspotify = [
    ("rock", "https://open.spotify.com/playlist/2XGHHAtKIkF2vra9n9ohCo"),
    ("blues", "https://open.spotify.com/playlist/4tuvViSICMHa50NAwGaOiz"),
    ("country", "https://open.spotify.com/playlist/66W297BG3yvKGkUuawESJo"),
    ("hyperpop", "https://open.spotify.com/playlist/2spbReDu7dcRW8sMXwMfqz"),
    ("kpop", "https://open.spotify.com/playlist/4r9P2LhsIwLHtWzpfk3H3M"),
    ("kpop", "https://open.spotify.com/playlist/58N7rQpPjyVA6KCBVlOJOd"),
    ("electronica", "https://open.spotify.com/playlist/4o6pV8avI1bS6sB4zHW3no"),
    ("pop", "https://open.spotify.com/playlist/5Sx1Y3RRSorf9FENwcg0Js"),
    ("regueton", "https://open.spotify.com/playlist/03sDEv7FN58Mb9CJOs1Tgn"),
    ("rock", "https://open.spotify.com/playlist/5ffh8oWUXPpuqQFu5AvU6a"),
    ("hiphop", "https://open.spotify.com/playlist/30A7kR1hbfnmmSFDXh3JBT"),
    ("jazz", "https://open.spotify.com/playlist/2hiw0p7Z6KJECOhrB4qkHZ"),
    ("clasica", "https://open.spotify.com/playlist/2Irt1QeunuKClnMypBvtbs")
]


# --- YOUTUBE MUSIC ---
# (genre label, playlist URL) pairs iterated by the YouTube Music dataset-building
# loop. Each URL carries the playlist ID in its "list" query parameter; "rock"
# appears twice because two separate playlists feed that genre.
playlistsyoutube = [
    ("clasica", "https://music.youtube.com/playlist?list=PLb14QDz_8YuSzy0wcFDz9TyIduyuAPFOx"),
    ("jazz", "https://music.youtube.com/playlist?list=PLb14QDz_8YuR20uMKjryzBzIKvxZEzftQ"),
    ("hiphop", "https://music.youtube.com/playlist?list=PLb14QDz_8YuSzN5-_XYmIk-Z1nH__Hmf0"),
    ("rock", "https://music.youtube.com/playlist?list=PLb14QDz_8YuQCGXnBubIDe336aCUiVf3c"),
    ("rock", "https://music.youtube.com/playlist?list=PLb14QDz_8YuRtwDNk4DTf9lLl3yV5agtY"),
    ("blues", "https://music.youtube.com/playlist?list=PLb14QDz_8YuS3g51nM-OTq5EfSDQriwrz"),
    ("country", "https://music.youtube.com/playlist?list=PLb14QDz_8YuQmU0F_sPB5m-7c0nO7bRWq"),
]





## Funciones para extraer datos

In [4]:
# --- YOUTUBE MUSIC PLAYLIST ID EXTRACTION ---
def obtener_id_youtube(url):
    """Return the playlist ID carried by the ``list`` query parameter of a URL.

    Args:
        url: A YouTube / YouTube Music URL (a bare ``list=<id>`` string also works).

    Returns:
        The ID string (letters, digits, ``_`` and ``-``), or ``None`` when
        ``url`` is not a string or contains no ``list`` parameter.
    """
    if not isinstance(url, str):
        # re.search would raise TypeError on None / non-string input.
        return None
    # Anchor on a parameter boundary so names that merely end in "list"
    # (e.g. "playlist=", "shortlist=") are not mistaken for the real parameter.
    match = re.search(r"(?:^|[?&#;])list=([a-zA-Z0-9_-]+)", url)
    return match.group(1) if match else None

def extraer_youtube_playlist(url, genero):
    """Fetch a YouTube Music playlist and return one row per usable track.

    Args:
        url: Playlist URL containing a ``list=<id>`` parameter.
        genero: Genre label attached to every returned row.

    Returns:
        A list of ``(artista, tema, genero, "YouTube Music")`` tuples. The list
        is empty when the ID cannot be parsed or the request fails; both cases
        print a message instead of raising. Tracks without a first artist name
        or without a title are skipped.
    """
    playlist_id = obtener_id_youtube(url)
    if not playlist_id:
        print(f"No se pudo obtener ID de {url}")
        return []

    try:
        # NOTE(review): get_playlist is called with its default track limit;
        # confirm that long playlists are not being truncated.
        playlist = yt.get_playlist(playlist_id)
    except Exception as e:
        # Best effort: one failing playlist must not abort the whole run.
        print(f"Error en {url}: {e}")
        return []

    data = []
    for item in playlist.get("tracks") or []:
        # "artists" may be missing, None or an empty list. Indexing [0] blindly
        # would raise here, outside the try block, and abort the caller's loop.
        artistas = item.get("artists")
        if not artistas:
            continue
        artista = artistas[0].get("name")
        tema = item.get("title")
        if artista is None or tema is None:
            continue
        data.append((artista, tema, genero, "YouTube Music"))
    return data

def extraer_spotify_playlist(playlist_url, genero, años_validos=(2024, 2025)):
    """Collect the tracks of a Spotify playlist released in the given years.

    The playlist is read page by page until the API returns an empty page or
    reports no ``next`` page.

    Args:
        playlist_url: URL containing ``playlist/<id>``; anything after ``?``
            is ignored.
        genero: Genre label attached to every returned row.
        años_validos: Release years to keep. Defaults to ``(2024, 2025)``.
            Pass ``None`` to keep every year.

    Returns:
        A list of ``[artista, tema, genero, popularidad, "Spotify",
        spotify_id, año]`` rows. Returns ``[]`` (after printing a message)
        when the URL cannot be parsed or an API call fails.
    """
    try:
        # Playlist ID is the path segment after "playlist/", minus any query string.
        playlist_id = playlist_url.split('playlist/')[1].split('?')[0]

        data = []
        offset = 0
        limit = 100  # Page size; the maximum the API allows.

        while True:
            results = sp.playlist_tracks(playlist_id, limit=limit, offset=offset)

            items = results['items']
            if not items:
                break  # No more tracks.

            for item in items:
                # Malformed items are skipped one by one, so a single bad entry
                # no longer throws away the whole playlist via the outer except.
                fila = _fila_spotify(item, genero, años_validos)
                if fila is not None:
                    data.append(fila)

            if results['next'] is None:
                break

            offset += limit  # Move on to the next page.

        return data

    except Exception as e:
        print(f"Error al extraer playlist {playlist_url}: {e}")
        return []


def _fila_spotify(item, genero, años_validos):
    """Build one dataset row from a playlist item, or return None to skip it."""
    track = (item or {}).get('track')
    if not track or not track.get('artists'):
        return None

    # Some items carry no album or no release date; they cannot be
    # year-filtered, so they are skipped instead of raising.
    fecha_lanzamiento = (track.get('album') or {}).get('release_date')
    if not fecha_lanzamiento:
        return None
    try:
        año = int(fecha_lanzamiento.split('-')[0])
    except ValueError:
        return None

    if años_validos is not None and año not in años_validos:
        return None

    return [
        track['artists'][0].get('name'),
        track.get('name'),
        genero,
        track.get('popularity'),
        "Spotify",
        track.get('id'),
        año,
    ]

## Construir el dataset

### Spotify

In [5]:
# Gather the rows of every configured Spotify playlist into one flat list,
# with a progress bar over the playlists.
data_spotify = [
    fila
    for genero, url in tqdm(playlistsspotify)
    for fila in extraer_spotify_playlist(url, genero)
]

columnas_spotify = ["artista", "tema", "genero", "popularidad", "fuente", "spotify_id", 'año']
df_spotify = pd.DataFrame(data_spotify, columns=columnas_spotify)

# Report the row count and how many rows carry a non-null Spotify ID.
total_spotify = len(df_spotify)
con_spotify_id = df_spotify['spotify_id'].notna().sum()
print(f"Spotify: {total_spotify} canciones")
print(f"Con spotify_id: {con_spotify_id}")
df_spotify.head()

100%|██████████| 13/13 [00:22<00:00,  1.71s/it]

Spotify: 2207 canciones
Con spotify_id: 2207





Unnamed: 0,artista,tema,genero,popularidad,fuente,spotify_id,año
0,Kings of Leon,To Space,rock,59,Spotify,356x2OD1llA9NfOMXVmGSk,2025
1,Teen Jesus and the Jean Teasers,WONDERFUL,rock,42,Spotify,2KlnNibs8vbh1ulZ7124vI,2025
2,Mother Mother,HOT TO GO! - Spotify Singles,rock,46,Spotify,5HoBujicJSH3hEIeDfmpTB,2025
3,Hayley Williams,Showbiz,rock,55,Spotify,0HA0F6W8khJgVgZwSEdTgN,2025
4,Jimmy Eat World,Failure,rock,52,Spotify,3pnfUFrTc8i1hpsORDbx1T,2025


### Youtube

In [6]:
# Gather the rows of every configured YouTube Music playlist into one flat
# list, with a progress bar over the playlists.
data_youtube = [
    fila
    for genero, url in tqdm(playlistsyoutube)
    for fila in extraer_youtube_playlist(url, genero)
]

columnas_youtube = ["artista", "tema", "genero", "fuente"]
df_youtube = pd.DataFrame(data_youtube, columns=columnas_youtube)

total_youtube = len(df_youtube)
print(f"YouTube Music: {total_youtube} canciones")
df_youtube.head()


100%|██████████| 7/7 [00:03<00:00,  2.31it/s]

YouTube Music: 438 canciones





Unnamed: 0,artista,tema,genero,fuente
0,James Quinn,Journey of Life,clasica,YouTube Music
1,Yannick Lowack,Silence,clasica,YouTube Music
2,Andrea Vanzo,Rebirth,clasica,YouTube Music
3,Hans Zimmer,"Interstellar Suite: Part 2, No Time for Caution",clasica,YouTube Music
4,Carlotta Dalia,Estudio sin luz,clasica,YouTube Music


### Consolidación del dataset YouTube/Spotify

In [7]:
# Stack both sources under a fresh 0..n-1 index, then drop duplicates by
# artist + title. The first occurrence is kept, so a Spotify row wins over a
# YouTube Music row for the same (artista, tema) pair.
df_total = pd.concat([df_spotify, df_youtube], ignore_index=True).drop_duplicates(
    subset=["artista", "tema"]
)

total_canciones = len(df_total)
print(f"Dataset total: {total_canciones} canciones")

Dataset total: 2564 canciones


## Guardar el dataset final

In [None]:
# Write the consolidated dataset as CSV without the index column. The target
# folder is created first so a checkout that lacks ../data/raw does not fail here.
ruta_salida = "../data/raw/01_dataset_youtube_spotify.csv"
os.makedirs(os.path.dirname(ruta_salida), exist_ok=True)
df_total.to_csv(ruta_salida, index=False)