In [2]:
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import json

In [3]:
# .env
from dotenv import load_dotenv
load_dotenv(dotenv_path="../.env")

SPOTIFY_CLIENT_ID = os.getenv("SPOTIFY_CLIENT_ID")
SPOTIFY_CLIENT_SECRET = os.getenv("SPOTIFY_CLIENT_SECRET")

In [4]:
SPOTIFY_CLIENT_ID

'7e1e239fb7b54aa4a68d82af9b83cc80'

In [12]:
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=SPOTIFY_CLIENT_ID,
                                                           client_secret=SPOTIFY_CLIENT_SECRET))

results = sp.search(q='the killers for reasons unknown', limit=1)
with open('thekillers.json', 'w') as f:
    json.dump(results, f, indent=4)

## Dataset Spotify Kaggle

In [6]:
import pandas as pd

In [7]:
df = pd.read_csv("../data/spotify-kaggle/raw/dataset.csv")

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [None]:
track_found = sp.track("5SuOikwiRyPMVoIQDJUgSV")



In [None]:
track_found['album']['images']

[{'url': 'https://i.scdn.co/image/ab67616d0000b27326573d555999566291e3b7a8',
  'width': 640,
  'height': 640},
 {'url': 'https://i.scdn.co/image/ab67616d00001e0226573d555999566291e3b7a8',
  'width': 300,
  'height': 300},
 {'url': 'https://i.scdn.co/image/ab67616d0000485126573d555999566291e3b7a8',
  'width': 64,
  'height': 64}]

In [None]:
url_found = [track['url'] for track in track_found['album']['images'] if track['height'] == 640][0]

In [None]:
import requests

In [None]:
img_data = requests.get(url_found).content
with open('album_cover.jpg', 'wb') as handler:
    handler.write(img_data)

### Scraping

Iterar sobre todos los track_id unicos del dataset para obtener sus ALBUM COVERS. Cuyo nombre correspondera a su track_id asociado.

In [13]:
len(df["track_id"].unique())*(1216/37)/1024

2880.2010135135133

In [16]:
bad_ids = []

for _track_id in df["track_id"].unique():
    if os.path.exists(f'../data/spotify-kaggle/album_covers/{_track_id}.jpg'):
        print(f"Album cover for {_track_id} already exists. Skipping download.")
        continue
    try:
        track_found = sp.track(_track_id)
    except Exception as e:
        print(f"Track {_track_id} not found. Error: {e}")
        continue
    try:
        url_found = [track['url'] for track in track_found['album']['images'] if track['height'] == 300][0]
    except IndexError:
        print(f"No 300px image found for track {_track_id}. Skipping download.")
        print(track_found['album']['images'])
        bad_ids.append(_track_id)
        continue
    img_data = requests.get(url_found).content
    with open(f'../data/spotify-kaggle/album_covers/{_track_id}.jpg', 'wb') as handler:
        handler.write(img_data)



Album cover for 5SuOikwiRyPMVoIQDJUgSV already exists. Skipping download.
Album cover for 4qPNDBW1i3p13qLCt0Ki3A already exists. Skipping download.
Album cover for 1iJBSr7s7jYXzM8EGcbK5b already exists. Skipping download.
Album cover for 6lfxq3CG4xtTiEg7opyCyx already exists. Skipping download.
Album cover for 5vjLSffimiIP26QG5WcN2K already exists. Skipping download.
Album cover for 01MVOl9KtVTNfFiBU9I7dc already exists. Skipping download.
Album cover for 6Vc5wAMmXdKIAM7WUoEb7N already exists. Skipping download.
Album cover for 1EzrEOXmMH3G43AXT1y7pA already exists. Skipping download.
Album cover for 0IktbUcnAGrvD03AWnz3Q8 already exists. Skipping download.
Album cover for 7k9GuJYLp2AzqokyEdwEw2 already exists. Skipping download.
Album cover for 4mzP5mHkRvGxdhdGdAH7EJ already exists. Skipping download.
Album cover for 5ivF4eQBqJiVL5IAE9jRyl already exists. Skipping download.
Album cover for 4ptDJbJl35d7gQfeNteBwp already exists. Skipping download.
Album cover for 0X9MxHR1rTkEHDjp95F2OO

KeyboardInterrupt: 