In [2]:
import duckdb
# Open the DuckDB database file "database.db" in read-only mode.
# Every query cell below runs through this shared connection object `con`.
con = duckdb.connect("database.db", read_only=True)

In [None]:
# Show the full contents of the `artists` table (all columns, all rows),
# converted to a pandas DataFrame so the notebook renders it as a table.
con.sql("SELECT * FROM artists").df()

In [None]:
# Show the full contents of the `albums` table (all columns, all rows),
# converted to a pandas DataFrame so the notebook renders it as a table.
con.sql("SELECT * FROM albums").df()

In [None]:
# Show the 10 genres that have the most artists, together with the names of
# the artists tagged with each genre.
# Ties on the artist count are broken by genre name and each artist list is
# sorted by name, so re-running the cell on unchanged data gives the same output.
con.sql("""
        SELECT genre, COUNT(name) AS count, LIST(name ORDER BY name) AS artists
        FROM artists INNER JOIN artist_genre ON artists.spotify_id=artist_genre.artist_id
        GROUP BY genre ORDER BY count DESC, genre LIMIT 10
    """).df()

In [None]:
# Find all albums credited to the given artist.
# Plain (inner) joins are used on purpose: an album without a matching
# album_artist/artists row can never satisfy the artist-name filter anyway.
# The filter targets the qualified column artists.name rather than the SELECT
# alias, matching how the other cells in this notebook write their WHERE clauses.
given_artist = "Sufjan Stevens"
con.execute("""
        SELECT DISTINCT artists.name AS artist_name, albums.name AS album_name
        FROM albums
        JOIN album_artist ON albums.spotify_id = album_artist.album_id
        JOIN artists ON album_artist.artist_id = artists.spotify_id
        WHERE artists.name = ?
        """, (given_artist,)).df()

In [None]:
# Starting from an album name, find the artist(s) credited on it and list
# their other albums.
# The artist lookup is a subquery of the main query, which means:
#   * an album name that is not in the database yields an empty result
#     instead of an indexing error on an empty DataFrame;
#   * every artist credited on the album is considered, not just whichever
#     row happened to come back first.
# The album name is bound twice: once to exclude the album itself from the
# result, once inside the subquery that resolves its artist(s).
analyzed_album = "softscars"
con.execute("""
        SELECT DISTINCT artists.name AS artist_name, albums.name AS album_name
        FROM albums
        JOIN album_artist ON albums.spotify_id = album_artist.album_id
        JOIN artists ON album_artist.artist_id = artists.spotify_id
        WHERE albums.name != ?
          AND artists.spotify_id IN (
                SELECT src_link.artist_id
                FROM albums AS src_album
                JOIN album_artist AS src_link ON src_album.spotify_id = src_link.album_id
                WHERE src_album.name = ?
          )
        """, (analyzed_album, analyzed_album)).df()

In [None]:
# Starting from an artist, rank the other artists by how many genres they
# share with that artist (the shared genres are listed per artist).
# The reference artist's genres are resolved by a subquery rather than being
# spliced into the SQL text as a generated "?, ?, ..." list. If the artist is
# unknown or has no genres the result is simply empty, whereas an empty
# "IN ()" list would be a SQL syntax error.
# The artist name is bound twice: once to exclude the artist from the result,
# once inside the subquery that looks up their genres.
analyzed_artist = "AURORA"
con.execute("""
        SELECT artists.name, LIST(genre) AS genres, COUNT(genre) AS same_genre_count
        FROM artists
        JOIN artist_genre ON artists.spotify_id = artist_genre.artist_id
        WHERE artists.name != ?
          AND genre IN (
                SELECT ref_genre.genre
                FROM artists AS ref_artist
                JOIN artist_genre AS ref_genre ON ref_artist.spotify_id = ref_genre.artist_id
                WHERE ref_artist.name = ?
          )
        GROUP BY artists.name
        ORDER BY same_genre_count DESC
        """, (analyzed_artist, analyzed_artist)).df()

In [3]:
# Show one random album whose added_at timestamp falls in the given year.
# Note the filter is on added_at, not release_date, so the album itself may
# have been released in a different year.
# The year bounds are passed as bound parameters (like the other cells) instead
# of being formatted into the SQL text, and the range is half-open
# [Jan 1 of year, Jan 1 of year + 1) so timestamps in the final fraction of a
# second of Dec 31 are not missed.
# NOTE(review): the casts assume added_at is a timestamp column (the displayed
# value carries a UTC offset) - confirm against the table schema.
year = 2023
con.execute("""
        SELECT artists.name as artist_name, albums.name as album_name, total_tracks, release_date, added_at, popularity
        FROM albums
        JOIN album_artist ON albums.spotify_id = album_artist.album_id
        JOIN artists ON artists.spotify_id = album_artist.artist_id
        WHERE added_at >= CAST(? AS TIMESTAMPTZ) AND added_at < CAST(? AS TIMESTAMPTZ)
        ORDER BY RANDOM()
        LIMIT 1
        """, (f"{year:04d}-01-01", f"{year + 1:04d}-01-01")).df()

Unnamed: 0,artist_name,album_name,total_tracks,release_date,added_at,popularity
0,Various Artists,The Space Project,14,2014-05-06,2023-02-02 18:59:15+01:00,21
