In [2]:
import h5py
import pandas as pd
import os
import tarfile

tar_path = "millionsongsubset.tar.gz"

extract_path = "MillionSongSubset/"

with tarfile.open(tar_path, "r:gz") as tar:
    tar.extractall(extract_path)

path = os.path.join(extract_path, "MillionSongSubset")

folders = os.listdir(path)

dfs = []

for folder in folders:
    folder_path = os.path.join(path, folder)
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".h5"):
                file_path = os.path.join(root, file)
                with h5py.File(file_path, "r") as f:
                   
                    metadata = f["metadata"]["songs"]
                    song_id = metadata["song_id"][0]
                    artist_name = metadata["artist_name"][0]
                    title = metadata["title"][0]

                    
                    dfs.append(pd.DataFrame({
                        "song_id": [song_id],
                        "artist_name": [artist_name],
                        "title": [title],
                    }))

df = pd.concat(dfs, ignore_index=True)
df

Unnamed: 0,song_id,artist_name,title
0,b'SOMZWCG12A8C13C480',b'Casual',"b""I Didn't Mean To"""
1,b'SOCIWDW12A8C13D406',b'The Box Tops',b'Soul Deep'
2,b'SOXVLOJ12AB0189215',b'Sonora Santanera',b'Amor De Cabaret'
3,b'SONHOTT12A8C13493C',b'Adam Ant',b'Something Girls'
4,b'SOFSOCN12A8C143F5D',b'Gob',b'Face the Ashes'
...,...,...,...
9995,b'SOLXXPY12A67ADABA0',b'Moonspell',b'The Hanged Man'
9996,b'SOAYONI12A6D4F85C8',b'Danny Williams',b'The Wonderful World Of The Young'
9997,b'SOJZLAJ12AB017E8A2',b'Winston Reedy',b'Sentimental Man'
9998,b'SORZSCJ12A8C132446',"b'Myrick ""Freeze"" Guillory'",b'Zydeco In D-Minor'


In [3]:
def remove_prefix(b_string):
    return str(b_string)[2:-1]

df = df.applymap(remove_prefix)

df

Unnamed: 0,song_id,artist_name,title
0,SOMZWCG12A8C13C480,Casual,I Didn't Mean To
1,SOCIWDW12A8C13D406,The Box Tops,Soul Deep
2,SOXVLOJ12AB0189215,Sonora Santanera,Amor De Cabaret
3,SONHOTT12A8C13493C,Adam Ant,Something Girls
4,SOFSOCN12A8C143F5D,Gob,Face the Ashes
...,...,...,...
9995,SOLXXPY12A67ADABA0,Moonspell,The Hanged Man
9996,SOAYONI12A6D4F85C8,Danny Williams,The Wonderful World Of The Young
9997,SOJZLAJ12AB017E8A2,Winston Reedy,Sentimental Man
9998,SORZSCJ12A8C132446,"Myrick ""Freeze"" Guillory",Zydeco In D-Minor


In [6]:
top_100 = pd.read_csv('songs.csv')

In [7]:
top_100["artist_and_title"] = top_100["artists"] + "|" + top_100["title"]
df["artist_and_title"] = df["artist_name"] + "|" + df["title"]
mask = ~top_100['artist_and_title'].isin(df['artist_and_title'])
mask.value_counts()

True    100
Name: artist_and_title, dtype: int64

In [9]:
import numpy as np
np.random.seed(123)
not_hot_songs = df.sample(n=3000)

In [13]:
not_hot_songs = not_hot_songs.drop('artist_and_title', axis=1)
not_hot_songs

Unnamed: 0,song_id,artist_name,title
2656,SOPGSVK12A8C13CD10,The Frantic,Rock & Roll Renegade
445,SOUOYDQ12AB017EFFC,Jackson United,Undertow
9505,SOTMVUQ12A6BD53721,Jesus Jones,Nothing To Hold Me
332,SOQOTLQ12AB01868D0,Gloriana,Clementina Santaf\xc3\xa8
4168,SOAGWGQ12A8C13F9A5,Bottom Of The Hudson,One of Us
...,...,...,...
2708,SOXZNEF12A8C137AD7,Jim Reeves,I Guess I'm Crazy
8232,SOFKEIK12A6310D86A,The Rolling Stones,Stop Breaking Down (1994 Digital Remaster)
5835,SONTEEZ12A8C13EC73,Hevia,Albo
6689,SOSWCXN12A6D4FAE3C,Brenda Lee,Your One And Only (LP Version)


In [14]:
not_hot_songs.to_csv('not_hot.csv')