In [49]:
import os, h5py
import pandas as pd

In [50]:
directory_path = './Data/'
data_list = []

# Walk the data directory recursively and load every HDF5 song file found.
for root, dirnames, filenames in os.walk(directory_path):
    for filename in filenames:
        if not filename.endswith('.h5'):
            continue

        hdf_file_path = os.path.join(root, filename)

        # The context manager closes the file even when a group is missing or
        # a read fails. The `[:]` slices read the datasets into memory, so
        # nothing below needs the file to stay open.
        with h5py.File(hdf_file_path, 'r') as hdf:
            analysis_data = hdf['/analysis/songs/'][:]
            metadata_data = hdf['/metadata/songs/'][:]
            musicbrainz_data = hdf['/musicbrainz/songs/'][:]

        # Put the three per-file song tables side by side (column-wise).
        combined_data = pd.concat(
            [pd.DataFrame(analysis_data), pd.DataFrame(metadata_data), pd.DataFrame(musicbrainz_data)],
            axis=1,
        )
        data_list.append(combined_data)

# pd.concat fails on an empty list with a generic "No objects to concatenate";
# raise the same exception type with a message that names the actual cause.
if not data_list:
    raise ValueError(f"No .h5 files found under {directory_path!r}")

# Stack the per-file frames into one frame with a fresh 0..n-1 index.
final_data = pd.concat(data_list, ignore_index=True)

In [51]:
# Preview the first rows of the combined frame.
final_data.head()

Unnamed: 0,analysis_sample_rate,audio_md5,danceability,duration,end_of_fade_in,energy,idx_bars_confidence,idx_bars_start,idx_beats_confidence,idx_beats_start,...,idx_artist_terms,idx_similar_artists,release,release_7digitalid,song_hotttnesss,song_id,title,track_7digitalid,idx_artist_mbtags,year
0,22050,b'aee9820911781c734e7694c5432990ca',0.0,252.05506,2.049,0.0,0,0,0,0,...,0,0,b'Monster Ballads X-Mas',633681,0.542899,b'SOQMMHC12AB0180CB8',b'Silent Night',7032331,0,2003
1,22050,b'ed222d07c83bac7689d52753610a513a',0.0,156.55138,0.258,0.0,0,0,0,0,...,0,0,b'Karkuteill\xc3\xa4',145266,0.299877,b'SOVFVAK12A8C1350D9',b'Tanssi vaan',1514808,0,1995
2,22050,b'96c7104889a128fef84fa469d60e380c',0.0,138.97098,0.0,0.0,0,0,0,0,...,0,0,b'Butter',625706,0.617871,b'SOGTUKN12AB017F4F1',b'No One Could Ever',6945353,0,2006
3,22050,b'0f7da84b6b583e3846c7e022fb3a92a2',0.0,145.05751,0.0,0.0,0,0,0,0,...,0,0,b'De Culo',199368,,b'SOBNYVR12A8C13558C',b'Si Vos Quer\xc3\xa9s',2168257,0,2003
4,22050,b'228dd6392ad8001b0281f533f34c72fd',0.0,514.29832,0.0,0.0,0,0,0,0,...,0,0,b'Rene Ablaze Presents Winter Sessions',209038,,b'SOHSBXH12A8C13B0DF',b'Tangle Of Aspens',2264873,0,0


In [52]:
# Convert the byte-string columns to text.
bytes_columns = ['audio_md5', 'release', 'song_id', 'title',
                 'artist_name', 'artist_id', 'track_id', 'genre']

for column in bytes_columns:
    # Decode only the values that are still bytes. `.str.decode` yields NaN
    # for values that are already str, so re-running the cell would silently
    # wipe these columns; this form can safely be run more than once.
    final_data[column] = final_data[column].map(
        lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
    )

In [53]:
# The "b'...'" seen in the preview is only how Python displays bytes; it is
# not part of the data. Once the columns are decoded to text there is no "b"
# prefix to remove, and stripping a leading "b" would corrupt real values
# that start with that letter (e.g. an MD5 such as "bb97..." or a lowercase
# title). Nothing is stripped here; the cell only verifies the decoding.
text_columns = ['audio_md5', 'release', 'song_id', 'title',
                'artist_name', 'artist_id', 'track_id', 'genre']

for column in text_columns:
    still_bytes = final_data[column].map(lambda value: isinstance(value, bytes))
    assert not still_bytes.any(), f"{column} still contains undecoded bytes"

Filtrage des colonnes, pour ne garder que celles qui sont intéressantes

In [54]:
# Keep only the columns relevant to the analysis.
# ('Column1' was removed from this list: the frame has no such column and
# DataFrame.filter silently ignores unknown labels, so it never had any effect.)
columns_to_filter = ['analysis_sample_rate', 'duration', 'track_id', 'genre', 'danceability', 'loudness', 'tempo']

In [55]:
# Narrow the frame to the selected labels; labels absent from the frame are skipped.
final_data = final_data.filter(items=columns_to_filter)

In [56]:
# Preview the first rows after the column filter.
final_data.head()

Unnamed: 0,analysis_sample_rate,duration,track_id,genre,danceability,loudness,tempo
0,22050,252.05506,TRMMMYQ128F932D901,,0.0,-4.829,87.002
1,22050,156.55138,TRMMMKD128F425225D,,0.0,-10.555,150.778
2,22050,138.97098,TRMMMRX128F93187D9,,0.0,-2.06,177.768
3,22050,145.05751,TRMMMCH128F425532C,,0.0,-4.654,87.433
4,22050,514.29832,TRMMMWA128F426B589,,0.0,-7.806,140.035


In [57]:
# Persist the filtered frame as a UTF-8, comma-separated CSV without the index column.
final_data.to_csv(
    './Data/final_data.csv',
    index=False,
    sep=',',
    encoding='utf-8',
)

In [58]:
# Load the per-track genre labels (UTF-8, comma-separated).
genres = pd.read_csv(
    './Data/genre_cls_beatunes.csv',
    encoding='utf-8',
    sep=',',
)

In [59]:
# Preview the first rows of the genre labels.
genres.head()

Unnamed: 0,track_id,seed-genre
0,TRAAAAK128F9318786,Rock
1,TRAAAAV128F421A322,Rock
2,TRAAAAW128F429D538,Hip-Hop
3,TRAAAAY128F42A73F0,World
4,TRAAABD128F429CF47,Rock


In [60]:
# Attach the genre labels to the songs by track_id. This is an inner join
# (pandas' default), so only track_ids present in both frames are kept.
final_csv = pd.merge(
    left=final_data,
    right=genres,
    how='inner',
    on='track_id',
)

In [61]:
# Persist the merged frame as a UTF-8, comma-separated CSV without the index column.
final_csv.to_csv(
    './Data/final_csv.csv',
    index=False,
    sep=',',
    encoding='utf-8',
)

In [67]:
# Load the songs-with-genre table from disk.
# NOTE(review): no visible cell writes './Data/data_with_genre.csv' (the cells
# above write final_data.csv and final_csv.csv), and the execution counter jumps
# from 61 to 67 - presumably the file was produced by cells that are no longer
# in the notebook; confirm its origin before relying on a fresh "Run All".
data_with_genre = pd.read_csv('./Data/data_with_genre.csv', sep=',', encoding='utf-8')

In [68]:
# Display the loaded frame (the notebook renders a truncated view of it).
data_with_genre

Unnamed: 0,analysis_sample_rate,audio_md5,danceability,duration,end_of_fade_in,energy,idx_bars_confidence,idx_bars_start,idx_beats_confidence,idx_beats_start,...,idx_similar_artists,release,release_7digitalid,song_hotttnesss,song_id,title,track_7digitalid,idx_artist_mbtags,year,genre
0,22050,a222795e07cd65b7a530f1346f520649,0.0,218.93179,0.247,0.0,0,0,0,0,...,0,Fear Itself,300848,0.602120,SOMZWCG12A8C13C480,I Didn't Mean To,3401791,0,0,Hip-Hop
1,22050,bb9771eeef3d5b204a3c55e690f52a91,0.0,148.03546,0.148,0.0,0,0,0,0,...,0,Dimensions,300822,,SOCIWDW12A8C13D406,Soul Deep,3400270,0,1969,Rock
2,22050,fa329738005ca53715d9f7381a0d1fe3,0.0,177.47546,0.282,0.0,0,0,0,0,...,0,Las Numero 1 De La Sonora Santanera,514953,,SOXVLOJ12AB0189215,Amor De Cabaret,5703798,0,0,Latin
3,22050,43cd1abd45d5a2dda16a3c65b4963bd4,0.0,233.40363,0.000,0.0,0,0,0,0,...,0,Friend Or Foe,287650,,SONHOTT12A8C13493C,Something Girls,3226795,0,1982,Rock
4,22050,580a8fe08ef0f1c7734b84547d7a8bc7,0.0,209.60608,0.066,0.0,0,0,0,0,...,0,Muertos Vivos,611336,0.604501,SOFSOCN12A8C143F5D,Face the Ashes,6795666,0,2007,Rock
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6541,22050,6a41147ba37254d26d4552f6a73f91d6,0.0,172.25098,0.097,0.0,0,0,0,0,...,0,Praise God I'm Satisfied,597439,,SOVMTAW12A8C13B071,God Don't Never Change,6630898,0,1989,Rock
6542,22050,3f28a2a4bd6b99b8a0c4abe5df0d1f0e,0.0,386.19383,0.177,0.0,0,0,0,0,...,0,Sin / Pecado,691752,0.594080,SOLXXPY12A67ADABA0,The Hanged Man,7677054,0,1998,Rock
6543,22050,4296453097175a6b912f20ed69875e6b,0.0,168.01914,0.403,0.0,0,0,0,0,...,0,Collection,41649,0.334707,SOAYONI12A6D4F85C8,The Wonderful World Of The Young,442366,0,1998,Pop
6544,22050,36759353d4d181efbcddddb0076a5286,0.0,300.82567,0.000,0.0,0,0,0,0,...,0,Nouveau Zydeco,86259,0.000000,SORZSCJ12A8C132446,Zydeco In D-Minor,904098,0,0,World
