In [1]:
## Import necessary libraries
import pandas as pd
import ast
import missingno as msno
import networkx as nx
import matplotlib.pyplot as plt

In [2]:
## Input CSV locations inside the fma_metadata dataset folder (relative paths)
GENRE_FILE = '../datasets/fma_metadata/genres.csv'
TRACK_FILE = '../datasets/fma_metadata/tracks.csv'

In [5]:
## Read the tracks file, flatten its two-row header, and drop the leftover first row
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up with a different type per
# chunk (the situation pandas reports as a mixed-types DtypeWarning).
track_df = pd.read_csv(TRACK_FILE, header=[0, 1], low_memory=False)
# Collapse each (level_0, level_1) header pair into a single 'level0_level1' name,
# e.g. the 'track_genres' and 'set_subset' columns used further down.
new_columns = ['_'.join(col).strip() for col in track_df.columns.values]
track_df.columns = new_columns
# The first column is unnamed at both header levels, so pandas auto-generates its
# name; give it a meaningful one.
track_df = track_df.rename(columns={'Unnamed: 0_level_0_Unnamed: 0_level_1': 'track_id'})
# Skip the first data row.
# NOTE(review): presumably this is a third header line of tracks.csv that
# header=[0, 1] leaves behind as data -- confirm against the file.
track_df = track_df.iloc[1:]

  track_df = pd.read_csv(TRACK_FILE, header=[0,1])


In [7]:
## Load the genre table from GENRE_FILE
genre_df = pd.read_csv(filepath_or_buffer=GENRE_FILE)

In [8]:
## Build the genre hierarchy as a directed graph (edges run parent -> child)
G = nx.DiGraph()

# First pass: register every genre as a node, labelled with its title.
# Kept separate from the edge pass so nodes are inserted in genre_df row order.
G.add_nodes_from(
    (genre['genre_id'], {'label': genre['title']})
    for _, genre in genre_df.iterrows()
)

# Second pass: connect each genre to its parent wherever a parent value is present.
G.add_edges_from(
    (genre['parent'], genre['genre_id'])
    for _, genre in genre_df.iterrows()
    if pd.notna(genre['parent'])
)

In [65]:
def get_lineage(node, G, genre_df):
    """Trace a genre back to its top-most ancestor.

    Starting at ``node``, repeatedly steps to the first predecessor reported by
    ``G.predecessors`` until a node without predecessors is reached.

    Args:
        node: Genre id to start from. The id ``0`` is named ``'root'`` instead
            of being looked up in ``genre_df``.
        G: Graph-like object; only ``G.predecessors(node)`` is used.
        genre_df: DataFrame with ``genre_id`` and ``title`` columns, used to
            resolve ids to names.

    Returns:
        A ``(lineage_ids, lineage_names)`` tuple of equal-length lists, ordered
        from the top-most ancestor down to ``node``. Ancestors whose id is not
        greater than 0 are skipped.

    Raises:
        ValueError: If following predecessors revisits a node (a cycle or a
            self-parent), which would otherwise loop forever.
        IndexError: If a visited id has no matching row in ``genre_df``.
    """
    def title_of(genre_id):
        # First title among the rows whose genre_id matches.
        return genre_df[genre_df['genre_id'] == genre_id]['title'].values[0]

    lineage_ids = [node]
    lineage_names = ['root'] if node == 0 else [title_of(node)]

    visited = {node}
    parents = list(G.predecessors(node))
    while parents:
        # Only the first predecessor is followed.
        node = parents[0]
        if node in visited:
            raise ValueError(f'cycle detected in genre hierarchy at node {node!r}')
        visited.add(node)
        if node > 0:
            lineage_ids.append(node)
            lineage_names.append(title_of(node))
        parents = list(G.predecessors(node))
    return lineage_ids[::-1], lineage_names[::-1]  # Return in root-to-leaf order


In [66]:
# Collect the root-to-leaf lineage (ids and names) of every node in the genre graph
lineages_ids = {}
lineages_names = {}
for genre in G.nodes():
    ids, names = get_lineage(genre, G, genre_df)
    lineages_ids[genre] = ids
    lineages_names[genre] = names

# Transform lineage into DataFrame: one row per genre, one column per depth level.
# Lineages shorter than the deepest one are padded with missing values.
lineage_df_ids = pd.DataFrame.from_dict(lineages_ids, orient='index').reset_index()
lineage_df_names = pd.DataFrame.from_dict(lineages_names, orient='index').reset_index()
lineage_df_ids.columns = ['genre_id'] + [f'depth_{i+1}_genre_id' for i in range(lineage_df_ids.shape[1]-1)]
lineage_df_names.columns = ['genre_id'] + [f'depth_{i+1}_genre_name' for i in range(lineage_df_names.shape[1]-1)]

# Handle missing depths: carry the deepest known genre forward into deeper columns.
# The result is assigned back explicitly. Calling fillna(inplace=True) on
# df.iloc[:, i] mutates a temporary selection, which under pandas Copy-on-Write
# no longer updates the parent DataFrame.
id_cols = lineage_df_ids.columns
name_cols = lineage_df_names.columns
for i in range(2, lineage_df_ids.shape[1]):
    lineage_df_ids[id_cols[i]] = lineage_df_ids[id_cols[i]].fillna(lineage_df_ids[id_cols[i-1]])
    lineage_df_names[name_cols[i]] = lineage_df_names[name_cols[i]].fillna(lineage_df_names[name_cols[i-1]])


In [69]:
# Join the id lineage and the name lineage into one table, matched on genre_id
lineage_df = lineage_df_ids.merge(lineage_df_names, on='genre_id')

Unnamed: 0,genre_id,depth_1_genre_id,depth_2_genre_id,depth_3_genre_id,depth_4_genre_id,depth_5_genre_id,depth_1_genre_name,depth_2_genre_name,depth_3_genre_name,depth_4_genre_name,depth_5_genre_name
0,1,38,1.0,1.0,1.0,1.0,Experimental,Avant-Garde,Avant-Garde,Avant-Garde,Avant-Garde
1,2,2,2.0,2.0,2.0,2.0,International,International,International,International,International
2,3,3,3.0,3.0,3.0,3.0,Blues,Blues,Blues,Blues,Blues
3,4,4,4.0,4.0,4.0,4.0,Jazz,Jazz,Jazz,Jazz,Jazz
4,5,5,5.0,5.0,5.0,5.0,Classical,Classical,Classical,Classical,Classical
...,...,...,...,...,...,...,...,...,...,...,...
159,1060,2,46.0,1060.0,1060.0,1060.0,International,Latin America,Tango,Tango,Tango
160,1156,2,130.0,1156.0,1156.0,1156.0,International,Europe,Fado,Fado,Fado
161,1193,38,6.0,16.0,763.0,1193.0,Experimental,Novelty,Sound Effects,Holiday,Christmas
162,1235,1235,1235.0,1235.0,1235.0,1235.0,Instrumental,Instrumental,Instrumental,Instrumental,Instrumental


In [11]:
## Transform track_df so that every single row belongs to only one genre
# Parse the textual Python literal stored in 'track_genres' into an object
# (presumably a list of genre ids -- TODO confirm), then emit one row per element.
track_df['track_genres_lst'] = track_df['track_genres'].map(ast.literal_eval)
track_df_single_genre = track_df.explode(column='track_genres_lst')

In [71]:
## Attach the genre lineage to each track row
# Left join: every track row is kept, even when its genre has no lineage entry.
merged_df = pd.merge(
    track_df_single_genre,
    lineage_df,
    how='left',
    left_on='track_genres_lst',
    right_on='genre_id',
)

In [72]:
# merged_df.to_csv('./datasets/tracks_with_genre.csv')

In [73]:
## Keep only the rows whose 'set_subset' is 'small' and save them to CSV
# NOTE(review): the inputs are read from '../datasets/...' but this writes under
# './datasets/' -- confirm that the output directory is the intended one.
is_small = merged_df['set_subset'].eq('small')
merged_df_small = merged_df.loc[is_small]
merged_df_small.to_csv('./datasets/tracks_with_genre_small.csv')
