In [5]:
## This code is to make genre hierarchy based on multi sub-genre tracks.
## May not used for training model, since we will use top genre as our label. But used for preprocessing.

import pandas as pd
import ast
import missingno as msno
import networkx as nx
import matplotlib.pyplot as plt

In [6]:
## Define file locations
TRACK_FILE = 'raw/meta/tracks.csv'
GENRE_FILE = 'raw/meta/genres.csv'

In [7]:
## Read tracks file, change column names, remove unnecessary rows
track_df = pd.read_csv(TRACK_FILE, header=[0,1])
new_columns = ['_'.join(col).strip() for col in track_df.columns.values]
track_df.columns = new_columns
track_df = track_df.rename(columns={'Unnamed: 0_level_0_Unnamed: 0_level_1': 'track_id'})
track_df = track_df.iloc[1:]

  track_df = pd.read_csv(TRACK_FILE, header=[0,1])


In [8]:
## Read genres file
genre_df = pd.read_csv(GENRE_FILE)

In [9]:
## Create directed graph (Tree) of genre
G = nx.DiGraph()

for index, row in genre_df.iterrows():
    G.add_node(row['genre_id'], label=row['title'])
    
for index, row in genre_df.iterrows():
    if pd.notna(row['parent']):
        G.add_edge(row['parent'], row['genre_id'])

In [10]:
def get_lineage(node, G, genre_df):
    lineage_ids = [node]
    if node == 0:
        lineage_names = ['root']
    else:
        lineage_names = [genre_df[genre_df['genre_id'] == node]['title'].values[0]]
    while list(G.predecessors(node)):
        node = list(G.predecessors(node))[0]
        if node > 0:        
            lineage_ids.append(node)
            lineage_names.append(genre_df[genre_df['genre_id'] == node]['title'].values[0])
    return lineage_ids[::-1], lineage_names[::-1]  # Return in root-to-leaf order


In [11]:
lineages_ids = {}
lineages_names = {}
for genre in G.nodes():
    ids, names = get_lineage(genre, G, genre_df)
    lineages_ids[genre] = ids
    lineages_names[genre] = names

# Transform lineage into DataFrame
lineage_df_ids = pd.DataFrame.from_dict(lineages_ids, orient='index').reset_index()
lineage_df_names = pd.DataFrame.from_dict(lineages_names, orient='index').reset_index()
lineage_df_ids.columns = ['genre_id'] + [f'depth_{i+1}_genre_id' for i in range(lineage_df_ids.shape[1]-1)]
lineage_df_names.columns = ['genre_id'] + [f'depth_{i+1}_genre_name' for i in range(lineage_df_names.shape[1]-1)]

# Handle missing depths
for i in range(2, lineage_df_ids.shape[1]):
    lineage_df_ids.iloc[:, i].fillna(lineage_df_ids.iloc[:, i-1], inplace=True)
    lineage_df_names.iloc[:, i].fillna(lineage_df_names.iloc[:, i-1], inplace=True)


In [12]:
lineage_df = pd.merge(lineage_df_ids, lineage_df_names, on='genre_id')

In [13]:
## Transfrom the track_df so that the every single row belongs to only one genre
track_df['track_genres_lst'] = track_df['track_genres'].apply(ast.literal_eval)
track_df_single_genre = track_df.explode('track_genres_lst')

In [14]:
## Merge genre information into tracks
merged_df = track_df_single_genre.merge(lineage_df, left_on='track_genres_lst', right_on='genre_id', how='left')

In [33]:
## filter out for the purpose
merged_df_medium = merged_df[merged_df['set_subset'].isin(['medium', 'small'])]
merged_df_medium.to_csv('processed/tracks_with_genre_medium.csv')


In [34]:
len(merged_df_medium['track_id'].unique())

25000

In [35]:
merged_df_medium.head()

Unnamed: 0,track_id,album_comments,album_date_created,album_date_released,album_engineer,album_favorites,album_id,album_information,album_listens,album_producer,...,depth_1_genre_id,depth_2_genre_id,depth_3_genre_id,depth_4_genre_id,depth_5_genre_id,depth_1_genre_name,depth_2_genre_name,depth_3_genre_name,depth_4_genre_name,depth_5_genre_name
0,2,0.0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4.0,1.0,<p></p>,6073.0,,...,21.0,21.0,21.0,21.0,21.0,Hip-Hop,Hip-Hop,Hip-Hop,Hip-Hop,Hip-Hop
1,3,0.0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4.0,1.0,<p></p>,6073.0,,...,21.0,21.0,21.0,21.0,21.0,Hip-Hop,Hip-Hop,Hip-Hop,Hip-Hop,Hip-Hop
2,5,0.0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4.0,1.0,<p></p>,6073.0,,...,21.0,21.0,21.0,21.0,21.0,Hip-Hop,Hip-Hop,Hip-Hop,Hip-Hop,Hip-Hop
3,10,0.0,2008-11-26 01:45:08,2008-02-06 00:00:00,,4.0,6.0,,47632.0,,...,10.0,10.0,10.0,10.0,10.0,Pop,Pop,Pop,Pop,Pop
14,134,0.0,2008-11-26 01:44:45,2009-01-05 00:00:00,,4.0,1.0,<p></p>,6073.0,,...,21.0,21.0,21.0,21.0,21.0,Hip-Hop,Hip-Hop,Hip-Hop,Hip-Hop,Hip-Hop


In [36]:
merged_df_medium.set_split.value_counts()

training      34316
validation     4434
test           4255
Name: set_split, dtype: int64

In [37]:
# The following code is for future use for predicting multiple sub genres.

# merged_df_med_training = merged_df_medium[merged_df_medium.set_split == 'training']
# merged_df_med_test = merged_df_medium[merged_df_medium.set_split == 'test']
# merged_df_med_valid = merged_df_medium[merged_df_medium.set_split == 'validation']

In [38]:
# merged_df_med_training.to_csv('./processed/25Ktracks_with_genre_training.csv')
# merged_df_med_test.to_csv('./processed/25Ktracks_with_genre_test.csv')
# merged_df_med_valid.to_csv('./processed/25Ktracks_with_genre_validation.csv')
# merged_df_medium.to_csv('./processed/25Ktracks_with_genre_all.csv')