In [12]:
import os.path as osp
import pandas as pd
import numpy as np

In [35]:
def mcGillPreProc(sample_range = range(1301), working_dir = "", output_list = True, max_frame_num = 15122):
    """
    Preprocessing for the McGill_Billboard dataset.

    For every song number in `sample_range`, reads
    '<working_dir>/Chroma_vector/NNNN/bothchroma.csv' and
    '<working_dir>/MIREX_style/NNNN/majmin.lab' (NNNN = zero-padded song number),
    and assigns a chord index to every chromagram frame. Song numbers for which
    either directory is missing are silently skipped.

    Input:
        sample_range : Iterable of song numbers targeted for analysis. The maximal song number is 1300.
        working_dir : Directory path till 'McGill_Billboard/'
        output_list : Whether or not to output in Python 'List' format. If opted out, chromagrams will be zero-padded
                    to form matrices of uniform dimension (max_frame_num, 25). However, a typical 3 minute song has ~4000
                    frames, while the longest song has 15122. Therefore, it is generally not an efficient way to use array.
                    A list with appropriate dimensionality can be easily converted to a numpy array by numpy.asarray().
        max_frame_num : Only relevant when output_list = False. The length to which each chromagram will be zero-padded.
    Output:
        chroma : List of length M, or array of dimension (M, max_frame_num, 25), where M is the number of valid samples.
                (m, nf, 1:25) is the chromagram of song m in time frame nf. (m, :, 0) is the timeline.
        chord : List of length M, or array of dimension (M, max_frame_num, 2), where M is the number of valid samples.
                (m, nf, 1) is the chord index in time frame nf. (m, :, 0) is the timeline.
                NOTE: frames that fall outside every [start, end) chord interval keep the initial label 0,
                which is also the index given to the first chord name encountered.
        dict_chord2idx : Dictionary. Mapping from chord name (as written in the .lab file) to integer,
                numbered in order of first appearance across the processed songs.
        dict_idx2chord : Dictionary. The inverse of dict_chord2idx.
        song_num : List. The actual file number of each song that was found and processed.
    Raises:
        ValueError : If output_list = False and a song has more than max_frame_num frames.
    """
    chroma_base = "Chroma_vector"
    chroma_filename = "bothchroma.csv"
    chord_base = "MIREX_style"
    chord_filename = "majmin.lab"

    dict_chord2idx = dict()
    dict_idx2chord = dict()

    song_num = []
    chroma_list = []
    chord_list = []

    for sample_num in sample_range:
        song_id = '{:0>4}'.format(sample_num)
        chroma_dir = osp.join(working_dir, chroma_base, song_id)
        chord_dir = osp.join(working_dir, chord_base, song_id)
        if not (osp.isdir(chroma_dir) and osp.isdir(chord_dir)):
            continue

        chroma_dat = pd.read_csv(osp.join(chroma_dir, chroma_filename), header=None)
        # Column 0 is discarded; column 1 (time) and the remaining columns are kept.
        chroma_dat = chroma_dat.drop(labels=0, axis=1)
        chord_dat = pd.read_csv(osp.join(chord_dir, chord_filename), delimiter='\t', header=None)

        # `.values` instead of the removed `DataFrame.as_matrix()`; works on old and new pandas.
        chroma_ready = chroma_dat.values
        num_frames = chroma_ready.shape[0]
        if not output_list and num_frames > max_frame_num:
            raise ValueError(
                "Song {} has {} frames, which exceeds max_frame_num = {}".format(
                    sample_num, num_frames, max_frame_num))

        frame_time = chroma_dat[1].values
        chord_label = np.zeros((num_frames, 2))
        chord_label[:, 0] = frame_time  # timeline; set once, independent of the chord rows

        # Each .lab row is (start, end, chord name). Register unseen chord names and
        # label every frame whose time lies in [start, end). Later rows overwrite earlier ones.
        for start, end, name in zip(chord_dat[0], chord_dat[1], chord_dat[2]):
            if name not in dict_chord2idx:
                idx_chord = len(dict_chord2idx)
                dict_chord2idx[name] = idx_chord
                dict_idx2chord[idx_chord] = name
            sel_time = np.logical_and(frame_time >= start, frame_time < end)
            chord_label[sel_time, 1] = dict_chord2idx[name]

        song_num.append(sample_num)
        chroma_list.append(chroma_ready)
        chord_list.append(chord_label)

    if output_list:
        return chroma_list, chord_list, dict_chord2idx, dict_idx2chord, song_num

    # Allocate the zero-padded output once instead of re-concatenating per song.
    chroma = np.zeros((len(chroma_list), max_frame_num, 25))
    chord = np.zeros((len(chord_list), max_frame_num, 2))
    for k, (song_chroma, song_chord) in enumerate(zip(chroma_list, chord_list)):
        chroma[k, :song_chroma.shape[0], :] = song_chroma
        chord[k, :song_chord.shape[0], :] = song_chord

    return chroma, chord, dict_chord2idx, dict_idx2chord, song_num


In [36]:
# Root of the local McGill_Billboard dataset (machine-specific path; adjust for your setup).
working_dir = "/Users/charleschen/Documents/Courses/CS230/dataset/McGill_Billboard"

# Pre-process song numbers 0-19 into zero-padded arrays (output_list=False).
chroma, chord, dict_chord2idx, dict_idx2chord, song_num = mcGillPreProc(
    sample_range=range(20),
    working_dir=working_dir,
    output_list=False,
)