In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

In [2]:
# Module-level handles for the datasets; they stay None until set_assets() runs.
df_tr, df_val, df_te, df_gr, df_sm = None, None, None, None, None

def set_assets(
    is_need_song_meta=False,
):
    """Load the JSON datasets from the working directory into module globals.

    Reads ./train.json, ./val.json and ./test.json into ``df_tr``, ``df_val``
    and ``df_te``, and ./genre_gn_all.json into ``df_gr`` (two columns:
    ``code`` and ``desc``). The shape of every loaded frame is printed.

    Parameters
    ----------
    is_need_song_meta : bool, default False
        When true, ./song_meta.json is additionally loaded into ``df_sm``.
        Otherwise ``df_sm`` is left untouched.
    """
    global df_tr, df_val, df_te, df_gr, df_sm

    df_tr = pd.read_json('./train.json')
    df_val = pd.read_json('./val.json')
    df_te = pd.read_json('./test.json')
    # The genre file is parsed as a Series and reset_index() turns it into
    # two columns. The documented values for `typ` are 'frame' and 'series';
    # the previous 'dataframe' only worked through pandas' fallback for an
    # unrecognised value.
    df_gr = pd.read_json(
        './genre_gn_all.json',
        typ='series'
    ).reset_index()
    df_gr.columns = ['code', 'desc']

    print('shape of df_tr', df_tr.shape)
    print('shape of df_val', df_val.shape)
    print('shape of df_te', df_te.shape)
    print('shape of df_gr', df_gr.shape)

    if is_need_song_meta:
        df_sm = pd.read_json('./song_meta.json')
        print('shape of df_song_meta', df_sm.shape)

In [26]:
def save_obj_by_numpy(filepath, obj):
    """Write ``obj`` to ``filepath`` in NumPy's binary ``.npy`` format.

    The data goes through an explicitly opened handle, so the file is created
    under exactly the name given, and the ``with`` block guarantees the handle
    is flushed and closed even if ``np.save`` raises.
    """
    with open(filepath, 'wb') as fh:
        np.save(fh, obj)

def load_numpy(filepath):
    """Load and return the object stored at ``filepath`` by ``np.save``.

    The path is handed straight to ``np.load`` so that NumPy opens the file
    and manages the handle itself, instead of receiving a handle from a bare
    ``open()`` call that nothing ever closes.
    """
    return np.load(filepath)

def explain_song_list(
    df,
    index=None,
    song_ids=None,
):
    """Describe a set of songs using the song and genre metadata tables.

    Reads the module-level ``df_sm`` (song metadata) and ``df_gr`` (genre
    code/description table); ``df_sm`` is only populated by
    ``set_assets(True)``.

    Parameters
    ----------
    df : DataFrame
        Playlist table with a ``songs`` column. Only consulted when ``index``
        is given.
    index : int, optional
        Positional row of ``df`` whose ``songs`` entry supplies the song ids.
        Takes precedence over ``song_ids``.
    song_ids : list-like, optional
        Song ids to describe; used when ``index`` is None.

    Returns
    -------
    (DataFrame, Counter)
        One row per matching ``df_sm`` row, in ``df_sm`` order, with the genre
        codes replaced by their descriptions; and a tally of the top-level
        genre descriptions over those rows.

    Raises
    ------
    TypeError
        If neither ``index`` nor ``song_ids`` is supplied.
    """
    from collections import Counter

    def code2desc(genre_codes):
        # Descriptions come back in df_gr row order, not in basket order.
        return df_gr[
            df_gr.code.isin(genre_codes)
        ].desc.values

    if index is not None:
        song_ids = df.iloc[index].songs
    if song_ids is None:
        # Fail here with a readable message rather than inside Series.isin.
        raise TypeError(
            'explain_song_list() needs either `index` or `song_ids`'
        )

    cols = [
        'artist_name_basket',
        'song_gn_dtl_gnr_basket',
        'song_gn_gnr_basket',
        'song_name',
    ]
    tmp = df_sm[df_sm.id.isin(song_ids)][cols]

    song_desc = []
    gn_counter = Counter()
    for artist, gn_dtl, gn, name in tmp.values:
        gn_dtl = code2desc(gn_dtl)
        gn = code2desc(gn)
        gn_counter.update(gn)
        song_desc.append((artist, gn_dtl, gn, name))

    return pd.DataFrame(song_desc, columns=cols), gn_counter

In [12]:
# Describe the songs of the training playlist at position 2.
# explain_song_list reads df_sm and df_gr, so set_assets(True) must already have run.
songs, cnt = explain_song_list(df_tr, index=2)

In [3]:
# Load every dataset, including the song metadata table (df_sm).
set_assets(is_need_song_meta=True)

shape of df_tr (115071, 6)
shape of df_val (23015, 6)
shape of df_te (10740, 6)
shape of df_gr (254, 2)
shape of df_song_meta (707989, 9)


In [4]:
tr_lists, val_lists = df_tr.songs.values, df_val.songs.values

# Distinct song ids referenced by the training playlists ...
set_of_song_ids = {
    song_id for playlist in tr_lists for song_id in playlist
}
# ... and by the validation playlists.
set_of_song_ids2 = {
    song_id for playlist in val_lists for song_id in playlist
}

In [5]:
# Does every song used in a validation playlist also occur in a training playlist?
set_of_song_ids >= set_of_song_ids2

False

In [6]:
# All distinct song ids present in the song metadata table.
set_of_all_songs = set(df_sm['id'].unique())

In [7]:
# Is every song used by the training / validation playlists present in the metadata?
(set_of_all_songs >= set_of_song_ids, set_of_all_songs >= set_of_song_ids2)

(True, True)

In [9]:
# Genre code -> row position of that code in df_gr (the last position wins if a code repeats).
code2idx = dict(zip(df_gr.code, range(len(df_gr.code))))
# Reverse lookup: position -> genre code.
idx2code = {
    position: genre_code
    for genre_code, position in code2idx.items()
}

In [15]:
# One row per training playlist, one column per genre code; each cell counts
# how often that genre code appears among the playlist's songs.
num_of_songs = len(df_tr.songs)  # number of training playlists (rows of df_tr)
num_of_genre_codes = len(df_gr.code.unique())
song_lists_emb = np.zeros(
    (num_of_songs, num_of_genre_codes)
)

# Gather every song's genre codes (detail + top-level) once, so each playlist
# needs only dictionary lookups instead of a full isin() scan over df_sm.
# Codes from repeated ids are concatenated, matching what the mask selected.
genres_by_song = {}
all_genres = df_sm.song_gn_dtl_gnr_basket + df_sm.song_gn_gnr_basket
for song_id, genres in zip(df_sm.id, all_genres):
    genres_by_song.setdefault(song_id, []).extend(genres)

# Progress is printed every 5% of the rows; max(1, ...) keeps the modulo
# below valid when there are fewer than 20 playlists.
five_percent_index = max(1, int(num_of_songs * 0.05))
for row_idx, songs in enumerate(df_tr.songs.values):
    if row_idx % five_percent_index == 0:
        print(row_idx, (row_idx / five_percent_index))

    # set(): a song listed twice in a playlist is still counted once,
    # exactly as with the former isin() mask.
    for song_id in set(songs):
        for genre in genres_by_song.get(song_id, ()):
            if genre in code2idx:
                song_lists_emb[row_idx, code2idx[genre]] += 1

0 0.0
5753 1.0
11506 2.0
17259 3.0
23012 4.0
28765 5.0
34518 6.0
40271 7.0
46024 8.0
51777 9.0
57530 10.0
63283 11.0
69036 12.0
74789 13.0
80542 14.0
86295 15.0
92048 16.0
97801 17.0
103554 18.0
109307 19.0
115060 20.0


In [25]:
# Show the playlist-by-genre count matrix (NumPy abbreviates the repr of large arrays).
song_lists_emb

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [11., 11.,  0., ...,  0.,  0.,  0.],
       [ 2.,  2.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 8.,  8.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [27]:
# Persist the count matrix to the working directory.
save_obj_by_numpy(filepath='song_lists_emb.npy', obj=song_lists_emb)

In [28]:
from scipy.spatial.distance import pdist, cdist

In [65]:
# Cosine distance from the first playlist vector to every playlist vector;
# the result has a single row.
dist = cdist(song_lists_emb[:1], song_lists_emb, metric='cosine')

In [66]:
# Reference playlist: the first training playlist.
songs0, cnt0 = explain_song_list(df_tr, 0)

# Visit the playlists at positions 11..20 of the ascending distance ranking
# and print which song names each one shares with the reference playlist.
for list_index in np.argsort(dist)[0, 11:21]:
    songs1, cnt1 = explain_song_list(df_tr, list_index)
    intersection = np.intersect1d(
        songs0['song_name'].values,
        songs1['song_name'].values,
    )
    print(list_index, intersection)

85001 []
97585 []
10809 []
2983 ['Hey Little Girl' 'Octagon']
14233 ['Count On Me']
71388 []
98594 ['Run, Run, Run']
8299 ['Count On Me']
56458 ['Count On Me' 'No Ceiling']
3026 []


In [None]:
# Pairwise cosine distances between all playlist vectors, in SciPy's condensed
# form (a 1-D array with n*(n-1)/2 entries). This rebinds `dist`, replacing
# the single-row cdist result computed earlier.
# NOTE(review): with the 115071 training playlists reported by set_assets this
# is roughly 6.6e9 float64 values (~53 GB); the "In [None]" prompt suggests the
# cell never ran to completion — confirm before re-running.
dist = pdist(
    song_lists_emb,
    metric='cosine'
)

In [41]:
# Preview the four descriptive columns for the first five songs.
df_sm.loc[
    :,
    [
        'artist_name_basket',
        'song_gn_dtl_gnr_basket',
        'song_gn_gnr_basket',
        'song_name',
    ],
].head(5)

Unnamed: 0,artist_name_basket,song_gn_dtl_gnr_basket,song_gn_gnr_basket,song_name
0,[Various Artists],[GN0901],[GN0900],Feelings
1,[Murray Perahia],"[GN1601, GN1606]",[GN1600],"Bach : Partita No. 4 In D Major, BWV 828 - II...."
2,[Peter Gabriel],[GN0901],[GN0900],Solsbury Hill (Remastered 2002)
3,[Matoma],"[GN1102, GN1101]",[GN1100],Feeling Right (Everything Is Nice) (Feat. Popc...
4,[Jude Law],"[GN1802, GN1801]",[GN1800],그남자 그여자


In [66]:
# Describe the songs of the training playlist at position 1 and tally its genres.
song_desc, cnt = explain_song_list(df_tr, index=1)

In [60]:
# Show the genre tally (a Counter) returned by the most recent explain_song_list call.
cnt

Counter({'발라드': 11,
         'OST': 4,
         '록/메탈': 11,
         '뉴에이지': 3,
         '재즈': 7,
         '클래식': 4,
         '일렉트로니카': 1,
         '인디음악': 7,
         '포크/블루스': 2,
         'POP': 1})