In [8]:
%matplotlib inline

import numpy as np
import pandas as pd
import pathlib
import sys
import warnings

warnings.filterwarnings(action='ignore')

abs_path = ''
current_path = pathlib.Path().absolute()

for path_name in str(current_path).split('/'):
    abs_path += path_name + '/'

    if path_name == 'melon-playlist-continuation':
         break

# sys.path.insert(0, abs_path)
sys.path.append(abs_path)
sys.path.append(
    f'/Users/misfits/Documents/workplace/melon-playlist-continuation/'
    f'.venv/lib/python3.7/site-packages'
)

from util import (
    get_absolute_path_of_project_directory,
    write_json,
)

train, test, x_test, y_test, genre, meta = None, None, None, None, None, None

def set_assets(
    is_need_song_meta=False,
):
    import json
    global train, test, x_test, y_test, genre, meta
    
    basepath = get_absolute_path_of_project_directory()
    orig_basepath = f'{basepath}/res/'
    data_basepath = f'{basepath}/arena_data/'
    train = pd.read_json(f'{data_basepath}orig/train.json')
    test = pd.read_json(f'{data_basepath}orig/val.json')
    x_test = pd.read_json(f'{data_basepath}questions/val.json')
    y_test = pd.read_json(f'{data_basepath}answers/val.json')

    genre = pd.read_json(
        f'{orig_basepath}genre_gn_all.json',
        typ='dataframe',
    ).reset_index()
    genre.columns = ['code', 'desc']

    print('shape of train', train.shape)
    print('shape of test', test.shape)
    print('shape of x_test', x_test.shape)
    print('shape of y_test', y_test.shape)
    print('shape of genre', genre.shape)

    if is_need_song_meta:
        meta = pd.read_json(f'{orig_basepath}/song_meta.json')
        print('shape of meta', meta.shape)

In [2]:
from helper import *

In [32]:
set_assets(is_need_song_meta=True)

shape of train (92056, 6)
shape of test (23015, 6)
shape of x_test (23015, 6)
shape of y_test (23015, 6)
shape of genre (254, 2)
shape of meta (707989, 9)


In [4]:
song_emb = load_numpy('./song_lists_emb.npy')

In [5]:
song_emb.shape

(115071, 254)

In [21]:
from gensim.models.word2vec import Word2Vec

In [37]:
def code2desc(df_gr, genre_codes):
    return df_gr[
        df_gr.code.isin(genre_codes)
    ].desc.values

def get_song_meta_by_ids(
    ids,
    df_gr,
    df_meta,
):
    if type(ids) == int:
        ids = [ids]

    cols = [
        'album_name',
        'artist_name_basket',
        'song_gn_dtl_gnr_basket',
        'song_gn_gnr_basket',
        'song_name',
    ]
    df_tmp = df_meta[df_meta.id.isin(ids)][cols]
    song_desc = []
    for album_name, artist, gn_dtl, gn, name in df_tmp.values:
        gn_dtl = code2desc(df_gr, gn_dtl)
        gn = code2desc(df_gr, gn)
        song_desc.append((album_name, artist, gn_dtl, gn, name))

    return pd.DataFrame(song_desc, columns=cols)

In [38]:
get_song_meta_by_ids(0, genre, meta)

Unnamed: 0,album_name,artist_name_basket,song_gn_dtl_gnr_basket,song_gn_gnr_basket,song_name
0,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,[Various Artists],[세부장르전체],[POP],Feelings


In [92]:
real_train = pd.concat([train, test])

In [93]:
real_train['str_songs'] = real_train.songs.apply(
    lambda x: [str(song_id) for song_id in x]
)

In [94]:
w2v = Word2Vec(
    sentences=real_train.str_songs.values,
    min_count=1,
    size=256,
    window=10,
    negative=20,
)

In [95]:
w2v.init_sims(replace=True)

In [96]:
def test_word2vec():
    global w2v, genre, meta

    num_of_songs = len(w2v.wv.index2word)
    index = np.random.choice(num_of_songs)
    target_id = w2v.wv.index2word[index]
    print('num_of_vectorized_songs :', num_of_songs)
    print('target_song_id :', target_id)
    song_ids = [target_id]
    similarities = [0.0]
    sim_ids = w2v.wv.similar_by_word(target_id)
    for sim_id, similarity in sim_ids:
        song_ids.append(sim_id)
        similarities.append(similarity)
    
    df = get_song_meta_by_ids(
        song_ids,
        genre,
        meta,
    )
    df['sim'] = similarities
    return df

In [97]:
test_word2vec()

num_of_vectorized_songs : 615142
target_song_id : 521951


Unnamed: 0,album_name,artist_name_basket,song_gn_dtl_gnr_basket,song_gn_gnr_basket,song_name,sim
0,Money Round Here (Feat. T-Pain),[C-Ride],[세부장르전체],[랩/힙합],Money Round Here (Feat. T-Pain),0.0
1,About You (Streaming Ver.),[Papa Zeus],"[하우스, 세부장르전체, 하우스, 클럽]","[일렉트로니카, EDM]",About You,0.999892
2,All Mirrors,[Angel Olsen],"[세부장르전체, 얼터너티브팝, '10-]",[POP],Chance,0.999886
3,Let Me Show You (Feat. Juelz Santana),[Boxie],[세부장르전체],[랩/힙합],Let Me Show You (Feat. Juelz Santana),0.999875
4,Les Stars Du Classique : Sir Simon Rattle,"[Simon Rattle, Berliner Philharmoniker]","[세부장르전체, 관현악, 협주곡, 교향/관현악]",[클래식],Holst : The Planets Op.32 - IV. Jupiter. The B...,0.999871
5,겨울과 어울리는 따뜻한 클래식 피아노 명곡 모음,[편안한 클래식],"[세부장르전체, 독주곡]",[클래식],Grieg : Lyric Pieces Book II. Op.38 - I. Berce...,0.99987
6,컴 로드 지저스 (Come Lord Jesus),[테렌스],"[세부장르전체, 국내CCM]",[CCM],컴 로드 지저스 (Come Lord Jesus),0.999868
7,사람가,[조선블루스],"[세부장르전체, 크로스오버]",[국악],사람가,0.999867
8,엄마들이 뽑은 우리아기 자장가,[Various Artists],[어린이클래식],[어린이/태교],Schumann : Kinderszenen Op.15 - VII. Traumerei...,0.999867
9,고요한 산사에서 듣는 자연의 소리,[자연명상수련],"[세부장르전체, ASMR/자연, 힐링/명상/요가]",[뮤직테라피],달빛이 비치는 고요한 산사의 밤소리,0.999865


In [98]:
sid2index = {
    int(sid): index
    for index, sid in enumerate(w2v.wv.index2word)
}

In [107]:
x_test.head()

Unnamed: 0,id,like_cnt,plylst_title,songs,tags,updt_date
0,83381,8,가을에 듣고 싶은 7080 추억의 가을이야기,"[479540, 175783, 674453, 689334, 67552, 428915...",[],2019-09-23 14:03:18.000
1,119556,10,"나의 사랑, 나의 동경, 나의 별","[616210, 197074, 699391, 604870, 446812, 53409...","[팬심, 동경]",2018-05-25 14:47:38.000
2,142795,14,◑흑인●힙합&알엔비◐,[],[힙합],2007-10-22 19:57:58.000
3,148304,7,과일처럼 상큼하고 청량한 매력의 노래들,"[584460, 704838, 554637, 145650, 496343, 34560...","[여름, 과즙, 무더위, 더위, 더운여름]",2019-08-06 12:03:36.000
4,69227,14,비오는날 이불속에서 듣기좋은 말랑말랑한 국내 : RnB #2,"[317372, 670645, 385955, 247872, 418568, 72961...","[감성, 밤, 느낌적인, 방울방울]",2018-06-26 03:51:38.000


In [115]:
from arena_util import most_popular

In [120]:
from io import StringIO

In [118]:
song_mp_counter, song_mp = most_popular(
    real_train, 'songs', 100
)

TypeError: string indices must be integers

In [113]:
from tqdm import tqdm
from collections import Counter

result = []
for _id, sids in tqdm(x_test[['id', 'songs']].values):
    tmp = []

    for sid in sids:
        index = sid2index[sid]
        target_id = w2v.wv.index2word[index]
        sim_ids = np.array(
            w2v.wv.similar_by_word(
                target_id, 
                topn=100,
            )
        )
        tmp += list(sim_ids[:, 0])

    if len(sids) == 0:
        res_sids =

    else:
        tmp = [sid for sid in tmp if sid not in sids]
        cnt_of_sids = Counter(tmp)
        res_sids = [
            int(sid)
            for sid, cnt in cnt_of_sids.most_common(100)
        ]

    result.append({
        'id': _id,
        'songs': res_sids,
        'tags': [],
    })

  0%|          | 81/23015 [00:58<4:34:18,  1.39it/s] 


KeyboardInterrupt: 

In [10]:
from word2vec import word2vec