In [None]:
import json
import os
import re
import warnings
import random

import numpy as np
import scipy as sp
import pandas as pd

from numba import jit
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
pd.options.mode.chained_assignment = None

In [None]:
song_meta = pd.read_json('song_meta.json', typ = 'frame', encoding='utf-8')
train = pd.read_json('train.json', typ = 'frame', encoding='utf-8')

In [12]:
song_meta.head()

Unnamed: 0,album_id,album_name,artist_id_basket,artist_name_basket,id,issue_date,song_gn_dtl_gnr_basket,song_gn_gnr_basket,song_name
0,2255639,불후의 명곡 - 7080 추억의 얄개시대 팝송베스트,[2727],[Various Artists],0,20140512,[GN0901],[GN0900],Feelings
1,376431,"Bach : Partitas Nos. 2, 3 & 4",[29966],[Murray Perahia],1,20080421,"[GN1601, GN1606]",[GN1600],"Bach : Partita No. 4 In D Major, BWV 828 - II...."
2,4698747,Hit,[3361],[Peter Gabriel],2,20180518,[GN0901],[GN0900],Solsbury Hill (Remastered 2002)
3,2644882,Feeling Right (Everything Is Nice) (Feat. Popc...,[838543],[Matoma],3,20151016,"[GN1102, GN1101]",[GN1100],Feeling Right (Everything Is Nice) (Feat. Popc...
4,2008470,그남자 그여자,[560160],[Jude Law],4,20110824,"[GN1802, GN1801]",[GN1800],그남자 그여자


In [129]:
train.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000


## train data 내 playlist에서 song  일부(20%)를 제거하는 train set 생성

In [125]:
# @jit(nopython=True)
def train_labelled(songs_list, tags_list, X_rate):
    # result[:,0]=songs_X
    # result[:,1]=songs_y
    # result[:,2]=tags_X
    # result[:,3]=tags_y
    i = 0
    songs_X, songs_y, tags_X, tags_y = [], [], [], []
    for songs, tags in zip(songs_list, tags_list):        
        if i > 9999 and i % 10000 == 0:
            print(i, "th completed", sep="")
        n_songs = len(songs)
        n_tags = len(tags)
        
        if n_songs + n_tags == 0:
            continue
        
        n_songs_Xi = round(X_rate*n_songs)
        n_tags_Xi = round(X_rate*n_tags)
        songs_index = random.shuffle(list(range(n_songs)))
        tags_index = random.shuffle(list(range(n_tags)))
        songs_Xi = songs[:n_songs_Xi]
        songs_yi = songs[n_songs_Xi:]
        tags_Xi = tags[:n_tags_Xi]
        tags_yi = tags[n_tags_Xi:]
        
        if not len(songs_X):
            songs_X, songs_y, tags_X, tags_y = [songs_Xi], [songs_yi], [tags_Xi], [tags_yi]
        else:
            songs_X += [songs_Xi]
            songs_y += [songs_yi]
            tags_X += [tags_Xi]
            tags_y += [tags_yi]
        
        i += 1
    return songs_X, songs_y, tags_X, tags_y

In [126]:
random.seed(1000)
new_train = train_labelled(list(train.songs), list(train.tags), 0.8)

10000th completed
20000th completed
30000th completed
40000th completed
50000th completed
60000th completed
70000th completed
80000th completed
90000th completed
100000th completed
110000th completed


In [124]:
print(new_train[3])

[[], [], [], ['겨울왕국', '크리스마스송'], [], ['트렌드', '일렉'], ['이별'], ['회상'], ['인디'], ['락']]


In [131]:
train["songs_X"] = pd.Series(new_train[0])
train["songs_y"] = pd.Series(new_train[1])
train["tags_X"] = pd.Series(new_train[2])
train["tags_y"] = pd.Series(new_train[3])

In [132]:
train.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,songs_X,songs_y,tags_X,tags_y
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000,"[525514, 129701, 383374, 562083, 297861, 13954...","[72552, 223955, 324992, 50104]",[락],[]
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000,"[432406, 675945, 497066, 120377, 389529, 24427...","[181101, 472144, 414721, 75801, 315216, 192882...","[추억, 회상]",[]
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000,"[83116, 276692, 166267, 186301, 354465, 256598...","[35784, 153029, 336743, 203558, 348801, 454550]","[까페, 잔잔한]",[]
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000,"[394031, 195524, 540149, 287984, 440773, 10033...","[457519, 453762, 349398, 631142, 406082, 54838...","[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스]","[겨울왕국, 크리스마스송]"
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000,"[159327, 553610, 5130, 645103, 294435, 100657,...","[583375, 181670, 314344, 617473, 33244, 635753...",[댄스],[]


In [136]:
train.to_json("train_labelled.json", orient='records', force_ascii=False)

## 노래들 간 co-occurrence를 이용하여 누락된 노래 100개 예측

In [148]:
# co-occurrence matrix for songs
num_songs = song_meta.shape[0]
def cooccur(song_lists):
    import scipy as sp
    i = 0
    result = sp.sparse.dok_matrix((num_songs,num_songs), dtype='int32')
    for songs in song_lists:
        if i > 9999 and i % 10000 == 0:
            print(i, "th completed", sep="")
        i = 0
        for song in songs:
            for co_occured_song in songs[(i+1):]:
                result[song, co_occured_song] += 1
                result[co_occured_song, song] += 1
            i += 1
    return result

In [55]:
# train.songs 이용 2시간 걸림
song_co_occurrence = cooccur(train.songs)

In [7]:
# song_co_occurrence_coo = song_co_occurrence.tocoo()
# sp.sparse.save_npz("train_song_co_occurrence.npz", song_co_occurrence_coo)

In [None]:
# train.songs_X 이용 
song_co_occurrence_reduced = cooccur(train.songs_X)

In [None]:
song_co_occurrence_reduced_coo = song_co_occurrence_reduced.tocoo()
sp.sparse.save_npz("train_song_co_occurrence_reduced.npz", song_co_occurrence_reduced_coo)

In [3]:
song_co_occurrence_csr = sp.sparse.load_npz("train_song_co_occurrence.npz").tocsr()

In [4]:
song_co_occurrence_csr

<707989x707989 sparse matrix of type '<class 'numpy.intc'>'
	with 278497102 stored elements in Compressed Sparse Row format>

- 특정 song이 포함될 conditional probability를 계산하기 위한 song 별 총 등장횟수 계산

In [5]:
train_song_count = {song:0 for song in range(len(song_meta))}

In [6]:
for songs in train.songs:
    for song in songs:
        train_song_count[song] += 1

In [7]:
train_song_count = np.array([train_song_count[i] for i in range(len(song_meta))])

In [178]:
print(train_song_count)

[ 2  0  0 ... 11  1  1]


## 노래들과 태그의 co-occurrence를 이용하여 태그 10개 예측

In [8]:
# 태그 unique 값 추출
tags_dict = {tag:i for i, tag in enumerate(train.tags.explode().unique())}
index_tags = {index:tag for tag, index in tags_dict.items()}

In [364]:
# co-occurrence matrix for tags with respect to songs
def cooccur_tag(song_lists, tag_lists, n_songs, n_tags):
    import scipy as sp
    result = sp.sparse.dok_matrix((n_songs, n_tags), dtype='int16')
    i = 0
    for songs, tags in zip(song_lists, tag_lists):
        if i > 9999 and not i % 10000:
            print(i, "th list completed", sep="")
        songs_in_list = len(songs)
        for song in songs:
            for tag in tags:
                result[song, tags_dict[tag]] += 1
        i += 1
    return result

In [11]:
train.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000


In [365]:
tag_co_occurrence = cooccur_tag(train.songs, train.tags, n_songs=num_songs, n_tags=len(tags_dict))
tag_co_occurrence_coo = tag_co_occurrence.tocoo()
# sp.sparse.save_npz('train_tag_co_occurrence_coo.npz', tag_co_occurrence_coo)

10000th list completed
20000th list completed
30000th list completed
40000th list completed
50000th list completed
60000th list completed
70000th list completed
80000th list completed
90000th list completed
100000th list completed
110000th list completed


In [9]:
tag_co_occurrence_coo = sp.sparse.load_npz("train_tag_co_occurrence_coo.npz")
tag_co_occurrence_csr = tag_co_occurrence_coo.tocsr()

- 특정 tag가 포함될 conditional probability를 계산하기 위한 tag 별 총 등장횟수 계산

In [10]:
train_tag_index_count = tag_co_occurrence_csr.sum(axis=0)

In [11]:
train_tag_index_count = {index:0 for index in range(len(tags_dict))}
train_tag_index_count = np.array([train_tag_index_count[i] for i in range(len(tags_dict))])

In [12]:
for tags in train.tags:
    for tag in tags:
        train_tag_index_count[tags_dict[tag]] += 1

In [13]:
train_tag_index_count

array([4007, 6520, 4579, ...,    1,    1,    1])

In [14]:
tag_co_occurrence_csr.sum(axis=0)

matrix([[124181, 370200, 256209, ...,     23,     29,      6]],
       dtype=int32)

## association analysis

In [17]:
# song 별로 support, confidence, lift 계산
nonzero_index = song_co_occurrence_csr.nonzero()

In [21]:
nonzero_index

(array([     0,      0,      0, ..., 707988, 707988, 707988], dtype=int32),
 array([  1373,   7993,  13615, ..., 670071, 673281, 677431], dtype=int32))

In [22]:
print(nonzero_index[0].shape, nonzero_index[1].shape)

(278497102,) (278497102,)


# validation 데이터 예측

In [12]:
val = pd.read_json('val.json', typ = 'frame', encoding='utf-8')

In [14]:
print(val.shape)
val.head()

(23015, 6)


Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[],118598,,"[373313, 151080, 275346, 696876, 165237, 52593...",1675,2019-05-27 14:14:33.000
1,[],131447,앨리스테이블,[],1,2014-07-16 15:24:24.000
2,[],51464,,"[529437, 516103, 360067, 705713, 226062, 37089...",62,2008-06-21 23:26:22.000
3,[],45144,,"[589668, 21711, 570151, 320043, 13930, 599327,...",20,2017-10-30 18:15:43.000
4,[],79929,,"[672718, 121924, 102694, 683657, 201558, 38511...",20,2017-02-07 11:40:42.000


In [387]:
sum([1 if n_songs==0 else 0 for n_songs in list(map(lambda x: len(x), val.songs))])

4379

#### validation 데이터에서  songs가 비어 있는 playlist는 4379개 (전체 playlist는 23015개)

### validation song 예측

In [13]:
print("다른 song과 함께 포함된 횟수 :", np.array(song_co_occurrence_csr.sum(axis=1))[:10,0])
print("song 별 총 등장 횟수", list(map(lambda x : train_song_count[x], range(10)) ))

다른 song과 함께 포함된 횟수 : [111   0   0 824 241 311  27  25   0 228]
song 별 총 등장 횟수 [2, 0, 0, 8, 2, 5, 1, 1, 0, 2]


In [14]:
# 비어 있는 playlist에 대해서는 등장횟수 상위 100song으로 예측
train_songs_co_occurrence_rank = np.array(song_co_occurrence_csr.sum(axis=1))[:,0].argsort()[::-1]
train_songs_co_occurrence_rank
np.array(song_co_occurrence_csr.sum(axis=1))[train_songs_co_occurrence_rank]

array([[168958],
       [153941],
       [148093],
       ...,
       [     0],
       [     0],
       [     0]])

In [16]:
np.array(song_co_occurrence_csr[:10000, :].max(axis=0).todense())[0,:]

array([1, 0, 0, ..., 1, 1, 0], dtype=int32)

In [41]:
warnings.filterwarnings(action='ignore')
song_val_pred = []
for i, songs in enumerate(val.songs):
    if i > 0 and i % 1000 == 0:
        print(i, "th completed", sep="")
    
    if not songs:
        song_val_pred += [train_songs_co_occurrence_rank[:100].tolist()]
        continue
    
    song_i_pred = []
    song_i_candidate = np.array(song_co_occurrence_csr[songs,:].sum(axis=0))[0,:]
    song_i_candidate = song_i_candidate / train_song_count
    song_i_candidate[np.isnan(song_i_candidate)] = 0
    song_i_candidate = song_i_candidate.argsort()[::-1]
    
    count = 0
    for song in song_i_candidate:
        if not song in songs:
            song_i_pred += [song]
            count += 1
            if count == 100: break
    if count < 100:
        for song in train_songs_co_occurrence_rank:
            if not song in songs:
                song_i_pred += [song]
                count += 1
                if count == 100: break
    song_val_pred += [song_i_pred]
warnings.filterwarnings(action='default')

1000th completed
2000th completed
3000th completed
4000th completed
5000th completed
6000th completed
7000th completed
8000th completed
9000th completed
10000th completed
11000th completed
12000th completed
13000th completed
14000th completed
15000th completed
16000th completed
17000th completed
18000th completed
19000th completed
20000th completed
21000th completed
22000th completed
23000th completed


In [35]:
print(train_songs_co_occurrence_rank[:100].tolist())

[116573, 144663, 366786, 357367, 675115, 654757, 133143, 549178, 610933, 13281, 339802, 11657, 701557, 650494, 625875, 449244, 627363, 169984, 461341, 26083, 643628, 339124, 174749, 348200, 300087, 505036, 88503, 132994, 37748, 663256, 645489, 117595, 407828, 295250, 302646, 581799, 253755, 463173, 493762, 672550, 173943, 473514, 140867, 377243, 520093, 446812, 27469, 209135, 543820, 663905, 648628, 152422, 13198, 497066, 680366, 531820, 427724, 224921, 146989, 350309, 6546, 580074, 42155, 464051, 237407, 232874, 215411, 349492, 75842, 246531, 523521, 333595, 485155, 118827, 424813, 246984, 67655, 396828, 187047, 351342, 152475, 629738, 443914, 590012, 579592, 459256, 413422, 418935, 628232, 236393, 586653, 601037, 422915, 235773, 668128, 422077, 210647, 547967, 95323, 374617]


In [40]:
print(song_val_pred[-3])

[284643, 473930, 274756, 264383, 542902, 584523, 606705, 93430, 562095, 349649, 397971, 555034, 276528, 541703, 646789, 630396, 358437, 327325, 222053, 518412, 494972, 575616, 354337, 565854, 449662, 38204, 702005, 65429, 590022, 159126, 316272, 604146, 94003, 53924, 252691, 220203, 91770, 283859, 70986, 452728, 216691, 200903, 310390, 313579, 365400, 339971, 338828, 557293, 51181, 420030, 314021, 11073, 133525, 677838, 84700, 224152, 262999, 522802, 621221, 335192, 545403, 65734, 481821, 130261, 697375, 405510, 161493, 188795, 565840, 210294, 123757, 631758, 693457, 229837, 116631, 632876, 479294, 427760, 540231, 288157, 269847, 29129, 388329, 617048, 650134, 396719, 586058, 13320, 607930, 88816, 655427, 590252, 376181, 320519, 309802, 669664, 114633, 647535, 535959, 224602]


### validation tag 예측

In [87]:
tag_co_occurrence_csr

<707989x29160 sparse matrix of type '<class 'numpy.int16'>'
	with 10984669 stored elements in Compressed Sparse Row format>

In [43]:
train_tags_co_occurrence_rank = list(map(lambda x: index_tags[x], np.array(tag_co_occurrence_csr.sum(axis=0))[0,:].argsort()[::-1]))
print(train_tags_co_occurrence_rank[:10])
np.array(tag_co_occurrence_csr.sum(axis=0))[0,:][np.array(tag_co_occurrence_csr.sum(axis=0))[0,:].argsort()[::-1]]

['기분전환', '감성', '드라이브', '발라드', '휴식', '잔잔한', '힐링', '사랑', '새벽', '추억']


array([824433, 582545, 553464, ...,      1,      1,      1], dtype=int32)

In [100]:
print(index_tags)

{0: '락', 1: '추억', 2: '회상', 3: '까페', 4: '잔잔한', 5: '연말', 6: '눈오는날', 7: '캐럴', 8: '분위기', 9: '따듯한', 10: '크리스마스캐럴', 11: '겨울노래', 12: '크리스마스', 13: '겨울왕국', 14: '크리스마스송', 15: '댄스', 16: '운동', 17: '드라이브', 18: 'Pop', 19: '트로피컬하우스', 20: '힐링', 21: '기분전환', 22: '2017', 23: '팝', 24: '트렌드', 25: '일렉', 26: '짝사랑', 27: '취향저격', 28: '슬픔', 29: '고백', 30: '사랑', 31: '이별', 32: '일렉트로니카', 33: '포크', 34: '메탈', 35: '인디', 36: '록', 37: 'Metal', 38: '이일우', 39: 'M에센셜', 40: 'Rock', 41: 'kpop', 42: '걸그룹댄스', 43: '스트레스해소', 44: '새해', 45: '여행', 46: '프로필음악', 47: '카카오톡', 48: '소원', 49: '프로필', 50: '소망', 51: '다짐', 52: '카톡', 53: '듣고', 54: '우울', 55: '이거', 56: '힘내', 57: '힙합', 58: '느낌있는', 59: '밤', 60: '새벽', 61: 'RnB', 62: '감각적인', 63: '국내', 64: '그루브한', 65: '가을', 66: '재즈', 67: '감성', 68: '질리지않는', 69: '나만알고싶은', 70: '봄', 71: '설렘', 72: '비오는날', 73: '누군가생각날때', 74: '스밍', 75: '목록', 76: '폐막식', 77: '올림픽', 78: '엑소', 79: '조용히', 80: '혼자', 81: '또는', 82: '새벽감성', 83: '고민', 84: '맥주한잔', 85: '카페', 86: 'OST', 87: '어쿠스틱', 88: '편안한', 89: '에너지', 90: '듀엣', 91: '달달

In [44]:
warnings.filterwarnings(action='ignore')
tag_val_pred = []
tag_top10 = [train_tags_co_occurrence_rank[:10]]
for i, songs in enumerate(val.songs):
    if i > 0 and i % 1000 == 0:
        print(i, "th completed", sep="")
    
    if not songs:
        tag_val_pred += tag_top10
        continue
    
    tag_i_pred = []
    tag_i_candidate = np.array(tag_co_occurrence_csr[songs,:].sum(axis=0))[0,:]
    tag_i_candidate = tag_i_candidate / train_tag_index_count
    tag_i_candidate[np.isnan(tag_i_candidate)] = 0
    tag_i_candidate = tag_i_candidate.argsort()[::-1]
    
    tags = val.tags[i]
    count = 0
    for tag_index in tag_i_candidate:
        tag = index_tags[tag_index]
        if not tag in tags:
            tag_i_pred += [tag]
            count += 1
            if count == 10: break
    

    tag_val_pred += [tag_i_pred]
warnings.filterwarnings(action='default')

1000th completed
2000th completed
3000th completed
4000th completed
5000th completed
6000th completed
7000th completed
8000th completed
9000th completed
10000th completed
11000th completed
12000th completed
13000th completed
14000th completed
15000th completed
16000th completed
17000th completed
18000th completed
19000th completed
20000th completed
21000th completed
22000th completed
23000th completed


In [124]:
tag_top10[0]

['기분전환', '감성', '드라이브', '발라드', '휴식', '잔잔한', '힐링', '사랑', '새벽', '추억']

In [45]:
tag_val_pred[0]

['가족영화',
 '영원한감동',
 '디즈니애니메이션ost',
 '메리포핀스',
 '스칼렛요한슨',
 '연주노래',
 '정글북',
 '도리를찾아서',
 '인크레더블',
 '라푼젤']

In [65]:
val.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[],118598,,"[373313, 151080, 275346, 696876, 165237, 52593...",1675,2019-05-27 14:14:33.000
1,[],131447,앨리스테이블,[],1,2014-07-16 15:24:24.000
2,[],51464,,"[529437, 516103, 360067, 705713, 226062, 37089...",62,2008-06-21 23:26:22.000
3,[],45144,,"[589668, 21711, 570151, 320043, 13930, 599327,...",20,2017-10-30 18:15:43.000
4,[],79929,,"[672718, 121924, 102694, 683657, 201558, 38511...",20,2017-02-07 11:40:42.000


In [62]:
song_meta.iloc[val.songs[0], :]

Unnamed: 0,song_gn_dtl_gnr_basket,issue_date,album_name,album_id,artist_id_basket,song_name,song_gn_gnr_basket,artist_name_basket,id
373313,"[GN2207, GN1501, GN1506, GN1509]",20150101,Walt Disney Records The Legacy Collection: Pin...,2308104,[353277],I&#39;ve Got No Strings,"[GN1500, GN2200]",[Dickie Jones],373313
151080,"[GN2207, GN1509, GN1501, GN1506]",20150312,겨울왕국 열기 OST (Making Today A Perfect Day),2308739,"[232538, 746208, 2758138]",Making Today A Perfect Day,"[GN1500, GN2200]","[Idina Menzel, Kristen Bell, Cast of Frozen Fe...",151080
275346,"[GN1301, GN1302]",20150814,Friend Like Me,2334904,[181769],Friend Like Me (From &#34;Aladdin&#34;),[GN1300],[Ne-Yo],275346
696876,"[GN1301, GN1302]",20151006,Ev&#39;rybody Wants To Be A Cat (From &#34;The...,2643514,[871816],Ev’rybody Wants To Be A Cat (From &#34;The Ari...,[GN1300],[Charles Perry],696876
165237,"[GN2207, GN1501, GN0901, GN1509, GN1506]",20151030,We Love Disney (Deluxe),2647713,[742133],Colors Of The Wind (From &#34;Pocahontas&#34;),"[GN1500, GN0900, GN2200]",[Tori Kelly],165237
525935,"[GN2207, GN0901, GN1509, GN1501, GN1506]",20160212,영화 주토피아 OST (Zootopia OST),2666420,[10379],Try Everything (From &#34;Zootopia&#34;/Soundt...,"[GN1500, GN0900, GN2200]",[Shakira],525935
457812,"[GN2207, GN0901, GN1509, GN1501, GN1506]",20171110,Coco (Original Motion Picture Soundtrack),10105939,[553325],Remember Me (D&#250;o) (From &#34;Coco&#34;/So...,"[GN1500, GN0900, GN2200]",[Miguel],457812
371709,"[GN2207, GN1509, GN1501, GN1506]",20171103,Olaf&#39;s Frozen Adventure (Original Soundtrack),10108314,"[746208, 232538, 746216, 407770]",When We&#39;re Together (From &#34;Olaf&#39;s ...,"[GN1500, GN2200]","[Kristen Bell, Idina Menzel, Josh Gad, Jonatha...",371709
170292,"[GN1503, GN1501, GN0908, GN0901]",20190522,Aladdin (Original Motion Picture Soundtrack),10288448,[27242],Arabian Nights (2019) (From &#34;Aladdin&#34;/...,"[GN1500, GN0900]",[Will Smith],170292
438915,"[GN1503, GN1501, GN1509]",20000513,쿠스코? 쿠스코! OST,41158,[28192],My Funny Friend And Me,[GN1500],[Sting],438915


In [54]:
print(list(song_meta.song_name[song_val_pred[0]]))

['The Incredibles (인크레더블) - Road Trip / Missile Lock / The Glory Days', 'Wall-E (월-E)', 'Into The Open Air (From &#34;Brave&#34;/Soundtrack)', 'No Way Out', '알라딘 (Aladdin) - Arabian Nights', 'The Nick of Time (From &#34;Zootopia&#34;/Score)', 'Woodland Symphony / Once Upon a Dream', 'Mickey Mouse Club Book Song', '헤라클레스 (Hercules) - Go The Distance', 'First Flight (From “Big Hero 6”/Score)', 'The Siamese Cat Song/What&#39;s Going on Down There', 'Perfect World', 'Real Gone (From &#34;Cars&#34;/Soundtrack Version)', 'The Time Of Your Life (From &#34;A Bug&#39;s Life&#34;/Score)', 'Collision of Worlds (From &#34;Cars 2&#34;/Soundtrack Version)', 'The Blue Fairy', 'If I Didn&#39;t Have You (From &#34;Monsters, Inc.&#34;/Soundtrack Version)', 'Toy Story (토이스토리) - You`ve Got A Friend In Me', 'Finding Nemo (니모를 찾아서) - Finding Nemo / Nemo Egg', 'My Own Home', 'We Can Still Stop Her (From &#34;Inside Out&#34;/Score)', 'Whistle While You Work', 'Pink Elephants on Parade (From &#34;Dumbo&#34; / 

In [66]:
val_result = [{'id':val.id[i], 'songs':song_val_pred[i], 'tags':tag_val_pred[i]} for i in range(val.shape[0])]

In [143]:
len(val_result)

23015

In [67]:
val_result = re.sub("\'", '\"', str(val_result))

In [68]:
with open('results.json', 'w', encoding='utf-8') as f:
    f.write(str(val_result))