In [18]:
import numpy as np
import pandas as pd
import scipy
import json
from collections import Counter
import copy
import random
import sklearn
from tqdm import tqdm

In [2]:
%%time
# 입력데이터 로드
train = pd.read_json('train.json', typ = 'frame')
val = pd.read_json('val.json', typ = 'frame')
genre = pd.read_json('genre_gn_all.json', typ = 'series')
meta = pd.read_json('song_meta.json', typ = 'frame')

CPU times: total: 10.4 s
Wall time: 10.7 s


In [3]:
# binary playlist by track matrix 만들기
sub_train = train.loc[:len(train)//10]
sub_songs = meta.loc[:len(meta)//10]
num_playlist = len(sub_train) # 11508
num_songs = len(sub_songs) # 70799

playlist_by_track = np.zeros((num_playlist,num_songs), dtype=int)

for i, row in sub_train.iterrows():
    playlist_id = row["id"]
    songs = row["songs"]
    for songid in songs:
        if songid < num_songs:
            playlist_by_track[i][songid] = 1

playlist_by_track.sort(axis=1)
playlist_by_track = -np.sort(-playlist_by_track)

In [4]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=128)
tsvd = svd.fit_transform(playlist_by_track)

reconstructed_matrix = np.dot(tsvd, svd.components_)
reconstructed_matrix = np.sort(reconstructed_matrix, axis=1)[:, ::-1]

In [5]:
np.set_printoptions(linewidth=np.inf)

k = 1
print("approx:",reconstructed_matrix[k][:10])
print("binary:",playlist_by_track[k][:10])

approx: [1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00 2.30646709e-16 1.97817466e-16 1.95990517e-16 1.83652956e-16 1.13097349e-16]
binary: [1 1 1 1 1 0 0 0 0 0]


In [24]:
count = 0
while True:
    if playlist_by_track[1][count] == 0:
        break
    count += 1

next_values = list(reconstructed_matrix[1][count:])
max_index = next_values.index(max(next_values))

print(max_index, reconstructed_matrix[1][count+max_index])

0 2.3064670859310056e-16


In [4]:
sub_train = train.loc[:len(train)//50]
sub_meta = meta.loc[:len(meta)//50]
sub_val = val.loc[:len(val)]

playlist_by_track_ones = np.ones((len(sub_train),len(sub_meta)), dtype=int)
playlist_by_track_zeros = np.zeros((len(sub_val),len(sub_meta)), dtype=int)
df_masking = np.concatenate([playlist_by_track_ones, playlist_by_track_zeros], axis=0)

playlist_by_track = np.zeros((len(sub_train)+len(sub_val),len(sub_meta)), dtype=int)
for i, row in sub_train.iterrows():
    songs = row["songs"]
    for songid in songs:
        if songid < len(sub_meta):
            playlist_by_track[i][songid] = 1

In [5]:
df = np.multiply(playlist_by_track, df_masking)

In [6]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=128)
tsvd = svd.fit_transform(df)
reconstructed_matrix = np.dot(tsvd, svd.components_)

In [9]:
songs_index = np.argsort(-reconstructed_matrix, axis=1)[:, :100]

In [21]:
songs_index

array([[    0,  9434,  9435, ...,  9475,  9476,  9477],
       [ 6546,  2149,  1727, ...,  3567,  1742, 11266],
       [    0,  9434,  9435, ...,  9475,  9476,  9477],
       ...,
       [    0,  9434,  9435, ...,  9475,  9476,  9477],
       [    0,  9434,  9435, ...,  9475,  9476,  9477],
       [    0,  9434,  9435, ...,  9475,  9476,  9477]], dtype=int64)

In [20]:
for i in range(len(sub_train),len(sub_train)+len(sub_val),1):
    next_songid = songs_index[i][0]
    print(next_songid)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
