In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

import warnings

warnings.filterwarnings(action='ignore')

### project root 에서 util 가져오는 법

In [2]:
import os
import pathlib
import sys

# Name of the repository's top-level directory. The project root is added to
# sys.path so that project modules can be imported from this notebook
# (presumably where `arena_util` lives -- confirm against the repo layout).
PROJECT_DIR_NAME = 'melon-playlist-continuation'

current_path = pathlib.Path().absolute()

# Scan the path components from the filesystem root downwards and stop at the
# first component that matches the project directory name. Using `parts`
# instead of splitting on '/' keeps this working with any path separator.
# If the name never appears, fall back to the current working directory.
project_root = current_path
for depth, part in enumerate(current_path.parts, start=1):
    if part == PROJECT_DIR_NAME:
        project_root = pathlib.Path(*current_path.parts[:depth])
        break

# Keep the trailing separator that the string-based construction produced.
abs_path = os.path.join(str(project_root), '')

# Appending unconditionally would grow sys.path every time the cell is re-run.
if abs_path not in sys.path:
    sys.path.append(abs_path)

### Test baseline code of genre_most_popular.py

In [3]:
from collections import Counter

from arena_util import load_json
from arena_util import write_json
from arena_util import remove_seen
from arena_util import most_popular


class GenreMostPopular:
    """Genre-aware most-popular baseline for playlist continuation.

    For every question playlist the most frequent genre among its songs is
    determined, and the most popular training songs of that genre are
    recommended. Playlists without songs fall back to the globally most
    popular songs. Tags always come from the globally most popular tags.
    """

    # Size of the candidate pools (per genre and global) and of the answers.
    SONG_POOL_SIZE = 200
    TAG_POOL_SIZE = 100
    N_SONGS = 100
    N_TAGS = 10

    def _song_mp_per_genre(self, song_meta, global_mp):
        """Return, for each genre, its songs ordered by global popularity.

        Args:
            song_meta: mapping of song id -> song record; each record must
                have a 'song_gn_gnr_basket' iterable of genre codes.
            global_mp: mapping of song id -> play count; songs missing from
                it are treated as having a count of 0.

        Returns:
            dict mapping genre code -> list of at most SONG_POOL_SIZE song
            ids, most popular first.
        """
        songs_by_genre = {}
        for sid, song in song_meta.items():
            for genre in song['song_gn_gnr_basket']:
                songs_by_genre.setdefault(genre, []).append(sid)

        ranked = {}
        for genre, sids in songs_by_genre.items():
            popularity = Counter({sid: global_mp.get(int(sid), 0) for sid in sids})
            ranked[genre] = [
                sid for sid, _ in popularity.most_common(self.SONG_POOL_SIZE)
            ]

        return ranked

    def _generate_answers(self, song_meta_json, train, questions):
        """Build one answer dict ('id', 'songs', 'tags') per question.

        Args:
            song_meta_json: iterable of song records with 'id' and
                'song_gn_gnr_basket' keys.
            train: training playlists, passed through to `most_popular`.
            questions: iterable of dicts with 'id', 'songs' and 'tags' keys.

        Returns:
            list of dicts with up to N_SONGS songs and N_TAGS tags each,
            excluding songs/tags already present in the question.

        Raises:
            KeyError: if a question song id is absent from the song meta.
        """
        song_meta = {int(song["id"]): song for song in song_meta_json}
        # `most_popular` is used here as returning (count mapping, ordered keys).
        song_mp_counter, song_mp = most_popular(train, "songs", self.SONG_POOL_SIZE)
        _, tag_mp = most_popular(train, "tags", self.TAG_POOL_SIZE)
        song_mp_per_genre = self._song_mp_per_genre(song_meta, song_mp_counter)

        answers = []
        for q in questions:
            genre_counter = Counter(
                genre
                for sid in q["songs"]
                for genre in song_meta[sid]["song_gn_gnr_basket"]
            )
            top_genre = genre_counter.most_common(1)

            if top_genre:
                cur_songs = song_mp_per_genre[top_genre[0][0]]
            else:
                # No songs (or no genres) in the question: use global popularity.
                cur_songs = song_mp

            songs = list(remove_seen(q["songs"], cur_songs))

            # A small genre, or one whose popular songs are mostly already in
            # the playlist, can leave fewer than N_SONGS candidates. Top the
            # list up from the global ranking so the answer reaches the
            # intended length whenever enough candidates exist.
            if len(songs) < self.N_SONGS and cur_songs is not song_mp:
                chosen = set(songs)
                songs.extend(
                    sid
                    for sid in remove_seen(q["songs"], song_mp)
                    if sid not in chosen
                )

            answers.append({
                "id": q["id"],
                "songs": songs[:self.N_SONGS],
                "tags": remove_seen(q["tags"], tag_mp)[:self.N_TAGS]
            })

        return answers

    def run(self, song_meta_fname, train_fname, question_fname):
        """Load the three JSON inputs and return the generated answers.

        Args:
            song_meta_fname: path to the song meta JSON file.
            train_fname: path to the training playlists JSON file.
            question_fname: path to the question playlists JSON file.

        Returns:
            list of answer dicts as produced by `_generate_answers`.
        """
        print("Loading song meta...")
        song_meta_json = load_json(song_meta_fname)

        print("Loading train file...")
        train_data = load_json(train_fname)

        print("Loading question file...")
        questions = load_json(question_fname)

        print("Writing answers...")
        answers = self._generate_answers(song_meta_json, train_data, questions)

        # The answers are returned instead of being written to disk here;
        # the caller decides where to persist them.
        return answers

In [4]:
# Instantiate the genre-based most-popular baseline defined in the cell above.
model = GenreMostPopular()

In [5]:
# genre_most_popular.py run \
#  	--song_meta_fname=res/song_meta.json \
#  	--train_fname=arena_data/orig/train.json \
#  	--question_fname=arena_data/questions/val.json 

In [6]:
# Run the baseline on the validation questions. The paths are relative, so
# they resolve against the notebook's current working directory.
answers = model.run(
    song_meta_fname='../res/song_meta.json',
    train_fname='../arena_data/orig/train.json',
    question_fname='../arena_data/questions/val.json',
)

Loading song meta...
Loading train file...
Loading question file...
Writing answers...


In [8]:
answers[0].keys()

dict_keys(['id', 'songs', 'tags'])

### test result data 저장 or 제출용 데이터 저장

In [11]:
# NOTE(review): `write_json` is also imported from `arena_util` in the baseline
# cell earlier in this notebook. This import rebinds the name to the `util`
# version, so which function a later call uses depends on cell execution order.
from util import (
    get_absolute_path_of_project_directory,
    write_json,
)

In [10]:
# Persist the generated answers under the name 'test'.
# NOTE(review): this cell's execution count (10) precedes the `util` import
# cell (11), so it may have run with `arena_util.write_json` rather than
# `util.write_json` -- confirm which one is intended.
write_json(data=answers, fname='test', )

In [22]:
# The result must have the following format
# sample_result = [
#     {
#         'id': 1,
#         'songs': [123, 456],
#         'tags': ['test1', 'test2'],
#     }
# ]

In [23]:
# For a local evaluation test, pass is_submit=False
# In that case the file is saved under project_dir/local_val/
write_json(data=answers, fname='test_result', is_submit=False)

# For a submission, pass is_submit=True
# In that case the file is saved under project_dir/submit_val/
write_json(data=answers, fname='test_result', is_submit=True)

### split한 데이터 가져오기

In [12]:
# Module-level holders filled in by set_assets(); None until it has run.
train, test, x_test, y_test, genre, meta = None, None, None, None, None, None

def set_assets(
    is_need_song_meta=False,
):
    """Load the split data set into the module-level globals.

    Reads the train/validation split from `<project>/arena_data/` and the
    genre table from `<project>/res/`, where `<project>` is whatever
    `get_absolute_path_of_project_directory()` returns, and prints the shape
    of each loaded frame.

    Args:
        is_need_song_meta: when True, also load `res/song_meta.json` into
            the global `meta`; otherwise `meta` is left untouched.

    Returns:
        None. Results are stored in the globals `train`, `test`, `x_test`,
        `y_test`, `genre` and (optionally) `meta`.
    """
    global train, test, x_test, y_test, genre, meta
    from pathlib import Path

    # Path joins work whether or not the helper's result ends with a
    # separator, avoiding doubled slashes such as 'res//song_meta.json'.
    basepath = Path(get_absolute_path_of_project_directory())
    orig_basepath = basepath / 'res'
    data_basepath = basepath / 'arena_data'

    train = pd.read_json(data_basepath / 'orig' / 'train.json')
    test = pd.read_json(data_basepath / 'orig' / 'val.json')
    x_test = pd.read_json(data_basepath / 'questions' / 'val.json')
    y_test = pd.read_json(data_basepath / 'answers' / 'val.json')

    # The genre file is read as a Series (index -> value) and then flattened
    # into a two-column frame. 'series' is the documented `typ` value for
    # that; the reset_index()/rename below rely on getting a Series.
    genre = pd.read_json(
        orig_basepath / 'genre_gn_all.json',
        typ='series',
    ).reset_index()
    genre.columns = ['code', 'desc']

    print('shape of train', train.shape)
    print('shape of test', test.shape)
    print('shape of x_test', x_test.shape)
    print('shape of y_test', y_test.shape)
    print('shape of genre', genre.shape)

    if is_need_song_meta:
        meta = pd.read_json(orig_basepath / 'song_meta.json')
        print('shape of meta', meta.shape)

In [13]:
# song_meta is large, so loading it can be skipped when it is not needed.
set_assets(is_need_song_meta=False)

/Users/misfits/Documents/workplace/melon-playlist-continuation/
shape of train (92056, 6)
shape of test (23015, 6)
shape of x_test (23015, 6)
shape of y_test (23015, 6)
shape of genre (254, 2)


In [25]:
test.head()

Unnamed: 0,id,like_cnt,plylst_title,songs,tags,updt_date
0,18488,3,요즘 많이듣는 인디 노래,"[674442, 131295, 83652, 352919, 233166, 99741,...","[카페, 인디음악, 드라이브, 인디뮤직, 사랑, 이별, 인디]",2017-07-17 11:00:52.000
1,76254,4,살랑살랑 불어오는 바람 같은 뉴에이지,"[222141, 422934, 4917, 700161, 424495, 683582,...","[살랑살랑, 뉴에이지]",2017-07-20 13:42:37.000
2,86227,41,비오는날 감미롭고 우울한 재즈,"[333034, 638621, 483000, 570730, 442053, 17405...","[비오는날, 밤, 새벽]",2015-07-10 03:18:46.000
3,87450,17,걸크러쉬돋는 여자보컬 락 노래 모음,"[229337, 30825, 475737, 672432, 59091, 98657, ...","[락, 락밴드, 메탈, 락음악]",2017-07-10 21:30:25.000
4,24649,5,퇴근 후 차분한 인디,"[13930, 18100, 105626, 310720, 93295, 557891, ...","[집중, 휴식, 밤, 카페, 새벽, 차분한, 조용한, 인디]",2020-04-09 00:04:15.000


In [34]:
test_id = 18488

In [35]:
test[test.id == test_id].songs.values

array([list([674442, 131295, 83652, 352919, 233166, 99741, 676361, 298030, 368044, 7070, 565913, 398467, 117367, 194427, 670669, 592826])],
      dtype=object)

In [36]:
x_test[x_test.id == test_id].songs.values

array([list([674442, 131295, 233166, 298030, 368044, 7070, 670669, 592826])],
      dtype=object)

In [37]:
y_test[y_test.id == test_id].songs.values

array([list([83652, 352919, 99741, 676361, 565913, 398467, 117367, 194427])],
      dtype=object)

### 카카오에서 제공한 데이터 set하는 method
train을 이용하여 학습할 때는 아래처럼 가져올 필요는 없이 위에서처럼 `split_data`를 이용해서 split한 데이터를 사용하도록 하자.

In [7]:
# Module-level holders filled in by set_original_assets(); None until it has run.
df_tr, df_val, df_te, df_gr, df_sm = None, None, None, None, None

def set_original_assets(
    is_need_song_meta=False,
    basepath='../res/',
):
    """Load the originally provided data files into the module-level globals.

    Reads train/val/test playlists and the genre table from `basepath` and
    prints the shape of each loaded frame.

    Args:
        is_need_song_meta: when True, also load `song_meta.json` into the
            global `df_sm`; otherwise `df_sm` is left untouched.
        basepath: directory holding the data files. The default is relative,
            so it resolves against the current working directory.

    Returns:
        None. Results are stored in the globals `df_tr`, `df_val`, `df_te`,
        `df_gr` and (optionally) `df_sm`.
    """
    global df_tr, df_val, df_te, df_gr, df_sm
    from pathlib import Path

    # Path joins avoid doubled separators such as '../res//song_meta.json'.
    res_dir = Path(basepath)

    df_tr = pd.read_json(res_dir / 'train.json')
    df_val = pd.read_json(res_dir / 'val.json')
    df_te = pd.read_json(res_dir / 'test.json')

    # The genre file is read as a Series (index -> value) and then flattened
    # into a two-column frame. 'series' is the documented `typ` value for
    # that; the reset_index()/rename below rely on getting a Series.
    df_gr = pd.read_json(
        res_dir / 'genre_gn_all.json',
        typ='series',
    ).reset_index()
    df_gr.columns = ['code', 'desc']

    print('shape of df_tr', df_tr.shape)
    print('shape of df_val', df_val.shape)
    print('shape of df_te', df_te.shape)
    print('shape of df_gr', df_gr.shape)

    if is_need_song_meta:
        df_sm = pd.read_json(res_dir / 'song_meta.json')
        print('shape of df_song_meta', df_sm.shape)