In [None]:
import numpy as np
import pandas as pd
import os
from collections import Counter
from warnings import warn

warn("Unsupported module 'tqdm' is used.")
from tqdm import tqdm


class KNN:
    '''
    k-nearest-neighbour recommender for playlists.

    For every validation playlist the model scores each training playlist
    by a weighted sum of song similarity and tag similarity, takes the most
    similar training playlists as neighbours, and recommends up to 100
    songs and up to 10 tags gathered from those neighbours.

    Rows of `train` and `val` are addressed by position (0 .. n - 1).
    '''

    __version__ = "KNN-1.0"

    # Minimum size of the song-frequency table used by the "idf" similarity.
    # The table grows automatically if a larger song id shows up in train.
    TOTAL_SONGS = 707989

    def __init__(self, k, rho=0.4, alpha=0.5, beta=0.5,
                 sim_songs="cos", sim_tags="cos", sim_normalize=False,
                 train=None, val=None, verbose=True, version_check=True):
        '''
        k : int; initial number of neighbours
        rho : float; 0.4(default) only for idf
        alpha, beta : float; 0.5(default); weights of song / tag similarity
        sim_songs, sim_tags : "cos"(default), "idf", "jaccard"
        sim_normalize : boolean; when sim == "cos" or "idf"
        train, val : mappings with "id", "songs" and "tags" entries
                     (e.g. pandas.DataFrame); both are required
        verbose : boolean; wrap the prediction loop in tqdm
        version_check : boolean; print the model version on construction
        @raises : TypeError if train or val is missing
        '''
        if train is None or val is None:
            raise TypeError("KNN requires both 'train' and 'val' data.")

        self.train_id = train["id"]
        self.train_songs = train["songs"]
        self.train_tags = train["tags"]

        self.val_id = val["id"]
        self.val_songs = val["songs"]
        self.val_tags = val["tags"]

        self.freq_songs = None
        self.freq_tags = None

        self.k = k
        self.rho = rho
        self.alpha = alpha
        self.beta = beta

        self.sim_songs = sim_songs
        self.sim_tags = sim_tags
        self.sim_normalize = sim_normalize

        self.verbose = verbose
        self.__version__ = KNN.__version__

        if version_check:
            print(f"KNN version: {KNN.__version__}")

        if self.sim_songs == "idf":
            self.freq_songs = self._song_frequencies()

        if self.sim_tags == "idf":
            # Tags are strings, so their frequencies live in a Counter
            # instead of an id-indexed array.
            self.freq_tags = Counter()
            for _tags in self.train_tags:
                self.freq_tags.update(set(_tags))

    def _song_frequencies(self):
        '''
        @returns : numpy.ndarray (int64); entry i is the number of train
                   playlists that contain song id i
        '''
        max_id = -1
        for _songs in self.train_songs:
            if len(_songs) > 0:
                max_id = max(max_id, max(_songs))

        freq = np.zeros(max(self.TOTAL_SONGS, int(max_id) + 1), dtype=np.int64)
        for _songs in self.train_songs:
            # Fancy-index increment counts a song once per playlist even if
            # the playlist lists it several times.
            freq[list(_songs)] += 1
        return freq

    def predict(self, start=0, end=None, auto_save=False, auto_save_step=500, auto_save_fname='auto_save'):
        '''
        start, end : range(start, end). if end = None, range(start, end of val)
        auto_save : boolean; False(default)
        auto_save_step : int; 500(default)
        auto_save_fname : string (without extension); 'auto_save'(default)
        @returns : pandas.DataFrame; columns=['id', 'songs', 'tags']
        '''
        # Materialise the columns once so that rows are addressed by position.
        val_ids = list(self.val_id)
        val_songs = list(self.val_songs)
        val_tags = list(self.val_tags)
        train_tags = list(self.train_tags)

        # `end=0` is a valid (empty) range, so only None means "until the end".
        if end is None:
            end = len(val_ids)

        # TODO: Remove unsupported module 'tqdm'.
        indices = range(start, end)
        _range = tqdm(indices) if self.verbose else indices

        pred = []
        all_songs = [set(songs) for songs in self.train_songs]  # list of set
        all_tags = [set(tags) for tags in train_tags]           # list of set
        n_train = len(all_songs)

        for uth in _range:

            playlist_songs = set(val_songs[uth])
            playlist_tags = set(val_tags[uth])

            if not playlist_songs or self.alpha == 0:
                simSongs = np.zeros(n_train)
            else:
                simSongs = np.array(
                    [self._sim(playlist_songs, vplaylist, self.sim_songs, opt="songs") for vplaylist in all_songs],
                    dtype=np.float64)

            if not playlist_tags or self.beta == 0:
                simTags = np.zeros(n_train)
            else:
                simTags = np.array(
                    [self._sim(playlist_tags, vplaylist, self.sim_tags, opt="tags") for vplaylist in all_tags],
                    dtype=np.float64)

            # TODO: normalize simSongs and simTags
            sim_score = (self.alpha * simSongs) + (self.beta * simTags)

            # Sort once; widening the neighbourhood only changes the slice.
            order = sim_score.argsort()
            k = self.k

            while True:
                top = order[-k:]  # top k indices of playlists in train

                # Relevance of a candidate song = sum of the song similarity
                # of every neighbour that contains it.
                song_scores = {}
                neighbour_tags = []
                for vth in top:
                    weight = simSongs[vth]
                    for song in all_songs[vth]:
                        if song not in playlist_songs:
                            song_scores[song] = song_scores.get(song, 0.0) + weight
                    neighbour_tags += [tag for tag in train_tags[vth] if tag not in playlist_tags]

                tags = [tag for tag, _ in Counter(neighbour_tags).most_common(10)]

                enough = len(song_scores) >= 100 and len(tags) >= 10
                # Stop as soon as there are enough candidates, or when every
                # train playlist is already a neighbour (nothing left to add).
                if enough or k >= n_train:
                    break
                k += 100

            norm = sim_score[top].sum()
            if norm == 0:
                norm = 1.0  # all neighbours have zero similarity; avoid 0/0

            ranked = sorted(song_scores.items(), key=lambda item: item[1] / norm, reverse=True)[:100]
            pred_songs = [int(song) for song, _ in ranked]
            pred_tags = tags[:10]

            pred.append({
                "id" : int(val_ids[uth]),
                "songs" : pred_songs,
                "tags" : pred_tags
            })

            if auto_save and ((uth + 1) % auto_save_step == 0):
                self._auto_save(pred, auto_save_fname)

        return pd.DataFrame(pred)

    def _sim(self, u, v, sim, opt):
        '''
        u : set (playlist being predicted)
        v : set (playlist in train data)
        sim : string; "cos", "idf", "jaccard" (kind of similarity)
        opt : string; "songs", "tags"
        @returns : number; similarity of u and v (0 when undefined)
        @raises : ValueError for an unknown sim / opt, or when the idf
                  frequency table was not built
        '''
        common = u & v
        # Shared denominator of the normalized "cos" and "idf" variants.
        denom = (len(u) ** 0.5) * (len(v) ** 0.5)

        if sim == "cos":
            if not self.sim_normalize:
                return len(common)
            return len(common) / denom if denom else 0

        if sim == "idf":
            if opt == "songs":
                table = self.freq_songs
            elif opt == "tags":
                table = self.freq_tags
            else:
                raise ValueError(f"unknown opt: {opt!r}")
            if table is None:
                raise ValueError(f"idf frequencies for {opt} were not initialised")

            if opt == "songs":
                freq = table[list(common)]
            else:
                freq = np.array([table[tag] for tag in common], dtype=np.float64)

            # Rare items (low frequency) contribute close to 1, popular
            # items contribute less; rho controls how fast the weight decays.
            weights = 1 / (((freq - 1) ** self.rho) + 1)  # numpy!
            if not self.sim_normalize:
                return weights.sum()
            return weights.sum() / denom if denom else 0

        if sim == "jaccard":
            union = len(u | v)
            return len(common) / union if union else 0

        raise ValueError(f"unknown similarity: {sim!r}")

    def _auto_save(self, pred, auto_save_fname):
        '''
        Write the predictions made so far to _temp/<auto_save_fname>.json.

        pred : list of dictionaries
        auto_save_fname : string
        '''
        os.makedirs("./_temp", exist_ok=True)
        pd.DataFrame(pred).to_json(f'_temp/{auto_save_fname}.json', orient='records')


if __name__ == "__main__":

    # Step 1: read the training and validation playlists.
    train = pd.read_json("train.json")
    val = pd.read_json("val.json")
    # test = pd.read_json("res/test.json")

    # Step 2: model configuration.
    #   hyperparameters : k, rho, alpha, beta
    #   parameters      : sim_songs, sim_tags, sim_normalize
    k, rho = 100, 0.4
    alpha, beta = 0.6, 0.4
    sim_songs, sim_tags = "idf", "cos"
    sim_normalize = False

    # Step 3: build the model and predict over the whole validation set.
    #   KNN.predict() walks range(start, end) and returns a pandas.DataFrame;
    #   auto_save stays off here.
    pred = KNN(
        k=k,
        rho=rho,
        alpha=alpha,
        beta=beta,
        sim_songs=sim_songs,
        sim_tags=sim_tags,
        sim_normalize=sim_normalize,
        train=train,
        val=val,
        verbose=True,
        version_check=True,
    ).predict(start=0, end=len(val), auto_save=False)
    # print(pred)

    # Step 4: write the predictions; the file name encodes the major version
    # (the text between '-' and '.' in KNN.__version__) and the settings.
    version = KNN.__version__
    major_start = version.find('-') + 1
    major_stop = version.find('.')
    version = version[major_start:major_stop]
    path = "."
    fname = f"knn{version}_k{k}rho{int(rho * 10)}a{int(alpha * 10)}b{int(beta * 10)}_{sim_songs}{sim_tags}{sim_normalize}"
    pred.to_json(f'{path}/{fname}.json', orient='records')

  import sys


KNN version: KNN-1.0


  0%|                                                                                        | 0/23015 [00:00<?, ?it/s]

In [15]:
import pandas as pd
# Load val_song0_predictag.json into a DataFrame, reading the file as UTF-8.
val_song0= pd.read_json("val_song0_predictag.json", encoding = 'utf-8')

In [11]:
# Replace the index with a default integer index, in place; the old index
# is kept as a regular column (drop is left at its default).
val_song0.reset_index(inplace = True)

In [13]:
# Write the DataFrame to val_title.json using pandas' default JSON layout.
val_song0.to_json("val_title.json")

In [17]:
# Bare expression: the notebook renders the DataFrame as the cell output.
val_song0

Unnamed: 0,id,like_cnt,plylst_title,songs,tags,updt_date,lower_tags
1,131447,1,앨리스테이블,[],[],2014-07-16 15:24:24.000,[]
10003,112033,8,제목입으로 먹으면 살찌니까 귀로 먹어요 푸드송,[],[기분전환],2016-11-02 20:27:53.000,"[푸드송, 귀, 기분전환]"
10006,70866,2,랩그것은 인생의 진리지,[],[힙합],2015-10-13 17:02:54.000,"[힙합, 랩, 인생의]"
10012,74889,8,자신이 고음 좀 올라간다 그렇다면 도전해보세요 여자편,[],[],2016-02-05 20:43:41.000,"[자신, 고음, 도전, 좀]"
10019,38058,24,바흐의 피아노 작품집 Vol1,[],[],2015-04-08 23:24:23.000,"[바흐, 피아노, 1]"
1003,31976,25,추억 속 강변가요제 수상곡들 19882001,[],[가요],2017-01-13 16:05:25.000,"[수상곡, 추억, 가요, 강변가요제]"
10033,51678,6138,밤 산책하며 듣는 감성힙합,[],"[밤, 새벽]",2020-04-18 12:27:54.000,"[밤, 산책, 듣는, 새벽, 감성힙합]"
10035,70939,0,탑골공원,[],[틀딱],2019-09-02 12:03:58.000,"[탑골공원, 틀딱]"
10036,110880,9,여름휴가에 데리고 가면 좋을 노래,[],[],2016-07-27 21:56:26.000,"[여름, 노래, 휴가, 가면]"
10038,15308,20,레어블랙뮤직Vol24,[],"[HipHop, soul]",2020-03-05 23:28:46.000,"[soul, hiphop, 뮤직]"
