In [1]:
import json
import os
import re
import warnings
import random

import numpy as np
import scipy as sp
import pandas as pd

from numba import jit
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Disable pandas' chained-assignment check for the rest of the session.
pd.set_option("mode.chained_assignment", None)

In [2]:
# Song metadata table; its length is used below as the total number of songs.
song_meta = pd.read_json("song_meta.json", encoding="utf-8", typ="frame")
# (disabled) train = pd.read_json('train.json', typ = 'frame', encoding='utf-8')

# train data로 songs_y, tags_y vs support, confidence, lift 모델 학습

- train.songs_X와 train.tags_X의 길이가 0인 경우는 없음

In [None]:
# Labelled training playlists (read as CP949) and the validation playlists (UTF-8).
train = pd.read_json("train_labelled.json", encoding="CP949", typ="frame")
val = pd.read_json("val.json", encoding="utf-8", typ="frame")

# Pre-computed co-occurrence matrices, converted to CSR for row slicing.
song_co_occurrence_csr, tag_co_occurrence_csr = (
    sp.sparse.load_npz(npz_path).tocsr()
    for npz_path in ("all_song_co_occurrence_coo.npz", "all_tag_co_occurrence_coo.npz")
)

In [None]:
# Preview the first five training playlists.
train.head(n=5)

In [None]:
# One zero-initialised counter per song id 0 .. len(song_meta) - 1.
train_song_count = dict.fromkeys(range(len(song_meta)), 0)

In [None]:
# Add, for every song, the number of times it appears across the training playlists.
# Note: running this cell twice without re-initialising the counters doubles the counts.
for song, n_occurrences in Counter(song for songs in train.songs for song in songs).items():
    train_song_count[song] += n_occurrences

In [None]:
# Turn the per-song counters into an array ordered by song id (position == song id).
train_song_count = np.array(list(map(train_song_count.__getitem__, range(len(song_meta)))))

In [None]:
# Pre-computed confidence / lift matrices for songs and for tags, converted to CSR.
train_confidence_csr, train_lift_csr, tag_train_confidence_csr, tag_train_lift_csr = (
    sp.sparse.load_npz(npz_path).tocsr()
    for npz_path in (
        "all_confidence_coo.npz",
        "all_lift_coo.npz",
        "all_tag_confidence_coo.npz",
        "all_tag_lift_coo.npz",
    )
)

In [None]:
# Number of training playlists (rows of `train`).
n_train = len(train)

In [None]:
# Support = co-occurrence count / number of training playlists.
# The counts are cast to float64 rather than float16: float16 cannot represent
# integers above 2048 exactly and overflows to inf above 65504, which would
# corrupt the support of frequently co-occurring songs.
train_support_csr = song_co_occurrence_csr.astype(np.float64) / n_train

In [None]:
# Tag support = tag co-occurrence count / number of training playlists.
# The counts are cast to float64 rather than float16: float16 cannot represent
# integers above 2048 exactly and overflows to inf above 65504, which would
# corrupt the support of frequently co-occurring tags.
tag_train_support_csr = tag_co_occurrence_csr.astype(np.float64) / n_train

In [None]:
# Replace infinite song-lift values with the largest non-infinite stored lift.
# The stored entries are edited in place through `.data`, which avoids reading
# and re-assigning every non-zero through sparse fancy indexing.
lift_values = train_lift_csr.data
lift_is_inf = np.isinf(lift_values)
if lift_is_inf.any():
    # Raises ValueError if every stored value is infinite (no finite maximum exists).
    lift_values[lift_is_inf] = lift_values[np.logical_not(lift_is_inf)].max()
del lift_values, lift_is_inf

In [None]:
# Replace infinite tag-lift values with the largest non-infinite stored lift.
# The stored entries are edited in place through `.data`, which avoids reading
# and re-assigning every non-zero through sparse fancy indexing.
lift_values = tag_train_lift_csr.data
lift_is_inf = np.isinf(lift_values)
if lift_is_inf.any():
    # Raises ValueError if every stored value is infinite (no finite maximum exists).
    lift_values[lift_is_inf] = lift_values[np.logical_not(lift_is_inf)].max()
del lift_values, lift_is_inf

In [None]:
# tag -> integer index, numbered in order of first appearance in train.tags.
tags_dict = {}
for tag in train.tags.explode().unique():
    tags_dict[tag] = len(tags_dict)

# Reverse lookup: integer index -> tag.
index_tags = dict(zip(tags_dict.values(), tags_dict.keys()))

In [115]:
# A quarter of the training set size (used to pick the slice bounds for feature building).
0.25 * n_train

28767.75

In [None]:
# Build a balanced positive/negative feature sample from a slice of the training playlists.
#
# For every playlist i in [SAMPLE_START, SAMPLE_STOP):
#   * the held-out items (songs_y / tags_y) are the positives ("in");
#   * the same number of negatives ("not_in") is drawn at random from the items
#     that occur in neither the visible (X) nor the held-out (y) part;
#   * each candidate's support / confidence / lift feature is the sum of the
#     corresponding matrix rows of the playlist's visible items, read at the
#     candidate's column.
# The *_support / *_confidence / *_lift lists receive one array per playlist,
# while songs_in / tags_in receive one label per candidate.
SAMPLE_START, SAMPLE_STOP = 75000, 95000  # playlist index range handled by this run


def _candidate_scores(matrix, row_index, col_index):
    """Sum the rows `row_index` of sparse `matrix` and return the entries at `col_index`."""
    return np.array(matrix[row_index, :].sum(axis=0))[0, col_index]


songs_support, songs_confidence, songs_lift, tags_support, tags_confidence, tags_lift = [], [], [], [], [], []
songs_in, tags_in = [], []
all_songs = train.songs_X.explode().unique()
all_tags = train.tags_X.explode().unique()
random.seed(2040)  # fixed seed so the negative sample is reproducible

# Warnings are silenced only inside this block; the previous filter state is
# restored on exit instead of being overwritten with a global 'default' filter.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for i in range(SAMPLE_START, SAMPLE_STOP):
        if i > 999 and i % 1000 == 0:
            print(i, "th completed", sep="")
        songs_X = train.songs_X[i]
        songs_y = np.array(train.songs_y[i])
        tags_X = train.tags_X[i]
        tags_y = np.array(train.tags_y[i])
        n_songs_y = len(songs_y)
        n_tags_y = len(tags_y)

        if n_songs_y > 0:
            songs = np.append(songs_X, songs_y)
            # Positives first, then an equal number of songs absent from the playlist.
            songs_index = list(songs_y)
            songs_index += random.choices(list(all_songs[np.logical_not(np.isin(all_songs, songs))]), k=n_songs_y)

            songs_in += ["in"]*n_songs_y + ["not_in"]*n_songs_y

            songs_support += [_candidate_scores(train_support_csr, songs_X, songs_index)]
            songs_confidence += [_candidate_scores(train_confidence_csr, songs_X, songs_index)]
            songs_lift += [_candidate_scores(train_lift_csr, songs_X, songs_index)]

        if n_tags_y > 0:
            tags = np.append(tags_X, tags_y)
            # Positives first, then an equal number of tags absent from the playlist.
            tags_index = list(tags_y)
            tags_index += random.choices(list(all_tags[np.logical_not(np.isin(all_tags, tags))]), k=n_tags_y)
            # The tag matrices are addressed by integer index, not by tag string.
            tags_index = [tags_dict[tag] for tag in tags_index]
            tags_X_index = [tags_dict[tag] for tag in tags_X]

            tags_in += ["in"]*n_tags_y + ["not_in"]*n_tags_y

            # Each feature reads its own matrix (confidence and lift previously
            # reused the support matrix, making all three tag features identical).
            tags_support += [_candidate_scores(tag_train_support_csr, tags_X_index, tags_index)]
            tags_confidence += [_candidate_scores(tag_train_confidence_csr, tags_X_index, tags_index)]
            tags_lift += [_candidate_scores(tag_train_lift_csr, tags_X_index, tags_index)]

In [103]:
# Peek at the feature arrays of the first ten playlists, one tuple entry per feature list.
tuple(
    per_playlist[:10]
    for per_playlist in (songs_support, songs_confidence, songs_lift,
                         tags_support, tags_confidence, tags_lift)
)

([array([0.00013035, 0.00013035, 0.00013035, 0.00013035, 0.        ,
         0.        , 0.        , 0.        ]),
  array([3.21540614e-04, 2.95469753e-04, 1.41651676e-03, 1.30354303e-03,
         1.59032250e-03, 1.54687106e-03, 1.72936709e-03, 1.14711787e-03,
         8.69028687e-06, 0.00000000e+00, 8.69028687e-06, 0.00000000e+00,
         0.00000000e+00, 1.73805737e-05, 0.00000000e+00, 0.00000000e+00]),
  array([2.78089180e-04, 2.43328032e-04, 3.04160040e-04, 4.08443483e-04,
         3.12850327e-04, 2.95469753e-04, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 8.69028687e-06, 0.00000000e+00]),
  array([6.95222949e-04, 5.38797786e-04, 4.95346351e-04, 5.47488073e-04,
         5.64868646e-04, 3.56301762e-04, 1.53818078e-03, 2.10304942e-03,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 2.60708606e-05]),
  array([1.14103467e-02, 3.91931938e-03, 7.57793015e-03, 5.80511163e-03,
  

In [104]:
# For songs, then tags: number of positive ("in") labels and total number of labels.
for labels in (songs_in, tags_in):
    print(sum(np.array(labels) == 'in'), len(labels))

757 1514
73 146


In [None]:
def _flatten_scores(per_playlist_scores):
    """Concatenate per-playlist score arrays into a single flat array (empty input -> empty array)."""
    if len(per_playlist_scores) == 0:
        return np.array([], dtype=float)
    return np.concatenate([np.atleast_1d(scores) for scores in per_playlist_scores])


# The feature lists hold one array per playlist while the labels are flat, so the
# arrays are flattened first to line up one row per candidate.
song_features = pd.DataFrame({"s_support": _flatten_scores(songs_support),
                              "s_confidence": _flatten_scores(songs_confidence),
                              "s_lift": _flatten_scores(songs_lift),
                              "s_in": songs_in})
tag_features = pd.DataFrame({"t_support": _flatten_scores(tags_support),
                             "t_confidence": _flatten_scores(tags_confidence),
                             "t_lift": _flatten_scores(tags_lift),
                             "t_in": tags_in})

# Song and tag candidates differ in number; concatenating side by side aligns on
# the row position and pads the shorter table with missing values.
AA_features = pd.concat([song_features, tag_features], axis=1)[
    ["s_support", "s_confidence", "s_lift",
     "t_support", "t_confidence", "t_lift",
     "s_in", "t_in"]
]
# to_json is a DataFrame method (there is no module-level pd.to_json).
AA_features.to_json("AA_features1.json", orient="records", force_ascii=False)

# validation 데이터 예측

In [None]:
# (Re)load the validation playlists so this section can run on its own.
val = pd.read_json("val.json", encoding="utf-8", typ="frame")

In [None]:
# (rows, columns) of the validation frame, followed by its first five rows.
print((len(val.index), len(val.columns)))
val.head(n=5)

In [None]:
# Number of validation playlists whose song list is empty.
sum(1 for playlist_songs in val.songs if len(playlist_songs) == 0)

#### validation 데이터에서 songs가 비어 있는 playlist는 4379개 (전체 playlist는 23015개)

### validation song 예측

In [None]:
# Compare, for song ids 0-9, the co-occurrence row sums with the per-song occurrence counts.
print("다른 song과 함께 포함된 횟수 :", np.asarray(song_co_occurrence_csr.sum(axis=1))[:10, 0])
print("song 별 총 등장 횟수", [train_song_count[song_id] for song_id in range(10)])

In [None]:
# Column-wise maximum over the first 10000 rows of the song co-occurrence matrix, as a 1-D array.
song_co_occurrence_csr[:10000, :].max(axis=0).toarray()[0, :]

In [None]:
# Predict N_SONG_PRED songs for every validation playlist.
#
# Candidates are scored by the summed co-occurrence with the playlist's songs,
# divided by each candidate's training count. Only candidates with a positive
# score are taken; any remaining slots are filled from the popularity ranking.
# (Previously zero-score candidates always filled the list, so the fallback was
# never reached and short lists were padded with arbitrary songs.)
#
# NOTE(review): train_songs_co_occurrence_rank is not defined in any cell visible
# in this notebook; it is used here as an array of song ids in fallback order —
# confirm it is created before this cell runs.
N_SONG_PRED = 100

song_val_pred = []
# Warnings (e.g. from 0/0 below) are silenced only inside this block; the
# previous filter state is restored on exit.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for i, songs in enumerate(val.songs):
        if i > 0 and i % 1000 == 0:
            print(i, "th completed", sep="")

        if not songs:
            # No songs to condition on: use the head of the fallback ranking.
            song_val_pred += [train_songs_co_occurrence_rank[:N_SONG_PRED].tolist()]
            continue

        song_i_score = np.array(song_co_occurrence_csr[songs,:].sum(axis=0))[0,:]
        song_i_score = song_i_score / train_song_count
        song_i_score[np.isnan(song_i_score)] = 0  # 0/0 -> score 0
        song_i_candidate = song_i_score.argsort()[::-1]

        # Songs that must not be predicted: those already in the playlist plus,
        # as the loop proceeds, those already chosen (prevents duplicates).
        excluded = set(songs)
        song_i_pred = []
        for song in song_i_candidate:
            if len(song_i_pred) == N_SONG_PRED or song_i_score[song] <= 0:
                break  # list is full, or only zero-score candidates remain
            if song not in excluded:
                song_i_pred += [song]
                excluded.add(song)

        if len(song_i_pred) < N_SONG_PRED:
            for song in train_songs_co_occurrence_rank:
                if song not in excluded:
                    song_i_pred += [song]
                    excluded.add(song)
                    if len(song_i_pred) == N_SONG_PRED:
                        break
        song_val_pred += [song_i_pred]

In [None]:
# Show the first 100 entries of the fallback ranking (the list predicted for playlists without songs).
# NOTE(review): train_songs_co_occurrence_rank is not defined in any cell visible in this notebook.
print(train_songs_co_occurrence_rank[:100].tolist())

In [None]:
# Spot-check the song prediction of the third-from-last validation playlist.
print(song_val_pred[-3])

### validation tag 예측

In [None]:
# Display the summary repr of the sparse tag co-occurrence matrix.
tag_co_occurrence_csr

In [None]:
# Show the size of the index -> tag mapping and its first entries instead of
# dumping the whole dictionary into the cell output.
print(len(index_tags), dict(list(index_tags.items())[:10]))

In [None]:
# Predict 10 tags for every validation playlist.
#
# The tag co-occurrence matrix is addressed by tag index (see tags_dict), so its
# rows are selected with the indices of the playlist's own tags. (Previously the
# rows were selected with song ids, which addresses unrelated or out-of-range rows.)
# Playlists without any known tag receive the default top-10 list.
#
# NOTE(review): train_tags_co_occurrence_rank and train_tag_index_count are not
# defined in any cell visible in this notebook — confirm they are created before
# this cell runs. train_tag_index_count is used as a per-tag-index divisor.
N_TAG_PRED = 10

tag_val_pred = []
tag_top10 = [train_tags_co_occurrence_rank[:10]]
# Warnings (e.g. from 0/0 below) are silenced only inside this block; the
# previous filter state is restored on exit.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for i, tags in enumerate(val.tags):
        if i > 0 and i % 1000 == 0:
            print(i, "th completed", sep="")

        # Tags absent from tags_dict have no row in the matrix and are skipped.
        tags_known_index = [tags_dict[tag] for tag in tags if tag in tags_dict]
        if not tags_known_index:
            tag_val_pred += tag_top10
            continue

        tag_i_pred = []
        tag_i_candidate = np.array(tag_co_occurrence_csr[tags_known_index,:].sum(axis=0))[0,:]
        tag_i_candidate = tag_i_candidate / train_tag_index_count
        tag_i_candidate[np.isnan(tag_i_candidate)] = 0  # 0/0 -> score 0
        tag_i_candidate = tag_i_candidate.argsort()[::-1]

        count = 0
        for tag_index in tag_i_candidate:
            tag = index_tags[tag_index]
            if tag not in tags:  # never predict a tag the playlist already has
                tag_i_pred += [tag]
                count += 1
                if count == N_TAG_PRED:
                    break

        tag_val_pred += [tag_i_pred]

In [None]:
# Inspect the default tag list that is used when a playlist cannot be scored.
tag_top10[0]

In [None]:
# Inspect the tag prediction for the first validation playlist.
tag_val_pred[0]

In [None]:
# First five validation playlists, for comparison with the predictions above.
val.head(n=5)

In [None]:
# Metadata rows (selected by position) of the songs in the first validation playlist.
song_meta.iloc[val.songs[0]]

In [None]:
# Names of the songs predicted for the first validation playlist.
print(song_meta["song_name"][song_val_pred[0]].tolist())

In [None]:
# One result record per validation playlist: its id plus the predicted songs and tags.
val_result = []
for i in range(len(val)):
    val_result.append({'id': val.id[i], 'songs': song_val_pred[i], 'tags': tag_val_pred[i]})

In [None]:
# Number of result records (the list holds one record per validation playlist).
len(val_result)

In [None]:
def _json_default(value):
    """Convert NumPy scalars and arrays, which `json` cannot encode, to built-in Python types."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError("Object of type %s is not JSON serializable" % type(value).__name__)


# Serialise the records with the json module instead of swapping quotes in the
# Python repr: the repr breaks on tags containing quotes or apostrophes, and on
# NumPy scalars/arrays whose repr is not valid JSON.
# The isinstance guard keeps the cell safe to re-run once val_result is a string.
if not isinstance(val_result, str):
    val_result = json.dumps(val_result, ensure_ascii=False, default=_json_default)

In [None]:
# Write the prediction payload to results.json as UTF-8 text.
with open('results.json', mode='w', encoding='utf-8') as results_file:
    results_file.write(str(val_result))