In [55]:
import json
import os
import re
import warnings
import random

import numpy as np
import scipy as sp
import pandas as pd

from numba import jit
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

In [2]:
pd.options.mode.chained_assignment = None

In [56]:
genre_gn_all = pd.read_json('genre_gn_all.json', typ='series', encoding='utf-8')
# 장르코드 : gnr_code, 장르명 : gnr_name
genre_gn_all = pd.DataFrame(genre_gn_all, columns = ['gnr_name']).reset_index().rename(columns = {'index' : 'gnr_code'})
song_meta = pd.read_json('song_meta.json', typ = 'frame', encoding='utf-8')
train = pd.read_json('train.json', typ = 'frame', encoding='utf-8')

In [None]:
song_meta.head()

In [None]:
train.head()

## train data 내 playlist에서 song  일부(20%)를 제거하는 train set 생성

In [None]:
# @jit(nopython=True)
def train_labelled(songs_list, tags_list, X_rate):
    # result[:,0]=songs_X
    # result[:,1]=songs_y
    # result[:,2]=tags_X
    # result[:,3]=tags_y
    i = 0
    songs_X, songs_y, tags_X, tags_y = [], [], [], []
    for songs, tags in zip(songs_list, tags_list):        
        if i > 9999 and i % 10000 == 0:
            print(i, "th completed", sep="")
        n_songs = len(songs)
        n_tags = len(tags)
        
        if n_songs + n_tags == 0:
            continue
        
        n_songs_Xi = round(X_rate*n_songs)
        n_tags_Xi = round(X_rate*n_tags)
        songs_index = random.shuffle(list(range(n_songs)))
        tags_index = random.shuffle(list(range(n_tags)))
        songs_Xi = songs[:n_songs_Xi]
        songs_yi = songs[n_songs_Xi:]
        tags_Xi = tags[:n_tags_Xi]
        tags_yi = tags[n_tags_Xi:]
        
        if not len(songs_X):
            songs_X, songs_y, tags_X, tags_y = [songs_Xi], [songs_yi], [tags_Xi], [tags_yi]
        else:
            songs_X += [songs_Xi]
            songs_y += [songs_yi]
            tags_X += [tags_Xi]
            tags_y += [tags_yi]
        
        i += 1
    return songs_X, songs_y, tags_X, tags_y

In [None]:
random.seed(1000)
new_train = train_labelled(list(train.songs), list(train.tags), 0.8)

In [None]:
print(new_train[3])

In [None]:
train["songs_X"] = pd.Series(new_train[0])
train["songs_y"] = pd.Series(new_train[1])
train["tags_X"] = pd.Series(new_train[2])
train["tags_y"] = pd.Series(new_train[3])

In [None]:
train.head()

In [None]:
train.to_json("train_labelled.json", orient='records', force_ascii=False)

## 노래들 간 co-occurrence를 이용하여 누락된 노래 100개 예측

In [61]:
train.tail()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
115066,"[록메탈, 밴드사운드, 록, 락메탈, 메탈, 락, extreme]",120325,METAL E'SM #2,"[429629, 441511, 612106, 516359, 691768, 38714...",3,2020-04-17 04:31:11.000
115067,[일렉],106976,빠른 리스너를 위한 따끈따끈한 최신 인기 EDM 모음!,"[321330, 216057, 534472, 240306, 331098, 23288...",13,2015-12-24 17:23:19.000
115068,"[담시, 가족, 눈물, 그리움, 주인공, 나의_이야기, 사랑, 친구]",11343,#1. 눈물이 앞을 가리는 나의_이야기,"[50512, 249024, 250608, 371171, 229942, 694943...",4,2019-08-16 20:59:22.000
115069,"[잔잔한, 버스, 퇴근버스, Pop, 풍경, 퇴근길]",131982,퇴근 버스에서 편히 들으면서 하루를 마무리하기에 좋은 POP,"[533534, 608114, 343608, 417140, 609009, 30217...",4,2019-10-25 23:40:42.000
115070,"[노래추천, 팝송추천, 팝송, 팝송모음]",100389,FAVORITE POPSONG!!!,"[26008, 456354, 324105, 89871, 135272, 143548,...",17,2020-04-18 20:35:06.000


In [65]:
pd.concat([train, val.iloc[:4,:]]).tail()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
115070,"[노래추천, 팝송추천, 팝송, 팝송모음]",100389,FAVORITE POPSONG!!!,"[26008, 456354, 324105, 89871, 135272, 143548,...",17,2020-04-18 20:35:06.000
0,[],118598,,"[373313, 151080, 275346, 696876, 165237, 52593...",1675,2019-05-27 14:14:33.000
1,[],131447,앨리스테이블,[],1,2014-07-16 15:24:24.000
2,[],51464,,"[529437, 516103, 360067, 705713, 226062, 37089...",62,2008-06-21 23:26:22.000
3,[],45144,,"[589668, 21711, 570151, 320043, 13930, 599327,...",20,2017-10-30 18:15:43.000


In [66]:
val = pd.read_json('val.json', typ = 'frame', encoding='utf-8')
test = pd.read_json('test.json', typ = 'frame', encoding='utf-8')

In [67]:
train = pd.concat([train, val, test])

In [57]:
# co-occurrence matrix for songs
num_songs = song_meta.shape[0]
def cooccur(song_lists):
    import scipy as sp
    i = 0
    result = sp.sparse.dok_matrix((num_songs,num_songs), dtype='int32')
    for songs in song_lists:
        if i > 9999 and i % 10000 == 0:
            print(i, "th completed", sep="")
        i = 0
        for song in songs:
            for co_occured_song in songs[(i+1):]:
                result[song, co_occured_song] += 1
                result[co_occured_song, song] += 1
            i += 1
    return result

In [76]:
# train.songs 이용 2~3시간 걸림
song_co_occurrence = cooccur(train.songs)
sp.sparse.save_npz("all_song_co_occurrence.npz", song_co_occurrence.tocoo())

In [None]:
song_co_occurrence_csr = song_co_occurrence.tocsr()
del song_co_occurrence

In [None]:
# song_co_occurrence_coo = song_co_occurrence.tocoo()
# sp.sparse.save_npz("train_song_co_occurrence.npz", song_co_occurrence_coo)

In [None]:
# train.songs_X 이용 
song_co_occurrence_reduced = cooccur(train.songs_X)

In [None]:
song_co_occurrence_reduced_coo = song_co_occurrence_reduced.tocoo()
sp.sparse.save_npz("train_song_co_occurrence_reduced.npz", song_co_occurrence_reduced_coo)

In [6]:
song_co_occurrence_csr = sp.sparse.load_npz("train_song_co_occurrence.npz").tocsr()

- 특정 song이 포함될 conditional probability를 계산하기 위한 song 별 총 등장횟수 계산

In [77]:
train_song_count = {song:0 for song in range(len(song_meta))}

In [78]:
for songs in train.songs:
    for song in songs:
        train_song_count[song] += 1

In [79]:
train_song_count = np.array([train_song_count[i] for i in range(len(song_meta))])

In [71]:
print(train_song_count)

[ 2  0  0 ... 14  1  1]


## 노래들과 태그의 co-occurrence를 이용하여 태그 10개 예측

In [80]:
# 태그 unique 값 추출
tags_dict = {tag:i for i, tag in enumerate(train.tags.explode().unique())}
index_tags = {index:tag for tag, index in tags_dict.items()}

In [81]:
# co-occurrence matrix for tags with respect to songs
def cooccur_tag(song_lists, tag_lists, n_songs, n_tags):
    import scipy as sp
    result = sp.sparse.dok_matrix((n_songs, n_tags), dtype='int16')
    i = 0
    for songs, tags in zip(song_lists, tag_lists):
        if i > 9999 and not i % 10000:
            print(i, "th list completed", sep="")
        songs_in_list = len(songs)
        for song in songs:
            for tag in tags:
                result[song, tags_dict[tag]] += 1
        i += 1
    return result

In [74]:
train.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000


In [82]:
tag_co_occurrence = cooccur_tag(train.songs, train.tags, n_songs=num_songs, n_tags=len(tags_dict))
tag_co_occurrence = tag_co_occurrence.tocoo()
sp.sparse.save_npz('all_tag_co_occurrence_coo.npz', tag_co_occurrence)

10000th list completed
20000th list completed
30000th list completed
40000th list completed
50000th list completed
60000th list completed
70000th list completed
80000th list completed
90000th list completed
100000th list completed
110000th list completed
120000th list completed
130000th list completed
140000th list completed


In [None]:
tag_co_occurrence = cooccur_tag(train.songs, train.tags, n_songs=num_songs, n_tags=len(tags_dict))
tag_co_occurrence = tag_co_occurrence.tocoo()
# sp.sparse.save_npz('train_tag_co_occurrence_coo.npz', tag_co_occurrence)

In [83]:
tag_co_occurrence_csr = tag_co_occurrence.tocsr()
del tag_co_occurrence

- 특정 tag가 포함될 conditional probability를 계산하기 위한 tag 별 총 등장횟수 계산

In [84]:
train_tag_index_count = {index:0 for index in range(len(tags_dict))}
train_tag_index_count = np.array([train_tag_index_count[i] for i in range(len(tags_dict))])

In [85]:
for tags in train.tags:
    for tag in tags:
        train_tag_index_count[tags_dict[tag]] += 1

In [14]:
train_tag_index_count

array([4007, 6520, 4579, ...,    1,    1,    1])

## association analysis - song 별로 support, confidence, lift 계산

- song association

In [86]:
nonzero_index = song_co_occurrence_csr.nonzero()

In [None]:
print(nonzero_index[0].shape, nonzero_index[1].shape)

In [87]:
n_train = train.shape[0]

In [88]:
train_support = song_co_occurrence_csr / n_train
train_support = train_support.astype('float16')
print(train_support.max(), type(train_support.max()))

0.01011 <class 'numpy.float16'>


In [95]:
train_confidence = sp.sparse.dok_matrix((num_songs, num_songs), dtype="float16")
train_lift = sp.sparse.dok_matrix((num_songs, num_songs), dtype="float16")

In [None]:
train_confidence[nonzero_index[0], nonzero_index[1]] = \
song_co_occurrence_csr[nonzero_index[0], nonzero_index[1]] / train_song_count[nonzero_index[0]]
del song_co_occurrence_csr
train_confidence = train_confidence.tocoo()
sp.sparse.save_npz("all_confidence_coo.npz", train_confidence)
train_confidence_csr = train_confidence.tocsr()
del train_confidence

In [19]:
# train_confidence_csr = sp.sparse.load_npz("train_confidence_coo.npz").tocsr()

In [20]:
sp.stats.describe(np.array(train_confidence_csr[nonzero_index[0], nonzero_index[1]])[0])

DescribeResult(nobs=278497102, minmax=(0.00045967102, 1.0), mean=0.17817852, variance=0.08319576, skewness=2.0607762336730957, kurtosis=3.0394186023966414)

In [None]:
train_lift[nonzero_index[0], nonzero_index[1]] = \
n_train*(train_confidence_csr[nonzero_index[0], nonzero_index[1]]) / train_song_count[nonzero_index[1]]
train_lift = train_lift.tocoo()
sp.sparse.save_npz("all_lift_coo.npz", train_lift)
train_lift_csr = train_lift_coo.tocsr()
del train_lift

In [33]:
# train_lift_csr = sp.sparse.load_npz("train_lift_coo.npz").tocsr()
sp.stats.describe(np.array(train_lift_csr[nonzero_index[0], nonzero_index[1]])[0])

  x = asanyarray(arr - arrmean)
  a_zero_mean = a - np.expand_dims(np.mean(a, axis), axis)


DescribeResult(nobs=278497102, minmax=(0.04751587, inf), mean=inf, variance=nan, skewness=nan, kurtosis=nan)

- tag association

In [None]:
tag_nonzero_index = tag_co_occurrence_csr.nonzero()

In [None]:
tag_train_support = tag_co_occurrence_csr / n_train
tag_train_support = tag_train_support.astype('float16')
print(tag_train_support.max(), type(tag_train_support.max()))

In [None]:
tag_train_confidence = sp.sparse.dok_matrix((num_songs, tag_co_occurrence_csr.shape[1]), dtype="float16")
tag_train_lift = sp.sparse.dok_matrix((num_songs, tag_co_occurrence_csr.shape[1]), dtype="float16")

In [None]:
tag_train_confidence[tag_nonzero_index[0], tag_nonzero_index[1]] = \
tag_co_occurrence_csr[tag_nonzero_index[0], tag_nonzero_index[1]] / train_song_count[tag_nonzero_index[0]]
del tag_co_occurrence_csr
tag_train_confidence = tag_train_confidence.tocoo()
tag_train_confidence_csr = tag_train_confidence.tocsr()
del tag_train_confidence
inf_index = np.isinf(tag_train_confidence_csr[tag_nonzero_index[0], tag_nonzero_index[1]])
tag_train_confidence_csr[tag_nonzero_index[0], tag_nonzero_index[1]][inf_index] = 
sp.sparse.save_npz("all_tag_confidence_coo.npz", tag_train_confidence)

In [42]:
# tag_train_confidence_csr = sp.sparse.load_npz("tag_train_confidence_coo.npz").tocsr()
sp.stats.describe(np.array(tag_train_confidence_csr[tag_nonzero_index[0], tag_nonzero_index[1]])[0])

DescribeResult(nobs=10984669, minmax=(0.00045967102, 1.0), mean=0.23281047, variance=0.102775514, skewness=1.5750420093536377, kurtosis=1.1285851758876895)

In [None]:
tag_train_lift[tag_nonzero_index[0], tag_nonzero_index[1]] = \
n_train*(tag_train_confidence_csr[tag_nonzero_index[0], tag_nonzero_index[1]]) / train_tag_index_count[tag_nonzero_index[1]]
tag_train_lift = tag_train_lift.tocoo()
sp.sparse.save_npz("all_tag_lift_coo.npz", tag_train_lift)
tag_train_lift_csr = tag_train_lift.tocsr()
del tag_train_lift

In [54]:
# tag_train_lift_csr = sp.sparse.load_npz("tag_train_lift_coo.npz").tocsr()
sp.stats.describe(np.array(tag_train_lift_csr[tag_nonzero_index[0], tag_nonzero_index[1]])[0])

  x = asanyarray(arr - arrmean)
  a_zero_mean = a - np.expand_dims(np.mean(a, axis), axis)


DescribeResult(nobs=10984669, minmax=(0.01159668, inf), mean=inf, variance=nan, skewness=nan, kurtosis=nan)

# validation 데이터 예측
- train, val, test 데이터로 계산한 song/tag의 support, confidence, lift와 reduced train 데이터를 이용하여 logistic 모델의 설명변수 구축
- 구축된 support, confidence, lift 데이터로 fit logistic regression 

In [None]:
print(val.shape)
val.head()

In [None]:
sum([1 if n_songs==0 else 0 for n_songs in list(map(lambda x: len(x), val.songs))])

#### validation 데이터에서  songs가 비어 있는 playlist는 4379개 (전체 playlist는 23015개)

### validation song 예측

In [None]:
print("다른 song과 함께 포함된 횟수 :", np.array(song_co_occurrence_csr.sum(axis=1))[:10,0])
print("song 별 총 등장 횟수", list(map(lambda x : train_song_count[x], range(10)) ))

In [None]:
# 비어 있는 playlist에 대해서는 등장횟수 상위 100song으로 예측
train_songs_co_occurrence_rank = np.array(song_co_occurrence_csr.sum(axis=1))[:,0].argsort()[::-1]
train_songs_co_occurrence_rank
np.array(song_co_occurrence_csr.sum(axis=1))[train_songs_co_occurrence_rank]

In [None]:
np.array(song_co_occurrence_csr[:10000, :].max(axis=0).todense())[0,:]

In [None]:
warnings.filterwarnings(action='ignore')
song_val_pred = []
for i, songs in enumerate(val.songs):
    if i > 0 and i % 1000 == 0:
        print(i, "th completed", sep="")
    
    if not songs:
        song_val_pred += [train_songs_co_occurrence_rank[:100].tolist()]
        continue
    
    song_i_pred = []
    song_i_candidate = np.array(song_co_occurrence_csr[songs,:].sum(axis=0))[0,:]
    song_i_candidate = song_i_candidate / train_song_count
    song_i_candidate[np.isnan(song_i_candidate)] = 0
    song_i_candidate = song_i_candidate.argsort()[::-1]
    
    count = 0
    for song in song_i_candidate:
        if not song in songs:
            song_i_pred += [song]
            count += 1
            if count == 100: break
    if count < 100:
        for song in train_songs_co_occurrence_rank:
            if not song in songs:
                song_i_pred += [song]
                count += 1
                if count == 100: break
    song_val_pred += [song_i_pred]
warnings.filterwarnings(action='default')

In [None]:
print(train_songs_co_occurrence_rank[:100].tolist())

In [None]:
print(song_val_pred[-3])

### validation tag 예측

In [None]:
tag_co_occurrence_csr

In [None]:
train_tags_co_occurrence_rank = list(map(lambda x: index_tags[x], np.array(tag_co_occurrence_csr.sum(axis=0))[0,:].argsort()[::-1]))
print(train_tags_co_occurrence_rank[:10])
np.array(tag_co_occurrence_csr.sum(axis=0))[0,:][np.array(tag_co_occurrence_csr.sum(axis=0))[0,:].argsort()[::-1]]

In [None]:
print(index_tags)

In [None]:
warnings.filterwarnings(action='ignore')
tag_val_pred = []
tag_top10 = [train_tags_co_occurrence_rank[:10]]
for i, songs in enumerate(val.songs):
    if i > 0 and i % 1000 == 0:
        print(i, "th completed", sep="")
    
    if not songs:
        tag_val_pred += tag_top10
        continue
    
    tag_i_pred = []
    tag_i_candidate = np.array(tag_co_occurrence_csr[songs,:].sum(axis=0))[0,:]
    tag_i_candidate = tag_i_candidate / train_tag_index_count
    tag_i_candidate[np.isnan(tag_i_candidate)] = 0
    tag_i_candidate = tag_i_candidate.argsort()[::-1]
    
    tags = val.tags[i]
    count = 0
    for tag_index in tag_i_candidate:
        tag = index_tags[tag_index]
        if not tag in tags:
            tag_i_pred += [tag]
            count += 1
            if count == 10: break
    

    tag_val_pred += [tag_i_pred]
warnings.filterwarnings(action='default')

In [None]:
tag_top10[0]

In [None]:
tag_val_pred[0]

In [None]:
val.head()

In [None]:
song_meta.iloc[val.songs[0], :]

In [None]:
print(list(song_meta.song_name[song_val_pred[0]]))

In [None]:
val_result = [{'id':val.id[i], 'songs':song_val_pred[i], 'tags':tag_val_pred[i]} for i in range(val.shape[0])]

In [None]:
len(val_result)

In [None]:
val_result = re.sub("\'", '\"', str(val_result))

In [None]:
with open('results.json', 'w', encoding='utf-8') as f:
    f.write(str(val_result))