In [None]:
# Extract tracksUrl, AlbumUrl, ArtistUrl, playlist name, NO. of tracks, pid fields

import json
import string
import copy
import os

# Reduce every raw MPD slice to the fields the later cells read (normalised
# playlist name, pid, num_tracks and the bare track / artist / album ids) and
# write one extract file per slice under the same file name.
mpd_data_dir = 'F:\\SpotifyPlaylistData\\MPD\\data'
mpd_extract_dir = 'F:\\SpotifyPlaylistData\\MPD_Extract'

# Number of leading characters dropped from each URI (14 / 15 / 14).
# Assumes URIs of the form 'spotify:track:<id>' etc. - TODO confirm against the raw data.
track_prefix_len = len('spotify:track:')
artist_prefix_len = len('spotify:artist:')
album_prefix_len = len('spotify:album:')

count_names = 0   # playlists that carried a 'name' field
count_tracks = 0  # track entries extracted over all slices
for fname in os.listdir(mpd_data_dir):
    # Explicit encoding so reading does not depend on the platform default;
    # `with` guarantees the handle is closed even if parsing fails.
    with open(os.path.join(mpd_data_dir, fname), encoding='utf8') as f:
        mpd_slice = json.load(f)
    print(fname)

    playlists = []
    for raw_playlist in mpd_slice['playlists']:
        if 'name' in raw_playlist:
            # Lower-case the name and collapse every whitespace run to one space.
            playlist_name = ' '.join(raw_playlist['name'].lower().split())
            count_names += 1
        else:
            # The name used to leak in from the previous playlist (or raise
            # NameError on the very first one); use an explicit empty name.
            playlist_name = ''

        raw_tracks = raw_playlist['tracks']
        num_tracks = raw_playlist['num_tracks']
        if len(raw_tracks) != num_tracks:
            print('num_tracks mismatch!!')

        # Iterate the tracks that are really present, so a wrong num_tracks
        # value can neither raise IndexError nor silently drop tracks.
        playlists.append({
            'name': playlist_name,
            'pid': raw_playlist['pid'],
            'num_tracks': num_tracks,
            'tracks': [track['track_uri'][track_prefix_len:] for track in raw_tracks],
            'artists': [track['artist_uri'][artist_prefix_len:] for track in raw_tracks],
            'albums': [track['album_uri'][album_prefix_len:] for track in raw_tracks],
        })
        count_tracks += len(raw_tracks)

    playlists_MPD = {'playlists': playlists}
    with open(os.path.join(mpd_extract_dir, fname), 'w', encoding='utf8') as outfile:
        outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print(count_names)
print(count_tracks)

In [None]:
#  Train a word2vec model to generate an embedding vector for each track

from gensim.models.word2vec import Word2Vec
import logging
import os
import json

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 
class MySentences(object):
    """Restartable iterable over the playlists stored in a directory of slice files.

    Every file in ``dirname`` is expected to be a UTF-8 JSON document with a
    top-level ``'playlists'`` list whose items carry a ``'tracks'`` list.
    Each iteration pass re-reads the directory, so the object can be consumed
    several times (vocabulary scan plus every training pass).
    """

    def __init__(self, dirname):
        # Directory that holds the slice files.
        self.dirname = dirname

    def __iter__(self):
        """Yield the ``'tracks'`` list of every playlist, one slice file at a time."""
        for fname in os.listdir(self.dirname):
            with open(os.path.join(self.dirname, fname), encoding='utf8') as f:
                mpd_slice = json.load(f)
            # Walk the playlists that are actually present: the former fixed
            # range(1000) raised IndexError on shorter slices and ignored any
            # playlists beyond the first thousand.
            for playlist in mpd_slice['playlists']:
                yield playlist['tracks']

# Stream the playlists (lists of track ids) from the extracted slices. MySentences
# re-reads the directory on every pass, so the same object serves build_vocab
# and every train() call below.
sentences = MySentences('F:\\SpotifyPlaylistData\\MPD_Extract') # a memory-friendly iterator
# 40-dimensional vectors; alpha and min_alpha start at the same value (0.025).
# NOTE(review): `size` and `iter` look like pre-4.0 gensim keyword names - confirm
# against the installed gensim version before running.
model = Word2Vec(size=40,alpha=0.025, window=5, min_count=1,sample=0.001, workers=8,
                 min_alpha=0.025, sg=1,negative=5,iter=5)
model.build_vocab(sentences)
# Manual learning-rate schedule: 10 rounds of train(..., epochs=5); after each
# round alpha is lowered by 0.002 and min_alpha is pinned to the same value.
# NOTE(review): presumably this keeps the rate constant within a round; whether
# train() picks up the mutated attributes depends on the gensim version - confirm.
for epoch in range(10):
    model.train(sentences, model.corpus_count, epochs=5)
    model.alpha -= 0.002
    model.min_alpha = model.alpha
model.save('F:\\SpotifyPlaylistData\\word2vec.model')

In [None]:
#  Train a doc2vec model to generate an embedding vector for each playlist name
import gensim, logging
from gensim.models.doc2vec import TaggedDocument
from gensim.models.doc2vec import Doc2Vec
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

class TaggedDoc(object):
    """Restartable iterable of ``TaggedDocument`` objects, one per stored playlist.

    Every file in ``dirname`` is expected to be a UTF-8 JSON document with a
    top-level ``'playlists'`` list; each playlist contributes its ``'tracks'``
    list as the document words and its ``'name'`` as the single tag.
    """

    def __init__(self, dirname):
        # Directory that holds the slice files.
        self.dirname = dirname

    def __iter__(self):
        """Yield one ``TaggedDocument(words=tracks, tags=[name])`` per playlist."""
        for fname in os.listdir(self.dirname):
            with open(os.path.join(self.dirname, fname), encoding='utf8') as f:
                mpd_slice = json.load(f)
            # Iterate the playlists actually present instead of a fixed
            # range(1000), which broke on slices of any other size.
            for playlist in mpd_slice['playlists']:
                # The tags list was missing its closing bracket (syntax error).
                yield TaggedDocument(words=playlist['tracks'],
                                     tags=[playlist['name']])

# 40-dimensional vectors; alpha and min_alpha start at the same value (0.025).
# NOTE(review): `size` and `iter` look like pre-4.0 gensim keyword names - confirm
# against the installed gensim version before running.
model = Doc2Vec(dm=1, size=40, window=5, alpha=0.025, min_alpha=0.025, min_count=1, sample=0.001,
                workers=4, iter=5, negative=5, dbow_words=1)
# One tagged document per extracted playlist: track ids as words, playlist name as tag.
TaggedDocs = TaggedDoc('F:\\SpotifyPlaylistData\\MPD_Extract')
model.build_vocab(TaggedDocs)
# Manual learning-rate schedule: 10 rounds of train(..., epochs=5), lowering
# alpha by 0.002 after each round.
# NOTE(review): total_examples is hard-coded to 1000000 here, whereas the word2vec
# cell passes model.corpus_count - confirm the corpus really has that many playlists.
for epoch in range(10):
    model.train(TaggedDocs,total_examples=1000000, epochs=5)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
model.save('F:\\SpotifyPlaylistData\\doc2vec.model')

In [None]:
# Create the mapping between each trackUrl and its corresponding integer index

import collections
import os
import json
import pickle

def trackUrlGenerator(trackUrl2artistUrl,trackUrl2albumUrl,
                      data_dir='F:\\SpotifyPlaylistData\\MPD\\data'):
    """Yield the bare track id of every track occurrence in the raw slices.

    As a side effect the two dictionaries passed in are filled while iterating:
    ``trackUrl2artistUrl[track] = artist`` and ``trackUrl2albumUrl[track] = album``
    (a later occurrence of the same track overwrites the earlier entry).

    Args:
        trackUrl2artistUrl: dict updated in place with track id -> artist id.
        trackUrl2albumUrl: dict updated in place with track id -> album id.
        data_dir: directory holding the raw slice files; defaults to the
            location that used to be hard-coded.

    Yields:
        The track id (``track_uri`` without its first 14 characters) once per
        occurrence, so duplicates are yielded again.

    Once exhausted, the generator prints the number of playlists and tracks seen.
    """
    count_playlists = 0
    count_tracks = 0
    for fname in os.listdir(data_dir):
        # Explicit encoding so reading does not depend on the platform default.
        with open(os.path.join(data_dir, fname), encoding='utf8') as f:
            mpd_slice = json.load(f)
        print(fname)
        for playlist in mpd_slice['playlists']:
            # Walk the tracks that are really present rather than
            # range(num_tracks), which raised IndexError (or skipped tracks)
            # whenever the declared count disagreed with the list.
            for track in playlist['tracks']:
                count_tracks += 1
                # Slices drop the leading 14 / 15 / 14 characters of the URIs
                # (assumes 'spotify:track:' style prefixes - TODO confirm).
                trackUrl = track['track_uri'][14:]
                artistUrl = track['artist_uri'][15:]
                albumUrl = track['album_uri'][14:]
                trackUrl2artistUrl[trackUrl] = artistUrl
                trackUrl2albumUrl[trackUrl] = albumUrl
                yield trackUrl
            count_playlists += 1
    print('count_playlists:',count_playlists)
    print('count_tracks:',count_tracks)

# Build the track-id <-> integer-index tables. Indices follow descending
# frequency: the most common track gets index 0.
tracksUrl2idx = dict()
trackUrl2artistUrl = dict()
trackUrl2albumUrl = dict()
# Consuming the generator through Counter also fills the two track -> artist /
# track -> album dictionaries as a side effect.
trackUrlGen = trackUrlGenerator(trackUrl2artistUrl,trackUrl2albumUrl)
counter = collections.Counter(trackUrlGen)
tracksUrl_count = counter.most_common()
print('unique tracks -> len(tracksUrl_count):',len(tracksUrl_count))
sum_tracks = 0
for trackUrl, count in tracksUrl_count:
    # Counter keys are unique, so the current size is the next free index.
    tracksUrl2idx[trackUrl] = len(tracksUrl2idx)
    sum_tracks += count
idx2tracksUrl = {idx: trackUrl for trackUrl, idx in tracksUrl2idx.items()}
print('total tracks(including repeating) -> sum_tracks:',sum_tracks)
print('unique tracks -> len(tracksUrl2idx):',len(tracksUrl2idx))
print('unique tracks -> len(idx2tracksUrl):',len(idx2tracksUrl))
with open("tracksUrl2idx.txt","wb") as f:
    pickle.dump(tracksUrl2idx,f)
with open("idx2tracksUrl.txt","wb") as f:
    pickle.dump(idx2tracksUrl,f)
# The indexing cell loads these two tables from disk, but they were never
# written anywhere; persist them next to the index tables.
with open("trackUrl2artistUrl.txt","wb") as f:
    pickle.dump(trackUrl2artistUrl,f)
with open("trackUrl2albumUrl.txt","wb") as f:
    pickle.dump(trackUrl2albumUrl,f)

In [None]:
# Creating mapping between playlist name and integer indices

import collections
import os
import json
import pickle


def nameGenerator(data_dir='F:\\SpotifyPlaylistData\\MPD_Extract'):
    """Yield the name of every playlist stored in the extracted slice files.

    Args:
        data_dir: directory holding the extract files (UTF-8 JSON with a
            top-level ``'playlists'`` list); defaults to the location that
            used to be hard-coded.

    Yields:
        One playlist name per playlist, repeated names included. If a stored
        ``'name'`` is not a string it is treated as a collection of names and
        each item is yielded.

    Once exhausted, the generator prints the number of playlists and names seen.
    """
    count_playlists = 0
    count_names = 0
    for fname in os.listdir(data_dir):
        with open(os.path.join(data_dir, fname), encoding='utf8') as f:
            mpd_slice = json.load(f)
        print(fname)
        for playlist in mpd_slice['playlists']:
            playlistName = playlist['name']
            # Looping over a string yields its characters, which made the
            # resulting index a character table instead of a name table.
            names = [playlistName] if isinstance(playlistName, str) else playlistName
            for name in names:
                count_names += 1
                yield name
            count_playlists += 1
    print('count_playlists:',count_playlists)
    print('count_names:',count_names)

# Count every playlist name and number the distinct names by descending
# frequency (the most common name receives index 0).
playlistNameGen = nameGenerator()
counter = collections.Counter(playlistNameGen)
playlistName_count = list(counter.most_common(len(counter)))
print('len(counter):',len(counter))
print('unique playlistNames -> len(playlistName_count):',len(playlistName_count))

playlistName2idx = {}
sum_playlistNames = 0
for position, (playlistName, count) in enumerate(playlistName_count):
    # Names coming out of the Counter are distinct, so the running position
    # equals the size of the table built so far.
    playlistName2idx[playlistName] = position
    sum_playlistNames += count

# Reverse table: index -> name.
idx2playlistName = {idx: name for name, idx in playlistName2idx.items()}
print('total playlistName(including repeating) -> sum_playlistNames:',sum_playlistNames)
print('unique playlistNames -> len(playlistName2idx):',len(playlistName2idx))
print('unique playlistNames -> len(idx2playlistName):',len(idx2playlistName))

# Persist both tables for the later cells.
for table, target in ((playlistName2idx, "playlistName2idx.txt"),
                      (idx2playlistName, "idx2playlistName.txt")):
    with open(target,"wb") as f:
        pickle.dump(table,f)

In [None]:
#  Indexing tracksUrl, artistUrl, albumUrl, playlist name, NO. of tracks. 
import json
import copy
import os
import pickle

# Lookup tables expected on disk in the working directory.
# NOTE(review): artistsUrl2idx.txt, albumsUrl2idx.txt and num_tracks2idx.txt are
# not written by any cell visible in this notebook - confirm where they come from.
with open("playlistName2idx.txt","rb") as f:
    playlistName2idx = pickle.load(f)
with open("tracksUrl2idx.txt","rb") as f:
    tracksUrl2idx = pickle.load(f)
with open("trackUrl2artistUrl.txt","rb") as f:
    trackUrl2artistUrl = pickle.load(f)
with open("trackUrl2albumUrl.txt","rb") as f:
    trackUrl2albumUrl = pickle.load(f)
with open("artistsUrl2idx.txt","rb") as f:
    artistsUrl2idx = pickle.load(f)
with open("albumsUrl2idx.txt","rb") as f:
    albumsUrl2idx = pickle.load(f)
with open("num_tracks2idx.txt","rb") as f:
    num_tracks2idx = pickle.load(f)

# Translate every extracted slice into integer indices and write the result
# under the same file name into the indices directory.
extract_dir = 'F:\\SpotifyPlaylistData\\MPD_Extract'
indices_dir = 'F:\\SpotifyPlaylistData\\MPD_Extract_Indices'

count_tracks = 0
for fname in os.listdir(extract_dir):
    with open(os.path.join(extract_dir, fname), encoding='utf8') as f:
        extract_slice = json.load(f)
    print(fname)

    playlists = []
    # Iterate the playlists actually present; the former fixed range(1000)
    # raised IndexError on shorter slices and ignored longer ones.
    for source in extract_slice['playlists']:
        tracks = source['tracks']
        artists = source['artists']
        albums = source['albums']
        tracks_len = len(tracks)
        num_tracksIdx = num_tracks2idx[tracks_len]

        # Fresh lists per playlist replace the shared buffers + deepcopy.
        tracksIdx = []
        artistsIdx = []
        albumsIdx = []
        for track_pos, trackUrl in enumerate(tracks):
            tracksIdx.append(tracksUrl2idx[trackUrl])

            # Cross-check the artist / album recorded for this track in the
            # global tables against the ones stored with the playlist.
            # NOTE(review): on a mismatch nothing is appended, so artistsIdx /
            # albumsIdx end up shorter than tracksIdx - confirm this is intended.
            artistIdx = artistsUrl2idx[trackUrl2artistUrl[trackUrl]]
            if artistIdx != artistsUrl2idx[artists[track_pos]]:
                print('artist doesn\'t match!!')
            else:
                artistsIdx.append(artistIdx)

            albumIdx = albumsUrl2idx[trackUrl2albumUrl[trackUrl]]
            if albumIdx != albumsUrl2idx[albums[track_pos]]:
                print("album doesn't match!!")
            else:
                albumsIdx.append(albumIdx)
            count_tracks += 1
        if source['num_tracks'] != tracks_len:
            print('num_tracks mismatch!!')

        playlists.append({
            'nameIdx': playlistName2idx[source['name']],
            'pid': source['pid'],
            'num_tracks': tracks_len,
            'num_tracksIdx': num_tracksIdx,
            'tracksIdx': tracksIdx,
            'artistsIdx': artistsIdx,
            'albumsIdx': albumsIdx,
        })

    playlists_MPD = {'playlists': playlists}
    with open(os.path.join(indices_dir, fname), 'w', encoding='utf8') as outfile:
        outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print(count_tracks)

In [None]:
# Creating mapping between track and track's word2vec embedding features

import pickle
from gensim.models.word2vec import Word2Vec

# Load the trained track model and the track-id -> index table.
model = Word2Vec.load('F:\\SpotifyPlaylistData\\word2vec.model')
with open("tracksUrl2idx.txt","rb") as f:
    tracksUrl2idx = pickle.load(f)

# Re-key the embeddings by the string form of the track index
# (key: str, value: numpy vector taken from model.wv).
trackIdx2Features = {
    str(trackIdx): model.wv[tracksUrl]
    for tracksUrl, trackIdx in tracksUrl2idx.items()
}
# One lookup was performed per entry of the table.
tracksCount = len(tracksUrl2idx)
print('unique tracks:',tracksCount)
print('len(trackIdx2Features):',len(trackIdx2Features))
with open("trackIdx2Features.txt","wb") as f:
    pickle.dump(trackIdx2Features,f)

In [None]:
# Creating mapping between playlist name and its doc2vec embedding features

import pickle
from gensim.models.doc2vec import Doc2Vec

# Load the trained playlist-name model and the name -> index table.
model = Doc2Vec.load('F:\\SpotifyPlaylistData\\doc2vec.model')
with open("playlistName2idx.txt","rb") as f:
    playlistName2idx = pickle.load(f)

# Re-key the document vectors by the string form of the name index
# (key: str, value: numpy vector taken from model.docvecs).
nameIdx2Features = {
    str(nameIdx): model.docvecs[playlistName]
    for playlistName, nameIdx in playlistName2idx.items()
}
# One lookup was performed per entry of the table.
namesCount = len(playlistName2idx)
print('unique names:',namesCount)
print('len(nameIdx2Features):',len(nameIdx2Features))
with open("nameIdx2Features.txt","wb") as f:
    pickle.dump(nameIdx2Features,f)

In [None]:
# Generate dataset 1 as an HDF5 file with fields ['num_tracksIdx'], ['nameVec'], ['tracksIdx'], corresponding to task category 1

import numpy as np
import h5py
import pickle
import os
import json

# Dataset 1: one row per playlist holding its track-count index, the embedding
# of its name and its track indices padded to a fixed width.
with h5py.File('F:/SpotifyPlaylistData/Dataset_1.hdf5','w') as f:
    f.create_dataset('num_tracksIdx', (1000000,1),dtype = 'int32')
    f.create_dataset('nameVec', (1000000,40),dtype = 'float32')
    f.create_dataset('tracksIdx', (1000000,376),dtype = 'int32',fillvalue=-1)
with h5py.File('F:/SpotifyPlaylistData/Dataset_1.hdf5','r') as f:
    print(f['tracksIdx'][0,:].shape)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)

# The index fields read below ('num_tracksIdx', 'nameIdx', 'tracksIdx') only
# exist in the indexed slices, not in the plain MPD_Extract files.
indices_dir = 'F:/SpotifyPlaylistData/MPD_Extract_Indices'
with h5py.File('F:/SpotifyPlaylistData/Dataset_1.hdf5','a') as f_data:
    # Resolve the datasets once instead of on every single write.
    num_tracksIdx_ds = f_data['num_tracksIdx']
    nameVec_ds = f_data['nameVec']
    tracksIdx_ds = f_data['tracksIdx']
    target_width = tracksIdx_ds.shape[1]

    count = 0
    count_playlists = 0
    for fname in os.listdir(indices_dir):
        with open(os.path.join(indices_dir, fname),encoding='utf8') as f:
            indexed_slice = json.load(f)
        print(fname)
        for playlist in indexed_slice['playlists']:
            count_playlists += 1
            num_tracksIdx_ds[count,:] = playlist['num_tracksIdx']
            nameVec_ds[count,:] = nameIdx2Features[str(playlist['nameIdx'])]

            tracksIdx_l = np.asarray(playlist['tracksIdx'])
            # np.resize repeats the sequence cyclically (or truncates it) to the
            # row width - the same padding the element-wise loops produced, but
            # written in one go. The old loop also incremented an undefined
            # `flag_375` counter here. An empty playlist keeps the -1 fill value.
            if tracksIdx_l.size > 0:
                tracksIdx_ds[count,:] = np.resize(tracksIdx_l, target_width)
            count += 1

print('count:',count)
print('count_playlists:',count_playlists)

In [None]:
# Generate dataset 2 as an HDF5 file with fields ['num_tracksIdx'], ['nameVec'], ['tracksVec'], ['tracksIdx'], corresponding to task category 2

import numpy as np
import h5py
import pickle
import os
import json

# Dataset 2: one row per playlist holding its track-count index, the embedding
# of its name, the embedding of its first track and the indices of the
# remaining tracks padded to a fixed width.
with h5py.File('F:/SpotifyPlaylistData/Dataset_2.hdf5','w') as f:
    f.create_dataset('num_tracksIdx', (1000000,1),dtype = 'int32')
    f.create_dataset('nameVec', (1000000,40),dtype = 'float32')
    f.create_dataset('tracksVec', (1000000,40),dtype = 'float32') # one track
    f.create_dataset('tracksIdx', (1000000,375),dtype = 'int32',fillvalue=-1)
with h5py.File('F:/SpotifyPlaylistData/Dataset_2.hdf5','r') as f:
    print(f['tracksIdx'][0,:].shape)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

indices_dir = 'F:/SpotifyPlaylistData/MPD_Extract_Indices'
with h5py.File('F:/SpotifyPlaylistData/Dataset_2.hdf5','a') as f_data:
    # Resolve the datasets once instead of on every single write.
    num_tracksIdx_ds = f_data['num_tracksIdx']
    nameVec_ds = f_data['nameVec']
    tracksVec_ds = f_data['tracksVec']
    tracksIdx_ds = f_data['tracksIdx']
    target_width = tracksIdx_ds.shape[1]

    count = 0
    count_playlists = 0
    for fname in os.listdir(indices_dir):
        with open(os.path.join(indices_dir, fname),encoding='utf8') as f:
            indexed_slice = json.load(f)
        print(fname)
        for playlist in indexed_slice['playlists']:
            count_playlists += 1
            num_tracksIdx_ds[count,:] = playlist['num_tracksIdx']
            nameVec_ds[count,:] = nameIdx2Features[str(playlist['nameIdx'])]
            tracksIdx = playlist['tracksIdx']
            # The first track is the seed; its embedding is stored as-is.
            tracksVec_ds[count,:] = trackIdx2Features[str(tracksIdx[0])]

            tracksIdx_l = np.asarray(tracksIdx[1:])
            # np.resize repeats the remaining indices cyclically (or truncates
            # them) to the row width - the same padding as the former
            # element-wise loops, written in one go. A playlist with no
            # remaining tracks keeps the -1 fill value instead of dividing by zero.
            if tracksIdx_l.size > 0:
                tracksIdx_ds[count,:] = np.resize(tracksIdx_l, target_width)
            count += 1
print('count:',count)
print('count_playlists:',count_playlists)

In [None]:
# Generate dataset 3 as an HDF5 file with fields ['num_tracksIdx'], ['nameVec'], ['tracksVec'], ['tracksIdx'], corresponding to task category 3

import numpy as np
import h5py
import pickle
import os
import json

# Dataset 3: one row per playlist with more than 5 tracks, holding its
# track-count index, the embedding of its name, the concatenated embeddings of
# its first 5 tracks and the indices of the remaining tracks padded to a fixed width.
with h5py.File('F:/SpotifyPlaylistData/Dataset_3.hdf5','w') as f:
    f.create_dataset('num_tracksIdx', (994587,1),dtype = 'int32')
    f.create_dataset('nameVec', (994587,40),dtype = 'float32')
    f.create_dataset('tracksVec', (994587,200),dtype = 'float32') # five tracks
    f.create_dataset('tracksIdx', (994587,371),dtype = 'int32',fillvalue=-1)
with h5py.File('F:/SpotifyPlaylistData/Dataset_3.hdf5','r') as f:
    print(f['tracksIdx'][0,:].shape)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

indices_dir = 'F:/SpotifyPlaylistData/MPD_Extract_Indices'
num_seed_tracks = 5
with h5py.File('F:/SpotifyPlaylistData/Dataset_3.hdf5','a') as f_data:
    # Resolve the datasets once instead of on every single write.
    num_tracksIdx_ds = f_data['num_tracksIdx']
    nameVec_ds = f_data['nameVec']
    tracksVec_ds = f_data['tracksVec']
    tracksIdx_ds = f_data['tracksIdx']
    target_width = tracksIdx_ds.shape[1]

    count = 0            # rows written so far
    count_playlists = 0  # playlists inspected so far
    tracksVec_arr = np.zeros([num_seed_tracks,40],dtype=np.float32)
    for fname in os.listdir(indices_dir):
        with open(os.path.join(indices_dir, fname),encoding='utf8') as f:
            indexed_slice = json.load(f)
        print(fname)
        for playlist in indexed_slice['playlists']:
            count_playlists += 1
            if playlist['num_tracks'] <= num_seed_tracks:
                continue
            num_tracksIdx_ds[count,:] = playlist['num_tracksIdx']
            nameVec_ds[count,:] = nameIdx2Features[str(playlist['nameIdx'])]
            tracksIdx = playlist['tracksIdx']
            for i in range(num_seed_tracks):
                tracksVec_arr[i,:] = trackIdx2Features[str(tracksIdx[i])]
            tracksVec_ds[count,:] = tracksVec_arr.flatten()

            tracksIdx_l = np.asarray(tracksIdx[num_seed_tracks:])
            # np.resize repeats the remaining indices cyclically (or truncates
            # them) to the row width - the same padding as the former
            # element-wise loops, written in one go. An empty remainder keeps
            # the -1 fill value instead of dividing by zero.
            if tracksIdx_l.size > 0:
                tracksIdx_ds[count,:] = np.resize(tracksIdx_l, target_width)
            count += 1

print('count:',count)
print('count_playlists:',count_playlists)

In [None]:
# Generate dataset 4 as an HDF5 file with fields ['num_tracksIdx'], ['tracksVec'], ['tracksIdx'], corresponding to task category 4

import numpy as np
import h5py
import pickle
import os
import json

# Dataset 4: one row per playlist with more than 5 tracks, holding its
# track-count index, the concatenated embeddings of its first 5 tracks and the
# indices of the remaining tracks padded to a fixed width (no name embedding).
with h5py.File('F:/SpotifyPlaylistData/Dataset_4.hdf5','w') as f:
    f.create_dataset('num_tracksIdx', (994587,1),dtype = 'int32')
    f.create_dataset('tracksVec', (994587,200),dtype = 'float32') # five tracks
    f.create_dataset('tracksIdx', (994587,371),dtype = 'int32',fillvalue=-1)
with h5py.File('F:/SpotifyPlaylistData/Dataset_4.hdf5','r') as f:
    print(f['tracksIdx'][0,:].shape)

with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

indices_dir = 'F:/SpotifyPlaylistData/MPD_Extract_Indices'
num_seed_tracks = 5
with h5py.File('F:/SpotifyPlaylistData/Dataset_4.hdf5','a') as f_data:
    # Resolve the datasets once instead of on every single write.
    num_tracksIdx_ds = f_data['num_tracksIdx']
    tracksVec_ds = f_data['tracksVec']
    tracksIdx_ds = f_data['tracksIdx']
    target_width = tracksIdx_ds.shape[1]

    count = 0            # rows written so far
    count_playlists = 0  # playlists inspected so far
    tracksVec_arr = np.zeros([num_seed_tracks,40],dtype=np.float32)
    for fname in os.listdir(indices_dir):
        with open(os.path.join(indices_dir, fname),encoding='utf8') as f:
            indexed_slice = json.load(f)
        print(fname)
        for playlist in indexed_slice['playlists']:
            count_playlists += 1
            if playlist['num_tracks'] <= num_seed_tracks:
                continue
            num_tracksIdx_ds[count,:] = playlist['num_tracksIdx']
            tracksIdx = playlist['tracksIdx']
            for i in range(num_seed_tracks):
                tracksVec_arr[i,:] = trackIdx2Features[str(tracksIdx[i])]
            tracksVec_ds[count,:] = tracksVec_arr.flatten()

            tracksIdx_l = np.asarray(tracksIdx[num_seed_tracks:])
            # np.resize repeats the remaining indices cyclically (or truncates
            # them) to the row width - the same padding as the former
            # element-wise loops, written in one go. An empty remainder keeps
            # the -1 fill value instead of dividing by zero.
            if tracksIdx_l.size > 0:
                tracksIdx_ds[count,:] = np.resize(tracksIdx_l, target_width)
            count += 1

print('count:',count)
print('count_playlists:',count_playlists)

In [None]:
# Generate dataset 5 as an HDF5 file with fields ['num_tracksIdx'], ['nameVec'], ['tracksVec'], ['tracksIdx'], corresponding to task category 5

import numpy as np
import h5py
import pickle
import os
import json

# Dataset 5: one row per playlist with more than 10 tracks, holding its
# track-count index, the embedding of its name, the concatenated embeddings of
# its first 10 tracks and the indices of the remaining tracks padded to a fixed width.
with h5py.File('F:/SpotifyPlaylistData/Dataset_5.hdf5','w') as f:
    f.create_dataset('num_tracksIdx', (953325,1),dtype = 'int32')
    f.create_dataset('nameVec', (953325,40),dtype = 'float32')
    f.create_dataset('tracksVec', (953325,400),dtype = 'float32') # ten tracks
    f.create_dataset('tracksIdx', (953325,366),dtype = 'int32',fillvalue=-1)
with h5py.File('F:/SpotifyPlaylistData/Dataset_5.hdf5','r') as f:
    print(f['tracksIdx'][0,:].shape)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

indices_dir = 'F:/SpotifyPlaylistData/MPD_Extract_Indices'
num_seed_tracks = 10
with h5py.File('F:/SpotifyPlaylistData/Dataset_5.hdf5','a') as f_data:
    # Resolve the datasets once instead of on every single write.
    num_tracksIdx_ds = f_data['num_tracksIdx']
    nameVec_ds = f_data['nameVec']
    tracksVec_ds = f_data['tracksVec']
    tracksIdx_ds = f_data['tracksIdx']
    target_width = tracksIdx_ds.shape[1]

    count = 0            # rows written so far
    count_playlists = 0  # playlists inspected so far
    tracksVec_arr = np.zeros([num_seed_tracks,40],dtype=np.float32)
    for fname in os.listdir(indices_dir):
        with open(os.path.join(indices_dir, fname),encoding='utf8') as f:
            indexed_slice = json.load(f)
        print(fname)
        for playlist in indexed_slice['playlists']:
            count_playlists += 1
            if playlist['num_tracks'] <= num_seed_tracks:
                continue
            num_tracksIdx_ds[count,:] = playlist['num_tracksIdx']
            nameVec_ds[count,:] = nameIdx2Features[str(playlist['nameIdx'])]
            tracksIdx = playlist['tracksIdx']
            for i in range(num_seed_tracks):
                tracksVec_arr[i,:] = trackIdx2Features[str(tracksIdx[i])]
            tracksVec_ds[count,:] = tracksVec_arr.flatten()

            tracksIdx_l = np.asarray(tracksIdx[num_seed_tracks:])
            # np.resize repeats the remaining indices cyclically (or truncates
            # them) to the row width - the same padding as the former
            # element-wise loops, written in one go. An empty remainder keeps
            # the -1 fill value instead of dividing by zero.
            if tracksIdx_l.size > 0:
                tracksIdx_ds[count,:] = np.resize(tracksIdx_l, target_width)
            count += 1

print('count:',count)
print('count_playlists:',count_playlists)

In [None]:
# Generate dataset 6 as an HDF5 file with fields ['num_tracksIdx'], ['tracksVec'], ['tracksIdx'], corresponding to task category 6

import numpy as np
import h5py
import pickle
import os
import json

# Dataset 6: one row per playlist with more than 10 tracks, holding its
# track-count index, the concatenated embeddings of its first 10 tracks and the
# indices of the remaining tracks padded to a fixed width (no name embedding).
with h5py.File('F:/SpotifyPlaylistData/Dataset_6.hdf5','w') as f:
    f.create_dataset('num_tracksIdx', (953325,1),dtype = 'int32')
    f.create_dataset('tracksVec', (953325,400),dtype = 'float32') # ten tracks
    f.create_dataset('tracksIdx', (953325,366),dtype = 'int32',fillvalue=-1)
with h5py.File('F:/SpotifyPlaylistData/Dataset_6.hdf5','r') as f:
    print(f['tracksIdx'][0,:].shape)

with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

indices_dir = 'F:/SpotifyPlaylistData/MPD_Extract_Indices'
num_seed_tracks = 10
with h5py.File('F:/SpotifyPlaylistData/Dataset_6.hdf5','a') as f_data:
    # Resolve the datasets once instead of on every single write.
    num_tracksIdx_ds = f_data['num_tracksIdx']
    tracksVec_ds = f_data['tracksVec']
    tracksIdx_ds = f_data['tracksIdx']
    target_width = tracksIdx_ds.shape[1]

    count = 0            # rows written so far
    count_playlists = 0  # playlists inspected so far
    tracksVec_arr = np.zeros([num_seed_tracks,40],dtype=np.float32)
    for fname in os.listdir(indices_dir):
        with open(os.path.join(indices_dir, fname),encoding='utf8') as f:
            indexed_slice = json.load(f)
        print(fname)
        for playlist in indexed_slice['playlists']:
            count_playlists += 1
            if playlist['num_tracks'] <= num_seed_tracks:
                continue
            num_tracksIdx_ds[count,:] = playlist['num_tracksIdx']
            tracksIdx = playlist['tracksIdx']
            for i in range(num_seed_tracks):
                tracksVec_arr[i,:] = trackIdx2Features[str(tracksIdx[i])]
            tracksVec_ds[count,:] = tracksVec_arr.flatten()

            tracksIdx_l = np.asarray(tracksIdx[num_seed_tracks:])
            # np.resize repeats the remaining indices cyclically (or truncates
            # them) to the row width - the same padding as the former
            # element-wise loops, written in one go. An empty remainder keeps
            # the -1 fill value instead of dividing by zero.
            if tracksIdx_l.size > 0:
                tracksIdx_ds[count,:] = np.resize(tracksIdx_l, target_width)
            count += 1

print('count:',count)
print('count_playlists:',count_playlists)

In [None]:
# Generate dataset 7 as an HDF5 file with fields ['num_tracksIdx'], ['nameVec'], ['tracksVec'], ['tracksIdx'], corresponding to task category 7

import numpy as np
import h5py
import pickle
import os
import json

# Dataset 7: one row per playlist with more than 25 tracks, holding its
# track-count index, the embedding of its name, the concatenated embeddings of
# its first 25 tracks and the indices of the remaining tracks padded to a fixed width.
with h5py.File('F:/SpotifyPlaylistData/Dataset_7.hdf5','w') as f:
    f.create_dataset('num_tracksIdx', (754348,1),dtype = 'int32')
    f.create_dataset('nameVec', (754348,40),dtype = 'float32')
    f.create_dataset('tracksVec', (754348,1000),dtype = 'float32') # twenty five tracks
    f.create_dataset('tracksIdx', (754348,351),dtype = 'int32',fillvalue=-1)
with h5py.File('F:/SpotifyPlaylistData/Dataset_7.hdf5','r') as f:
    print(f['tracksIdx'][0,:].shape)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

indices_dir = 'F:/SpotifyPlaylistData/MPD_Extract_Indices'
num_seed_tracks = 25
with h5py.File('F:/SpotifyPlaylistData/Dataset_7.hdf5','a') as f_data:
    # Resolve the datasets once instead of on every single write.
    num_tracksIdx_ds = f_data['num_tracksIdx']
    nameVec_ds = f_data['nameVec']
    tracksVec_ds = f_data['tracksVec']
    tracksIdx_ds = f_data['tracksIdx']
    target_width = tracksIdx_ds.shape[1]

    count = 0            # rows written so far
    count_playlists = 0  # playlists inspected so far
    tracksVec_arr = np.zeros([num_seed_tracks,40],dtype=np.float32)
    for fname in os.listdir(indices_dir):
        with open(os.path.join(indices_dir, fname),encoding='utf8') as f:
            indexed_slice = json.load(f)
        print(fname)
        for playlist in indexed_slice['playlists']:
            count_playlists += 1
            if playlist['num_tracks'] <= num_seed_tracks:
                continue
            num_tracksIdx_ds[count,:] = playlist['num_tracksIdx']
            nameVec_ds[count,:] = nameIdx2Features[str(playlist['nameIdx'])]
            tracksIdx = playlist['tracksIdx']
            for i in range(num_seed_tracks):
                tracksVec_arr[i,:] = trackIdx2Features[str(tracksIdx[i])]
            tracksVec_ds[count,:] = tracksVec_arr.flatten()

            tracksIdx_l = np.asarray(tracksIdx[num_seed_tracks:])
            # np.resize repeats the remaining indices cyclically (or truncates
            # them) to the row width - the same padding as the former
            # element-wise loops, written in one go. An empty remainder keeps
            # the -1 fill value instead of dividing by zero.
            if tracksIdx_l.size > 0:
                tracksIdx_ds[count,:] = np.resize(tracksIdx_l, target_width)
            count += 1

print('count:',count)
print('count_playlists:',count_playlists)

In [None]:
# Generate dataset 8 as an HDF5 file with fields ['num_tracksIdx'], ['nameVec'], ['tracksVec'], ['tracksIdx'], corresponding to task category 8

import numpy as np
import h5py
import pickle
import os
import json

# Dataset 8: like dataset 7, but the 25 seed tracks are drawn at random from
# the playlist; the indices of the remaining (shuffled) tracks are padded to a
# fixed width.
with h5py.File('F:/SpotifyPlaylistData/Dataset_8.hdf5','w') as f:
    f.create_dataset('num_tracksIdx', (754348,1),dtype = 'int32')
    f.create_dataset('nameVec', (754348,40),dtype = 'float32')
    f.create_dataset('tracksVec', (754348,1000),dtype = 'float32') # random twenty five tracks
    f.create_dataset('tracksIdx', (754348,351),dtype = 'int32',fillvalue=-1)
with h5py.File('F:/SpotifyPlaylistData/Dataset_8.hdf5','r') as f:
    print(f['tracksIdx'][0,:].shape)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

indices_dir = 'F:/SpotifyPlaylistData/MPD_Extract_Indices'
num_seed_tracks = 25
# Dedicated, seeded generator so the random split is reproducible between runs
# and does not depend on (or disturb) numpy's global random state.
random_seed = 0
rng = np.random.RandomState(random_seed)
with h5py.File('F:/SpotifyPlaylistData/Dataset_8.hdf5','a') as f_data:
    # Resolve the datasets once instead of on every single write.
    num_tracksIdx_ds = f_data['num_tracksIdx']
    nameVec_ds = f_data['nameVec']
    tracksVec_ds = f_data['tracksVec']
    tracksIdx_ds = f_data['tracksIdx']
    target_width = tracksIdx_ds.shape[1]

    count = 0            # rows written so far
    count_0 = 0          # playlists skipped because nothing was left after the seeds
    count_playlists = 0  # playlists inspected so far
    tracksVec_arr = np.zeros([num_seed_tracks,40],dtype=np.float32)
    for fname in os.listdir(indices_dir):
        with open(os.path.join(indices_dir, fname),encoding='utf8') as f:
            indexed_slice = json.load(f)
        print(fname)
        for playlist in indexed_slice['playlists']:
            count_playlists += 1
            if playlist['num_tracks'] <= num_seed_tracks:
                continue
            tracksIdx = playlist['tracksIdx']

            # Shuffle once: the first 25 become the seeds, the rest the targets.
            tracksIdx_permutation = rng.permutation(tracksIdx)
            rand_tracksIdx = tracksIdx_permutation[:num_seed_tracks]
            tracksIdx_l = tracksIdx_permutation[num_seed_tracks:]
            if len(tracksIdx_l) == 0:
                # Check before anything is written so no half-filled row is
                # left behind, and skip only this playlist: the former `break`
                # dropped every remaining playlist of the slice.
                print('tracksIdx:',tracksIdx)
                print('rand_tracksIdx:',rand_tracksIdx)
                print('tracksIdx_l:',tracksIdx_l)
                count_0 += 1
                continue

            num_tracksIdx_ds[count,:] = playlist['num_tracksIdx']
            nameVec_ds[count,:] = nameIdx2Features[str(playlist['nameIdx'])]
            for i in range(num_seed_tracks):
                tracksVec_arr[i,:] = trackIdx2Features[str(rand_tracksIdx[i])]
            tracksVec_ds[count,:] = tracksVec_arr.flatten()

            # np.resize repeats the remaining indices cyclically (or truncates
            # them) to the row width - the same padding as the former
            # element-wise loops, written in one go.
            tracksIdx_ds[count,:] = np.resize(tracksIdx_l, target_width)
            count += 1

print('count_0:',count_0)
print('count:',count)
print('count_playlists:',count_playlists)

In [None]:
# Generate dataset 9 as an HDF5 file with fields ['num_tracksIdx'], ['nameVec'], ['tracksVec'], ['tracksIdx'], corresponding to task category 9

import numpy as np
import h5py
import pickle
import os
import json

# Allocate the output file: one row per qualifying playlist. The row count
# (216482) and the column widths are fixed here; cells of 'tracksIdx' that are
# never written read back as -1 (the dataset's fill value).
with h5py.File('F:/SpotifyPlaylistData/Dataset_9.hdf5','w') as f:
    f.create_dataset('num_tracksIdx', (216482,1),dtype = 'int32')
    f.create_dataset('nameVec', (216482,40),dtype = 'float32')
    f.create_dataset('tracksVec', (216482,4000),dtype = 'float32') # one hundred tracks
    f.create_dataset('tracksIdx', (216482,276),dtype = 'int32',fillvalue=-1)
with h5py.File('F:/SpotifyPlaylistData/Dataset_9.hdf5','r') as f:
    print(f['tracksIdx'][0,:].shape)

# NOTE(review): pickle.load can execute arbitrary code; only load lookup
# tables that were produced by this project.
with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

with h5py.File('F:/SpotifyPlaylistData/Dataset_9.hdf5','a') as f_data:
    count = 0            # rows written so far, i.e. the next free row
    count_playlists = 0  # every playlist seen, kept or not
    count_skipped = 0    # num_tracks > 100 but too few entries in 'tracksIdx'
    tracksVec_arr = np.zeros([100,40],dtype=np.float32)
    for fname in os.listdir('F:/SpotifyPlaylistData/MPD_Extract_Indices'):
        # Context manager so the handle is released even if parsing fails.
        with open(os.path.join('F:/SpotifyPlaylistData/MPD_Extract_Indices', fname),encoding='utf8') as f:
            slice = json.load(f)
        print(fname)
        for entry in slice['playlists']:
            count_playlists += 1
            if entry['num_tracks'] <= 100:
                continue
            tracksIdx = entry['tracksIdx']
            # The first 100 indices become the input and the rest the target;
            # without at least one target index the fill below would divide
            # by zero (or the feature loop would run off the list).
            if len(tracksIdx) <= 100:
                count_skipped += 1
                continue

            f_data['num_tracksIdx'][count,:] = entry['num_tracksIdx']
            f_data['nameVec'][count,:] = nameIdx2Features[str(entry['nameIdx'])]

            # Feature vectors of the first 100 tracks, flattened to one row.
            for i in range(100):
                tracksVec_arr[i,:] = trackIdx2Features[str(tracksIdx[i])]
            f_data['tracksVec'][count,:] = tracksVec_arr.flatten()

            # Remaining indices, repeated cyclically (or truncated) to exactly
            # 276 entries and written with a single HDF5 row write.
            tracksIdx_l = np.asarray(tracksIdx[100:])
            f_data['tracksIdx'][count,:] = np.resize(tracksIdx_l, 276)
            count += 1

print('count:',count)
print('count_playlists:',count_playlists)
print('count_skipped:',count_skipped)

In [None]:
# Generate dataset 10 as an HDF5 file with fields ['num_tracksIdx'], ['nameVec'], ['tracksVec'], ['tracksIdx'], corresponding to task category 10

import numpy as np
import h5py
import pickle
import os
import json

# Allocate the output file: one row per qualifying playlist. The row count
# (216482) and the column widths are fixed here; cells of 'tracksIdx' that are
# never written read back as -1 (the dataset's fill value).
with h5py.File('F:/SpotifyPlaylistData/Dataset_10.hdf5','w') as f:
    f.create_dataset('num_tracksIdx', (216482,1),dtype = 'int32')
    f.create_dataset('nameVec', (216482,40),dtype = 'float32')
    f.create_dataset('tracksVec', (216482,4000),dtype = 'float32') # random one hundred tracks
    f.create_dataset('tracksIdx', (216482,276),dtype = 'int32',fillvalue=-1)
with h5py.File('F:/SpotifyPlaylistData/Dataset_10.hdf5','r') as f:
    print(f['tracksIdx'][0,:].shape)

# NOTE(review): pickle.load can execute arbitrary code; only load lookup
# tables that were produced by this project.
with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

# Private, seeded generator: the random input/target split is reproducible
# across runs and the global numpy RNG state is left untouched.
rng = np.random.RandomState(0)

with h5py.File('F:/SpotifyPlaylistData/Dataset_10.hdf5','a') as f_data:
    count = 0            # rows written so far, i.e. the next free row
    count_playlists = 0  # every playlist seen, kept or not
    count_skipped = 0    # num_tracks > 100 but too few entries in 'tracksIdx'
    tracksVec_arr = np.zeros([100,40],dtype=np.float32)
    for fname in os.listdir('F:/SpotifyPlaylistData/MPD_Extract_Indices'):
        # Context manager so the handle is released even if parsing fails.
        with open(os.path.join('F:/SpotifyPlaylistData/MPD_Extract_Indices', fname),encoding='utf8') as f:
            slice = json.load(f)
        print(fname)
        for entry in slice['playlists']:
            count_playlists += 1
            if entry['num_tracks'] <= 100:
                continue
            tracksIdx = entry['tracksIdx']
            # 100 shuffled indices become the input and the rest the target;
            # without at least one target index the fill below would divide
            # by zero (or the feature loop would run off the array).
            if len(tracksIdx) <= 100:
                count_skipped += 1
                continue

            f_data['num_tracksIdx'][count,:] = entry['num_tracksIdx']
            f_data['nameVec'][count,:] = nameIdx2Features[str(entry['nameIdx'])]

            tracksIdx_permutation = rng.permutation(tracksIdx)
            rand_tracksIdx = tracksIdx_permutation[:100]
            # Feature vectors of the 100 sampled tracks, flattened to one row.
            for i in range(100):
                tracksVec_arr[i,:] = trackIdx2Features[str(rand_tracksIdx[i])]
            f_data['tracksVec'][count,:] = tracksVec_arr.flatten()

            # Held-out indices, repeated cyclically (or truncated) to exactly
            # 276 entries and written with a single HDF5 row write.
            tracksIdx_l = tracksIdx_permutation[100:]
            f_data['tracksIdx'][count,:] = np.resize(tracksIdx_l, 276)
            count += 1

print('count:',count)
print('count_playlists:',count_playlists)
print('count_skipped:',count_skipped)

In [None]:
# train neural model for task category 1

from sklearn.preprocessing import MultiLabelBinarizer
import json
import numpy as np
import h5py
from keras.layers import Input, Embedding, Dense,concatenate,Reshape
from keras.models import Model
from keras import optimizers
import keras.backend as K

# Training schedule for task 1; steps_per_epoch counts only full batches.
batch_size = 50
epochs = 3
num_train = 1000000
steps_per_epoch = num_train // batch_size

# Fit the binarizer on one single-element label tuple per integer in
# [0, 2262292), matching the width of the model's output layer.
vocab = list(zip(range(2262292)))
mlb = MultiLabelBinarizer().fit(vocab)
def inputGen(mlb):
    """Yield training batches for task 1 read from Dataset_1.hdf5.

    Each item is ``([num_tracksIdx, name_vec], tracksIdx_encoding)``, where the
    target is ``mlb.transform`` applied to the matching 'tracksIdx' rows.
    The batch size, batches per pass and number of passes are taken from the
    module-level ``batch_size``, ``steps_per_epoch`` and ``epochs``; the
    generator is exhausted after ``epochs`` passes.
    """
    with h5py.File('C:/Users/zwang10/Research/Dataset_1.hdf5','r') as f:
        for _ in range(epochs):
            for step in range(steps_per_epoch):
                lo = step*batch_size
                hi = lo + batch_size
                inputs = [f['num_tracksIdx'][lo:hi,:], f['nameVec'][lo:hi,:]]
                yield inputs, mlb.transform(f['tracksIdx'][lo:hi,:])

    
# Branch 1: integer 'num_tracksIdx' -> 10-dimensional embedding.
num_tracksIdx_input = Input(name='num_tracksIdx_input', shape=(1,), dtype='int32')
num_tracksIdx = Embedding(
    input_dim=247,
    output_dim=10,
    input_length=1,
)(num_tracksIdx_input)
num_tracksIdx_em = Reshape([10,])(num_tracksIdx)

# Branch 2: 40-dimensional playlist-name vector -> 50 tanh units.
name_vec_input = Input(name='name_vec_input', shape=(40,))
name_output = Dense(50, name='name_output', activation='tanh')(name_vec_input)

# Merge the branches and score every entry of the 2262292-wide output.
x = concatenate([num_tracksIdx_em, name_output])
x_dense = Dense(64, activation='relu')(x)
output = Dense(2262292, name='output', activation='sigmoid')(x_dense)

model = Model(
    inputs=[num_tracksIdx_input, name_vec_input],
    outputs=[output],
)
model.summary()
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

# The generator body does not run until the first batch is requested.
inputgenerator = inputGen(mlb)
history = model.fit_generator(
    inputgenerator,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
)

model.save('task1.h5')

In [None]:
# train neural model for task category 2

from sklearn.preprocessing import MultiLabelBinarizer
import json
import numpy as np
import h5py
from keras.layers import Input, Embedding, Dense,concatenate,Reshape
from keras.models import Model
from keras import optimizers
import keras.backend as K

# Training schedule for task 2; steps_per_epoch counts only full batches.
batch_size = 50
epochs = 3
num_train = 1000000
steps_per_epoch = num_train // batch_size

# Fit the binarizer on one single-element label tuple per integer in
# [0, 2262292), matching the width of the model's output layer.
vocab = list(zip(range(2262292)))
mlb = MultiLabelBinarizer().fit(vocab)
def inputGen(mlb):
    """Yield training batches for task 2 read from Dataset_2.hdf5.

    Each item is ``([num_tracksIdx, name_vec, tracks_vec], tracksIdx_encoding)``,
    where the target is ``mlb.transform`` applied to the matching 'tracksIdx'
    rows. The batch size, batches per pass and number of passes are taken from
    the module-level ``batch_size``, ``steps_per_epoch`` and ``epochs``; the
    generator is exhausted after ``epochs`` passes.
    """
    with h5py.File('C:/Users/zwang10/Research/Dataset_2.hdf5','r') as f:
        for _ in range(epochs):
            for step in range(steps_per_epoch):
                lo = step*batch_size
                hi = lo + batch_size
                inputs = [
                    f['num_tracksIdx'][lo:hi,:],
                    f['nameVec'][lo:hi,:],
                    f['tracksVec'][lo:hi,:],
                ]
                yield inputs, mlb.transform(f['tracksIdx'][lo:hi,:])

    
# Branch 1: integer 'num_tracksIdx' -> 10-dimensional embedding.
num_tracksIdx_input = Input(name='num_tracksIdx_input', shape=(1,), dtype='int32')
num_tracksIdx = Embedding(
    input_dim=247,
    output_dim=10,
    input_length=1,
)(num_tracksIdx_input)
num_tracksIdx_em = Reshape([10,])(num_tracksIdx)

# Branch 2: 40-dimensional playlist-name vector -> 50 tanh units.
name_vec_input = Input(name='name_vec_input', shape=(40,))
name_output = Dense(50, name='name_output', activation='tanh')(name_vec_input)

# Branch 3: 40-dimensional track vector -> 50 tanh units.
tracks_vec_input = Input(name='tracks_vec_input', shape=(40,))
tracks_output = Dense(50, name='tracks_output', activation='tanh')(tracks_vec_input)

# Merge the branches and score every entry of the 2262292-wide output.
x = concatenate([num_tracksIdx_em, name_output, tracks_output])
x_dense = Dense(64, activation='relu')(x)
output = Dense(2262292, name='output', activation='sigmoid')(x_dense)

model = Model(
    inputs=[num_tracksIdx_input, name_vec_input, tracks_vec_input],
    outputs=[output],
)
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')

# The generator body does not run until the first batch is requested.
inputgenerator = inputGen(mlb)
history = model.fit_generator(
    inputgenerator,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
)

model.save('task2.h5')

In [None]:
# train neural model for task category 3

from sklearn.preprocessing import MultiLabelBinarizer
import json
import numpy as np
import h5py
from keras.layers import Input,Embedding,Dense,concatenate,Reshape,GRU
from keras.models import Model
from keras import optimizers
import keras.backend as K

# Training schedule for task 3; steps_per_epoch counts only full batches.
batch_size = 50
epochs = 3
num_train = 994587
steps_per_epoch = num_train // batch_size

# Fit the binarizer on one single-element label tuple per integer in
# [0, 2262292), matching the width of the model's output layer.
vocab = list(zip(range(2262292)))
mlb = MultiLabelBinarizer().fit(vocab)
def inputGen(mlb):
    """Yield training batches for task 3 read from Dataset_3.hdf5.

    Each item is ``([num_tracksIdx, name_vec, tracks_vec], tracksIdx_encoding)``,
    where the target is ``mlb.transform`` applied to the matching 'tracksIdx'
    rows. Every pass yields ``steps_per_epoch`` full batches followed by one
    remainder batch covering rows ``steps_per_epoch*batch_size`` up to
    ``num_train``; the generator is exhausted after ``epochs`` passes.

    The schedule comes from the module-level ``batch_size``,
    ``steps_per_epoch``, ``epochs`` and ``num_train``. The remainder bounds are
    derived from them instead of being hard-coded, so they cannot drift when
    those values change.
    """
    def _batch(f, lo, hi):
        # Read rows [lo, hi) of every field, in the original read order.
        num_tracksIdx = f['num_tracksIdx'][lo:hi,:]
        name_vec = f['nameVec'][lo:hi,:]
        tracks_vec = f['tracksVec'][lo:hi,:]
        tracksIdx_encoding = mlb.transform(f['tracksIdx'][lo:hi,:])
        return [num_tracksIdx, name_vec, tracks_vec], tracksIdx_encoding

    tail_start = steps_per_epoch*batch_size
    with h5py.File('C:/Users/zwang10/Research/Dataset_3.hdf5','r') as f:
        for epoch in range(epochs):
            for i in range(steps_per_epoch):
                yield _batch(f, i*batch_size, (i+1)*batch_size)
            # NOTE: this batch is empty if num_train is a multiple of
            # batch_size; the training call requests steps_per_epoch+1 batches.
            yield _batch(f, tail_start, num_train)

    
# Branch 1: integer 'num_tracksIdx' -> 10-dimensional embedding.
num_tracksIdx_input = Input(name='num_tracksIdx_input', shape=(1,), dtype='int32')
num_tracksIdx = Embedding(
    input_dim=247,
    output_dim=10,
    input_length=1,
)(num_tracksIdx_input)
num_tracksIdx_em = Reshape([10,])(num_tracksIdx)

# Branch 2: 40-dimensional playlist-name vector -> 50 tanh units.
name_vec_input = Input(name='name_vec_input', shape=(40,))
name_output = Dense(50, name='name_output', activation='tanh')(name_vec_input)

# Branch 3: flat 200-vector viewed as 5 steps of 40 features, fed to a GRU.
tracks_vec_input = Input(name='tracks_vec_input', shape=(200,))
tracks_vec = Reshape([5,40])(tracks_vec_input)
tracks_output = GRU(60, input_shape=(5,40))(tracks_vec)

# Merge the branches and score every entry of the 2262292-wide output.
x = concatenate([num_tracksIdx_em, name_output, tracks_output])
x_dense = Dense(64, activation='relu')(x)
output = Dense(2262292, name='output', activation='sigmoid')(x_dense)

model = Model(
    inputs=[num_tracksIdx_input, name_vec_input, tracks_vec_input],
    outputs=[output],
)
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')

# One extra step per epoch consumes the generator's remainder batch.
inputgenerator = inputGen(mlb)
history = model.fit_generator(
    inputgenerator,
    steps_per_epoch=steps_per_epoch+1,
    epochs=epochs,
)

model.save('task3.h5')

In [None]:
# train neural model for task category 4

from sklearn.preprocessing import MultiLabelBinarizer
import json
import numpy as np
import h5py
from keras.layers import Input,Embedding,Dense,concatenate,Reshape,GRU
from keras.models import Model
from keras import optimizers
import keras.backend as K

# Training schedule for task 4; steps_per_epoch counts only full batches.
batch_size = 50
epochs = 3
num_train = 994587
steps_per_epoch = num_train // batch_size

# Fit the binarizer on one single-element label tuple per integer in
# [0, 2262292), matching the width of the model's output layer.
vocab = list(zip(range(2262292)))
mlb = MultiLabelBinarizer().fit(vocab)
def inputGen(mlb):
    """Yield training batches for task 4 read from Dataset_4.hdf5.

    Each item is ``([num_tracksIdx, tracks_vec], tracksIdx_encoding)``, where
    the target is ``mlb.transform`` applied to the matching 'tracksIdx' rows.
    Every pass yields ``steps_per_epoch`` full batches followed by one
    remainder batch covering rows ``steps_per_epoch*batch_size`` up to
    ``num_train``; the generator is exhausted after ``epochs`` passes.

    The schedule comes from the module-level ``batch_size``,
    ``steps_per_epoch``, ``epochs`` and ``num_train``. The remainder bounds are
    derived from them instead of being hard-coded, so they cannot drift when
    those values change.
    """
    def _batch(f, lo, hi):
        # Read rows [lo, hi) of every field, in the original read order.
        num_tracksIdx = f['num_tracksIdx'][lo:hi,:]
        tracks_vec = f['tracksVec'][lo:hi,:]
        tracksIdx_encoding = mlb.transform(f['tracksIdx'][lo:hi,:])
        return [num_tracksIdx, tracks_vec], tracksIdx_encoding

    tail_start = steps_per_epoch*batch_size
    with h5py.File('C:/Users/zwang10/Research/Dataset_4.hdf5','r') as f:
        for epoch in range(epochs):
            for i in range(steps_per_epoch):
                yield _batch(f, i*batch_size, (i+1)*batch_size)
            # NOTE: this batch is empty if num_train is a multiple of
            # batch_size; the training call requests steps_per_epoch+1 batches.
            yield _batch(f, tail_start, num_train)
    
# Branch 1: integer 'num_tracksIdx' -> 10-dimensional embedding.
num_tracksIdx_input = Input(name='num_tracksIdx_input', shape=(1,), dtype='int32')
num_tracksIdx = Embedding(
    input_dim=247,
    output_dim=10,
    input_length=1,
)(num_tracksIdx_input)
num_tracksIdx_em = Reshape([10,])(num_tracksIdx)

# Branch 2: flat 200-vector viewed as 5 steps of 40 features, fed to a GRU.
tracks_vec_input = Input(name='tracks_vec_input', shape=(200,))
tracks_vec = Reshape([5,40])(tracks_vec_input)
tracks_output = GRU(60, input_shape=(5,40))(tracks_vec)

# Merge the branches and score every entry of the 2262292-wide output.
x = concatenate([num_tracksIdx_em, tracks_output])
x_dense = Dense(64, activation='relu')(x)
output = Dense(2262292, name='output', activation='sigmoid')(x_dense)

model = Model(
    inputs=[num_tracksIdx_input, tracks_vec_input],
    outputs=[output],
)
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')

# One extra step per epoch consumes the generator's remainder batch.
inputgenerator = inputGen(mlb)
model.fit_generator(
    inputgenerator,
    steps_per_epoch=steps_per_epoch+1,
    epochs=epochs,
)

model.save('task4.h5')

In [None]:
# train neural model for task category 5

from sklearn.preprocessing import MultiLabelBinarizer
import json
import numpy as np
import h5py
from keras.layers import Input,Embedding,Dense,concatenate,Reshape,GRU
from keras.models import Model
from keras import optimizers
import keras.backend as K

# Training schedule for task 5; steps_per_epoch counts only full batches.
batch_size = 50
epochs = 3
num_train = 953325
steps_per_epoch = num_train // batch_size

# Fit the binarizer on one single-element label tuple per integer in
# [0, 2262292), matching the width of the model's output layer.
vocab = list(zip(range(2262292)))
mlb = MultiLabelBinarizer().fit(vocab)
def inputGen(mlb):
    """Yield training batches for task 5 read from Dataset_5.hdf5.

    Each item is ``([num_tracksIdx, name_vec, tracks_vec], tracksIdx_encoding)``,
    where the target is ``mlb.transform`` applied to the matching 'tracksIdx'
    rows. Every pass yields ``steps_per_epoch`` full batches followed by one
    remainder batch covering rows ``steps_per_epoch*batch_size`` up to
    ``num_train``; the generator is exhausted after ``epochs`` passes.

    The schedule comes from the module-level ``batch_size``,
    ``steps_per_epoch``, ``epochs`` and ``num_train``. The remainder bounds are
    derived from them instead of being hard-coded, so they cannot drift when
    those values change.
    """
    def _batch(f, lo, hi):
        # Read rows [lo, hi) of every field, in the original read order.
        num_tracksIdx = f['num_tracksIdx'][lo:hi,:]
        name_vec = f['nameVec'][lo:hi,:]
        tracks_vec = f['tracksVec'][lo:hi,:]
        tracksIdx_encoding = mlb.transform(f['tracksIdx'][lo:hi,:])
        return [num_tracksIdx, name_vec, tracks_vec], tracksIdx_encoding

    tail_start = steps_per_epoch*batch_size
    with h5py.File('C:/Users/zwang10/Research/Dataset_5.hdf5','r') as f:
        for epoch in range(epochs):
            for i in range(steps_per_epoch):
                yield _batch(f, i*batch_size, (i+1)*batch_size)
            # NOTE: this batch is empty if num_train is a multiple of
            # batch_size; the training call requests steps_per_epoch+1 batches.
            yield _batch(f, tail_start, num_train)

    
# Branch 1: integer 'num_tracksIdx' -> 10-dimensional embedding.
num_tracksIdx_input = Input(name='num_tracksIdx_input', shape=(1,), dtype='int32')
num_tracksIdx = Embedding(
    input_dim=247,
    output_dim=10,
    input_length=1,
)(num_tracksIdx_input)
num_tracksIdx_em = Reshape([10,])(num_tracksIdx)

# Branch 2: 40-dimensional playlist-name vector -> 50 tanh units.
name_vec_input = Input(name='name_vec_input', shape=(40,))
name_output = Dense(50, name='name_output', activation='tanh')(name_vec_input)

# Branch 3: flat 400-vector viewed as 10 steps of 40 features, fed to a GRU.
tracks_vec_input = Input(name='tracks_vec_input', shape=(400,))
tracks_vec = Reshape([10,40])(tracks_vec_input)
tracks_output = GRU(80, input_shape=(10,40))(tracks_vec)

# Merge the branches and score every entry of the 2262292-wide output.
x = concatenate([num_tracksIdx_em, name_output, tracks_output])
x_dense = Dense(64, activation='relu')(x)
output = Dense(2262292, name='output', activation='sigmoid')(x_dense)

model = Model(
    inputs=[num_tracksIdx_input, name_vec_input, tracks_vec_input],
    outputs=[output],
)
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')

# One extra step per epoch consumes the generator's remainder batch.
inputgenerator = inputGen(mlb)
history = model.fit_generator(
    inputgenerator,
    steps_per_epoch=steps_per_epoch+1,
    epochs=epochs,
)

model.save('task5.h5')

In [None]:
# train neural model for task category 6

from sklearn.preprocessing import MultiLabelBinarizer
import json
import numpy as np
import h5py
from keras.layers import Input,Embedding,Dense,concatenate,Reshape,GRU
from keras.models import Model
from keras import optimizers
import keras.backend as K

# Training schedule for task 6; steps_per_epoch counts only full batches.
batch_size = 50
epochs = 3
num_train = 953325
steps_per_epoch = num_train // batch_size

# Fit the binarizer on one single-element label tuple per integer in
# [0, 2262292), matching the width of the model's output layer.
vocab = list(zip(range(2262292)))
mlb = MultiLabelBinarizer().fit(vocab)
def inputGen(mlb):
    """Yield training batches for task 6 read from Dataset_6.hdf5.

    Each item is ``([num_tracksIdx, tracks_vec], tracksIdx_encoding)``, where
    the target is ``mlb.transform`` applied to the matching 'tracksIdx' rows.
    Every pass yields ``steps_per_epoch`` full batches followed by one
    remainder batch covering rows ``steps_per_epoch*batch_size`` up to
    ``num_train``; the generator is exhausted after ``epochs`` passes.

    The schedule comes from the module-level ``batch_size``,
    ``steps_per_epoch``, ``epochs`` and ``num_train``. The remainder bounds are
    derived from them instead of being hard-coded, so they cannot drift when
    those values change.
    """
    def _batch(f, lo, hi):
        # Read rows [lo, hi) of every field, in the original read order.
        num_tracksIdx = f['num_tracksIdx'][lo:hi,:]
        tracks_vec = f['tracksVec'][lo:hi,:]
        tracksIdx_encoding = mlb.transform(f['tracksIdx'][lo:hi,:])
        return [num_tracksIdx, tracks_vec], tracksIdx_encoding

    tail_start = steps_per_epoch*batch_size
    with h5py.File('C:/Users/zwang10/Research/Dataset_6.hdf5','r') as f:
        for epoch in range(epochs):
            for i in range(steps_per_epoch):
                yield _batch(f, i*batch_size, (i+1)*batch_size)
            # NOTE: this batch is empty if num_train is a multiple of
            # batch_size; the training call requests steps_per_epoch+1 batches.
            yield _batch(f, tail_start, num_train)

    
# Branch 1: integer 'num_tracksIdx' -> 10-dimensional embedding.
num_tracksIdx_input = Input(name='num_tracksIdx_input', shape=(1,), dtype='int32')
num_tracksIdx = Embedding(
    input_dim=247,
    output_dim=10,
    input_length=1,
)(num_tracksIdx_input)
num_tracksIdx_em = Reshape([10,])(num_tracksIdx)

# Branch 2: flat 400-vector viewed as 10 steps of 40 features, fed to a GRU.
tracks_vec_input = Input(name='tracks_vec_input', shape=(400,))
tracks_vec = Reshape([10,40])(tracks_vec_input)
tracks_output = GRU(80, input_shape=(10,40))(tracks_vec)

# Merge the branches and score every entry of the 2262292-wide output.
x = concatenate([num_tracksIdx_em, tracks_output])
x_dense = Dense(64, activation='relu')(x)
output = Dense(2262292, name='output', activation='sigmoid')(x_dense)

model = Model(
    inputs=[num_tracksIdx_input, tracks_vec_input],
    outputs=[output],
)
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')

# One extra step per epoch consumes the generator's remainder batch.
inputgenerator = inputGen(mlb)
history = model.fit_generator(
    inputgenerator,
    steps_per_epoch=steps_per_epoch+1,
    epochs=epochs,
)

model.save('task6.h5')

In [None]:
# train neural model for task category 7

from sklearn.preprocessing import MultiLabelBinarizer
import json
import numpy as np
import h5py
from keras.layers import Input,Embedding,Dense,concatenate,Reshape,GRU
from keras.models import Model
from keras import optimizers
import keras.backend as K

# Training schedule for task 7; steps_per_epoch counts only full batches.
batch_size = 50
epochs = 3
num_train = 754348
steps_per_epoch = num_train // batch_size

# Fit the binarizer on one single-element label tuple per integer in
# [0, 2262292), matching the width of the model's output layer.
vocab = list(zip(range(2262292)))
mlb = MultiLabelBinarizer().fit(vocab)
def inputGen(mlb):
    """Yield training batches for task 7 read from Dataset_7.hdf5.

    Each item is ``([num_tracksIdx, name_vec, tracks_vec], tracksIdx_encoding)``,
    where the target is ``mlb.transform`` applied to the matching 'tracksIdx'
    rows. Every pass yields ``steps_per_epoch`` full batches followed by one
    remainder batch covering rows ``steps_per_epoch*batch_size`` up to
    ``num_train``; the generator is exhausted after ``epochs`` passes.

    The schedule comes from the module-level ``batch_size``,
    ``steps_per_epoch``, ``epochs`` and ``num_train``. The remainder bounds are
    derived from them instead of being hard-coded, so they cannot drift when
    those values change.
    """
    def _batch(f, lo, hi):
        # Read rows [lo, hi) of every field, in the original read order.
        num_tracksIdx = f['num_tracksIdx'][lo:hi,:]
        name_vec = f['nameVec'][lo:hi,:]
        tracks_vec = f['tracksVec'][lo:hi,:]
        tracksIdx_encoding = mlb.transform(f['tracksIdx'][lo:hi,:])
        return [num_tracksIdx, name_vec, tracks_vec], tracksIdx_encoding

    tail_start = steps_per_epoch*batch_size
    with h5py.File('C:/Users/zwang10/Research/Dataset_7.hdf5','r') as f:
        for epoch in range(epochs):
            for i in range(steps_per_epoch):
                yield _batch(f, i*batch_size, (i+1)*batch_size)
            # NOTE: this batch is empty if num_train is a multiple of
            # batch_size; the training call requests steps_per_epoch+1 batches.
            yield _batch(f, tail_start, num_train)

    
# Branch 1: integer 'num_tracksIdx' -> 10-dimensional embedding.
num_tracksIdx_input = Input(name='num_tracksIdx_input', shape=(1,), dtype='int32')
num_tracksIdx = Embedding(
    input_dim=247,
    output_dim=10,
    input_length=1,
)(num_tracksIdx_input)
num_tracksIdx_em = Reshape([10,])(num_tracksIdx)

# Branch 2: 40-dimensional playlist-name vector -> 50 tanh units.
name_vec_input = Input(name='name_vec_input', shape=(40,))
name_output = Dense(50, name='name_output', activation='tanh')(name_vec_input)

# Branch 3: flat 1000-vector viewed as 25 steps of 40 features, fed to a GRU.
tracks_vec_input = Input(name='tracks_vec_input', shape=(1000,))
tracks_vec = Reshape([25,40])(tracks_vec_input)
tracks_output = GRU(100, input_shape=(25,40))(tracks_vec)

# Merge the branches and score every entry of the 2262292-wide output.
x = concatenate([num_tracksIdx_em, name_output, tracks_output])
x_dense = Dense(64, activation='relu')(x)
output = Dense(2262292, name='output', activation='sigmoid')(x_dense)

model = Model(
    inputs=[num_tracksIdx_input, name_vec_input, tracks_vec_input],
    outputs=[output],
)
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')

# One extra step per epoch consumes the generator's remainder batch.
inputgenerator = inputGen(mlb)
history = model.fit_generator(
    inputgenerator,
    steps_per_epoch=steps_per_epoch+1,
    epochs=epochs,
)

model.save('task7.h5')

In [None]:
# train neural model for task category 8

from sklearn.preprocessing import MultiLabelBinarizer
import json
import numpy as np
import h5py
from keras.layers import Input,Embedding,Dense,concatenate,Reshape,GRU,Conv2D,MaxPool2D
from keras.models import Model
from keras import optimizers
import keras.backend as K

# Training schedule for task 8; steps_per_epoch counts only full batches.
batch_size = 50
epochs = 3
num_train = 754348
steps_per_epoch = num_train // batch_size

# Fit the binarizer on one single-element label tuple per integer in
# [0, 2262292), matching the width of the model's output layer.
vocab = list(zip(range(2262292)))
mlb = MultiLabelBinarizer().fit(vocab)
def inputGen(mlb):
    """Yield training batches for task 8 read from Dataset_8.hdf5.

    Each item is ``([num_tracksIdx, name_vec, tracks_vec], tracksIdx_encoding)``,
    where the target is ``mlb.transform`` applied to the matching 'tracksIdx'
    rows. Every pass yields ``steps_per_epoch`` full batches followed by one
    remainder batch covering rows ``steps_per_epoch*batch_size`` up to
    ``num_train``; the generator is exhausted after ``epochs`` passes.

    The schedule comes from the module-level ``batch_size``,
    ``steps_per_epoch``, ``epochs`` and ``num_train``. The remainder bounds are
    derived from them instead of being hard-coded, so they cannot drift when
    those values change.
    """
    def _batch(f, lo, hi):
        # Read rows [lo, hi) of every field, in the original read order.
        num_tracksIdx = f['num_tracksIdx'][lo:hi,:]
        name_vec = f['nameVec'][lo:hi,:]
        tracks_vec = f['tracksVec'][lo:hi,:]
        tracksIdx_encoding = mlb.transform(f['tracksIdx'][lo:hi,:])
        return [num_tracksIdx, name_vec, tracks_vec], tracksIdx_encoding

    tail_start = steps_per_epoch*batch_size
    with h5py.File('C:/Users/zwang10/Research/Dataset_8.hdf5','r') as f:
        for epoch in range(epochs):
            for i in range(steps_per_epoch):
                yield _batch(f, i*batch_size, (i+1)*batch_size)
            # NOTE: this batch is empty if num_train is a multiple of
            # batch_size; the training call requests steps_per_epoch+1 batches.
            yield _batch(f, tail_start, num_train)

    
# Branch 1: integer 'num_tracksIdx' -> 10-dimensional embedding.
num_tracksIdx_input = Input(name='num_tracksIdx_input', shape=(1,), dtype='int32')
num_tracksIdx = Embedding(
    input_dim=247,
    output_dim=10,
    input_length=1,
)(num_tracksIdx_input)
num_tracksIdx_em = Reshape([10,])(num_tracksIdx)

# Branch 2: 40-dimensional playlist-name vector -> 50 tanh units.
name_vec_input = Input(name='name_vec_input', shape=(40,))
name_output = Dense(50, name='name_output', activation='tanh')(name_vec_input)

# Branch 3: flat 1000-vector viewed as a 25x40 single-channel grid. A (1,40)
# convolution spans one full row, a (1,1) convolution widens to 128 channels,
# and max-pooling over all 25 rows leaves one 128-vector.
tracks_vec_input = Input(name='tracks_vec_input', shape=(1000,))
tracks_vec = Reshape([25,40,1])(tracks_vec_input)

tracks_vec_Conv1 = Conv2D(
    64, (1,40),
    activation='relu',
    padding='valid',
    input_shape=(25,40,1),
)(tracks_vec)
tracks_vec_Conv2 = Conv2D(
    128, (1,1),
    activation='relu',
    padding='valid',
    input_shape=(25,1,64),
)(tracks_vec_Conv1)
tracks_vec_MaxPool = MaxPool2D((25,1), padding='valid')(tracks_vec_Conv2)
tracks_output = Reshape([128,])(tracks_vec_MaxPool)

# Merge the branches and score every entry of the 2262292-wide output.
x = concatenate([num_tracksIdx_em, name_output, tracks_output])
x_dense = Dense(64, activation='relu')(x)
output = Dense(2262292, name='output', activation='sigmoid')(x_dense)

model = Model(
    inputs=[num_tracksIdx_input, name_vec_input, tracks_vec_input],
    outputs=[output],
)
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')

# One extra step per epoch consumes the generator's remainder batch.
inputgenerator = inputGen(mlb)
history = model.fit_generator(
    inputgenerator,
    steps_per_epoch=steps_per_epoch+1,
    epochs=epochs,
)

model.save('task8.h5')

In [None]:
# train neural model for task category 9

from sklearn.preprocessing import MultiLabelBinarizer
import json
import numpy as np
import h5py
from keras.layers import Input,Embedding,Dense,concatenate,Reshape,GRU
from keras.models import Model
from keras import optimizers
import keras.backend as K

# Training schedule for task 9; steps_per_epoch counts only full batches.
batch_size = 50
epochs = 3
num_train = 216482
steps_per_epoch = num_train // batch_size

# Fit the binarizer on one single-element label tuple per integer in
# [0, 2262292), matching the width of the model's output layer.
vocab = list(zip(range(2262292)))
mlb = MultiLabelBinarizer().fit(vocab)
def inputGen(mlb):
    """Yield training batches for task 9 read from Dataset_9.hdf5.

    Each item is ``([num_tracksIdx, name_vec, tracks_vec], tracksIdx_encoding)``,
    where the target is ``mlb.transform`` applied to the matching 'tracksIdx'
    rows. Every pass yields ``steps_per_epoch`` full batches followed by one
    remainder batch covering rows ``steps_per_epoch*batch_size`` up to
    ``num_train``; the generator is exhausted after ``epochs`` passes.

    The schedule comes from the module-level ``batch_size``,
    ``steps_per_epoch``, ``epochs`` and ``num_train``. The remainder bounds are
    derived from them instead of being hard-coded, so they cannot drift when
    those values change.
    """
    def _batch(f, lo, hi):
        # Read rows [lo, hi) of every field, in the original read order.
        num_tracksIdx = f['num_tracksIdx'][lo:hi,:]
        name_vec = f['nameVec'][lo:hi,:]
        tracks_vec = f['tracksVec'][lo:hi,:]
        tracksIdx_encoding = mlb.transform(f['tracksIdx'][lo:hi,:])
        return [num_tracksIdx, name_vec, tracks_vec], tracksIdx_encoding

    tail_start = steps_per_epoch*batch_size
    with h5py.File('C:/Users/zwang10/Research/Dataset_9.hdf5','r') as f:
        for epoch in range(epochs):
            for i in range(steps_per_epoch):
                yield _batch(f, i*batch_size, (i+1)*batch_size)
            # NOTE: this batch is empty if num_train is a multiple of
            # batch_size; the training call requests steps_per_epoch+1 batches.
            yield _batch(f, tail_start, num_train)

    
# Branch 1: integer 'num_tracksIdx' -> 10-dimensional embedding.
num_tracksIdx_input = Input(name='num_tracksIdx_input', shape=(1,), dtype='int32')
num_tracksIdx = Embedding(
    input_dim=247,
    output_dim=10,
    input_length=1,
)(num_tracksIdx_input)
num_tracksIdx_em = Reshape([10,])(num_tracksIdx)

# Branch 2: 40-dimensional playlist-name vector -> 50 tanh units.
name_vec_input = Input(name='name_vec_input', shape=(40,))
name_output = Dense(50, name='name_output', activation='tanh')(name_vec_input)

# Branch 3: flat 4000-vector viewed as 100 steps of 40 features, fed to a GRU.
tracks_vec_input = Input(name='tracks_vec_input', shape=(4000,))
tracks_vec = Reshape([100,40])(tracks_vec_input)
tracks_output = GRU(200, input_shape=(100,40))(tracks_vec)

# Merge the branches and score every entry of the 2262292-wide output.
x = concatenate([num_tracksIdx_em, name_output, tracks_output])
x_dense = Dense(64, activation='relu')(x)
output = Dense(2262292, name='output', activation='sigmoid')(x_dense)

model = Model(
    inputs=[num_tracksIdx_input, name_vec_input, tracks_vec_input],
    outputs=[output],
)
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy')

# One extra step per epoch consumes the generator's remainder batch.
inputgenerator = inputGen(mlb)
history = model.fit_generator(
    inputgenerator,
    steps_per_epoch=steps_per_epoch+1,
    epochs=epochs,
)

model.save('task9.h5')

In [None]:
# train neural model for task category 10

from sklearn.preprocessing import MultiLabelBinarizer
import json
import numpy as np
import h5py
from keras.layers import Input,Embedding,Dense,concatenate,Reshape,GRU,Conv2D,MaxPool2D
from keras.models import Model
from keras import optimizers
import keras.backend as K

# Training schedule for task 10; steps_per_epoch counts only full batches.
batch_size = 50
epochs = 10
num_train = 216482
steps_per_epoch = num_train // batch_size

# Fit the binarizer on one single-element label tuple per integer in
# [0, 2262292), matching the width of the model's output layer.
vocab = list(zip(range(2262292)))
mlb = MultiLabelBinarizer().fit(vocab)
def inputGen(mlb):
    """Yield training batches for task 10 read from Dataset_10.hdf5.

    Each item is ``([num_tracksIdx, name_vec, tracks_vec], tracksIdx_encoding)``,
    where the target is ``mlb.transform`` applied to the matching 'tracksIdx'
    rows. Every pass yields ``steps_per_epoch`` full batches followed by one
    remainder batch covering rows ``steps_per_epoch*batch_size`` up to
    ``num_train``; the generator is exhausted after ``epochs`` passes.

    The schedule comes from the module-level ``batch_size``,
    ``steps_per_epoch``, ``epochs`` and ``num_train``. The remainder bounds are
    derived from them instead of being hard-coded, so they cannot drift when
    those values change.
    """
    def _batch(f, lo, hi):
        # Read rows [lo, hi) of every field, in the original read order.
        num_tracksIdx = f['num_tracksIdx'][lo:hi,:]
        name_vec = f['nameVec'][lo:hi,:]
        tracks_vec = f['tracksVec'][lo:hi,:]
        tracksIdx_encoding = mlb.transform(f['tracksIdx'][lo:hi,:])
        return [num_tracksIdx, name_vec, tracks_vec], tracksIdx_encoding

    tail_start = steps_per_epoch*batch_size
    with h5py.File('C:/Users/zwang10/Research/Dataset_10.hdf5','r') as f:
        for epoch in range(epochs):
            for i in range(steps_per_epoch):
                yield _batch(f, i*batch_size, (i+1)*batch_size)
            # NOTE: this batch is empty if num_train is a multiple of
            # batch_size; the training call requests steps_per_epoch+1 batches.
            yield _batch(f, tail_start, num_train)

    
# The generator body does not run until the first batch is requested.
inputgenerator = inputGen(mlb)

# Branch 1: integer 'num_tracksIdx' -> 10-dimensional embedding.
num_tracksIdx_input = Input(shape=(1,),dtype='int32',name='num_tracksIdx_input')
num_tracksIdx = Embedding(input_dim=247,output_dim=10,input_length=1)(num_tracksIdx_input)
num_tracksIdx_em = Reshape([10,])(num_tracksIdx)

# Branch 2: 40-dimensional playlist-name vector -> 50 tanh units.
name_vec_input = Input(shape=(40,), name='name_vec_input')
name_output = Dense(50, activation='tanh', name='name_output')(name_vec_input)

# Branch 3: flat 4000-vector viewed as a 100x40 single-channel grid. A (1,40)
# convolution spans one full row, a (1,1) convolution widens to 200 channels,
# and max-pooling over all 100 rows leaves one 200-vector.
tracks_vec_input = Input(shape=(4000,), name='tracks_vec_input')
tracks_vec = Reshape([100,40,1])(tracks_vec_input)

tracks_vec_Conv1 = Conv2D(128,(1,40),activation = 'relu',padding = 'valid',input_shape = (100,40,1))(tracks_vec)
# The declared input shape now matches the 128 channels produced by the first
# convolution (it previously said 64, carried over from the task 8 model).
tracks_vec_Conv2 = Conv2D(200,(1,1),activation = 'relu',padding = 'valid',input_shape = (100,1,128))(tracks_vec_Conv1)
tracks_vec_MaxPool = MaxPool2D((100,1),padding = 'valid')(tracks_vec_Conv2)
tracks_output = Reshape([200,])(tracks_vec_MaxPool)

# Merge the branches and score every entry of the 2262292-wide output.
x = concatenate([num_tracksIdx_em, name_output,tracks_output])
x_dense = Dense(64, activation='relu')(x)
output = Dense(2262292, activation='sigmoid',name='output')(x_dense)
model = Model(inputs=[num_tracksIdx_input, name_vec_input,tracks_vec_input], outputs=[output])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy')
# One extra step per epoch consumes the generator's remainder batch.
history = model.fit_generator(inputgenerator,steps_per_epoch=steps_per_epoch+1, epochs=epochs)

model.save('task10.h5')

In [None]:
#  category 1: preprocess playlist name and extract track URI, num_tracks, name, num_samples from the challenge dataset
import json
import string
import copy

# Keep the challenge playlists whose 'num_samples' is 0 and write a reduced
# record for each (pid, num_tracks, num_samples, trimmed track URIs and, when
# present, the normalised name) to challenge_set_Extract_case1.json.
playlists_MPD = dict()
playlists = list()
count = 0  # kept playlists that carry a 'name'

# Read as UTF-8 explicitly so decoding does not depend on the platform's
# default encoding; the output below is written as UTF-8 as well.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set.json', encoding='utf8') as f:
    slice = json.load(f)

for source in slice['playlists']:
    if source['num_samples'] != 0:
        continue
    # A fresh dict per playlist: no shared containers to clear or deep-copy.
    playlist = {
        'pid': source['pid'],
        'num_tracks': source['num_tracks'],
        'num_samples': source['num_samples'],
        # Drop the first 14 characters of each URI (presumably the
        # 'spotify:track:' prefix - confirm against the data).
        'tracks': [track['track_uri'][14:] for track in source['tracks']],
    }
    if 'name' in source:
        # Lower-case and collapse every run of whitespace to one space.
        playlist['name'] = ' '.join(source['name'].lower().split())
        count += 1
    playlists.append(playlist)
playlists_MPD['playlists'] = playlists

with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case1.json', 'w', encoding='utf8') as outfile:        
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print(count)

In [None]:
#  category 2: preProcess playlist name and Extract trackurl, num_tracks, name, num_samples
import json
import string
import copy

# Keep the challenge playlists whose 'num_samples' is 1 and write a reduced
# record for each (pid, num_tracks, num_samples, trimmed track URIs and, when
# present, the normalised name) to challenge_set_Extract_case2.json.
playlists_MPD = dict()
playlists = list()
count = 0  # kept playlists that carry a 'name'

# Read as UTF-8 explicitly so decoding does not depend on the platform's
# default encoding; the output below is written as UTF-8 as well.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set.json', encoding='utf8') as f:
    slice = json.load(f)

for source in slice['playlists']:
    if source['num_samples'] != 1:
        continue
    # A fresh dict per playlist: no shared containers to clear or deep-copy.
    playlist = {
        'pid': source['pid'],
        'num_tracks': source['num_tracks'],
        'num_samples': source['num_samples'],
        # Drop the first 14 characters of each URI (presumably the
        # 'spotify:track:' prefix - confirm against the data).
        'tracks': [track['track_uri'][14:] for track in source['tracks']],
    }
    if 'name' in source:
        # Lower-case and collapse every run of whitespace to one space.
        playlist['name'] = ' '.join(source['name'].lower().split())
        count += 1
    playlists.append(playlist)
playlists_MPD['playlists'] = playlists

with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case2.json', 'w', encoding='utf8') as outfile:        
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print(count)

In [None]:
#  category 3: preProcess playlist name and Extract trackurl, num_tracks, name, num_samples
import json
import string
import copy

# Category 3: keep the challenge playlists whose 'num_samples' is 5 and that
# have a 'name'; write a reduced record for each of them.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set.json') as f:
    slice = json.load(f)

playlists = list()
count = 0  # number of playlists selected (all of them are named here)
for entry in slice['playlists']:
    if not (entry['num_samples'] == 5 and 'name' in entry):
        continue

    pid = entry['pid']
    # Drop the first 14 characters of every track_uri
    # (presumably the 'spotify:track:' prefix - TODO confirm).
    track_ids = [track['track_uri'][14:] for track in entry['tracks']]

    record = dict()
    # The filter above guarantees a title is present.
    # Lower-case it and collapse runs of whitespace to one space.
    record['name'] = ' '.join(entry['name'].lower().split())
    count += 1
    record['pid'] = pid
    record['num_tracks'] = entry['num_tracks']
    record['num_samples'] = entry['num_samples']
    record['tracks'] = track_ids
    playlists.append(record)

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case3.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print(count)

In [None]:
#  category 4: preProcess playlist name and Extract trackurl, num_tracks, name, num_samples
import json
import string
import copy

# Category 4: keep the challenge playlists whose 'num_samples' is 5 and that
# have NO 'name'; write a reduced record (pid, counts, trimmed track ids).
#
# The previous version re-tested "'name' in playlist" inside the selection,
# a branch that could never run because the filter already requires the name
# to be absent. It has been removed; 'count' is still the number of selected
# (unnamed) playlists.
#
# NOTE(review): the input is opened with the platform default encoding while
# the output is written as UTF-8 - confirm the challenge file is ASCII-only.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set.json') as f:
    slice = json.load(f)

playlists = list()
count = 0  # number of unnamed playlists selected
for entry in slice['playlists']:
    if entry['num_samples'] != 5 or 'name' in entry:
        continue

    record = dict()
    record['pid'] = entry['pid']
    record['num_tracks'] = entry['num_tracks']
    record['num_samples'] = entry['num_samples']
    # Drop the first 14 characters of every track_uri
    # (presumably the 'spotify:track:' prefix - TODO confirm).
    record['tracks'] = [track['track_uri'][14:] for track in entry['tracks']]
    playlists.append(record)
    count += 1

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case4.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print(count)

In [None]:
#  category 5: preProcess playlist name and Extract trackurl, num_tracks, name, num_samples
import json
import string
import copy

# Category 5: keep the challenge playlists whose 'num_samples' is 10 and that
# have a 'name'; write a reduced record for each of them.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set.json') as f:
    slice = json.load(f)

playlists = list()
count = 0  # number of playlists selected (all of them are named here)
for entry in slice['playlists']:
    if not (entry['num_samples'] == 10 and 'name' in entry):
        continue

    pid = entry['pid']
    # Drop the first 14 characters of every track_uri
    # (presumably the 'spotify:track:' prefix - TODO confirm).
    track_ids = [track['track_uri'][14:] for track in entry['tracks']]

    record = dict()
    # The filter above guarantees a title is present.
    # Lower-case it and collapse runs of whitespace to one space.
    record['name'] = ' '.join(entry['name'].lower().split())
    count += 1
    record['pid'] = pid
    record['num_tracks'] = entry['num_tracks']
    record['num_samples'] = entry['num_samples']
    record['tracks'] = track_ids
    playlists.append(record)

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case5.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print(count)

In [None]:
#  category 6: preProcess playlist name and Extract trackurl, num_tracks, name, num_samples
import json
import string
import copy

# Category 6: keep the challenge playlists whose 'num_samples' is 10 and that
# have NO 'name'; write a reduced record (pid, counts, trimmed track ids).
#
# The previous version re-tested "'name' in playlist" inside the selection,
# a branch that could never run because the filter already requires the name
# to be absent. It has been removed; 'count' is still the number of selected
# (unnamed) playlists.
#
# NOTE(review): the input is opened with the platform default encoding while
# the output is written as UTF-8 - confirm the challenge file is ASCII-only.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set.json') as f:
    slice = json.load(f)

playlists = list()
count = 0  # number of unnamed playlists selected
for entry in slice['playlists']:
    if entry['num_samples'] != 10 or 'name' in entry:
        continue

    record = dict()
    record['pid'] = entry['pid']
    record['num_tracks'] = entry['num_tracks']
    record['num_samples'] = entry['num_samples']
    # Drop the first 14 characters of every track_uri
    # (presumably the 'spotify:track:' prefix - TODO confirm).
    record['tracks'] = [track['track_uri'][14:] for track in entry['tracks']]
    playlists.append(record)
    count += 1

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case6.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print(count)

In [None]:
#  category 7: preProcess playlist name and Extract trackurl, num_tracks, name, num_samples
import json
import string
import copy

# Category 7: keep the challenge playlists whose 'num_samples' is 25 and whose
# listed tracks sit at consecutive positions starting from 0 (each track's
# 'pos' equals its index in the list); write a reduced record for each.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set.json') as f:
    slice = json.load(f)

playlists = list()
count = 0  # selected playlists that carry a 'name'
for entry in slice['playlists']:
    if entry['num_samples'] != 25:
        continue

    seeds = entry['tracks']
    # all() stops at the first mismatch, like the explicit loop-with-break.
    in_order = all(track['pos'] == idx for idx, track in enumerate(seeds))
    if not in_order:
        continue

    pid = entry['pid']
    # Drop the first 14 characters of every track_uri
    # (presumably the 'spotify:track:' prefix - TODO confirm).
    track_ids = [track['track_uri'][14:] for track in seeds]

    record = dict()
    if 'name' in entry:
        # Lower-case the title and collapse runs of whitespace to one space.
        record['name'] = ' '.join(entry['name'].lower().split())
        count += 1
    record['pid'] = pid
    record['num_tracks'] = entry['num_tracks']
    record['num_samples'] = entry['num_samples']
    record['tracks'] = track_ids
    playlists.append(record)

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case7.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print(count)

In [None]:
#  category 8: preProcess playlist name and Extract trackurl, num_tracks, name, num_samples
import json
import string
import copy

# Category 8: keep the challenge playlists whose 'num_samples' is 25 and whose
# listed tracks are NOT at consecutive positions from 0 (at least one track's
# 'pos' differs from its index in the list); write a reduced record for each.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set.json') as f:
    slice = json.load(f)

playlists = list()
count = 0  # selected playlists that carry a 'name'
for entry in slice['playlists']:
    if entry['num_samples'] != 25:
        continue

    seeds = entry['tracks']
    # all() stops at the first mismatch, like the explicit loop-with-break.
    in_order = all(track['pos'] == idx for idx, track in enumerate(seeds))
    if in_order:
        continue

    pid = entry['pid']
    # Drop the first 14 characters of every track_uri
    # (presumably the 'spotify:track:' prefix - TODO confirm).
    track_ids = [track['track_uri'][14:] for track in seeds]

    record = dict()
    if 'name' in entry:
        # Lower-case the title and collapse runs of whitespace to one space.
        record['name'] = ' '.join(entry['name'].lower().split())
        count += 1
    record['pid'] = pid
    record['num_tracks'] = entry['num_tracks']
    record['num_samples'] = entry['num_samples']
    record['tracks'] = track_ids
    playlists.append(record)

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case8.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print(count)

In [None]:
#  category 9: preProcess playlist name and Extract trackurl, num_tracks, name, num_samples
import json
import string
import copy

# Category 9: keep the challenge playlists whose 'num_samples' is 100 and whose
# listed tracks sit at consecutive positions starting from 0 (each track's
# 'pos' equals its index in the list); write a reduced record for each.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set.json') as f:
    slice = json.load(f)

playlists = list()
count = 0  # selected playlists that carry a 'name'
for entry in slice['playlists']:
    if entry['num_samples'] != 100:
        continue

    seeds = entry['tracks']
    # all() stops at the first mismatch, like the explicit loop-with-break.
    in_order = all(track['pos'] == idx for idx, track in enumerate(seeds))
    if not in_order:
        continue

    pid = entry['pid']
    # Drop the first 14 characters of every track_uri
    # (presumably the 'spotify:track:' prefix - TODO confirm).
    track_ids = [track['track_uri'][14:] for track in seeds]

    record = dict()
    if 'name' in entry:
        # Lower-case the title and collapse runs of whitespace to one space.
        record['name'] = ' '.join(entry['name'].lower().split())
        count += 1
    record['pid'] = pid
    record['num_tracks'] = entry['num_tracks']
    record['num_samples'] = entry['num_samples']
    record['tracks'] = track_ids
    playlists.append(record)

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case9.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print(count)

In [None]:
#  category 10: preProcess playlist name and Extract trackurl, num_tracks, name, num_samples
import json
import string
import copy

# Category 10: keep the challenge playlists whose 'num_samples' is 100 and
# whose listed tracks are NOT at consecutive positions from 0 (at least one
# track's 'pos' differs from its index in the list); write a reduced record
# (pid, counts, normalised title, trimmed track ids) for each.
#
# Fix: the input path used to read 'C:\\potifyPlaylistData\\...' (missing
# 'S'), unlike every other cell that reads challenge_set.json, so this cell
# could not open its input.
#
# NOTE(review): the input is opened with the platform default encoding while
# the output is written as UTF-8 - confirm the challenge file is ASCII-only.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set.json') as f:
    slice = json.load(f)

playlists = list()
count = 0  # selected playlists that carry a 'name'
for entry in slice['playlists']:
    if entry['num_samples'] != 100:
        continue

    seeds = entry['tracks']
    # all() stops at the first mismatch, so this is the same test as a
    # loop that breaks on the first out-of-place track.
    in_order = all(track['pos'] == idx for idx, track in enumerate(seeds))
    if in_order:
        continue

    pid = entry['pid']
    # Drop the first 14 characters of every track_uri
    # (presumably the 'spotify:track:' prefix - TODO confirm).
    track_ids = [track['track_uri'][14:] for track in seeds]

    record = dict()
    if 'name' in entry:
        # Lower-case the title and collapse runs of whitespace to one space.
        record['name'] = ' '.join(entry['name'].lower().split())
        count += 1
    record['pid'] = pid
    record['num_tracks'] = entry['num_tracks']
    record['num_samples'] = entry['num_samples']
    record['tracks'] = track_ids
    playlists.append(record)

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case10.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print(count)

In [None]:
# Special playlist names.
# Maps a normalised challenge-set title (spe_name) to a similar title
# (simil_name). The indexing cells fall back to this table when neither the
# raw title nor its punctuation-stripped form is a key of playlistName2idx.
import pickle  # imported here so the cell also runs on a fresh kernel

spe_name = ['je taime','raphop','itslit','universal stereo','gott','deporte','jakes playlist','mumford sons wilder mind deluxe',
           'we cant stop','throwbackpop','gezellig','throwbackrap','rythm','mixtape2']
simil_name = ["je t'aime",'rap hop','its lit','universal','god','sport',"jake's playlist",'mumford & sons – wilder mind',
           "we can't stop",'throwback pop','cozy','throwback rap','rhythm','mixtape 2']

# The two lists are paired by position, so they must stay the same length.
if len(spe_name) != len(simil_name):
    raise ValueError('spe_name and simil_name must have the same number of entries')
dict_speName = dict(zip(spe_name, simil_name))

# Persist the table; later cells reload it with pickle.load.
with open("dict_speName.txt","wb") as f:
    pickle.dump(dict_speName,f)

In [None]:
# Index playlist names in the challenge dataset for category 1 with integers (no tracks are indexed in this cell)

import json
import copy
import os
import pickle

# Load the pickled lookup tables from the working directory.
with open("playlistName2idx.txt","rb") as f:
    playlistName2idx = pickle.load(f)
with open("num_tracks2idx.txt","rb") as f:
    num_tracks2idx = pickle.load(f)
with open("dict_speName.txt","rb") as f:
    dict_speName = pickle.load(f)

# Category 1: replace each playlist title and track count with its integer
# index. No track list is indexed in this cell.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case1.json',encoding='utf8') as f:
    slice = json.load(f)

playlists = list()
count_playlist = 0
count_name = 0
for entry in slice['playlists']:
    record = dict()

    # Resolve the title in three steps: the title as stored, then the title
    # with everything but letters, digits and spaces removed, then the alias
    # from dict_speName. A title that matches none of them raises KeyError.
    raw_name = entry['name']
    if raw_name in playlistName2idx:
        lookup_key = raw_name
    else:
        stripped = ''.join(ch for ch in raw_name if ch.isalnum() or ch == ' ')
        stripped = ' '.join(stripped.split())
        lookup_key = stripped if stripped in playlistName2idx else dict_speName[stripped]
    record['nameIdx'] = playlistName2idx[lookup_key]
    count_name += 1

    record['num_tracks'] = entry["num_tracks"]
    record['num_tracksIdx'] = num_tracks2idx[entry["num_tracks"]]
    record['pid'] = entry['pid']

    playlists.append(record)
    count_playlist += 1

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_Indices_case1.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print('count_playlist:',count_playlist)
print('count_name:',count_name)

In [None]:
# Index tracks and playlist names in the challenge dataset for category 2 with integers

import json
import copy
import os
import pickle

# Load the pickled lookup tables from the working directory.
with open("playlistName2idx.txt","rb") as f:
    playlistName2idx = pickle.load(f)
with open("tracksUrl2idx.txt","rb") as f:
    tracksUrl2idx = pickle.load(f)
with open("num_tracks2idx.txt","rb") as f:
    num_tracks2idx = pickle.load(f)
with open("dict_speName.txt","rb") as f:
    dict_speName = pickle.load(f)

# Category 2: replace each playlist title, track count and track id with its
# integer index.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case2.json',encoding='utf8') as f:
    slice = json.load(f)

playlists = list()
count_playlist = 0
count_name = 0
for entry in slice['playlists']:
    record = dict()

    # Resolve the title in three steps: the title as stored, then the title
    # with everything but letters, digits and spaces removed, then the alias
    # from dict_speName. A title that matches none of them raises KeyError.
    raw_name = entry['name']
    if raw_name in playlistName2idx:
        lookup_key = raw_name
    else:
        stripped = ''.join(ch for ch in raw_name if ch.isalnum() or ch == ' ')
        stripped = ' '.join(stripped.split())
        lookup_key = stripped if stripped in playlistName2idx else dict_speName[stripped]
    record['nameIdx'] = playlistName2idx[lookup_key]
    count_name += 1

    record['num_tracks'] = entry["num_tracks"]
    record['num_tracksIdx'] = num_tracks2idx[entry["num_tracks"]]
    record['pid'] = entry['pid']

    seed_tracks = entry['tracks']
    record['tracksIdx'] = [tracksUrl2idx[track_id] for track_id in seed_tracks]

    # This category is expected to list exactly one track per playlist.
    if len(seed_tracks) != 1:
        print("num_tracks doesn't match!!")

    playlists.append(record)
    count_playlist += 1

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_Indices_case2.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print('count_playlist:',count_playlist)
print('count_name:',count_name)

In [None]:
# Index tracks and playlist names in the challenge dataset for category 3 with integers

import json
import copy
import os
import pickle

# Load the pickled lookup tables from the working directory.
with open("playlistName2idx.txt","rb") as f:
    playlistName2idx = pickle.load(f)
with open("tracksUrl2idx.txt","rb") as f:
    tracksUrl2idx = pickle.load(f)
with open("num_tracks2idx.txt","rb") as f:
    num_tracks2idx = pickle.load(f)
with open("dict_speName.txt","rb") as f:
    dict_speName = pickle.load(f)

# Category 3: replace each playlist title, track count and track id with its
# integer index.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case3.json',encoding='utf8') as f:
    slice = json.load(f)

playlists = list()
count_playlist = 0
count_name = 0
for entry in slice['playlists']:
    record = dict()

    # Resolve the title in three steps: the title as stored, then the title
    # with everything but letters, digits and spaces removed, then the alias
    # from dict_speName. A title that matches none of them raises KeyError.
    raw_name = entry['name']
    if raw_name in playlistName2idx:
        lookup_key = raw_name
    else:
        stripped = ''.join(ch for ch in raw_name if ch.isalnum() or ch == ' ')
        stripped = ' '.join(stripped.split())
        lookup_key = stripped if stripped in playlistName2idx else dict_speName[stripped]
    record['nameIdx'] = playlistName2idx[lookup_key]
    count_name += 1

    record['num_tracks'] = entry["num_tracks"]
    record['num_tracksIdx'] = num_tracks2idx[entry["num_tracks"]]
    record['pid'] = entry['pid']

    seed_tracks = entry['tracks']
    record['tracksIdx'] = [tracksUrl2idx[track_id] for track_id in seed_tracks]

    # This category is expected to list exactly five tracks per playlist.
    if len(seed_tracks) != 5:
        print("num_tracks doesn't match!!")

    playlists.append(record)
    count_playlist += 1

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_Indices_case3.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print('count_playlist:',count_playlist)
print('count_name:',count_name)

In [None]:
# Index tracks and playlist names in the challenge dataset for category 4 with integers

import json
import copy
import os
import pickle

# Load the pickled lookup tables from the working directory.
with open("playlistName2idx.txt","rb") as f:
    playlistName2idx = pickle.load(f)
with open("tracksUrl2idx.txt","rb") as f:
    tracksUrl2idx = pickle.load(f)
with open("num_tracks2idx.txt","rb") as f:
    num_tracks2idx = pickle.load(f)
with open("dict_speName.txt","rb") as f:
    dict_speName = pickle.load(f)

# Category 4: replace each track count and track id with its integer index.
# A title is indexed only when the playlist record has a 'name' key.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case4.json',encoding='utf8') as f:
    slice = json.load(f)

playlists = list()
count_playlist = 0
count_name = 0
for entry in slice['playlists']:
    record = dict()

    if 'name' in entry:
        # Resolve the title in three steps: the title as stored, then the
        # title with everything but letters, digits and spaces removed, then
        # the alias from dict_speName. No match at all raises KeyError.
        raw_name = entry['name']
        if raw_name in playlistName2idx:
            lookup_key = raw_name
        else:
            stripped = ''.join(ch for ch in raw_name if ch.isalnum() or ch == ' ')
            stripped = ' '.join(stripped.split())
            lookup_key = stripped if stripped in playlistName2idx else dict_speName[stripped]
        record['nameIdx'] = playlistName2idx[lookup_key]
        count_name += 1

    record['num_tracks'] = entry["num_tracks"]
    record['num_tracksIdx'] = num_tracks2idx[entry["num_tracks"]]
    record['pid'] = entry['pid']

    seed_tracks = entry['tracks']
    record['tracksIdx'] = [tracksUrl2idx[track_id] for track_id in seed_tracks]

    # This category is expected to list exactly five tracks per playlist.
    if len(seed_tracks) != 5:
        print("num_tracks doesn't match!!")

    playlists.append(record)
    count_playlist += 1

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_Indices_case4.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print('count_playlist:',count_playlist)
print('count_name:',count_name)

In [None]:
# Index tracks and playlist names in the challenge dataset for category 5 with integers

import json
import copy
import os
import pickle

# Load the pickled lookup tables from the working directory.
with open("playlistName2idx.txt","rb") as f:
    playlistName2idx = pickle.load(f)
with open("tracksUrl2idx.txt","rb") as f:
    tracksUrl2idx = pickle.load(f)
with open("num_tracks2idx.txt","rb") as f:
    num_tracks2idx = pickle.load(f)
with open("dict_speName.txt","rb") as f:
    dict_speName = pickle.load(f)

# Category 5: replace each playlist title, track count and track id with its
# integer index.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case5.json',encoding='utf8') as f:
    slice = json.load(f)

playlists = list()
count_playlist = 0
count_name = 0
for entry in slice['playlists']:
    record = dict()

    # Resolve the title in three steps: the title as stored, then the title
    # with everything but letters, digits and spaces removed, then the alias
    # from dict_speName. A title that matches none of them raises KeyError.
    raw_name = entry['name']
    if raw_name in playlistName2idx:
        lookup_key = raw_name
    else:
        stripped = ''.join(ch for ch in raw_name if ch.isalnum() or ch == ' ')
        stripped = ' '.join(stripped.split())
        lookup_key = stripped if stripped in playlistName2idx else dict_speName[stripped]
    record['nameIdx'] = playlistName2idx[lookup_key]
    count_name += 1

    record['num_tracks'] = entry["num_tracks"]
    record['num_tracksIdx'] = num_tracks2idx[entry["num_tracks"]]
    record['pid'] = entry['pid']

    seed_tracks = entry['tracks']
    record['tracksIdx'] = [tracksUrl2idx[track_id] for track_id in seed_tracks]

    # This category is expected to list exactly ten tracks per playlist.
    if len(seed_tracks) != 10:
        print("num_tracks doesn't match!!")

    playlists.append(record)
    count_playlist += 1

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_Indices_case5.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print('count_playlist:',count_playlist)
print('count_name:',count_name)

In [None]:
# Index tracks and playlist names in the challenge dataset for category 6 with integers

import json
import copy
import os
import pickle

# Load the pickled lookup tables from the working directory.
with open("playlistName2idx.txt","rb") as f:
    playlistName2idx = pickle.load(f)
with open("tracksUrl2idx.txt","rb") as f:
    tracksUrl2idx = pickle.load(f)
with open("num_tracks2idx.txt","rb") as f:
    num_tracks2idx = pickle.load(f)
with open("dict_speName.txt","rb") as f:
    dict_speName = pickle.load(f)

# Category 6: replace each track count and track id with its integer index.
# A title is indexed only when the playlist record has a 'name' key.
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case6.json',encoding='utf8') as f:
    slice = json.load(f)

playlists = list()
count_playlist = 0
count_name = 0
for entry in slice['playlists']:
    record = dict()

    if 'name' in entry:
        # Resolve the title in three steps: the title as stored, then the
        # title with everything but letters, digits and spaces removed, then
        # the alias from dict_speName. No match at all raises KeyError.
        raw_name = entry['name']
        if raw_name in playlistName2idx:
            lookup_key = raw_name
        else:
            stripped = ''.join(ch for ch in raw_name if ch.isalnum() or ch == ' ')
            stripped = ' '.join(stripped.split())
            lookup_key = stripped if stripped in playlistName2idx else dict_speName[stripped]
        record['nameIdx'] = playlistName2idx[lookup_key]
        count_name += 1

    record['num_tracks'] = entry["num_tracks"]
    record['num_tracksIdx'] = num_tracks2idx[entry["num_tracks"]]
    record['pid'] = entry['pid']

    seed_tracks = entry['tracks']
    record['tracksIdx'] = [tracksUrl2idx[track_id] for track_id in seed_tracks]

    # This category is expected to list exactly ten tracks per playlist.
    if len(seed_tracks) != 10:
        print("num_tracks doesn't match!!")

    playlists.append(record)
    count_playlist += 1

playlists_MPD = {'playlists': playlists}
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_Indices_case6.json', 'w', encoding='utf8') as outfile:
    outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print('count_playlist:',count_playlist)
print('count_name:',count_name)

In [None]:
# Index tracks and playlist names in the challenge dataset for category 7 with integers

import json
import copy
import os
import pickle

with open("playlistName2idx.txt","rb") as f:
    playlistName2idx = pickle.load(f)
with open("tracksUrl2idx.txt","rb") as f:
    tracksUrl2idx = pickle.load(f)
with open("num_tracks2idx.txt","rb") as f:
    num_tracks2idx = pickle.load(f)
with open("dict_speName.txt","rb") as f:
    dict_speName = pickle.load(f)

playlists_MPD = dict()
playlists = list()
playlist = dict()
tracksIdx = list()
count_playlist = 0
count_name = 0
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case7.json',encoding='utf8') as f:
    js = f.read()
    f.close()
    slice = json.loads(js)
    playlists_MPD.clear()
    playlists.clear()
    for playlist_id in range(len(slice['playlists'])):
        playlist.clear()
        del tracksIdx[:]
          
        name = slice['playlists'][playlist_id]['name']
        if name in playlistName2idx:
            nameIdx = playlistName2idx[slice['playlists'][playlist_id]['name']]
            playlist['nameIdx'] = nameIdx
            count_name += 1
        else:
            name = ' '.join(''.join(e for e in name if e.isalnum() or e == ' ').split())
            if name in playlistName2idx:
                nameIdx = playlistName2idx[name]
                playlist['nameIdx'] = nameIdx
                count_name += 1
            else:
                nameIdx = playlistName2idx[dict_speName[name]]
                playlist['nameIdx'] = nameIdx
                count_name += 1
    
        playlist['num_tracks'] = slice['playlists'][playlist_id]["num_tracks"]                     
        playlist['num_tracksIdx'] = num_tracks2idx[slice['playlists'][playlist_id]["num_tracks"]] 
        
        pid = slice['playlists'][playlist_id]['pid']
        playlist['pid'] = pid
        
        tracks = slice['playlists'][playlist_id]['tracks']
        tracks_len = len(tracks)

        for track_id in range(tracks_len):
            tracksIdx.append(tracksUrl2idx[tracks[track_id]])
        playlist['tracksIdx'] = tracksIdx
        
        if 25 != tracks_len:
            print("num_tracks doesn't match!!")

        playlists.append(copy.deepcopy(playlist))
        count_playlist += 1
    playlists_MPD['playlists'] = playlists
    with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_Indices_case7.json', 'w', encoding='utf8') as outfile:        
        outfile.write(json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': ')))
print('count_playlist:',count_playlist)
print('count_name:',count_name)

In [None]:
# Index the category-8 challenge playlists: replace every playlist name and
# seed track with the integer index stored in the pickled lookup tables, then
# write the indexed playlists back out as JSON.

import json
import copy
import os
import pickle

with open("playlistName2idx.txt","rb") as f:
    playlistName2idx = pickle.load(f)
with open("tracksUrl2idx.txt","rb") as f:
    tracksUrl2idx = pickle.load(f)
with open("num_tracks2idx.txt","rb") as f:
    num_tracks2idx = pickle.load(f)
with open("dict_speName.txt","rb") as f:
    dict_speName = pickle.load(f)

playlists = []
count_playlist = 0
count_name = 0

with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case8.json',encoding='utf8') as src:
    challenge = json.loads(src.read())

for entry in challenge['playlists']:
    # Resolve the name index in three steps: the raw name, then the name with
    # everything except alphanumerics and single spaces stripped, and finally
    # the dict_speName table keyed by that stripped name.
    raw_name = entry['name']
    if raw_name in playlistName2idx:
        nameIdx = playlistName2idx[raw_name]
    else:
        kept_chars = ''.join(ch for ch in raw_name if ch.isalnum() or ch == ' ')
        cleaned_name = ' '.join(kept_chars.split())
        lookup_key = cleaned_name if cleaned_name in playlistName2idx else dict_speName[cleaned_name]
        nameIdx = playlistName2idx[lookup_key]
    count_name += 1

    # A fresh dict per playlist, so no copy is needed before appending.
    indexed = {
        'nameIdx': nameIdx,
        'num_tracks': entry["num_tracks"],
        'num_tracksIdx': num_tracks2idx[entry["num_tracks"]],
        'pid': entry['pid'],
        'tracksIdx': [tracksUrl2idx[track] for track in entry['tracks']],
    }

    # This cell expects exactly 25 seed tracks per playlist.
    if len(indexed['tracksIdx']) != 25:
        print("num_tracks doesn't match!!")

    playlists.append(indexed)
    count_playlist += 1

playlists_MPD = {'playlists': playlists}
serialized = json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': '))
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_Indices_case8.json', 'w', encoding='utf8') as outfile:
    outfile.write(serialized)
print('count_playlist:',count_playlist)
print('count_name:',count_name)

In [None]:
# Index the category-9 challenge playlists: replace every playlist name and
# seed track with the integer index stored in the pickled lookup tables, then
# write the indexed playlists back out as JSON.

import json
import copy
import os
import pickle

with open("playlistName2idx.txt","rb") as f:
    playlistName2idx = pickle.load(f)
with open("tracksUrl2idx.txt","rb") as f:
    tracksUrl2idx = pickle.load(f)
with open("num_tracks2idx.txt","rb") as f:
    num_tracks2idx = pickle.load(f)
with open("dict_speName.txt","rb") as f:
    dict_speName = pickle.load(f)

playlists = []
count_playlist = 0
count_name = 0

with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case9.json',encoding='utf8') as src:
    challenge = json.loads(src.read())

for entry in challenge['playlists']:
    # Resolve the name index in three steps: the raw name, then the name with
    # everything except alphanumerics and single spaces stripped, and finally
    # the dict_speName table keyed by that stripped name.
    raw_name = entry['name']
    if raw_name in playlistName2idx:
        nameIdx = playlistName2idx[raw_name]
    else:
        kept_chars = ''.join(ch for ch in raw_name if ch.isalnum() or ch == ' ')
        cleaned_name = ' '.join(kept_chars.split())
        lookup_key = cleaned_name if cleaned_name in playlistName2idx else dict_speName[cleaned_name]
        nameIdx = playlistName2idx[lookup_key]
    count_name += 1

    # A fresh dict per playlist, so no copy is needed before appending.
    indexed = {
        'nameIdx': nameIdx,
        'num_tracks': entry["num_tracks"],
        'num_tracksIdx': num_tracks2idx[entry["num_tracks"]],
        'pid': entry['pid'],
        'tracksIdx': [tracksUrl2idx[track] for track in entry['tracks']],
    }

    # This cell expects exactly 100 seed tracks per playlist.
    if len(indexed['tracksIdx']) != 100:
        print("num_tracks doesn't match!!")

    playlists.append(indexed)
    count_playlist += 1

playlists_MPD = {'playlists': playlists}
serialized = json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': '))
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_Indices_case9.json', 'w', encoding='utf8') as outfile:
    outfile.write(serialized)
print('count_playlist:',count_playlist)
print('count_name:',count_name)

In [None]:
# Index the category-10 challenge playlists: replace every playlist name and
# seed track with the integer index stored in the pickled lookup tables, then
# write the indexed playlists back out as JSON.

import json
import copy
import os
import pickle

with open("playlistName2idx.txt","rb") as f:
    playlistName2idx = pickle.load(f)
with open("tracksUrl2idx.txt","rb") as f:
    tracksUrl2idx = pickle.load(f)
with open("num_tracks2idx.txt","rb") as f:
    num_tracks2idx = pickle.load(f)
with open("dict_speName.txt","rb") as f:
    dict_speName = pickle.load(f)

playlists = []
count_playlist = 0
count_name = 0

with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_case10.json',encoding='utf8') as src:
    challenge = json.loads(src.read())

for entry in challenge['playlists']:
    # Resolve the name index in three steps: the raw name, then the name with
    # everything except alphanumerics and single spaces stripped, and finally
    # the dict_speName table keyed by that stripped name.
    raw_name = entry['name']
    if raw_name in playlistName2idx:
        nameIdx = playlistName2idx[raw_name]
    else:
        kept_chars = ''.join(ch for ch in raw_name if ch.isalnum() or ch == ' ')
        cleaned_name = ' '.join(kept_chars.split())
        lookup_key = cleaned_name if cleaned_name in playlistName2idx else dict_speName[cleaned_name]
        nameIdx = playlistName2idx[lookup_key]
    count_name += 1

    # A fresh dict per playlist, so no copy is needed before appending.
    indexed = {
        'nameIdx': nameIdx,
        'num_tracks': entry["num_tracks"],
        'num_tracksIdx': num_tracks2idx[entry["num_tracks"]],
        'pid': entry['pid'],
        'tracksIdx': [tracksUrl2idx[track] for track in entry['tracks']],
    }

    # This cell expects exactly 100 seed tracks per playlist.
    if len(indexed['tracksIdx']) != 100:
        print("num_tracks doesn't match!!")

    playlists.append(indexed)
    count_playlist += 1

playlists_MPD = {'playlists': playlists}
serialized = json.dumps(playlists_MPD, indent=4, sort_keys=True, ensure_ascii=False,separators=(',', ': '))
with open('C:\\SpotifyPlaylistData\\ChallengeData\\challenge_set_Extract_Indices_case10.json', 'w', encoding='utf8') as outfile:
    outfile.write(serialized)
print('count_playlist:',count_playlist)
print('count_name:',count_name)

In [None]:
# Generate challenge dataset 1 as an HDF5 file holding the datasets
# 'num_tracksIdx', 'nameVec' and 'pid', corresponding to task category 1.

import numpy as np
import h5py
import pickle
import os
import json

# Create the output file with 1000 fixed-size rows per dataset.
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_1.hdf5','w') as f:
    for ds_name, ds_shape, ds_dtype in (
        ('num_tracksIdx', (1000,1), 'int32'),
        ('nameVec', (1000,40), 'float32'),
        ('pid', (1000,1), 'int32'),
    ):
        f.create_dataset(ds_name, ds_shape, dtype=ds_dtype)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

with open('C:/SpotifyPlaylistData/ChallengeData/challenge_set_Extract_Indices_case1.json',encoding='utf8') as src:
    challenge = json.loads(src.read())

count = 0
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_1.hdf5','a') as h5_out:
    num_tracks_ds = h5_out['num_tracksIdx']
    name_vec_ds = h5_out['nameVec']
    pid_ds = h5_out['pid']
    for entry in challenge['playlists']:
        # Only playlists that carry a name index get a row.
        if 'nameIdx' not in entry:
            continue
        num_tracks_ds[count,:] = entry['num_tracksIdx']
        # The feature table is keyed by the index rendered as a string.
        name_vec_ds[count,:] = nameIdx2Features[str(entry['nameIdx'])]
        pid_ds[count,:] = entry['pid']
        count += 1

print('count:',count)

In [None]:
# Generate challenge dataset 2 as an HDF5 file holding the datasets
# 'num_tracksIdx', 'nameVec', 'tracksVec' and 'pid', corresponding to task
# category 2.
# NOTE(review): this cell reads and writes under C:/Users/ZhenWang/... while the
# sibling dataset cells use C:/SpotifyPlaylistData/... - confirm the location.

import numpy as np
import h5py
import pickle
import os
import json

# Create the output file with 1000 fixed-size rows per dataset.
with h5py.File('C:/Users/ZhenWang/SpotifyPlaylistData/ChallengeData/challenge_Dataset_2.hdf5','w') as f:
    for ds_name, ds_shape, ds_dtype in (
        ('num_tracksIdx', (1000,1), 'int32'),
        ('nameVec', (1000,40), 'float32'),
        ('tracksVec', (1000,40), 'float32'),
        ('pid', (1000,1), 'int32'),
    ):
        f.create_dataset(ds_name, ds_shape, dtype=ds_dtype)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

with open('C:/Users/ZhenWang/SpotifyPlaylistData/ChallengeData/challenge_set_Extract_Indices_case2.json',encoding='utf8') as src:
    challenge = json.loads(src.read())

count = 0
with h5py.File('C:/Users/ZhenWang/SpotifyPlaylistData/ChallengeData/challenge_Dataset_2.hdf5','a') as h5_out:
    num_tracks_ds = h5_out['num_tracksIdx']
    name_vec_ds = h5_out['nameVec']
    tracks_vec_ds = h5_out['tracksVec']
    pid_ds = h5_out['pid']
    for entry in challenge['playlists']:
        # Only playlists with exactly one seed track and a name index get a row.
        if len(entry['tracksIdx']) != 1 or 'nameIdx' not in entry:
            continue
        num_tracks_ds[count,:] = entry['num_tracksIdx']
        # Both feature tables are keyed by the index rendered as a string.
        name_vec_ds[count,:] = nameIdx2Features[str(entry['nameIdx'])]
        seed_tracks = entry['tracksIdx']
        tracks_vec_ds[count,:] = trackIdx2Features[str(seed_tracks[0])]
        pid_ds[count,:] = entry['pid']
        count += 1

print('count:',count)

In [None]:
# Generate challenge dataset 3 as an HDF5 file holding the datasets
# 'num_tracksIdx', 'nameVec', 'tracksVec' and 'pid', corresponding to task
# category 3.

import numpy as np
import h5py
import pickle
import os
import json

# Create the output file with 1000 fixed-size rows per dataset.
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_3.hdf5','w') as f:
    for ds_name, ds_shape, ds_dtype in (
        ('num_tracksIdx', (1000,1), 'int32'),
        ('nameVec', (1000,40), 'float32'),
        ('tracksVec', (1000,200), 'float32'),
        ('pid', (1000,1), 'int32'),
    ):
        f.create_dataset(ds_name, ds_shape, dtype=ds_dtype)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

with open('C:/SpotifyPlaylistData/ChallengeData/challenge_set_Extract_Indices_case3.json',encoding='utf8') as src:
    challenge = json.loads(src.read())

count = 0
# Reusable 5 x 40 buffer: one 40-wide feature row per seed track.
track_buffer = np.zeros((5, 40), dtype=np.float32)
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_3.hdf5','a') as h5_out:
    num_tracks_ds = h5_out['num_tracksIdx']
    name_vec_ds = h5_out['nameVec']
    tracks_vec_ds = h5_out['tracksVec']
    pid_ds = h5_out['pid']
    for entry in challenge['playlists']:
        # Only playlists with exactly five seed tracks and a name index get a row.
        if len(entry['tracksIdx']) != 5 or 'nameIdx' not in entry:
            continue
        num_tracks_ds[count,:] = entry['num_tracksIdx']
        # Both feature tables are keyed by the index rendered as a string.
        name_vec_ds[count,:] = nameIdx2Features[str(entry['nameIdx'])]
        for slot, track_idx in enumerate(entry['tracksIdx']):
            track_buffer[slot,:] = trackIdx2Features[str(track_idx)]
        # Flatten to one 200-wide row, seed tracks in playlist order.
        tracks_vec_ds[count,:] = track_buffer.reshape(-1)
        pid_ds[count,:] = entry['pid']
        count += 1

print('count:',count)

In [None]:
# Generate challenge dataset 4 as an HDF5 file holding the datasets
# 'num_tracksIdx', 'tracksVec' and 'pid', corresponding to task category 4.

import numpy as np
import h5py
import pickle
import os
import json

# Single source of truth for the output file. The create step used to point at
# 'C:/potifyPlaylistData/...' (missing "S") while the fill step opened
# 'C:/SpotifyPlaylistData/...', so the fill step ran against a file that had
# none of the datasets created below.
dataset_path = 'C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_4.hdf5'

# Create the output file with 1000 fixed-size rows per dataset.
with h5py.File(dataset_path,'w') as f:
    f.create_dataset('num_tracksIdx', (1000,1),dtype = 'int32')
    f.create_dataset('tracksVec', (1000,200),dtype = 'float32')
    f.create_dataset('pid', (1000,1),dtype = 'int32')

# nameIdx2Features is not used by this cell; the load is kept unchanged.
with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

with open('C:/SpotifyPlaylistData/ChallengeData/challenge_set_Extract_Indices_case4.json',encoding='utf8') as f:
    challenge = json.loads(f.read())

count = 0
# Reusable 5 x 40 buffer: one 40-wide feature row per seed track.
tracksVec_arr = np.zeros([5,40],dtype=np.float32)
with h5py.File(dataset_path,'a') as f_data:
    for entry in challenge['playlists']:
        # Only playlists with exactly five seed tracks and no name index get a row.
        if len(entry['tracksIdx']) == 5 and 'nameIdx' not in entry:
            f_data['num_tracksIdx'][count,:] = entry['num_tracksIdx']
            tracksIdx = entry['tracksIdx']
            for i in range(5):
                # The feature table is keyed by the index rendered as a string.
                tracksVec_arr[i,:] = trackIdx2Features[str(tracksIdx[i])]
            # Flatten to one 200-wide row, seed tracks in playlist order.
            f_data['tracksVec'][count,:] = tracksVec_arr.flatten()
            f_data['pid'][count,:] = entry['pid']
            count += 1

print('count:',count)

In [None]:
# Generate challenge dataset 5 as an HDF5 file holding the datasets
# 'num_tracksIdx', 'nameVec', 'tracksVec' and 'pid', corresponding to task
# category 5.

import numpy as np
import h5py
import pickle
import os
import json

# Create the output file with 1000 fixed-size rows per dataset.
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_5.hdf5','w') as f:
    for ds_name, ds_shape, ds_dtype in (
        ('num_tracksIdx', (1000,1), 'int32'),
        ('nameVec', (1000,40), 'float32'),
        ('tracksVec', (1000,400), 'float32'),
        ('pid', (1000,1), 'int32'),
    ):
        f.create_dataset(ds_name, ds_shape, dtype=ds_dtype)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

with open('C:/SpotifyPlaylistData/ChallengeData/challenge_set_Extract_Indices_case5.json',encoding='utf8') as src:
    challenge = json.loads(src.read())

count = 0
# Reusable 10 x 40 buffer: one 40-wide feature row per seed track.
track_buffer = np.zeros((10, 40), dtype=np.float32)
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_5.hdf5','a') as h5_out:
    num_tracks_ds = h5_out['num_tracksIdx']
    name_vec_ds = h5_out['nameVec']
    tracks_vec_ds = h5_out['tracksVec']
    pid_ds = h5_out['pid']
    for entry in challenge['playlists']:
        # Only playlists with exactly ten seed tracks and a name index get a row.
        if len(entry['tracksIdx']) != 10 or 'nameIdx' not in entry:
            continue
        num_tracks_ds[count,:] = entry['num_tracksIdx']
        # Both feature tables are keyed by the index rendered as a string.
        name_vec_ds[count,:] = nameIdx2Features[str(entry['nameIdx'])]
        for slot, track_idx in enumerate(entry['tracksIdx']):
            track_buffer[slot,:] = trackIdx2Features[str(track_idx)]
        # Flatten to one 400-wide row, seed tracks in playlist order.
        tracks_vec_ds[count,:] = track_buffer.reshape(-1)
        pid_ds[count,:] = entry['pid']
        count += 1

print('count:',count)

In [None]:
# Generate challenge dataset 6 as an HDF5 file holding the datasets
# 'num_tracksIdx', 'tracksVec' and 'pid', corresponding to task category 6.

import numpy as np
import h5py
import pickle
import os
import json

# Create the output file with 1000 fixed-size rows per dataset.
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_6.hdf5','w') as f:
    for ds_name, ds_shape, ds_dtype in (
        ('num_tracksIdx', (1000,1), 'int32'),
        ('tracksVec', (1000,400), 'float32'),
        ('pid', (1000,1), 'int32'),
    ):
        f.create_dataset(ds_name, ds_shape, dtype=ds_dtype)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

with open('C:/SpotifyPlaylistData/ChallengeData/challenge_set_Extract_Indices_case6.json',encoding='utf8') as src:
    challenge = json.loads(src.read())

count = 0
# Reusable 10 x 40 buffer: one 40-wide feature row per seed track.
track_buffer = np.zeros((10, 40), dtype=np.float32)
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_6.hdf5','a') as h5_out:
    num_tracks_ds = h5_out['num_tracksIdx']
    tracks_vec_ds = h5_out['tracksVec']
    pid_ds = h5_out['pid']
    for entry in challenge['playlists']:
        # Only playlists with exactly ten seed tracks and no name index get a row.
        if len(entry['tracksIdx']) != 10 or 'nameIdx' in entry:
            continue
        num_tracks_ds[count,:] = entry['num_tracksIdx']
        for slot, track_idx in enumerate(entry['tracksIdx']):
            # The feature table is keyed by the index rendered as a string.
            track_buffer[slot,:] = trackIdx2Features[str(track_idx)]
        # Flatten to one 400-wide row, seed tracks in playlist order.
        tracks_vec_ds[count,:] = track_buffer.reshape(-1)
        pid_ds[count,:] = entry['pid']
        count += 1

print('count:',count)

In [None]:
# Generate challenge dataset 7 as an HDF5 file holding the datasets
# 'num_tracksIdx', 'nameVec', 'tracksVec' and 'pid', corresponding to task
# category 7.

import numpy as np
import h5py
import pickle
import os
import json

# Create the output file with 1000 fixed-size rows per dataset.
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_7.hdf5','w') as f:
    for ds_name, ds_shape, ds_dtype in (
        ('num_tracksIdx', (1000,1), 'int32'),
        ('nameVec', (1000,40), 'float32'),
        ('tracksVec', (1000,1000), 'float32'),
        ('pid', (1000,1), 'int32'),
    ):
        f.create_dataset(ds_name, ds_shape, dtype=ds_dtype)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

with open('C:/SpotifyPlaylistData/ChallengeData/challenge_set_Extract_Indices_case7.json',encoding='utf8') as src:
    challenge = json.loads(src.read())

count = 0
# Reusable 25 x 40 buffer: one 40-wide feature row per seed track.
track_buffer = np.zeros((25, 40), dtype=np.float32)
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_7.hdf5','a') as h5_out:
    num_tracks_ds = h5_out['num_tracksIdx']
    name_vec_ds = h5_out['nameVec']
    tracks_vec_ds = h5_out['tracksVec']
    pid_ds = h5_out['pid']
    for entry in challenge['playlists']:
        # Only playlists with exactly 25 seed tracks get a row.
        if len(entry['tracksIdx']) != 25:
            continue
        num_tracks_ds[count,:] = entry['num_tracksIdx']
        # Both feature tables are keyed by the index rendered as a string.
        name_vec_ds[count,:] = nameIdx2Features[str(entry['nameIdx'])]
        for slot, track_idx in enumerate(entry['tracksIdx']):
            track_buffer[slot,:] = trackIdx2Features[str(track_idx)]
        # Flatten to one 1000-wide row, seed tracks in playlist order.
        tracks_vec_ds[count,:] = track_buffer.reshape(-1)
        pid_ds[count,:] = entry['pid']
        count += 1

print('count:',count)

In [None]:
# Generate challenge dataset 8 as an HDF5 file holding the datasets
# 'num_tracksIdx', 'nameVec', 'tracksVec' and 'pid', corresponding to task
# category 8.

import numpy as np
import h5py
import pickle
import os
import json

# Create the output file with 1000 fixed-size rows per dataset.
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_8.hdf5','w') as f:
    for ds_name, ds_shape, ds_dtype in (
        ('num_tracksIdx', (1000,1), 'int32'),
        ('nameVec', (1000,40), 'float32'),
        ('tracksVec', (1000,1000), 'float32'),
        ('pid', (1000,1), 'int32'),
    ):
        f.create_dataset(ds_name, ds_shape, dtype=ds_dtype)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

with open('C:/SpotifyPlaylistData/ChallengeData/challenge_set_Extract_Indices_case8.json',encoding='utf8') as src:
    challenge = json.loads(src.read())

count = 0
# Reusable 25 x 40 buffer: one 40-wide feature row per seed track.
track_buffer = np.zeros((25, 40), dtype=np.float32)
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_8.hdf5','a') as h5_out:
    num_tracks_ds = h5_out['num_tracksIdx']
    name_vec_ds = h5_out['nameVec']
    tracks_vec_ds = h5_out['tracksVec']
    pid_ds = h5_out['pid']
    for entry in challenge['playlists']:
        # Only playlists with exactly 25 seed tracks get a row.
        if len(entry['tracksIdx']) != 25:
            continue
        num_tracks_ds[count,:] = entry['num_tracksIdx']
        # Both feature tables are keyed by the index rendered as a string.
        name_vec_ds[count,:] = nameIdx2Features[str(entry['nameIdx'])]
        for slot, track_idx in enumerate(entry['tracksIdx']):
            track_buffer[slot,:] = trackIdx2Features[str(track_idx)]
        # Flatten to one 1000-wide row, seed tracks in playlist order.
        tracks_vec_ds[count,:] = track_buffer.reshape(-1)
        pid_ds[count,:] = entry['pid']
        count += 1

print('count:',count)

In [None]:
# Generate challenge dataset 9 as an HDF5 file holding the datasets
# 'num_tracksIdx', 'nameVec', 'tracksVec' and 'pid', corresponding to task
# category 9.

import numpy as np
import h5py
import pickle
import os
import json

# Create the output file with 1000 fixed-size rows per dataset.
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_9.hdf5','w') as f:
    for ds_name, ds_shape, ds_dtype in (
        ('num_tracksIdx', (1000,1), 'int32'),
        ('nameVec', (1000,40), 'float32'),
        ('tracksVec', (1000,4000), 'float32'),
        ('pid', (1000,1), 'int32'),
    ):
        f.create_dataset(ds_name, ds_shape, dtype=ds_dtype)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

with open('C:/SpotifyPlaylistData/ChallengeData/challenge_set_Extract_Indices_case9.json',encoding='utf8') as src:
    challenge = json.loads(src.read())

count = 0
# Reusable 100 x 40 buffer: one 40-wide feature row per seed track.
track_buffer = np.zeros((100, 40), dtype=np.float32)
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_9.hdf5','a') as h5_out:
    num_tracks_ds = h5_out['num_tracksIdx']
    name_vec_ds = h5_out['nameVec']
    tracks_vec_ds = h5_out['tracksVec']
    pid_ds = h5_out['pid']
    for entry in challenge['playlists']:
        # Only playlists with exactly 100 seed tracks get a row.
        if len(entry['tracksIdx']) != 100:
            continue
        num_tracks_ds[count,:] = entry['num_tracksIdx']
        # Both feature tables are keyed by the index rendered as a string.
        name_vec_ds[count,:] = nameIdx2Features[str(entry['nameIdx'])]
        for slot, track_idx in enumerate(entry['tracksIdx']):
            track_buffer[slot,:] = trackIdx2Features[str(track_idx)]
        # Flatten to one 4000-wide row, seed tracks in playlist order.
        tracks_vec_ds[count,:] = track_buffer.reshape(-1)
        pid_ds[count,:] = entry['pid']
        count += 1

print('count:',count)

In [None]:
# Generate challenge dataset 10 as an HDF5 file holding the datasets
# 'num_tracksIdx', 'nameVec', 'tracksVec' and 'pid', corresponding to task
# category 10.

import numpy as np
import h5py
import pickle
import os
import json

# Create the output file with 1000 fixed-size rows per dataset.
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_10.hdf5','w') as f:
    for ds_name, ds_shape, ds_dtype in (
        ('num_tracksIdx', (1000,1), 'int32'),
        ('nameVec', (1000,40), 'float32'),
        ('tracksVec', (1000,4000), 'float32'),
        ('pid', (1000,1), 'int32'),
    ):
        f.create_dataset(ds_name, ds_shape, dtype=ds_dtype)

with open("nameIdx2Features.txt","rb") as f:
    nameIdx2Features = pickle.load(f)
with open("trackIdx2Features.txt","rb") as f:
    trackIdx2Features = pickle.load(f)

with open('C:/SpotifyPlaylistData/ChallengeData/challenge_set_Extract_Indices_case10.json',encoding='utf8') as src:
    challenge = json.loads(src.read())

count = 0
# Reusable 100 x 40 buffer: one 40-wide feature row per seed track.
track_buffer = np.zeros((100, 40), dtype=np.float32)
with h5py.File('C:/SpotifyPlaylistData/ChallengeData/challenge_Dataset_10.hdf5','a') as h5_out:
    num_tracks_ds = h5_out['num_tracksIdx']
    name_vec_ds = h5_out['nameVec']
    tracks_vec_ds = h5_out['tracksVec']
    pid_ds = h5_out['pid']
    for entry in challenge['playlists']:
        # Only playlists with exactly 100 seed tracks get a row.
        if len(entry['tracksIdx']) != 100:
            continue
        num_tracks_ds[count,:] = entry['num_tracksIdx']
        # Both feature tables are keyed by the index rendered as a string.
        name_vec_ds[count,:] = nameIdx2Features[str(entry['nameIdx'])]
        for slot, track_idx in enumerate(entry['tracksIdx']):
            track_buffer[slot,:] = trackIdx2Features[str(track_idx)]
        # Flatten to one 4000-wide row, seed tracks in playlist order.
        tracks_vec_ds[count,:] = track_buffer.reshape(-1)
        pid_ds[count,:] = entry['pid']
        count += 1

print('count:',count)

In [None]:
# Output predicted results for category 1: for each of the 1000 rows in
# challenge_Dataset_1.hdf5, feed num_tracksIdx and the name vector to the
# task-1 model and write the 500 highest-scoring tracks to a CSV row.

from keras.models import load_model #Library for loading saved models.
import csv
import h5py
import numpy as np
import pickle
model = load_model('./plylistContinuation/task1.h5')
model.summary()

with open("C:/Users/zwang10/Research/idx2tracksUrl.txt","rb") as f:
    idx2tracksUrl = pickle.load(f)
    
# Header row written before the per-playlist rows.
fields = ["FishInMedi","main","Results Submission","zwang10@uwyo.edu"]
count = 0
# newline='' is required when handing a text file to csv.writer; without it the
# writer's own '\r\n' is translated again on Windows and every row is followed
# by a blank line.
with h5py.File('C:/Users/zwang10/Research/challenge_Dataset_1.hdf5','r') as f,open('C:/Users/zwang10/Research/challenge_1.csv','w',newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(fields)
    for playlist_id in range(0,1000):
        # Reshape each stored row into a single-sample batch for the model.
        num_tracksIdx = f['num_tracksIdx'][playlist_id,:].reshape([-1,1])
        name_vec = f['nameVec'][playlist_id,:].reshape([-1,40])
        pid = f['pid'][playlist_id,:][0]
        row = [pid]

        predictions = model.predict([num_tracksIdx,name_vec])
    
        # argsort is ascending, so take the last 500 indices and flip them to
        # list the highest-scoring track first.
        for trackIdx in np.flip(np.argsort(predictions)[0,-500:],axis=-1):
            row.append('spotify:track:'+idx2tracksUrl[trackIdx])
        csvwriter.writerow(row)
        count += 1
print('count:',count)
print('Done!')

In [None]:
# Output predicted results for category 2: for each of the 1000 rows in
# challenge_Dataset_2.hdf5, run the task-2 model and write the 500
# highest-scoring tracks that are not already seed tracks to a CSV row.

from keras.models import load_model #Library for loading saved models.
import json
import csv
import h5py
import numpy as np
import pickle
model = load_model('./plylistContinuation/task2.h5')
model.summary()

with open("C:/Users/zwang10/Research/idx2tracksUrl.txt","rb") as f:
    idx2tracksUrl = pickle.load(f)
    
count = 0
count_misMatch = 0
# Context managers close all three files even if a prediction fails part-way.
# newline='' is required when handing a text file to csv.writer; without it the
# writer's own '\r\n' is translated again on Windows and every row is followed
# by a blank line.
with h5py.File('C:/Users/zwang10/Research/challenge_Dataset_2.hdf5','r') as f, \
        open('C:/Users/zwang10/Research/challenge_2.csv','w',newline='') as csvfile, \
        open('C:/Users/zwang10/Research/challenge_set_Extract_Indices_case2.json',encoding='utf8') as fj:
    challenge = json.loads(fj.read())
    csvwriter = csv.writer(csvfile)
    for playlist_id in range(0,1000):
        print('playlist_id -',playlist_id)
        # Reshape each stored row into a single-sample batch for the model.
        num_tracksIdx = f['num_tracksIdx'][playlist_id,:].reshape([-1,1])
        name_vec = f['nameVec'][playlist_id,:].reshape([-1,40])
        tracks_vec = f['tracksVec'][playlist_id,:].reshape([-1,40])
        pid = f['pid'][playlist_id,:][0]
        row = [pid]
        predictions = model.predict([num_tracksIdx,name_vec,tracks_vec])

        # The JSON entry at the same position is expected to describe the same
        # playlist as the HDF5 row; report and count any disagreement.
        js_pid = challenge['playlists'][playlist_id]['pid']
        if js_pid != pid:
            print("Does't match!")
            count_misMatch += 1
        tracksIdx = challenge['playlists'][playlist_id]['tracksIdx']
        break_flag = 0
        # Walk the 501 highest-scoring indices, best first. The surplus over
        # 500 leaves room to skip tracks already in the playlist.
        for trackIdx in np.flip(np.argsort(predictions)[0,-501:],axis=-1):
            if trackIdx not in tracksIdx:
                row.append('spotify:track:'+idx2tracksUrl[trackIdx])
                break_flag += 1
                count += 1
            if break_flag == 500:
                break
        csvwriter.writerow(row)
print('count:',count)
print('misMatch:',count_misMatch)
print('Done!')

In [None]:
# output predicted results for category 3 

from keras.models import load_model #Library for loading saved models.
import json
import csv
import h5py
import numpy as np
import pickle
# Category 3: run the trained model on each of the 1000 challenge playlists
# and write one CSV row per playlist: the pid followed by up to 500
# 'spotify:track:' URIs, best-scoring first.
model = load_model('./plylistContinuation/task3.h5')
model.summary()

# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it was produced by this project.
with open("C:/Users/zwang10/Research/idx2tracksUrl.txt","rb") as f:
    idx2tracksUrl = pickle.load(f)

count = 0           # total number of track URIs written across all playlists
count_misMatch = 0  # playlists whose HDF5 pid differs from the JSON pid

# NOTE(review): `slice` shadows the builtin; the name is kept because other
# cells of this notebook use the same variable name.
with open('C:/Users/zwang10/Research/challenge_set_Extract_Indices_case3.json',encoding='utf8') as fj:
    slice = json.load(fj)

# newline='' is what the csv module requires for files it writes; without it
# every row is followed by a blank line on Windows. The context managers
# also close both files if prediction fails part-way through.
with h5py.File('C:/Users/zwang10/Research/challenge_Dataset_3.hdf5','r') as f, \
        open('C:/Users/zwang10/Research/challenge_3.csv','w',newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for playlist_id in range(0,1000):
        print('playlist_id -',playlist_id)
        # Reshape each stored row into a batch of one for model.predict.
        num_tracksIdx = f['num_tracksIdx'][playlist_id,:].reshape([-1,1])
        name_vec = f['nameVec'][playlist_id,:].reshape([-1,40])
        tracks_vec = f['tracksVec'][playlist_id,:].reshape([-1,200])
        pid = f['pid'][playlist_id,:][0]
        row = [pid]
        predictions = model.predict([num_tracksIdx,name_vec,tracks_vec])

        # Sanity check: the HDF5 rows and the JSON playlists are expected to
        # be stored in the same order.
        js_pid = slice['playlists'][playlist_id]['pid']
        if js_pid != pid:
            print("Does't match!")
            count_misMatch += 1
        tracksIdx = slice['playlists'][playlist_id]['tracksIdx']
        # Set lookup instead of scanning the list for every candidate
        # (assumes tracksIdx is a flat list of integer indices -- TODO confirm).
        known_tracks = set(tracksIdx)
        break_flag = 0  # number of recommendations collected for this playlist
        # Walk the 505 highest-scoring indices in descending score order; the
        # window is larger than 500 to leave room for indices that are
        # skipped because they are already listed in tracksIdx.
        for trackIdx in np.flip(np.argsort(predictions)[0,-505:],axis=-1):
            if trackIdx not in known_tracks:
                row.append('spotify:track:'+idx2tracksUrl[trackIdx])
                break_flag += 1
                count += 1
            if break_flag == 500:
                break
        csvwriter.writerow(row)
print('count:',count)
print('misMatch:',count_misMatch)
print('Done!')

In [None]:
# output predicted results for category 4

from keras.models import load_model #Library for loading saved models.
import json
import csv
import h5py
import numpy as np
import pickle
# Category 4: run the trained model on each of the 1000 challenge playlists
# and write one CSV row per playlist: the pid followed by up to 500
# 'spotify:track:' URIs, best-scoring first. Unlike the neighbouring
# category cells, this model takes no playlist-name input.
model = load_model('./plylistContinuation/task4.h5')
model.summary()

# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it was produced by this project.
with open("C:/Users/zwang10/Research/idx2tracksUrl.txt","rb") as f:
    idx2tracksUrl = pickle.load(f)

count = 0           # total number of track URIs written across all playlists
count_misMatch = 0  # playlists whose HDF5 pid differs from the JSON pid

# NOTE(review): `slice` shadows the builtin; the name is kept because other
# cells of this notebook use the same variable name.
with open('C:/Users/zwang10/Research/challenge_set_Extract_Indices_case4.json',encoding='utf8') as fj:
    slice = json.load(fj)

# newline='' is what the csv module requires for files it writes; without it
# every row is followed by a blank line on Windows. The context managers
# also close both files if prediction fails part-way through.
with h5py.File('C:/Users/zwang10/Research/challenge_Dataset_4.hdf5','r') as f, \
        open('C:/Users/zwang10/Research/challenge_4.csv','w',newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for playlist_id in range(0,1000):
        print('playlist_id -',playlist_id)
        # Reshape each stored row into a batch of one for model.predict.
        num_tracksIdx = f['num_tracksIdx'][playlist_id,:].reshape([-1,1])
        tracks_vec = f['tracksVec'][playlist_id,:].reshape([-1,200])
        pid = f['pid'][playlist_id,:][0]
        row = [pid]
        predictions = model.predict([num_tracksIdx,tracks_vec])

        # Sanity check: the HDF5 rows and the JSON playlists are expected to
        # be stored in the same order.
        js_pid = slice['playlists'][playlist_id]['pid']
        if js_pid != pid:
            print("Does't match!")
            count_misMatch += 1
        tracksIdx = slice['playlists'][playlist_id]['tracksIdx']
        # Set lookup instead of scanning the list for every candidate
        # (assumes tracksIdx is a flat list of integer indices -- TODO confirm).
        known_tracks = set(tracksIdx)
        break_flag = 0  # number of recommendations collected for this playlist
        # Walk the 505 highest-scoring indices in descending score order; the
        # window is larger than 500 to leave room for indices that are
        # skipped because they are already listed in tracksIdx.
        for trackIdx in np.flip(np.argsort(predictions)[0,-505:],axis=-1):
            if trackIdx not in known_tracks:
                row.append('spotify:track:'+idx2tracksUrl[trackIdx])
                break_flag += 1
                count += 1
            if break_flag == 500:
                break
        csvwriter.writerow(row)
print('count:',count)
print('misMatch:',count_misMatch)
print('Done!')

In [None]:
# output predicted results for category 5

from keras.models import load_model #Library for loading saved models.
import json
import csv
import h5py
import numpy as np
import pickle
# Category 5: run the trained model on each of the 1000 challenge playlists
# and write one CSV row per playlist: the pid followed by up to 500
# 'spotify:track:' URIs, best-scoring first.
model = load_model('./plylistContinuation/task5.h5')
model.summary()

# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it was produced by this project.
with open("C:/Users/zwang10/Research/idx2tracksUrl.txt","rb") as f:
    idx2tracksUrl = pickle.load(f)

count = 0           # total number of track URIs written across all playlists
count_misMatch = 0  # playlists whose HDF5 pid differs from the JSON pid

# NOTE(review): `slice` shadows the builtin; the name is kept because other
# cells of this notebook use the same variable name.
with open('C:/Users/zwang10/Research/challenge_set_Extract_Indices_case5.json',encoding='utf8') as fj:
    slice = json.load(fj)

# newline='' is what the csv module requires for files it writes; without it
# every row is followed by a blank line on Windows. The context managers
# also close both files if prediction fails part-way through.
with h5py.File('C:/Users/zwang10/Research/challenge_Dataset_5.hdf5','r') as f, \
        open('C:/Users/zwang10/Research/challenge_5.csv','w',newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for playlist_id in range(0,1000):
        print('playlist_id -',playlist_id)
        # Reshape each stored row into a batch of one for model.predict.
        num_tracksIdx = f['num_tracksIdx'][playlist_id,:].reshape([-1,1])
        name_vec = f['nameVec'][playlist_id,:].reshape([-1,40])
        tracks_vec = f['tracksVec'][playlist_id,:].reshape([-1,400])
        pid = f['pid'][playlist_id,:][0]
        row = [pid]
        predictions = model.predict([num_tracksIdx,name_vec,tracks_vec])

        # Sanity check: the HDF5 rows and the JSON playlists are expected to
        # be stored in the same order.
        js_pid = slice['playlists'][playlist_id]['pid']
        if js_pid != pid:
            print("Does't match!")
            count_misMatch += 1
        tracksIdx = slice['playlists'][playlist_id]['tracksIdx']
        # Set lookup instead of scanning the list for every candidate
        # (assumes tracksIdx is a flat list of integer indices -- TODO confirm).
        known_tracks = set(tracksIdx)
        break_flag = 0  # number of recommendations collected for this playlist
        # Walk the 510 highest-scoring indices in descending score order; the
        # window is larger than 500 to leave room for indices that are
        # skipped because they are already listed in tracksIdx.
        for trackIdx in np.flip(np.argsort(predictions)[0,-510:],axis=-1):
            if trackIdx not in known_tracks:
                row.append('spotify:track:'+idx2tracksUrl[trackIdx])
                break_flag += 1
                count += 1
            if break_flag == 500:
                break
        csvwriter.writerow(row)
print('count:',count)
print('misMatch:',count_misMatch)
print('Done!')

In [None]:
# output predicted results for category 6

from keras.models import load_model #Library for loading saved models.
import json
import csv
import h5py
import numpy as np
import pickle
# Category 6: run the trained model on each of the 1000 challenge playlists
# and write one CSV row per playlist: the pid followed by up to 500
# 'spotify:track:' URIs, best-scoring first. Unlike the neighbouring
# category cells, this model takes no playlist-name input.
model = load_model('./plylistContinuation/task6.h5')
model.summary()

# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it was produced by this project.
with open("C:/Users/zwang10/Research/idx2tracksUrl.txt","rb") as f:
    idx2tracksUrl = pickle.load(f)

count = 0           # total number of track URIs written across all playlists
count_misMatch = 0  # playlists whose HDF5 pid differs from the JSON pid

# NOTE(review): `slice` shadows the builtin; the name is kept because other
# cells of this notebook use the same variable name.
with open('C:/Users/zwang10/Research/challenge_set_Extract_Indices_case6.json',encoding='utf8') as fj:
    slice = json.load(fj)

# newline='' is what the csv module requires for files it writes; without it
# every row is followed by a blank line on Windows. The context managers
# also close both files if prediction fails part-way through.
with h5py.File('C:/Users/zwang10/Research/challenge_Dataset_6.hdf5','r') as f, \
        open('C:/Users/zwang10/Research/challenge_6.csv','w',newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for playlist_id in range(0,1000):
        print('playlist_id -',playlist_id)
        # Reshape each stored row into a batch of one for model.predict.
        num_tracksIdx = f['num_tracksIdx'][playlist_id,:].reshape([-1,1])
        tracks_vec = f['tracksVec'][playlist_id,:].reshape([-1,400])
        pid = f['pid'][playlist_id,:][0]
        row = [pid]
        predictions = model.predict([num_tracksIdx,tracks_vec])

        # Sanity check: the HDF5 rows and the JSON playlists are expected to
        # be stored in the same order.
        js_pid = slice['playlists'][playlist_id]['pid']
        if js_pid != pid:
            print("Does't match!")
            count_misMatch += 1
        tracksIdx = slice['playlists'][playlist_id]['tracksIdx']
        # Set lookup instead of scanning the list for every candidate
        # (assumes tracksIdx is a flat list of integer indices -- TODO confirm).
        known_tracks = set(tracksIdx)
        break_flag = 0  # number of recommendations collected for this playlist
        # Walk the 510 highest-scoring indices in descending score order; the
        # window is larger than 500 to leave room for indices that are
        # skipped because they are already listed in tracksIdx.
        for trackIdx in np.flip(np.argsort(predictions)[0,-510:],axis=-1):
            if trackIdx not in known_tracks:
                row.append('spotify:track:'+idx2tracksUrl[trackIdx])
                break_flag += 1
                count += 1
            if break_flag == 500:
                break
        csvwriter.writerow(row)
print('count:',count)
print('misMatch:',count_misMatch)
print('Done!')

In [None]:
# output predicted results for category 7

from keras.models import load_model #Library for loading saved models.
import json
import csv
import h5py
import numpy as np
import pickle
# Category 7: run the trained model on each of the 1000 challenge playlists
# and write one CSV row per playlist: the pid followed by up to 500
# 'spotify:track:' URIs, best-scoring first.
model = load_model('./plylistContinuation/task7.h5')
model.summary()

# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it was produced by this project.
with open("C:/Users/zwang10/Research/idx2tracksUrl.txt","rb") as f:
    idx2tracksUrl = pickle.load(f)

count = 0           # total number of track URIs written across all playlists
count_misMatch = 0  # playlists whose HDF5 pid differs from the JSON pid

# NOTE(review): `slice` shadows the builtin; the name is kept because other
# cells of this notebook use the same variable name.
with open('C:/Users/zwang10/Research/challenge_set_Extract_Indices_case7.json',encoding='utf8') as fj:
    slice = json.load(fj)

# newline='' is what the csv module requires for files it writes; without it
# every row is followed by a blank line on Windows. The context managers
# also close both files if prediction fails part-way through.
with h5py.File('C:/Users/zwang10/Research/challenge_Dataset_7.hdf5','r') as f, \
        open('C:/Users/zwang10/Research/challenge_7.csv','w',newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for playlist_id in range(0,1000):
        print('playlist_id -',playlist_id)
        # Reshape each stored row into a batch of one for model.predict.
        num_tracksIdx = f['num_tracksIdx'][playlist_id,:].reshape([-1,1])
        name_vec = f['nameVec'][playlist_id,:].reshape([-1,40])
        tracks_vec = f['tracksVec'][playlist_id,:].reshape([-1,1000])
        pid = f['pid'][playlist_id,:][0]
        row = [pid]
        predictions = model.predict([num_tracksIdx,name_vec,tracks_vec])

        # Sanity check: the HDF5 rows and the JSON playlists are expected to
        # be stored in the same order.
        js_pid = slice['playlists'][playlist_id]['pid']
        if js_pid != pid:
            print("Does't match!")
            count_misMatch += 1
        tracksIdx = slice['playlists'][playlist_id]['tracksIdx']
        # Set lookup instead of scanning the list for every candidate
        # (assumes tracksIdx is a flat list of integer indices -- TODO confirm).
        known_tracks = set(tracksIdx)
        break_flag = 0  # number of recommendations collected for this playlist
        # Walk the 525 highest-scoring indices in descending score order; the
        # window is larger than 500 to leave room for indices that are
        # skipped because they are already listed in tracksIdx.
        for trackIdx in np.flip(np.argsort(predictions)[0,-525:],axis=-1):
            if trackIdx not in known_tracks:
                row.append('spotify:track:'+idx2tracksUrl[trackIdx])
                break_flag += 1
                count += 1
            if break_flag == 500:
                break
        csvwriter.writerow(row)
print('count:',count)
print('misMatch:',count_misMatch)
print('Done!')

In [None]:
# output predicted results for category 8

from keras.models import load_model #Library for loading saved models.
import json
import csv
import h5py
import numpy as np
import pickle
# Category 8: run the trained model on each of the 1000 challenge playlists
# and write one CSV row per playlist: the pid followed by up to 500
# 'spotify:track:' URIs, best-scoring first.
model = load_model('./plylistContinuation/task8.h5')
model.summary()

# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it was produced by this project.
with open("C:/Users/zwang10/Research/idx2tracksUrl.txt","rb") as f:
    idx2tracksUrl = pickle.load(f)

count = 0           # total number of track URIs written across all playlists
count_misMatch = 0  # playlists whose HDF5 pid differs from the JSON pid

# NOTE(review): `slice` shadows the builtin; the name is kept because other
# cells of this notebook use the same variable name.
with open('C:/Users/zwang10/Research/challenge_set_Extract_Indices_case8.json',encoding='utf8') as fj:
    slice = json.load(fj)

# newline='' is what the csv module requires for files it writes; without it
# every row is followed by a blank line on Windows. The context managers
# also close both files if prediction fails part-way through.
with h5py.File('C:/Users/zwang10/Research/challenge_Dataset_8.hdf5','r') as f, \
        open('C:/Users/zwang10/Research/challenge_8.csv','w',newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for playlist_id in range(0,1000):
        print('playlist_id -',playlist_id)
        # Reshape each stored row into a batch of one for model.predict.
        num_tracksIdx = f['num_tracksIdx'][playlist_id,:].reshape([-1,1])
        name_vec = f['nameVec'][playlist_id,:].reshape([-1,40])
        tracks_vec = f['tracksVec'][playlist_id,:].reshape([-1,1000])
        pid = f['pid'][playlist_id,:][0]
        row = [pid]
        predictions = model.predict([num_tracksIdx,name_vec,tracks_vec])

        # Sanity check: the HDF5 rows and the JSON playlists are expected to
        # be stored in the same order.
        js_pid = slice['playlists'][playlist_id]['pid']
        if js_pid != pid:
            print("Does't match!")
            count_misMatch += 1
        tracksIdx = slice['playlists'][playlist_id]['tracksIdx']
        # Set lookup instead of scanning the list for every candidate
        # (assumes tracksIdx is a flat list of integer indices -- TODO confirm).
        known_tracks = set(tracksIdx)
        break_flag = 0  # number of recommendations collected for this playlist
        # Walk the 525 highest-scoring indices in descending score order; the
        # window is larger than 500 to leave room for indices that are
        # skipped because they are already listed in tracksIdx.
        for trackIdx in np.flip(np.argsort(predictions)[0,-525:],axis=-1):
            if trackIdx not in known_tracks:
                row.append('spotify:track:'+idx2tracksUrl[trackIdx])
                break_flag += 1
                count += 1
            if break_flag == 500:
                break
        csvwriter.writerow(row)
print('count:',count)
print('misMatch:',count_misMatch)
print('Done!')

In [None]:
# output predicted results for category 9

from keras.models import load_model #Library for loading saved models.
import json
import csv
import h5py
import numpy as np
import pickle
# Category 9: run the trained model on each of the 1000 challenge playlists
# and write one CSV row per playlist: the pid followed by up to 500
# 'spotify:track:' URIs, best-scoring first.
model = load_model('./plylistContinuation/task9.h5')
model.summary()

# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it was produced by this project.
with open("C:/Users/zwang10/Research/idx2tracksUrl.txt","rb") as f:
    idx2tracksUrl = pickle.load(f)

count = 0           # total number of track URIs written across all playlists
count_misMatch = 0  # playlists whose HDF5 pid differs from the JSON pid

# NOTE(review): `slice` shadows the builtin; the name is kept because other
# cells of this notebook use the same variable name.
with open('C:/Users/zwang10/Research/challenge_set_Extract_Indices_case9.json',encoding='utf8') as fj:
    slice = json.load(fj)

# newline='' is what the csv module requires for files it writes; without it
# every row is followed by a blank line on Windows. The context managers
# also close both files if prediction fails part-way through.
with h5py.File('C:/Users/zwang10/Research/challenge_Dataset_9.hdf5','r') as f, \
        open('C:/Users/zwang10/Research/challenge_9.csv','w',newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for playlist_id in range(0,1000):
        print('playlist_id -',playlist_id)
        # Reshape each stored row into a batch of one for model.predict.
        num_tracksIdx = f['num_tracksIdx'][playlist_id,:].reshape([-1,1])
        name_vec = f['nameVec'][playlist_id,:].reshape([-1,40])
        tracks_vec = f['tracksVec'][playlist_id,:].reshape([-1,4000])
        pid = f['pid'][playlist_id,:][0]
        row = [pid]
        predictions = model.predict([num_tracksIdx,name_vec,tracks_vec])

        # Sanity check: the HDF5 rows and the JSON playlists are expected to
        # be stored in the same order.
        js_pid = slice['playlists'][playlist_id]['pid']
        if js_pid != pid:
            print("Does't match!")
            count_misMatch += 1
        tracksIdx = slice['playlists'][playlist_id]['tracksIdx']
        # Set lookup instead of scanning the list for every candidate
        # (assumes tracksIdx is a flat list of integer indices -- TODO confirm).
        known_tracks = set(tracksIdx)
        break_flag = 0  # number of recommendations collected for this playlist
        # Walk the 600 highest-scoring indices in descending score order; the
        # window is larger than 500 to leave room for indices that are
        # skipped because they are already listed in tracksIdx.
        for trackIdx in np.flip(np.argsort(predictions)[0,-600:],axis=-1):
            if trackIdx not in known_tracks:
                row.append('spotify:track:'+idx2tracksUrl[trackIdx])
                break_flag += 1
                count += 1
            if break_flag == 500:
                break
        csvwriter.writerow(row)
print('count:',count)
print('misMatch:',count_misMatch)
print('Done!')

In [None]:
# output predicted results for category 10

from keras.models import load_model #Library for loading saved models.
import json
import csv
import h5py
import numpy as np
import pickle
# Category 10: run the trained model on each of the 1000 challenge playlists
# and write one CSV row per playlist: the pid followed by up to 500
# 'spotify:track:' URIs, best-scoring first.
model = load_model('./plylistContinuation/task10.h5')
model.summary()

# NOTE(review): pickle.load can execute arbitrary code; only load this file
# if it was produced by this project.
with open("C:/Users/zwang10/Research/idx2tracksUrl.txt","rb") as f:
    idx2tracksUrl = pickle.load(f)

count = 0           # total number of track URIs written across all playlists
count_misMatch = 0  # playlists whose HDF5 pid differs from the JSON pid

# NOTE(review): `slice` shadows the builtin; the name is kept because other
# cells of this notebook use the same variable name.
with open('C:/Users/zwang10/Research/challenge_set_Extract_Indices_case10.json',encoding='utf8') as fj:
    slice = json.load(fj)

# newline='' is what the csv module requires for files it writes; without it
# every row is followed by a blank line on Windows. The context managers
# also close both files if prediction fails part-way through.
with h5py.File('C:/Users/zwang10/Research/challenge_Dataset_10.hdf5','r') as f, \
        open('C:/Users/zwang10/Research/challenge_10.csv','w',newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    for playlist_id in range(0,1000):
        print('playlist_id -',playlist_id)
        # Reshape each stored row into a batch of one for model.predict.
        num_tracksIdx = f['num_tracksIdx'][playlist_id,:].reshape([-1,1])
        name_vec = f['nameVec'][playlist_id,:].reshape([-1,40])
        tracks_vec = f['tracksVec'][playlist_id,:].reshape([-1,4000])
        pid = f['pid'][playlist_id,:][0]
        row = [pid]
        predictions = model.predict([num_tracksIdx,name_vec,tracks_vec])

        # Sanity check: the HDF5 rows and the JSON playlists are expected to
        # be stored in the same order.
        js_pid = slice['playlists'][playlist_id]['pid']
        if js_pid != pid:
            print("Does't match!")
            count_misMatch += 1
        tracksIdx = slice['playlists'][playlist_id]['tracksIdx']
        # Set lookup instead of scanning the list for every candidate
        # (assumes tracksIdx is a flat list of integer indices -- TODO confirm).
        known_tracks = set(tracksIdx)
        break_flag = 0  # number of recommendations collected for this playlist
        # Walk the 600 highest-scoring indices in descending score order; the
        # window is larger than 500 to leave room for indices that are
        # skipped because they are already listed in tracksIdx.
        for trackIdx in np.flip(np.argsort(predictions)[0,-600:],axis=-1):
            if trackIdx not in known_tracks:
                row.append('spotify:track:'+idx2tracksUrl[trackIdx])
                break_flag += 1
                count += 1
            if break_flag == 500:
                break
        csvwriter.writerow(row)
print('count:',count)
print('misMatch:',count_misMatch)
print('Done!')

In [None]:
# Merge into a complete submission

import csv

# Concatenate the ten per-category result files (challenge_1.csv through
# challenge_10.csv, in that order) into a single submission.csv.
count = 0  # number of non-empty rows copied into the submission

# The output is opened once in 'w' mode so a re-run always starts from an
# empty file; newline='' on both sides is what the csv module requires and
# prevents doubled line endings on Windows.
with open('C:/Users/zwang10/Research/submission.csv','w',newline='') as sub:
    writer = csv.writer(sub)
    for category in range(1, 11):
        part_path = 'C:/Users/zwang10/Research/challenge_{}.csv'.format(category)
        with open(part_path,'r',newline='') as part:
            for row in csv.reader(part):
                # Blank lines (e.g. from a part file that was written
                # without newline='') carry no data; skip them so they are
                # neither copied into the submission nor counted.
                if not row:
                    continue
                writer.writerow(row)
                count += 1
print('count:',count)