In [None]:
from tqdm import tqdm
import pandas as pd
import pickle
import os
from scipy import sparse
%matplotlib notebook

In [None]:
# Track metadata: one row per line, fields separated by the literal '<SEP>'.
# A multi-character separator is only supported by the python parsing engine.
song_info = pd.read_csv(
    'data/unique_tracks.txt',
    sep='<SEP>',
    header=None,
    engine='python',
)
# The file has no header row, so the four columns are named by hand.
# NOTE(review): confirm that the first field really is an artist id.
song_info.columns = [
    'artist_id',
    'song_id',
    'artist_name',
    'song_name',
]

### Iterative method to build the dataset (without hitting memory constraints)

In [None]:
def get_song_artist(song_id):
    """Return the artist name recorded for ``song_id``.

    The lookup goes through the notebook-level ``song_id_x_artist_name``
    dict, which must exist before this function is called. An unknown
    ``song_id`` raises ``KeyError``.
    """
    artist_name = song_id_x_artist_name[song_id]
    return artist_name

folder_out = 'data/tmp/'
# Create the scratch folder up front so the per-chunk dumps below do not fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(folder_out, exist_ok=True)

# song_id -> artist_name lookup read by get_song_artist(). It depends only on
# song_info, so it is built once here instead of once per chunk.
song_id_x_artist_name = dict(zip(song_info['song_id'], song_info['artist_name']))

# Stream the triplets file in 500k-row chunks so the whole file never has to
# fit in memory. Only the first two columns are read; any further columns of
# the file are ignored.
triplet_chunks = pd.read_csv('data/train_triplets.txt',
                             chunksize=500000,
                             sep='\t',
                             header=None,
                             usecols=[0, 1],
                             )

for num, user_track in enumerate(tqdm(triplet_chunks, desc='chunks')):
    user_track.columns = ['user_id', 'song_id']
    # Raises KeyError if a song_id is missing from song_info (fail fast rather
    # than silently dropping rows).
    user_track['artist_name'] = user_track['song_id'].apply(get_song_artist)

    # user x artist matrix for this chunk; each cell is the number of rows
    # (i.e. songs) the user has for that artist within the chunk.
    user_x_artist = user_track.pivot_table(index='user_id',
                                           columns='artist_name',
                                           values='song_id',
                                           aggfunc='count',
                                           )

    # One pickle per chunk, named '<chunk number>.pkl'.
    # NOTE: pd.SparseDataFrame only exists in pandas < 1.0.
    file_name = folder_out + str(num) + '.pkl'
    with open(file_name, 'wb') as f:
        pickle.dump(pd.SparseDataFrame(user_x_artist), f)

    # Drop the references before the next chunk is read to keep peak memory low.
    del user_x_artist
    del user_track

# The lookup is only needed while the chunks are processed.
del song_id_x_artist_name


In [None]:
# Folder holding the per-chunk pickles, written as '<chunk number>.pkl'.
# Defined here so this cell also runs on a fresh kernel when the (expensive)
# chunking cell is skipped.
folder_in = 'data/tmp/'
pkl_suffix = '.pkl'

# Keep only the numbered chunk dumps (so a stray file such as 'final.pkl' is
# never merged into the result) and read them in chunk order, because
# os.listdir() returns names in arbitrary order.
chunk_files = sorted(
    (name for name in os.listdir(folder_in)
     if name.endswith(pkl_suffix) and name[:-len(pkl_suffix)].isdecimal()),
    key=lambda name: int(name[:-len(pkl_suffix)]),
)

chunks = []
for file in tqdm(chunk_files):
    file_path = os.path.join(folder_in, file)
    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # notebook produced itself.
    with open(file_path, 'rb') as f:
        chunks.append(pickle.load(f))

# A single concat instead of DataFrame.append inside the loop: append copies
# the whole accumulated frame on every iteration (quadratic overall).
# NOTE(review): rows are stacked as-is, so a user that occurs in more than one
# chunk ends up with several rows in `dataset`; aggregate per user_id
# downstream if one row per user is required.
dataset = pd.concat(chunks) if chunks else pd.DataFrame()
del chunks

file_path = '../recommedation_service/data/final.pkl'
with open(file_path, 'wb') as f:
    pickle.dump(dataset, f)

In [11]:
# Print a summary of the merged frame: index and column ranges, dtypes and
# memory usage.
dataset.info()

<class 'pandas.core.sparse.frame.SparseDataFrame'>
Index: 1019412 entries, 0000f88f8d76a238c251450913b0d070e4a77d19 to fffca8193876a33f6c4d8a18b29b69ab247aa841
Columns: 42053 entries, !!! to üNN
dtypes: float64(42053)
memory usage: 235.6+ MB


### Check final dataset sparsity 

In [None]:
# Reload the merged dataset from the location the merge cell writes it to, so
# this section also works after a kernel restart without copying files by hand.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook produced itself.
with open('../recommedation_service/data/final.pkl', 'rb') as f:
    dataset = pickle.load(f)

In [4]:
# Sparse (CSR) copy of the user x artist frame.
dataset_s = sparse.csr_matrix(dataset.to_coo())

# Number of possible interactions: every cell of the matrix.
n_rows, n_cols = dataset_s.shape
matrix_size = n_rows * n_cols

# Number of cells that actually hold a non-zero value.
nonzero_rows = dataset_s.nonzero()[0]
num_ratings = nonzero_rows.size

# Share of empty cells, as a percentage.
density = num_ratings / matrix_size
sparsity = 100 * (1 - density)
sparsity

99.93033133239032