# Mosaico Musical

### Musical recommender by Alberto Antón as a final project for the Master in Data Science of KSchool



In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import io
import os
import sys

In [2]:
# display results to 3 decimal points, not in scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Load data

In [3]:
data_root = "data"

In [18]:
columns = ['user_id', 'song_id', 'num_plays']
datafile = os.path.join(data_root, "train_triplets.txt")
print(datafile)
data = pd.read_csv(datafile, 
                   sep='\t', 
                   header = None,
                   names = columns)
data.head()

data\train_triplets.txt


Unnamed: 0,user_id,song_id,num_plays
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [5]:
# 8GB of data, but not all of them are loaded in memory at once
print (sys.getsizeof(data))

8707245584


In [6]:
# Read songs dataset

In [37]:
columns = ["foo", "song_id", "artist", "title"]
datafile = os.path.join(data_root, "unique_tracks.txt")
all_songs = pd.read_csv(datafile, 
                        header = None,
                        sep = '<SEP>',
                        names = columns,
                        usecols = ["song_id", "artist", "title"],
                        encoding =  "utf-8",
                        engine = "python")

all_songs.head()

Unnamed: 0,song_id,artist,title
0,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens


In [26]:
# Number of songs in data
data.song_id.unique().shape[0]

384546

In [27]:
# Number of songs in all_songs
all_songs.song_id.unique().shape[0]

999056

In [39]:
all_songs.describe()

Unnamed: 0,song_id,artist,title
count,1000000,1000000,999985
unique,999056,72665,702000
top,SOUYQYY12AF72A000F,Michael Jackson,Intro
freq,3,194,1511


In [40]:
# Let's keep only the songs that match

songs = all_songs.merge(data,
                       left_on = "song_id",
                       right_on = "song_id",
                       how = "inner")[["song_id", "artist", "title"]]

# airport_freq.merge(airports[airports.ident == 'KLAX'][['id']], 
#                    left_on='airport_ref', 
#                    right_on='id', 
#                    how='inner')[['airport_ident', 'type', 'description', 'frequency_mhz']]

In [29]:
# Number of songs in songs
songs.song_id.unique().shape[0]

384546

In [41]:
songs.describe()

Unnamed: 0,song_id,artist,title
count,49664528,49664528,49664463
unique,384546,42062,306785
top,SOFRQTD12A81C233C0,Coldplay,Sehr kosmisch
freq,110479,455816,110479


In [None]:
# Removing duplicates from songs
songs.drop_duplicates(subset=inplace=True)

In [34]:
songs.head()

Unnamed: 0,song_id,artist,title
0,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
2,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
3,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés


In [32]:
# Remove all_songs from memory
all_songs = None

In [33]:
# Funcion that returns the artis and title of a song_id
def which_song(sid):
    return songs[songs.song_id == sid]

In [35]:
which_song("SOVFVAK12A8C1350D9")

Unnamed: 0,song_id,artist,title


As this dataset is part of a closed competition, there is no test set, so we have to create it from the train set.

In [14]:
## TODO Separate sets
# Take a look at how C:\Users\laanton\Documents\Master_en_Data_Science\17.Recommendation_Systems\ml-100k\allbut.pl makes them
# To know how to split hte sets, let's find out the average number of listens per user




In [48]:
data['user_id'][:20]


0     b80344d063b5ccb3212f76538f3d9e43d87dca9e
1     b80344d063b5ccb3212f76538f3d9e43d87dca9e
2     b80344d063b5ccb3212f76538f3d9e43d87dca9e
3     b80344d063b5ccb3212f76538f3d9e43d87dca9e
4     b80344d063b5ccb3212f76538f3d9e43d87dca9e
5     b80344d063b5ccb3212f76538f3d9e43d87dca9e
6     b80344d063b5ccb3212f76538f3d9e43d87dca9e
7     b80344d063b5ccb3212f76538f3d9e43d87dca9e
8     b80344d063b5ccb3212f76538f3d9e43d87dca9e
9     b80344d063b5ccb3212f76538f3d9e43d87dca9e
10    b80344d063b5ccb3212f76538f3d9e43d87dca9e
11    b80344d063b5ccb3212f76538f3d9e43d87dca9e
12    b80344d063b5ccb3212f76538f3d9e43d87dca9e
13    b80344d063b5ccb3212f76538f3d9e43d87dca9e
14    b80344d063b5ccb3212f76538f3d9e43d87dca9e
15    b80344d063b5ccb3212f76538f3d9e43d87dca9e
16    b80344d063b5ccb3212f76538f3d9e43d87dca9e
17    b80344d063b5ccb3212f76538f3d9e43d87dca9e
18    b80344d063b5ccb3212f76538f3d9e43d87dca9e
19    b80344d063b5ccb3212f76538f3d9e43d87dca9e
Name: user_id, dtype: object

In [41]:
grouped_data = data.groupby('user_id')['user_id'].count()
grouped_data.head(20)

user_id
00000b722001882066dff9d2da8a775658053ea0     12
00001638d6189236866af9bbf309ae6c2347ffdc     14
0000175652312d12576d9e6b84f600caa24c4715     12
00001cf0dce3fb22b0df0f3a1d9cd21e38385372     18
0000267bde1b3a70ea75cf2b2d216cb828e3202b     25
00003a4459f33b92906be11abe0e93efc423c0ff     16
00004fb90a86beb8bed1e9e328f5d9b6ee7dc03e     47
00005c6177188f12fb5e2e82cdbd93e8a3f35e64     16
000060ca4e6bea0a5c9037fc1bbd7bbabb98c754     34
00007ed2509128dcdd74ea3aac2363e24e9dc06b     60
00007f902e31b0693a023e9c234461d4e6991eec     21
00008c7cc5606ba35c2595b0b3bb1630d4685028     17
00009d93dc719d1dbaf13507725a03b9fdeebebb     48
0000ad91634a2124eb9a84486443ed9a87a6ea8b     13
0000bb531aaa657c932988bc2f7fd7fc1b2050ec    148
0000d3c803e068cf1da17724f1674897b2dd7130     73
0000d9fcab0c32065aa62737de8c1b35f026b51c     22
0000f88f8d76a238c251450913b0d070e4a77d19    138
000138e252eea35fd73aaf66a9b34102b695a9c8     75
000142b8bf82245279fe63c4a53c816ccc2efa8a     13
Name: user_id, dtype: int64

In [42]:
print (type(grouped_data))

<class 'pandas.core.series.Series'>


In [57]:
print ("Mean: {0}\nMax: {1}\nMin: {2}".format(grouped_data.mean(), grouped_data.max(), grouped_data.min()))

Mean: 47.45681524313316
Max: 4400
Min: 10


In [79]:
grouped_data['00001638d6189236866af9bbf309ae6c2347ffdc']

14

In [80]:
data[data['user_id'] == '00001638d6189236866af9bbf309ae6c2347ffdc']

Unnamed: 0,user_id,song_id,num_plays
33672912,00001638d6189236866af9bbf309ae6c2347ffdc,SOAORYL12A67AD8187,7
33672913,00001638d6189236866af9bbf309ae6c2347ffdc,SOBFEDK12A8C13BB25,1
33672914,00001638d6189236866af9bbf309ae6c2347ffdc,SOCIJBR12AB017BD92,1
33672915,00001638d6189236866af9bbf309ae6c2347ffdc,SOEKYTM12A8C13CBF4,1
33672916,00001638d6189236866af9bbf309ae6c2347ffdc,SOFFWTH12A6310D9E8,2
33672917,00001638d6189236866af9bbf309ae6c2347ffdc,SOFXSRW12A6D4F3B77,1
33672918,00001638d6189236866af9bbf309ae6c2347ffdc,SOLODPO12AB017F217,4
33672919,00001638d6189236866af9bbf309ae6c2347ffdc,SOLOYFG12A8C133391,1
33672920,00001638d6189236866af9bbf309ae6c2347ffdc,SONGKIR12A58A779D3,5
33672921,00001638d6189236866af9bbf309ae6c2347ffdc,SOOEPEG12A6D4FC7CA,1


In [None]:
# to create the training and test sets, hold out is enough, as we want to separate a percentage of the songs per user, 
# so all the users are in both sets

In [90]:
len(data)

48373586

In [96]:
for index, row in data[:5].iterrows():
    print ("Index: {0}\nData: {1}".format(index, row))

Index: 0
Data: user_id      b80344d063b5ccb3212f76538f3d9e43d87dca9e
song_id                            SOAKIMP12A8C130995
num_plays                                           1
Name: 0, dtype: object
Index: 1
Data: user_id      b80344d063b5ccb3212f76538f3d9e43d87dca9e
song_id                            SOAPDEY12A81C210A9
num_plays                                           1
Name: 1, dtype: object
Index: 2
Data: user_id      b80344d063b5ccb3212f76538f3d9e43d87dca9e
song_id                            SOBBMDR12A8C13253B
num_plays                                           2
Name: 2, dtype: object
Index: 3
Data: user_id      b80344d063b5ccb3212f76538f3d9e43d87dca9e
song_id                            SOBFNSP12AF72A0E22
num_plays                                           1
Name: 3, dtype: object
Index: 4
Data: user_id      b80344d063b5ccb3212f76538f3d9e43d87dca9e
song_id                            SOBFOVM12A58A7D494
num_plays                                           1
Name: 4, dtype: object


In [99]:
for index, row in data[:5].iterrows():
    print (grouped_data[row['user_id']])

104
104
104
104
104


In [100]:
midict = {}
len(midict[1])

KeyError: 1

In [None]:
test_pct = 0.2
train_dict = {}
test_dict = {}
# Read user data line by line
for index, row in data[:100].iterrows():
# read his number of songs
    user = row['user_id']
    num_songs = grouped_data[user]
# Check if user exists in test dictionary
    if user not in test_dict:
        # If it doesn't, add it to both dictionaries
        test_dict[user] = 
    else:
    my_dict[key] = 1
# Check if it has the test elements complete
    
# Si no lo tiene, mediante un aleatorio (1,0 T,F, Sí,No) decidimos si lo mandamos a train o test


In [15]:
an_users = data.user_id.unique().shape[0]
n_songs = data.song_id.unique().shape[0]
print("There are %s users and %s songs" %(n_users, n_songs))

There are 1019318 users and 384546 songs


In [5]:
data.describe()

Unnamed: 0,num_plays
count,48373586.0
mean,2.867
std,6.438
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,9667.0


In [None]:
## TODO Load song information and remove all lines from data without a match