# Mosaico Musical

### Musical recommender by Alberto Antón as a final project for the Master in Data Science of KSchool



In [44]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import io
import os
import sys

In [3]:
# display results to 3 decimal points, not in scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Load data

In [4]:
data_root = "data"

In [5]:
columns = ['user_id', 'song_id', 'num_plays']
datafile = os.path.join(data_root, "train_triplets.txt")
print(datafile)
data = pd.read_csv(datafile, 
                   sep='\t', 
                   header = None,
                   names = columns)
data.head()

data\train_triplets.txt


Unnamed: 0,user_id,song_id,num_plays
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [6]:
data.describe()

Unnamed: 0,num_plays
count,48373586.0
mean,2.867
std,6.438
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,9667.0


In [7]:
# Read songs dataset

In [8]:
columns = ["foo", "song_id", "artist", "title"]
datafile = os.path.join(data_root, "unique_tracks.txt")
all_songs = pd.read_csv(datafile, 
                        header = None,
                        sep = '<SEP>',
                        names = columns,
                        usecols = ["song_id", "artist", "title"],
                        encoding =  "utf-8",
                        engine = "python")

all_songs.head()

Unnamed: 0,song_id,artist,title
0,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens


In [9]:
all_songs.describe()

Unnamed: 0,song_id,artist,title
count,1000000,1000000,999985
unique,999056,72665,702000
top,SOUYQYY12AF72A000F,Michael Jackson,Intro
freq,3,194,1511


In [10]:
# Number of songs in data
data.song_id.unique().shape[0]

384546

In [12]:
# Number of songs in all_songs
all_songs.song_id.unique().shape[0]

999056

In [13]:
# Let's keep only the songs that are in data
unique_songs_tmp = data.song_id.unique()
unique_songs_df = pd.DataFrame(unique_songs_tmp, columns = ['song_id'])

In [14]:
songs = unique_songs_df.merge(all_songs,
                       left_on = "song_id",
                       right_on = "song_id",
                       how = "left")[["song_id", "artist", "title"]]

In [15]:
songs.describe()

Unnamed: 0,song_id,artist,title
count,385256,385256,385252
unique,384546,42062,306785
top,SOBPAEP12A58A77F49,Beastie Boys,Intro
freq,3,139,526


In [16]:
songs.head()

Unnamed: 0,song_id,artist,title
0,SOAKIMP12A8C130995,Jack Johnson,The Cove
1,SOAPDEY12A81C210A9,Billy Preston,Nothing from Nothing
2,SOBBMDR12A8C13253B,Paco De Lucia,Entre Dos Aguas
3,SOBFNSP12AF72A0E22,Josh Rouse,Under Cold Blue Stars
4,SOBFOVM12A58A7D494,The Dead 60s,Riot Radio (Soundtrack Version)


In [17]:
# Songs dataframe count is 385.256, while there are 384.546 unique songs. 
# The difference are duplicated songs

In [19]:
# Let's take a look
songs[songs.duplicated(subset=['song_id'], keep=False)].head(6)

Unnamed: 0,song_id,artist,title
9,SOBXHDL12A81C204C0,Kanye West,Stronger
10,SOBXHDL12A81C204C0,Kanye West,Stronger
184,SODGVGW12AC9075A8D,Justin Bieber,Somebody To Love
185,SODGVGW12AC9075A8D,Justin Bieber,Somebody To Love
191,SOKOXWU12AF72AD1BC,Eminem,The Real Slim Shady
192,SOKOXWU12AF72AD1BC,Eminem,The Real Slim Shady


In [20]:
# Remove those duplicates
songs.drop_duplicates(subset = "song_id", inplace = True)
songs.describe()

Unnamed: 0,song_id,artist,title
count,384546,384546,384542
unique,384546,42055,306720
top,SOSSQIP12AB0186F0D,Beastie Boys,Intro
freq,1,136,526


In [43]:
# Saving songs dataframe
songs.to_pickle("songs.pkl")

In [21]:
# Now count and unique are the same

In [46]:
# Remove all_songs from memory
all_songs = None

In [47]:
# Funcion that returns the artist and title of a song_id
def which_song(sid):
    return songs[songs["song_id"] == sid][["artist", "title"]]

In [48]:
which_song("SOSDCFG12AB0184647")

Unnamed: 0,artist,title
26444,Lena Philipsson,6


As this dataset is part of a closed competition, there is no test set, so we have to create it from the train set.

If we want to recommend something to a new user, either we know something about him or we can't but recommend the most popular songs.

Let's go for the first option, so we'll present the user random songs until he selects 10 that he likes

In [49]:
# Let show random artists, then the users selects an artis an we present its songs. The users selects
# thos he likes (or none)

In [50]:
# Show random artist
songs.artist.sample(20)

160353       Clube Do Balanco
110877                Emarosa
372427       Conjunto Clasico
219499            Prem Joshua
305977             Marc Ribot
254931     Nick Glennie-Smith
123339            Fiona Apple
270419                    ZPU
78858                   Kinky
155660              The Dears
33642           The Melodians
41224      Magtens Korridorer
69090              Elton John
64753           Lucila Campos
372301                  Mia X
75538                Deadsoil
237291          Dulce Liquido
332277       Long Tall Texans
13755     Jean-Yves Thibaudet
218444                  Becky
Name: artist, dtype: object

In [41]:
# Function that shows the songs of an artist
def songs_by(artist_id):
    return songs[songs["artist"] == artist_id][["song_id", "title"]]

In [42]:
songs_by("The Waterboys")

Unnamed: 0,song_id,title
23627,SOIQRKG12A6D4F63A3,Rags (Second Amendment)
36813,SOYQYTX12AB0186FFA,Bury My Heart
56909,SOYCNZF12A6D4F7F87,Fisherman's Blues
79882,SOGEJMW12A6D4F8626,When Will We Be Married (1987 Recording)
79887,SOLPBTJ12A6D4F5757,The Dance At The Crossroads
79900,SOPTUEN12A6D4F5751,Every Breath Is Yours
79914,SOWICCP12A6D4F5756,Always Dancing_ Never Getting Tired
97338,SOPARQV12A58A79DC9,A Pagan Place
115100,SODAYPE12A6D4F7F8E,This Is The Sea
175762,SOLLIIT12A8C133154,A Song For The Life (2008 Digital Remaster)


In [None]:
songs.sample(50)

In [None]:
# Randomly selected songs for the test
selected_songs = ["SOBWKZB12A8C136891", "SOSWNMA12A6D4F796F", "SOXWOLZ12A8C135F54", "SONWNJJ12A67AD8492", \
                  "SOHQCOD12AB018316C", "SOFDKPU12A58A7A06A", "SOVKUHB12AAF3B3636", "SONJNVN12A6701C689", \
                  "SOMMXAD12A58A76FBF", "SOZYUBF12AB018337B"]

In [56]:
# Let's get the songs from a real user
datafile = os.path.join(data_root, "train_listenings.pkl")
user_list = pd.read_pickle(datafile)
train_user = pd.DataFrame(user_list, columns=['user_id', 'song_id', 'mun_plays'])
train_user.head()


Unnamed: 0,user_id,song_id,mun_plays
0,00000b722001882066dff9d2da8a775658053ea0,SORDKNX12A8C13A45F,1
1,00000b722001882066dff9d2da8a775658053ea0,SOFLJQZ12A6D4FADA6,1
2,00000b722001882066dff9d2da8a775658053ea0,SOUBEXV12AB01804A4,1
3,00000b722001882066dff9d2da8a775658053ea0,SOKBXYC12A6D4F59D6,1
4,00000b722001882066dff9d2da8a775658053ea0,SOJOJUN12A8AE47E1D,1


In [57]:
train_user.describe()

Unnamed: 0,user_id,song_id,mun_plays
count,10,10,10
unique,1,10,2
top,00000b722001882066dff9d2da8a775658053ea0,SOJOJUN12A8AE47E1D,1
freq,10,1,9


In [59]:
# Lets make a list of songs from the train user
selected_songs = train_user['song_id']

In [88]:
# Titles of the songs played by the train user
train_user.merge(songs,
                left_on = 'song_id',
                right_on = 'song_id',
                how = 'inner')[['artist', 'title']]

Unnamed: 0,artist,title
0,Giles,Replay (Album Version)
1,Cartola,Tive Sim
2,Miley Cyrus,Don't Walk Away
3,Jim Gaffigan,Gravy Drinker (LP Version)
4,Edwyn Collins,Superstar Talking Blues
5,Tab Benoit,Jambalaya
6,MC Solaar,Nouveau western
7,Blue Man Group,Rods And Cones
8,John Parish / PJ Harvey,Heela
9,Jim Gaffigan,Hooooot Pocket! (LP Version)


In [94]:
# Now let's find other users that listened to those songs
similar_users = data[data["song_id"].isin(selected_songs)]['user_id'].reset_index(name='user_id')

In [95]:
similar_users.head()

Unnamed: 0,index,user_id
0,2030,2b6c2f33bc0e887ea7c4411f58106805a1923280
1,3047,b6b799f34a204bd928ea014c243ddad6d0be4f8f
2,3272,732f88be38fae217f8ab7e24c20dd072436e3e40
3,4035,9fba771d9731561eba47216f6fbfc0023d88641b
4,4111,62f2f9b881dc320d745a90c0c10528d18e10deb1


In [83]:
similar_users.describe()

Unnamed: 0,index
count,75588.0
mean,24087314.656
std,13933310.624
min,2030.0
25%,11970264.75
50%,24156641.0
75%,36118919.0
max,48373263.0


In [89]:
# Create a data frame with all the songs of our similar users
# What for???
similar_users_plays = data.merge(similar_users, 
                           left_on = 'user_id', 
                           right_on = 'user_id', 
                           how = 'inner')[["user_id", "song_id", "num_plays"]]

In [90]:
similar_users_plays.head()

Unnamed: 0,user_id,song_id,num_plays
0,2b6c2f33bc0e887ea7c4411f58106805a1923280,SOAUWYT12A81C206F1,6
1,2b6c2f33bc0e887ea7c4411f58106805a1923280,SOBMPJM12A8C13BAE2,1
2,2b6c2f33bc0e887ea7c4411f58106805a1923280,SOBONKR12A58A7A7E0,2
3,2b6c2f33bc0e887ea7c4411f58106805a1923280,SOBYSSP12AAF3B32CA,2
4,2b6c2f33bc0e887ea7c4411f58106805a1923280,SOCBDPR12A6701F792,1


In [91]:
similar_users_plays['song_id'].describe()

count                5336240
unique                219078
top       SOFLJQZ12A6D4FADA6
freq                   60803
Name: song_id, dtype: object

In [98]:
# Let's see how many of our songs those users have played

data[data["song_id"].isin(selected_songs)].groupby(['user_id']).size().sort_values(ascending = False)

user_id
00000b722001882066dff9d2da8a775658053ea0    10
20ff5a5506da7eef77835a0fe84a006e3c525a39     6
a32cfe061d740a7236d8727946f81f1a15c7028f     6
94e3357310cd860548432d5fb68a88c3032ca1e8     6
ed6426b2069480821ad8271b8c47ddd6df2ff674     5
edb5a29d03eb009f100aab5999aceb530c07a065     5
8b8f370f1bcbea4fc8b82fda3befb80bb9533253     5
9f85a4c39ee437c73fca1cb43b09e7283a06f9de     5
970b23b0a4a725d51545c9a6e333500a48271b03     5
2343b6f1e4b7408f7463aa5c00008296a6822dbf     5
340bc3cfcc095dc8ef3a90f83839b77c232ffa58     5
70d1b8f61b9b79c4fba9f764807bff3d02e6c647     5
8e56e10637928a0bd88409211e06232ddf0f0e32     5
96c8d38bbb32ef781b96b33e50ad9c03eb62c340     5
aab42f141f70914c82ccca5cecee6942270ee588     5
44ace73d853dbe305c67dca4f1b02913e3075450     5
47988192adad5b09d8a7ee7f91268757d9b91988     5
01b44213810365eff7d555ebdb84b91cdb275271     5
845d41a896304caa3ea1a0c588bc9fc745fa24ea     4
04e60c0940bd7a022038e219c528c1e27cd042ce     4
b97b0284199acd6f6020f2a19aa479bac0b7d100     4
08eb4

In [None]:
# Function that selects all the songs of a user
def songs_of_user(u_id):
    return data[data["user_id"] == u_id]

This phase is not yet completed, as we have to assing a weight to the users depending on similarity (number of my songs played) with me, but right now we are going to make a rough estimation.

Let's find out which are the most popular songs amongst my "similar" users. We could order the songs by sum of play count, but ths method is very sensible to outliers (a user may have listened a hundred times to the same song), so we are going to find the most popular (most repeated) songs among these users EXCLUDING MY SELECTED_SONGS!!!!!

In [102]:
prediction_df = similar_users.merge(data[~data.song_id.isin(selected_songs)],
                   left_on = 'user_id',
                   right_on = 'user_id')

# data.merge(similar_users,
#           left_on = 'user_id',
#           right_on = 'user_id',
#           )

# data["song_id"].isin(selected_songs)


# airport_freq.merge(airports[airports.ident == 'KLAX'][['id']], 
#                    left_on='airport_ref', 
#                    right_on='id', 
#                    how='inner')[['airport_ident', 'type', 'description', 'frequency_mhz']]

In [106]:
prediction_df.user_id.unique().shape[0]

70170

In [109]:
# Let's chack that the selected songs are not in the prediction dataframe

prediction_df[prediction_df.song_id == "SORDKNX12A8C13A45F"]

# Crrect result has to be an empty dataframe

Unnamed: 0,index,user_id,song_id,num_plays


In [114]:
# Let's also check that the train user is not in the prediction_df
prediction_df[prediction_df.user_id == "00000b722001882066dff9d2da8a775658053ea0"]

Unnamed: 0,index,user_id,song_id,num_plays
1338114,12098218,00000b722001882066dff9d2da8a775658053ea0,SOCTXQW12A6D4F70AD,1
1338115,12098218,00000b722001882066dff9d2da8a775658053ea0,SOCZQCY12AC468E40F,1
1338116,12098219,00000b722001882066dff9d2da8a775658053ea0,SOCTXQW12A6D4F70AD,1
1338117,12098219,00000b722001882066dff9d2da8a775658053ea0,SOCZQCY12AC468E40F,1
1338118,12098222,00000b722001882066dff9d2da8a775658053ea0,SOCTXQW12A6D4F70AD,1
1338119,12098222,00000b722001882066dff9d2da8a775658053ea0,SOCZQCY12AC468E40F,1
1338120,12098223,00000b722001882066dff9d2da8a775658053ea0,SOCTXQW12A6D4F70AD,1
1338121,12098223,00000b722001882066dff9d2da8a775658053ea0,SOCZQCY12AC468E40F,1
1338122,12098224,00000b722001882066dff9d2da8a775658053ea0,SOCTXQW12A6D4F70AD,1
1338123,12098224,00000b722001882066dff9d2da8a775658053ea0,SOCZQCY12AC468E40F,1


In [115]:
# Here it is. Let's remove it
prediction_df = prediction_df[prediction_df.user_id != "00000b722001882066dff9d2da8a775658053ea0"]

In [116]:
prediction_df[prediction_df.user_id == "00000b722001882066dff9d2da8a775658053ea0"]

Unnamed: 0,index,user_id,song_id,num_plays


In [None]:
# It is not there anymore

In [127]:
# Let's now find the most played songs in the prediction dataframe
predicted_songs = prediction_df.\
                    groupby(['song_id']).\
                    size().\
                    sort_values(ascending = False).\
                    head(20).\
                    to_frame("popularity").\
                    reset_index()

In [128]:
predicted_songs.head()

Unnamed: 0,song_id,popularity
0,SOAUWYT12A81C206F1,28056
1,SOBONKR12A58A7A7E0,26453
2,SOEGIYH12A6D4FC0E3,22394
3,SOSXLTC12AF72A7F54,20894
4,SOFRQTD12A81C233C0,19347


In [129]:
# Let's see the songs in a human readable way
predicted_songs.merge(songs,
                left_on = 'song_id',
                right_on = 'song_id',
                how = 'inner')[['artist', 'title']]

Unnamed: 0,artist,title
0,Björk,Undo
1,Dwight Yoakam,You're The One
2,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...
3,Kings Of Leon,Revelry
4,Harmonia,Sehr kosmisch
5,Florence + The Machine,Dog Days Are Over (Radio Edit)
6,Five Iron Frenzy,Canada
7,OneRepublic,Secrets
8,Tub Ring,Invalid
9,Lonnie Gordon,Catch You Baby (Steve Pitron & Max Sanna Radio...


In [132]:
# This would be our recommendation. Now we have to score it

In [133]:
# Let's read the test file with the rest of the songs of the user to see if he has also listened to any of our recommendations

In [134]:
datafile = os.path.join(data_root, "test_listenings.pkl")
test_list = pd.read_pickle(datafile)
test_user = pd.DataFrame(test_list, columns=['user_id', 'song_id', 'mun_plays'])
test_user.head()

Unnamed: 0,user_id,song_id,mun_plays
0,00000b722001882066dff9d2da8a775658053ea0,SOCTXQW12A6D4F70AD,1
1,00000b722001882066dff9d2da8a775658053ea0,SOCZQCY12AC468E40F,1


In [135]:
test_user.merge(songs,
                left_on = 'song_id',
                right_on = 'song_id',
                how = 'inner')[['artist', 'title']]

Unnamed: 0,artist,title
0,Nick Cave & The Bad Seeds,Babe_ You Turn Me On (Paris 2)
1,Ana Carolina,Ela É Bamba
