# Mosaico Musical

### Musical recommender by Alberto Antón as a final project for the Master in Data Science of KSchool


In [21]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import io
import os
import sys
import random

In [221]:
# display results to 3 decimal points, not in scientific notation, and thousands separator
pd.set_option("display.float_format", lambda x: "{:,.2f}".format(x))

In [22]:
# Set random seed
random.seed(666)

### Loading data

In [3]:
data_root = "data"

In [5]:
# Load training dataset
columns = ["user_id", "song_id", "num_plays"]
datafile = os.path.join(data_root, "train_triplets.txt")

data = pd.read_csv(datafile, 
                   sep=""\t", 
                   header = None,
                   names = columns)

In [6]:
# Let's get a glimpse of the data
data.head()

Unnamed: 0,user_id,song_id,num_plays
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [15]:
data.describe()

Unnamed: 0,num_plays
count,48373586.0
mean,2.87
std,6.44
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,9667.0


In [16]:
# Let's analyze num_plays column a little deeper
data.num_plays.describe()

count   48,373,586.00
mean             2.87
std              6.44
min              1.00
25%              1.00
50%              1.00
75%              3.00
max          9,667.00
Name: num_plays, dtype: float64

In [17]:
# most of the songs have been played only one time, and there are very large outliers, 
# so we will not be using num_plays field.

We already have the listenings dataset, now let's load the song information dataset

In [60]:
columns = ["foo", "song_id", "artist", "title"]
datafile = os.path.join(data_root, "unique_tracks.txt")

all_songs = pd.read_csv(datafile, 
                        header = None,
                        sep = "<SEP>",
                        names = columns,
                        usecols = ["song_id", "artist", "title"],
                        encoding =  "utf-8",
                        engine = "python")

In [61]:
all_songs.sample(10)

Unnamed: 0,song_id,artist,title
299918,SOVSONN12A8C133578,The Charlie Daniels Band,Tennessee
600203,SOCGOAR12A6D4F5EA3,The Beautiful South,Good As Gold (Stupid As Mud)
690933,SOKLENT12A8C1401D7,James Galway;Daniel Measham,Waltzing Matilda
413968,SOMAEWN12AB017E37F,zebrahead,All for none and none for all
425908,SOKHRQI12A8C13F53E,Cam'Ron / Juelz Santana / Jimmy Jones,Come Home With Me
934697,SOXNBUL12AC468D570,Slowpho,Dial (Stupid Høghus Remix)
117534,SOZFNVI12AB018887D,Amanda,Ingenstans
463207,SOIRJRE12A8C135117,Southern Tenant Folk Union,Southern Folk Theme In A
787914,SOBATOZ12AB017FC6F,An Albatross,Dimensional Gymnastics
681144,SOQAMQQ12A8AE4866D,Annie Lennox,The Saddest Song (I've Got)


In [62]:
all_songs.describe()

Unnamed: 0,song_id,artist,title
count,1000000,1000000,999985
unique,999056,72665,702000
top,SONBEKD12AB01894DC,Michael Jackson,Intro
freq,3,194,1511


In [63]:
# There seems to be repeated some songs. Lest's find out
all_songs[all_songs.duplicated(subset=['song_id'], keep=False)].sort_values("song_id").head(10)

Unnamed: 0,song_id,artist,title
963681,SOAAEFC12AB01852F1,Tineke Schouten/Linda De Mol/Franklin Brown,De Tongbreker
304966,SOAAEFC12AB01852F1,Tineke Schouten,De Tongbreker (Tineke Schouten & Linda de Mol)
572832,SOACGAQ12A58A79805,Arctic Monkeys,Fire And The Thud
141269,SOACGAQ12A58A79805,Arctic Monkeys,Fire And The Thud
15028,SOADFGH12A6D4F74F7,Red Hot Chili Peppers,Falling Into Grace (Album Version)
522461,SOADFGH12A6D4F74F7,Red Hot Chili Peppers,Falling Into Grace (Album Version)
68224,SOADYVX12A8A9D9462,Rihanna,We Ride
655219,SOADYVX12A8A9D9462,Rihanna,We Ride
182693,SOAEIFW12A8C1391E4,Franz Ferdinand,Michael
525920,SOAEIFW12A8C1391E4,Franz Ferdinand,Michael


In [64]:
# Let's remove those duplicates
all_songs.drop_duplicates(subset = "song_id", inplace = True)
all_songs.describe()

Unnamed: 0,song_id,artist,title
count,999056,999056,999041
unique,999056,72652,701922
top,SOEATPZ12AB0182EA3,Johnny Cash,Intro
freq,1,191,1511


Now count and unique show the same ammount

In [29]:
# There is information of about one million songs. Let's see hoy many of these songs
# are in the training dataset
data.song_id.unique().shape[0]

384546

We don't need information for so many songs, so let's create a dataframe with information only on the songs that are in the training dataframe.

In [65]:
# Let's keep only the songs that are in data
unique_songs_df = pd.DataFrame(data.song_id.unique(), columns = ["song_id"])

In [66]:
songs = unique_songs_df.merge(all_songs, on = "song_id", how = "left")[["song_id", "artist", "title"]]

In [67]:
songs.describe()

Unnamed: 0,song_id,artist,title
count,384546,384546,384542
unique,384546,42055,306720
top,SOLSHRD12AB018BF73,Beastie Boys,Intro
freq,1,136,526


We can now release some memory discarding the all_songs dataframe

In [68]:
all_songs = None

If we want to recommend something to a new user, either we know something about him or we can't but recommend the most popular songs.

Let's go for the first option, so we'll choose some songs and make the recommender to select some new ones for us, but first, we are going to create some helper functions.

In [69]:
# Function that shows the songs of an artist
def songs_by(artist_name):
    return songs[songs["artist"] == artist_name][["song_id", "title"]]

In [70]:
# Let's try to find some rock songs an see the recommendation...
songs.artist.sample(25)

289281                                 Adultnapper
84521                                   Dusminguet
364797                            Thought Industry
243807                                  Juaninacka
38834                                         Koop
242478                               The Oppressed
202995                                        Malu
53805                                 Stevie Nicks
41188                             The Aqua Velvets
38899                                Easton Corbin
312948                 Dolly Parton & Kenny Rogers
213235                                Lizzy Borden
178214                                   All-4-One
288538                                 Alex Baroni
54787                                          Ayo
245196                   The Strange Fruit Project
217254                                   The Itals
244223    Steve Wynn & The Dragon Bridge Orchestra
21364                                         Salt
319667                         

In [72]:
songs_by('Metallica')

Unnamed: 0,song_id,title
1146,SOOEEPE12A8AE459A4,The Unforgiven III
1407,SOZATKE12A6D4F5915,2 X 4
1594,SOGAUIQ12A6D4F8262,Hit The Lights
1641,SOUGBIM12A6D4F8247,The Four Horsemen
1713,SOCHYVZ12A6D4F5908,Enter Sandman
1787,SOZDGEW12A8C13E748,One
1794,SOGMBXD12A6D4F5920,Ronnie
4702,SOJSRYJ12A6D4F824C,Phantom Lord
4712,SOMTBXX12AF729F5A6,Am I Evil?
4726,SORIEXB12A6D4F824D,No Remorse


We list some groups. When we find one we like, list its songs and get the song id. We do this until we have a small dataset of songs we like.

This is our metalhead selection: 

| Artist | song_id   |
|------|------|
| Thrice | SORJRTI12A6D4F7D67 |
| Clawfinger | SOSOPGB12A8C13C185 |
| Rammstein | SOSYHME12A8C135DD8 | 
| Rancid | SOSBJSU12A8C138469 | 
| Against Me! | SONJQZM12A6D4FBE30 |
| Millencolin | SOZVBUH12A8AE4745C |
| Van Halen | SOVMGEX12AC9070FF2 |
| Led Zeppelin | SOEHJKJ12A8C13CA4D |
| Staind | SOUNBBX12A6D4F338E |
| Monster Magnet | SOJKARY12A6701ED3F |
| Whitesnake | SOPCNEA12A67ADF48B |
| Puddle Of Mudd | SOTQVSE12A6D4F8200 |
| Green Day | SOTNYYH12A6701F94B |
| Metallica | SOSJRJP12A6D4F826F |

In [74]:
selected_songs = ["SORJRTI12A6D4F7D67", "SOSOPGB12A8C13C185", "SOSYHME12A8C135DD8", "SOSBJSU12A8C138469", \
                 "SONJQZM12A6D4FBE30", "SOZVBUH12A8AE4745C", "SOVMGEX12AC9070FF2", "SOEHJKJ12A8C13CA4D", \
                 "SOUNBBX12A6D4F338E", "SOJKARY12A6701ED3F", "SOPCNEA12A67ADF48B", "SOTQVSE12A6D4F8200", \
                 "SOTNYYH12A6701F94B", "SOSJRJP12A6D4F826F"]

In [86]:
# Now let's find other users that listened to these songs. we need it as a dataframe, so we'll use reset_index()
similar_users = data[data["song_id"].isin(selected_songs)]['user_id'].reset_index()

In [89]:
similar_users.head()

Unnamed: 0,index,user_id
0,6041,bd64f193f0f53f09d44ff48fd52830ff2fded392
1,7067,a520488fcf049bbb5cd847cfa4f884c740692780
2,7970,0ef42a19efb74d0a05c308d00636c8d8d41bec0c
3,8466,7661038e3e655fd31961ad18aea13dded963eedf
4,9524,12497e138741a0b94bb36a14bef32c9d0ee20fec


In [90]:
similar_users.user_id.describe()

count                                        36391
unique                                       35253
top       52542a715ba72e52eec99b277a42532c88615469
freq                                             4
Name: user_id, dtype: object

We can see that most of the users only have played one of the soungs of our selection, and that the one who has listened to more of our songs has played only 4.

In [80]:
# Let's see how many of our songs those users have played

data[data["song_id"].isin(selected_songs)].groupby(["user_id"]).size().sort_values(ascending = False)

user_id
8fc187765f25645e802bd5137f641c8de7df17b8    4
52542a715ba72e52eec99b277a42532c88615469    4
be59c5b281f8b714c4d4d4bfb877715a93b3c64d    4
58c846a9d19a9345bffe62b212436cb49363278a    3
a883218d1e6171d4913b1dec6c083eb3fea5f914    3
b73bc9b4732c8edf790e257df7395973f8d085ef    3
3e6dd161e97e7bd0e20986e7f5e391e5d24e0a62    3
b67f2d3bea6a313bc55695517cc9b38ff5f920fa    3
07e8066fc9c82f5e700023f3c963117e874e0188    3
3f52fdf255f7043eb170a49606bebe14f6c7a08a    3
ed824982bb5d17465708f5bfbd8589af81ad4de0    3
217b76adb93cdb5d221408ad9f9c5c244a65b038    3
f106f63e74ba0648ed27e2fd59094a53c8c9c534    3
78fb080641b1b1f9b85ceffd9c1686eb8db7c765    3
748096044d04f6736c6921203f711f57fe6e31ee    3
00fc9d7d12f74bcd93fa787cc26a9c61a0904ac7    3
b161e27efcd0135dabd0cc2cfea477498667b191    3
53175b45ba820a33ac8f833a85a986a7c0f7d3d4    3
b60c2902ab24963f33d8a431bee8676a14ceb003    3
8c24607fcd3b2ca28a8eb5924c7c26d8d40e82c4    3
127c8ac775ebdce42de94ff5783ab8d8e333711f    3
58dc40ef3b13f15b889f72d6e3

Now that we know the users that have played our music, we need to know what other songs they have listened.

In [91]:
prediction_df = similar_users.merge(data[~data.song_id.isin(selected_songs)], on = 'user_id')

In [92]:
prediction_df.head()

Unnamed: 0,index,user_id,song_id,num_plays
0,6041,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOACIPG12A8AE47E1C,1
1,6041,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOAHEEC12A6BD4DAA4,1
2,6041,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOAKQBB12A8C1413A0,1
3,6041,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOAVFMF12A6D4F92E6,1
4,6041,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOBOLEI12A58A7E386,1


In [94]:
prediction_df.describe()

Unnamed: 0,index,num_plays
count,3233753.0,3233753.0
mean,24217294.64,2.99
std,14002352.72,6.43
min,6041.0,1.0
25%,11973702.0,1.0
50%,24413248.0,1.0
75%,36386260.0,3.0
max,48373378.0,2213.0


In [95]:
prediction_df.user_id.describe()

count                                      3233753
unique                                       35253
top       4e73d9e058d2b1f2dba9c1fe4a8f416f9f58364f
freq                                          4623
Name: user_id, dtype: object

Now we have all the songs our similar users have played. We only have to select the most popular ones. We can to this in two ways:

1. Order the songs by play count (repetition).
2. Order the songs by the number of users that have played them (popularity).

The repetition is very sensitive to outliers. As we saw at the beginning, the maximum value of play_count is 9,667. Everytime this user is selected, that song would be the first in our recommendation, so we are going to go with the popularity option.

In [96]:
# Let's now find the most played songs in the prediction dataframe. This would be our recommendation
predicted_songs = prediction_df.\
                    groupby(['song_id']).\
                    size().\
                    sort_values(ascending = False).\
                    head(20).\
                    to_frame("popularity").\
                    reset_index()

In [97]:
predicted_songs.head()

Unnamed: 0,song_id,popularity
0,SOEGIYH12A6D4FC0E3,8851
1,SOAUWYT12A81C206F1,8014
2,SOSXLTC12AF72A7F54,7285
3,SOBONKR12A58A7A7E0,6980
4,SOFRQTD12A81C233C0,6086


In [98]:
# Let's see the songs in a human readable way
predicted_songs.merge(songs, on = 'song_id')[['artist', 'title']]

Unnamed: 0,artist,title
0,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...
1,Björk,Undo
2,Kings Of Leon,Revelry
3,Dwight Yoakam,You're The One
4,Harmonia,Sehr kosmisch
5,Alliance Ethnik,Représente
6,Cartola,Tive Sim
7,OneRepublic,Secrets
8,Lil Wayne / Eminem,Drop The World
9,Florence + The Machine,Dog Days Are Over (Radio Edit)


Mmmmm... There doesn't seem to be much metal in that list, does it? The result is totally different from the set of songs we chose. Let's investigate. First we are going to take a look at the most popular songs of the entire dataset to see it it's similar to this.

In [99]:
# Let's find out the most played songs in the whole dataset
most_popular_songs = data.\
                    groupby(['song_id']).\
                    size().\
                    sort_values(ascending = False).\
                    head(20).\
                    to_frame("popularity").\
                    reset_index()

In [101]:
most_popular_songs.merge(songs, on = 'song_id')[['song_id', 'artist', 'title', 'popularity']]

Unnamed: 0,song_id,artist,title,popularity
0,SOFRQTD12A81C233C0,Harmonia,Sehr kosmisch,110479
1,SOAUWYT12A81C206F1,Björk,Undo,90476
2,SOAXGDH12A8C13F8A1,Florence + The Machine,Dog Days Are Over (Radio Edit),90444
3,SOBONKR12A58A7A7E0,Dwight Yoakam,You're The One,84000
4,SOSXLTC12AF72A7F54,Kings Of Leon,Revelry,80656
5,SONYKOW12AB01849C9,OneRepublic,Secrets,78353
6,SOEGIYH12A6D4FC0E3,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...,69487
7,SOLFXKT12AB017E3E0,Charttraxx Karaoke,Fireflies,64229
8,SODJWHY12A8C142CCE,Train,Hey_ Soul Sister,63809
9,SOFLJQZ12A6D4FADA6,Cartola,Tive Sim,58610


Ok. Now we see a couple of things about our recommender:

1. It is heavily influenced by popularity. There are 35,253 users that have listened to at least one of our songs, and the vast majority has listened to just one of them, meaning that they are not very similar to us, so their recomendation is almost random. This forces us to give more importance to the songs of the users more similar to us.

2. Harmonia the most popular artist? Bjök the second? Dwight Yoakam the fourth? This is impossible in real life. This dataset must be rotten, but anyway, to our recommendator they are just names.

Now we are going to implement the scoring funtion to put more weight on the songs of the most similar to us users than on the least similar. We will create a score for each song based on how similar to us is the user. Later we will sum all the scores and return an ordered by score list that will be our new recommendation.

First we have to know every user similarity to us:

In [103]:
# Create a Dataframe with the number of similar songs by user
user_similarity = data[data["song_id"].isin(selected_songs)].groupby(['user_id']).size().to_frame("similarity")

In [105]:
# Let's see the ten most similar to us users
user_similarity.sort_values('similarity', ascending = False).head(10)

Unnamed: 0_level_0,similarity
user_id,Unnamed: 1_level_1
8fc187765f25645e802bd5137f641c8de7df17b8,4
52542a715ba72e52eec99b277a42532c88615469,4
be59c5b281f8b714c4d4d4bfb877715a93b3c64d,4
217b76adb93cdb5d221408ad9f9c5c244a65b038,3
33825a5d5b1b2ea935a9fc2f4a3cbf8e97e6280a,3
07e8066fc9c82f5e700023f3c963117e874e0188,3
b31888d485ddff26572ffdab1c947bcc067ff3a1,3
ed824982bb5d17465708f5bfbd8589af81ad4de0,3
53175b45ba820a33ac8f833a85a986a7c0f7d3d4,3
3e6dd161e97e7bd0e20986e7f5e391e5d24e0a62,3


In [111]:
# Now we are going to create a new dataframe with the user_id, song_id and similarity excluding the songs
# we have selected for our recommendation.
prediction_df = similar_users.\
    merge(data[~data.song_id.isin(selected_songs)], on = 'user_id').\
    merge(user_similarity, on = "user_id")[["user_id", "song_id", "similarity"]]

In [112]:
prediction_df.sample(10)

Unnamed: 0,user_id,song_id,similarity
3047192,5ad93cdc05cf1431c04d7e809df3934d980b5a5f,SOQGVCS12AF72A078D,1
2114466,8a5fadcd48a1cbffc0a67795acbbb624b45f6ad8,SOWWWSD12A58A801F9,2
3039753,15206b4183c1b9effadf75247bbf51cfbb387a72,SOXXICK12A6D4F67B9,1
1838791,3487d95158d8ec4bc6b10ffb2db39593eccc556a,SOOONEP12A8AE483EC,1
2351891,550eda18938d8cf21990ca15685c5a050dbc7c75,SOOPQAY12A6D4F5926,1
2818013,8a9dc382f60363f76354c857e0a97a98b9bc74e7,SOQOLYG12AF729EAAC,3
1468193,872821d140b7f318205357a8823ee6177aea4c23,SOACIPG12A8AE47E1C,1
173699,bd1edf96f947af790365f7378d2e894ff4f7b72e,SOTJCZJ12A58A7E54B,1
576247,2904f00329117109159e6831b16b8e9dc866480a,SOUSMXX12AB0185C24,2
2432536,592aeeaa43b2af7a5670085eefc0fb9f4987f23f,SOLVRYB12A6D4FBA1D,1


In [115]:
# now we have to create a DF with the score of every song that we'll user later to order the most relevant songs
# for a specific user
max_similarity = pred_df.similarity.max()

In [116]:
total_plays_similar_users = pred_df.shape[0]

In [250]:
# Anton's scoring funtion that ranks songs by similarity to the user
def song_scoring(similarity):
    return (similarity / max_similarity) ** (max_similarity - (similarity - 1))

In [251]:
# Constant values for normalization
max_sim = song_scoring(max_similarity)
min_sim = song_scoring(1)

In [257]:
# Let's apply Anton's scoring funtion to the prediction dataframe
prediction_df['score'] = prediction_df['similarity'].map(lambda x: song_scoring(x))

In [258]:
prediction_df.head(10)

Unnamed: 0,user_id,song_id,similarity,score
0,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOACIPG12A8AE47E1C,1,0.0
1,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOAHEEC12A6BD4DAA4,1,0.0
2,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOAKQBB12A8C1413A0,1,0.0
3,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOAVFMF12A6D4F92E6,1,0.0
4,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOBOLEI12A58A7E386,1,0.0
5,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOBYNII12A58291CDC,1,0.0
6,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOCALRI12A58A7BBC5,1,0.0
7,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOCHYVZ12A6D4F5908,1,0.0
8,bd64f193f0f53f09d44ff48fd52830ff2fded392,SODQGBE12A6D4F6BAB,1,0.0
9,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOEAQHH12A58A78F59,1,0.0


In [259]:
# Let's summ the score of every song and order it
predicted_songs = prediction_df.\
                    groupby(['song_id'])['score'].\
                    sum().\
                    sort_values(ascending = False).\
                    head(20).\
                    to_frame("popularity")

In [260]:
predicted_songs.head(20)

Unnamed: 0_level_0,popularity
song_id,Unnamed: 1_level_1
SOTNHIP12AB0183131,102.01
SOEGIYH12A6D4FC0E3,88.2
SOZDGEW12A8C13E748,84.81
SOLRGNF12AB0187CF4,83.88
SOFGIVB12A6D4F5923,78.56
SOITRTA12A6D4F8261,76.16
SOPQLBY12A6310E992,74.57
SOSXLTC12AF72A7F54,74.0
SOCHYVZ12A6D4F5908,73.45
SOAUWYT12A81C206F1,72.64


In [261]:
# Let's see the songs in a human readable way
predicted_songs.merge(songs, on = 'song_id')[['song_id', 'artist', 'title']].head(20)

Unnamed: 0,song_id,artist,title
0,SOTNHIP12AB0183131,Kid Cudi / Kanye West / Common,Make Her Say
1,SOEGIYH12A6D4FC0E3,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...
2,SOZDGEW12A8C13E748,Metallica,One
3,SOLRGNF12AB0187CF4,Simon Harris,Sample Track 2
4,SOFGIVB12A6D4F5923,Metallica / Marianne Faithfull,The Memory Remains
5,SOITRTA12A6D4F8261,Metallica,Ride The Lightning
6,SOPQLBY12A6310E992,Radiohead,Creep (Explicit)
7,SOSXLTC12AF72A7F54,Kings Of Leon,Revelry
8,SOCHYVZ12A6D4F5908,Metallica,Enter Sandman
9,SOAUWYT12A81C206F1,Björk,Undo


In [None]:
# Sacar las canciones más oídas de los que tienen un 4 de similitud a ver si se parecen a esto...