# Mosaico Musical

### Musical recommender by Alberto Antón as a final project for the Master in Data Science of KSchool



In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import io
import os
import sys

In [2]:
# display results to 3 decimal points, not in scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Load data

In [3]:
data_root = "data"

In [18]:
columns = ['user_id', 'song_id', 'num_plays']
datafile = os.path.join(data_root, "train_triplets.txt")
print(datafile)
data = pd.read_csv(datafile, 
                   sep='\t', 
                   header = None,
                   names = columns)
data.head()

data\train_triplets.txt


Unnamed: 0,user_id,song_id,num_plays
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [5]:
# 8GB of data, but not all of them are loaded in memory at once
print (sys.getsizeof(data))

8707245584


In [6]:
# Read songs dataset

In [37]:
columns = ["foo", "song_id", "artist", "title"]
datafile = os.path.join(data_root, "unique_tracks.txt")
all_songs = pd.read_csv(datafile, 
                        header = None,
                        sep = '<SEP>',
                        names = columns,
                        usecols = ["song_id", "artist", "title"],
                        encoding =  "utf-8",
                        engine = "python")

all_songs.head()

Unnamed: 0,song_id,artist,title
0,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens


In [26]:
# Number of songs in data
data.song_id.unique().shape[0]

384546

In [27]:
# Number of songs in all_songs
all_songs.song_id.unique().shape[0]

999056

In [39]:
all_songs.describe()

Unnamed: 0,song_id,artist,title
count,1000000,1000000,999985
unique,999056,72665,702000
top,SOUYQYY12AF72A000F,Michael Jackson,Intro
freq,3,194,1511


In [40]:
# Let's keep only the songs that match

songs = all_songs.merge(data,
                       left_on = "song_id",
                       right_on = "song_id",
                       how = "inner")[["song_id", "artist", "title"]]

# airport_freq.merge(airports[airports.ident == 'KLAX'][['id']], 
#                    left_on='airport_ref', 
#                    right_on='id', 
#                    how='inner')[['airport_ident', 'type', 'description', 'frequency_mhz']]

In [29]:
# Number of songs in songs
songs.song_id.unique().shape[0]

384546

In [41]:
songs.describe()

Unnamed: 0,song_id,artist,title
count,49664528,49664528,49664463
unique,384546,42062,306785
top,SOFRQTD12A81C233C0,Coldplay,Sehr kosmisch
freq,110479,455816,110479


In [42]:
# Removing duplicates from songs
songs.drop_duplicates(subset = "song_id", inplace = True)
songs.describe()

Unnamed: 0,song_id,artist,title
count,384546,384546,384542
unique,384546,42055,306720
top,SOSCFJZ12AB018E7D7,Beastie Boys,Intro
freq,1,136,526


In [43]:
songs.head()

Unnamed: 0,song_id,artist,title
0,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
3,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
6,SOYGNWH12AB018191E,3 Gars Su'l Sofa,L'antarctique
12,SOGPCJI12A8C13CCA0,Waldemar Bastos,N Gana
19,SOSDCFG12AB0184647,Lena Philipsson,006


In [None]:
# There seems to be songs with the same title and different id. Identify them

# TODO: DATASET CLEANING!!!!!!!!!!!!!!



In [32]:
# Remove all_songs from memory
all_songs = None

In [56]:
# Funcion that returns the artist and title of a song_id
def which_song(sid):
    return songs[songs["song_id"] == sid][["artist", "title"]]

In [57]:
which_song("SOSDCFG12AB0184647")

Unnamed: 0,artist,title
19,Lena Philipsson,6


As this dataset is part of a closed competition, there is no test set, so we have to create it from the train set.

If we want to recommend something to a new user, either we know something about him or we can't but recommend the most popular songs.

Let's go for the first option, so we'll present the user random songs until he selects 10 that he likes

In [64]:
songs.sample(50)

Unnamed: 0,song_id,artist,title
47731140,SOBQSSG12A8C13DE4E,Yo La Tengo,I'm Set Free
40645739,SOCNRSS12AB018C15A,Aline Barros,Cantarei teu amor para sempre (I could sing of...
11181177,SOQJEHJ12AB0180D89,Tijuana,Groove Is In The Air
3301365,SOHFMWD12A6D4F6E62,Jonny Greenwood,Clockwork Tin Soldiers
39562321,SOTGYHV12A8C138B0A,CCCP - Fedeli Alla Linea,Trafitto
8339398,SOIMCUD12AB018E66B,Synkro,Departure
20478518,SOAGJVF12A8C132EF6,Bobby Womack;The Brotherhood,A Change is Gonna Come
16077622,SOJFOMZ12A8C135DBF,Mark Stewart,Blood Money 2
18052510,SOKCAOH12A6D4F84A3,Tears For Fears,When In Love With A Blind Man
42189049,SOJBXMJ12A8AE48D9F,Gladiators,Talking Blues (Live)


In [65]:
# Randomly selected songs for the test
selected_songs = ["SOBWKZB12A8C136891", "SOSWNMA12A6D4F796F", "SOXWOLZ12A8C135F54", "SONWNJJ12A67AD8492", \
                  "SOHQCOD12AB018316C", "SOFDKPU12A58A7A06A", "SOVKUHB12AAF3B3636", "SONJNVN12A6701C689", \
                  "SOMMXAD12A58A76FBF", "SOZYUBF12AB018337B"]

In [71]:
for song in selected_songs:
    print (which_song(song))

             artist      title
41430157  Ben Folds  Wandering
        artist            title
5406232  Tonic  Take Me As I Am
                 artist               title
15457938  Habeas Corpus  Ni Dentro Ni Fuera
                   artist   title
24109806  Robbie Williams  United
                                   artist               title
21897330  USS (Ubiquitous Synergy Seeker)  Stranger To Myself
              artist             title
49584607  Clawfinger  Nothing Going On
             artist                              title
6602954  Nickelback  Gotta Be Somebody (Album Version)
              artist title
4453758  The Subways  Mary
                     artist                              title
21860061  Story Of The Year  Meathead (Album Version) (Non-PA)
          artist           title
911702  Savatage  Morphine child


In [72]:
# Now let's find other users that listent to those songs
similar_users = data[data["song_id"].isin(selected_songs)]

In [86]:
similar_users.head()

Unnamed: 0,user_id,song_id,num_plays
23016,edc4e3b50cb4f23282a157edc7ca7cf09afcc093,SONJNVN12A6701C689,2
44348,4e11f45d732f4861772b2906f81a7d384552ad12,SONJNVN12A6701C689,1
52930,02192554db8fe6d17b6309aabb2b7526a2e58534,SOBWKZB12A8C136891,1
136715,d6c990812a8e346e21ff899ee8bdb581f986561e,SOZYUBF12AB018337B,8
179655,ab1eee882b2f01ac185a5322520955b64dd4b4fc,SOMMXAD12A58A76FBF,3


In [73]:
similar_users.describe()

Unnamed: 0,num_plays
count,889.0
mean,2.08
std,3.049
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,45.0


In [74]:
# Let's see how many of our songs those users have

# Create a selected songs dataframe to join with the other dataframes
selected_songs_df = pd.DataFrame(selected_songs, columns=['song_id'])

In [80]:
similar_users.groupby("user_id").size().sort_values(ascending = False)

user_id
3fa86e6ea24ac4b98823597232198bcae4e03339    2
fff795d5c0810f90da9f80219b0c8a347ddae0de    1
4ecef11908a6dd60a28968687035b548df3c4404    1
5120bc8672190f77aa6fb530bba8e672b3fa2de6    1
505b6904345262d1402c27ba717023b2970adcf1    1
503e05cd0fd48309415e0e3a2677a9c99eda4e18    1
5019aeed8b7fe20ee63eaa768a105ace9d7ed964    1
4fd959f92e869c167c91abe1ed2cda2467b80d6e    1
4fa541c1d2591e5bd42f444e07042f5e5127871e    1
4f560e1ea4e06f65654f52da5dc3e9e10fa0548c    1
4f2e2e78bd2526a81883d592bab559571cb28e49    1
4f176bb7d1b10d487241920efd345372537f64f2    1
4f15a724c0b76f2051783bcb32275247011d96cc    1
4eefa3af168fe87c528a1749e7a77dc7ec524cc2    1
4e765da731229d9426c44449d811071b40e43923    1
51c7d267fc6e41f841730ff08e35d16e39133563    1
4e6d2d9c5ab2314b751589490d200cd4f399fa5d    1
4e16e02b178c28a3f525487b4bd8635cf6eea0fb    1
4e11f45d732f4861772b2906f81a7d384552ad12    1
4d905807e921aadd4995b5e3493d6a44863c84e5    1
4d59946b943c2196ddc683e682cdc01b563594cf    1
4d12190047211f450991002b50

In [81]:
# Function that selects all the songs of a user
def songs_of_user(u_id):
    return data[data["user_id"] == u_id]

In [84]:
# I seem to have a very heterogeneous taste, as only one user has listened to two of my ten songs
# I'm one of a kind!!

# Let's see what songs are those 2 of my most similar user:
data[(data["user_id"] == "3fa86e6ea24ac4b98823597232198bcae4e03339") & (data["song_id"].isin(selected_songs))]

Unnamed: 0,user_id,song_id,num_plays
19550126,3fa86e6ea24ac4b98823597232198bcae4e03339,SOBWKZB12A8C136891,2
19550525,3fa86e6ea24ac4b98823597232198bcae4e03339,SOMMXAD12A58A76FBF,3


In [85]:
print(which_song("SOBWKZB12A8C136891"))
print(which_song("SOMMXAD12A58A76FBF"))

             artist      title
41430157  Ben Folds  Wandering
                     artist                              title
21860061  Story Of The Year  Meathead (Album Version) (Non-PA)


This phase is not yet completed, as we have to assing a weight to the users depending on similarity (number of my songs played) with me, but right now we are going to make a rough estimation.

Lets fin out which are the most popular songs amongst my "similar" users. We could order the songs by sum of play count, but ths method is very sensible to outliers (a user may have listened a hundred times to the same song), so we are going to find the most popular (most repeated) songs among these users EXCLUDING MY SELECTED_SONGS!!!!!

In [89]:
sorted_prediction = similar_users.groupby("song_id").size().to_frame("popularity").reset_index().sort_values(["popularity"], ascending = False)


# airports.groupby(['iso_country', 'type']).size().to_frame('size').reset_index().sort_values(['iso_country', 'size'], ascending=[True, False])

In [93]:
sorted_prediction.head(15)

Unnamed: 0,song_id,popularity
3,SOMMXAD12A58A76FBF,370
4,SONJNVN12A6701C689,161
0,SOBWKZB12A8C136891,157
6,SOSWNMA12A6D4F796F,97
2,SOHQCOD12AB018316C,30
7,SOVKUHB12AAF3B3636,29
8,SOXWOLZ12A8C135F54,20
9,SOZYUBF12AB018337B,20
5,SONWNJJ12A67AD8492,4
1,SOFDKPU12A58A7A06A,1


In [92]:
which_song("SONJNVN12A6701C689")

Unnamed: 0,artist,title
4453758,The Subways,Mary


In [None]:
# There seems to be a problem here as I am selecting only my songs when it would have to be just the opposite....