# Mosaico Musical

### Musical recommender by Alberto Antón as a final project for the Master in Data Science of KSchool



In [44]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import io
import os
import sys

In [299]:
# display results to 3 decimal points, not in scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

### Load data

In [4]:
data_root = "data"

In [5]:
columns = ['user_id', 'song_id', 'num_plays']
datafile = os.path.join(data_root, "train_triplets.txt")
print(datafile)
data = pd.read_csv(datafile, 
                   sep='\t', 
                   header = None,
                   names = columns)
data.head()

data\train_triplets.txt


Unnamed: 0,user_id,song_id,num_plays
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [6]:
data.describe()

Unnamed: 0,num_plays
count,48373586.0
mean,2.867
std,6.438
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,9667.0


In [226]:
# most of the songs have been played only one time, and there are very large outliers so we will not be using
# num_plays field
data.num_plays.describe()

count   48373586.000
mean           2.867
std            6.438
min            1.000
25%            1.000
50%            1.000
75%            3.000
max         9667.000
Name: num_plays, dtype: float64

In [7]:
# Read songs dataset

In [8]:
columns = ["foo", "song_id", "artist", "title"]
datafile = os.path.join(data_root, "unique_tracks.txt")
all_songs = pd.read_csv(datafile, 
                        header = None,
                        sep = '<SEP>',
                        names = columns,
                        usecols = ["song_id", "artist", "title"],
                        encoding =  "utf-8",
                        engine = "python")

all_songs.head()

Unnamed: 0,song_id,artist,title
0,SOQMMHC12AB0180CB8,Faster Pussy cat,Silent Night
1,SOVFVAK12A8C1350D9,Karkkiautomaatti,Tanssi vaan
2,SOGTUKN12AB017F4F1,Hudson Mohawke,No One Could Ever
3,SOBNYVR12A8C13558C,Yerba Brava,Si Vos Querés
4,SOHSBXH12A8C13B0DF,Der Mystic,Tangle Of Aspens


In [9]:
all_songs.describe()

Unnamed: 0,song_id,artist,title
count,1000000,1000000,999985
unique,999056,72665,702000
top,SOUYQYY12AF72A000F,Michael Jackson,Intro
freq,3,194,1511


In [10]:
# Number of songs in data
data.song_id.unique().shape[0]

384546

In [12]:
# Number of songs in all_songs
all_songs.song_id.unique().shape[0]

999056

In [13]:
# Let's keep only the songs that are in data
unique_songs_tmp = data.song_id.unique()
unique_songs_df = pd.DataFrame(unique_songs_tmp, columns = ['song_id'])

In [14]:
songs = unique_songs_df.merge(all_songs,
                       left_on = "song_id",
                       right_on = "song_id",
                       how = "left")[["song_id", "artist", "title"]]

In [15]:
songs.describe()

Unnamed: 0,song_id,artist,title
count,385256,385256,385252
unique,384546,42062,306785
top,SOBPAEP12A58A77F49,Beastie Boys,Intro
freq,3,139,526


In [16]:
songs.head()

Unnamed: 0,song_id,artist,title
0,SOAKIMP12A8C130995,Jack Johnson,The Cove
1,SOAPDEY12A81C210A9,Billy Preston,Nothing from Nothing
2,SOBBMDR12A8C13253B,Paco De Lucia,Entre Dos Aguas
3,SOBFNSP12AF72A0E22,Josh Rouse,Under Cold Blue Stars
4,SOBFOVM12A58A7D494,The Dead 60s,Riot Radio (Soundtrack Version)


In [17]:
# Songs dataframe count is 385.256, while there are 384.546 unique songs. 
# The difference are duplicated songs

In [19]:
# Let's take a look
songs[songs.duplicated(subset=['song_id'], keep=False)].head(6)

Unnamed: 0,song_id,artist,title
9,SOBXHDL12A81C204C0,Kanye West,Stronger
10,SOBXHDL12A81C204C0,Kanye West,Stronger
184,SODGVGW12AC9075A8D,Justin Bieber,Somebody To Love
185,SODGVGW12AC9075A8D,Justin Bieber,Somebody To Love
191,SOKOXWU12AF72AD1BC,Eminem,The Real Slim Shady
192,SOKOXWU12AF72AD1BC,Eminem,The Real Slim Shady


In [20]:
# Remove those duplicates
songs.drop_duplicates(subset = "song_id", inplace = True)
songs.describe()

Unnamed: 0,song_id,artist,title
count,384546,384546,384542
unique,384546,42055,306720
top,SOSSQIP12AB0186F0D,Beastie Boys,Intro
freq,1,136,526


In [43]:
# Saving songs dataframe
songs.to_pickle("songs.pkl")

In [21]:
# Now count and unique are the same

In [46]:
# Remove all_songs from memory
all_songs = None

In [47]:
# Funcion that returns the artist and title of a song_id
def which_song(sid):
    return songs[songs["song_id"] == sid][["artist", "title"]]

In [48]:
which_song("SOSDCFG12AB0184647")

Unnamed: 0,artist,title
26444,Lena Philipsson,6


As this dataset is part of a closed competition, there is no test set, so we have to create it from the train set.

If we want to recommend something to a new user, either we know something about him or we can't but recommend the most popular songs.

Let's go for the first option, so we'll present the user random songs until he selects 10 that he likes

In [49]:
# Let show random artists, then the users selects an artis an we present its songs. The users selects
# thos he likes (or none)

In [50]:
# Show random artist
songs.artist.sample(20)

160353       Clube Do Balanco
110877                Emarosa
372427       Conjunto Clasico
219499            Prem Joshua
305977             Marc Ribot
254931     Nick Glennie-Smith
123339            Fiona Apple
270419                    ZPU
78858                   Kinky
155660              The Dears
33642           The Melodians
41224      Magtens Korridorer
69090              Elton John
64753           Lucila Campos
372301                  Mia X
75538                Deadsoil
237291          Dulce Liquido
332277       Long Tall Texans
13755     Jean-Yves Thibaudet
218444                  Becky
Name: artist, dtype: object

In [136]:
# Function that shows the songs of an artist
def songs_by(artist_name):
    return songs[songs["artist"] == artist_name][["song_id", "title"]]

In [42]:
songs_by("The Waterboys")

Unnamed: 0,song_id,title
23627,SOIQRKG12A6D4F63A3,Rags (Second Amendment)
36813,SOYQYTX12AB0186FFA,Bury My Heart
56909,SOYCNZF12A6D4F7F87,Fisherman's Blues
79882,SOGEJMW12A6D4F8626,When Will We Be Married (1987 Recording)
79887,SOLPBTJ12A6D4F5757,The Dance At The Crossroads
79900,SOPTUEN12A6D4F5751,Every Breath Is Yours
79914,SOWICCP12A6D4F5756,Always Dancing_ Never Getting Tired
97338,SOPARQV12A58A79DC9,A Pagan Place
115100,SODAYPE12A6D4F7F8E,This Is The Sea
175762,SOLLIIT12A8C133154,A Song For The Life (2008 Digital Remaster)


In [None]:
songs.sample(50)

In [None]:
# Randomly selected songs for the test
selected_songs = ["SOBWKZB12A8C136891", "SOSWNMA12A6D4F796F", "SOXWOLZ12A8C135F54", "SONWNJJ12A67AD8492", \
                  "SOHQCOD12AB018316C", "SOFDKPU12A58A7A06A", "SOVKUHB12AAF3B3636", "SONJNVN12A6701C689", \
                  "SOMMXAD12A58A76FBF", "SOZYUBF12AB018337B"]

In [56]:
# Let's get the songs from a real user
datafile = os.path.join(data_root, "train_listenings.pkl")
user_list = pd.read_pickle(datafile)
train_user = pd.DataFrame(user_list, columns=['user_id', 'song_id', 'mun_plays'])
train_user.head()


Unnamed: 0,user_id,song_id,mun_plays
0,00000b722001882066dff9d2da8a775658053ea0,SORDKNX12A8C13A45F,1
1,00000b722001882066dff9d2da8a775658053ea0,SOFLJQZ12A6D4FADA6,1
2,00000b722001882066dff9d2da8a775658053ea0,SOUBEXV12AB01804A4,1
3,00000b722001882066dff9d2da8a775658053ea0,SOKBXYC12A6D4F59D6,1
4,00000b722001882066dff9d2da8a775658053ea0,SOJOJUN12A8AE47E1D,1


In [57]:
train_user.describe()

Unnamed: 0,user_id,song_id,mun_plays
count,10,10,10
unique,1,10,2
top,00000b722001882066dff9d2da8a775658053ea0,SOJOJUN12A8AE47E1D,1
freq,10,1,9


In [59]:
# Lets make a list of songs from the train user
selected_songs = train_user['song_id']

In [88]:
# Titles of the songs played by the train user
train_user.merge(songs,
                left_on = 'song_id',
                right_on = 'song_id',
                how = 'inner')[['artist', 'title']]

Unnamed: 0,artist,title
0,Giles,Replay (Album Version)
1,Cartola,Tive Sim
2,Miley Cyrus,Don't Walk Away
3,Jim Gaffigan,Gravy Drinker (LP Version)
4,Edwyn Collins,Superstar Talking Blues
5,Tab Benoit,Jambalaya
6,MC Solaar,Nouveau western
7,Blue Man Group,Rods And Cones
8,John Parish / PJ Harvey,Heela
9,Jim Gaffigan,Hooooot Pocket! (LP Version)


In [94]:
# Now let's find other users that listened to those songs
similar_users = data[data["song_id"].isin(selected_songs)]['user_id'].reset_index(name='user_id')

In [95]:
similar_users.head()

Unnamed: 0,index,user_id
0,2030,2b6c2f33bc0e887ea7c4411f58106805a1923280
1,3047,b6b799f34a204bd928ea014c243ddad6d0be4f8f
2,3272,732f88be38fae217f8ab7e24c20dd072436e3e40
3,4035,9fba771d9731561eba47216f6fbfc0023d88641b
4,4111,62f2f9b881dc320d745a90c0c10528d18e10deb1


In [83]:
similar_users.describe()

Unnamed: 0,index
count,75588.0
mean,24087314.656
std,13933310.624
min,2030.0
25%,11970264.75
50%,24156641.0
75%,36118919.0
max,48373263.0


In [89]:
# Create a data frame with all the songs of our similar users
# What for???
similar_users_plays = data.merge(similar_users, 
                           left_on = 'user_id', 
                           right_on = 'user_id', 
                           how = 'inner')[["user_id", "song_id", "num_plays"]]

In [90]:
similar_users_plays.head()

Unnamed: 0,user_id,song_id,num_plays
0,2b6c2f33bc0e887ea7c4411f58106805a1923280,SOAUWYT12A81C206F1,6
1,2b6c2f33bc0e887ea7c4411f58106805a1923280,SOBMPJM12A8C13BAE2,1
2,2b6c2f33bc0e887ea7c4411f58106805a1923280,SOBONKR12A58A7A7E0,2
3,2b6c2f33bc0e887ea7c4411f58106805a1923280,SOBYSSP12AAF3B32CA,2
4,2b6c2f33bc0e887ea7c4411f58106805a1923280,SOCBDPR12A6701F792,1


In [91]:
similar_users_plays['song_id'].describe()

count                5336240
unique                219078
top       SOFLJQZ12A6D4FADA6
freq                   60803
Name: song_id, dtype: object

In [98]:
# Let's see how many of our songs those users have played

data[data["song_id"].isin(selected_songs)].groupby(['user_id']).size().sort_values(ascending = False)

user_id
00000b722001882066dff9d2da8a775658053ea0    10
20ff5a5506da7eef77835a0fe84a006e3c525a39     6
a32cfe061d740a7236d8727946f81f1a15c7028f     6
94e3357310cd860548432d5fb68a88c3032ca1e8     6
ed6426b2069480821ad8271b8c47ddd6df2ff674     5
edb5a29d03eb009f100aab5999aceb530c07a065     5
8b8f370f1bcbea4fc8b82fda3befb80bb9533253     5
9f85a4c39ee437c73fca1cb43b09e7283a06f9de     5
970b23b0a4a725d51545c9a6e333500a48271b03     5
2343b6f1e4b7408f7463aa5c00008296a6822dbf     5
340bc3cfcc095dc8ef3a90f83839b77c232ffa58     5
70d1b8f61b9b79c4fba9f764807bff3d02e6c647     5
8e56e10637928a0bd88409211e06232ddf0f0e32     5
96c8d38bbb32ef781b96b33e50ad9c03eb62c340     5
aab42f141f70914c82ccca5cecee6942270ee588     5
44ace73d853dbe305c67dca4f1b02913e3075450     5
47988192adad5b09d8a7ee7f91268757d9b91988     5
01b44213810365eff7d555ebdb84b91cdb275271     5
845d41a896304caa3ea1a0c588bc9fc745fa24ea     4
04e60c0940bd7a022038e219c528c1e27cd042ce     4
b97b0284199acd6f6020f2a19aa479bac0b7d100     4
08eb4

In [208]:
# Function that selects all the songs of a user
def songs_of_user(u_id):
    tmp_df = data[data["user_id"] == u_id]
    return tmp_df.merge(songs,
                left_on = 'song_id',
                right_on = 'song_id',
                how = 'inner')[['song_id', 'artist', 'title']]

This phase is not yet completed, as we have to assing a weight to the users depending on similarity (number of my songs played) with me, but right now we are going to make a rough estimation.

Let's find out which are the most popular songs amongst my "similar" users. We could order the songs by sum of play count, but ths method is very sensible to outliers (a user may have listened a hundred times to the same song), so we are going to find the most popular (most repeated) songs among these users excluding the selected songs.

In [102]:
prediction_df = similar_users.merge(data[~data.song_id.isin(selected_songs)],
                   left_on = 'user_id',
                   right_on = 'user_id')

In [106]:
prediction_df.user_id.unique().shape[0]

70170

In [109]:
# Let's chack that the selected songs are not in the prediction dataframe

prediction_df[prediction_df.song_id == "SORDKNX12A8C13A45F"]

# Crrect result has to be an empty dataframe

Unnamed: 0,index,user_id,song_id,num_plays


In [114]:
# Let's also check that the train user is not in the prediction_df
prediction_df[prediction_df.user_id == "00000b722001882066dff9d2da8a775658053ea0"]

Unnamed: 0,index,user_id,song_id,num_plays
1338114,12098218,00000b722001882066dff9d2da8a775658053ea0,SOCTXQW12A6D4F70AD,1
1338115,12098218,00000b722001882066dff9d2da8a775658053ea0,SOCZQCY12AC468E40F,1
1338116,12098219,00000b722001882066dff9d2da8a775658053ea0,SOCTXQW12A6D4F70AD,1
1338117,12098219,00000b722001882066dff9d2da8a775658053ea0,SOCZQCY12AC468E40F,1
1338118,12098222,00000b722001882066dff9d2da8a775658053ea0,SOCTXQW12A6D4F70AD,1
1338119,12098222,00000b722001882066dff9d2da8a775658053ea0,SOCZQCY12AC468E40F,1
1338120,12098223,00000b722001882066dff9d2da8a775658053ea0,SOCTXQW12A6D4F70AD,1
1338121,12098223,00000b722001882066dff9d2da8a775658053ea0,SOCZQCY12AC468E40F,1
1338122,12098224,00000b722001882066dff9d2da8a775658053ea0,SOCTXQW12A6D4F70AD,1
1338123,12098224,00000b722001882066dff9d2da8a775658053ea0,SOCZQCY12AC468E40F,1


In [115]:
# Here it is. Let's remove it
prediction_df = prediction_df[prediction_df.user_id != "00000b722001882066dff9d2da8a775658053ea0"]

In [116]:
prediction_df[prediction_df.user_id == "00000b722001882066dff9d2da8a775658053ea0"]

Unnamed: 0,index,user_id,song_id,num_plays


In [None]:
# It is not there anymore

In [127]:
# Let's now find the most played songs in the prediction dataframe
predicted_songs = prediction_df.\
                    groupby(['song_id']).\
                    size().\
                    sort_values(ascending = False).\
                    head(20).\
                    to_frame("popularity").\
                    reset_index()

In [128]:
predicted_songs.head()

Unnamed: 0,song_id,popularity
0,SOAUWYT12A81C206F1,28056
1,SOBONKR12A58A7A7E0,26453
2,SOEGIYH12A6D4FC0E3,22394
3,SOSXLTC12AF72A7F54,20894
4,SOFRQTD12A81C233C0,19347


In [129]:
# Let's see the songs in a human readable way
predicted_songs.merge(songs,
                left_on = 'song_id',
                right_on = 'song_id',
                how = 'inner')[['artist', 'title']]

Unnamed: 0,artist,title
0,Björk,Undo
1,Dwight Yoakam,You're The One
2,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...
3,Kings Of Leon,Revelry
4,Harmonia,Sehr kosmisch
5,Florence + The Machine,Dog Days Are Over (Radio Edit)
6,Five Iron Frenzy,Canada
7,OneRepublic,Secrets
8,Tub Ring,Invalid
9,Lonnie Gordon,Catch You Baby (Steve Pitron & Max Sanna Radio...


In [132]:
# This would be our recommendation. Now we have to score it

In [133]:
# Let's read the test file with the rest of the songs of the user to see if he has also listened to any of our recommendations

In [134]:
datafile = os.path.join(data_root, "test_listenings.pkl")
test_list = pd.read_pickle(datafile)
test_user = pd.DataFrame(test_list, columns=['user_id', 'song_id', 'mun_plays'])
test_user.head()

Unnamed: 0,user_id,song_id,mun_plays
0,00000b722001882066dff9d2da8a775658053ea0,SOCTXQW12A6D4F70AD,1
1,00000b722001882066dff9d2da8a775658053ea0,SOCZQCY12AC468E40F,1


In [135]:
test_user.merge(songs,
                left_on = 'song_id',
                right_on = 'song_id',
                how = 'inner')[['artist', 'title']]

Unnamed: 0,artist,title
0,Nick Cave & The Bad Seeds,Babe_ You Turn Me On (Paris 2)
1,Ana Carolina,Ela É Bamba


In [145]:
# Let's try to find some rock songs an see the recommendation...
songs.artist.sample(50)

185851                             Batmobile
310923                  ARRESTED DEVELOPMENT
139191                                 CPM22
195024                        Cowboy Junkies
31684                           Joan Osborne
180004                                  Ween
234304                      Acoustic Alchemy
53906                                    Air
107033                            The Elders
193116                                Smokie
135791                                 Beak>
1722                       Justin Timberlake
351067                             Tori Amos
39563                               Pendulum
246879                               Anarbor
255620                         April Stevens
15068     Ol' Dirty Bastard featuring Lil Mo
132906                                  Saga
79941                            Los Angeles
207846                             Tata Bojs
108076                            Songs:Ohia
9835                                    datA
67660     

In [None]:
# There seems to be a problem here as I am selecting only my songs when it would have to be just the opposite....

In [None]:
Thrice  SORJRTI12A6D4F7D67
Clawfinger  SOSOPGB12A8C13C185
Rammstein  SOSYHME12A8C135DD8
Rancid  SOSBJSU12A8C138469
Against Me!  SONJQZM12A6D4FBE30
Millencolin  SOZVBUH12A8AE4745C
Van Halen  SOVMGEX12AC9070FF2
Led Zeppelin  SOEHJKJ12A8C13CA4D
Staind  SOUNBBX12A6D4F338E
Monster Magnet  SOJKARY12A6701ED3F
Whitesnake  SOPCNEA12A67ADF48B
Puddle Of Mudd  SOTQVSE12A6D4F8200
Green Day  SOTNYYH12A6701F94B
Metallica  SOSJRJP12A6D4F826F

In [193]:
# Randomly selected songs for the test
selected_songs = ["SORJRTI12A6D4F7D67", "SOSOPGB12A8C13C185", "SOSYHME12A8C135DD8", "SOSBJSU12A8C138469", \
                 "SONJQZM12A6D4FBE30", "SOZVBUH12A8AE4745C", "SOVMGEX12AC9070FF2", "SOEHJKJ12A8C13CA4D", \
                 "SOUNBBX12A6D4F338E", "SOJKARY12A6701ED3F", "SOPCNEA12A67ADF48B", "SOTQVSE12A6D4F8200", \
                 "SOTNYYH12A6701F94B", "SOSJRJP12A6D4F826F"]

In [194]:
# Now let's find other users that listened to those songs
similar_users = data[data["song_id"].isin(selected_songs)]['user_id'].reset_index(name='user_id')

In [195]:
similar_users.describe()

Unnamed: 0,index
count,36391.0
mean,24187498.446
std,13995662.742
min,6041.0
25%,11986777.0
50%,24291425.0
75%,36296129.5
max,48373378.0


In [196]:
# Let's see how many of our songs those users have played

data[data["song_id"].isin(selected_songs)].groupby(['user_id']).size().sort_values(ascending = False)

user_id
8fc187765f25645e802bd5137f641c8de7df17b8    4
52542a715ba72e52eec99b277a42532c88615469    4
be59c5b281f8b714c4d4d4bfb877715a93b3c64d    4
58c846a9d19a9345bffe62b212436cb49363278a    3
a883218d1e6171d4913b1dec6c083eb3fea5f914    3
b73bc9b4732c8edf790e257df7395973f8d085ef    3
3e6dd161e97e7bd0e20986e7f5e391e5d24e0a62    3
b67f2d3bea6a313bc55695517cc9b38ff5f920fa    3
07e8066fc9c82f5e700023f3c963117e874e0188    3
3f52fdf255f7043eb170a49606bebe14f6c7a08a    3
ed824982bb5d17465708f5bfbd8589af81ad4de0    3
217b76adb93cdb5d221408ad9f9c5c244a65b038    3
f106f63e74ba0648ed27e2fd59094a53c8c9c534    3
78fb080641b1b1f9b85ceffd9c1686eb8db7c765    3
748096044d04f6736c6921203f711f57fe6e31ee    3
00fc9d7d12f74bcd93fa787cc26a9c61a0904ac7    3
b161e27efcd0135dabd0cc2cfea477498667b191    3
53175b45ba820a33ac8f833a85a986a7c0f7d3d4    3
b60c2902ab24963f33d8a431bee8676a14ceb003    3
8c24607fcd3b2ca28a8eb5924c7c26d8d40e82c4    3
127c8ac775ebdce42de94ff5783ab8d8e333711f    3
58dc40ef3b13f15b889f72d6e3

In [215]:
# Create a Dataframe with the number of similar songs by user
user_similarity = data[data["song_id"].isin(selected_songs)].groupby(['user_id']).size().to_frame("similarity").reset_index()

In [224]:
user_similarity.sort_values('similarity', ascending = False).head()

Unnamed: 0,user_id,similarity
19668,8fc187765f25645e802bd5137f641c8de7df17b8,4
11318,52542a715ba72e52eec99b277a42532c88615469,4
26131,be59c5b281f8b714c4d4d4bfb877715a93b3c64d,4
4544,217b76adb93cdb5d221408ad9f9c5c244a65b038,3
7031,33825a5d5b1b2ea935a9fc2f4a3cbf8e97e6280a,3


In [231]:
pred_df = similar_users.\
    merge(data[~data.song_id.isin(selected_songs)], on = 'user_id').\
    merge(user_similarity, on = "user_id")[["user_id", "song_id", "similarity"]]

In [234]:
pred_df.head()

Unnamed: 0,user_id,song_id,similarity
554524,8fc187765f25645e802bd5137f641c8de7df17b8,SOWIUZC12A67020681,4
556237,8fc187765f25645e802bd5137f641c8de7df17b8,SOKXXYY12A58A7837E,4
556239,8fc187765f25645e802bd5137f641c8de7df17b8,SOKZIXI12AB018DB4F,4
556240,8fc187765f25645e802bd5137f641c8de7df17b8,SOLDJOP12AC9097630,4
556241,8fc187765f25645e802bd5137f641c8de7df17b8,SOLDLEW12A6D4F7DCB,4


In [235]:
# now we have to create a DF with the score of every song that we'll user later to order the most relevant songs
# for a specific user
max_similarity = pred_df.similarity.max()

4

In [239]:
total_plays_similar_users = pred_df.shape[0]
total_plays_similar_users

3233753

In [293]:
def song_scoring(similarity):
    return total_plays_similar_users * (similarity / max_similarity) ** 3

In [267]:
def normalizer(x, max_x, min_x):
    return (x - min_x) / (max_x - min_x)

In [268]:
# Constant values for normalization
max_sim = song_scoring(max_similarity)
min_sim = song_scoring(1)

In [269]:
print (max_sim, min_sim)

3233753.0 3157.9619140625


In [283]:
# Let's add the score of the songs
def song_scoring_norm(similarity):
    return normalizer(song_scoring(similarity), max_sim, min_sim)

In [294]:
pred_df['score'] = pred_df['similarity'].map(lambda x: song_scoring_norm(x))

In [295]:
pred_df.sample(20)

Unnamed: 0,user_id,song_id,similarity,score
644380,7c906455aa9fb4e77c3330eecc11b302abb85595,SOSQUZL12A67ADAFCB,2,0.124
515067,33121ba97455f4f3d4bf1c03db13688ca60e10a4,SOMMXAD12A58A76FBF,1,0.015
3004342,64b11e81cd9a1e0da039c43cc6ca057f4266e2f5,SODEAAU12AB01823CE,1,0.015
1384509,2435caa5cee3eb83bcd455b6e9a7b8c47c3c5998,SOEGIYH12A6D4FC0E3,1,0.015
490202,407199f76e774482245c9fa61045d506dc8c27ee,SOGVTLG12A6D4F4F1B,1,0.015
1431173,1afd981d5ec4ddb53e32f0dbc0f691e8b21ca23f,SOIYJYB12AB017DDE5,1,0.015
1601642,782a2ac9bf28358e63a343ca5a370a5c4f644b3d,SOWBNGK12A8C143932,1,0.015
2018586,d7d48d8c7ffdf65f9450e146ee8ee30e18c7ed1b,SOMKFGS12A8C13F985,1,0.015
3172455,f02ef1e670319151d2d27f4215fb16a973f7de2f,SONMPJJ12AB0183AF8,1,0.015
2383129,190032a66ad34dc88e46da82efef6dce8d8f9f7b,SOYDPWZ12AF72A27DB,1,0.015


In [296]:
# Let's summ the score of every song and order it
pred_songs = pred_df.\
                    groupby(['song_id'])['score'].\
                    sum().\
                    sort_values(ascending = False).\
                    head(20).\
                    to_frame("popularity").\
                    reset_index()

In [297]:
pred_songs.head(20)

Unnamed: 0,song_id,popularity
0,SOEGIYH12A6D4FC0E3,176.2
1,SOAUWYT12A81C206F1,153.699
2,SOSXLTC12AF72A7F54,146.857
3,SOBONKR12A58A7A7E0,135.91
4,SOFRQTD12A81C233C0,127.462
5,SOTNHIP12AB0183131,125.879
6,SOLRGNF12AB0187CF4,111.392
7,SOUVTSM12AC468F6A7,110.8
8,SOFLJQZ12A6D4FADA6,110.495
9,SONYKOW12AB01849C9,109.977


In [298]:
# Let's see the songs in a human readable way
pred_songs.merge(songs, on = 'song_id')[['song_id', 'artist', 'title']].head(20)

Unnamed: 0,song_id,artist,title
0,SOEGIYH12A6D4FC0E3,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...
1,SOAUWYT12A81C206F1,Björk,Undo
2,SOSXLTC12AF72A7F54,Kings Of Leon,Revelry
3,SOBONKR12A58A7A7E0,Dwight Yoakam,You're The One
4,SOFRQTD12A81C233C0,Harmonia,Sehr kosmisch
5,SOTNHIP12AB0183131,Kid Cudi / Kanye West / Common,Make Her Say
6,SOLRGNF12AB0187CF4,Simon Harris,Sample Track 2
7,SOUVTSM12AC468F6A7,Lil Wayne / Eminem,Drop The World
8,SOFLJQZ12A6D4FADA6,Cartola,Tive Sim
9,SONYKOW12AB01849C9,OneRepublic,Secrets


In [None]:
# Remove prediction_df 
|
v

In [197]:
prediction_df = similar_users.merge(data[~data.song_id.isin(selected_songs)],
                   left_on = 'user_id',
                   right_on = 'user_id')

In [198]:
prediction_df.user_id.unique().shape[0]

35253

In [225]:
prediction_df.head()

Unnamed: 0,index,user_id,song_id,num_plays
0,6041,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOACIPG12A8AE47E1C,1
1,6041,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOAHEEC12A6BD4DAA4,1
2,6041,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOAKQBB12A8C1413A0,1
3,6041,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOAVFMF12A6D4F92E6,1
4,6041,bd64f193f0f53f09d44ff48fd52830ff2fded392,SOBOLEI12A58A7E386,1


In [200]:
# Let's now find the most played songs in the prediction dataframe
predicted_songs = prediction_df.\
                    groupby(['song_id']).\
                    size().\
                    sort_values(ascending = False).\
                    head(20).\
                    to_frame("popularity").\
                    reset_index()

In [201]:
predicted_songs.head()

Unnamed: 0,song_id,popularity
0,SOEGIYH12A6D4FC0E3,8851
1,SOAUWYT12A81C206F1,8014
2,SOSXLTC12AF72A7F54,7285
3,SOBONKR12A58A7A7E0,6980
4,SOFRQTD12A81C233C0,6086


In [202]:
# Let's see the songs in a human readable way
predicted_songs.merge(songs,
                left_on = 'song_id',
                right_on = 'song_id',
                how = 'inner')[['artist', 'title']]

Unnamed: 0,artist,title
0,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...
1,Björk,Undo
2,Kings Of Leon,Revelry
3,Dwight Yoakam,You're The One
4,Harmonia,Sehr kosmisch
5,Alliance Ethnik,Représente
6,Cartola,Tive Sim
7,OneRepublic,Secrets
8,Lil Wayne / Eminem,Drop The World
9,Florence + The Machine,Dog Days Are Over (Radio Edit)


In [204]:
# Let's find out the most played songs in the whole dataset
most_popular_songs = data.\
                    groupby(['song_id']).\
                    size().\
                    sort_values(ascending = False).\
                    head(20).\
                    to_frame("popularity").\
                    reset_index()

In [207]:
most_popular_songs.merge(songs,
                left_on = 'song_id',
                right_on = 'song_id',
                how = 'inner')[['artist', 'title', 'popularity']]

Unnamed: 0,artist,title,popularity
0,Harmonia,Sehr kosmisch,110479
1,Björk,Undo,90476
2,Florence + The Machine,Dog Days Are Over (Radio Edit),90444
3,Dwight Yoakam,You're The One,84000
4,Kings Of Leon,Revelry,80656
5,OneRepublic,Secrets,78353
6,Barry Tuckwell/Academy of St Martin-in-the-Fie...,Horn Concerto No. 4 in E flat K495: II. Romanc...,69487
7,Charttraxx Karaoke,Fireflies,64229
8,Train,Hey_ Soul Sister,63809
9,Cartola,Tive Sim,58610


In [None]:
# The results are heavily affected by the popularity of the songs

In [212]:
# Let's see the songs of one of the mos similar users:
songs_of_user("b31888d485ddff26572ffdab1c947bcc067ff3a1")

Unnamed: 0,song_id,artist,title
0,SOAAIEC12AAA8C87CE,A Brand,Time
1,SOACHPF12A6D4FA64D,Pet Shop Boys,Left To My Own Devices (2001 Digital Remaster)
2,SOALAMU12AB018271A,Lagwagon,Razor Burn
3,SOANQFY12AB0183239,Muse,Uprising
4,SOAVGNJ12A58A7DA4D,Phil Collins,Something Happened On The Way To Heaven
5,SOAYAYU12AB018436A,This Or The Apocalypse,Memento Mori
6,SOAYZOQ12AB018C013,Six Ft Ditch,Six Feet Deep
7,SOBANAT12A6D4F7501,Red Hot Chili Peppers,Naked In The Rain (Album Version)
8,SOBAUXH12A67ADD86C,Ok Go,Here It Goes Again
9,SOBHXKP12AB017EC82,Fear Factory,Dog Day Sunrise (Album Version)


In [190]:
group = songs_by("Metallica")

In [191]:

group.head(100)

Unnamed: 0,song_id,title
1164,SOOEEPE12A8AE459A4,The Unforgiven III
1432,SOZATKE12A6D4F5915,2 X 4
1622,SOGAUIQ12A6D4F8262,Hit The Lights
1669,SOUGBIM12A6D4F8247,The Four Horsemen
1743,SOCHYVZ12A6D4F5908,Enter Sandman
1822,SOZDGEW12A8C13E748,One
1829,SOGMBXD12A6D4F5920,Ronnie
4779,SOJSRYJ12A6D4F824C,Phantom Lord
4789,SOMTBXX12AF729F5A6,Am I Evil?
4803,SORIEXB12A6D4F824D,No Remorse


In [177]:
group[group.title == "Jump (Album Version)"]

Unnamed: 0,song_id,title
60240,SOVMGEX12AC9070FF2,Jump (Album Version)
