# Mosaico Musical

### Musical recommender by Alberto Antón as a final project for the Master in Data Science of KSchool


In [21]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import io
import os
import sys
import random

In [14]:
# Display floats with 2 decimal places and a thousands separator,
# instead of scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)

In [22]:
# Set random seeds.
# The stdlib generator and NumPy's global generator are independent, so both are
# seeded: pandas sampling helpers such as DataFrame.sample draw from NumPy's
# global state when no random_state is passed.
SEED = 666
random.seed(SEED)
np.random.seed(SEED)

### Loading data

In [3]:
# Directory holding the input files; a relative path, so it is resolved against
# the notebook's working directory
data_root = "data"

In [5]:
# Read the training triplets: a tab-separated file without a header row,
# parsed into (user_id, song_id, num_plays) columns
columns = ['user_id', 'song_id', 'num_plays']
datafile = os.path.join(data_root, "train_triplets.txt")

data = pd.read_csv(datafile, names=columns, header=None, sep='\t')

In [6]:
# Preview the first five rows of the listening data
data.head(n=5)

Unnamed: 0,user_id,song_id,num_plays
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAPDEY12A81C210A9,1
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFNSP12AF72A0E22,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBFOVM12A58A7D494,1


In [15]:
# Summary statistics of the numeric columns (num_plays is the only one)
data.describe()

Unnamed: 0,num_plays
count,48373586.0
mean,2.87
std,6.44
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,9667.0


In [16]:
# Take a closer look at the distribution of the play counts on their own
data["num_plays"].describe()

count   48,373,586.00
mean             2.87
std              6.44
min              1.00
25%              1.00
50%              1.00
75%              3.00
max          9,667.00
Name: num_plays, dtype: float64

In [17]:
# At least half of the (user, song) pairs have a single play (median = 1), and there are
# very large outliers (max = 9,667), so we will not use the num_plays field.

We have now loaded the listening dataset; next, let's load the song information dataset.

In [25]:
# Each line of unique_tracks.txt is parsed as four '<SEP>'-delimited fields;
# the first one ("foo") is not needed, so only the remaining three are kept.
columns = ["foo", "song_id", "artist", "title"]
datafile = os.path.join(data_root, "unique_tracks.txt")

# A multi-character separator is only supported by the Python parsing engine.
all_songs = pd.read_csv(
    datafile,
    sep='<SEP>',
    engine="python",
    encoding="utf-8",
    header=None,
    names=columns,
    usecols=columns[1:],
)

In [26]:
# Inspect 20 randomly chosen songs
all_songs.sample(n=20)

Unnamed: 0,song_id,artist,title
889472,SOHSAMM12AB017DA2A,Antonis Remos,Tha 'Thela Na 'Soun Edo
723055,SOZJANK12A58A7BE46,Ultravox!,Just For A Moment
632442,SOAQFRZ12A6D4F893B,Larry McCray,Three Straight Days Of Rain
294544,SOIGFNZ12AB01807A9,Katatonia,The Longest Year
544428,SOHWGID12D02199F61,T-Pain featuring R. Kelly_ Pimp C (of UGK)_ To...,"I'm N Luv (Wit A Stripper) 2 - ""Tha Remix"" fea..."
740282,SOYBSLG12A6D4F4A8B,Sonya Kitchell,Let Me Go
732546,SOWFCKH12A8C1364E8,tobyMac,In The Light
554159,SOXESDD12A8C133A94,Juliet Jonesin Sydan,Uskonto_ laskento_ lukeminen_ laulu
845058,SOTNYZN12A81C20FFC,Blue Magic,Answer To My Prayer (LP Version)
2500,SOLBXJC12A8C1406FD,Renato Russo,Cathedral Song


In [27]:
# Per-column summary of the song metadata: non-null count, number of distinct
# values, most frequent value and its frequency
all_songs.describe()

Unnamed: 0,song_id,artist,title
count,1000000,1000000,999985
unique,999056,72665,702000
top,SONBEKD12AB01894DC,Michael Jackson,Intro
freq,3,194,1511


In [28]:
# Number of distinct song ids in the song metadata
len(all_songs["song_id"].unique())

999056

In [29]:
# The metadata covers about one million songs. Let's see how many distinct songs
# actually appear in the training dataset
len(data["song_id"].unique())

384546

We don't need information for so many songs, so let's create a dataframe with information only on the songs that are in the training dataframe.

In [30]:
# Column dtypes and memory footprint of the listening data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48373586 entries, 0 to 48373585
Data columns (total 3 columns):
user_id      object
song_id      object
num_plays    int64
dtypes: int64(1), object(2)
memory usage: 1.1+ GB
