# Install Java, Spark, and Findspark
This installs Apache Spark 2.3.2, Java 8, and [Findspark](https://github.com/minrk/findspark), a library that makes it easy for Python to find Spark.

In [0]:
import os
import time

# data science imports
import math
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# utils import
#from fuzzywuzzy import fuzz

# visualization imports
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

%matplotlib inline

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
!cp /content/gdrive/My\ Drive/MscsDs/msdchallenge/kaggle_visible_evaluation_triplets.txt .

In [0]:
!cp -r /content/gdrive/My\ Drive/MscsDs/MSongsDB-master/PythonSrc .

In [0]:
# ! cd /content/gdrive/My\ Drive/MscsDs/ && tar -zxvf millionsongsubset_full.tar.gz

In [0]:
import glob
files = glob.glob('/content/gdrive/My Drive/MscsDs/MillionSongSubset/data/A/*/*/*.h5')

In [142]:
len(files)

7620

In [0]:
import sys
sys.path.append('./PythonSrc')

import hdf5_utils as HDF5
import hdf5_getters as g

In [0]:
# Parallel per-song columns: entry i of every list comes from the same file.
artist_names = []
song_id = []
song_titles = []
song_durations = []
song_release_years = []  # year when this song was released, according to musicbrainz.org
song_hotness = []
song_tempo = []
song_energies = []
song_danceability = []
song_bars = []
song_beats = []
song_time_signatures = []
song_tatum = []
song_modes = []
song_keys = []
artist_familiarty = []
artist_hotness = []

for f in files:

    h5 = HDF5.open_h5_file_read(f)
    try:
        # Read every field before appending anything, so a getter that raises
        # cannot leave the lists with different lengths.
        song_idss = g.get_song_id(h5).decode('utf-8')
        song_title = g.get_title(h5).decode('utf-8')
        song_duration = g.get_duration(h5)
        song_release_year = g.get_year(h5)
        song_hot = g.get_song_hotttnesss(h5)
        song_speed = g.get_tempo(h5)
        song_energy = g.get_energy(h5)
        artist_name = g.get_artist_name(h5).decode('utf-8')
        song_dance = g.get_danceability(h5)
        song_bar = g.get_bars_start(h5)
        song_beat = g.get_beats_start(h5)
        song_time_signature = g.get_time_signature(h5)
        song_tat = g.get_tatums_start(h5)
        song_mode = g.get_mode(h5)
        song_key = g.get_key(h5)
        artist_familar = g.get_artist_familiarity(h5)
        artist_hot = g.get_artist_hotttnesss(h5)
    finally:
        # Release the file handle even when one of the getters fails.
        h5.close()

    song_id.append(song_idss)
    song_titles.append(song_title)
    song_durations.append(song_duration)
    song_release_years.append(song_release_year)
    song_hotness.append(song_hot)
    song_energies.append(song_energy)
    song_tempo.append(song_speed)
    artist_names.append(artist_name)
    song_danceability.append(song_dance)
    song_bars.append(song_bar)
    song_beats.append(song_beat)
    song_time_signatures.append(song_time_signature)
    song_tatum.append(song_tat)
    song_modes.append(song_mode)
    song_keys.append(song_key)
    artist_familiarty.append(artist_familar)
    artist_hotness.append(artist_hot)

In [145]:
song_id[2]

'SODRPJT12AC468DEF1'

In [0]:
# Column name -> list of per-song values gathered from the HDF5 files.
data = {
    'song_ids': song_id,
    'song_titles': song_titles,
    'song_durations': song_durations,
    'song_realease_years': song_release_years,
    'artist_names': artist_names,
    'song_hotness': song_hotness,
    'song_tempo': song_tempo,
    'song_energies': song_energies,
    'song_danceability': song_danceability,
    'song_bars': song_bars,
    'song_beats': song_beats,
    'song_time_signatures': song_time_signatures,
    'song_tatum': song_tatum,
    'song_modes': song_modes,
    'song_keys': song_keys,
    'artist_familiarty': artist_familiarty,
    'artist_hotness': artist_hotness,
}

In [0]:
# To use the data loaded from the HDF5 files
# df=pd.DataFrame(data)

# To save the data to a CSV file
# NOTE: to_csv writes the row index as an extra unnamed column by default;
# pass index=False to stop 'Unnamed: N' columns piling up each time the CSV
# is read back and saved again.
# df.to_csv('/content/gdrive/My Drive/MscsDs/MillionSongSubset/msd.csv', sep=',')

Load the data from the previously saved CSV file.

In [0]:
# Load the previously exported song table from Drive.
# NOTE(review): the file also carries 'Unnamed: 0*' columns, presumably row
# indexes written by earlier saves; select columns by name, not position.
df = pd.read_csv('/content/gdrive/My Drive/MscsDs/MillionSongSubset/msd.csv')

In [5]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,song_ids,song_titles,song_durations,song_realease_years,artist_names,song_hotness,song_tempo,song_energies,song_danceability,song_bars,song_beats,song_time_signatures,song_tatum,song_modes,song_keys,artist_familiarty,artist_hotness
0,0,0,SONHTWP12A8C142744,Vals '89,162.58567,0,Manolis Famellos & I Podilates,,94.301,0.0,0.0,[ 1.45502 3.39383 5.32826 7.25953 9.1...,[ 0.16441 0.80395 1.45502 2.09102 2.7...,3,[ 0.16441 0.48418 0.80395 1.13522 1.4...,1,0,0.22873,0.0
1,1,1,SOPIHMB12AC468E0DD,Ghost of the Ocean (Live),214.25587,0,Uriah Heep,0.2502,149.921,0.0,0.0,[ 1.20218 2.80457 4.39266 5.98514 7.5...,[ 0.38752 0.79653 1.20218 1.60244 2.0...,4,[ 0.2523 0.38752 0.52273 ... 212.97102 2...,1,7,0.673239,0.479205
2,2,2,SODRPJT12AC468DEF1,The frog song,169.87383,0,Alain-François,,136.044,0.0,0.0,[ 1.37397 4.48312 7.56804 10.65675 13.7...,[ 0.48609 0.93058 1.37397 1.81958 2.2...,7,[ 0.26328 0.48609 0.70778 0.93058 1.1...,1,2,0.380317,0.300918
3,3,3,SOLLOTO12AB01804C6,Deep Blue Sea (Daniel Rossen home recording),351.50322,2008,Grizzly Bear,,85.003,0.0,0.0,[ 0.72449 3.55798 6.37712 9.20176 12.0...,[ 0.72449 1.43405 2.1445 2.85228 3.5...,4,[1.8653000e-01 3.6881000e-01 5.4665000e-01 ......,1,9,0.760636,0.547244
4,4,4,SOCNYGS12AB01832B8,La vérité,252.44689,2007,Annie Blanchard,0.0,113.287,0.0,0.0,[2.2334000e-01 2.3324800e+00 4.4484000e+00 6.5...,[2.2334000e-01 7.5000000e-01 1.2775800e+00 1.8...,4,[2.2334000e-01 3.9890000e-01 5.7445000e-01 ......,1,5,0.479735,0.35246


In [6]:
nd=df[['song_ids','song_titles','artist_names']]
nd.head()

Unnamed: 0,song_ids,song_titles,artist_names
0,SONHTWP12A8C142744,Vals '89,Manolis Famellos & I Podilates
1,SOPIHMB12AC468E0DD,Ghost of the Ocean (Live),Uriah Heep
2,SODRPJT12AC468DEF1,The frog song,Alain-François
3,SOLLOTO12AB01804C6,Deep Blue Sea (Daniel Rossen home recording),Grizzly Bear
4,SOCNYGS12AB01832B8,La vérité,Annie Blanchard


In [7]:
nd['song_titles'].value_counts()

Intro                             12
Hey Joe                            5
Smile                              5
Wave                               4
Outro                              4
                                  ..
In The Midst                       1
Provisoirement                     1
Chanson sous la pluie              1
Tropical Nights                    1
Jetstream - Arthur Baker Remix     1
Name: song_titles, Length: 7438, dtype: int64

In [9]:
users = pd.read_table('./kaggle_visible_evaluation_triplets.txt', sep='\t', names= ['user_id','song_ids','play_count'])
users.head()

Unnamed: 0,user_id,song_ids,play_count
0,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOBONKR12A58A7A7E0,1
1,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOEGIYH12A6D4FC0E3,1
2,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOFLJQZ12A6D4FADA6,1
3,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SOHTKMO12AB01843B0,1
4,fd50c4007b68a3737fe052d5a4f78ce8aa117f3d,SODQZCY12A6D4F9D11,1


In [10]:
uniqueUsers = users['user_id'].unique()
len(uniqueUsers)

110000

In [11]:
users['user_id'].nunique()

110000

In [12]:
users["play_count"].describe()

count    1.450933e+06
mean     3.187149e+00
std      7.051664e+00
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      3.000000e+00
max      9.230000e+02
Name: play_count, dtype: float64

In [13]:
users["song_ids"].nunique()

163206

In [14]:
# Attach title/artist to every play-count triplet whose song is in `nd`.
# The join key is spelled out rather than left to pandas' default of
# "every column name the two frames share".
knnData = pd.merge(users, nd, on='song_ids', how='inner')
knnData

Unnamed: 0,user_id,song_ids,play_count,song_titles,artist_names
0,c34670d9c1718361feb93068a853cead3c95b76a,SOEHWGF12A6D4F8B2B,3,Hips Don't Lie (featuring Wyclef Jean),Shakira Featuring Wyclef Jean
1,c759e740af57c477fe358e62ad7b3b1f2f113a2f,SOEHWGF12A6D4F8B2B,6,Hips Don't Lie (featuring Wyclef Jean),Shakira Featuring Wyclef Jean
2,8f6e833948caf895a3281ded74ceb32783916e42,SOEHWGF12A6D4F8B2B,1,Hips Don't Lie (featuring Wyclef Jean),Shakira Featuring Wyclef Jean
3,8cd07652ccba3754502c8fa8fe809c418fdcdc8f,SOEHWGF12A6D4F8B2B,2,Hips Don't Lie (featuring Wyclef Jean),Shakira Featuring Wyclef Jean
4,563e8088cbb8affb096d2334531624a8cf7ff3d2,SOEHWGF12A6D4F8B2B,5,Hips Don't Lie (featuring Wyclef Jean),Shakira Featuring Wyclef Jean
...,...,...,...,...,...
18127,a4a85f48a7d787b6e17ed38001abcd032cb87c6a,SOKLIMD12A8C145AC3,1,Whiskey on the Fire,Aaron Watson
18128,90bac2ccc1660cf444569f3553e6b141d132b2c3,SONCEJF12AB018581C,1,Jamming,Twinkle Twinkle Little Rock Star
18129,46a363f17bd4e07d2c607779877baedafc07e7d7,SOKWNWI12A8C13383F,1,Bianca's Midnight Lullaby,Al Di Meola
18130,8ecfca10c16f046ae53634347d6b9029c40425c0,SONQKDG12A8C130D96,1,Isabel,Juana Molina


In [15]:
knnData["user_id"].nunique()

16083

In [16]:
knnData["song_ids"].nunique()

1311

In [17]:
knnData["play_count"].describe()

count    18132.000000
mean         2.999007
std          7.307306
min          1.000000
25%          1.000000
50%          1.000000
75%          3.000000
max        597.000000
Name: play_count, dtype: float64

In [0]:
# Keep the first row for each (user, title, artist) and renumber rows from 0.
song = (
    knnData
    .drop_duplicates(subset=['user_id', 'song_titles', 'artist_names'])
    .reset_index(drop=True)
)

In [19]:
song.head()

Unnamed: 0,user_id,song_ids,play_count,song_titles,artist_names
0,c34670d9c1718361feb93068a853cead3c95b76a,SOEHWGF12A6D4F8B2B,3,Hips Don't Lie (featuring Wyclef Jean),Shakira Featuring Wyclef Jean
1,c759e740af57c477fe358e62ad7b3b1f2f113a2f,SOEHWGF12A6D4F8B2B,6,Hips Don't Lie (featuring Wyclef Jean),Shakira Featuring Wyclef Jean
2,8f6e833948caf895a3281ded74ceb32783916e42,SOEHWGF12A6D4F8B2B,1,Hips Don't Lie (featuring Wyclef Jean),Shakira Featuring Wyclef Jean
3,8cd07652ccba3754502c8fa8fe809c418fdcdc8f,SOEHWGF12A6D4F8B2B,2,Hips Don't Lie (featuring Wyclef Jean),Shakira Featuring Wyclef Jean
4,563e8088cbb8affb096d2334531624a8cf7ff3d2,SOEHWGF12A6D4F8B2B,5,Hips Don't Lie (featuring Wyclef Jean),Shakira Featuring Wyclef Jean


In [20]:
# User x song table of play counts; pairs with no recorded play become 0.
song_pivot = (
    song
    .pivot(index='user_id', columns='song_ids', values='play_count')
    .fillna(0)
)
song_pivot.shape

(16083, 1311)

In [21]:
song_pivot.head()

song_ids,SOAAAQN12AB01856D3,SOAAEHR12A6D4FB060,SOAASSD12AB0181AA6,SOABLAF12AB018E1D9,SOABTKM12A8AE4721E,SOABVWD12A58A7C3FF,SOACEDS12A6701EAAA,SOACUIU12A6D4F715B,SOADJND12A8C13BAE1,SOAECHX12A6D4FC3D9,SOAEEXM12A8C13D6E3,SOAEGMV12A8C1455F2,SOAEJTI12A8C145051,SOAESFB12A8C13573F,SOAFUPM12A6D4FD26F,SOAGXBV12A6D4F70BD,SOAGYNF12AB0187EEE,SOAHDOA12A6D4FA830,SOAHRLX12A58A7C372,SOAHYMX12AB0182021,SOAIEMM12AB017EC98,SOAIJKC12A8C131D6F,SOAIOOJ12A8C1385D2,SOAJEOO12A8C13A2FF,SOAKDLX12A67ADAFC5,SOAKODQ12A58A7B461,SOAKPEL12A6D4FC126,SOAMXYQ12AB017C25E,SOANUME12A8C1368DF,SOAOOXO12AF72A787F,SOAOXTG12A67021AA7,SOAPEEL12A81C21ADE,SOAPIHC12A6D4F5FD5,SOAPLUZ12A6D4F9CD2,SOAPQCG12AB017AAAB,SOAPZIG12A6D4F7920,SOARSSE12A58A7BA9A,SOASNQV12A8AE48819,SOATHUP12A8C131DC3,SOAUFFQ12A8C134623,...,SOZCURA12AF72A17B1,SOZDDPQ12AF72A8BA8,SOZEQVC12AC468E38B,SOZFGOG12A58A7F755,SOZFKKI12AB018602A,SOZFZMX12AB018BE08,SOZGQLE12AB018672C,SOZGUEI12A8C139953,SOZHSPI12A8C1339E6,SOZIIHW12AB0189AFA,SOZIMLB12AF72AD1CD,SOZKFHV12A6D4F996F,SOZKYCJ12A6D4F480A,SOZLSBK12A6D4F87B8,SOZMELK12A8C143532,SOZMHAO12A8C141C2A,SOZMMFV12AB0183F74,SOZNESJ12A6D4FD187,SOZNQQQ12AF72ABF9F,SOZPRDS12A8C134E2E,SOZQDRV12A8C133FB1,SOZQSGL12AF72A9145,SOZQVTJ12A6701D96B,SOZSHWZ12A8C1452B5,SOZSKYN12A8C13A883,SOZSSCO12AF72A554D,SOZTOSX12AC468AD34,SOZTUDR12A8C13FE3F,SOZUEFV12A8C141169,SOZUOAK12AB018AA82,SOZVTZB12A58A7AEE4,SOZVWSE12A6D4F7ADA,SOZVZWP12A58A7BAD1,SOZWAMN12AB018234D,SOZWECJ12A6D4F5229,SOZXHBQ12AB0186626,SOZYUKG12A6D4FB64F,SOZZPYH12AB0187578,SOZZQBH12A6D4FAFD8,SOZZVMW12AB0183B52
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
00020fcd8b01986a6a85b896ccde6c49f35142ad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000724134199acaa98869fc4ffb805198d2940eb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0011d5f4fb02ff276763d385c3f2ded2b00ad94a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001a4329228373ec5d4efe1084317b4009df7b97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00200865ee578746cc29b28d2c14a81c74895a5f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
from sklearn.neighbors import NearestNeighbors

# Cosine-distance neighbour search over the rows (users) of song_pivot.
knn = NearestNeighbors(
    metric='cosine',
    n_neighbors=10,
)
model_user = knn.fit(song_pivot)

In [0]:
# for i in range(0, len(distances.flatten())):
#       if i == 0:
#           print("Searching recommendation for user: ", song_pivot.index[query_index])
#       else:
#           rows = knnData.loc[knnData['user_id'] == song_pivot.index[indices.flatten()[i]] ]
#           for item in rows.values:      
#             print("\n  User: ", item[0])
#             print("    Song: ", item[2])  
#             print("    Play Count: ",  item[3])

In [0]:
def findRecomendationsUsers(model, matrix, data, query_index, n_neighbors=10):
  """Print the songs played by the users nearest to one query user.

  Args:
    model: fitted neighbour model exposing ``kneighbors(X, n_neighbors=...)``.
    matrix: DataFrame whose rows are the vectors searched by ``model``; its
      index holds the user ids.
    data: DataFrame with ``user_id``, ``play_count`` and ``song_titles``
      columns.
    query_index: positional row of the query user in ``matrix``.
    n_neighbors: number of neighbours requested from ``model``. One slot is
      reserved for the query user, so at most ``n_neighbors - 1`` users are
      printed.
  """
  query_vector = matrix.iloc[query_index, :].values.reshape(1, -1)
  distances, indices = model.kneighbors(query_vector, n_neighbors=n_neighbors)

  print("Searching recommendation for user: ", matrix.index[query_index])

  # Remove the query user by row position rather than by rank: when several
  # users are at the same distance the query is not guaranteed to come first.
  neighbour_positions = [pos for pos in indices.flatten() if pos != query_index]

  for pos in neighbour_positions[:n_neighbors - 1]:
    rows = data.loc[data['user_id'] == matrix.index[pos]]
    # Named columns instead of positional access, so a different column
    # order in `data` cannot swap the printed fields.
    fields = rows[['user_id', 'play_count', 'song_titles']]
    for user, plays, title in fields.itertuples(index=False, name=None):
      print("\n  User: ", user)
      print("    Play Count: ", plays)
      print("    Song: ", title)

In [27]:
query_index = np.random.choice(song_pivot.shape[0])
query_index

11973

In [0]:
# Some query index with unique recommendations
# query_index = 1134
# query_index 1277

In [29]:
findRecomendationsUsers(model_user, song_pivot, knnData, query_index)

Searching recommendation for user:  bcd2779b095b6da759ce245e262d5be5ece3a020

  User:  8124ed9c91410c7cdd0eaeaa6c03e20858c16cbf
    Play Count:  1
    Song:  Dimension

  User:  44efca4a32ec10570415ff27d750bcb7317a7537
    Play Count:  1
    Song:  Dimension

  User:  cfa7f2cce2d8ec824d67708e0fc7506ab02d8fd2
    Play Count:  2
    Song:  Dimension

  User:  b239d5d51be091b059a19c8715efc005f896c78c
    Play Count:  1
    Song:  Dimension

  User:  0d50e72b0e312a574a99b94da96f7796740d7e6d
    Play Count:  1
    Song:  Dimension

  User:  9e66bf7016204731edd27cdcf37a9168091a0326
    Play Count:  1
    Song:  Dimension

  User:  18553a9964e4b7019f6ec873e7d4915bc4444c2f
    Play Count:  1
    Song:  Dimension

  User:  440924f2c522a9c2539abaf58f24fff1d76723f1
    Play Count:  1
    Song:  Dimension

  User:  93a0369cedca0b2ce753ba61d011d683b5ad85a1
    Play Count:  1
    Song:  Dimension


In [0]:
# Total play count per song, summed over all users.
song_plays = (
    users
    .groupby(by=['song_ids'])['play_count']
    .sum()
    .rename('total_song_plays')
    .reset_index()
    [['song_ids', 'total_song_plays']]
)

In [31]:
song_plays.describe()

Unnamed: 0,total_song_plays
count,163206.0
mean,28.334375
std,215.826789
min,1.0
25%,1.0
50%,5.0
75%,15.0
max,35432.0


# Item-Based Collaborative Filtering

In [0]:
from scipy.sparse import csr_matrix

In [0]:
# Song x user table of play counts (a dense DataFrame, despite the name);
# pairs with no recorded play become 0.
sparse_matrix = (
    song
    .pivot(index='song_ids', columns='user_id', values='play_count')
    .fillna(0)
)

In [34]:
sparse_matrix

user_id,00020fcd8b01986a6a85b896ccde6c49f35142ad,000724134199acaa98869fc4ffb805198d2940eb,0011d5f4fb02ff276763d385c3f2ded2b00ad94a,001a4329228373ec5d4efe1084317b4009df7b97,00200865ee578746cc29b28d2c14a81c74895a5f,00209c99d83b405d47fe87f6761dbf7d259ca856,0027bd60fea07d48fa336a979f9fa439bebb44fb,0029b299091e9e9040761838785a0689a5f67215,002ace7725512ef9180af3c8a0fbf0e1071b72d9,003032fc00365d6f15f047c63717a14e0d8aced7,0032dddb264b47d623c93833a18007a788102432,0034614d935784ee588b28dff22a4882a12cb401,00365d3bf40b02ebd932bd43e7b463495a46ad68,00387504e8fc8aa34827a637dea25000e86e85a3,0038d0eaac8e3e132e7199611a8add77815f3d86,0049335750252b238b7ed2d2018033c1ebd1e7ab,00520d79b219630ec27d8d8b2a03997685284440,00539b30daa6999e65cc44867acac6c336004a02,00545af48096210f97930b66f6661df88729900f,005aa4c4bc1dc74114b8bd6ffdca64c1d218474f,005f71ae59d898709e7b88002c3be4d2d599a66f,00629154d40c2100fb492508b4c535c661718da4,006665569461ff1711a0f3f29c47b5e7ab43edb2,006771ed630b23afa026256aa70ea4bbde319097,0068bf86cce92b76b987c62578735c51b2bec607,006a47b6c163573d750abcc50683f52448b59d76,0072963112236d305279a9fed2b195abd365a806,007308634313ec6dcb20b55581d6e4ad1dcca8b7,00766ba637c5dbdb52ff0a5cbf13229e2135d0c2,007835ec61401b07b4a61c3164556a7c5056ba2f,007bf363856974d0bd57d3e4cdc93423769361f3,007e450faa8daa47362a2a3f5d0a6f4ecd535f3f,007f4e112f22e32bd1f3ef634afb30bf7262211a,0086a41ed007814cb836ce8a541a92c68ab3c688,0088bbf8870349910bc13612ac74384397d01f16,0088ee55a3c5834bce7e010af03680820bad5f1d,0091d5d21c0f709d513d79bc17952e5ac0cc56a1,00949b7e0d675efa40992a0e3f66db26b7a27b08,009d9c2babc0c05aa8a4419ad00f5105aa99c5e1,00a36168ed80334a6816e1feefd5b40aba212e31,...,ff5f55c48e94d0da039a0d40e03e2f254f5195bb,ff63ab995d2d5b87ced0f93dc3b4d13db02c2491,ff65ec4a9765bcc63578aaea2eb68e502e462e2c,ff66c1e5c1fb424e06c761c95048c9ec4faa8070,ff6bd57fa3d8860553ec778e24d73e775b280fc9,ff6d36e7292335ea32e89b19d269a2d760a91a49,ff6e96b1129b9f6330d0f1e59be39751e175ee4f,ff77fea5a8d84f5109de17185e2a7fb5d7e2a467,ff7a04e8400800d26e52
2e436a088b1251a4cdf6,ff80830a101547b5c112e72e939dde33f0a68a1b,ff814d959cc4b515fa7dccffb1355104791c60ed,ff81bdc5042aabf6687b0322acc5803c2cb4d912,ff85e05b8d2d01540cf41260df2532c4f16510d0,ff8891b901eed1a3abc9bd4c53a17d2bcdc0ead7,ff8a343fa2e3b0f61f3728dec69eae6ad137528d,ff8c7b25076f76e2ba5bfe62b094b95b50daa0ed,ff8e0fe264adf90b61d2146b51f336abea8c6030,ff93375e8105444419de44b55658d0dbe1c883d7,ffa9ab7e3ff7b5d4fbac66bc9997949a9e721a91,ffaa6d6870522d52107eb2cbbcfe7d1634400111,ffbc11b17e88f2af9e7235630078d78d003f526a,ffbfe37dc0f8454d4c7ad2061fb69287d1dc47f2,ffc31c581692545c74b2508f46b83fac136f1a88,ffc683d098d818421f22363b150a19fd7d307764,ffd4cb97d77ee06adf29b0632350589404552b8e,ffda55050d450037e24b67b8e0f48f4f16a03872,ffdfb7f168ab6dc11c03a3af8684ff736aa06f04,ffe0d010c60ef28ef5e6923e758f514e755adfcf,ffe33c42d3c85ad750326fa0ae45cc3b84d58f15,ffe5a0afac73eefdc4e6cfb0a40df22480ed4ce2,ffe5b8df7ba3210f4724fe8d9eeca0c435bcfab9,ffe6cff4a35b9fa9bc77ce9f5c62803521293f0f,ffe70ee0afa3930edf9bca702cb3982d3ff747be,ffe98240b89fb05e1997ed4933b544d434f108b1,ffec87ea10f29a7103edd660c45c65527b1c4b9d,ffee916020d891a74264b8326829742333b79103,ffef5c05e4f25af2e8e9880af84d66936056b597,fff24c114beca48744a1ca2b699dae8f42398ad1,fff7e94f0a8d3c6c705b5f7c677d61da82aa545b,fffcfe9b89b791e39ff3e643aa57ae9303079c56
song_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
SOAAAQN12AB01856D3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAAEHR12A6D4FB060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOAASSD12AB0181AA6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOABLAF12AB018E1D9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOABTKM12A8AE4721E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SOZXHBQ12AB0186626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOZYUKG12A6D4FB64F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOZZPYH12AB0187578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SOZZQBH12A6D4FAFD8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
# Convert the dense song x user table to SciPy CSR format; row order matches
# sparse_matrix.index because only the underlying values are passed.
sparse_matrix_sparse = csr_matrix(sparse_matrix.values)

In [0]:
from sklearn.neighbors import NearestNeighbors

# Cosine-distance neighbour search over the rows (songs) of the CSR matrix.
knn = NearestNeighbors(
    metric='cosine',
    n_neighbors=10,
)
knn_model = knn.fit(sparse_matrix_sparse)

In [0]:
# def findRecomendations(model, matrix, query_index):
#   distances, indices = model.kneighbors([matrix.iloc[query_index, :]], n_neighbors = 10)
#   for i in range(0, len(distances.flatten())):
#       if i == 0:
#           print(sparse_matrix.index[query_index], nd.loc[nd['song_ids'] == sparse_matrix.index[query_index]].song_titles.values[0])
#       else:
#           print(i, sparse_matrix.index[indices.flatten()[i]], nd.loc[nd['song_ids'] == sparse_matrix.index[indices.flatten()[i]] ].song_titles.values[0], distances.flatten()[i])

In [0]:
def findRecomendations(model, matrix, query_index, n_neighbors=10, songs=None):
  """Print the songs nearest to one query song.

  Args:
    model: fitted neighbour model exposing ``kneighbors(X, n_neighbors=...)``.
    matrix: DataFrame whose rows are the vectors searched by ``model``; its
      index holds the song ids.
    query_index: positional row of the query song in ``matrix``.
    n_neighbors: number of neighbours requested from ``model``. One slot is
      reserved for the query song, so at most ``n_neighbors - 1``
      recommendations are printed.
    songs: DataFrame with ``song_ids``, ``song_titles`` and ``artist_names``
      columns. Defaults to the notebook-level ``nd`` table.

  Raises:
    IndexError: if a song id from ``matrix`` has no row in ``songs``.
  """
  if songs is None:
    songs = nd

  query_vector = matrix.iloc[query_index, :].values.reshape(1, -1)
  distances, indices = model.kneighbors(query_vector, n_neighbors=n_neighbors)

  rowDetails = songs.loc[songs['song_ids'] == matrix.index[query_index]]
  print("Song: ", rowDetails.song_titles.values[0], ", Artist: ",  rowDetails.artist_names.values[0] )

  # Remove the query song by row position rather than by rank: with tied
  # distances the query is not guaranteed to be the first neighbour returned.
  neighbours = [
      (pos, dist)
      for pos, dist in zip(indices.flatten(), distances.flatten())
      if pos != query_index
  ]

  for rank, (pos, dist) in enumerate(neighbours[:n_neighbors - 1], start=1):
    rowDetails = songs.loc[songs['song_ids'] == matrix.index[pos]]
    print(" Recommendation", rank, ": ")
    print("   Song: ", rowDetails.song_titles.values[0])
    print("   Artist: ",  rowDetails.artist_names.values[0])
    print("   Distance: ", dist)

In [0]:
# A random row position is drawn and then immediately replaced by a fixed one,
# so only the final assignment (111) is in effect after this cell.
query_index = np.random.choice(sparse_matrix.shape[0])
query_index  # bare expression that is not the cell's last statement: nothing is displayed by default
query_index = 111

In [40]:
findRecomendations(knn_model, sparse_matrix, query_index)

Song:  Meet Me In The Bathroom , Artist:  The Strokes
 Recommendation 1 : 
   Song:  Between Love & Hate
   Artist:  The Strokes
   Distance:  0.7475532789460535
 Recommendation 2 : 
   Song:  Ahead By A Century
   Artist:  The Tragically Hip
   Distance:  0.9814504441695933
 Recommendation 3 : 
   Song:  Human Being
   Artist:  The New York Dolls
   Distance:  0.9836043541054011
 Recommendation 4 : 
   Song:  Dungeon Master
   Artist:  EPMD / Nocturnal
   Distance:  0.9848730148632904
 Recommendation 5 : 
   Song:  Brother
   Artist:  Alice In Chains
   Distance:  0.9891693927785223
 Recommendation 6 : 
   Song:  Welcome
   Artist:  Phil Collins
   Distance:  0.9974236728013324
 Recommendation 7 : 
   Song:  15 Step
   Artist:  Radiohead
   Distance:  0.9981707181046349
 Recommendation 8 : 
   Song:  Pero Me Acuerdo De Tí
   Artist:  Christina Aguilera
   Distance:  0.9989296116164493
 Recommendation 9 : 
   Song:  Don't Worry About It (Edited)
   Artist:  N.E.R.D.
   Distance:  1.0


In [0]:
# Ask the item model for the 6 nearest rows to the query song's vector;
# both results come back as 2-D arrays with a single row.
distances, indices = knn_model.kneighbors([sparse_matrix.iloc[query_index, :]], n_neighbors = 6)

In [42]:
distances, indices 

(array([[0.        , 0.74755328, 0.98145044, 0.98360435, 0.98487301,
         0.98916939]]), array([[ 111, 1235,  689,  349,  425,  760]]))

In [43]:
# Rank 0 is reported as the query song; every later rank is printed as a
# numbered recommendation with its distance.
for i in range(len(distances.flatten())):
    if i > 0:
        rowDetails = nd.loc[nd['song_ids'] == sparse_matrix.index[indices.flatten()[i]]]
        print(" Recommendation", i, ": ")
        print("   Song: ", rowDetails.song_titles.values[0])
        print("   Artist: ",  rowDetails.artist_names.values[0])
        print("   Distance: ", distances.flatten()[i])
        continue
    rowDetails = nd.loc[nd['song_ids'] == sparse_matrix.index[query_index]]
    print("Song: ", rowDetails.song_titles.values[0], ", Artist: ",  rowDetails.artist_names.values[0] )

Song:  Meet Me In The Bathroom , Artist:  The Strokes
 Recommendation 1 : 
   Song:  Between Love & Hate
   Artist:  The Strokes
   Distance:  0.7475532789460535
 Recommendation 2 : 
   Song:  Ahead By A Century
   Artist:  The Tragically Hip
   Distance:  0.9814504441695933
 Recommendation 3 : 
   Song:  Human Being
   Artist:  The New York Dolls
   Distance:  0.9836043541054011
 Recommendation 4 : 
   Song:  Dungeon Master
   Artist:  EPMD / Nocturnal
   Distance:  0.9848730148632904
 Recommendation 5 : 
   Song:  Brother
   Artist:  Alice In Chains
   Distance:  0.9891693927785223


In [0]:
# nd

In [0]:
# nd['artist_names'].nunique()

In [0]:
# s = nd['song_ids']

In [0]:
# nd['song_ids'] == 'SOOSYMY12AB01888CD'

In [0]:
# print(nd.loc[nd['song_ids'] == 'SOOSYMY12AB01888CD', 'song_titles'].values[0])