### Importing libraries

In [1]:
import numpy as np
import pandas as pd

### Getting data

In [2]:
artists_filepath = r'C:\Users\ANIMESH\Documents\Python Scripts\datasets\artists.csv'
data_filepath = r'C:\Users\ANIMESH\Documents\Python Scripts\datasets\scrobbler-small-sample.csv'
artists = pd.read_csv (artists_filepath, header=None)
data = pd.read_csv (data_filepath)

### Seeing the data

In [3]:
print (len (artists), artists.shape)
print (artists.columns, type (artists))
artists.set_axis (['Artists'], axis = 1, inplace = True)
artists.head ()

111 (111, 1)
Int64Index([0], dtype='int64') <class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Artists
0,Massive Attack
1,Sublime
2,Beastie Boys
3,Neil Young
4,Dead Kennedys


In [4]:
print (data.info ())
print (data.columns)
print (data.index)
print (data.shape)
print (data['artist_offset'].unique ().min (), data['artist_offset'].unique ().max ())
data.head ()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2894 entries, 0 to 2893
Data columns (total 3 columns):
user_offset      2894 non-null int64
artist_offset    2894 non-null int64
playcount        2894 non-null int64
dtypes: int64(3)
memory usage: 67.9 KB
None
Index(['user_offset', 'artist_offset', 'playcount'], dtype='object')
RangeIndex(start=0, stop=2894, step=1)
(2894, 3)
0 110


Unnamed: 0,user_offset,artist_offset,playcount
0,1,79,58
1,1,84,80
2,1,86,317
3,1,89,64
4,1,96,159


### Creating the sparse matrix

Had some confusion about the shape of the matrix. 

Initialy created it with shape (111, 500), but saw that had to make one of (500, 111).

Did have to transpose it later (before giving it to NMF).

*Was trying to create a 111 x 500 sparse matrix which was impossible (due to the nature of the data), but couldv'e just transposed it later...*

In [5]:
# creating a sparse matrix of artists and users
print (len (data['user_offset'].unique ()))
n_users = len (data['user_offset'].unique ())
sparse = np.zeros ((n_users, artists.shape[0])) # 500 x 111
print (sparse.shape)

for i in range (len (data)):
    sparse[data.iloc[i, 0], data.iloc[i, 1]] = data.iloc[i, 2]

500
(500, 111)


**Tried using the provided sparse matrix...**

In [6]:
newdata_filepath = r'C:\Users\ANIMESH\Documents\Python Scripts\datasets\artist_sparse_data.csv'
new_sparse = pd.read_csv (newdata_filepath)
new_sparse.head ()

Unnamed: 0,Name,0,1,2,3,4,5,6,7,8,...,490,491,492,493,494,495,496,497,498,499
0,Massive Attack,0.0,0.0,105.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Sublime,128.0,211.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,270.0,0.0,105.0,97.0,0.0,0.0,0.0
2,Beastie Boys,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Neil Young,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Dead Kennedys,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


*Transposed*

In [7]:
sparse = sparse.T # 111 x 500

### Fitting and Transforming the matrix

In [8]:
from sklearn.decomposition import NMF

model = NMF (n_components = 4)

nmf_features = model.fit_transform (sparse)

In [9]:
print (nmf_features, nmf_features.shape)

[[  4.44507767e-02   4.67270186e-02   4.66015950e-01   3.40299348e-01]
 [  7.02034491e-02   7.30981107e-02   4.31529322e-01   2.20673005e-01]
 [  7.91129381e-01   5.23437084e-03   4.23971844e-01   5.12380838e-01]
 [  8.50975570e-02   0.00000000e+00   5.70670346e+00   0.00000000e+00]
 [  1.62843484e-02   1.19586357e-02   2.12926592e-01   2.05269491e-01]
 [  6.83715871e-02   3.44198633e-02   3.74099965e-01   5.87600857e-01]
 [  1.60940045e-02   5.82853479e-03   1.25908277e-01   4.62074195e-03]
 [  0.00000000e+00   8.33301922e-02   7.07844963e-01   5.13060956e-02]
 [  4.40663117e-02   2.25901960e-02   3.05419777e-01   0.00000000e+00]
 [  4.19253405e-01   2.05457896e-02   2.24705971e+00   3.82228299e+00]
 [  4.09245992e-02   1.07791862e-02   7.32208466e-01   1.32199526e+00]
 [  1.09141994e-02   3.51423157e-05   4.90181228e-03   1.46756405e-01]
 [  4.23684960e-01   6.79091353e-03   1.00996994e+00   8.03145571e+00]
 [  6.67304198e-02   6.45017175e-03   4.97122563e-01   7.69318637e-01]
 [  1.

### Scaling the matrix

In [10]:
from sklearn.preprocessing import StandardScaler

scale = StandardScaler ()

scaled = scale.fit_transform (nmf_features)

In [11]:
print (scaled, scaled.shape)

[[ -1.41236115e-01  -1.16975712e-01  -1.86400330e-01  -2.95033030e-01]
 [ -1.38803142e-01  -1.14381727e-01  -1.90265904e-01  -3.11284932e-01]
 [ -7.06939659e-02  -1.21057126e-01  -1.91113014e-01  -2.71654804e-01]
 [ -1.37396027e-01  -1.21572003e-01   4.01023374e-01  -3.41264585e-01]
 [ -1.43897126e-01  -1.20395695e-01  -2.14768877e-01  -3.13377585e-01]
 [ -1.38976206e-01  -1.18186302e-01  -1.96703106e-01  -2.61435747e-01]
 [ -1.43915109e-01  -1.20998681e-01  -2.24522677e-01  -3.40636831e-01]
 [ -1.45435583e-01  -1.13375251e-01  -1.59293945e-01  -3.34294367e-01]
 [ -1.41272437e-01  -1.19349924e-01  -2.04401403e-01  -3.41264585e-01]
 [ -1.05826793e-01  -1.19551022e-01   1.32351794e-02   1.78013774e-01]
 [ -1.41569249e-01  -1.20511711e-01  -1.56563063e-01  -1.61664191e-01]
 [ -1.44404469e-01  -1.21568546e-01  -2.38086178e-01  -3.21326913e-01]
 [ -1.05408124e-01  -1.20904017e-01  -1.25429042e-01   7.49853252e-01]
 [ -1.39131255e-01  -1.20937534e-01  -1.82913619e-01  -2.36748378e-01]
 [ -1.

### Normalizing...

In [12]:
from sklearn.preprocessing import Normalizer

norm = Normalizer ()

normalized = norm.fit_transform (scaled)

In [13]:
print (normalized, normalized.shape)

[[-0.35825443 -0.29671637 -0.47281635 -0.74837013]
 [-0.34124582 -0.28120607 -0.46776638 -0.76529017]
 [-0.19609016 -0.33578695 -0.53010722 -0.75351317]
 [-0.24639778 -0.21801993  0.71917124 -0.61200341]
 [-0.33960991 -0.28414446 -0.50687349 -0.73959874]
 [-0.37100161 -0.31550227 -0.52510549 -0.69791142]
 [-0.32036863 -0.26935449 -0.49980869 -0.75828977]
 [-0.35156378 -0.27406382 -0.38506382 -0.80809516]
 [-0.32203682 -0.27206347 -0.46594211 -0.77792783]
 [-0.44188255 -0.49918842  0.05526384  0.74330118]
 [-0.48497345 -0.4128367  -0.53633773 -0.55381265]
 [-0.32653686 -0.27489877 -0.53837608 -0.72660549]
 [-0.13565904 -0.15560208 -0.16142574  0.96505251]
 [-0.39592032 -0.34414716 -0.52051006 -0.67370551]
 [-0.29995317 -0.26583564  0.49440767 -0.77131094]
 [ 0.99921613 -0.01178712 -0.02313713 -0.02988088]
 [-0.05851976 -0.06227126  0.14718881  0.98541016]
 [-0.11450309 -0.09571502  0.96103951  0.23266015]
 [-0.32561678 -0.26427014 -0.47845242 -0.77150391]
 [-0.3455986  -0.28728185 -0.44

### Creating the dataframe 

For using the scaler product to find recommendations

In [14]:
artists_list = pd.Series (artists['Artists'], name = 'Artists')

In [15]:
# create df

df = pd.DataFrame (normalized, index = artists_list)
df.head ()

Unnamed: 0_level_0,0,1,2,3
Artists,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Massive Attack,-0.358254,-0.296716,-0.472816,-0.74837
Sublime,-0.341246,-0.281206,-0.467766,-0.76529
Beastie Boys,-0.19609,-0.335787,-0.530107,-0.753513
Neil Young,-0.246398,-0.21802,0.719171,-0.612003
Dead Kennedys,-0.33961,-0.284144,-0.506873,-0.739599


In [16]:
df.loc['Simon & Garfunkel']

0   -0.366912
1   -0.305394
2   -0.287596
3   -0.830300
Name: Simon & Garfunkel, dtype: float64

In [17]:
print (model.components_.shape)

(4, 500)


### Didn't understand the purpose of this...

In [18]:
components_df = pd.DataFrame (model.components_)
components_df.head ()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.01609,0.010212,0.012548,0.0,0.950854,0.002256,0.001653,0.001962,0.003178,0.082053,...,0.0,0.005106,0.001159,0.003276,0.003365,0.015574,0.001697,0.069969,0.027231,0.09538
1,0.000934,0.004024,0.179696,0.0,0.002673,0.0,0.002588,0.007038,0.002585,0.0,...,0.0,0.001267,0.121001,0.002399,0.002142,0.004195,0.000202,0.000108,1.120135,0.009206
2,0.04362,0.060047,0.9559,0.947866,0.009793,0.014799,0.031576,0.042991,0.083369,0.055871,...,0.710065,0.157009,0.023992,0.037107,0.03571,0.027655,0.047172,0.015334,1.616928,0.07025
3,0.208803,0.024575,0.153734,0.0,0.021122,0.107838,0.005354,0.012745,0.128571,1.513034,...,0.0,0.0,0.011545,0.024254,0.031297,0.034936,0.389935,0.072665,0.035115,0.118332


In [19]:
print (components_df.shape)
print (components_df.iloc[2].nlargest ())

(4, 500)
178    87.256272
19     21.276257
179    14.127639
166     9.372709
88      9.367174
Name: 2, dtype: float64


## Final answer

In [20]:
artist_heard = df.loc['Simon & Garfunkel']
similaritites = df.dot (artist_heard)
print (similaritites.nlargest ())

Artists
Simon & Garfunkel    1.000000
Bright Eyes          0.999173
Phish                0.997546
Mirah                0.996388
Franz Ferdinand      0.996165
dtype: float64
