In [2]:
import pandas as pd
from scipy.spatial.distance import cosine
import numpy as np
from scipy.sparse import csr_matrix
from matplotlib import pyplot as plt

# display results to 3 decimal points, not in scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

#download the files with we are going to work
info_user_songs = 'https://static.turi.com/datasets/millionsong/10000.txt'
songs_metadata_file = 'https://static.turi.com/datasets/millionsong/song_data.csv'

### Read the info_user_song and define the columns user_id, song_id and listen_count 

In [3]:
song_df_1 = pd.read_table(info_user_songs,header=None)
song_df_1.columns = ["user_id", "song_id", "listen_count"]
song_df_1.head()

Unnamed: 0,user_id,song_id,listen_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1


In [4]:
len(song_df_1)

2000000

### Read the songs_metadata_file  data and then  merges with song_df_1 to have only one dataset

In [5]:
song_df_2 =  pd.read_csv(songs_metadata_file)
song_df_2.head()


Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0


In [6]:
song_df =  pd.merge(song_df_1, song_df_2.drop_duplicates(['song_id']), on="song_id", how="left")
song_df

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999
5,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODDNQT12A6D4F5F7E,5,Apuesta Por El Rock 'N' Roll,Antología Audiovisual,Héroes del Silencio,2007
6,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODXRTY12AB0180F3B,1,Paper Gangsta,The Fame Monster,Lady GaGa,2008
7,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFGUAY12AB017B0A8,1,Stacked Actors,There Is Nothing Left To Lose,Foo Fighters,1999
8,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFRQTD12A81C233C0,1,Sehr kosmisch,Musik von Harmonia,Harmonia,0
9,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOHQWYZ12A6D4FA701,1,Heaven's gonna burn your eyes,Hôtel Costes 7 by Stéphane Pompougnac,Thievery Corporation feat. Emiliana Torrini,2002


In [7]:
len(song_df)

2000000

#### song_df contain info about user_id, song_id, _listen_count, title music, release, artist_name and year. The dataset have a length of 200000

In [8]:
if song_df['artist_name'].isnull().sum() > 0:
    song_df = song_df.dropna(axis = 0, subset = ['artist_name'])
    
len(song_df)

2000000

In [9]:
#selec a subset of total dataset
song_df_subset = song_df.loc[0:10000, :]
song_df_subset["song"] = song_df_subset["title"].map(str)+"-"+song_df_subset["artist_name"].map(str)
song_df_subset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0,The Cove-Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,Entre Dos Aguas-Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007,Stronger-Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005,Constellations-Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999,Learn To Fly-Foo Fighters


### Calculo de popularidade das musicas
exitem duas formas para realizar operações (sum, count, min, max) sobre os dados de uma ou varias columnas. Neste caso se quer saber a occorrencia de uma musica pelo numero de usuarios e fazer uma relação com o numero de vezes que a musica foi reproduzida.

song_play e song_grouped  fazem a mesma função, somente muda um pouco o codigo da mesma*. Nelas é avaliado quantas vezes o titulo de uma musica "title"(numero de occurrencias) se repete no dataset, oseja basicamente determina quantos usuarios escutaram determinada musica em função de "listen_count". Posteriormente é establecido uma porcentagem para determinar a importancia da musica em função do numero de reproduções de determinada musica. finalmente é mostrado de maior a menor as musicas que foram mais escutadas pelos usuarios.


*Song_play para fazer o calculo usa o metodo count(), e song_grouped usa o metodo agg() que é mais generico e pode ser usado para calcular soma, div, max, min ... etc.


In [10]:

song_play =(song_df_subset.
            groupby("song")["listen_count"].
            count().
            reset_index().
            rename(columns = {'listen_count': 'total_song_plays'})
            [['song', 'total_song_plays']]
            )
grouped_sum = song_play['total_song_plays'].sum()
song_play["porcentage"] = song_play["total_song_plays"].div(grouped_sum)*100

song_play.sort_values(by=['total_song_plays','song'], ascending = False)

Unnamed: 0,song,total_song_plays,porcentage
3660,Sehr kosmisch-Harmonia,45,0.450
5105,You're The One-Dwight Yoakam,32,0.320
4678,Undo-Björk,32,0.320
3655,Secrets-OneRepublic,28,0.280
1071,Dog Days Are Over (Radio Edit)-Florence + The ...,28,0.280
4712,Use Somebody-Kings Of Leon,27,0.270
4378,The Scientist-Coldplay,27,0.270
3476,Revelry-Kings Of Leon,26,0.260
1387,Fireflies-Charttraxx Karaoke,24,0.240
1862,Horn Concerto No. 4 in E flat K495: II. Romanc...,23,0.230


In [14]:
song_grouped = song_df_subset.groupby(['song']).agg({'listen_count': 'count'}).reset_index()
grouped_sum1 = song_grouped['listen_count'].sum()
song_grouped['percentage']  = song_grouped['listen_count'].div(grouped_sum1)*100
song_grouped.sort_values(['listen_count', 'song'], ascending = [0,1])

Unnamed: 0,song,listen_count,percentage
3660,Sehr kosmisch-Harmonia,45,0.450
4678,Undo-Björk,32,0.320
5105,You're The One-Dwight Yoakam,32,0.320
1071,Dog Days Are Over (Radio Edit)-Florence + The ...,28,0.280
3655,Secrets-OneRepublic,28,0.280
4378,The Scientist-Coldplay,27,0.270
4712,Use Somebody-Kings Of Leon,27,0.270
3476,Revelry-Kings Of Leon,26,0.260
1387,Fireflies-Charttraxx Karaoke,24,0.240
1862,Horn Concerto No. 4 in E flat K495: II. Romanc...,23,0.230


In [16]:
song_grouped['listen_count'].describe()

count   5151.000
mean       1.942
std        2.165
min        1.000
25%        1.000
50%        1.000
75%        2.000
max       45.000
Name: listen_count, dtype: float64

### To know the total number of users  and  songs into dataset

In [17]:
users = song_df_subset['user_id'].unique()
len(users) ## return the number of unique users

365

In [18]:


songs = song_df_subset['song'].unique()
len(songs) ## return the number of unique songs

5151

In [19]:
n_users = users.shape[0]
n_songs = songs.shape[0]


In [36]:
users[0]

'b80344d063b5ccb3212f76538f3d9e43d87dca9e'

### separar os dados em dados de treinamento e dados de test

In [20]:
from sklearn import cross_validation as cv

train_data, test_data = cv.train_test_split(song_df, test_size=0.20, random_state=0)



In [26]:
train_data.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
608812,7b8fbe766a49e5d7618452149dfab920621fc4fb,SOJJYDE12AF729FC16,1,Two Is Better Than One,Love Drunk,Boys Like Girls featuring Taylor Swift,2009
623729,d24956cd68ff84b6d0271286ae6866ee1c89ff77,SOPQGWI12A8C135DDB,2,Royal Gregory,LP,Holy Fuck,2007
583106,da7b91b6cab1ca11227ee7720c4d2e03e8c31579,SOCOIIG12A58A7D151,1,Mr Sandman,Original Hits - 50s,The Chordettes,1993
435735,5f633da6ad4845350949c3c76ce6c4ef6f167476,SOQQTBB12AB0182F1D,2,A Days Work (feat. P.O.S),Rádio do Canibal,BK-One,0
1361953,01ad0fabd01af750700a1e80bb0055abcb3edd28,SOVYNVS12AC3DF64AB,2,Rockin' Rollin' Stone,100 Greatest Rockabilly Hits,Andy Starr,2000


In [32]:

train_data_grouped = train_data.groupby(['song_id']).agg({'user_id': 'count'}).reset_index()
train_data_grouped.rename(columns = {'user_id': 'score'},inplace=True)
train_data_sort = train_data_grouped.sort_values(['score', 'song_id'], ascending = [0,1])
train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=0, method='first')
popularity_recommendations = train_data_sort.head(10)
popularity_recommendations

Unnamed: 0,song_id,score,Rank
2220,SOFRQTD12A81C233C0,6630,1.0
317,SOAUWYT12A81C206F1,5639,2.0
352,SOAXGDH12A8C13F8A1,5592,3.0
614,SOBONKR12A58A7A7E0,5143,4.0
7416,SOSXLTC12AF72A7F54,4938,5.0
5531,SONYKOW12AB01849C9,4627,6.0
1664,SOEGIYH12A6D4FC0E3,4368,7.0
4448,SOLFXKT12AB017E3E0,3835,8.0
1334,SODJWHY12A8C142CCE,3819,9.0
2115,SOFLJQZ12A6D4FADA6,3707,10.0


In [45]:
user_id = users[0] # para escolher um usuario para recomendar musicas
users_recommendations =  popularity_recommendations
users_recommendations["user_id"] = user_id
cols = users_recommendations.columns.tolist()
cols = cols[-1:] + cols[:-1] # para organizar as colunas
users_recommendations = users_recommendations[cols]
users_recommendations



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,user_id,song_id,score,Rank
2220,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFRQTD12A81C233C0,6630,1.0
317,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAUWYT12A81C206F1,5639,2.0
352,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAXGDH12A8C13F8A1,5592,3.0
614,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBONKR12A58A7A7E0,5143,4.0
7416,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOSXLTC12AF72A7F54,4938,5.0
5531,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SONYKOW12AB01849C9,4627,6.0
1664,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOEGIYH12A6D4FC0E3,4368,7.0
4448,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOLFXKT12AB017E3E0,3835,8.0
1334,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODJWHY12A8C142CCE,3819,9.0
2115,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFLJQZ12A6D4FADA6,3707,10.0


Your first step will be to create the user-item matrix. Since you have both testing and training data you need to create two matrices.

In [28]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_songs))
for line in train_data.itertuples():
    train_data_matrix[line[1], line[2]] = line[3]

#train_data_matrix = np.zeros((n_users, n_songs))
#for line in test_data.itertuples():
 #   test_data_matrix[line[1]-1, line[2]-1] = line[3]


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [2]:
for row in train_data.itertuples():
     train_data_matrix[row[1]-1, row[2]-1] = row[3]


NameError: name 'train_data' is not defined

#### calcular similaridade de usuario-item

In [59]:
from sklearn.metrics.pairwise import pairwise_distances


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [53]:
df = pd.DataFrame({'col1': [5, 2], 'col2': [10, 20], 'col3':[5,4]},
                      index=['a', 'b'])

matrix =  np.zeros((2,3))
for row in df.itertuples():
    print row[1]-1
    print row[2]-1
   # matrix[row[1], row[2]] = row[3]
   # print matrix

4
9
1
19


### Recommendation by popularity of song

In [6]:
artist_play = (data.
     groupby(by = ['Artist'])['Streams'].
     sum().
     reset_index().
     rename(columns = {'Streams': 'total_artist_plays'})
     [['Artist', 'total_artist_plays']]
    )
artist_play.sort_values(by= ['total_artist_plays'], ascending = False)

Unnamed: 0,Artist,total_artist_plays
1627,Ed Sheeran,8913973976
1562,Drake,4523630992
5631,The Chainsmokers,4292590087
4641,Post Malone,3700404149
3050,Kendrick Lamar,3570665303
3556,Luis Fonsi,3555514919
2523,J Balvin,2494735971
924,Calvin Harris,2397708371
2469,Imagine Dragons,2322921399
1266,DJ Khaled,2236224259


In [8]:
artist_play['total_artist_plays'].describe()

count         6628.000
mean      26932825.632
std      208004490.308
min           1002.000
25%          26133.000
50%         175241.000
75%        2043230.000
max     8913973976.000
Name: total_artist_plays, dtype: float64

In [13]:
artist_play['total_artist_plays'].quantile(np.arange(.9, 1, .01)), 

(0.900    15429870.400
 0.910    18607267.630
 0.920    23570143.160
 0.930    29621448.130
 0.940    37989694.900
 0.950    53180615.950
 0.960    80408860.440
 0.970   130946684.520
 0.980   285558664.920
 0.990   702407456.450
 Name: total_artist_plays, dtype: float64,)

To understand: (100%-99% = 1%) 1% of artist have roughly 700.000.000 or more plays,2% have 200.000.000 or more, 3% have 130.000.000 or more, 4% have 80.000.000 or more.

Since we have so many artists, we'll limit it to the top 4%. This is arbitrary threshold for popularity.

In [22]:
popularity_threshold = 130000000
data_popular_artists = artist_play.query('total_artist_plays >= @popularity_threshold')
data_popular_artists.sort_values(by=['total_artist_plays'], ascending = False)


Unnamed: 0,Artist,total_artist_plays
1627,Ed Sheeran,8913973976
1562,Drake,4523630992
5631,The Chainsmokers,4292590087
4641,Post Malone,3700404149
3050,Kendrick Lamar,3570665303
3556,Luis Fonsi,3555514919
2523,J Balvin,2494735971
924,Calvin Harris,2397708371
2469,Imagine Dragons,2322921399
1266,DJ Khaled,2236224259


Then the most popular artists in the world are:

In [23]:
len(data_popular_artists)

201

In [7]:
track_play = (data.
    groupby(by = ['Track Name'])["Streams"].
    sum().
    reset_index().
    rename(columns = {'Streams':'total_track_plays_on_word'})
    [['Track Name','total_track_plays_on_word']]
         )
track_play['Artist'] = data['Artist']
track_play.sort_values(by=['total_track_plays_on_word'], ascending = False)

Unnamed: 0,Track Name,total_track_plays_on_word,Artist
12905,Shape of You,2993988783,Ed Sheeran
3342,Despacito - Remix,1829621841,The Vamps
3341,Despacito (Featuring Daddy Yankee),1460802540,Rombai
13414,Something Just Like This,1386258295,Ricky Martin
15228,Unforgettable,1366919951,Pitbull
5569,HUMBLE.,1311243745,Drake
16725,rockstar,1260181617,Romeo Santos
6566,I'm the One,1254196301,Mambo Kingz
6897,It Ain't Me (with Selena Gomez),1190339348,Twenty One Pilots
16276,XO TOUR Llif3,1171827725,CNCO


## Picking a threshold for popular artists