In [2]:
import pandas as pd
from scipy.spatial.distance import cosine
import numpy as np
from scipy.sparse import csr_matrix
from matplotlib import pyplot as plt

# display results to 3 decimal points, not in scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

#download the files with we are going to work
info_user_songs = 'https://static.turi.com/datasets/millionsong/10000.txt'
songs_metadata_file = 'https://static.turi.com/datasets/millionsong/song_data.csv'

### Read the info_user_song and define the columns user_id, song_id and listen_count 

In [3]:
song_df_1 = pd.read_table(info_user_songs,header=None)
song_df_1.columns = ["user_id", "song_id", "listen_count"]
song_df_1.head()

Unnamed: 0,user_id,song_id,listen_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1


In [4]:
len(song_df_1)

2000000

### Read the songs_metadata_file  data and then  merges with song_df_1 to have only one dataset

In [5]:
song_df_2 =  pd.read_csv(songs_metadata_file)
song_df_2.head()


Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0


In [6]:
song_df =  pd.merge(song_df_1, song_df_2.drop_duplicates(['song_id']), on="song_id", how="left")
song_df

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999
5,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODDNQT12A6D4F5F7E,5,Apuesta Por El Rock 'N' Roll,Antología Audiovisual,Héroes del Silencio,2007
6,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODXRTY12AB0180F3B,1,Paper Gangsta,The Fame Monster,Lady GaGa,2008
7,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFGUAY12AB017B0A8,1,Stacked Actors,There Is Nothing Left To Lose,Foo Fighters,1999
8,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOFRQTD12A81C233C0,1,Sehr kosmisch,Musik von Harmonia,Harmonia,0
9,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOHQWYZ12A6D4FA701,1,Heaven's gonna burn your eyes,Hôtel Costes 7 by Stéphane Pompougnac,Thievery Corporation feat. Emiliana Torrini,2002


In [7]:
len(song_df)

2000000

#### song_df contain info about user_id, song_id, _listen_count, title music, release, artist_name and year. The dataset have a length of 200000

In [8]:
if song_df['artist_name'].isnull().sum() > 0:
    song_df = song_df.dropna(axis = 0, subset = ['artist_name'])
    
len(song_df)

2000000

In [9]:
#selec a subset of total dataset
song_df_subset = song_df.loc[0:10000, :]
song_df_subset["song"] = song_df_subset["title"].map(str)+"-"+song_df_subset["artist_name"].map(str)
song_df_subset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,song
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0,The Cove-Jack Johnson
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976,Entre Dos Aguas-Paco De Lucia
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007,Stronger-Kanye West
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005,Constellations-Jack Johnson
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999,Learn To Fly-Foo Fighters


### Calculo de popularidade das musicas
exitem duas formas para realizar operações (sum, count, min, max) sobre os dados de uma ou varias columnas. Neste caso se quer saber a occorrencia de uma musica pelo numero de usuarios e fazer uma relação com o numero de vezes que a musica foi reproduzida.

song_play e song_grouped  fazem a mesma função, somente muda um pouco o codigo da mesma*. Nelas é avaliado quantas vezes o titulo de uma musica "title"(numero de occurrencias) se repete no dataset, oseja basicamente determina quantos usuarios escutaram determinada musica em função de "listen_count". Posteriormente é establecido uma porcentagem para determinar a importancia da musica em função do numero de reproduções de determinada musica. finalmente é mostrado de maior a menor as musicas que foram mais escutadas pelos usuarios.


*Song_play para fazer o calculo usa o metodo count(), e song_grouped usa o metodo agg() que é mais generico e pode ser usado para calcular soma, div, max, min ... etc.


In [10]:

song_play =(song_df_subset.
            groupby("song")["listen_count"].
            count().
            reset_index().
            rename(columns = {'listen_count': 'total_song_plays'})
            [['song', 'total_song_plays']]
            )
grouped_sum = song_play['total_song_plays'].sum()
song_play["porcentage"] = song_play["total_song_plays"].div(grouped_sum)*100

song_play.sort_values(by=['total_song_plays','song'], ascending = False)

Unnamed: 0,song,total_song_plays,porcentage
3660,Sehr kosmisch-Harmonia,45,0.450
5105,You're The One-Dwight Yoakam,32,0.320
4678,Undo-Björk,32,0.320
3655,Secrets-OneRepublic,28,0.280
1071,Dog Days Are Over (Radio Edit)-Florence + The ...,28,0.280
4712,Use Somebody-Kings Of Leon,27,0.270
4378,The Scientist-Coldplay,27,0.270
3476,Revelry-Kings Of Leon,26,0.260
1387,Fireflies-Charttraxx Karaoke,24,0.240
1862,Horn Concerto No. 4 in E flat K495: II. Romanc...,23,0.230


In [11]:
song_grouped = song_df_subset.groupby(['song']).agg({'listen_count': 'count'}).reset_index()
grouped_sum1 = song_grouped['listen_count'].sum()
song_grouped['percentage']  = song_grouped['listen_count'].div(grouped_sum1)*100
song_grouped.sort_values(['listen_count', 'song'], ascending = [0,1])

Unnamed: 0,song,listen_count,percentage
3660,Sehr kosmisch-Harmonia,45,0.450
4678,Undo-Björk,32,0.320
5105,You're The One-Dwight Yoakam,32,0.320
1071,Dog Days Are Over (Radio Edit)-Florence + The ...,28,0.280
3655,Secrets-OneRepublic,28,0.280
4378,The Scientist-Coldplay,27,0.270
4712,Use Somebody-Kings Of Leon,27,0.270
3476,Revelry-Kings Of Leon,26,0.260
1387,Fireflies-Charttraxx Karaoke,24,0.240
1862,Horn Concerto No. 4 in E flat K495: II. Romanc...,23,0.230


In [12]:
song_grouped['listen_count'].describe()

count   5151.000
mean       1.942
std        2.165
min        1.000
25%        1.000
50%        1.000
75%        2.000
max       45.000
Name: listen_count, dtype: float64

### To know the total number of users  and  songs into dataset

In [13]:
users = song_df_subset['user_id'].unique()
len(users) ## return the number of unique users

365

In [14]:


songs = song_df_subset['song'].unique()
len(songs) ## return the number of unique songs

5151

In [15]:
n_users = users.shape[0]
n_songs = songs.shape[0]


In [16]:
users[0]

'b80344d063b5ccb3212f76538f3d9e43d87dca9e'

### separar os dados em dados de treinamento e dados de test

In [17]:
from sklearn import cross_validation as cv

train_data, test_data = cv.train_test_split(song_df, test_size=0.20, random_state=0)



In [18]:
train_data.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
608812,7b8fbe766a49e5d7618452149dfab920621fc4fb,SOJJYDE12AF729FC16,1,Two Is Better Than One,Love Drunk,Boys Like Girls featuring Taylor Swift,2009
623729,d24956cd68ff84b6d0271286ae6866ee1c89ff77,SOPQGWI12A8C135DDB,2,Royal Gregory,LP,Holy Fuck,2007
583106,da7b91b6cab1ca11227ee7720c4d2e03e8c31579,SOCOIIG12A58A7D151,1,Mr Sandman,Original Hits - 50s,The Chordettes,1993
435735,5f633da6ad4845350949c3c76ce6c4ef6f167476,SOQQTBB12AB0182F1D,2,A Days Work (feat. P.O.S),Rádio do Canibal,BK-One,0
1361953,01ad0fabd01af750700a1e80bb0055abcb3edd28,SOVYNVS12AC3DF64AB,2,Rockin' Rollin' Stone,100 Greatest Rockabilly Hits,Andy Starr,2000


### Recomendação por popularidade de item

In [19]:

train_data_grouped = train_data.groupby(['title']).agg({'user_id': 'count'}).reset_index()
train_data_grouped.rename(columns = {'user_id': 'score'},inplace=True)
train_data_sort = train_data_grouped.sort_values(['score', 'title'], ascending = [0,1])
train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=0, method='first')
popularity_recommendations = train_data_sort.head(10)
popularity_recommendations

Unnamed: 0,title,score,Rank
6836,Sehr kosmisch,6630,1.0
8725,Undo,5639,2.0
1964,Dog Days Are Over (Radio Edit),5592,3.0
9496,You're The One,5396,4.0
6498,Revelry,4938,5.0
6825,Secrets,4627,6.0
3437,Horn Concerto No. 4 in E flat K495: II. Romanc...,4368,7.0
2595,Fireflies,3835,8.0
3322,Hey_ Soul Sister,3819,9.0
8494,Tive Sim,3707,10.0


Gerar recomendação para o usuario 3, quando se faz recomendação por populariade de item ,sempre serão recomendadas as mesmas musicas

In [22]:
user = users[3] # para escolher um usuario para recomendar musicas
users_recommendations =  popularity_recommendations
users_recommendations["user_id"] = user
cols = users_recommendations.columns.tolist()
cols = cols[-1:] + cols[:-1] # para organizar as colunas
users_recommendations = users_recommendations[cols]
users_recommendations



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,user_id,title,score,Rank
6836,8937134734f869debcab8f23d77465b4caaa85df,Sehr kosmisch,6630,1.0
8725,8937134734f869debcab8f23d77465b4caaa85df,Undo,5639,2.0
1964,8937134734f869debcab8f23d77465b4caaa85df,Dog Days Are Over (Radio Edit),5592,3.0
9496,8937134734f869debcab8f23d77465b4caaa85df,You're The One,5396,4.0
6498,8937134734f869debcab8f23d77465b4caaa85df,Revelry,4938,5.0
6825,8937134734f869debcab8f23d77465b4caaa85df,Secrets,4627,6.0
3437,8937134734f869debcab8f23d77465b4caaa85df,Horn Concerto No. 4 in E flat K495: II. Romanc...,4368,7.0
2595,8937134734f869debcab8f23d77465b4caaa85df,Fireflies,3835,8.0
3322,8937134734f869debcab8f23d77465b4caaa85df,Hey_ Soul Sister,3819,9.0
8494,8937134734f869debcab8f23d77465b4caaa85df,Tive Sim,3707,10.0


get user-items

In [24]:
user =  users[10]
#Get unique items (songs) corresponding to a given user
#specificamente é procurado dentro do data frame o usuario correspondente ao id usuario 
#e é salvo em user_data toda a informação referente a esse usuario em forma de data frame
user_data =  train_data[train_data['user_id'] == user]
#lista das musicas escutadas pelo usuario user
user_items = list(user_data['title'].unique())
#######################
song = songs[2]

#relacion de usuarios por musica
#item_data = train_data[train_data['song_id'] == song]
#item_users = set(item_data['user_id'].unique())

#lista das musicas do dataset
all_items = list(train_data['title'].unique())

#relacion entre as musicas do usuario e as do dataset

user_songs_users = []
for i in range(0, len(user_items)):
    item_data = train_data[train_data['title'] == user_items[i]]
    item_users = set(item_data['user_id'].unique())
    user_songs_users.append(item_users)
    
print (user_songs_users)
    
#print (test(train_data,user))

[{'2658cef5733cebb90bf3527ca9f0a12c9cb77859', '33f99909c5781f386ba747aff36c7a1c18cc4597', 'cf38611e6a9fe56474b3a8a98129e192b8523430', '86b91018b90505ea428694a57b2f3cb307604a6f', '76e91d79ec608d5b9d82e97c8d45498f8cd0bc97', '0f3240a6925fcce19b380e92993f83b2575093d1', 'ad5795175759343bcc53c516b60cf6ea96ce9e4c', '01c4c95bcfa8bf9208e39896476ad8620d0aa482', '4b3ad4da6c6922e10bfac5d640ab03506f5b1af8', '93c5685c0b7c7cbb94bd1940848a6e9d31dba4c6', '0e6df18e5e2c5a0b1ad7c8779feaa3393c365bec', '227bbd11ff08b085763946f2610a91b39197f0f2', '17aa9f6dbdf753831da8f38c71b66b64373de613', 'a7f329d59cfb9028535a390253ac927becf1b7a6', 'e33014d2d63194be4e6b273fc1cf1f3f1efc0aac', 'dda02b50dd1f204bf56ed229f09797530e3627dc', '19f150cd0e14ea66044ace3ba6af18f72b936bd5', '3bdece236d8a830c99bd637135cedd0c167cf09e', 'f020952fe188a38fe763684b09fed8eefbdfee55', '98e10346af841ae7c30b292bca56391a0bcca07b', 'd602d02db8ebfd7c5f275e3cfcfcf54c214f48ce', 'a054b5734b5497930d9bef1145f3261368419c9f', '74e233b2dcdd0a1e90b5dcbfe1216

### Create Matrix coocurrencia


In [26]:
cooccurence_matrix = np.matrix(np.zeros(shape=(len(user_items), len(all_items))), float)

for i in range(0,len(all_items)):
            #Calculate the total unique listeners (users) of song (item) i
            songs_i_data = train_data[train_data['title'] == all_items[i]]
            users_i = set(songs_i_data['user_id'].unique())
            
            for j in range(0,len(user_items)):       
                    
                #Get unique listeners (users) of song (item) j
                users_j = user_songs_users[j]
                    
                #Calculate intersection of listeners of songs i and j
                users_intersection = users_i.intersection(users_j)
                
                #Calculate cooccurence_matrix[i,j] as Jaccard Index
                if len(users_intersection) != 0:
                    #Calculate union of listeners of songs i and j
                    users_union = users_i.union(users_j)
                    
                    cooccurence_matrix[j,i] = float(len(users_intersection))/float(len(users_union))
                else:
                    cooccurence_matrix[j,i] = 0
                    
print (cooccurence_matrix)


[[0.         0.01238938 0.00342466 ... 0.0019802  0.         0.01195219]
 [0.         0.00462963 0.00434783 ... 0.         0.00645161 0.        ]
 [0.         0.         0.003861   ... 0.         0.         0.        ]
 ...
 [0.         0.01069519 0.         ... 0.         0.00316456 0.00319489]
 [0.00236967 0.0069808  0.00510204 ... 0.00196078 0.         0.        ]
 [0.004329   0.00707547 0.         ... 0.         0.         0.        ]]


In [None]:
len(cooccurence_matrix)   #conta as linhas da matriz (# items)
#len(cooccurence_matrix[0])

cooccurence_matrix[4][0]

### create matrix factorization


teste de matrix

In [None]:
def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T
    print Q
    for step in xrange(steps):
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                    #calcular erro
                    eij = R[i][j] - np.dot(P[i, : ], Q[: , j])
                    for k in xrange(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P, Q)
        ##função de optimização
        e = 0
        for i in xrange(len(R)):
            for j in xrange(len(R[i])):
                if R[i][j] > 0:
                         e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                         for k in xrange(K):
                                 e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        if e < 0.001:
            break
    
    return P, Q.T


                
    


### Fazendo a recomendação baseada em item

In [121]:
#user, cooccurence_matrix, all_songs, user_songs

#Calculate a weighted average of the scores in cooccurence matrix for all user songs.
#shape Return a tuple representing the dimensionality of the DataFrame.
sim_scores_user = cooccurence_matrix.sum(axis=0)/float(cooccurence_matrix.shape[0])
sim_scores_user = np.array(sim_scores_user)[0].tolist()

#Sort the indices of sim_scores_user based upon their value
#Also maintain the corresponding score
sort_index = sorted(((e,i) for i,e in enumerate(list(sim_scores_user))), reverse=True)

#Create a dataframe from the following
columns = ['user_id', 'song', 'score', 'rank']
df = pd.DataFrame(columns = columns)

#Fill the dataframe with top 10 item based recommendations
rank = 1 
for i in range(0,len(sort_index)):
    if ~np.isnan(sort_index[i][0]) and all_items[sort_index[i][1]] not in user_items and rank <= 10:
        df.loc[len(df)]=[user,all_items[sort_index[i][1]],sort_index[i][0],rank]
        rank = rank+1

if df.shape[0] == 0:
    print("The current user has no songs for training the item similarity based recommendation model.")
    print  -1
else:
    print df

                                    user_id                          song  \
0  17aa9f6dbdf753831da8f38c71b66b64373de613                     The Prize   
1  17aa9f6dbdf753831da8f38c71b66b64373de613                 Fast As I Can   
2  17aa9f6dbdf753831da8f38c71b66b64373de613                 Acadian Coast   
3  17aa9f6dbdf753831da8f38c71b66b64373de613         Step Through The Door   
4  17aa9f6dbdf753831da8f38c71b66b64373de613  Ghost At The Foot Of The Bed   
5  17aa9f6dbdf753831da8f38c71b66b64373de613               Love In 2 Parts   
6  17aa9f6dbdf753831da8f38c71b66b64373de613         Big Big Love (Fig .2)   
7  17aa9f6dbdf753831da8f38c71b66b64373de613              From The Station   
8  17aa9f6dbdf753831da8f38c71b66b64373de613       Cassius (album version)   
9  17aa9f6dbdf753831da8f38c71b66b64373de613                La Petite Mort   

   score rank  
0  0.056    1  
1  0.056    2  
2  0.056    3  
3  0.056    4  
4  0.055    5  
5  0.054    6  
6  0.052    7  
7  0.051    8  
8  0.051

[0.0017920363766275528,
 0.005225653056029724,
 0.002014512060043165,
 0.0009525765349964243,
 0.005448682944270727,
 0.004631762144158488,
 0.001636866868009788,
 0.009312489781181,
 0.005388223563085448,
 0.002585104288672399,
 0.005056869346619691,
 0.0065612258553180754,
 0.00490054613457727,
 0.0022635468973195147,
 0.0036463201030192,
 0.006734327739277444,
 0.003379082100348794,
 0.004891069444398177,
 0.0049869327367125385,
 0.002414094949511608,
 0.0030566590678331148,
 0.0018293803390466122,
 0.0039080301993573135,
 0.0038973310908697904,
 0.000787058443812403,
 0.004001289984994562,
 0.001442093574827507,
 0.003628041362410576,
 0.006069592084390228,
 0.003063772000642508,
 0.003555627718025675,
 0.002917898396809012,
 0.0031573849364819975,
 0.002121029758141265,
 0.008437326231247015,
 0.005801754115214239,
 0.0013055513938675166,
 0.004307574370173973,
 0.0028932241545306272,
 0.005161544161798878,
 0.003518865985860866,
 0.05496122233065657,
 0.0015075174856310008,
 0.00

Your first step will be to create the user-item matrix. Since you have both testing and training data you need to create two matrices.

In [28]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_users, n_songs))
for line in train_data.itertuples():
    train_data_matrix[line[1], line[2]] = line[3]

#train_data_matrix = np.zeros((n_users, n_songs))
#for line in test_data.itertuples():
 #   test_data_matrix[line[1]-1, line[2]-1] = line[3]


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [2]:
for row in train_data.itertuples():
     train_data_matrix[row[1]-1, row[2]-1] = row[3]


NameError: name 'train_data' is not defined

#### calcular similaridade de usuario-item

In [59]:
from sklearn.metrics.pairwise import pairwise_distances


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [53]:
df = pd.DataFrame({'col1': [5, 2], 'col2': [10, 20], 'col3':[5,4]},
                      index=['a', 'b'])

matrix =  np.zeros((2,3))
for row in df.itertuples():
    print row[1]-1
    print row[2]-1
   # matrix[row[1], row[2]] = row[3]
   # print matrix

4
9
1
19


### Recommendation by popularity of song

In [6]:
artist_play = (data.
     groupby(by = ['Artist'])['Streams'].
     sum().
     reset_index().
     rename(columns = {'Streams': 'total_artist_plays'})
     [['Artist', 'total_artist_plays']]
    )
artist_play.sort_values(by= ['total_artist_plays'], ascending = False)

Unnamed: 0,Artist,total_artist_plays
1627,Ed Sheeran,8913973976
1562,Drake,4523630992
5631,The Chainsmokers,4292590087
4641,Post Malone,3700404149
3050,Kendrick Lamar,3570665303
3556,Luis Fonsi,3555514919
2523,J Balvin,2494735971
924,Calvin Harris,2397708371
2469,Imagine Dragons,2322921399
1266,DJ Khaled,2236224259


In [8]:
artist_play['total_artist_plays'].describe()

count         6628.000
mean      26932825.632
std      208004490.308
min           1002.000
25%          26133.000
50%         175241.000
75%        2043230.000
max     8913973976.000
Name: total_artist_plays, dtype: float64

In [13]:
artist_play['total_artist_plays'].quantile(np.arange(.9, 1, .01)), 

(0.900    15429870.400
 0.910    18607267.630
 0.920    23570143.160
 0.930    29621448.130
 0.940    37989694.900
 0.950    53180615.950
 0.960    80408860.440
 0.970   130946684.520
 0.980   285558664.920
 0.990   702407456.450
 Name: total_artist_plays, dtype: float64,)

To understand: (100%-99% = 1%) 1% of artist have roughly 700.000.000 or more plays,2% have 200.000.000 or more, 3% have 130.000.000 or more, 4% have 80.000.000 or more.

Since we have so many artists, we'll limit it to the top 4%. This is arbitrary threshold for popularity.

In [22]:
popularity_threshold = 130000000
data_popular_artists = artist_play.query('total_artist_plays >= @popularity_threshold')
data_popular_artists.sort_values(by=['total_artist_plays'], ascending = False)


Unnamed: 0,Artist,total_artist_plays
1627,Ed Sheeran,8913973976
1562,Drake,4523630992
5631,The Chainsmokers,4292590087
4641,Post Malone,3700404149
3050,Kendrick Lamar,3570665303
3556,Luis Fonsi,3555514919
2523,J Balvin,2494735971
924,Calvin Harris,2397708371
2469,Imagine Dragons,2322921399
1266,DJ Khaled,2236224259


Then the most popular artists in the world are:

In [23]:
len(data_popular_artists)

201

In [7]:
track_play = (data.
    groupby(by = ['Track Name'])["Streams"].
    sum().
    reset_index().
    rename(columns = {'Streams':'total_track_plays_on_word'})
    [['Track Name','total_track_plays_on_word']]
         )
track_play['Artist'] = data['Artist']
track_play.sort_values(by=['total_track_plays_on_word'], ascending = False)

Unnamed: 0,Track Name,total_track_plays_on_word,Artist
12905,Shape of You,2993988783,Ed Sheeran
3342,Despacito - Remix,1829621841,The Vamps
3341,Despacito (Featuring Daddy Yankee),1460802540,Rombai
13414,Something Just Like This,1386258295,Ricky Martin
15228,Unforgettable,1366919951,Pitbull
5569,HUMBLE.,1311243745,Drake
16725,rockstar,1260181617,Romeo Santos
6566,I'm the One,1254196301,Mambo Kingz
6897,It Ain't Me (with Selena Gomez),1190339348,Twenty One Pilots
16276,XO TOUR Llif3,1171827725,CNCO


## Picking a threshold for popular artists