In [2]:
import pandas as pd
from scipy.spatial.distance import cosine
import numpy as np
from scipy.sparse import csr_matrix

# display results to 3 decimal points, not in scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [3]:
data = pd.read_csv("../../db/data.csv")
data.head()

Unnamed: 0,Position,Track Name,Artist,Streams,URL,Date,Region
0,1,Reggaetón Lento (Bailemos),CNCO,19272,https://open.spotify.com/track/3AEZUABDXNtecAO...,2017-01-01,ec
1,2,Chantaje,Shakira,19270,https://open.spotify.com/track/6mICuAdrwEjh6Y6...,2017-01-01,ec
2,3,Otra Vez (feat. J Balvin),Zion & Lennox,15761,https://open.spotify.com/track/3QwBODjSEzelZyV...,2017-01-01,ec
3,4,Vente Pa' Ca,Ricky Martin,14954,https://open.spotify.com/track/7DM4BPaS7uofFul...,2017-01-01,ec
4,5,Safari,J Balvin,14269,https://open.spotify.com/track/6rQSrBHf7HlZjtc...,2017-01-01,ec


In [4]:
len(data)

3441197

In [5]:
if data['Artist'].isnull().sum() > 0:
    data = data.dropna(axis = 0, subset = ['Artist'])
    
len(data)

3440540

In [6]:
artist_play = (data.
     groupby(by = ['Artist'])['Streams'].
     sum().
     reset_index().
     rename(columns = {'Streams': 'total_artist_plays'})
     [['Artist', 'total_artist_plays']]
    )
artist_play.sort_values(by= ['total_artist_plays'], ascending = False)

Unnamed: 0,Artist,total_artist_plays
1627,Ed Sheeran,8913973976
1562,Drake,4523630992
5631,The Chainsmokers,4292590087
4641,Post Malone,3700404149
3050,Kendrick Lamar,3570665303
3556,Luis Fonsi,3555514919
2523,J Balvin,2494735971
924,Calvin Harris,2397708371
2469,Imagine Dragons,2322921399
1266,DJ Khaled,2236224259


In [8]:
artist_play['total_artist_plays'].describe()

count         6628.000
mean      26932825.632
std      208004490.308
min           1002.000
25%          26133.000
50%         175241.000
75%        2043230.000
max     8913973976.000
Name: total_artist_plays, dtype: float64

In [13]:
artist_play['total_artist_plays'].quantile(np.arange(.9, 1, .01)), 

(0.900    15429870.400
 0.910    18607267.630
 0.920    23570143.160
 0.930    29621448.130
 0.940    37989694.900
 0.950    53180615.950
 0.960    80408860.440
 0.970   130946684.520
 0.980   285558664.920
 0.990   702407456.450
 Name: total_artist_plays, dtype: float64,)

To understand: (100%-99% = 1%) 1% of artist have roughly 700.000.000 or more plays,2% have 200.000.000 or more, 3% have 130.000.000 or more, 4% have 80.000.000 or more.

Since we have so many artists, we'll limit it to the top 4%. This is arbitrary threshold for popularity.

In [22]:
popularity_threshold = 130000000
data_popular_artists = artist_play.query('total_artist_plays >= @popularity_threshold')
data_popular_artists.sort_values(by=['total_artist_plays'], ascending = False)


Unnamed: 0,Artist,total_artist_plays
1627,Ed Sheeran,8913973976
1562,Drake,4523630992
5631,The Chainsmokers,4292590087
4641,Post Malone,3700404149
3050,Kendrick Lamar,3570665303
3556,Luis Fonsi,3555514919
2523,J Balvin,2494735971
924,Calvin Harris,2397708371
2469,Imagine Dragons,2322921399
1266,DJ Khaled,2236224259


Then the most popular artists in the world are:

In [23]:
len(data_popular_artists)

201

In [7]:
track_play = (data.
    groupby(by = ['Track Name'])["Streams"].
    sum().
    reset_index().
    rename(columns = {'Streams':'total_track_plays_on_word'})
    [['Track Name','total_track_plays_on_word']]
         )
track_play['Artist'] = data['Artist']
track_play.sort_values(by=['total_track_plays_on_word'], ascending = False)

Unnamed: 0,Track Name,total_track_plays_on_word,Artist
12905,Shape of You,2993988783,Ed Sheeran
3342,Despacito - Remix,1829621841,The Vamps
3341,Despacito (Featuring Daddy Yankee),1460802540,Rombai
13414,Something Just Like This,1386258295,Ricky Martin
15228,Unforgettable,1366919951,Pitbull
5569,HUMBLE.,1311243745,Drake
16725,rockstar,1260181617,Romeo Santos
6566,I'm the One,1254196301,Mambo Kingz
6897,It Ain't Me (with Selena Gomez),1190339348,Twenty One Pilots
16276,XO TOUR Llif3,1171827725,CNCO


## Picking a threshold for popular artists