In [1]:
import pandas as pd
from scipy.spatial.distance import cosine
import numpy as np
from scipy.sparse import csr_matrix

# display results to 3 decimal points, not in scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)

#download the files with we are going to work
info_user_songs = 'https://static.turi.com/datasets/millionsong/10000.txt'
songs_metadata_file = 'https://static.turi.com/datasets/millionsong/song_data.csv'

### Read the info_user_song and define the columns user_id, song_id and listen_count 

In [9]:
song_df_1 = pd.read_table(info_user_songs,header=None)
song_df_1.columns = ["user_id", "song_id", "listen_count"]
song_df_1.head()

Unnamed: 0,user_id,song_id,listen_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1


In [10]:
len(song_df_1)

2000000

### Read the songs_metadata_file  data and then  merges with song_df_1 to have only one dataset

In [13]:
song_df_2 =  pd.read_csv(songs_metadata_file)
song_df_2.head()


Unnamed: 0,song_id,title,release,artist_name,year
0,SOQMMHC12AB0180CB8,Silent Night,Monster Ballads X-Mas,Faster Pussy cat,2003
1,SOVFVAK12A8C1350D9,Tanssi vaan,Karkuteillä,Karkkiautomaatti,1995
2,SOGTUKN12AB017F4F1,No One Could Ever,Butter,Hudson Mohawke,2006
3,SOBNYVR12A8C13558C,Si Vos Querés,De Culo,Yerba Brava,2003
4,SOHSBXH12A8C13B0DF,Tangle Of Aspens,Rene Ablaze Presents Winter Sessions,Der Mystic,0


In [14]:
song_df =  pd.merge(song_df_1, song_df_2.drop_duplicates(['song_id']), on="song_id", how="left")
song_df.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999


In [15]:
len(song_df)

2000000

#### song_df contain info about user_id, song_id, _listen_count, title music, release, artist_name and year. The dataset have a length of 200000

In [17]:
if song_df['artist_name'].isnull().sum() > 0:
    song_df = song_df.dropna(axis = 0, subset = ['artist_name'])
    
len(song_df)

2000000

In [22]:
song_play =(song_df.
            groupby(by=["title"])["listen_count"].
            sum().
            reset_index().
            rename(columns = {'listen_count': 'total_song_plays', 'title': 'song'})
            [['song', 'total_song_plays']]
            )
grouped_sum = song_play['total_song_plays'].sum()
song_play["porcentage"] = song_play["total_song_play"]

song_play['artist'] = song_df["artist_name"]

song_play.sort_values(by=['total_song_plays'], ascending =  False)

Unnamed: 0,song,total_song_plays,artist
9496,You're The One,54915,Train
8725,Undo,49253,Duffy
6498,Revelry,41418,Sia
3437,Horn Concerto No. 4 in E flat K495: II. Romanc...,31153,Tiësto
6836,Sehr kosmisch,31036,The Verve
1964,Dog Days Are Over (Radio Edit),26663,Band Of Horses
6825,Secrets,22100,Ron Carter
1264,Canada,21019,Kid Cudi / MGMT / Ratatat
4025,Invalid,19645,Amy Winehouse
243,Ain't Misbehavin,18309,The New Pornographers


In [6]:
artist_play = (data.
     groupby(by = ['Artist'])['Streams'].
     sum().
     reset_index().
     rename(columns = {'Streams': 'total_artist_plays'})
     [['Artist', 'total_artist_plays']]
    )
artist_play.sort_values(by= ['total_artist_plays'], ascending = False)

Unnamed: 0,Artist,total_artist_plays
1627,Ed Sheeran,8913973976
1562,Drake,4523630992
5631,The Chainsmokers,4292590087
4641,Post Malone,3700404149
3050,Kendrick Lamar,3570665303
3556,Luis Fonsi,3555514919
2523,J Balvin,2494735971
924,Calvin Harris,2397708371
2469,Imagine Dragons,2322921399
1266,DJ Khaled,2236224259


In [8]:
artist_play['total_artist_plays'].describe()

count         6628.000
mean      26932825.632
std      208004490.308
min           1002.000
25%          26133.000
50%         175241.000
75%        2043230.000
max     8913973976.000
Name: total_artist_plays, dtype: float64

In [13]:
artist_play['total_artist_plays'].quantile(np.arange(.9, 1, .01)), 

(0.900    15429870.400
 0.910    18607267.630
 0.920    23570143.160
 0.930    29621448.130
 0.940    37989694.900
 0.950    53180615.950
 0.960    80408860.440
 0.970   130946684.520
 0.980   285558664.920
 0.990   702407456.450
 Name: total_artist_plays, dtype: float64,)

To understand: (100%-99% = 1%) 1% of artist have roughly 700.000.000 or more plays,2% have 200.000.000 or more, 3% have 130.000.000 or more, 4% have 80.000.000 or more.

Since we have so many artists, we'll limit it to the top 4%. This is arbitrary threshold for popularity.

In [22]:
popularity_threshold = 130000000
data_popular_artists = artist_play.query('total_artist_plays >= @popularity_threshold')
data_popular_artists.sort_values(by=['total_artist_plays'], ascending = False)


Unnamed: 0,Artist,total_artist_plays
1627,Ed Sheeran,8913973976
1562,Drake,4523630992
5631,The Chainsmokers,4292590087
4641,Post Malone,3700404149
3050,Kendrick Lamar,3570665303
3556,Luis Fonsi,3555514919
2523,J Balvin,2494735971
924,Calvin Harris,2397708371
2469,Imagine Dragons,2322921399
1266,DJ Khaled,2236224259


Then the most popular artists in the world are:

In [23]:
len(data_popular_artists)

201

In [7]:
track_play = (data.
    groupby(by = ['Track Name'])["Streams"].
    sum().
    reset_index().
    rename(columns = {'Streams':'total_track_plays_on_word'})
    [['Track Name','total_track_plays_on_word']]
         )
track_play['Artist'] = data['Artist']
track_play.sort_values(by=['total_track_plays_on_word'], ascending = False)

Unnamed: 0,Track Name,total_track_plays_on_word,Artist
12905,Shape of You,2993988783,Ed Sheeran
3342,Despacito - Remix,1829621841,The Vamps
3341,Despacito (Featuring Daddy Yankee),1460802540,Rombai
13414,Something Just Like This,1386258295,Ricky Martin
15228,Unforgettable,1366919951,Pitbull
5569,HUMBLE.,1311243745,Drake
16725,rockstar,1260181617,Romeo Santos
6566,I'm the One,1254196301,Mambo Kingz
6897,It Ain't Me (with Selena Gomez),1190339348,Twenty One Pilots
16276,XO TOUR Llif3,1171827725,CNCO


## Picking a threshold for popular artists