# Pré-processamento dos dados

Para que a visualização multidimensional fosse possível, foi necessário pré-processar os dados, extraindo, para cada artista, os valores médios de alguns atributos de interesse, a fim de que o usuário possa comparar os pontos fortes de cada artista. A análise foi concentrada nos 15 artistas com mais músicas no conjunto de dados.

In [28]:
import pandas as pd
import numpy  as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler

In [29]:
# Load the original dataset straight from the project's GitHub repository.
# The file is decoded as latin-1, as requested through the encoding argument.
dados = pd.read_csv(
    'https://raw.githubusercontent.com/EduardoDuX/DataViz/main/spotify-2023.csv',
    encoding='latin-1',
)
# Preview the first rows
dados.head()

Unnamed: 0,track_name,artist(s)_name,artist_count,released_year,released_month,released_day,in_spotify_playlists,in_spotify_charts,streams,in_apple_playlists,...,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,Seven (feat. Latto) (Explicit Ver.),"Latto, Jung Kook",2,2023,7,14,553,147,141381703,43,...,125,B,Major,80,89,83,31,0,8,4
1,LALA,Myke Towers,1,2023,3,23,1474,48,133716286,48,...,92,C#,Major,71,61,74,7,0,10,4
2,vampire,Olivia Rodrigo,1,2023,6,30,1397,113,140003974,94,...,138,F,Major,51,32,53,17,0,31,6
3,Cruel Summer,Taylor Swift,1,2019,8,23,7858,100,800840817,116,...,170,A,Major,55,58,72,11,0,11,15
4,WHERE SHE GOES,Bad Bunny,1,2023,5,18,3133,50,303236322,84,...,144,A,Minor,65,23,80,14,63,11,6


In [30]:
# Discard the row labelled 574, which holds invalid data.
# NOTE(review): presumably a malformed value that would break the later
# float conversion -- confirm against the raw file.
dados = dados.drop(index=574)

In [31]:
# Extract the main artist of each track.
# Tracks credited to more than one artist (artist_count > 1) keep only the
# first comma-separated name; every other track keeps the full
# 'artist(s)_name' value unchanged, so a single artist whose name itself
# contains a comma is not truncated.
# Done with vectorized string operations instead of a row-by-row loop.
primeiro_creditado = dados['artist(s)_name'].str.split(',').str[0]
dados['main_artist'] = dados['artist(s)_name'].mask(
    dados['artist_count'] > 1,
    primeiro_creditado,
)

# Show the result
dados[['artist(s)_name', 'main_artist']]

Unnamed: 0,artist(s)_name,main_artist
0,"Latto, Jung Kook",Latto
1,Myke Towers,Myke Towers
2,Olivia Rodrigo,Olivia Rodrigo
3,Taylor Swift,Taylor Swift
4,Bad Bunny,Bad Bunny
...,...,...
948,Selena Gomez,Selena Gomez
949,Taylor Swift,Taylor Swift
950,"Feid, Paulo Londra",Feid
951,"Feid, Sech, Jhayco",Feid


In [32]:
# The 15 main artists with the most tracks in the dataset, most frequent first
main_artists = (
    dados['main_artist']
    .value_counts()
    .head(15)
    .index
    .tolist()
)
main_artists

['Taylor Swift',
 'The Weeknd',
 'Bad Bunny',
 'SZA',
 'Kendrick Lamar',
 'Drake',
 'Harry Styles',
 'Feid',
 'Ed Sheeran',
 'Morgan Wallen',
 'Eminem',
 'BTS',
 'Karol G',
 'Labrinth',
 'Doja Cat']

In [33]:
# Attributes of interest
variaveis = ['streams', 'in_apple_playlists', 'in_apple_charts',
             'in_deezer_playlists', 'in_deezer_charts', 'in_shazam_charts', 'bpm',
             'danceability_%', 'valence_%', 'energy_%',
             'acousticness_%', 'instrumentalness_%', 'liveness_%', 'speechiness_%']

# Attributes of interest that are stored as strings
variaveis_string = ['streams', 'in_deezer_playlists', 'in_shazam_charts']

# Strip the comma separators so the values can be parsed as numbers.
# regex=False pins a literal replacement: the default of `regex` differs
# between pandas versions, so leaving it implicit is version-dependent.
for variavel in variaveis_string:
    dados[variavel] = dados[variavel].str.replace(',', '', regex=False)

# Convert every attribute of interest to float
dados[variaveis] = dados[variaveis].astype(float)

In [34]:
# Compute the mean of each attribute of interest for every main artist.
# 'main_artist' is added to `variaveis` so it can be selected as the grouping
# key. The membership check keeps this cell safe to re-run: a second append
# would put the key in the selection twice.
if 'main_artist' not in variaveis:
    variaveis.append('main_artist')
medias = dados[variaveis].groupby('main_artist').mean()
medias

Unnamed: 0_level_0,streams,in_apple_playlists,in_apple_charts,in_deezer_playlists,in_deezer_charts,in_shazam_charts,bpm,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
main_artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
(G)I-DLE,1.334917e+08,12.0,121.0,8.0,0.0,79.5,133.0,77.5,67.0,87.0,3.5,0.0,37.5,11.5
21 Savage,6.068094e+07,3.0,0.0,5.0,0.0,0.0,148.0,68.0,29.0,73.0,0.0,0.0,7.0,7.0
24kgoldn,1.699402e+09,237.0,27.0,636.0,0.0,,91.0,70.0,76.0,72.0,22.0,0.0,27.0,4.0
50 Cent,1.202723e+09,235.0,106.0,5221.0,1.0,35.0,90.0,90.0,79.0,71.0,26.0,0.0,7.0,37.0
A$AP Rocky,9.418647e+07,17.0,60.0,28.0,1.0,44.0,90.0,60.0,13.0,53.0,4.0,0.0,21.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
girl in red,7.230439e+08,31.0,21.0,15.0,0.0,4.0,130.0,57.0,24.0,37.0,11.0,18.0,16.0,3.0
j-hope,1.361978e+08,14.0,67.0,9.5,0.0,4.5,95.5,73.0,56.0,77.0,27.0,0.0,10.0,8.5
sped up 8282,1.037625e+08,0.0,0.0,6.0,0.0,0.0,144.0,74.0,75.0,73.0,42.0,0.0,9.0,4.0
sped up nightcore,2.070333e+08,0.0,0.0,21.0,0.0,0.0,130.0,69.0,36.0,90.0,1.0,10.0,15.0,4.0


In [35]:
# Keep only the rows of the most frequent artists, in the order of `main_artists`
medias = medias.loc[main_artists, :]
medias

Unnamed: 0_level_0,streams,in_apple_playlists,in_apple_charts,in_deezer_playlists,in_deezer_charts,in_shazam_charts,bpm,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
main_artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Taylor Swift,400645500.0,51.444444,54.0,87.416667,1.611111,50.611111,124.111111,60.0,34.305556,56.25,28.416667,0.638889,16.5,7.25
The Weeknd,632839600.0,76.852941,71.029412,348.0,2.029412,73.1,118.794118,59.617647,42.882353,63.647059,21.147059,1.058824,20.617647,8.323529
Bad Bunny,590914800.0,41.692308,61.5,68.692308,3.615385,39.615385,119.307692,74.153846,48.576923,67.038462,24.0,2.423077,19.769231,9.576923
SZA,282304800.0,47.73913,55.26087,47.304348,1.173913,21.913043,116.956522,60.0,47.782609,53.826087,48.173913,1.826087,19.434783,9.608696
Kendrick Lamar,241561000.0,30.565217,12.869565,111.043478,0.0,31.478261,122.608696,66.434783,48.0,58.304348,41.086957,0.043478,16.304348,23.565217
Drake,423317400.0,72.105263,50.736842,225.684211,0.052632,22.0,133.578947,73.684211,30.526316,54.684211,5.526316,0.105263,23.105263,19.947368
Harry Styles,682861500.0,102.411765,32.058824,217.352941,4.470588,18.8,128.529412,61.352941,54.0,58.882353,42.823529,1.588235,14.294118,5.352941
Feid,245231800.0,28.466667,40.866667,29.533333,3.8,33.4,120.8,75.0,58.333333,67.6,11.533333,0.0,17.133333,9.466667
Ed Sheeran,1119975000.0,128.615385,46.461538,1333.923077,3.307692,81.454545,115.0,71.076923,52.538462,62.615385,33.692308,0.0,16.307692,5.230769
Morgan Wallen,143757500.0,16.25,47.25,2.583333,0.083333,49.166667,139.833333,58.166667,59.5,77.166667,20.25,0.0,22.5,3.25


In [36]:
# Rescale the values for the visualization: StandardScaler standardizes each
# attribute column across the selected artists.
# Every column of `medias` is an attribute (the artist name is the index), so
# the frame's own columns are scaled directly rather than relying on the
# current contents of the `variaveis` list.
scaler = StandardScaler()
medias = pd.DataFrame(
    scaler.fit_transform(medias),
    index=medias.index,
    columns=medias.columns,
)
medias

Unnamed: 0_level_0,streams,in_apple_playlists,in_apple_charts,in_deezer_playlists,in_deezer_charts,in_shazam_charts,bpm,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
main_artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Taylor Swift,-0.430751,-0.343091,0.051084,-0.411478,-0.249927,0.497236,0.271113,-0.715591,-1.449052,-0.869132,0.126111,-0.24883,-0.670944,-0.704087
The Weeknd,0.366634,0.499946,0.88791,-0.040041,-0.046468,1.374943,-0.336412,-0.752622,-0.570047,0.111006,-0.381257,-0.127403,0.522474,-0.523754
Bad Bunny,0.222659,-0.66666,0.419634,-0.438168,0.724939,0.06809,-0.27773,0.655179,0.01357,0.560379,-0.182142,0.26708,0.276577,-0.313206
SZA,-0.837148,-0.466031,0.113043,-0.468655,-0.462578,-0.622804,-0.546377,-0.715591,-0.067837,-1.19031,1.505028,0.094456,0.179644,-0.307869
Kendrick Lamar,-0.977068,-1.03585,-1.970069,-0.3778,-1.033562,-0.249489,0.099445,-0.092396,-0.045557,-0.596924,1.010408,-0.420997,-0.72765,2.036574
Drake,-0.352892,0.342422,-0.109268,-0.214391,-1.007962,-0.61941,1.352916,0.609696,-1.836373,-1.076605,-1.471475,-0.403132,1.243459,1.428841
Harry Styles,0.538416,1.347971,-1.027108,-0.226266,1.140905,-0.744301,0.775952,-0.584562,0.569362,-0.520336,1.131609,0.02568,-1.310275,-1.022759
Feid,-0.964462,-1.105478,-0.594289,-0.493986,0.814735,-0.174486,-0.107218,0.737127,1.013469,0.634785,-1.052227,-0.433569,-0.487385,-0.331727
Ed Sheeran,2.039523,2.21739,-0.319357,1.365302,0.57528,1.701007,-0.769931,0.357185,0.419574,-0.025695,0.494313,-0.433569,-0.72668,-1.043282
Morgan Wallen,-1.312938,-1.510819,-0.280612,-0.5324,-0.993029,0.440862,2.067548,-0.893146,1.133037,1.902404,-0.443865,-0.433569,1.068036,-1.376015


In [37]:
# Radar chart example: plots the row at position 5 of `medias`, using the
# attribute column names as the angular axis
fig = px.line_polar(
    data_frame=medias,
    r=medias.iloc[5],
    theta=medias.columns,
    line_close=True,
)
# Fill the area enclosed by the trace
fig.update_traces(fill='toself')
fig.show()

In [38]:
# Persist the dataframe as a .csv file; index=True also writes the index
medias.to_csv(path_or_buf='media-artistas-principais.csv', index=True)