In [1]:
import pandas as pd
import re

## Analisando colunas e dados

In [3]:
dados = pd.read_csv('spotify_top_songs_audio_features.csv', sep=',')

dados.head()

Unnamed: 0,id,artist_names,track_name,source,key,mode,time_signature,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,loudness,tempo,duration_ms,weeks_on_chart,streams
0,000xQL6tZNLJzIrtIgxqSl,"ZAYN, PARTYNEXTDOOR",Still Got Time (feat. PARTYNEXTDOOR),RCA Records Label,G,Major,4 beats,0.748,0.627,0.0639,0.131,0.0,0.0852,0.524,-6.029,120.963,188491,17,107527761
1,003eoIwxETJujVWmNFMoZy,Alessia Cara,Growing Pains,Def Jam Recordings,C#/Db,Minor,4 beats,0.353,0.755,0.733,0.0822,0.0,0.39,0.437,-6.276,191.153,193680,2,9944865
2,003vvx7Niy0yvhvHt4a68B,The Killers,Mr. Brightside,Island Records,C#/Db,Major,4 beats,0.352,0.911,0.0747,0.00121,0.0,0.0995,0.236,-5.23,148.033,222973,125,512388123
3,00B7TZ0Xawar6NZ00JFomN,"Cardi B, Chance the Rapper",Best Life (feat. Chance The Rapper),Atlantic/KSR,A,Major,4 beats,0.62,0.625,0.553,0.287,0.0,0.314,0.665,-7.438,167.911,284856,2,11985346
4,00Blm7zeNqgYLPtW6zg8cj,"Post Malone, The Weeknd",One Right Now (with The Weeknd),Republic Records,C#/Db,Major,4 beats,0.687,0.781,0.053,0.0361,0.0,0.0755,0.688,-4.806,97.014,193507,30,301860377


In [4]:
dados.shape

(6513, 19)

In [5]:
dados.dtypes

id                   object
artist_names         object
track_name           object
source               object
key                  object
mode                 object
time_signature       object
danceability        float64
energy              float64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
loudness            float64
tempo               float64
duration_ms           int64
weeks_on_chart        int64
streams               int64
dtype: object

### Quantidade de aparições de um artista no top200, incluindo participações

In [24]:
# Divide a string em artistas individuais e conta as ocorrências
artistas = dados['artist_names'].str.split(', ').explode().value_counts().reset_index()

# Renomeia as colunas
artistas.columns = ['Artista', 'QuantidadeAparicoes']

# Imprime o DataFrame resultante
artistas.head(50)

Unnamed: 0,Artista,QuantidadeAparicoes
0,Drake,229
1,Taylor Swift,203
2,Bad Bunny,159
3,Travis Scott,127
4,Post Malone,111
5,Juice WRLD,110
6,The Weeknd,104
7,Ariana Grande,101
8,21 Savage,97
9,BTS,95


### Analisando se foi utilizado algum outro simbolo para dividir dois artitas além da virgula

In [8]:
padrao = r'[^a-zA-Z0-9\s]'

# Lista para armazenar os artistas que atendem ao critério
artistas_analisar = []

# Iterar sobre as linhas do DataFrame e adicionar os artistas que atendem ao critério à lista
for index, row in artistas.iterrows():
    artista = row['Artista']
    # Verificar se o nome do artista contém caracteres que não são letras nem números
    if re.search(padrao, artista):
        artistas_analisar.append(artista)

# Criar um novo DataFrame com os artistas que atendem ao critério
artistas_analisar = pd.DataFrame({'Artista': artistas_analisar})

artistas_analisar.head(50)

Unnamed: 0,Artista
0,J. Cole
1,Ty Dolla $ign
2,Beyoncé
3,A$AP Rocky
4,ROSALÍA
5,Anne-Marie
6,Lenny Tavárez
7,Anderson .Paak
8,G-Eazy
9,¥$


### Analisando musicas que mais apareceram

In [31]:
musicas = dados[['track_name', 'artist_names']].explode('artist_names').groupby(['track_name', 'artist_names']).size().reset_index(name='Quantidade')

# Renomeia as colunas
musicas.columns = ['musica', 'Artista', 'QuantidadeVezes']
musicas = musicas.sort_values(by='QuantidadeVezes', ascending=False)

# Imprime o DataFrame resultante
musicas.head(50)

Unnamed: 0,musica,Artista,QuantidadeVezes
2324,Jingle Bell Rock,Bobby Helms,6
4155,Sleigh Ride,The Ronettes,6
3052,My Only Wish (This Year),Britney Spears,5
557,Believer,Imagine Dragons,5
4663,Thunder,Imagine Dragons,5
4370,Sunflower - Spider-Man: Into the Spider-Verse,"Post Malone, Swae Lee",5
5035,Whatever It Takes,Imagine Dragons,5
5438,rockstar (feat. 21 Savage),"Post Malone, 21 Savage",4
251,All The Stars (with SZA),"Kendrick Lamar, SZA",4
2893,Mean It,"Lauv, LANY",4


### Aqui temos um problema quanto as musicas

In [32]:
ocorrencias = dados[dados['track_name'].str.contains('Jingle Bell Rock')]
ocorrencias


Unnamed: 0,id,artist_names,track_name,source,key,mode,time_signature,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,loudness,tempo,duration_ms,weeks_on_chart,streams
1331,1cImYkHOQMFE7AZC9aOk4y,Bobby Helms,Jingle Bell Rock,Universal Digital Enterprises,D,Major,4 beats,0.741,0.34,0.0446,0.614,0.0,0.0955,0.838,-13.421,119.606,129893,6,23307145
3108,3hBXvHLlTHvnbwrPbeoyAj,Bobby Helms,Jingle Bell Rock,Geffen,D,Major,4 beats,0.754,0.424,0.0363,0.643,0.0,0.0652,0.806,-8.463,119.705,130973,17,73994656
3585,4IdXngKo4g5exqZ0fQTecu,Bobby Helms,Jingle Bell Rock,Universal Digital Enterprises,D,Major,4 beats,0.741,0.326,0.0395,0.918,0.0,0.0658,0.751,-12.045,120.992,130267,4,15205861
5382,6TkXViJUO3SA6FMcrNwlD9,Bobby Helms,Jingle Bell Rock,Universal Digital Enterprises,D,Major,4 beats,0.729,0.368,0.0435,0.968,0.0,0.0811,0.738,-10.765,119.391,133427,43,117756145
5661,6p1GHnHSJSma1orNgP7NoE,Bobby Helms,Jingle Bell Rock,Big Buzz Productions,D,Major,4 beats,0.725,0.368,0.0411,0.967,0.0,0.074,0.7,-10.657,119.438,133407,40,77373767
5669,6pVW5LRWgeLaHudxauOTJU,Daryl Hall & John Oates,Jingle Bell Rock - Daryl's Version,Legacy Recordings,D,Major,4 beats,0.666,0.841,0.0312,0.613,0.0,0.113,0.84,-3.592,128.706,126360,19,156741431
6450,7vQbuQcyTflfCIOu3Uzzya,Bobby Helms,Jingle Bell Rock,Geffen,D,Major,4 beats,0.754,0.424,0.0363,0.643,0.0,0.0652,0.806,-8.463,119.705,130973,36,483558041


Existem musicas que provavelmente que entraram e sairam do spotify e voltaram novamente em algum momento, covers, musicas com nomes iguais mas artistas diferente e musicas com os mesmos artistas mas colunas com informações sonoras diferentes. Preciso me atentar quando for manipular colunas que musicas repetidas e com as mesma caracteristicas possam atrapalhar