# Análisis exploratorio de las canciones de Spotify más escuchadas del 2024 

In [1]:
# Importar las librerías necesarias
import pandas as pd
import plotly.express as px



In [2]:
# Load the raw dataset from disk, decoding the file as ISO-8859-1 text
csv_path = "../data/songs_2024.csv"
df = pd.read_csv(csv_path, encoding="ISO-8859-1")

## Análisis del dataframe


In [3]:
# Print a summary of the dataframe: columns, non-null counts and dtypes
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Track                       4600 non-null   object 
 1   Album Name                  4600 non-null   object 
 2   Artist                      4595 non-null   object 
 3   Release Date                4600 non-null   object 
 4   ISRC                        4600 non-null   object 
 5   All Time Rank               4600 non-null   object 
 6   Track Score                 4600 non-null   float64
 7   Spotify Streams             4487 non-null   object 
 8   Spotify Playlist Count      4530 non-null   object 
 9   Spotify Playlist Reach      4528 non-null   object 
 10  Spotify Popularity          3796 non-null   float64
 11  YouTube Views               4292 non-null   object 
 12  YouTube Likes               4285 non-null   object 
 13  TikTok Posts                3427 

In [4]:
# Remove the limit on displayed columns so wide frames are rendered in full,
# then preview the first rows of the raw data.
pd.options.display.max_columns = None
df.head(n=5)



Unnamed: 0,Track,Album Name,Artist,Release Date,ISRC,All Time Rank,Track Score,Spotify Streams,Spotify Playlist Count,Spotify Playlist Reach,Spotify Popularity,YouTube Views,YouTube Likes,TikTok Posts,TikTok Likes,TikTok Views,YouTube Playlist Reach,Apple Music Playlist Count,AirPlay Spins,SiriusXM Spins,Deezer Playlist Count,Deezer Playlist Reach,Amazon Playlist Count,Pandora Streams,Pandora Track Stations,Soundcloud Streams,Shazam Counts,TIDAL Popularity,Explicit Track
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,QM24S2402528,1,725.4,390470936,30716,196631588,92.0,84274754,1713126,5767700,651565900.0,5332281936.0,150597040,210.0,40975,684,62.0,17598718,114.0,18004655,22931,4818457.0,2669262,,0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,USUG12400910,2,545.9,323703884,28113,174597137,92.0,116347040,3486739,674700,35223547.0,208339025.0,156380351,188.0,40778,3,67.0,10422430,111.0,7780028,28444,6623075.0,1118279,,1
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,QZJ842400387,3,538.4,601309283,54331,211607669,92.0,122599116,2228730,3025400,275154237.0,3369120610.0,373784955,190.0,74333,536,136.0,36321847,172.0,5022621,5639,7208651.0,5285340,,0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,USSM12209777,4,444.9,2031280633,269802,136569078,85.0,1096100899,10629796,7189811,1078757968.0,14603725994.0,3351188582,394.0,1474799,2182,264.0,24684248,210.0,190260277,203384,,11822942,,0
4,Houdini,Houdini,Eminem,5/31/2024,USUG12403398,5,423.3,107034922,7223,151469874,88.0,77373957,3670188,16400,,,112763851,182.0,12185,1,82.0,17660624,105.0,4493884,7006,207179.0,457017,,1


## Filtrar columnas

In [5]:
# Keep only the identifying columns plus every column whose name mentions Spotify.
base_columns = ['Track', 'Album Name', 'Artist', 'Release Date', 'All Time Rank', 'Track Score']
filtered_columns = [col for col in df.columns if 'spotify' in col.lower()]

# .copy() makes the subset an independent frame instead of a selection that
# pandas still tracks as derived from `df` (the usual origin of
# SettingWithCopyWarning when columns are written to it afterwards).
spotify_df = df[base_columns + filtered_columns].copy()

# Normalise column names to snake_case: lowercase, spaces replaced by underscores.
spotify_df.columns = spotify_df.columns.str.lower().str.replace(' ', '_', regex=False)
display(spotify_df.head())


Unnamed: 0,track,album_name,artist,release_date,all_time_rank,track_score,spotify_streams,spotify_playlist_count,spotify_playlist_reach,spotify_popularity
0,MILLION DOLLAR BABY,Million Dollar Baby - Single,Tommy Richman,4/26/2024,1,725.4,390470936,30716,196631588,92.0
1,Not Like Us,Not Like Us,Kendrick Lamar,5/4/2024,2,545.9,323703884,28113,174597137,92.0
2,i like the way you kiss me,I like the way you kiss me,Artemas,3/19/2024,3,538.4,601309283,54331,211607669,92.0
3,Flowers,Flowers - Single,Miley Cyrus,1/12/2023,4,444.9,2031280633,269802,136569078,85.0
4,Houdini,Houdini,Eminem,5/31/2024,5,423.3,107034922,7223,151469874,88.0


In [6]:
# Inspect the new dataframe: columns, non-null counts and dtypes
spotify_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   track                   4600 non-null   object 
 1   album_name              4600 non-null   object 
 2   artist                  4595 non-null   object 
 3   release_date            4600 non-null   object 
 4   all_time_rank           4600 non-null   object 
 5   track_score             4600 non-null   float64
 6   spotify_streams         4487 non-null   object 
 7   spotify_playlist_count  4530 non-null   object 
 8   spotify_playlist_reach  4528 non-null   object 
 9   spotify_popularity      3796 non-null   float64
dtypes: float64(2), object(8)
memory usage: 359.5+ KB


## Limpieza y tipos de datos

In [None]:
# Drop rows that lack a track title, artist or stream count, then keep only the
# first row for each track title.
# This is done as a single chain that returns a new frame, rather than writing a
# de-duplicated Series back into the 'track' column and dropping the NaNs that
# the index alignment creates; that approach assigns into a filtered frame and
# hides the intent. The surviving rows are the same.
# NOTE(review): de-duplicating on the title alone also discards different songs
# that happen to share a name — confirm that this is intended.
spotify_df = (
    spotify_df
    .dropna(subset=['track', 'artist', 'spotify_streams'])
    .drop_duplicates(subset='track', keep='first')
)

# Count of remaining missing values per column
spotify_df.isnull().sum()


track                       0
album_name                  0
artist                      0
release_date                0
all_time_rank               0
track_score                 0
spotify_streams             0
spotify_playlist_count     20
spotify_playlist_reach     22
spotify_popularity        706
dtype: int64

In [8]:
# Show the column dtypes before conversion
display(spotify_df.dtypes)

# Convert both columns through assign(), which returns a new frame instead of
# writing into a filtered one, and which can be re-run safely.
spotify_df = spotify_df.assign(
    # Parse 'release_date' as datetime; values that cannot be parsed become NaT.
    release_date=pd.to_datetime(spotify_df['release_date'], errors='coerce'),
    # Remove thousands separators and cast to integer.
    # astype(str) keeps the cell re-runnable once the column is already numeric
    # (the .str accessor is only available on string data), and 'int64' pins the
    # width instead of relying on the platform-dependent `int`.
    spotify_streams=(
        spotify_df['spotify_streams']
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype('int64')
    ),
)

# Column dtypes after conversion
spotify_df.dtypes


track                      object
album_name                 object
artist                     object
release_date               object
all_time_rank              object
track_score               float64
spotify_streams            object
spotify_playlist_count     object
spotify_playlist_reach     object
spotify_popularity        float64
dtype: object

track                             object
album_name                        object
artist                            object
release_date              datetime64[ns]
all_time_rank                     object
track_score                      float64
spotify_streams                    int64
spotify_playlist_count            object
spotify_playlist_reach            object
spotify_popularity               float64
dtype: object

## Visualización de los datos


In [None]:
# Histogram of stream counts per song, coloured by popularity bin.

# Split 'spotify_popularity' into 10 equal-width intervals labelled 1-10.
spotify_df["popularity_bin"] = pd.cut(
    spotify_df["spotify_popularity"], bins=10, labels=range(1, 11), ordered=True
)

# Legend / colour order for the bins: 1 through 10.
popularity_levels = list(range(1, 11))

fig = px.histogram(
    spotify_df,
    x='spotify_streams',
    color="popularity_bin",
    nbins=50,
    title='Streams según Popularidad (1–10)',
    category_orders={'popularity_bin': popularity_levels},
    color_discrete_sequence=px.colors.sequential.Viridis,
)

# Axis and legend titles.
histogram_titles = {
    'xaxis_title': 'Número de Streams',
    'yaxis_title': 'Número de Canciones',
    'legend_title': 'Popularidad',
}
fig.update_layout(**histogram_titles)

# Outline every bar with a 1px black border.
fig.update_traces(marker_line_width=1, marker_line_color="black")

fig.show()

In [None]:
# Release year vs. streams: shows whether older songs still dominate or
# whether newer releases climb faster.

# Extract the release year from the datetime column.
spotify_df['release_year'] = spotify_df['release_date'].dt.year

scatter_options = dict(
    x="release_year",
    y="spotify_streams",
    color="popularity_bin",   # colour by popularity bin
    size='popularity_bin',    # marker size by popularity bin
    opacity=0.4,
    title="Año de Lanzamiento vs Streams(log) en Spotify",
    category_orders={'popularity_bin': list(range(1, 11))},
    color_discrete_sequence=px.colors.qualitative.Bold,
)
fig = px.scatter(spotify_df, **scatter_options)

fig.update_layout(xaxis_title='Año de Lanzamiento', legend_title="Popularidad")

# Logarithmic y axis for the stream counts.
fig.update_yaxes(title="Número de Streams (log)", type="log")

fig.show()

In [70]:
# Ten most-streamed songs among those with release year 2024.
released_2024 = spotify_df['release_year'] == 2024
top_10_songs = (
    spotify_df.loc[released_2024]
    .nlargest(10, 'spotify_streams')
    .sort_values(by='spotify_streams', ascending=False)
)

# Track titles from most to least streamed, also used as the categorical order.
order = list(top_10_songs["track"])
top_10_songs["track"] = pd.Categorical(top_10_songs["track"], categories=order, ordered=True)
order_rev = list(reversed(order))

# Horizontal bar chart, one bar per track, coloured by artist.
bar_labels = {'track': 'Canción', 'spotify_streams': 'Número de Streams', 'artist': 'Artista'}
fig = px.bar(
    top_10_songs,
    x='spotify_streams',
    y='track',
    color='artist',
    orientation='h',
    title='Top 10 Canciones Más Escuchadas en 2024',
    labels=bar_labels,
    category_orders={"track": order_rev},
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.show()
