In [32]:
## general imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math

In [120]:
# Requires the pandas extra of kagglehub:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Kaggle handle of the dataset used throughout this notebook
dataset_handle = "wardabilal/spotify-global-music-dataset-20092025"

# Fetch the latest version of the dataset and report where the files are stored
path = kagglehub.dataset_download(dataset_handle)
print("Path to dataset files:", path)

# File inside the dataset that gets loaded as a pandas DataFrame
file_path = "track_data_final.csv"

# Additional arguments such as sql_query or pandas_kwargs can be passed after
# file_path. See the documentation for more information:
# https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
df = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_handle, file_path)

print("First 5 records:", df.head())

Path to dataset files: C:\Users\linus\.cache\kagglehub\datasets\wardabilal\spotify-global-music-dataset-20092025\versions\1
First 5 records:                  track_id                           track_name  track_number  \
0  6pymOcrCnMuCWdgGVTvUgP                                    3            57   
1  2lWc1iJlz2NVcStV5fbtPG                               Clouds             1   
2  1msEuwSBneBKpVCZQcFTsU  Forever & Always (Taylor’s Version)            11   
3  7bcy34fBT2ap1L4bfPsl9q            I Didn't Change My Number             2   
4  0GLfodYacy3BJE7AI3A8en                             Man Down             7   

   track_popularity  track_duration_ms  explicit     artist_name  \
0                61             213173     False  Britney Spears   
1                67             158760     False           BUNT.   
2                63             225328     False    Taylor Swift   
3                72             158463      True   Billie Eilish   
4                57             267013

# Cleaning data
We need to remove the columns that don't add any value to our analysis. For instance, the track number and the album a track belongs to won't give us any insight into how well the song performs (the album benefits from the track, not the other way around). We'll also remove the album's total track count, the artist genres (half of the artists don't have any), and the track name and id.
<br>
We'll keep the date, as it gives us a chronological indicator.

In [121]:
## dropping the identifier / album / genre columns that are not used in the analysis
columns_to_drop = [
    "track_number",
    "track_name",
    "track_id",
    "artist_genres",
    "album_id",
    "album_name",
    "album_total_tracks",
]
df = df.drop(columns=columns_to_drop)

df.head()

Unnamed: 0,track_popularity,track_duration_ms,explicit,artist_name,artist_popularity,artist_followers,album_release_date,album_type
0,61,213173,False,Britney Spears,80.0,17755451.0,2009-11-09,compilation
1,67,158760,False,BUNT.,69.0,293734.0,2023-01-13,single
2,63,225328,False,Taylor Swift,100.0,145396321.0,2021-04-09,album
3,72,158463,True,Billie Eilish,90.0,118692183.0,2021-07-30,album
4,57,267013,False,Rihanna,90.0,68997177.0,2010-01-01,album


In [94]:
## counting the missing values in each column
df.isna().sum()

track_popularity      0
track_duration_ms     0
explicit              0
artist_name           4
artist_popularity     4
artist_followers      4
album_release_date    0
album_type            0
dtype: int64

In [122]:
# Collecting the tracks that have no artist name, to judge whether they matter for our analysis

missing_artist_name = []
for row_label, record in df.iterrows():
    # keep the index label together with the full row for inspection
    if pd.isna(record["artist_name"]):
        missing_artist_name.append([row_label, record])
print(missing_artist_name)

[[436, track_popularity              65
track_duration_ms         214586
explicit                    True
artist_name                  NaN
artist_popularity            NaN
artist_followers             NaN
album_release_date    2012-01-01
album_type                 album
Name: 436, dtype: object], [1396, track_popularity              68
track_duration_ms         160166
explicit                   False
artist_name                  NaN
artist_popularity            NaN
artist_followers             NaN
album_release_date    2023-04-07
album_type                single
Name: 1396, dtype: object], [3885, track_popularity              30
track_duration_ms         255634
explicit                   False
artist_name                  NaN
artist_popularity            NaN
artist_followers             NaN
album_release_date    2011-04-18
album_type                single
Name: 3885, dtype: object], [3886, track_popularity              59
track_duration_ms         247533
explicit                   Fals

In [123]:
## filling the incomplete rows
# numeric artist statistics fall back to the most frequent value of their column
mode_artist_popularity = df["artist_popularity"].mode()[0]
mode_artist_followers = df["artist_followers"].mode()[0]

# names and dates cannot sensibly be imputed (many distinct values, no order),
# so their gaps are labelled "unknown" instead
replacement_by_column = {
    "artist_popularity": mode_artist_popularity,
    "artist_followers": mode_artist_followers,
    "artist_name": "unknown",
    "album_release_date": "unknown",
}
for column, replacement in replacement_by_column.items():
    df[column] = df[column].fillna(replacement)

In [97]:
# every column should now report zero missing values
df.isna().sum()

track_popularity      0
track_duration_ms     0
explicit              0
artist_name           0
artist_popularity     0
artist_followers      0
album_release_date    0
album_type            0
dtype: int64

In [None]:
## listing the distinct labels of the text columns before they get encoded
for text_column in ("album_type", "artist_name"):
    print(df[text_column].unique())


['compilation' 'single' 'album']
['Britney Spears' 'BUNT.' 'Taylor Swift' ... 'Troy' 'Zelda' 'bôa']


In [124]:
## encoding the text / boolean columns as integers so that the model can use them

# Artist names -> consecutive ids (0, 1, 2, ...) in order of first appearance.
# The earlier hand-written loop advanced its counter on every row instead of
# only on new artists, so the ids had gaps (each id was the row position of
# the artist's first track). pd.factorize yields dense codes directly.
artist_codes, artist_names = pd.factorize(df["artist_name"])
# name -> id lookup, kept so that ids can be translated back to artists later
conversion_artist = {name: code for code, name in enumerate(artist_names)}
df["artist_name"] = artist_codes

# same for album type (a label that is missing from this mapping becomes NaN)
df["album_type"] = df["album_type"].map({"compilation" : 0, "single" : 1, "album" : 2})

# same for explicit content: False -> 0, True -> 1
df["explicit"] = df["explicit"].map({False : 0, True : 1})


In [125]:
df.head()

Unnamed: 0,track_popularity,track_duration_ms,explicit,artist_name,artist_popularity,artist_followers,album_release_date,album_type
0,61,213173,0,0,80.0,17755451.0,2009-11-09,0
1,67,158760,0,1,69.0,293734.0,2023-01-13,1
2,63,225328,0,2,100.0,145396321.0,2021-04-09,2
3,72,158463,1,3,90.0,118692183.0,2021-07-30,2
4,57,267013,0,4,90.0,68997177.0,2010-01-01,2


In [None]:
## a raw date string is not usable by the model: split it into numeric year / month / day
date_order = ["year", "month", "day"]

# Split "YYYY-MM-DD" into at most three pieces for the whole column at once.
# reindex guarantees three columns even when no date in the data has a month or day.
date_parts = (
    df["album_release_date"]
    .astype(str)
    .str.split("-", n=2, expand=True)
    .reindex(columns=range(len(date_order)))
)
date_parts.columns = date_order

# Missing pieces (year-only or year-month dates) and non-numeric placeholders
# such as "unknown" become 0, so every new column ends up as a plain integer
# column instead of a mix of strings and ints.
date_parts = date_parts.apply(pd.to_numeric, errors="coerce").fillna(0).astype("int64")

for column in date_order:
    # to_numpy() assigns by position, so the row order of df is preserved as-is
    df[column] = date_parts[column].to_numpy()

df = df.drop(columns="album_release_date")

df.head()

Unnamed: 0,track_popularity,track_duration_ms,explicit,artist_name,artist_popularity,artist_followers,album_release_date,album_type,year,month,day
0,61,213173,0,0,80.0,17755451.0,2009-11-09,0,2009,11,9
1,67,158760,0,1,69.0,293734.0,2023-01-13,1,2023,1,13
2,63,225328,0,2,100.0,145396321.0,2021-04-09,2,2021,4,9
3,72,158463,1,3,90.0,118692183.0,2021-07-30,2,2021,7,30
4,57,267013,0,4,90.0,68997177.0,2010-01-01,2,2010,1,1


In [149]:
# confirming that the date split introduced no missing values
df.isna().sum()

track_popularity      0
track_duration_ms     0
explicit              0
artist_name           0
artist_popularity     0
artist_followers      0
album_release_date    0
album_type            0
year                  0
month                 0
day                   0
dtype: int64