In [1]:
import pandas as pd
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Fetch the Spotify songs CSV from Kaggle and load it through the pandas adapter.
# `dataset` is the raw table; later cells sample from it and never modify it.
DATASET_HANDLE = "joebeachcapital/30000-spotify-songs"  # Kaggle "<owner>/<dataset>" handle
DATASET_FILE = "spotify_songs.csv"                      # file to read inside that dataset

dataset = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, DATASET_HANDLE, DATASET_FILE)


Downloading from https://www.kaggle.com/api/v1/datasets/download/joebeachcapital/30000-spotify-songs?dataset_version_number=2&file_name=spotify_songs.csv...


100%|██████████| 3.01M/3.01M [00:00<00:00, 29.6MB/s]

Extracting zip of spotify_songs.csv...





In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. STRATIFIED SAMPLING
# Draw 5000 rows while keeping the genre proportions of the full dataset:
# 'stratify' makes every playlist_genre as frequent in the subset as in the original data.
# The second half of the split (the remaining rows) is not needed and is discarded.
subset, _ = train_test_split(
    dataset,
    train_size=5000,
    stratify=dataset['playlist_genre'],
    random_state=42,  # fixed seed so the same rows are drawn on every run
)

# Replace the shuffled original row labels with a clean 0..n-1 index
subset = subset.reset_index(drop=True)

# Display initial subset shape
print(f"Initial subset shape: {subset.shape}")

# 2. COLUMNS EXCLUDED FROM MODELLING
# IDs, names and dates are removed because:
# a) They are (nearly) unique per row (high cardinality) and would explode the
#    feature space if encoded ("Curse of Dimensionality").
# b) The model should learn from audio features, not from the song title.
cols_to_drop = [
    'track_id',
    'track_name',
    'track_artist',
    'track_album_id',
    'track_album_name',
    'track_album_release_date',
    'playlist_name',
    'playlist_id',
    'playlist_subgenre' # We drop subgenre to focus on the main 'playlist_genre'
]
target_col = 'playlist_genre'

# 3. HANDLE MISSING VALUES
# Only NaNs in columns that survive the drop (features + target) matter. Checking every
# column would discard songs merely because, e.g., their title is missing, even though
# the title never reaches the model.
kept_cols = [col for col in subset.columns if col not in cols_to_drop]
has_nan = subset[kept_cols].isna().any(axis=1)

nan_rows = has_nan.sum()
nan_perc = nan_rows / subset.shape[0] * 100
print(f"Rows with NaN values: {nan_rows} ({nan_perc:.2f}%)")

# IMPORTANT: filter 'subset', never the original 'dataset'.
# Re-index AFTER filtering so the index stays gap-free even when rows were removed.
subset = subset.loc[~has_nan].reset_index(drop=True)
print(f"Shape after dropping NaNs: {subset.shape}")

# 4. FEATURE SELECTION (DROPPING IRRELEVANT COLUMNS)
# errors='ignore' makes the drop tolerant of columns that are absent from the dataframe
subset_clean = subset.drop(columns=cols_to_drop, errors='ignore')

# 5. SPLIT FEATURES (X) AND TARGET (y)
# The target is separated BEFORE any further processing to avoid data leakage.

# y = The Labels (Target)
y = subset_clean[target_col]

# X = The Features: numeric audio descriptors plus track_popularity and duration_ms
X = subset_clean.drop(columns=[target_col])

# 6. FINAL VERIFICATION
print("\n--- Features (X) Head ---")
# This should contain only numerical columns like danceability, energy, tempo...
display(X.head())

print("\n--- Target (y) Head ---")
# This should contain the genres (pop, rock, etc.)
display(y.head())

# Show the data types of X, then actually enforce that every feature is numeric
print("\n--- X Data Types ---")
print(X.dtypes)

non_numeric_cols = [col for col in X.columns if not pd.api.types.is_numeric_dtype(X[col])]
assert not non_numeric_cols, f"Non-numeric feature columns remain in X: {non_numeric_cols}"

Initial subset shape: (5000, 23)
Rows with NaN values: 0 (0.00%)
Shape after dropping NaNs: (5000, 23)

--- Features (X) Head ---


Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,64,0.691,0.873,5,-8.758,0,0.0382,0.33,2e-06,0.0362,0.916,93.685,251253
1,50,0.553,0.328,9,-10.797,1,0.0341,0.643,0.0,0.109,0.345,103.472,270960
2,1,0.624,0.733,7,-5.149,1,0.0273,0.16,0.529,0.157,0.282,136.012,220667
3,55,0.682,0.681,10,-6.997,1,0.087,0.388,0.000138,0.385,0.516,87.469,145947
4,35,0.764,0.889,11,-4.697,0,0.141,0.0438,0.0,0.377,0.59,122.208,270200



--- Target (y) Head ---


Unnamed: 0,playlist_genre
0,latin
1,rock
2,pop
3,r&b
4,rap



--- X Data Types ---
track_popularity      int64
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
duration_ms           int64
dtype: object
