In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer  # must import to enable IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import classification_report
import numpy as np


In [6]:
df = pd.read_csv("sample_spotify_likes_dataset_noisy.csv")
print(df.shape)
df.head()


(300, 20)


Unnamed: 0,track_id,track_name,artist_name,album_name,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,popularity,playlist_name,playlist_type,user_label
0,liked_1,Liked Song 1,Artist 1,Liked Album 1,160279,0.551132,0.745578,6,-11.726042,0,0.061513,0.136288,0.333667,0.098837,,107.903753,81,Release Radar,release_radar,liked
1,liked_2,Liked Song 2,Artist 2,Liked Album 2,247394,0.628896,0.723333,10,-5.380365,0,0.031586,0.172677,0.235328,0.143355,0.812165,134.891325,66,Discover Weekly,release_radar,liked
2,liked_3,Liked Song 3,Artist 3,Liked Album 3,243242,0.600947,0.586471,2,-12.391219,0,0.094269,0.727056,0.452457,0.238392,0.735293,86.169704,48,Discover Weekly,release_radar,liked
3,liked_4,Liked Song 4,Artist 4,Liked Album 4,246344,0.614178,,11,-6.390501,1,0.090372,0.708321,0.463525,0.152212,0.772903,80.03831,69,Release Radar,release_radar,liked
4,liked_5,Liked Song 5,Artist 5,Liked Album 5,167233,0.764733,0.59798,6,-12.826981,0,0.04975,0.067894,0.417688,0.178197,0.737663,124.349128,83,Release Radar,discover_weekly,liked


In [7]:
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['user_label'])
df.head()

Unnamed: 0,track_id,track_name,artist_name,album_name,duration_ms,danceability,energy,key,loudness,mode,...,acousticness,instrumentalness,liveness,valence,tempo,popularity,playlist_name,playlist_type,user_label,label_encoded
0,liked_1,Liked Song 1,Artist 1,Liked Album 1,160279,0.551132,0.745578,6,-11.726042,0,...,0.136288,0.333667,0.098837,,107.903753,81,Release Radar,release_radar,liked,1
1,liked_2,Liked Song 2,Artist 2,Liked Album 2,247394,0.628896,0.723333,10,-5.380365,0,...,0.172677,0.235328,0.143355,0.812165,134.891325,66,Discover Weekly,release_radar,liked,1
2,liked_3,Liked Song 3,Artist 3,Liked Album 3,243242,0.600947,0.586471,2,-12.391219,0,...,0.727056,0.452457,0.238392,0.735293,86.169704,48,Discover Weekly,release_radar,liked,1
3,liked_4,Liked Song 4,Artist 4,Liked Album 4,246344,0.614178,,11,-6.390501,1,...,0.708321,0.463525,0.152212,0.772903,80.03831,69,Release Radar,release_radar,liked,1
4,liked_5,Liked Song 5,Artist 5,Liked Album 5,167233,0.764733,0.59798,6,-12.826981,0,...,0.067894,0.417688,0.178197,0.737663,124.349128,83,Release Radar,discover_weekly,liked,1


In [8]:
features = df[['danceability','energy','valence','acousticness','instrumentalness','liveness','speechiness','tempo','loudness','mode']]
target = df['label_encoded']

In [9]:
# Percent missing per column
print(df.isna().mean() * 100)


track_id            0.000000
track_name          0.000000
artist_name         0.000000
album_name          0.000000
duration_ms         0.000000
danceability        7.000000
energy              4.666667
key                 0.000000
loudness            0.000000
mode                0.000000
speechiness         0.000000
acousticness        0.000000
instrumentalness    0.000000
liveness            0.000000
valence             7.000000
tempo               4.666667
popularity          0.000000
playlist_name       0.000000
playlist_type       0.000000
user_label          0.000000
label_encoded       0.000000
dtype: float64


In [10]:
imputer = IterativeImputer(random_state=42)
features_imputed = imputer.fit_transform(features)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    features_imputed, target, test_size=0.2, random_state=42
)

In [12]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)


In [13]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))


              precision    recall  f1-score   support

    disliked       0.92      1.00      0.96        22
       liked       0.96      1.00      0.98        22
     neutral       1.00      0.81      0.90        16

    accuracy                           0.95        60
   macro avg       0.96      0.94      0.94        60
weighted avg       0.95      0.95      0.95        60



In [14]:
y_probs = clf.predict_proba(X_test)

# Example: print probabilities for first 5 songs
for i, probs in enumerate(y_probs[:5]):
    print(f"Song {i+1} -> Disliked: {probs[0]:.2f}, Neutral: {probs[1]:.2f}, Liked: {probs[2]:.2f}")

Song 1 -> Disliked: 0.77, Neutral: 0.00, Liked: 0.23
Song 2 -> Disliked: 0.77, Neutral: 0.01, Liked: 0.22
Song 3 -> Disliked: 0.41, Neutral: 0.00, Liked: 0.59
Song 4 -> Disliked: 0.00, Neutral: 1.00, Liked: 0.00
Song 5 -> Disliked: 0.95, Neutral: 0.00, Liked: 0.05
