In [1]:
import pandas as pd
import random
import numpy as np

def random_features(base_val, variation=0.15):
    # Add noise
    val = base_val + random.uniform(-variation, variation)
    return max(0, min(1, val))  # Clamp to [0,1]

tracks = []
classes = ["liked", "neutral", "disliked"]

for cls in classes:
    for i in range(100):
        # Base feature values by class, with overlaps
        if cls == "liked":
            dance_base = 0.7
            energy_base = 0.7
            valence_base = 0.75
        elif cls == "neutral":
            dance_base = 0.5
            energy_base = 0.5
            valence_base = 0.5
        else:  # disliked
            dance_base = 0.35
            energy_base = 0.4
            valence_base = 0.3

        track = {
            "track_id": f"{cls}_{i+1}",
            "track_name": f"{cls.capitalize()} Song {i+1}",
            "artist_name": f"Artist {i+1}",
            "album_name": f"{cls.capitalize()} Album {i+1}",
            "duration_ms": random.randint(150000, 300000),
            "danceability": random_features(dance_base),
            "energy": random_features(energy_base),
            "key": random.randint(0,11),
            "loudness": random.uniform(-15, -5),
            "mode": random.randint(0,1),
            "speechiness": random.uniform(0.03,0.15),
            "acousticness": random.uniform(0.0,0.9),
            "instrumentalness": random.uniform(0.0,0.5),
            "liveness": random.uniform(0.0,0.3),
            "valence": random_features(valence_base),
            "tempo": random.uniform(70,160),
            "popularity": random.randint(20,90),
            "playlist_name": random.choice(["Discover Weekly", "Release Radar"]),
            "playlist_type": random.choice(["discover_weekly","release_radar"]),
            "user_label": cls
        }

        # Randomly drop some values (~5%) to simulate missing data
        for col in ["danceability","energy","valence","tempo"]:
            if random.random() < 0.05:
                track[col] = np.nan

        tracks.append(track)

# Convert to DataFrame
df = pd.DataFrame(tracks)

# Save to CSV
df.to_csv("sample_spotify_likes_dataset_noisy.csv", index=False)

print("Noisy sample dataset created! Shape:", df.shape)
df.head()


Noisy sample dataset created! Shape: (300, 20)


Unnamed: 0,track_id,track_name,artist_name,album_name,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,popularity,playlist_name,playlist_type,user_label
0,liked_1,Liked Song 1,Artist 1,Liked Album 1,160279,0.551132,0.745578,6,-11.726042,0,0.061513,0.136288,0.333667,0.098837,,107.903753,81,Release Radar,release_radar,liked
1,liked_2,Liked Song 2,Artist 2,Liked Album 2,247394,0.628896,0.723333,10,-5.380365,0,0.031586,0.172677,0.235328,0.143355,0.812165,134.891325,66,Discover Weekly,release_radar,liked
2,liked_3,Liked Song 3,Artist 3,Liked Album 3,243242,0.600947,0.586471,2,-12.391219,0,0.094269,0.727056,0.452457,0.238392,0.735293,86.169704,48,Discover Weekly,release_radar,liked
3,liked_4,Liked Song 4,Artist 4,Liked Album 4,246344,0.614178,,11,-6.390501,1,0.090372,0.708321,0.463525,0.152212,0.772903,80.03831,69,Release Radar,release_radar,liked
4,liked_5,Liked Song 5,Artist 5,Liked Album 5,167233,0.764733,0.59798,6,-12.826981,0,0.04975,0.067894,0.417688,0.178197,0.737663,124.349128,83,Release Radar,discover_weekly,liked
