In [2]:
import pandas as pd
import random

# Helper function to generate realistic audio features
def random_features(base_val, variation=0.1):
    val = base_val + random.uniform(-variation, variation)
    # Keep values within 0-1 for normalized features
    return max(0, min(1, val))

tracks = []

# Define 3 classes
classes = ["liked", "neutral", "disliked"]

for cls in classes:
    for i in range(50):
        # Base values for features by class
        if cls == "liked":
            dance = random_features(0.8)
            energy = random_features(0.75)
            valence = random_features(0.85)
        elif cls == "neutral":
            dance = random_features(0.5)
            energy = random_features(0.5)
            valence = random_features(0.5)
        else:  # disliked
            dance = random_features(0.3)
            energy = random_features(0.35)
            valence = random_features(0.2)
        
        track = {
            "track_id": f"{cls}_{i+1}",
            "track_name": f"{cls.capitalize()} Song {i+1}",
            "artist_name": f"Artist {i+1}",
            "album_name": f"{cls.capitalize()} Album {i+1}",
            "duration_ms": random.randint(150000, 300000),
            "danceability": dance,
            "energy": energy,
            "key": random.randint(0,11),
            "loudness": random.uniform(-15, -5),
            "mode": random.randint(0,1),
            "speechiness": random.uniform(0.03,0.1),
            "acousticness": random.uniform(0.0,0.8),
            "instrumentalness": random.uniform(0.0,0.5),
            "liveness": random.uniform(0.0,0.3),
            "valence": valence,
            "tempo": random.uniform(70,160),
            "popularity": random.randint(20,90),
            "playlist_name": random.choice(["Discover Weekly", "Release Radar"]),
            "playlist_type": random.choice(["discover_weekly","release_radar"]),
            "user_label": cls  # liked / neutral / disliked
        }
        tracks.append(track)

# Convert to DataFrame
df = pd.DataFrame(tracks)

# Save to CSV
df.to_csv("sample_spotify_likes_dataset_large.csv", index=False)

print("Large sample dataset created! Shape:", df.shape)
df.head()


Large sample dataset created! Shape: (150, 20)


Unnamed: 0,track_id,track_name,artist_name,album_name,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,popularity,playlist_name,playlist_type,user_label
0,liked_1,Liked Song 1,Artist 1,Liked Album 1,236963,0.83255,0.74617,0,-6.801118,0,0.074184,0.114732,0.432678,0.196662,0.872891,142.610619,56,Release Radar,discover_weekly,liked
1,liked_2,Liked Song 2,Artist 2,Liked Album 2,268504,0.859259,0.794001,1,-8.652977,1,0.079939,0.551897,0.120098,0.086692,0.858906,133.160965,68,Release Radar,release_radar,liked
2,liked_3,Liked Song 3,Artist 3,Liked Album 3,211955,0.843244,0.783737,4,-11.914876,1,0.055502,0.49281,0.35142,0.06147,0.878269,156.063764,32,Discover Weekly,release_radar,liked
3,liked_4,Liked Song 4,Artist 4,Liked Album 4,242253,0.71703,0.7657,3,-7.223379,0,0.052061,0.299828,0.271306,0.083949,0.864498,101.295537,30,Discover Weekly,release_radar,liked
4,liked_5,Liked Song 5,Artist 5,Liked Album 5,244519,0.823733,0.685467,9,-6.488025,1,0.07154,0.234729,0.320165,0.25858,0.765604,72.576062,21,Release Radar,release_radar,liked
