In [1]:
%pip install sdv

Collecting sdv
  Obtaining dependency information for sdv from https://files.pythonhosted.org/packages/bf/dc/584e1a86cb553fe4eb58cc04c35af7d37063491e670cc3b5872b8475afc2/sdv-1.4.0-py2.py3-none-any.whl.metadata
  Downloading sdv-1.4.0-py2.py3-none-any.whl.metadata (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3<2,>=1.15.0 (from sdv)
  Obtaining dependency information for boto3<2,>=1.15.0 from https://files.pythonhosted.org/packages/4d/65/a0927694bb02d8ae46c399d9a8ed2cd9c724adbc9bbb7849a0650850ad31/boto3-1.28.61-py3-none-any.whl.metadata
  Downloading boto3-1.28.61-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<2,>=1.18 (from sdv)
  Obtaining dependency information for botocore<2,>=1.18 from https://files.pythonhosted.org/packages/3b/6b/5292877865d820007c2c1afde587f17a2815e3e16d1a28421486f2816d81/botocore-1.31.61-py3-none-any.whl.metadata
  Downloading botocore-1.31.61-py3-none-

In [2]:
from sdv.sampling import Condition
from sdv.lite import SingleTablePreset
from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot

In [4]:
datasets = load_csvs(".")
real_data = datasets["spotify_tracks_reduced"]

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(real_data)



In [5]:
synthesizer = SingleTablePreset(
    metadata=metadata,
    name='FAST_ML'
)

In [6]:
synthesizer.fit(real_data)

In [7]:
# create list of all unique track_genres in real_data
track_genres = real_data['track_genre'].unique().tolist()

In [8]:
# sample 1000 rows per genre
conditions = [
    Condition(num_rows=1000, column_values={'track_genre': genre})
    for genre in track_genres
]

synthetic_data = synthesizer.sample_from_conditions(conditions=conditions, output_file_path='synthetic_data.csv')

Sampling conditions:   0%|          | 0/37000 [00:00<?, ?it/s]

Sampling conditions: 100%|██████████| 37000/37000 [00:08<00:00, 4566.74it/s]


In [9]:
synthetic_data.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,624,5y0nMZC5kF6uLYaLc97mSo,Cream,在動物園散步才是正經事,Last Last,46,350550,False,0.425629,0.265287,...,-16.143084,1,0.082919,0.582338,0.352354,0.129548,0.381069,138.005954,4,acoustic
1,573,4LfryUF6KUoJNPoOhSsV7z,Slipknot,"die sometime, it's good for u",Places,74,160563,True,0.590011,0.599355,...,-3.301788,1,0.153442,0.34651,0.0,0.514882,0.384911,104.775458,4,acoustic
2,164,4cO3Woi163T6kfaUEjsSFU,Eastmountainsouth,Gold-Diggers Sound,"10,000 DOLLARS",40,304184,False,0.387066,0.827122,...,-10.761001,1,0.267515,0.600324,0.508648,0.41206,0.452385,122.22472,3,acoustic
3,190,6KAf1Y6OHR34qdmgUFO5zf,my little airport,La Fine dell'Estate,Feels Right,1,221366,False,0.656337,0.502719,...,-11.14486,1,0.002993,0.184809,0.0,0.0112,0.995,79.907066,4,acoustic
4,422,0XHkGVCs5hxW2gtHnPHkJi,Frei Gilson,Mozart: A Night of Classics,Better Than Heaven,31,408802,False,0.624698,0.178146,...,-19.895502,1,0.0,0.639736,0.625501,0.052142,0.323888,101.331384,4,acoustic


quality_report = evaluate_quality(
    real_data,
    synthetic_data,
    metadata
)

In [14]:
fig = get_column_plot(
    real_data=real_data,
    synthetic_data=synthetic_data,
    column_name='acousticness',
    metadata=metadata
)
    
fig.show()