In [8]:
from sdv.sampling import Condition
from sdv.lite import SingleTablePreset
from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot

In [9]:
datasets = load_csvs(".")
real_data = datasets["spotify_tracks"]

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(real_data)



In [3]:
synthesizer = SingleTablePreset(
    metadata=metadata,
    name='FAST_ML'
)

In [4]:
synthesizer.fit(real_data)

In [11]:
# create list of all unique track_genres in real_data
track_genres = real_data['track_genre'].unique().tolist()

In [14]:
# sample 1000 rows per genre
conditions = [
    Condition(num_rows=1000, column_values={'track_genre': genre})
    for genre in track_genres
]

synthetic_data = synthesizer.sample_from_conditions(conditions=conditions, output_file_path='synthetic_data.csv')

Sampling conditions: 100%|██████████| 114000/114000 [00:34<00:00, 3290.28it/s]


In [15]:
synthetic_data.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,1001,7IikbYJuxnILUaytiJ6KXY,Girls' Generation,Fragments of Freedom,Snap Out Of It,40,223551,False,0.140962,0.308925,...,-12.935391,0,0.0,0.829654,0.085631,0.32018,0.099219,58.253383,3,acoustic
1,778,6S3JlDAGk3uu3NtZbPnuhS,Megadeth,This Is Jimmy Clanton,Posso Clamar,44,303386,False,0.648285,0.739773,...,-3.925941,1,0.0,0.167758,0.51055,0.106409,0.32201,117.675969,3,acoustic
2,519,6S3JlDAGk3uu3NtZbPnuhS,Özdemir Erdoğan,Return of the Dream Canteen,Run Rudolph Run,61,104428,True,0.456375,0.243728,...,-15.601375,1,0.080802,0.716644,0.126997,0.082134,0.0,75.465353,4,acoustic
3,550,0iAkkPHwTkS1nC2vbcw3Ms,Kishore Kumar,Panakkaran (Original Motion Picture Soundtrack),The Wilhelm Scream,14,203147,True,0.472842,0.850369,...,-4.642574,0,0.211355,0.0,0.31636,0.456496,0.168052,132.867038,4,acoustic
4,449,5W1UtLcQLQOFZDpDeNn8Gi,Bobby Goldsboro,Super Fun Learning Songs,"12 Variations on an Allegretto in B Flat, K.50...",42,168044,False,0.209686,0.5537,...,-6.717145,0,0.0,0.338755,0.0,0.430138,0.0,97.643418,4,acoustic


quality_report = evaluate_quality(
    real_data,
    synthetic_data,
    metadata
)

In [24]:
fig = get_column_plot(
    real_data=real_data,
    synthetic_data=synthetic_data,
    column_name='tempo',
    metadata=metadata
)
    
fig.show()