# Data Synthesis from the Hannum data

## 1. Data loading

 Let's set the notebook in autoreload mode and load the required libraries 

In [10]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
from biolearn.data_library import DataLibrary
import pandas as pd
from sdv.metadata import Metadata


First we load the data

In [None]:
hannum_acc = 'GSE40279'
if 'hannum' not in locals():
    # Load the dataset
    # This will download the dataset if it is not already available
    # The dataset is from the GEO database, specifically GSE40279
    hannum = DataLibrary().get(hannum_acc).load()
print(f'{hannum.dnam=}')
print(f'{hannum.metadata=}')

hannum.dnam=                GSM989827  GSM989828  ...  GSM990626  GSM990627
id                                    ...                      
cg00000029       0.464197   0.454883  ...   0.499145   0.458600
cg00000108       0.941091   0.939033  ...   0.931690   0.974731
cg00000109       0.911182   0.596455  ...   0.900938   0.829869
cg00000165       0.132014   0.206917  ...   0.167477   0.170578
cg00000236       0.717861   0.723935  ...   0.730215   0.782844
...                   ...        ...  ...        ...        ...
ch.9.98937537R   0.042808   0.036811  ...   0.056429   0.040701
ch.9.98957343R   0.052589   0.053343  ...   0.047835   0.027499
ch.9.98959675F   0.035624   0.075618  ...   0.028896   0.000000
ch.9.98989607R   0.028066   0.017428  ...   0.025346   0.011863
ch.9.991104F     0.043850   0.032950  ...   0.052959   0.085375

[473034 rows x 656 columns]
hannum.metadata=            age  sex             ethnicity       tissue
id                                                     

Now let's make a biolearn function to do load an accession, choose a target, and wrap it together in a dataframe, so that we can augment it with SDV

In [None]:
def get_biolearn_df(accession: str, target_col: str | None = None) -> tuple[pd.DataFrame, Metadata]:
    """
    Returns the Biolearn Dataset for the specified accession, including all the methylation data and the selected target column.
    """
    data = DataLibrary().get(accession).load()
    if target_col is not None and target_col not in data.metadata.columns:
        raise ValueError(f"Target column '{target_col}' not found in metadata."
                         f" Available columns: {data.metadata.columns.tolist()}")
    df = (data.dnam.T.join(data.metadata[target_col]) if target_col else 
          data.dnam.T)

    return (
        df,
        Metadata.detect_from_dataframe(
            data=df,
            table_name=accession))
hannum_data, hannum_metadata = get_biolearn_df(hannum_acc, target_col='age')
print(f'{hannum_data.shape=}')
display(hannum_data.head())

hannum_data.shape=(656, 473035)


Unnamed: 0,cg00000029,cg00000108,cg00000109,cg00000165,cg00000236,cg00000289,cg00000292,cg00000321,cg00000363,cg00000622,cg00000658,cg00000714,cg00000721,cg00000734,cg00000769,cg00000807,cg00000884,cg00000905,cg00000924,cg00000948,cg00000957,cg00001099,cg00001245,cg00001249,cg00001261,cg00001269,cg00001349,cg00001364,cg00001446,cg00001510,cg00001534,cg00001582,cg00001583,cg00001593,cg00001594,cg00001687,cg00001747,cg00001791,cg00001793,cg00001809,...,ch.9.592503R,ch.9.691424R,ch.9.72278394R,ch.9.750163F,ch.9.75018133F,ch.9.76081330F,ch.9.77067993R,ch.9.77250961R,ch.9.7776528F,ch.9.79515146F,ch.9.80193246F,ch.9.82095949F,ch.9.83519450F,ch.9.83610230R,ch.9.837340R,ch.9.84051654F,ch.9.84078312F,ch.9.84366407R,ch.9.86947500F,ch.9.87682774F,ch.9.88862796F,ch.9.898515R,ch.9.90287778F,ch.9.90621653R,ch.9.914443R,ch.9.919537F,ch.9.93373462R,ch.9.93402636R,ch.9.941347R,ch.9.945770F,ch.9.96055087R,ch.9.97139671F,ch.9.98463211R,ch.9.98936572R,ch.9.98937537R,ch.9.98957343R,ch.9.98959675F,ch.9.98989607R,ch.9.991104F,age
GSM989827,0.464197,0.941091,0.911182,0.132014,0.717861,0.686521,0.805003,0.228244,0.338483,0.016508,0.81014,0.177981,0.921818,0.09303,0.061099,0.825423,0.887293,0.080157,0.461842,0.860963,0.868008,0.724557,0.007187,0.884556,0.468196,0.82348,0.797852,0.84499,0.841705,0.51592,0.913358,0.059022,0.081076,0.940081,0.014163,0.994419,0.032287,0.879292,0.668893,0.788245,...,0.054992,0.034581,0.021039,0.026177,0.027904,0.033439,0.046605,0.024686,0.022716,0.040809,0.061157,0.0,0.027217,0.024176,0.030293,0.027931,0.025627,0.012241,0.04258,0.114574,0.033558,0.022865,0.017504,0.026138,0.036515,0.092026,0.013003,0.025619,0.043778,0.022659,0.109918,0.061222,0.034284,0.133692,0.042808,0.052589,0.035624,0.028066,0.04385,67.0
GSM989828,0.454883,0.939033,0.596455,0.206917,0.723935,0.619084,0.814672,0.310879,0.418998,0.005747,0.778277,0.144454,0.907529,0.087869,0.066413,0.794975,0.874965,0.077787,0.465295,0.843149,0.875177,0.773727,0.014777,0.870456,0.480313,0.857899,0.674899,0.858959,0.848453,0.477897,0.919409,0.076662,0.117756,0.940844,0.014231,0.985381,0.039957,0.89957,0.768924,0.781482,...,0.051482,0.01335,0.018421,0.026533,0.035707,0.040875,0.039543,0.019726,0.022132,0.05179,0.05238,0.014176,0.02602,0.037001,0.029746,0.016973,0.023624,0.012178,0.044214,0.105732,0.04273,0.024127,0.019708,0.028303,0.063338,0.114144,0.007174,0.029295,0.038686,0.005095,0.076996,0.05264,0.027978,0.12527,0.036811,0.053343,0.075618,0.017428,0.03295,89.0
GSM989829,0.485764,0.918802,0.870333,0.162861,0.719196,0.635678,0.824336,0.263215,0.424736,0.012197,0.768844,0.185125,0.916278,0.090048,0.062418,0.801009,0.861004,0.081269,0.448755,0.86881,0.882586,0.800382,0.012149,0.828569,0.468173,0.820224,0.820352,0.851901,0.827013,0.502154,0.907879,0.071017,0.101064,0.908645,0.014148,0.992431,0.071048,0.902628,0.650774,0.76086,...,0.048902,0.010345,0.029149,0.039623,0.033678,0.027344,0.048583,0.022987,0.034782,0.049825,0.046989,0.010377,0.02914,0.037939,0.032652,0.020021,0.043318,0.010776,0.044517,0.097549,0.038133,0.028466,0.018805,0.032412,0.041598,0.110516,0.014274,0.031233,0.035164,0.021444,0.070694,0.058888,0.032643,0.139105,0.042844,0.045973,0.126421,0.021752,0.022375,66.0
GSM989830,0.480785,0.929908,0.889689,0.19778,0.704061,0.610864,0.811152,0.316761,0.398772,0.019945,0.825187,0.162875,0.913187,0.091596,0.069014,0.800715,0.868666,0.079151,0.462793,0.8648,0.8724,0.836739,0.007205,0.844759,0.457894,0.837539,0.848526,0.85425,0.848894,0.499023,0.933358,0.071929,0.074679,0.902501,0.026563,0.991809,0.042836,0.897485,0.67255,0.784108,...,0.044365,0.024194,0.035627,0.017327,0.048607,0.027239,0.061536,0.030126,0.02536,0.049826,0.05131,0.007367,0.026216,0.038522,0.032679,0.021958,0.029685,0.010396,0.037067,0.11134,0.041635,0.029368,0.026202,0.027365,0.035321,0.102213,0.014307,0.035059,0.042269,0.028587,0.094749,0.056279,0.036997,0.140601,0.042258,0.048733,0.084051,0.027504,0.053007,64.0
GSM989831,0.50122,0.934548,0.89045,0.148437,0.754913,0.651262,0.808628,0.338289,0.366965,0.0,0.816176,0.198095,0.924478,0.095762,0.063317,0.798748,0.850316,0.091947,0.485184,0.861199,0.883432,0.78354,0.011335,0.88454,0.479263,0.857668,0.859865,0.831074,0.851069,0.48296,0.938526,0.07517,0.090993,0.923369,0.021058,0.996237,0.070497,0.899383,0.746606,0.758764,...,0.051306,0.020005,0.042227,0.018185,0.03488,0.032161,0.044261,0.015343,0.039565,0.050379,0.066897,0.019311,0.020241,0.03007,0.040077,0.016914,0.025134,0.019928,0.049483,0.131276,0.031274,0.038676,0.031734,0.022399,0.043735,0.111282,0.017529,0.026308,0.037813,0.018626,0.110543,0.057568,0.036746,0.129993,0.039613,0.039254,0.165874,0.020889,0.0,62.0


Now we can just synthesise more data!

# 2. Basic Usage

## 2.1 Creating a Synthesizer

An SDV **synthesizer** is an object that you can use to create synthetic data. It learns patterns from the real data and replicates them to generate synthetic data.

In [None]:
from sdv.single_table import GaussianCopulaSynthesizer

synthesizer = GaussianCopulaSynthesizer(hannum_metadata)
synthesizer.fit(hannum_data)


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



Now the synthesizer is ready to use!

## 2.2 Generating Synthetic Data

Use the `sample` function and pass in any number of rows to synthesize.

In [None]:
synthetic_data = synthesizer.sample(num_rows=500)
synthetic_data.head()

Unnamed: 0,guest_email,has_rewards,room_type,amenities_fee,checkin_date,checkout_date,room_rate,billing_address,credit_card_number
0,dsullivan@example.net,False,BASIC,0.29,27 Mar 2020,09 Mar 2020,135.15,"90469 Karla Knolls Apt. 781\nSusanberg, CA 70033",5161033759518983
1,steven59@example.org,False,DELUXE,8.15,07 Sep 2020,25 Jun 2020,183.24,"6108 Carla Ports Apt. 116\nPort Evan, MI 71694",4133047413145475690
2,brandon15@example.net,False,BASIC,11.65,22 Mar 2020,01 Apr 2020,163.57,86709 Jeremy Manors Apt. 786\nPort Garychester...,4977328103788
3,humphreyjennifer@example.net,False,BASIC,48.12,04 Jun 2020,14 May 2020,127.75,"8906 Bobby Trail\nEast Sandra, NY 43986",3524946844839485
4,joshuabrown@example.net,False,DELUXE,11.07,08 Jan 2020,13 Jan 2020,180.12,"732 Dennis Lane\nPort Nicholasstad, DE 49786",4446905799576890978


The synthesizer is generating synthetic guests in the **same format as the original data**.

## 2.3 Evaluating Real vs. Synthetic Data

SDV has built-in functions for evaluating the synthetic data and getting more insight.

As a first step, we can run a **diagnostic** to ensure that the data is valid. SDV's diagnostic performs some basic checks such as:

- All primary keys must be unique
- Continuous values must adhere to the min/max of the real data
- Discrete columns (non-PII) must have the same categories as the real data
- Etc.

In [None]:
from sdv.evaluation.single_table import run_diagnostic

diagnostic = run_diagnostic(
    real_data=hannum_data,
    synthetic_data=synthetic_data,
    metadata=hannum_metadata,
)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 9/9 [00:00<00:00, 1123.67it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 249.54it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



<font color="green"><b>The score is 100%</b></font>, indicating that the data is fully valid.

We can also measure the **data quality** or the statistical similarity between the real and synthetic data. This value may vary anywhere from 0 to 100%.

In [None]:
from sdv.evaluation.single_table import evaluate_quality

quality_report = evaluate_quality(
    hannum_data,
    synthetic_data,
    hannum_metadata,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 9/9 [00:00<00:00, 813.29it/s]|
Column Shapes Score: 90.06%

(2/2) Evaluating Column Pair Trends: |██████████| 36/36 [00:00<00:00, 501.50it/s]|
Column Pair Trends Score: 89.29%

Overall Score (Average): 89.68%



According to the score, the synthetic data is about 88% similar to the real data in terms of statistical similarity.

We can also get more details from the report. For example, the Column Shapes sub-score is 89%. Which columns had the highest vs. the lowest scores?

In [None]:
quality_report.get_details('Column Shapes')

Unnamed: 0,Column,Metric,Score
0,has_rewards,TVComplement,0.982
1,room_type,TVComplement,0.984
2,amenities_fee,KSComplement,0.764778
3,checkin_date,KSComplement,0.962
4,checkout_date,KSComplement,0.96875
5,room_rate,KSComplement,0.742


## 2.4 Visualizing the Data
For more insights, we can visualize the real vs. synthetic data.

Let's perform a 1D visualization comparing a column of the real data to the synthetic data.

In [None]:
from sdv.evaluation.single_table import get_column_plot

fig = get_column_plot(
    real_data=hannum_data,
    synthetic_data=synthetic_data,
    column_name='age',
    metadata=hannum_metadata
)

fig.show()

We can also visualize in 2D, comparing the correlations of a pair of columns.

In [None]:
from sdv.evaluation.single_table import get_column_pair_plot

fig = get_column_pair_plot(
    real_data=hannum_data,
    synthetic_data=synthetic_data,
    column_names=['cg00000108','cg00000029'],
    metadata=hannum_metadata
)

fig.show()

## 2.5 Saving and Loading
We can save the synthesizer to share with others and sample more synthetic data in the future.

In [None]:
synthesizer.save('my_synthesizer.pkl')

synthesizer = GaussianCopulaSynthesizer.load('my_synthesizer.pkl')

# 3. Gaussian Copula Customization

A key benefit of using the Gaussian Copula is **customization and transparency**. This synthesizer estimates the shape of every column using a 1D distribution. We can set these shapes ourselves.

In [None]:
custom_synthesizer = GaussianCopulaSynthesizer(
    hannum_metadata,
    default_distribution='truncnorm',
    numerical_distributions={
        'checkin_date': 'uniform',
        'checkout_date': 'uniform',
        'room_rate': 'gaussian_kde'
    }
)

custom_synthesizer.fit(hannum_data)

After training, we can inspect the distributions. In this case, the synthesizer returns the parameter it learned using the truncnorm distribution.

<font color=navy><i>More information about truncnorm distribution is available in the [scipy documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.truncnorm.html).</i></font>

In [None]:
learned_distributions = custom_synthesizer.get_learned_distributions()
learned_distributions['has_rewards']

{'distribution': 'truncnorm',
 'learned_parameters': {'a': np.float64(-0.5415003253031383),
  'b': np.float64(0.46460737566051247),
  'loc': np.float64(0.5390634681863229),
  'scale': np.float64(0.9878956157544778)}}

By setting these distributions strategically, you can make tradeoffs in the quality of your synthetic data.

In [None]:
synthetic_data_customized = custom_synthesizer.sample(num_rows=500)

quality_report = evaluate_quality(
    hannum_data,
    synthetic_data_customized,
    hannum_metadata
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 9/9 [00:00<00:00, 999.52it/s]|
Column Shapes Score: 93.49%

(2/2) Evaluating Column Pair Trends: |██████████| 36/36 [00:00<00:00, 584.63it/s]|
Column Pair Trends Score: 90.87%

Overall Score (Average): 92.18%



And we can verify this using the visualization functions.

In [None]:
fig = get_column_plot(
    real_data=hannum_data,
    synthetic_data=synthetic_data_customized,
    column_name='room_rate',
    metadata=hannum_metadata
)

fig.show()

# 4. Conditional Sampling
Another benefit of using the Gaussian Copula is the ability to **efficiently sample conditions**. This allows us to simulate hypothetical scenarios.

Let's start by creating a scenario where every hotel guest is staying in a `SUITE` (half with rewards and half without).

In [None]:
from sdv.sampling import Condition

suite_guests_with_rewards = Condition(
    num_rows=250,
    column_values={'room_type': 'SUITE', 'has_rewards': True}
)

suite_guests_without_rewards = Condition(
    num_rows=250,
    column_values={'room_type': 'SUITE', 'has_rewards': False}
)

In [None]:
fig = get_column_plot(
    real_data=real_data,
    synthetic_data=simulated_synthetic_data,
    column_name='room_type',
    metadata=metadata
)

fig.update_layout(
    title='Using synthetic data to simulate room_type scenario'
)

fig.show()