# Synthetic Data Generator

This function generates practice data for testing programs, stats, etc. 

1. import libraries
2. describe a data schema
3. Make a dictionary
4. Use control flow to fill the dictionary with the simulated data
5. Convert the dictionary to a Pandas dataframe
6. generate missing values (NaN). Determined by the 'missing fraction parameter.
7. Save the data as a .csv file

In [4]:
# Import libraries
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta

In [6]:
# Main Function
def generate_synthetic_dataset(
    num_rows=100,
    schema=[
        ("age", "int"),
        ("height", "float"),
        ("gender", "category"),
        ("is_active", "bool"),
        ("signup_date", "date")
    ],
    missing_fraction=0.0,
    save_path= "DG_Sim_Data.csv"
):
    data = {} # empty dictionary
    
    for col_name, col_type in schema: # generate random data in a manner that depends on the data type listed in the schema
        if col_type == "int":
            data[col_name] = np.random.randint(18, 65, size=num_rows) 
        elif col_type == "float":
            data[col_name] = np.round(np.random.normal(loc=170, scale=10, size=num_rows), 2)
        elif col_type == "category":
            data[col_name] = np.random.choice(["male", "female", "other"], size=num_rows)
        elif col_type == "bool":
            data[col_name] = np.random.choice([True, False], size=num_rows)
        elif col_type == "date":
            base = datetime.today()
            data[col_name] = [(base - timedelta(days=random.randint(0, 3650))).date() for _ in range(num_rows)]
        else:
            raise ValueError(f"Unsupported column type: {col_type}")

    df = pd.DataFrame(data)
    
    # Introduce missing values
    if missing_fraction > 0:
        for col in df.columns:
            df.loc[df.sample(frac=missing_fraction).index, col] = np.nan

    # Save to CSV
    df.to_csv(save_path, index=False)
    print(f"Saved dataset to {save_path}")
    
    return df # keeps the df in memory if desired

In [7]:
# Call the function to create a dataset
df = generate_synthetic_dataset(num_rows=200, missing_fraction=0.05)
print(df.head())

Saved dataset to DG_Sim_Data.csv
    age  height  gender is_active signup_date
0  37.0     NaN  female     False  2020-04-27
1  35.0  180.86  female     False         NaN
2  63.0  176.80    male      True  2021-01-29
3  64.0  191.69   other      True  2023-01-17
4  27.0  161.93  female      True  2016-02-03


  df.loc[df.sample(frac=missing_fraction).index, col] = np.nan
