# Generating synthetic data

In [None]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
# constants
DIM = 3  # every sample is a three-dimensional point
SAMPLES = 500  # number of points drawn for each dataset
# hex colours handed to the plots, one entry per dataset
color_list = ["#d81159", "#FF8020", "#0496ff", "#0077CC", "#005FA3", "#00477A"]

# One (scale, center) pair per dataset, listed in dataset-id order:
#   scale  -> standard deviation along each of the x, y, z axes
#   center -> offset added along each of the x, y, z axes
_scale_center_pairs = [
    ([1, 1, 1], [0, 0, 0]),
    ([1, 0.3, 1.2], [7, 0, 0]),
    ([0.3, 1.3, 0.3], [14, 0, 0]),
    ([0.4, 1.3, 0.3], [21, 0, 0]),
    ([0.5, 1.3, 0.3], [28, 0, 0]),
    ([0.6, 1.3, 0.3], [35, 0, 0]),
]

# Map the integer dataset id (0, 1, 2, ...) to that dataset's parameters.
dataset_params = {
    dataset_id: {"scale": scale, "center": center}
    for dataset_id, (scale, center) in enumerate(_scale_center_pairs)
}

In [None]:
# datasets spread along x-axis
# Builds one DataFrame of SAMPLES points per entry in dataset_params, stacks
# them into synthetic_df (columns x, y, z, dataset_id) and pickles the result.
datasets = []
for dataset_id, params in dataset_params.items():  # for each dataset
    # Draw every axis independently from a normal distribution with that axis'
    # standard deviation, then shift it by that axis' center. Because the draw
    # itself uses loc=10, the effective mean on axis j is 10 + center[j].
    # NOTE(review): the constant loc=10 offset is not part of dataset_params --
    # confirm it is intentional.
    axes = [
        np.random.normal(loc=10, scale=params["scale"][j], size=SAMPLES) + params["center"][j]
        for j in range(DIM)
    ]
    df = pd.DataFrame(axes).transpose()  # DIM rows x SAMPLES cols -> SAMPLES rows x DIM cols
    df["dataset_id"] = str(dataset_id)  # string label so the datasets can be told apart after concatenation
    datasets.append(df)

# Concatenate all of the per-dataset frames. ignore_index=True gives every row a
# unique 0..N-1 label; without it each frame's own 0..SAMPLES-1 index would be
# repeated once per dataset, making label-based lookups ambiguous.
synthetic_df = pd.concat(datasets, ignore_index=True)
synthetic_df.columns = ["x", "y", "z", "dataset_id"]  # rename dataset columns
synthetic_df.to_pickle("synthetic_data.pkl")

In [None]:
# plot
# 3-D scatter of the synthetic points, coloured by the dataset they belong to.
axis_labels = {"x": "Neuron 1", "y": "Neuron 2", "z": "Neuron 3"}
fig = px.scatter_3d(
    synthetic_df,
    x="x",
    y="y",
    z="z",
    color="dataset_id",
    color_discrete_sequence=color_list,
    labels=axis_labels,
)
fig.update_traces(marker_size=3)
# adjust plot appearance: figure size, font, aspect mode and camera position
fig.update_layout(
    width=1100,
    height=600,
    font_size=11,
    scene_aspectmode="data",
    scene_camera_eye=dict(x=0.2, y=-2.0, z=1),
)
fig.show()

In [None]:
# datasets all centered at origin
# Builds one DataFrame of SAMPLES points per entry in dataset_params, stacks
# them into synthetic_df (columns x, y, z, dataset_id) and pickles the result.
datasets = []
for dataset_id, params in dataset_params.items():  # for each dataset
    # Draw every axis independently from a zero-mean normal distribution with
    # that axis' standard deviation. The "center" entry of dataset_params is
    # deliberately not applied here, so all datasets overlap at the origin.
    axes = [
        np.random.normal(loc=0, scale=params["scale"][j], size=SAMPLES)
        for j in range(DIM)
    ]
    df = pd.DataFrame(axes).transpose()  # DIM rows x SAMPLES cols -> SAMPLES rows x DIM cols
    df["dataset_id"] = str(dataset_id)  # string label so the datasets can be told apart after concatenation
    datasets.append(df)

# Concatenate all of the per-dataset frames. ignore_index=True gives every row a
# unique 0..N-1 label; without it each frame's own 0..SAMPLES-1 index would be
# repeated once per dataset, making label-based lookups ambiguous.
synthetic_df = pd.concat(datasets, ignore_index=True)
synthetic_df.columns = ["x", "y", "z", "dataset_id"]  # rename dataset columns
synthetic_df.to_pickle("synthetic_data_centered.pkl")

In [None]:
# plot
# 3-D scatter of the origin-centred points, coloured by the dataset they belong to.
axis_labels = {"x": "Neuron 1", "y": "Neuron 2", "z": "Neuron 3"}
fig = px.scatter_3d(
    synthetic_df,
    x="x",
    y="y",
    z="z",
    color="dataset_id",
    color_discrete_sequence=color_list,
    labels=axis_labels,
)
fig.update_traces(marker_size=3)
# adjust plot appearance: figure size, font and aspect mode
fig.update_layout(
    width=800,
    height=700,
    font_size=11,
    scene_aspectmode="data",
)
fig.show()