In [1]:
import pandas as pd
import numpy as np
import numpy.random as nr

# Data Preparation

In [2]:
# Load the example data; the first CSV column is used as the index.
raw = pd.read_csv("example.csv", index_col=0)

# Some column and index shuffling: keep the original index values in an
# ENTITY column, then relabel every row as "<original index>_<TIME>".
raw["ENTITY"] = raw.index
raw.index = raw.index.map(str) + "_" + raw["TIME"].map(str)

# NOTE(review): the original cell called
#     raw.rename(columns={"TIME.1": "TIME"})
# here without assigning the result, so it never changed `raw`.  It is left
# out rather than "fixed": assigning it would create a second column named
# TIME next to the existing one.  Confirm whether a TIME.1 column is expected.

# Clear the index name.  `del raw.index.name` raises AttributeError on
# pandas >= 1.0 (Index.name is a property without a deleter); assigning None
# works on every pandas version.
raw.index.name = None

In [3]:
# Split the raw table into one frame per DATASET value.
# .copy() replaces the full slice `[::]`, which does not copy anything: each
# frame now owns its data, so the columns added to it later cannot trigger
# SettingWithCopyWarning.
raw1 = raw[raw.DATASET == 1].copy()
raw2 = raw[raw.DATASET == 2].copy()
raw3 = raw[raw.DATASET == 3].copy()

# Order matters: every per-dataset parameter list below is matched to this
# list position by position.
datasets = [raw1, raw2, raw3]

# Seed the global NumPy RNG before any feature is drawn so that
# "Restart & Run All" regenerates exactly the same simulated data.
SEED = 42
nr.seed(SEED)

# Every list below holds one entry per dataset, in the order of `datasets`.

# normally distributed features (mean / standard deviation per dataset)
means1 = [10, 12, 16]
stds1 = [3, 3, 2]

means2 = [10, 10, 10]
stds2 = [1, 1, 1]

means3 = [10, 20, 30]
stds3 = [3, 3, 2]

means4 = [20, 21, 20]
stds4 = [4, 5, 4]

# chi square features (degrees of freedom per dataset)
degs = [2, 3, 4]

# categorical features
vals1 = ["a", "b", "c"]
# one probability vector over vals1 per dataset
probs1 = [[0.2, 0.7, 0.1], [0.3, 0.3, 0.4], [0.8, 0.1, 0.1]]

# sampled without explicit probabilities
vals2 = [1, 2, 3]

# batch effects: `batches*` is the constant offset added to each dataset's
# normally distributed values
batches1 = [0, 0, 5]
b_means1 = [20, 20, 20]
b_stds1 = [2, 1, 1]

batches2 = [0, 0, 2]
b_means2 = [20, 20, 20]
b_stds2 = [1, 1, 1]

# longitudinal: one row per dataset, one entry per time point
# NOTE(review): the second row drops from 23 to 19 before rising again,
# unlike the other two rows -- confirm this is intended.
long_m = [[10, 12, 15, 17, 19, 22, 25, 27, 29, 33],
          [12, 12, 16, 19, 23, 19, 28, 30, 31, 33],
          [10, 12, 14, 16, 18, 20, 22, 24, 26, 28]]
long_s = [[2.1, 1.3, 1, 1.5, 1.7, 2, 2.5, 3, 3, 3.3],
          [2.2, 1.3, 1, 1.5, 1.7, 2, 2.5, 3, 3, 3.3],
          [2.1, 1.3, 1, 1.5, 1.7, 2, 2.5, 3, 3, 3.3]]

# Generate normally distributed features

In [4]:
def fill_nd_feats(feat, datasets, means, stds):
    """Add a normally distributed column ``feat`` to each frame in ``datasets``.

    The frames, ``means`` and ``stds`` are paired position by position: the
    i-th frame receives ``len(frame)`` draws from a normal distribution with
    mean ``means[i]`` and standard deviation ``stds[i]``.  Frames are modified
    in place and nothing is returned.
    """
    for frame, mu, sigma in zip(datasets, means, stds):
        n_rows = len(frame)
        frame[feat] = nr.normal(loc=mu, scale=sigma, size=n_rows)
        
# Four normally distributed features, generated in this fixed order:
# NDSIG1/NDSIG2 use means that differ between datasets, NDNON1/NDNON2 use
# (almost) identical parameters for every dataset.
for _feat, _means, _stds in (
    ("NDSIG1", means1, stds1),
    ("NDSIG2", means3, stds3),
    ("NDNON1", means2, stds2),
    ("NDNON2", means4, stds4),
):
    fill_nd_feats(_feat, datasets, _means, _stds)

# Chi-square features

In [5]:
def fill_chi_feats(feat, datasets, degs):
    """Add a chi-square distributed column ``feat`` to each frame in ``datasets``.

    Frames and ``degs`` are paired position by position: the i-th frame
    receives ``len(frame)`` draws from a chi-square distribution with
    ``degs[i]`` degrees of freedom.  Frames are modified in place and nothing
    is returned.
    """
    for frame, dof in zip(datasets, degs):
        frame[feat] = nr.chisquare(df=dof, size=len(frame))

# One chi-square feature; the degrees of freedom differ per dataset.
fill_chi_feats(feat="CHIFEAT", datasets=datasets, degs=degs)

# Categorical features

In [6]:
def fill_cat_feats(feat, datasets, vals, probs=None):
    """Add a categorical column ``feat`` to each frame in ``datasets``.

    Parameters
    ----------
    feat : str
        Name of the column to create (or overwrite).
    datasets : iterable of DataFrame
        Frames to fill; they are modified in place.
    vals : sequence
        Category values to sample from.
    probs : sequence of probability vectors, optional
        One probability vector over ``vals`` per dataset, paired with
        ``datasets`` position by position.  If omitted (or empty), every
        value is drawn with equal probability.

    Returns
    -------
    None
    """
    # Test explicitly for "no probabilities" instead of relying on
    # truthiness: `if probs:` raises ValueError when probs is a NumPy array.
    no_probs = probs is None or (hasattr(probs, "__len__") and len(probs) == 0)

    if no_probs:
        # p=None makes nr.choice sample uniformly.
        pairs = ((df, None) for df in datasets)
    else:
        pairs = zip(datasets, probs)

    for df, prob in pairs:
        df[feat] = nr.choice(vals, p=prob, size=len(df))
        
# CATSIG1 uses dataset-specific probabilities; CATNON2 is sampled without
# explicit probabilities.
fill_cat_feats("CATSIG1", datasets, vals=vals1, probs=probs1)
fill_cat_feats("CATNON2", datasets, vals=vals2)

# Batch effects

In [7]:
def fill_nd_feats(feat, datasets, means, stds, batches=None):
    """Add a normally distributed column ``feat``, optionally shifted per dataset.

    This definition replaces the 4-argument ``fill_nd_feats`` defined earlier
    in the notebook.  ``batches`` is optional so that the earlier 4-argument
    calls still work after this cell has been executed.

    Parameters
    ----------
    feat : str
        Name of the column to create (or overwrite).
    datasets : iterable of DataFrame
        Frames to fill; they are modified in place.
    means, stds : sequence of numbers
        Mean and standard deviation per dataset, paired with ``datasets``
        position by position.
    batches : sequence of numbers, optional
        Constant offset (batch effect) added to every value of the matching
        dataset.  If omitted, no offset is added.

    Returns
    -------
    None
    """
    if batches is None:
        # No batch effect requested: plain normal draws.
        for df, mean, std in zip(datasets, means, stds):
            df[feat] = nr.normal(mean, std, size=len(df))
        return

    for df, mean, std, batch in zip(datasets, means, stds, batches):
        # draw the values, then add the batch effect
        df[feat] = nr.normal(mean, std, size=len(df)) + batch

# Two features with a batch effect: the per-dataset offsets come from
# batches1 / batches2.
for _feat, _means, _stds, _batches in (
    ("BATCH1", b_means1, b_stds1, batches1),
    ("BATCH2", b_means2, b_stds2, batches2),
):
    fill_nd_feats(_feat, datasets, _means, _stds, _batches)

# Longitudinal

In [8]:
def fill_long_feat(feat, datasets, time, means, stds):
    """Add a time-dependent, normally distributed column ``feat``.

    Parameters
    ----------
    feat : str
        Name of the column to fill.
    datasets : sequence of DataFrame
        Frames to fill; they are modified in place.
    time : str
        Name of the column holding the time point of each row.
    means, stds : sequence of sequences of numbers
        One inner sequence per dataset; its k-th entry is the mean (standard
        deviation) used for the k-th time point.

    Notes
    -----
    The time points are the unique values of ``time`` in the *first* dataset,
    in order of first appearance (not sorted).  Extra entries in ``means`` /
    ``stds`` beyond the number of time points are ignored, and rows whose
    time value does not occur in the first dataset are left unfilled (NaN).

    Returns
    -------
    None
    """
    times = datasets[0][time].unique()

    for df, mean, std in zip(datasets, means, stds):
        for t, m, s in zip(times, mean, std):
            # Select rows with a positional boolean mask rather than by index
            # label: label-based selection matches too many rows (and the
            # assignment fails) when the frame's index is not unique.
            mask = (df[time] == t).values
            df.loc[mask, feat] = nr.normal(m, s, size=int(mask.sum()))
            

# One longitudinal feature whose mean and spread depend on the TIME column.
fill_long_feat(
    feat="LONG1",
    datasets=datasets,
    time="TIME",
    means=long_m,
    stds=long_s,
)

# Combine

In [9]:
# Preview the combined table; show only the first rows instead of dumping the
# whole frame into the notebook output.
pd.concat(datasets).head(10)

Unnamed: 0,TIME,DATASET,FEAT1,ENTITY,NDSIG1,NDSIG2,NDNON1,NDNON2,CHIFEAT,CATSIG1,CATNON2,BATCH1,BATCH2,LONG1
1_0,0,1,658,1,12.441774,10.236285,9.476064,17.892295,1.294711,b,2,19.962548,19.774541,9.244095
1_2,2,1,543,1,10.250727,12.446808,12.207192,20.135977,0.516262,a,2,19.836118,21.118032,10.088124
1_4,4,1,520,1,8.022591,6.604950,10.456204,26.743414,0.231687,a,3,18.466029,18.950687,15.076548
1_6,6,1,563,1,6.720334,7.713518,10.482056,17.532401,3.003787,b,1,17.851328,19.521939,18.959528
1_8,8,1,389,1,14.582794,9.104352,10.664228,25.365963,1.069971,a,1,20.526361,20.696594,19.175477
1_10,10,1,371,1,6.734580,6.118418,9.036824,27.776500,2.288257,b,3,20.757215,19.450429,25.121916
2_0,0,1,500,2,11.009014,13.408899,10.349559,24.899667,1.763022,b,2,23.351921,20.884968,7.040759
2_2,2,1,419,2,8.072676,13.363940,11.087540,26.038448,0.752505,b,2,20.895222,21.411699,13.440874
2_4,4,1,431,2,7.865206,11.404251,10.780546,20.940662,1.665520,b,1,22.800936,18.766930,14.287249
2_6,6,1,285,2,6.795747,8.339810,11.270465,19.437340,6.200681,b,1,19.222769,20.885964,19.722769


In [10]:
# Stack the per-dataset frames and write the result to disk.
simulated = pd.concat(datasets)
simulated.to_csv("simulated.csv")