In [1]:
import pandas as pd
import numpy as np
import numpy.random as nr

# Data Preparation

In [2]:
# Load the example data; the first CSV column becomes the index and is
# used as the entity identifier below.
raw = pd.read_csv("example.csv", index_col=0)

# Some column and index shuffling: keep the entity id as a regular column,
# then relabel every row as "<entity>_<time>" (e.g. "1_0").
raw["ENTITY"] = raw.index
raw.index = raw.index.map(str) + "_" + raw["TIME"].map(str)

# NOTE: this cell used to call raw.rename(columns={"TIME.1": "TIME"}) without
# keeping the returned frame, so the call never changed `raw`. It is dropped
# rather than applied: a "TIME" column already exists (it is read just above),
# and renaming a second column to "TIME" would create duplicate column labels.

# Clear the index name. `del raw.index.name` raises AttributeError on current
# pandas versions; assigning None is the supported way to remove the name.
raw.index.name = None

In [3]:
# Split the raw table into one independent frame per DATASET value.
# .copy() makes each subset own its data, so the feature columns that are
# added to these frames later are not written to a slice of `raw` (which is
# what triggers pandas' SettingWithCopyWarning).
raw1 = raw[raw.DATASET == 1].copy()
raw2 = raw[raw.DATASET == 2].copy()
raw3 = raw[raw.DATASET == 3].copy()

# Order matters: every per-dataset parameter list is matched to this list
# position by position.
datasets = [raw1, raw2, raw3]

# Reproducibility: seed NumPy's global random generator so that re-running
# the notebook from this cell onwards yields the same simulated values.
SEED = 42
nr.seed(SEED)

# All per-dataset lists below hold one entry per dataset, in the same order
# as the `datasets` list.

# normally distributed features
means1 = [10, 12, 16]
stds1 = [3, 3, 2]

means2 = [10, 10, 10]
stds2 = [1, 1, 1]

means3 = [10, 20, 30]
stds3 = [3, 3, 2]

means4 = [20, 21, 20]
stds4 = [4, 5, 4]

# chi square features (degrees of freedom)
degs = [2, 3, 4]

# categorical features
vals1 = ["a", "b", "c"]
# one probability vector over vals1 per dataset
probs1 = [[0.2, 0.7, 0.1], [0.3, 0.3, 0.4], [0.8, 0.1, 0.1]]

# sampled without explicit probabilities
vals2 = [1, 2, 3]

# batch effects: constant offset added on top of the normal draw
batches1 = [0, 0, 5]
b_means1 = [20, 20, 20]
b_stds1 = [2, 1, 1]

batches2 = [0, 0, 2]
b_means2 = [20, 20, 20]
b_stds2 = [1, 1, 1]

# longitudinal: one row per dataset, one column per time point
long_m = [[10, 12, 15, 17, 19, 22, 25, 27, 29, 33],
          [12, 12, 16, 19, 23, 19, 28, 30, 31, 33],
          [10, 12, 14, 16, 18, 20, 22, 24, 26, 28]]
long_s = [[2.1, 1.3, 1, 1.5, 1.7, 2, 2.5, 3, 3, 3.3],
          [2.2, 1.3, 1, 1.5, 1.7, 2, 2.5, 3, 3, 3.3],
          [2.1, 1.3, 1, 1.5, 1.7, 2, 2.5, 3, 3, 3.3]]

# Generate normally distributed features

In [4]:
def fill_nd_feats(feat, datasets, means, stds):
    """Write a normally distributed column named ``feat`` into each dataset.

    The i-th frame in ``datasets`` is paired with ``means[i]`` and
    ``stds[i]``; one value per row is drawn from that normal distribution and
    stored in the frame in place, replacing any existing column of the same
    name. Pairing stops at the shortest of the three sequences.
    """
    for frame, (mu, sigma) in zip(datasets, zip(means, stds)):
        n_rows = len(frame)
        frame[feat] = nr.normal(loc=mu, scale=sigma, size=n_rows)
        
# Four normal features, generated in this order: NDSIG1 and NDSIG2 use
# means that differ between datasets, NDNON1 and NDNON2 use (nearly) the
# same mean everywhere.
for nd_feat, nd_means, nd_stds in (
    ("NDSIG1", means1, stds1),
    ("NDSIG2", means3, stds3),
    ("NDNON1", means2, stds2),
    ("NDNON2", means4, stds4),
):
    fill_nd_feats(nd_feat, datasets, nd_means, nd_stds)

# Generate chi-square (Chi²) distributed features

In [5]:
def fill_chi_feats(feat, datasets, degs):
    """Write a chi-square distributed column named ``feat`` into each dataset.

    The i-th frame in ``datasets`` gets one draw per row from a chi-square
    distribution with ``degs[i]`` degrees of freedom. Frames are modified in
    place; pairing stops at the shorter of ``datasets`` and ``degs``.
    """
    pairs = zip(datasets, degs)
    for frame, dof in pairs:
        frame[feat] = nr.chisquare(df=dof, size=len(frame))

# One chi-square feature whose degrees of freedom differ per dataset.
fill_chi_feats(feat="CHIFEAT", datasets=datasets, degs=degs)

# Generate categorical features

In [6]:
def fill_cat_feats(feat, datasets, vals, probs=None):
    """Add a categorical column named ``feat`` to every dataset, in place.

    Parameters
    ----------
    feat : str
        Name of the column to create (or overwrite) in each frame.
    datasets : sequence of pandas.DataFrame
        Frames to modify.
    vals : sequence
        Categories to sample from.
    probs : sequence of sequences, optional
        One probability vector per dataset, each aligned with ``vals``.
        When omitted (``None``) or empty, every dataset samples ``vals``
        uniformly. Lists and NumPy arrays are both accepted.
    """
    # The former `if probs:` check raised "truth value of an array is
    # ambiguous" when probs was a NumPy array; test for "no probabilities
    # given" explicitly instead. Objects without a length (e.g. generators)
    # are treated as provided, as before.
    no_probs = probs is None or (hasattr(probs, "__len__") and len(probs) == 0)

    if no_probs:
        for df in datasets:
            df[feat] = nr.choice(vals, size=len(df))
    else:
        # Pairing stops at the shorter of datasets / probs.
        for df, prob in zip(datasets, probs):
            df[feat] = nr.choice(vals, p=prob, size=len(df))
        
# CATSIG1 is sampled with dataset-specific probabilities; CATNON2 is sampled
# without explicit probabilities in every dataset.
fill_cat_feats("CATSIG1", datasets, vals1, probs=probs1)
fill_cat_feats("CATNON2", datasets, vals2, probs=None)

# Generate features with batch effects

In [7]:
def fill_batch_feats(feat, datasets, means, stds, batches):
    """Write a normal column with a per-dataset batch offset into each dataset.

    For the i-th frame, one value per row is drawn from
    ``normal(means[i], stds[i])`` and the constant ``batches[i]`` is added to
    every value. Frames are modified in place; pairing stops at the shortest
    input sequence.
    """
    for frame, mu, sigma, offset in zip(datasets, means, stds, batches):
        baseline = nr.normal(mu, sigma, size=len(frame))
        # shift the whole dataset by its batch effect
        frame[feat] = baseline + offset

# Two batch features; in both, only the third dataset gets a non-zero offset.
for batch_feat, batch_means, batch_stds, batch_offsets in (
    ("BATCH1", b_means1, b_stds1, batches1),
    ("BATCH2", b_means2, b_stds2, batches2),
):
    fill_batch_feats(batch_feat, datasets, batch_means, batch_stds, batch_offsets)

# Longitudinal

In [8]:
def fill_long_feat(feat, datasets, time, means, stds):
    """Add a time-dependent normal feature to every dataset, in place.

    Parameters
    ----------
    feat : str
        Name of the column to fill.
    datasets : list of pandas.DataFrame
        Frames to modify. An empty list is a no-op.
    time : str
        Name of the column holding the time point of each row.
    means, stds : sequence of sequences
        ``means[i][k]`` / ``stds[i][k]`` parameterize the normal distribution
        used for dataset ``i`` at the k-th time point.

    Notes
    -----
    The time points are the unique values of ``time`` in the FIRST dataset,
    in order of first appearance; the k-th of them is matched with the k-th
    mean/std. Rows whose time point has no matching mean/std keep their
    existing value (NaN if the column did not exist before).
    """
    if len(datasets) == 0:
        return

    times = datasets[0][time].unique()

    for df, mean, std in zip(datasets, means, stds):
        # Create the column up front so rows without a match are NaN.
        if feat not in df.columns:
            df[feat] = float("nan")

        for t, m, s in zip(times, mean, std):
            # Select rows by position with a boolean mask. Selecting by index
            # label would pick up extra rows whenever labels are not unique.
            at_t = (df[time] == t).to_numpy()
            n_rows = int(at_t.sum())
            if n_rows == 0:
                continue
            df.loc[at_t, feat] = nr.normal(m, s, size=n_rows)
            

# One longitudinal feature driven by the TIME column.
fill_long_feat(
    feat="LONG1",
    datasets=datasets,
    time="TIME",
    means=long_m,
    stds=long_s,
)

# NAN Features

In [9]:
# Simulate a feature that is entirely missing in one dataset: fill NAN1 like
# a regular normal feature everywhere, then blank it out in the first dataset.
nan_feat = "NAN1"
fill_nd_feats(nan_feat, datasets, means1, stds1)
datasets[0][nan_feat] = np.nan

# Combine

In [10]:
# Stack the per-dataset frames row-wise into one combined table.
data = pd.concat(datasets)
# Preview the first rows only instead of rendering the whole frame.
data.head(10)

Unnamed: 0,TIME,DATASET,FEAT1,ENTITY,NDSIG1,NDSIG2,NDNON1,NDNON2,CHIFEAT,CATSIG1,CATNON2,BATCH1,BATCH2,LONG1,NAN1
1_0,0,1,658,1,10.791988,6.444779,11.258263,15.183098,5.006881,b,2,23.398382,18.946908,9.140887,
1_2,2,1,543,1,9.975609,8.586402,10.140096,25.805657,0.949981,b,3,21.389564,20.049130,9.374503,
1_4,4,1,520,1,5.600375,5.497788,11.756352,23.043734,2.085709,a,3,17.509859,20.550166,14.612981,
1_6,6,1,563,1,7.795047,13.760622,9.885872,22.001222,1.178162,b,3,20.220502,21.321849,16.578907,
1_8,8,1,389,1,14.218476,5.949870,11.766515,23.805506,4.284354,b,2,20.551883,20.537604,21.116545,
1_10,10,1,371,1,12.327310,11.155140,9.362746,20.427708,0.420687,b,1,20.622579,19.051853,25.730791,
2_0,0,1,500,2,12.289954,4.485074,11.847872,15.726820,2.008265,b,2,22.232263,19.032939,8.896366,
2_2,2,1,419,2,7.799325,10.049497,11.200782,14.796221,0.478334,c,1,20.915560,20.179123,11.812474,
2_4,4,1,431,2,8.966209,9.052807,9.613307,17.884844,7.754755,b,2,20.148358,21.432536,14.995419,
2_6,6,1,285,2,14.220876,12.193595,9.139438,15.905126,1.007549,a,3,23.371060,19.570196,16.840597,


In [11]:
# Write the combined table to disk. Reusing `data` (built in the Combine
# cell) avoids a second concat and guarantees that the saved file is exactly
# the frame previewed there.
data.to_csv("simulated.csv")