In [1]:
import pandas as pd
import numpy as np
import numpy.random as nr

# Data Preparation

In [2]:
# Load the example data; the first CSV column is used as the index.
raw = pd.read_csv("example.csv", index_col=0)

# Some column and index shuffling.
# Keep the original index values as a regular column before the index is replaced.
raw["ENTITY"] = raw.index
# New row label of the form "<original index>_<TIME>".
raw.index = raw.index.map(str) + "_" + raw["TIME"].map(str)
# NOTE: a `raw.rename(columns={"TIME.1": "TIME"})` call used to sit here. Its
# result was never assigned, so it had no effect and has been dropped; "TIME" is
# already read above, so assigning the result could only have produced a
# duplicate "TIME" column.
# Clear the index name. `del raw.index.name` raises AttributeError on current
# pandas because `Index.name` is a property without a deleter; assigning None
# has the same effect and works on old and new versions alike.
raw.index.name = None

In [3]:
# Split the raw table into one frame per DATASET id.
# `.copy()` gives every subset its own data, so the feature columns added in
# the following cells are written to independent frames rather than to slices
# derived from `raw` (the former `[::]` slice did not guarantee that and
# triggers pandas' SettingWithCopyWarning on assignment).
raw1 = raw[raw.DATASET == 1].copy()
raw2 = raw[raw.DATASET == 2].copy()
raw3 = raw[raw.DATASET == 3].copy()

# Order matters: the per-dataset parameter lists are matched to this list by position.
datasets = [raw1, raw2, raw3]

# Seed NumPy's global random generator so that every "Restart & Run All"
# produces the same simulated values. All draws in this notebook go through
# `numpy.random` (imported as `nr`), so one seed here covers them all.
SEED = 42
nr.seed(SEED)

# Unless noted otherwise, every list below holds one entry per dataset,
# in the same order as `datasets`.

# normally distributed features
# means1/stds1 -> NDSIG1 (and NAN1): means differ between datasets
means1 = [10, 12, 16]
stds1 = [3, 3, 2]

# means2/stds2 -> NDNON1: identical parameters for every dataset
means2 = [10, 10, 10]
stds2 = [1, 1, 1]

# means3/stds3 -> NDSIG2: means differ between datasets
means3 = [10, 20, 30]
stds3 = [3, 3, 2]

# means4/stds4 -> NDNON2: nearly identical parameters for every dataset
means4 = [20, 21, 20]
stds4 = [4, 5, 4]

# chi square features: degrees of freedom per dataset (CHIFEAT)
degs = [2, 3, 4]

# categorical features
# CATSIG1: category values and, per dataset, one probability per value
vals1 = ["a", "b", "c"]
probs1 = [[0.2, 0.7, 0.1], [0.3, 0.3, 0.4], [0.8, 0.1, 0.1]]

# CATNON2: drawn without explicit probabilities
vals2 = [1, 2, 3]

# CATDICHO: two-valued feature, drawn without explicit probabilities
vals3 = [1, 2]

# batch effects: `batches*` is the constant added to the drawn values of a dataset
# BATCH1
batches1 = [0, 0, 5]
b_means1 = [20, 20, 20]
b_stds1 = [2, 1, 1]

# BATCH2
batches2 = [0, 0, 2]
b_means2 = [20, 20, 20]
b_stds2 = [1, 1, 1]

# longitudinal (LONG1): one row per dataset, one entry per time point
long_m = [
    [10, 12, 15, 17, 19, 22, 25, 27, 29, 33],
    [12, 12, 16, 19, 23, 19, 28, 30, 31, 33],
    [10, 12, 14, 16, 18, 20, 22, 24, 26, 28],
]
long_s = [
    [2.1, 1.3, 1, 1.5, 1.7, 2, 2.5, 3, 3, 3.3],
    [2.2, 1.3, 1, 1.5, 1.7, 2, 2.5, 3, 3, 3.3],
    [2.1, 1.3, 1, 1.5, 1.7, 2, 2.5, 3, 3, 3.3],
]

# Generate normally distributed features

In [4]:
def fill_nd_feats(feat, datasets, means, stds):
    """Write a normally distributed column named `feat` into each dataset.

    The frames in `datasets` are modified in place. `means` and `stds` are
    read in parallel with `datasets` and give the mean and standard deviation
    used for the frame at the same position; pairing stops at the shortest
    of the three sequences.
    """
    for frame, (mu, sigma) in zip(datasets, zip(means, stds)):
        n_rows = len(frame)
        # one draw per row of this frame
        frame[feat] = nr.normal(loc=mu, scale=sigma, size=n_rows)
        
# NDSIG1 / NDSIG2 use parameter sets whose means differ between datasets,
# NDNON1 / NDNON2 use (nearly) the same parameters for every dataset.
fill_nd_feats(feat="NDSIG1", datasets=datasets, means=means1, stds=stds1)
fill_nd_feats(feat="NDSIG2", datasets=datasets, means=means3, stds=stds3)
fill_nd_feats(feat="NDNON1", datasets=datasets, means=means2, stds=stds2)
fill_nd_feats(feat="NDNON2", datasets=datasets, means=means4, stds=stds4)

# Chi-square features

In [5]:
def fill_chi_feats(feat, datasets, degs):
    """Write a chi-square distributed column named `feat` into each dataset.

    The frames in `datasets` are modified in place. `degs` supplies the
    degrees of freedom for the frame at the same position; pairing stops at
    the shorter of the two sequences.
    """
    for frame, dof in zip(datasets, degs):
        n_rows = len(frame)
        frame[feat] = nr.chisquare(df=dof, size=n_rows)

# CHIFEAT: chi-square feature with a different number of degrees of freedom per dataset
fill_chi_feats(feat="CHIFEAT", datasets=datasets, degs=degs)

# Categorical features

In [6]:
def fill_cat_feats(feat, datasets, vals, probs=None):
    """Write a categorical column named `feat` into each dataset, in place.

    Parameters
    ----------
    feat : str
        Name of the column to create or overwrite.
    datasets : sequence of pandas.DataFrame
        Frames to fill; each one is modified in place.
    vals : sequence
        Category values to draw from.
    probs : sequence of sequences, optional
        One probability vector per dataset, each aligned with `vals`.
        Datasets are paired with `probs` by position and pairing stops at the
        shorter of the two. If None or empty, every dataset is drawn without
        explicit probabilities (uniformly over `vals`).
    """
    # Decide explicitly whether probabilities were supplied. A plain `if probs:`
    # raises ValueError for a NumPy array of probabilities because the truth
    # value of a multi-element array is ambiguous.
    no_probs = probs is None or (hasattr(probs, "__len__") and len(probs) == 0)

    if no_probs:
        for df in datasets:
            df[feat] = nr.choice(vals, size=len(df))
    else:
        for df, prob in zip(datasets, probs):
            df[feat] = nr.choice(vals, p=prob, size=len(df))
        
# CATSIG1 is drawn with dataset-specific probabilities; CATNON2 and CATDICHO
# are drawn without explicit probabilities.
fill_cat_feats(feat="CATSIG1", datasets=datasets, vals=vals1, probs=probs1)
fill_cat_feats(feat="CATNON2", datasets=datasets, vals=vals2)
fill_cat_feats(feat="CATDICHO", datasets=datasets, vals=vals3)

# Batch-effect features

In [7]:
def fill_batch_feats(feat, datasets, means, stds, batches):
    """Write a normally distributed column with a per-dataset offset.

    For each frame in `datasets` (modified in place) the column `feat` is
    drawn from a normal distribution with the mean and standard deviation
    found at the same position in `means` and `stds`; the matching entry of
    `batches` is then added to every drawn value. Pairing stops at the
    shortest of the four sequences.
    """
    for frame, mu, sigma, offset in zip(datasets, means, stds, batches):
        drawn = nr.normal(mu, sigma, size=len(frame))
        # shift the whole dataset by its batch effect
        frame[feat] = drawn + offset

# Both batch features share the same mean per dataset; only the third dataset
# receives a non-zero offset (5 for BATCH1, 2 for BATCH2).
fill_batch_feats(
    feat="BATCH1", datasets=datasets, means=b_means1, stds=b_stds1, batches=batches1
)
fill_batch_feats(
    feat="BATCH2", datasets=datasets, means=b_means2, stds=b_stds2, batches=batches2
)

# Longitudinal features

In [8]:
def fill_long_feat(feat, datasets, time, means, stds):
    """Write a time-dependent, normally distributed column into each dataset.

    Parameters
    ----------
    feat : str
        Name of the column to create or overwrite.
    datasets : sequence of pandas.DataFrame
        Frames to fill; each one is modified in place. An empty sequence is
        a no-op.
    time : str
        Name of the column holding the time point of each row.
    means, stds : sequence of sequences
        One inner sequence per dataset, holding one mean / standard deviation
        per time point.

    Notes
    -----
    The time points are the unique values of `time` in the FIRST dataset, in
    order of first appearance, and are paired by position with the entries of
    each inner `means` / `stds` sequence. Pairing stops at the shortest
    sequence, so surplus time points or surplus parameters are ignored. Rows
    whose time value is not among the paired time points are not written.
    """
    if len(datasets) == 0:
        return

    times = datasets[0][time].unique()

    for df, mean, std in zip(datasets, means, stds):
        for t, m, s in zip(times, mean, std):
            # Select rows by position with a boolean mask. Selecting by index
            # label would also hit every other row sharing a label with a
            # matching row, which breaks on a non-unique index.
            at_t = (df[time] == t).values
            df.loc[at_t, feat] = nr.normal(m, s, size=int(at_t.sum()))
            

# LONG1: per-dataset mean/std trajectories over the values of the TIME column
fill_long_feat(
    feat="LONG1", datasets=datasets, time="TIME", means=long_m, stds=long_s
)

# NaN features

In [9]:
# Draw NAN1 for every dataset with the same parameters as NDSIG1 ...
fill_nd_feats(feat="NAN1", datasets=datasets, means=means1, stds=stds1)
# ... then blank the column out completely in the first dataset.
datasets[0]["NAN1"] = np.nan

# Combine

In [10]:
# Stack the per-dataset frames row-wise into one table and display it.
data = pd.concat(datasets, axis=0)
data

Unnamed: 0,TIME,DATASET,FEAT1,ENTITY,NDSIG1,NDSIG2,NDNON1,NDNON2,CHIFEAT,CATSIG1,CATNON2,CATDICHO,BATCH1,BATCH2,LONG1,NAN1
1_0,0,1,658,1,11.011468,12.558571,9.667827,26.718079,4.306700,a,3,1,18.940196,20.496291,12.006220,
1_2,2,1,543,1,10.962827,9.385050,8.928369,28.082092,1.448887,b,2,2,23.084742,19.197574,11.737242,
1_4,4,1,520,1,5.848932,12.239156,9.888821,23.126320,4.488518,b,3,1,20.105039,20.142859,15.143529,
1_6,6,1,563,1,6.226329,10.424294,9.008361,19.567322,2.217571,b,2,1,18.958418,19.911081,16.285862,
1_8,8,1,389,1,7.217990,12.511185,10.477781,27.503524,3.444671,b,3,2,21.904402,18.630012,17.021575,
1_10,10,1,371,1,8.309122,7.685481,9.057778,16.862431,1.724354,b,1,2,23.961162,20.698375,20.719118,
2_0,0,1,500,2,8.866331,13.544045,10.705604,19.690834,0.376296,b,2,1,22.304584,17.759053,7.079383,
2_2,2,1,419,2,12.119411,8.065864,11.268401,22.222272,0.523106,a,3,2,21.884522,18.461503,12.132528,
2_4,4,1,431,2,12.100397,7.282974,10.780238,25.173125,0.263327,b,2,2,18.977398,22.205710,14.506498,
2_6,6,1,285,2,8.223755,13.712827,8.984823,21.493370,4.220111,a,3,2,18.858680,21.149282,16.752246,


In [11]:
# Export the combined frame built in the previous cell. Reusing `data` avoids a
# second, redundant concatenation and guarantees that the file contains exactly
# the table that was displayed above.
data.to_csv("simulated.csv")