In [6]:
import itertools
import os
import sys

import numpy as np
import pandas as pd
from scipy import signal

In [47]:
def generatelinear(n,outputname,sample_seed=None):
   '''
   Generate the linear simulation data and write it to a text file.

   For every (b1, b2) pair of the 10 x 10 coefficient grid, n samples are drawn as
      L = Binomial(1, 0.5)
      A = b1*L + Normal(0, 1)
      B = b2*A + Normal(0, 1)

   Output layout: the first line is "Seed <sample_seed>"; every grid point then
   adds four lines: "b1 b2", followed by the space-separated values of L, A and B.

   n is the sample size (number of draws per grid point)
   outputname is the path of the text file to write (overwritten if it exists)
   sample_seed is the seed used to reproduce a previous file; when it is None a
   random seed is drawn. The seed that was used is always stored in the header.
   '''
   if sample_seed is None:
      sample_seed = np.random.default_rng().integers(sys.maxsize)
   # np.random.seed needs a 32-bit value, so one is derived deterministically from
   # sample_seed. This happens before the output file is opened so that an invalid
   # seed raises without truncating a file that already exists.
   legacy_seed = np.random.default_rng(sample_seed).integers(2**32 - 1)
   np.random.seed(legacy_seed)

   beta1=[0.1,0.5,1,1.5,2,2.5,3,4,5,6]
   beta2=[0.1,0.5,1,1.5,2,2.5,3,4,5,6]
   # The context manager closes the file even if sampling or writing fails.
   with open(outputname, "w") as f:
      f.write("Seed "+ str(sample_seed))
      f.write("\n")
      for b1,b2 in itertools.product(beta1,beta2):
         # The order of the random draws must not change: it defines the data
         # that a given seed reproduces.
         L=np.random.binomial(1,0.5,n)
         A=b1*L+np.random.normal(0,1,n)
         B=b2*A+np.random.normal(0,1,n)
         f.write(str(b1)+" "+ str(b2))
         f.write("\n")
         for values in (L,A,B):
            np.savetxt(f,values,newline=' ',fmt='%s')
            f.write("\n")
      
   


In [48]:
def generateind(n,outputname,sample_seed=None):
   '''
   Generate the independent simulation data and write it to a text file.

   For every (b1, b2) pair of the 10 x 10 coefficient grid, n samples are drawn as
      L = Binomial(1, 0.5)
      A = b1*L + Normal(0, 1)
      B = b2*L + Normal(0, 1)
   so A and B both depend on L but B is not computed from A.

   Output layout: the first line is "Seed <sample_seed>"; every grid point then
   adds four lines: "b1 b2", followed by the space-separated values of L, A and B.

   n is the sample size (number of draws per grid point)
   outputname is the path of the text file to write (overwritten if it exists)
   sample_seed is the seed used to reproduce a previous file; when it is None a
   random seed is drawn. The seed that was used is always stored in the header.
   '''
   if sample_seed is None:
      sample_seed = np.random.default_rng().integers(sys.maxsize)
   # np.random.seed needs a 32-bit value, so one is derived deterministically from
   # sample_seed. This happens before the output file is opened so that an invalid
   # seed raises without truncating a file that already exists.
   legacy_seed = np.random.default_rng(sample_seed).integers(2**32 - 1)
   np.random.seed(legacy_seed)

   beta1=[0.1,0.5,1,1.5,2,2.5,3,4,5,6]
   beta2=[0.1,0.5,1,1.5,2,2.5,3,4,5,6]
   # The context manager closes the file even if sampling or writing fails.
   with open(outputname, "w") as f:
      f.write("Seed "+ str(sample_seed))
      f.write("\n")
      for b1,b2 in itertools.product(beta1,beta2):
         # The order of the random draws must not change: it defines the data
         # that a given seed reproduces.
         L=np.random.binomial(1,0.5,n)
         A=b1*L+np.random.normal(0,1,n)
         B=b2*L+np.random.normal(0,1,n)
         f.write(str(b1)+" "+ str(b2))
         f.write("\n")
         for values in (L,A,B):
            np.savetxt(f,values,newline=' ',fmt='%s')
            f.write("\n")
      
   


In [49]:
def generatesine(n,outputname,sample_seed=None):
   '''
   Generate the sine simulation data and write it to a text file.

   For every (m2, s1, s2) combination of the 10 x 5 x 2 parameter grid, n samples
   are drawn as
      L = Binomial(1, 0.5)
      A = m2*L + Normal(0, 1)
      B = sin(2*pi*A/(8*s1) - pi/2) + Normal(0, s2)

   Output layout: the first line is "Seed <sample_seed>"; every grid point then
   adds four lines: "m2 s1 s2 " (with a trailing space), followed by the
   space-separated values of L, A and B.

   n is the sample size (number of draws per grid point)
   outputname is the path of the text file to write (overwritten if it exists)
   sample_seed is the seed used to reproduce a previous file; when it is None a
   random seed is drawn. The seed that was used is always stored in the header.
   '''
   if sample_seed is None:
      sample_seed = np.random.default_rng().integers(sys.maxsize)
   # np.random.seed needs a 32-bit value, so one is derived deterministically from
   # sample_seed. This happens before the output file is opened so that an invalid
   # seed raises without truncating a file that already exists.
   legacy_seed = np.random.default_rng(sample_seed).integers(2**32 - 1)
   np.random.seed(legacy_seed)

   sigma1=[0.2,0.4,0.6,0.8,1]
   mu2=[0.1,0.5,1,1.5,2,2.5,3,4,5,6]
   sigma2=[0.5,1]
   # The context manager closes the file even if sampling or writing fails.
   with open(outputname, "w") as f:
      f.write("Seed "+ str(sample_seed))
      f.write("\n")
      for m2,s1,s2 in itertools.product(mu2,sigma1,sigma2):
         # The order of the random draws must not change: it defines the data
         # that a given seed reproduces.
         L=np.random.binomial(1,0.5,n)
         A=m2*L + np.random.normal(0,1,n)
         B=np.sin(2 * np.pi * (1/(8*s1)) * (A)- (np.pi/2 ))+np.random.normal(0,s2,n)
         f.write(str(m2)+" "+ str(s1)+" "+str(s2)+" ")
         f.write("\n")
         for values in (L,A,B):
            np.savetxt(f,values,newline=' ',fmt='%s')
            f.write("\n")
      
   


In [50]:
def generatesaw(n,outputname,sample_seed=None):
   '''
   Generate the sawtooth simulation data and write it to a text file.

   For every (m2, s1, s2) combination of the 10 x 5 x 2 parameter grid, n samples
   are drawn as
      L = Binomial(1, 0.5)
      A = m2*L + Normal(0, 1)
      B = sawtooth(2*pi*A/(8*s1) - pi/2) + Normal(0, s2)
   where sawtooth is scipy.signal.sawtooth.

   Output layout: the first line is "Seed <sample_seed>"; every grid point then
   adds four lines: "m2 s1 s2 " (with a trailing space), followed by the
   space-separated values of L, A and B.

   n is the sample size (number of draws per grid point)
   outputname is the path of the text file to write (overwritten if it exists)
   sample_seed is the seed used to reproduce a previous file; when it is None a
   random seed is drawn. The seed that was used is always stored in the header.
   '''
   if sample_seed is None:
      sample_seed = np.random.default_rng().integers(sys.maxsize)
   # np.random.seed needs a 32-bit value, so one is derived deterministically from
   # sample_seed. This happens before the output file is opened so that an invalid
   # seed raises without truncating a file that already exists.
   legacy_seed = np.random.default_rng(sample_seed).integers(2**32 - 1)
   np.random.seed(legacy_seed)

   sigma1=[0.2,0.4,0.6,0.8,1]
   mu2=[0.1,0.5,1,1.5,2,2.5,3,4,5,6]
   sigma2=[0.5,1]
   # The context manager closes the file even if sampling or writing fails.
   with open(outputname, "w") as f:
      f.write("Seed "+ str(sample_seed))
      f.write("\n")
      for m2,s1,s2 in itertools.product(mu2,sigma1,sigma2):
         # The order of the random draws must not change: it defines the data
         # that a given seed reproduces.
         L=np.random.binomial(1,0.5,n)
         A=m2*L + np.random.normal(0,1,n)
         B=signal.sawtooth(2 * np.pi * (1/(8*s1)) * (A)- (np.pi/2 ))+np.random.normal(0,s2,n)
         f.write(str(m2)+" "+ str(s1)+" "+str(s2)+" ")
         f.write("\n")
         for values in (L,A,B):
            np.savetxt(f,values,newline=' ',fmt='%s')
            f.write("\n")
      
   


In [51]:
def generateparabola(n,outputname,sample_seed=None):
   '''
   Generate the parabola simulation data and write it to a text file.

   For every (b1, b2) pair of the 10 x 10 coefficient grid, n samples are drawn as
      L = Binomial(1, 0.5)
      A = b1*L + Normal(0, 1)
      B = b2*A**2 + Normal(0, 1)

   Output layout: the first line is "Seed <sample_seed>"; every grid point then
   adds four lines: "b1 b2", followed by the space-separated values of L, A and B.

   n is the sample size (number of draws per grid point)
   outputname is the path of the text file to write (overwritten if it exists)
   sample_seed is the seed used to reproduce a previous file; when it is None a
   random seed is drawn. The seed that was used is always stored in the header.
   '''
   if sample_seed is None:
      sample_seed = np.random.default_rng().integers(sys.maxsize)
   # np.random.seed needs a 32-bit value, so one is derived deterministically from
   # sample_seed. This happens before the output file is opened so that an invalid
   # seed raises without truncating a file that already exists.
   legacy_seed = np.random.default_rng(sample_seed).integers(2**32 - 1)
   np.random.seed(legacy_seed)

   beta1=[0.1,0.5,1,1.5,2,2.5,3,4,5,6]
   beta2=[0.1,0.5,1,1.5,2,2.5,3,4,5,6]
   # The context manager closes the file even if sampling or writing fails.
   with open(outputname, "w") as f:
      f.write("Seed "+ str(sample_seed))
      f.write("\n")
      for b1,b2 in itertools.product(beta1,beta2):
         # The order of the random draws must not change: it defines the data
         # that a given seed reproduces.
         L=np.random.binomial(1,0.5,n)
         A=b1*L+np.random.normal(0,1,n)
         B=b2*(A**2)+np.random.normal(0,1,n)
         f.write(str(b1)+" "+ str(b2))
         f.write("\n")
         for values in (L,A,B):
            np.savetxt(f,values,newline=' ',fmt='%s')
            f.write("\n")
      
   


In [52]:
def generateparabolavar(n,outputname,sample_seed=None):
   '''
   Generate the parabola data with unequal variance and write it to a text file.

   For every (b1, b2, a) combination of the 5 x 5 x 4 parameter grid, n samples
   are drawn as
      L = Binomial(1, 0.5)
      A = a*L + L*Normal(0, b1) + (1-L)*Normal(0, b2)
      B = A**2 + Normal(0, 1)
   so the noise of A has standard deviation b1 where L is 1 and b2 where L is 0.

   Output layout: the first line is "Seed <sample_seed>"; every grid point then
   adds four lines: "b1 b2 a", followed by the space-separated values of L, A
   and B.

   n is the sample size (number of draws per grid point)
   outputname is the path of the text file to write (overwritten if it exists)
   sample_seed is the seed used to reproduce a previous file; when it is None a
   random seed is drawn. The seed that was used is always stored in the header.
   '''
   if sample_seed is None:
      sample_seed = np.random.default_rng().integers(sys.maxsize)
   # np.random.seed needs a 32-bit value, so one is derived deterministically from
   # sample_seed. This happens before the output file is opened so that an invalid
   # seed raises without truncating a file that already exists.
   legacy_seed = np.random.default_rng(sample_seed).integers(2**32 - 1)
   np.random.seed(legacy_seed)

   beta1=[1,2,3,4,5]
   beta2=[1,2,3,4,5]
   amps=[1,2,3,4]
   # The context manager closes the file even if sampling or writing fails.
   with open(outputname, "w") as f:
      f.write("Seed "+ str(sample_seed))
      f.write("\n")
      for b1,b2,a in itertools.product(beta1,beta2,amps):
         # The order of the random draws must not change: it defines the data
         # that a given seed reproduces.
         L=np.random.binomial(1,0.5,n)
         A=a*L+ L*np.random.normal(0,b1,n)+(1-L)*np.random.normal(0,b2,n)
         B=(A**2)+np.random.normal(0,1,n) # Parabola unequal variance
         f.write(str(b1)+" "+ str(b2)+" " + str(a))
         f.write("\n")
         for values in (L,A,B):
            np.savetxt(f,values,newline=' ',fmt='%s')
            f.write("\n")
      
   


In [53]:
def generatelinearvar(n,outputname,sample_seed=None):
   '''
   Generate the linear data with unequal variance and write it to a text file.

   For every (b1, b2, a) combination of the 5 x 5 x 4 parameter grid, n samples
   are drawn as
      L = Binomial(1, 0.5)
      A = a*L + L*Normal(0, b1) + (1-L)*Normal(0, b2)
      B = A + Normal(0, 1)
   so the noise of A has standard deviation b1 where L is 1 and b2 where L is 0.

   Output layout: the first line is "Seed <sample_seed>"; every grid point then
   adds four lines: "b1 b2 a", followed by the space-separated values of L, A
   and B.

   n is the sample size (number of draws per grid point)
   outputname is the path of the text file to write (overwritten if it exists)
   sample_seed is the seed used to reproduce a previous file; when it is None a
   random seed is drawn. The seed that was used is always stored in the header.
   '''
   if sample_seed is None:
      sample_seed = np.random.default_rng().integers(sys.maxsize)
   # np.random.seed needs a 32-bit value, so one is derived deterministically from
   # sample_seed. This happens before the output file is opened so that an invalid
   # seed raises without truncating a file that already exists.
   legacy_seed = np.random.default_rng(sample_seed).integers(2**32 - 1)
   np.random.seed(legacy_seed)

   beta1=[1,2,3,4,5]
   beta2=[1,2,3,4,5]
   amps=[1,2,3,4]
   # The context manager closes the file even if sampling or writing fails.
   with open(outputname, "w") as f:
      f.write("Seed "+ str(sample_seed))
      f.write("\n")
      for b1,b2,a in itertools.product(beta1,beta2,amps):
         # The order of the random draws must not change: it defines the data
         # that a given seed reproduces.
         L=np.random.binomial(1,0.5,n)
         A=a*L+ L*np.random.normal(0,b1,n)+(1-L)*np.random.normal(0,b2,n)
         B=(A)+np.random.normal(0,1,n) # Linear unequal variance
         f.write(str(b1)+" "+ str(b2)+" " + str(a))
         f.write("\n")
         for values in (L,A,B):
            np.savetxt(f,values,newline=' ',fmt='%s')
            f.write("\n")
      
   


In [29]:
generatesaw(1000,outputname="dummycheck1.txt") # generate the sawtooth dataset with a sample size of 1000 using a freshly drawn seed; pass the sample_seed argument to reproduce an earlier result

In [30]:
# Same generator, but with an explicit sample_seed so the written file is reproducible.
generatesaw(1000,outputname="dummycheck2.txt",sample_seed=4000370289086976611)

In [54]:
# function to extract seeds from the data that is already generated, and use it to reproduce the same data files 
def extractseed_simdata(filename):
    '''
    Return the seed stored in the first line of a generated data file.

    The first line is split on whitespace and its second token is converted
    to int, which matches the "Seed <value>" header written by the generate*
    functions.

    filename is the path of the data file to read.
    Raises IndexError if the first line has fewer than two tokens and
    ValueError if the second token is not an integer.
    '''
    # Only the header line is needed; the context manager closes the file
    # even if reading fails.
    with open(filename, "r") as fo:
        header = fo.readline()
    return int(header.split()[1])

In [9]:

# The simulated data files start with a "Seed ..." line, which is skipped here.
# That line is only skipped when `data` contains neither 'yeast' nor 'human',
# so for those data sets reading starts at the first line.
    if(data.find('yeast')==-1 and data.find('human')==-1): #if yeast is not present
        line=fo.readline() 

Unnamed: 0,Seed 3586939475498222224
0,0.1 0.1
1,1 1 0 1 1 0 0 1 0 1 1 1 0 1 0 0 0 0 1 0 1 1 1 ...
2,0.8144257468760792 1.0701011818883808 1.642291...
3,0.49996757562623745 0.862709080594079 0.432483...
4,0.1 0.5
...,...
395,4.125626323380968 -1.2440952190175767 1.558612...
396,6 6
397,1 1 0 0 1 1 1 0 1 0 1 0 0 1 1 0 0 1 1 1 1 0 0 ...
398,4.050248868095945 6.199980712348974 1.15931310...


Reproduction of all the simulated data

In [67]:
# output_dir is the directory the regenerated data is written to.
# input_dir is the directory holding the existing generated data, whose stored
# seeds are reused.
# NOTE: if both directories are the same, the existing data is replaced.
output_dir='./simulation_final/data/test/'
input_dir='./simulation_final/data/'
# open(..., "w") does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

sample_sizes=(1000,500,300)

# Linear / independent / sine / sawtooth data: all four models for each sample size.
for n in sample_sizes:
    for generator,stem in ((generatelinear,'Linear'),(generateind,'Indp'),(generatesine,'Sine'),(generatesaw,'Saw')):
        fname=stem+str(n)+'.txt'
        generator(n,outputname=output_dir+fname,sample_seed=extractseed_simdata(input_dir+fname))

# Parabola and unequal-variance data: every sample size for each model.
for generator,stem in ((generateparabola,'Para'),(generateparabolavar,'Paravar'),(generatelinearvar,'Linearvar')):
    for n in sample_sizes:
        fname=stem+str(n)+'.txt'
        generator(n,outputname=output_dir+fname,sample_seed=extractseed_simdata(input_dir+fname))


In [68]:
output_dir='./simulation_final/data/test/10run/'
input_dir='./simulation_final/data/10rundata/'
# open(..., "w") does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
# Regenerate the ten repeated runs (run1 ... run10) of the sine data with
# sample size 500, reusing the seed stored in each existing file.
for run in range(1,11):
    fname='Sine500run'+str(run)+'.txt'
    generatesine(500,outputname=output_dir+fname,sample_seed=extractseed_simdata(input_dir+fname))

In [69]:
output_dir='./simulation_final/data/test/10run/'
input_dir='./simulation_final/data/10rundata/'
# open(..., "w") does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
# Regenerate the ten repeated runs (run1 ... run10) of the sawtooth data with
# sample size 500, reusing the seed stored in each existing file.
for run in range(1,11):
    fname='Saw500run'+str(run)+'.txt'
    generatesaw(500,outputname=output_dir+fname,sample_seed=extractseed_simdata(input_dir+fname))

In [70]:
output_dir='./simulation_final/data/test/10run/'
input_dir='./simulation_final/data/10rundata/'
# open(..., "w") does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
# Regenerate the ten repeated runs (run1 ... run10) of the linear data with
# sample size 500, reusing the seed stored in each existing file.
for run in range(1,11):
    fname='Linear500run'+str(run)+'.txt'
    generatelinear(500,outputname=output_dir+fname,sample_seed=extractseed_simdata(input_dir+fname))

In [71]:
output_dir='./simulation_final/data/test/10run/'
input_dir='./simulation_final/data/10rundata/'
# open(..., "w") does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)
# Regenerate the ten repeated runs (run1 ... run10) of the independent data with
# sample size 500, reusing the seed stored in each existing file.
for run in range(1,11):
    fname='Indp500run'+str(run)+'.txt'
    generateind(500,outputname=output_dir+fname,sample_seed=extractseed_simdata(input_dir+fname))