In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Dimensions of the simulated genotype matrix
num_ppl = 20_000        # individuals (rows)
num_snps = 100          # SNPs (columns)

# Effect architecture
num_causal_snps = 25    # SNPs that receive a linear effect
num_epi = 1             # number of epistatic interactions; not referenced in the visible cells
                        # (NOTE: generate_effects draws three SNPs for the interaction)

In [None]:
def generate_effects(num_snps, num_causal_snps, num_epi_snps=3):
    """Randomly pick the causal SNPs, their linear effects and the epistatic SNPs.

    All draws come from NumPy's global random state, so call
    ``np.random.seed`` beforehand for reproducible output. The order of the
    three draws (causal indices, effects, epistatic indices) is fixed so a
    given seed always yields the same result.

    Parameters
    ----------
    num_snps : int
        Total number of SNPs to choose indices from (``0 .. num_snps - 1``).
    num_causal_snps : int
        Number of SNPs that receive a linear effect.
    num_epi_snps : int, optional
        Number of SNPs taking part in the epistatic interaction. Defaults to
        3, the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(causal_snps_idx, effect, epi_snps_idx)`` where

        * ``causal_snps_idx`` is a sorted list of ``num_causal_snps`` distinct
          SNP indices,
        * ``effect`` is an array of length ``num_causal_snps + 1`` drawn from
          ``Normal(0, 0.5)``,
        * ``epi_snps_idx`` is a sorted list of ``num_epi_snps`` distinct SNP
          indices.

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when more SNPs are requested than
        ``num_snps`` provides.

    Notes
    -----
    The epistatic SNPs are drawn independently of the causal SNPs, so the two
    index sets may overlap.
    """
    causal_snps_idx = sorted(np.random.choice(num_snps, num_causal_snps, replace=False))

    # One extra coefficient: pheno_generation uses effect[0] as the intercept.
    effect = np.random.normal(0, 0.5, size=num_causal_snps + 1)

    epi_snps_idx = sorted(np.random.choice(num_snps, num_epi_snps, replace=False))

    return (causal_snps_idx, effect, epi_snps_idx)

In [None]:
def pheno_generation(gen, causal_snps_idx, effect, epi_snps_idx, epi_prob_matrix, alpha):
    """Simulate binary phenotypes from a mix of linear and epistatic effects.

    The probability of ``Y = 1`` for each individual is

        ``alpha * sigmoid(effect[0] + gen[:, causal] @ effect[1:])
          + (1 - alpha) * epi_prob_matrix[g_1, ..., g_k]``

    where ``g_1 .. g_k`` are that individual's genotypes at the epistatic
    SNPs. Phenotypes are then drawn from NumPy's global random state.

    Parameters
    ----------
    gen : array-like of int, shape (n_individuals, n_snps)
        Genotype matrix. Values in the epistatic columns are used directly as
        indices into ``epi_prob_matrix``.
    causal_snps_idx : sequence of int
        Column indices of the SNPs with a linear effect.
    effect : array-like, length ``len(causal_snps_idx) + 1``
        Intercept followed by one coefficient per causal SNP.
    epi_snps_idx : sequence of int
        Column indices of the SNPs in the epistatic interaction.
    epi_prob_matrix : array-like
        Probability table with one axis per epistatic SNP.
    alpha : float
        Weight of the linear probability; ``1 - alpha`` weights the
        epistatic probability.

    Returns
    -------
    tuple
        ``(Y, prob)``: the sampled 0/1 phenotypes and the probabilities they
        were sampled from, both of length ``n_individuals``.

    Raises
    ------
    ValueError
        If the number of epistatic SNPs differs from the number of axes of
        ``epi_prob_matrix``.
    """
    gen = np.asarray(gen)
    effect = np.asarray(effect)

    # Linear effect: logistic transform of the additive score.
    score = effect[0] + np.dot(gen[:, causal_snps_idx], effect[1:])
    # Stable sigmoid: exp(score) / (1 + exp(score)) turns into inf/inf = NaN
    # once exp(score) overflows; this form saturates cleanly at 0 and 1.
    lin_prob = np.exp(-np.logaddexp(0.0, -score))

    # Epistasis effect: look every individual up in the probability table in
    # a single fancy-indexing operation. The size follows ``gen`` itself
    # rather than a notebook-level global.
    epi_table = np.asarray(epi_prob_matrix)
    epi_genotypes = gen[:, list(epi_snps_idx)]
    if epi_table.ndim != epi_genotypes.shape[1]:
        raise ValueError(
            f"epi_prob_matrix has {epi_table.ndim} axes but "
            f"{epi_genotypes.shape[1]} epistatic SNPs were given"
        )
    epi_prob = epi_table[tuple(epi_genotypes.T)]

    # Mix the two effects with alpha.
    prob = alpha * lin_prob + (1 - alpha) * epi_prob

    Y = np.random.binomial(1, prob, size=prob.shape)

    return (Y, prob)

In [None]:
def create_dataset(gen, Y, name, alpha, output_dir="./datasets"):
    """Print the class balance and write features and targets to CSV files.

    Two header-less, index-less CSV files are written into ``output_dir``:

    * ``feature_{name}.csv``: the genotype matrix ``gen``
    * ``target_{name}_alpha{alpha}.csv``: the labels ``Y``

    The feature file name does not depend on ``alpha``, so calling this
    repeatedly with the same ``name`` overwrites that file each time.

    Parameters
    ----------
    gen : array-like
        Genotype matrix to store as the feature file.
    Y : array-like
        Class labels; the counts of 0s and 1s are printed.
    name : str
        Label embedded in both file names.
    alpha : float
        Mixing weight embedded in the target file name.
    output_dir : str or path-like, optional
        Destination directory, created if it does not exist.
        Defaults to ``"./datasets"``.
    """
    labels = np.asarray(Y)
    n_zero = np.count_nonzero(labels == 0)
    n_one = np.count_nonzero(labels == 1)
    print(f"y_class = 0 {n_zero} || y_class = 1 {n_one}")

    # A fresh checkout has no output directory; create it instead of failing
    # inside to_csv.
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    gen_df = pd.DataFrame(gen)
    y_class_df = pd.Series(labels)

    gen_df.to_csv(out_dir / f"feature_{name}.csv", header=False, index=False)
    y_class_df.to_csv(out_dir / f"target_{name}_alpha{alpha}.csv", header=False, index=False)

In [None]:
# Epistatic probability table, shape (3, 3, 3).
# pheno_generation indexes it as table[a][b][c], where a, b and c are an
# individual's genotype values at the three epistatic SNPs.
epi_probs = np.array([
    # first-SNP genotype 0
    [
        [0.28186155422521764, 0.5207791187331811, 0.2773927825295285],
        [0.49178390877316336, 0.06925274393903989, 0.5308461659563246],
        [0.5093544578890467, 0.06692282387853395, 0.26835675071191784],
    ],
    # first-SNP genotype 1
    [
        [0.5183650906430136, 0.017803417313929152, 0.5171418772166806],
        [0.07420167489622208, 0.9680921450761688, 0.021446583929458814],
        [0.06595582496233192, 0.9238186716301662, 0.5075679589275518],
    ],
    # first-SNP genotype 2
    [
        [0.29670500689115026, 0.4785174292128735, 0.37799105858697174],
        [0.49125471903631546, 0.10062947629458073, 0.28829934726164014],
        [0.2760927418974399, 0.4920959768446959, 0.5991589071692529],
    ],
])

In [None]:
# Label embedded in every output file name
feature_name = 'gam_100snps'

# Seed NumPy's global generator so all draws below are reproducible
np.random.seed(42)

# Genotype matrix: one row per individual, one column per SNP, with values
# 0/1/2 drawn with probabilities 0.25/0.5/0.25
gen = np.random.choice(
    [0, 1, 2],
    size=(num_ppl, num_snps),
    p=[0.25, 0.5, 0.25],
)

# Draw the causal SNPs, their linear effects and the epistatic SNPs once;
# every alpha below reuses the same architecture
causal_snps_idx, effect, epi_snps_idx = generate_effects(num_snps, num_causal_snps)

# Mixing weights 0.0, 0.1, ..., 1.0 (rounded to keep the file names clean)
alphas = [round(0.1 * step, 2) for step in range(11)]

# One phenotype file per alpha; create_dataset also rewrites the (identical)
# feature file on every pass because its name does not depend on alpha
for alpha in alphas:
    Y, prob = pheno_generation(gen, causal_snps_idx, effect, epi_snps_idx, epi_probs, alpha)
    create_dataset(gen, Y, feature_name, alpha)