<a href="https://colab.research.google.com/github/Dowell-Lab/psea/blob/main/notebook_examples/simulateddata-bothdirs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
import pandas as pd
import numpy as np
import numpy.random as random
from datetime import datetime
import random
import plotly.express as px
from concurrent.futures import ProcessPoolExecutor


In [50]:
ongoogle=0
if ongoogle==1:
    from google.colab import drive
    outdir="/content/drive/MyDrive/temp/"
    drive.mount('/content/drive')
    parallel=False
else:
    outdir="/Users/allenma/temp/"
    parallel=True

In [51]:
now = datetime.now()
date_time_string = now.strftime("%Y%m%d%H%M%S")
print(date_time_string)


20241011155516


# Look at the real data

In [52]:
#This brings in the normalized counts for all the individuals with Trisomy 21 in the Human Trisome Project. These are not the real count data but are similar to reall count data.
#df=pd.read_csv('/content/drive/MyDrive/normcounts.csv')
gene_exp_url="https://raw.githubusercontent.com/Dowell-Lab/psea/refs/heads/main/testdata/value_expression.csv"
gene_exp_df=pd.read_csv(gene_exp_url, index_col=0)

#this brings in the medical disorders for all the individuals with Trisomy 21 in the Human Trisome Project
comorbid_url = "https://raw.githubusercontent.com/Dowell-Lab/psea/refs/heads/main/testdata/comorbid_file.csv"
comorbid_df = pd.read_csv(comorbid_url, index_col=0)

#this brings in random names to use
namefile = "https://raw.githubusercontent.com/Dowell-Lab/psea/refs/heads/main/testdata/namelist.txt"
namesdf = pd.read_csv(namefile, index_col=0, names=["name"])



In [53]:
samplename="Patient" # What is the sample column name in your data. In mine, it is patient.

## How many samples does the real data have

In [54]:
total_samples = gene_exp_df.shape[0]


## Plot the real gene expression data

Collect the metadata about the values and binaray attributes. Remove values where the mean is 0. Remove bianarys where the samples with the attribute are all samples or none.

In [55]:
def collect_gene_metadata(gene_exp_df):
  genenames = [colname for colname in gene_exp_df.columns if colname!=samplename]
  gene_exp_df_metadata = gene_exp_df[genenames]
  gene_exp_df_metadata = gene_exp_df_metadata.T
  gene_exp_df_metadata["mean"] = gene_exp_df_metadata.mean(axis=1)
  gene_exp_df_metadata["std"] = gene_exp_df_metadata.std(axis=1)
  gene_exp_df_metadata = gene_exp_df_metadata[["mean", "std"]]
  gene_exp_df_metadata = gene_exp_df_metadata[gene_exp_df_metadata["mean"]!=0]
  gene_exp_df_metadata["log_mean"] = np.log(gene_exp_df_metadata["mean"])
  gene_exp_df_metadata["log_std"] = np.log(gene_exp_df_metadata["std"])
  return gene_exp_df_metadata


def collect_comorbid_metadata(comorbid_df):
  comorbidnames = [colname for colname in comorbid_df.columns if colname!=samplename]
  comorbid_df_metadata = comorbid_df[comorbidnames].T
  comorbid_df_metadata["samples_with_binary_attribute"] = comorbid_df_metadata.sum(axis=1)
  comorbid_df_metadata = comorbid_df_metadata[["samples_with_binary_attribute"]]
  return comorbid_df_metadata

def removeallornone_bianarys(comorbid_df_metadata):
  comorbid_df_metadata = comorbid_df_metadata[comorbid_df_metadata["samples_with_binary_attribute"]!=total_samples]
  comorbid_df_metadata = comorbid_df_metadata[comorbid_df_metadata["samples_with_binary_attribute"]!=0]
  return comorbid_df_metadata


In [56]:
gene_exp_df_metadata = collect_gene_metadata(gene_exp_df)

In [57]:
fig = px.scatter(gene_exp_df_metadata, y="log_mean", x="log_std")
fig.show()


## Plot the read comorbidity data

In [58]:
comorbid_df_metadata = collect_comorbid_metadata(comorbid_df)

In [59]:
fig = px.violin(comorbid_df_metadata, y="samples_with_binary_attribute")
fig.show()

In [60]:
comorbid_df_metadata = removeallornone_bianarys(comorbid_df_metadata)
fig = px.violin(comorbid_df_metadata, y="samples_with_binary_attribute", box=True)
fig.show()

# Code that makes the simulated data

## Pick names for simulated samplesnames

In [61]:
# Select a random sample of names from namesdf
random_names = namesdf.sample(n=total_samples, replace=False)
random_names["name"] = random_names.index


### How many genes do you want to simulate

In [62]:
simulate_n_genes = 10

## First simulate the simulated values on the real values in the valuedf (gene expression based on real genes)

In [63]:
def variableexp(total_samples, mean_exp, std_exp):
  arr = np.random.normal(mean_exp, std_exp, total_samples)
  return arr

def generate_gene_exp(simulated_gene_exp_df):
  simulated_gene_exp_df["exp_array"] =  simulated_gene_exp_df.apply(lambda row: variableexp(total_samples, row["mean"], row["std"]), axis=1)
  return simulated_gene_exp_df

def simulate_values_based_on_real_genes(gene_exp_df_metadata, simulate_n_genes, total_samples):
  # Cut the data from the 'mean' column into simulate_n_genes bins with equal number of rows
  gene_exp_df_metadata['exp_group'] = pd.qcut(gene_exp_df_metadata['mean'], q=simulate_n_genes, labels=False)

  # Create an empty DataFrame to store the randomly selected rows
  simulated_gene_exp_df = pd.DataFrame()

  # Iterate through each unique group in 'exp_group'
  for group in gene_exp_df_metadata['exp_group'].unique():
    # Select rows belonging to the current group
    rows_in_group = gene_exp_df_metadata[gene_exp_df_metadata['exp_group'] == group]
    # Randomly select one row from the group
    if not rows_in_group.empty:
      random_row = rows_in_group.sample(n=1)
      # Append the randomly selected row to the DataFrame
      simulated_gene_exp_df = pd.concat([simulated_gene_exp_df, random_row])
  simulated_gene_exp_df = generate_gene_exp(simulated_gene_exp_df)
  simulated_gene_exp_df["sim_gene_name"] = "simulated_based_on_"+simulated_gene_exp_df.index
  repeat_random_names = [random_names["name"].to_list() for i in range(simulate_n_genes)]
  simulated_gene_exp_df["names"] = repeat_random_names
  simulated_gene_exp_df = simulated_gene_exp_df[["names", "exp_array", "sim_gene_name"]].copy()
  simulated_gene_exp_df_long = simulated_gene_exp_df.explode(["names", "exp_array"])
  simulated_gene_exp_df_long['exp_array'] = simulated_gene_exp_df_long['exp_array'].clip(lower=0)   
  final_simulated_value_df = simulated_gene_exp_df_long.pivot(index='names', columns='sim_gene_name', values='exp_array')
  final_simulated_value_df.index.name = None
  return final_simulated_value_df



In [64]:
simulated_value_df = simulate_values_based_on_real_genes(gene_exp_df_metadata, simulate_n_genes, total_samples)

In [65]:
simulated_value_df_Patient_column = simulated_value_df.copy()
simulated_value_df_Patient_column[samplename] = simulated_value_df.index
simulated_value_df_Patient_column = simulated_value_df_Patient_column.reset_index(drop=True)


In [66]:
simulated_value_df_Patient_column.to_csv(outdir+"simulated_gene_exp_"+date_time_string+".csv")

## Simulate the comorbids

In [67]:
def make_binary_list(total_samples, samples_true_uniform):
  binary_list = [0] * total_samples
  for _ in range(samples_true_uniform):
    random_index = random.randint(0, total_samples - 1)
    while binary_list[random_index] == 1:
      random_index = random.randint(0, total_samples - 1)
    binary_list[random_index] = 1
  return binary_list

def find_nearest_index(array, value):
    array = np.asarray(array)
    
    # Ignore NaN values
    valid_indices = ~np.isnan(array)
    filtered_array = array[valid_indices]
    
    if filtered_array.size == 0:
        raise ValueError("Input array has no valid numbers.")
    
    idx = (np.abs(filtered_array - value)).argmin()
    return np.where(valid_indices)[0][idx]

def add_bias_one_set_values(current_binary_list, simulated_value_df, genename, percent_binary_attributes_thatarevaluebias, Zscore_valuebais, Zscore_valuebais_sigma, top_or_bottom, samples_true):
  gene_vals = sorted(simulated_value_df[genename].to_list())
  gene_mean = simulated_value_df[genename].mean()
  gene_std = simulated_value_df[genename].std()
  gene_target_sigma = Zscore_valuebais_sigma*gene_std
  if top_or_bottom=="top":
    gene_target = gene_mean+Zscore_valuebais*gene_std
  else:
    gene_target = gene_mean-Zscore_valuebais*gene_std
  counttries=0
  while sum(current_binary_list)<samples_true:
    random_number = np.random.normal(loc=gene_target, scale=gene_target_sigma)
    if random_number>0:
        leftover_gene_vals = [gv if current_binary_list[i] == 0 else np.nan for i, gv in enumerate(gene_vals)]
        index_nearest_to_random_number = find_nearest_index(leftover_gene_vals, random_number)
        if current_binary_list[index_nearest_to_random_number]!=1:
          current_binary_list[index_nearest_to_random_number] = 1
        else:
            counttries = counttries+1
            if counttries>10:
                gene_target_sigma = gene_target_sigma*2
                counttries=0
                print("widening sigma"+str(samples_true)+"sum_total"+str(sum(current_binary_list))+"_"+str(gene_mean)+"_"+str(gene_target_sigma)+"_"+str(percent_binary_attributes_thatarevaluebias)+"_"+str(Zscore_valuebais)+"_"+str(top_or_bottom))
                
  return current_binary_list, gene_target_sigma


def create_simulated_n_biarary_att_from_realdata(comorbid_df_metadata, nlevels_binaryatts=12):
  nbinaryatt_min = comorbid_df_metadata["samples_with_binary_attribute"].min()
  nbinaryatt_max =comorbid_df_metadata["samples_with_binary_attribute"].max()
  nbinaryatt_med = comorbid_df_metadata["samples_with_binary_attribute"].median()
  nbinaryatt_std = comorbid_df_metadata["samples_with_binary_attribute"].std()
  simulated_n_bianary_attributes = []
  simulated_n_bianary_attributes.append(nbinaryatt_min)
  simulated_n_bianary_attributes.append(nbinaryatt_max)
  while len(simulated_n_bianary_attributes)<nlevels_binaryatts:
    random_number = int(np.random.normal(loc=nbinaryatt_med, scale=nbinaryatt_std))
    if random_number>0:
      simulated_n_bianary_attributes.append(random_number)
      simulated_n_bianary_attributes = sorted(list(set(simulated_n_bianary_attributes)))
  return simulated_n_bianary_attributes


def create_many_ba_serial(simulated_value_df, percent_binary_attributes_thatarevaluebias_list, Zscore_valuebais_list, Zscore_valuebais_sigma_list, simulated_n_bianary_attributes, biasdirs):
  df_list =[]
  genenames = [gn for gn in simulated_value_df.columns if gn!=samplename]
  for percent_binary_attributes_thatarevaluebias in percent_binary_attributes_thatarevaluebias_list:
    for Zscore_valuebais in Zscore_valuebais_list:
      for Zscore_valuebais_sigma in Zscore_valuebais_sigma_list:
        for samples_true in simulated_n_bianary_attributes:
          for top_or_bottom in biasdirs:
            for genename in genenames:
                bias_name, bl = generate_bias_data(simulated_value_df, genename, percent_binary_attributes_thatarevaluebias, Zscore_valuebais, Zscore_valuebais_sigma, top_or_bottom, samples_true)
                ba_df = pd.DataFrame(index=simulated_value_df.index)
                ba_df[bias_name] = bl
                df_list.append(ba_df)
  sim_ba_df = pd.concat(df_list, axis=1)
  sim_ba_df.index=simulated_value_df.index
  return ba_df

# Define the innermost logic in a separate function
def generate_bias_data(simulated_value_df, genename, percent_binary_attributes_thatarevaluebias, Zscore_valuebais, Zscore_valuebais_sigma, top_or_bottom, samples_true):
    percent_binary_attributes_not_bias = 1 - percent_binary_attributes_thatarevaluebias
    samples_true_uniform = int(round(samples_true * percent_binary_attributes_not_bias, 0))
    samples_true_bias = samples_true - samples_true_uniform
    bl = make_binary_list(total_samples, samples_true_uniform)
    bl, gene_target_sigma = add_bias_one_set_values(bl, simulated_value_df, genename, percent_binary_attributes_thatarevaluebias, Zscore_valuebais, Zscore_valuebais_sigma, top_or_bottom, samples_true)
    bias_name = f"{genename}_Truesamplesize{samples_true}_biassamplesize{samples_true_bias}_Zscorevaluebais{Zscore_valuebais}_sigma{Zscore_valuebais_sigma}_direction_{top_or_bottom}_pba{percent_binary_attributes_thatarevaluebias}"
    return bias_name, bl

def create_many_ba_parallel(simulated_value_df, percent_binary_attributes_thatarevaluebias_list, Zscore_valuebais_list, Zscore_valuebais_sigma_list, simulated_n_bianary_attributes, biasdirs):
    genenames = [gn for gn in simulated_value_df.columns if gn!=samplename]
    df_list = []
    num_processes = 64 
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        futures = []
        for percent_binary_attributes_thatarevaluebias in percent_binary_attributes_thatarevaluebias_list:
            for Zscore_valuebais in Zscore_valuebais_list:
                for Zscore_valuebais_sigma in Zscore_valuebais_sigma_list:
                    for samples_true in simulated_n_bianary_attributes:
                        for top_or_bottom in biasdirs:
                            for genename in genenames:
                                futures.append(
                                    executor.submit(generate_bias_data, simulated_value_df, genename, percent_binary_attributes_thatarevaluebias, Zscore_valuebais, Zscore_valuebais_sigma, top_or_bottom, samples_true)
                                )

        # Collect results
        for future in futures:
            bias_name, bl = future.result()
            ba_df = pd.DataFrame(index=simulated_value_df.index)
            ba_df[bias_name] = bl
            df_list.append(ba_df)
    sim_ba_df = pd.concat(df_list, axis=1)
    sim_ba_df.index=simulated_value_df.index
    return sim_ba_df

def creat_many_ba(simulated_value_df, percent_binary_attributes_thatarevaluebias_list, Zscore_valuebais_list, Zscore_valuebais_sigma_list, simulated_n_bianary_attributes, biasdirs):
    if parallel==True:
        sim_ba_df = create_many_ba_parallel(simulated_value_df, percent_binary_attributes_thatarevaluebias_list, Zscore_valuebais_list, Zscore_valuebais_sigma_list, simulated_n_bianary_attributes, biasdirs)
    else:
        sim_ba_df = create_many_ba_serial(simulated_value_df, percent_binary_attributes_thatarevaluebias_list, Zscore_valuebais_list, Zscore_valuebais_sigma_list, simulated_n_bianary_attributes, biasdirs)
    return sim_ba_df


How many samples do you want to have the binary attribute? simulated_n_bianary_attributes = [20,40] means that the program will create some binary attributes that are true in 20 samples and some true in 40 samples.


In [68]:
#simulated_n_bianary_attributes = [20,40]
# I base my simulated_n_bianary_attributes on the real distrubution of the number of people with the comorbiditys
simulated_n_bianary_attributes = create_simulated_n_biarary_att_from_realdata(comorbid_df_metadata, nlevels_binaryatts=10)
simulated_n_bianary_attributes

[1, 7, 13, 18, 20, 21, 27, 39, 60, 141]

Do you want to simulate that the samples that are enriched for the biarary attribute have higher values (biasdirs=["bottom"]) or lower values (biasdirs=["top"]) or both (biasdirs=["top", "bottom"]) in the value data frame.

In [69]:
#biasdirs=["top"]
#biasdirs=["bottom"]
biasdirs=["top"]

At what Zscore for the gene should the bias be at? What zscore-sigma should the bias have? Zscore_valuebais_list=[1,1.5,2,2.5] means the program will crease some biarary attributes that are bias at a zscore of 1 and some at a zcore of 1.5 relative to the mean and std of the value.

In [70]:
Zscore_valuebais_list=[1,1.5,2,2.5,3]
#Zscore_valuebais_list=[2.5]

Zscore_valuebais_sigma_list=[0.5]


What percent of true samples do you want to be bais?

In [71]:
percent_binary_attributes_thatarevaluebias_list=range(0,100, 10)
percent_binary_attributes_thatarevaluebias_list=[i/100 for i in percent_binary_attributes_thatarevaluebias_list]
percent_binary_attributes_thatarevaluebias_list

[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

In [72]:
simulated_ba_df = creat_many_ba(simulated_value_df, percent_binary_attributes_thatarevaluebias_list, Zscore_valuebais_list, Zscore_valuebais_sigma_list, simulated_n_bianary_attributes, biasdirs)
simulated_ba_df

Unnamed: 0,simulated_based_on_ENSG00000159259_Truesamplesize1_biassamplesize0_Zscorevaluebais1_sigma0.5_direction_top_pba0.0,simulated_based_on_ENSG00000160202_Truesamplesize1_biassamplesize0_Zscorevaluebais1_sigma0.5_direction_top_pba0.0,simulated_based_on_ENSG00000160294_Truesamplesize1_biassamplesize0_Zscorevaluebais1_sigma0.5_direction_top_pba0.0,simulated_based_on_ENSG00000224388_Truesamplesize1_biassamplesize0_Zscorevaluebais1_sigma0.5_direction_top_pba0.0,simulated_based_on_ENSG00000224427_Truesamplesize1_biassamplesize0_Zscorevaluebais1_sigma0.5_direction_top_pba0.0,simulated_based_on_ENSG00000224790_Truesamplesize1_biassamplesize0_Zscorevaluebais1_sigma0.5_direction_top_pba0.0,simulated_based_on_ENSG00000232777_Truesamplesize1_biassamplesize0_Zscorevaluebais1_sigma0.5_direction_top_pba0.0,simulated_based_on_ENSG00000237604_Truesamplesize1_biassamplesize0_Zscorevaluebais1_sigma0.5_direction_top_pba0.0,simulated_based_on_ENSG00000269950_Truesamplesize1_biassamplesize0_Zscorevaluebais1_sigma0.5_direction_top_pba0.0,simulated_based_on_ENSG00000279064_Truesamplesize1_biassamplesize0_Zscorevaluebais1_sigma0.5_direction_top_pba0.0,...,simulated_based_on_ENSG00000159259_Truesamplesize141_biassamplesize127_Zscorevaluebais3_sigma0.5_direction_top_pba0.9,simulated_based_on_ENSG00000160202_Truesamplesize141_biassamplesize127_Zscorevaluebais3_sigma0.5_direction_top_pba0.9,simulated_based_on_ENSG00000160294_Truesamplesize141_biassamplesize127_Zscorevaluebais3_sigma0.5_direction_top_pba0.9,simulated_based_on_ENSG00000224388_Truesamplesize141_biassamplesize127_Zscorevaluebais3_sigma0.5_direction_top_pba0.9,simulated_based_on_ENSG00000224427_Truesamplesize141_biassamplesize127_Zscorevaluebais3_sigma0.5_direction_top_pba0.9,simulated_based_on_ENSG00000224790_Truesamplesize141_biassamplesize127_Zscorevaluebais3_sigma0.5_direction_top_pba0.9,simulated_based_on_ENSG00000232777_Truesamplesize141_biassamplesize127_Zscorevaluebais3_sigma0.5_direction_top_pba0.9,simulated_based_on_ENSG00000237604_Truesamplesize141_biassamplesize127_Zscorevaluebais3_sigma0.5_direction_top_pba0.9,simulated_based_on_ENSG00000269950_Truesamplesize141_biassamplesize127_Zscorevaluebais3_sigma0.5_direction_top_pba0.9,simulated_based_on_ENSG00000279064_Truesamplesize141_biassamplesize127_Zscorevaluebais3_sigma0.5_direction_top_pba0.9
Abbey,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Addie,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
Adelind,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
Aila,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
Alaine,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Willyt,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Winifred,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Winny,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1
Winonah,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,1


In [73]:
simulated_ba_df_Patient_column = simulated_ba_df.copy()
simulated_ba_df_Patient_column[samplename] = simulated_ba_df_Patient_column.index
simulated_ba_df_Patient_column = simulated_ba_df_Patient_column.reset_index(drop=True)


# Save output

In [74]:
simulated_ba_df_Patient_column.to_csv(outdir+"simulated_binary_attribute_"+date_time_string+".csv")

In [75]:
def savemetadata():
  gene_exp_df_metadata.to_csv(outdir+"gene_exp_metadata_"+date_time_string+".csv")
  comorbid_df_metadata.to_csv(outdir+"comorbid_metadata_"+date_time_string+".csv")

def savesettings(date_time_string, samplename, simulate_n_genes, simulated_n_bianary_attributes, Zscore_valuebais_list, Zscore_valuebais_sigma_list, percent_binary_attributes_thatarevaluebias_list, biasdirs):
  wf = open(outdir+"run_info_"+date_time_string+".txt", "w")
  strings_to_save = [date_time_string, samplename]
  lists_to_save = [simulate_n_genes, simulated_n_bianary_attributes, Zscore_valuebais_list, Zscore_valuebais_sigma_list, percent_binary_attributes_thatarevaluebias_list, biasdirs]
  for string_to_save in strings_to_save:
    wf.write(string_to_save+"\n")
  for list_to_save in lists_to_save:
    wf.write(str(list_to_save)+"\n")
  wf.close()

In [76]:
savesettings(date_time_string, samplename, simulate_n_genes, simulated_n_bianary_attributes, Zscore_valuebais_list, Zscore_valuebais_sigma_list, percent_binary_attributes_thatarevaluebias_list, biasdirs)
