# imports

In [49]:
import random

import numpy as np
from tqdm import tqdm

# open original vcf file

In [50]:
# Template VCF whose leading lines are reused as the header of every generated VCF.
example_vcf_path = "/Users/minhtoo.lin/repos/nf-gwas/tests/input/pipeline/example.vcf"
with open(example_vcf_path, mode="r") as vcf_file:
    vcf_orig = list(vcf_file)

# Keep only the first 6 lines; a later cell writes them verbatim ahead of a
# freshly built #CHROM line.
num_header_lines = 6
header_lines_orig = vcf_orig[:num_header_lines]

# params

In [54]:
# Dataset dimensions. Earlier configurations are kept for reference:
# num_positions = 40_000
# num_individuals = 100_000
# num_positions = 1_000_000
# num_individuals = 50_000
num_positions = 100_002
num_individuals = 100_000
num_chromosomes = 21
# Original note: the 22nd chromosome is a sex chromosome that plink2 doesn't
# accept directly, which is troublesome, so only 21 chromosomes are generated.
# NOTE(review): in human data chromosomes 1-22 are autosomes and the sex
# chromosomes come after them -- confirm the actual reason for stopping at 21.

# Share of num_individuals assigned to each ethnic group.
ethnic_groups_to_ratios = {
    "chinese": 0.6,
    "malay": 0.15,
    "indian": 0.15,
    "others": 0.1
}
# Compare with a tolerance: ratios are binary floats, so an exact `== 1` check
# can fail for perfectly valid ratio sets (e.g. 0.7 + 0.2 + 0.1 != 1.0).
assert abs(sum(ethnic_groups_to_ratios.values()) - 1.0) < 1e-9, (
    f"ethnic group ratios must sum to 1, got {sum(ethnic_groups_to_ratios.values())}"
)

# Every chromosome gets the same number of variants, so the split must be exact.
assert num_positions % num_chromosomes == 0
variants_per_chromosome = num_positions // num_chromosomes
print(variants_per_chromosome)

# prev VCF (16 GB) was 100k positions x 100k individuals
# new VCF (200 GB) is 1 million positions x 50k individuals

4762


# prepare new chromosome header line and write one VCF file per ethnic group

In [59]:
# Fixed (non-sample) columns of the #CHROM header line; sample IDs are appended per group.
chrom_header_line_base = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"
# Genotype strings; one is picked at random for every (variant, sample) pair.
GT_CHOICES = np.array(["0/0", "0/1", "1/1"])

for group, ratio in ethnic_groups_to_ratios.items():
    print(f"processing ethnic group: {group} with ratio: {ratio}")

    # round() instead of int(): a float product such as 0.29 * 100 evaluates to
    # 28.999999999999996, which int() would silently truncate to 28.
    num_indivs_for_group = round(ratio * num_individuals)
    print(f"num indivs for group: {num_indivs_for_group}")

    # Sample IDs are 1-indexed: 1..num_indivs_for_group.
    sample_ids = (str(i) for i in range(1, num_indivs_for_group + 1))
    chrom_header_line_new = "\t".join([chrom_header_line_base, *sample_ids]) + "\n"

    with open(f"../data/ethnic_data_v2/{num_positions}_variants_{num_indivs_for_group}_samples_{num_chromosomes}_chromosomes_{group}_ethnicity.vcf", "w") as f:
        # write 6 header lines
        f.writelines(header_lines_orig)
        # write new chromosome header line
        f.write(chrom_header_line_new)

        for row_idx in tqdm(range(1, num_positions + 1)):  # 1-indexed, up to num_positions rows
            # Derive the chromosome from the row index instead of keeping a running
            # counter, so the counter cannot drift past num_chromosomes.
            chromosome_idx = (row_idx - 1) // variants_per_chromosome + 1
            line_base = f"{chromosome_idx}\t{row_idx}\t{row_idx}\t2\t1\t.\t.\tPR\tGT"

            # One random index into GT_CHOICES per sample.
            randints = np.random.randint(0, len(GT_CHOICES), size=num_indivs_for_group)
            genotypes = "\t".join(GT_CHOICES[randints])
            f.write(f"{line_base}\t{genotypes}\n")

            # Log only real chromosome boundaries (not after the very last row).
            if row_idx % variants_per_chromosome == 0 and row_idx < num_positions:
                print(f"incrementing chromosome index by 1, from {chromosome_idx}")

processing ethnic group: chinese with ratio: 0.6
num indivs for group: 60000


  5%|████▌                                                                                           | 4778/100002 [00:42<13:35, 116.73it/s]

incrementing chromosome index by 1, from 1


 10%|█████████▏                                                                                      | 9542/100002 [01:23<13:01, 115.77it/s]

incrementing chromosome index by 1, from 2


 14%|█████████████▌                                                                                 | 14306/100002 [02:04<12:18, 115.97it/s]

incrementing chromosome index by 1, from 3


 19%|██████████████████                                                                             | 19066/100002 [02:46<11:35, 116.39it/s]

incrementing chromosome index by 1, from 4


 24%|██████████████████████▋                                                                        | 23830/100002 [03:27<10:58, 115.74it/s]

incrementing chromosome index by 1, from 5


 29%|███████████████████████████▏                                                                   | 28594/100002 [04:08<10:11, 116.71it/s]

incrementing chromosome index by 1, from 6


 33%|████████████████████████████████                                                                | 33353/100002 [38:15<38:02, 29.19it/s]

incrementing chromosome index by 1, from 7


 38%|███████████████████████████████████▍                                                         | 38112/100002 [1:01:39<09:15, 111.31it/s]

incrementing chromosome index by 1, from 8


 43%|███████████████████████████████████████▊                                                     | 42864/100002 [2:11:00<08:48, 108.01it/s]

incrementing chromosome index by 1, from 9


 48%|████████████████████████████████████████████▎                                                | 47636/100002 [3:33:37<07:51, 111.09it/s]

incrementing chromosome index by 1, from 10


 52%|████████████████████████████████████████████████▋                                            | 52404/100002 [4:54:28<06:46, 117.11it/s]

incrementing chromosome index by 1, from 11


 57%|█████████████████████████████████████████████████████▏                                       | 57167/100002 [6:38:03<06:00, 118.78it/s]

incrementing chromosome index by 1, from 12


 62%|██████████████████████████████████████████████████████████▏                                   | 61922/100002 [8:47:11<40:38, 15.61it/s]

incrementing chromosome index by 1, from 13


 67%|██████████████████████████████████████████████████████████████                               | 66686/100002 [9:44:57<05:01, 110.59it/s]

incrementing chromosome index by 1, from 14


 71%|██████████████████████████████████████████████████████████████████▍                          | 71444/100002 [9:45:41<04:16, 111.18it/s]

incrementing chromosome index by 1, from 15


 76%|██████████████████████████████████████████████████████████████████████▊                      | 76209/100002 [9:46:22<03:29, 113.54it/s]

incrementing chromosome index by 1, from 16


 81%|███████████████████████████████████████████████████████████████████████████▎                 | 80974/100002 [9:47:04<02:46, 114.31it/s]

incrementing chromosome index by 1, from 17


 86%|███████████████████████████████████████████████████████████████████████████████▋             | 85730/100002 [9:47:48<02:11, 108.62it/s]

incrementing chromosome index by 1, from 18


 90%|████████████████████████████████████████████████████████████████████████████████████▏        | 90493/100002 [9:48:31<01:27, 108.39it/s]

incrementing chromosome index by 1, from 19


 95%|████████████████████████████████████████████████████████████████████████████████████████▌    | 95260/100002 [9:49:15<00:43, 108.41it/s]

incrementing chromosome index by 1, from 20


100%|█████████████████████████████████████████████████████████████████████████████████████████████| 100002/100002 [9:49:58<00:00,  2.82it/s]


incrementing chromosome index by 1, from 21
processing ethnic group: malay with ratio: 0.15
num indivs for group: 15000


  5%|████▋                                                                                           | 4845/100002 [00:11<03:36, 439.32it/s]

incrementing chromosome index by 1, from 1


 10%|█████████▏                                                                                      | 9573/100002 [00:22<03:29, 431.97it/s]

incrementing chromosome index by 1, from 2


 14%|█████████████▋                                                                                 | 14356/100002 [00:33<03:14, 441.02it/s]

incrementing chromosome index by 1, from 3


 19%|██████████████████▏                                                                            | 19136/100002 [00:44<03:01, 445.51it/s]

incrementing chromosome index by 1, from 4


 24%|██████████████████████▋                                                                        | 23863/100002 [00:54<02:47, 455.16it/s]

incrementing chromosome index by 1, from 5


 29%|███████████████████████████▏                                                                   | 28620/100002 [01:05<02:43, 435.51it/s]

incrementing chromosome index by 1, from 6


 33%|███████████████████████████████▋                                                               | 33393/100002 [01:16<02:34, 431.47it/s]

incrementing chromosome index by 1, from 7


 38%|████████████████████████████████████▏                                                          | 38149/100002 [01:27<02:17, 450.00it/s]

incrementing chromosome index by 1, from 8


 43%|████████████████████████████████████████▊                                                      | 42911/100002 [01:38<02:08, 444.31it/s]

incrementing chromosome index by 1, from 9


 48%|█████████████████████████████████████████████▎                                                 | 47694/100002 [01:48<01:58, 442.71it/s]

incrementing chromosome index by 1, from 10


 52%|█████████████████████████████████████████████████▊                                             | 52426/100002 [02:00<01:57, 406.01it/s]

incrementing chromosome index by 1, from 11


 57%|██████████████████████████████████████████████████████▎                                        | 57196/100002 [02:11<01:49, 392.57it/s]

incrementing chromosome index by 1, from 12


 62%|██████████████████████████████████████████████████████████▉                                    | 61975/100002 [02:22<01:28, 430.24it/s]

incrementing chromosome index by 1, from 13


 67%|███████████████████████████████████████████████████████████████▍                               | 66728/100002 [02:34<01:14, 443.86it/s]

incrementing chromosome index by 1, from 14


 71%|███████████████████████████████████████████████████████████████████▉                           | 71492/100002 [02:45<01:07, 420.11it/s]

incrementing chromosome index by 1, from 15


 76%|████████████████████████████████████████████████████████████████████████▍                      | 76262/100002 [02:57<00:57, 415.61it/s]

incrementing chromosome index by 1, from 16


 81%|████████████████████████████████████████████████████████████████████████████▉                  | 81021/100002 [03:09<00:44, 423.17it/s]

incrementing chromosome index by 1, from 17


 86%|█████████████████████████████████████████████████████████████████████████████████▍             | 85772/100002 [03:20<00:33, 430.74it/s]

incrementing chromosome index by 1, from 18


 91%|██████████████████████████████████████████████████████████████████████████████████████         | 90538/100002 [03:31<00:21, 437.78it/s]

incrementing chromosome index by 1, from 19


 95%|██████████████████████████████████████████████████████████████████████████████████████████▌    | 95320/100002 [03:42<00:10, 439.37it/s]

incrementing chromosome index by 1, from 20


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 100002/100002 [03:53<00:00, 428.72it/s]


incrementing chromosome index by 1, from 21
processing ethnic group: indian with ratio: 0.15
num indivs for group: 15000


  5%|████▌                                                                                           | 4804/100002 [00:11<03:58, 399.41it/s]

incrementing chromosome index by 1, from 1


 10%|█████████▏                                                                                      | 9580/100002 [00:24<04:05, 368.03it/s]

incrementing chromosome index by 1, from 2


 14%|█████████████▋                                                                                 | 14364/100002 [00:36<03:28, 411.17it/s]

incrementing chromosome index by 1, from 3


 19%|██████████████████▏                                                                            | 19098/100002 [00:47<03:00, 449.31it/s]

incrementing chromosome index by 1, from 4


 24%|██████████████████████▋                                                                        | 23876/100002 [00:58<02:51, 444.19it/s]

incrementing chromosome index by 1, from 5


 29%|███████████████████████████▏                                                                   | 28627/100002 [01:08<02:45, 432.51it/s]

incrementing chromosome index by 1, from 6


 33%|███████████████████████████████▋                                                               | 33409/100002 [01:19<02:31, 438.29it/s]

incrementing chromosome index by 1, from 7


 38%|████████████████████████████████████▎                                                          | 38172/100002 [01:30<02:17, 448.78it/s]

incrementing chromosome index by 1, from 8


 43%|████████████████████████████████████████▊                                                      | 42909/100002 [01:41<02:06, 451.38it/s]

incrementing chromosome index by 1, from 9


 48%|█████████████████████████████████████████████▎                                                 | 47674/100002 [01:51<01:54, 455.49it/s]

incrementing chromosome index by 1, from 10


 52%|█████████████████████████████████████████████████▊                                             | 52432/100002 [02:02<01:45, 450.36it/s]

incrementing chromosome index by 1, from 11


 57%|██████████████████████████████████████████████████████▎                                        | 57203/100002 [02:13<01:38, 434.52it/s]

incrementing chromosome index by 1, from 12


 62%|██████████████████████████████████████████████████████████▊                                    | 61956/100002 [02:24<01:24, 449.12it/s]

incrementing chromosome index by 1, from 13


 67%|███████████████████████████████████████████████████████████████▍                               | 66753/100002 [02:35<01:14, 448.99it/s]

incrementing chromosome index by 1, from 14


 71%|███████████████████████████████████████████████████████████████████▉                           | 71490/100002 [02:45<01:04, 439.73it/s]

incrementing chromosome index by 1, from 15


 76%|████████████████████████████████████████████████████████████████████████▍                      | 76247/100002 [02:56<00:52, 453.36it/s]

incrementing chromosome index by 1, from 16


 81%|████████████████████████████████████████████████████████████████████████████▉                  | 81043/100002 [03:06<00:42, 451.26it/s]

incrementing chromosome index by 1, from 17


 86%|█████████████████████████████████████████████████████████████████████████████████▍             | 85767/100002 [03:18<00:32, 442.29it/s]

incrementing chromosome index by 1, from 18


 91%|█████████████████████████████████████████████████████████████████████████████████████▉         | 90527/100002 [03:28<00:23, 409.28it/s]

incrementing chromosome index by 1, from 19


 95%|██████████████████████████████████████████████████████████████████████████████████████████▌    | 95321/100002 [03:40<00:10, 431.67it/s]

incrementing chromosome index by 1, from 20


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 100002/100002 [03:51<00:00, 432.54it/s]


incrementing chromosome index by 1, from 21
processing ethnic group: others with ratio: 0.1
num indivs for group: 10000


  5%|████▋                                                                                           | 4834/100002 [00:07<02:48, 564.75it/s]

incrementing chromosome index by 1, from 1


 10%|█████████▏                                                                                      | 9635/100002 [00:15<02:26, 618.50it/s]

incrementing chromosome index by 1, from 2


 14%|█████████████▋                                                                                 | 14396/100002 [00:23<02:20, 609.59it/s]

incrementing chromosome index by 1, from 3


 19%|██████████████████▏                                                                            | 19143/100002 [00:31<02:11, 613.65it/s]

incrementing chromosome index by 1, from 4


 24%|██████████████████████▋                                                                        | 23889/100002 [00:39<01:56, 654.42it/s]

incrementing chromosome index by 1, from 5


 29%|███████████████████████████▏                                                                   | 28656/100002 [00:47<02:05, 566.85it/s]

incrementing chromosome index by 1, from 6


 33%|███████████████████████████████▋                                                               | 33407/100002 [00:55<01:49, 605.73it/s]

incrementing chromosome index by 1, from 7


 38%|████████████████████████████████████▎                                                          | 38187/100002 [01:03<01:42, 604.14it/s]

incrementing chromosome index by 1, from 8


 43%|████████████████████████████████████████▊                                                      | 42959/100002 [01:11<01:31, 624.39it/s]

incrementing chromosome index by 1, from 9


 48%|█████████████████████████████████████████████▎                                                 | 47704/100002 [01:18<01:27, 597.96it/s]

incrementing chromosome index by 1, from 10


 52%|█████████████████████████████████████████████████▊                                             | 52466/100002 [01:26<01:12, 651.80it/s]

incrementing chromosome index by 1, from 11


 57%|██████████████████████████████████████████████████████▍                                        | 57261/100002 [01:34<01:08, 626.52it/s]

incrementing chromosome index by 1, from 12


 62%|██████████████████████████████████████████████████████████▉                                    | 62018/100002 [01:41<00:57, 660.57it/s]

incrementing chromosome index by 1, from 13


 67%|███████████████████████████████████████████████████████████████▍                               | 66773/100002 [01:48<00:49, 670.21it/s]

incrementing chromosome index by 1, from 14


 72%|███████████████████████████████████████████████████████████████████▉                           | 71506/100002 [01:55<00:42, 664.67it/s]

incrementing chromosome index by 1, from 15


 76%|████████████████████████████████████████████████████████████████████████▍                      | 76306/100002 [02:02<00:36, 655.51it/s]

incrementing chromosome index by 1, from 16


 81%|████████████████████████████████████████████████████████████████████████████▉                  | 81036/100002 [02:09<00:28, 655.82it/s]

incrementing chromosome index by 1, from 17


 86%|█████████████████████████████████████████████████████████████████████████████████▌             | 85825/100002 [02:17<00:21, 670.07it/s]

incrementing chromosome index by 1, from 18


 91%|██████████████████████████████████████████████████████████████████████████████████████         | 90567/100002 [02:24<00:14, 629.19it/s]

incrementing chromosome index by 1, from 19


 95%|██████████████████████████████████████████████████████████████████████████████████████████▌    | 95354/100002 [02:31<00:06, 670.52it/s]

incrementing chromosome index by 1, from 20


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 100002/100002 [02:38<00:00, 629.49it/s]

incrementing chromosome index by 1, from 21





# write fake phenotypes file

In [34]:
# Total number of phenotype columns and how many go into each phenotype file.
num_phenotypes_total = 100
num_phenotypes_per_file = 100

# Phenotypes are split evenly across files, so the division must be exact.
assert num_phenotypes_total % num_phenotypes_per_file == 0

# Descriptive (fake) phenotype names, one per phenotype column.
phenotypes_list = [
    "diabetes_susceptibility_TCF7L3",
    "diabetes_susceptibility_TCF7L4",
    "cancer_susceptibility_TP54",
    "facial_morphology_AXIN2",
    "cognitive_performance_NTRK1",
    "circadian_rhythm_PER2",
    "leptin_resistance_A829C",
    "blood_pressure_RS1711",
    "lipid_metabolism_LPL-94C>T",
    "amino_acid_metabolism_GATM-23G>A",
    "sugar_metabolism_HNF1A-177G>C",
    "memory_performance_BDNF",
    "attention_span_DRD4-7R",
    "intelligence_CMIP",
    "cardiovascular_risk_9p21",
    "diabetes_susceptibility_TCF7L2",
    "cancer_susceptibility_TP53",
    "sleep_duration_DEC2",
    "morning_activity_PER3",
    "evening_activity_CRY1",
    "bone_density_OPN-660T>C",
    "muscle_mass_ACTN3-R577X",
    "hair_texture_HH-116G>A",
    "earlobe_attachment_RS7794745",
    "taste_preference_TAS2R38-PAV",
    "olive_oil_sensitivity_OLE1",
    "caffeine_metabolism_CYP1A2-163C>A",
    "vitamin_d_metabolism_VDR-2228570",
    "inflammation_response_IL6",
    "allergic_susceptibility_HLA-DQ",
    "musical_aptitude_SLC6A4",
    "math_ability_FOXP2",
    "language_acquisition_CNTNAP2",
    "emotional_resilience_NPY",
    "stress_response_CRHR1",
    "immunodeficiency_IL2RG",
    "gut_microbiome_composition_FUT2",
    "thyroid_function_TPO",
    "hormone_sensitivity_ESR1",
    "age-cognitive_decline_APOE",
    "neurodegenerative_risk_PSEN1",
    "anaesthetic_response_SLC22A1",
    "alcohol_metabolism_ADH1B",
    "drug_response_CYP2D6",
    "tooth_enamel_thickness_AMELX",
    "tendon_strength_TNC",
    "joint_mobility_COL5A1",
    "eczema_susceptibility_FLG",
    "scoliosis_predisposition_LMX1B",
    "keloid_formation_TGFBR2",
    "tumor_suppressor_TP53",
    "telomere_length_TERT",
    "hemoglobin_variant_HBB-7C>T",
    "blood_clotting_F5",
    "migraine_susceptibility_MTHFR",
    "tinnitus_prevalence_16p11.2",
    "colour_blindness_OPN1LW/OPN1MW",
    "depth_perception_GNB3",
    "muscle_fiber_composition_MYH7",
    "sweat_gland_density_CFTR",
    "finger_length_ratio_HOXA13",
    "voice_pitch_AR",
    "blisters_variant_COL7A1",
    "scar_tissue_TGFB1",
    "sleep_talking_ADH1B",
    "dream_recollection_CACNA1C",
    "sensory_processing_ADRA2B",
    "hypermobility_syndrome_TNC",
    "joint_hypermobility_COL1A1",
    "osteoarthritis_GDF5",
    "varicose_veins_FGG",
    "vaccine_response_HLA-A",
    "drug_addiction_DRD2",
    "alcohol_addiction_GABRA2",
    "tobacco_addiction_CHRNA5",
    "taste_aversion_TAS2R16",
    "spicy_food_TRPV1",
    "carb_metabolism_PNPLA3",
    "high_altitude_EGLN1",
    "cold_response_UCP1",
    "heat_response_TRPV1",
    "voice_recovery_IL6",
    "neuroplasticity_BDNF",
    "sibling_similarity_ADD1",
    "empathy_OXTR",
    "common_colds_CDHR3",
    "joint_pain_TRPA1",
    "muscle_recovery_ACTN3",
    "corneal_thickness_ZNF469",
    "tear_production_AQP5",
    "nail_growth_NFKB1",
    "tattoo_fading_IRF4",
    "sweat_odor_ABCC11",
    "longevity_factors",
    "oxidative_stress_SOD2",
    "anti-aging_MTOR",
    "tissue_regeneration_PAX7",
    "radiation_susceptibility_TP53",
    "umami_taste_TAS1R1",
    "iron_supplementation_HFE"
]

assert len(phenotypes_list) == num_phenotypes_total, f"{len(phenotypes_list)} != num_phenotypes_total {num_phenotypes_total}"
# Names are meant to identify phenotype columns, so a repeated entry would be
# ambiguous; the length check alone would not catch a duplicate.
assert len(set(phenotypes_list)) == len(phenotypes_list), "phenotypes_list contains duplicate names"

# Leading columns of every phenotype file; phenotype columns follow.
header_base = "FID IID"
num_pheno_files = num_phenotypes_total // num_phenotypes_per_file

for group, ratio in ethnic_groups_to_ratios.items():
    print(f"processing ethnic group: {group} with ratio: {ratio}")

    # round() instead of int(): a float product such as 0.29 * 100 evaluates to
    # 28.999999999999996, which int() would silently truncate to 28.
    num_indivs_for_group = round(ratio * num_individuals)
    print(f"num indivs for group: {num_indivs_for_group}")

    for file_idx in range(num_pheno_files):
        # 1-indexed number of the first phenotype column in this file, derived
        # from file_idx rather than tracked in a separate running counter.
        first_pheno_idx = file_idx * num_phenotypes_per_file + 1
        # Columns use the generic names Y<n>; the descriptive names in
        # phenotypes_list are intentionally not written here.
        pheno_names = [
            f"Y{pheno_idx}"
            for pheno_idx in range(first_pheno_idx, first_pheno_idx + num_phenotypes_per_file)
        ]

        # NOTE(review): this writes to ../data/ethnic_data while the VCF and rsids
        # cells write to ../data/ethnic_data_v2 -- confirm which directory is intended.
        with open(f"../data/ethnic_data/phenotype_{num_indivs_for_group}_samples_{num_phenotypes_per_file}cols_{group}_ethnicity_{file_idx}.txt", "w") as f:
            # write header
            f.write(" ".join([header_base, *pheno_names]) + "\n")

            # draw from normal distribution: one row per individual, one column per phenotype
            random_nums = np.random.normal(size=(num_indivs_for_group, num_phenotypes_per_file))

            # write data rows
            for row_idx in tqdm(range(1, num_indivs_for_group + 1)):  # 1-indexed
                # FID and IID are both the 1-indexed row number; random_nums is 0-indexed.
                fields = [str(row_idx), str(row_idx)]
                fields.extend(f"{value}" for value in random_nums[row_idx - 1])
                f.write(" ".join(fields) + "\n")

processing ethnic group: chinese with ratio: 0.6
num indivs for group: 60000


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 60000/60000 [00:03<00:00, 15363.61it/s]


processing ethnic group: malay with ratio: 0.15
num indivs for group: 15000


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 15000/15000 [00:01<00:00, 14908.21it/s]


processing ethnic group: indians with ratio: 0.15
num indivs for group: 15000


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 15000/15000 [00:01<00:00, 14673.66it/s]


processing ethnic group: others with ratio: 0.1
num indivs for group: 10000


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 14953.21it/s]


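# AWS has a limit of 1024 chars on any param value: the generic Y1..Y100 names fit (391 chars), but the descriptive phenotype names further below are too long (2404 chars)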

In [38]:
# Comma-separated list of the generic phenotype column names Y1..Y100,
# printed together with its character count.
generic_pheno_names = ["Y%d" % idx for idx in range(1, 101)]
phenotypes_str_joined = ",".join(generic_pheno_names)
print(phenotypes_str_joined)
print(len(phenotypes_str_joined))

Y1,Y2,Y3,Y4,Y5,Y6,Y7,Y8,Y9,Y10,Y11,Y12,Y13,Y14,Y15,Y16,Y17,Y18,Y19,Y20,Y21,Y22,Y23,Y24,Y25,Y26,Y27,Y28,Y29,Y30,Y31,Y32,Y33,Y34,Y35,Y36,Y37,Y38,Y39,Y40,Y41,Y42,Y43,Y44,Y45,Y46,Y47,Y48,Y49,Y50,Y51,Y52,Y53,Y54,Y55,Y56,Y57,Y58,Y59,Y60,Y61,Y62,Y63,Y64,Y65,Y66,Y67,Y68,Y69,Y70,Y71,Y72,Y73,Y74,Y75,Y76,Y77,Y78,Y79,Y80,Y81,Y82,Y83,Y84,Y85,Y86,Y87,Y88,Y89,Y90,Y91,Y92,Y93,Y94,Y95,Y96,Y97,Y98,Y99,Y100
391


In [35]:
# Comma-separated list of the descriptive phenotype names; print its length
# first, then the string itself.
separator = ","
phenotypes_str_joined = separator.join(phenotypes_list)
joined_length = len(phenotypes_str_joined)
print(joined_length)
print(phenotypes_str_joined)

2404
diabetes_susceptibility_TCF7L3,diabetes_susceptibility_TCF7L4,cancer_susceptibility_TP54,facial_morphology_AXIN2,cognitive_performance_NTRK1,circadian_rhythm_PER2,leptin_resistance_A829C,blood_pressure_RS1711,lipid_metabolism_LPL-94C>T,amino_acid_metabolism_GATM-23G>A,sugar_metabolism_HNF1A-177G>C,memory_performance_BDNF,attention_span_DRD4-7R,intelligence_CMIP,cardiovascular_risk_9p21,diabetes_susceptibility_TCF7L2,cancer_susceptibility_TP53,sleep_duration_DEC2,morning_activity_PER3,evening_activity_CRY1,bone_density_OPN-660T>C,muscle_mass_ACTN3-R577X,hair_texture_HH-116G>A,earlobe_attachment_RS7794745,taste_preference_TAS2R38-PAV,olive_oil_sensitivity_OLE1,caffeine_metabolism_CYP1A2-163C>A,vitamin_d_metabolism_VDR-2228570,inflammation_response_IL6,allergic_susceptibility_HLA-DQ,musical_aptitude_SLC6A4,math_ability_FOXP2,language_acquisition_CNTNAP2,emotional_resilience_NPY,stress_response_CRHR1,immunodeficiency_IL2RG,gut_microbiome_composition_FUT2,thyroid_function_TPO,hormone_s

# need to generate our own .fam file too (use this to override the output of plink2 when converting .vcf --> .fam/.bed/.bim)
- need to have matching FID & IID with phenotype.txt
- need to have non-zero last column in .fam, can just generate some random numbers

In [14]:
# The value written to the last .fam column doesn't really matter, I believe.
# It just must not be -9 (missing/problematic phenotype).

for group, ratio in ethnic_groups_to_ratios.items():
    print(f"processing ethnic group: {group} with ratio: {ratio}")

    # round() instead of int(): a float product such as 0.29 * 100 evaluates to
    # 28.999999999999996, which int() would silently truncate to 28.
    num_indivs_for_group = round(ratio * num_individuals)
    print(f"num indivs for group: {num_indivs_for_group}")

    # One random value per individual for the last column.
    random_nums = np.random.normal(size=num_indivs_for_group)

    # NOTE(review): this writes to ../data/ethnic_data while the VCF and rsids
    # cells write to ../data/ethnic_data_v2 -- confirm which directory is intended.
    with open(f"../data/ethnic_data/{num_indivs_for_group}_samples_{group}_ethnicity.fam", "w") as f:
        for row_idx in tqdm(range(1, num_indivs_for_group + 1)):  # 1-indexed
            # Columns: FID and IID (both the row number, matching the phenotype
            # files), three zero columns (presumably father / mother / sex in the
            # PLINK .fam layout -- confirm), then the random last-column value.
            f.write(f"{row_idx}\t{row_idx}\t0\t0\t0\t{random_nums[row_idx - 1]}\n")

processing ethnic group: chinese with ratio: 0.6
num indivs for group: 60000


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 60000/60000 [00:00<00:00, 823270.80it/s]


processing ethnic group: malay with ratio: 0.15
num indivs for group: 15000


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 15000/15000 [00:00<00:00, 1000549.62it/s]


processing ethnic group: indians with ratio: 0.15
num indivs for group: 15000


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 15000/15000 [00:00<00:00, 953467.61it/s]


processing ethnic group: others with ratio: 0.1
num indivs for group: 10000


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 960894.39it/s]


# write rsids.tsv - here also CHROM needs to match .vcf. only need 1 for whole GWAS (doesn't depend on ethnicity)

In [61]:
chromosome_idx = 1
# Offset added to the 1-indexed row number to form the rs<N> identifier.
rsid_init = 270000
# Explicit \t escapes keep the header tab-delimited like the data rows
# and make the separators visible in the source.
header_line = "CHROM\tPOS\tRSID\tREF\tALT\tAAF\n"
with open(f"../data/ethnic_data_v2/rsids_{num_positions}_variants_{num_chromosomes}_chromosomes.tsv", "w") as f:
    f.write(header_line)

    for row_idx in tqdm(range(1, num_positions + 1)):  # 1-indexed
        # Derive the chromosome from the row index (same rule as the VCF rows:
        # variants_per_chromosome consecutive rows per chromosome) instead of
        # keeping a running counter that ends one past the last chromosome.
        chromosome_idx = (row_idx - 1) // variants_per_chromosome + 1
        f.write(f"{chromosome_idx}\t{row_idx}\trs{rsid_init + row_idx}\t2\t1\t0,1\n")

        # Log only real chromosome boundaries (not after the very last row).
        if row_idx % variants_per_chromosome == 0 and row_idx < num_positions:
            print(f"incrementing chromosome index by 1, from {chromosome_idx}")

100%|███████████████████████████████████████████████████████████████████| 100002/100002 [00:00<00:00, 1778187.17it/s]

incrementing chromosome index by 1, from 1
incrementing chromosome index by 1, from 2
incrementing chromosome index by 1, from 3
incrementing chromosome index by 1, from 4
incrementing chromosome index by 1, from 5
incrementing chromosome index by 1, from 6
incrementing chromosome index by 1, from 7
incrementing chromosome index by 1, from 8
incrementing chromosome index by 1, from 9
incrementing chromosome index by 1, from 10
incrementing chromosome index by 1, from 11
incrementing chromosome index by 1, from 12
incrementing chromosome index by 1, from 13
incrementing chromosome index by 1, from 14
incrementing chromosome index by 1, from 15
incrementing chromosome index by 1, from 16
incrementing chromosome index by 1, from 17
incrementing chromosome index by 1, from 18
incrementing chromosome index by 1, from 19
incrementing chromosome index by 1, from 20
incrementing chromosome index by 1, from 21



