
## Benchmark data generation

### Populations
    Latino (easy) - Chm 22?
    5-ancestry (no west asian - medium) - Chm ?
    7 ancestry (west asian - hard) - Chm 20

### Balance
    Balanced reference panel, balanced query panel
    Balanced reference panel, imbalanced query panel
    Imbalanced reference panel, imbalanced (differently) query panel

### Generations
    2,4,8,16,24,32,48,64,72,100
 
### Available SNPs:
    Full Set
    UKBio subset


In [1]:
# Strategy
# note: use the pyadmix that comes with xgmix - not the one from the standalone repo
"""

- pre-create Latino, 5anc, 7anc sample map and split into train1, train2, val, test

- dataset in [Full, UKB]:

    - experiment in [Latino, 5anc, 7anc]:
        - balance in [balance ,imbalance]:
        
            - designate an experiment here and copy all useful data
            - name: full/latino/balance2/generated_data
            - read in vcf file
            - process genetic map
            - process sample map
            - for split in [train1, train2, val, test]
                - generation in [2,4,8,16,24,32,48,64,72,100]
            
"""

'\n\n- pre-create Latino, 5anc, 7anc sample map and split into train1, train2, val, test\n\n- dataset in [Full, UKB]:\n\n    - experiment in [Latino, 5anc, 7anc]:\n        - balance in [balance ,imbalance]:\n        \n            - designate an experiment here and copy all useful data\n            - name: full/latino/balance2/generated_data\n            - read in vcf file\n            - process genetic map\n            - process sample map\n            - for split in [train1, train2, val, test]\n                - generation in [2,4,8,16,24,32,48,64,72,100]\n            \n'

In [2]:
import sys
import os
import allel
import numpy as np
import pickle
import pandas as pd

# Seed NumPy's global RNG so calls that draw from np.random are repeatable across runs.
np.random.seed(94305)
# Add a directory to the module search path.
# NOTE(review): presumably the XGMix checkout that provides the Admixture / Utils / pyadmix
# modules imported further down - confirm.
sys.path.append("/home/arvindsk/XGMix")

In [3]:
class Struct:
    """Lightweight attribute bag: each keyword argument becomes an instance attribute."""

    def __init__(self, **entries):
        # Copy the keyword arguments straight into the instance namespace.
        vars(self).update(entries)

# Chromosome label and the matching generated-data directory.
chm = "22"
data_path = "/home/wknd37/Admixture/generated_data/6_even_anc_t2/chm" + chm

# Alternative configuration (kept for reference):
# chm = "20"
# data_path = "../Admixture/generated_data/7_anc_t2/chm"+chm

# Run settings exposed as attributes (args.chm, args.window_size, ...).
args = Struct(
    genetic_map_file="/home/database/maps/rfmix/allchrs.b37.gmap",
    chm=chm,
    window_size=100,
    smooth_size=75,
    missing=0.0,
    device=None,
)

# Model and checkpoint locations are attached after construction.
args.model = "./trained_models/XGMix/missing_0/chm_" + chm + ".pkl"
args.checkpoint_path = "./trained_models/checkpoint.pkl"

In [4]:
# step 1 - get the sample maps for the experiments of interest

In [5]:
from Admixture.utils import join_paths, read_vcf, filter_map
from Admixture.Admixture import split_sample_map, read_sample_map
from Utils.utils import run_shell_cmd, get_num_outs
from pyadmix import simulate
from pyadmix import get_chm_info, get_sample_map_data, write_output

In [6]:
# Sample map covering all reference individuals; each experiment filters it down.
master_sample_map = "/home/projects/world_wide_references/rfmix_sample_map.tsv"

# Names of the founder subsets, and the ratios handed to split_sample_map together
# with one sample-map path per subset name.
set_names = ["train1", "train2", "val", "test"]
founders_ratios = [0.65, 0.2, 0.1, 0.05]

In [7]:
# Experiment name -> population labels kept when filtering the master sample map.
expt_to_pops_map = {
    "latino": ["AFR", "NAT", "EUR"],
    "five": ["AFR", "EAS", "EUR", "NAT", "SAS"],
    "seven": ["AFR", "EAS", "EUR", "NAT", "OCE", "SAS", "WAS"],
}

In [8]:
def balance_map(map_path):
    """
    Rewrite a sample map in place so that every population has the same number of samples.

    map_path: tab-separated file without header, one "sample<TAB>population" row per
    line; lines starting with '#' are ignored on read.

    Each population is randomly down-sampled (without replacement, using NumPy's global
    RNG) to the size of the smallest population, and the file at map_path is overwritten
    with the retained rows, grouped by population.
    """
    table = pd.read_csv(map_path, delimiter="\t", header=None, comment="#")
    table.columns = ['Sample', 'Population']
    rows = np.array(table)

    # Group sample ids under their population label (labels in np.unique order).
    members_by_pop = {label: [] for label in np.unique(rows[:, 1])}
    for sample_id, label in rows:
        members_by_pop[label].append(sample_id)

    # The smallest population sets the size every population is cut down to.
    target_size = min(len(members) for members in members_by_pop.values())
    kept_by_pop = {
        label: np.random.choice(members, target_size, replace=False)
        for label, members in members_by_pop.items()
    }

    with open(map_path, "w") as out:
        for label, members in kept_by_pop.items():
            for sample_id in members:
                out.write("{}\t{}\n".format(sample_id, label))

In [9]:
# Adapted from XGMix/admixture/fast_admix.py. Duplicated here because the original API does not accept sample_weights.


def main_admixture_fast(chm, root, sub_instance_names, sample_map_files, sample_map_files_idxs, reference_file, genetic_map_file,
    num_outs, generations, sample_weights = None, verbose=True):

    """
    Simulate admixed individuals for several founder subsets and write them to disk.

    Everything is written under <root>/chm<chm>/: positions.txt and references.txt
    (POS and REF of the reference variants) and simulation_output/<instance_name>/
    for each simulated subset.

    chm: chm number
    root: data path with generated_data folder
    sub_instance_names: (a list) subsets like train1, train2, val
    sample_map_files: (a list) the sample map file of each subset above
    sample_map_files_idxs: (a list) one mapping per subset, a way to make sure the
    individual sets map to the original population names; it is applied to the
    simulated ancestry labels unless it maps every key to itself
    reference_file: vcf file path, or an already loaded reference (as returned by read_vcf)
    genetic_map_file: gmap file
    num_outs: (a list) number of outputs for each generation for each set; a subset
    whose entry is <= 0 is skipped
    generations: generations to simulate for each set (a single list)
    sample_weights: (a list, optional) one entry per subset, forwarded to
    get_sample_map_data as sample_weights; None means no weights for any subset
    verbose: forwarded to join_paths and simulate
    """
    # Identity check: `== None` would broadcast element-wise on array-like inputs.
    if sample_weights is None:
        sample_weights = [None]*len(sample_map_files)

    print("Sample weights given is : ",sample_weights)

    output_path = join_paths(root, 'chm{}'.format(chm), verb=verbose)

    # path for simulation output
    simulation_output_path = join_paths(output_path, 'simulation_output')

    # Register and writing SNP physical positions
    if isinstance(reference_file, str):
        print("Reading reference file...")
        ref = read_vcf(reference_file)
    else:
        # caller already loaded the reference; reuse it instead of re-reading
        ref = reference_file
    np.savetxt(output_path +  "/positions.txt", ref['variants/POS'], delimiter='\n')
    np.savetxt(output_path + "/references.txt", ref['variants/REF'], delimiter='\n', fmt="%s")

    # Process genetic map data
    genetic_map_data = get_chm_info(genetic_map_file, ref)

    # simulate for each sub-instance
    for i, instance_name in enumerate(sub_instance_names):

        # nothing requested for this subset
        if num_outs[i] <= 0:
            continue

        # paths for each set
        instance_path = join_paths(simulation_output_path, instance_name, verb=verbose)
        # get sample map data
        sample_map_data = get_sample_map_data(sample_map_files[i], ref, sample_weights=sample_weights[i])
        # get the dataset
        dataset = simulate(ref, sample_map_data, genetic_map_data, out_root=None,
            num_samples_per_gen=num_outs[i], gens_to_ret=generations,
            random_seed=94305,verbose=verbose)

        # apply sample_map_files_idxs transform
        idx_to_pop_map = sample_map_files_idxs[i]
        if not is_same(idx_to_pop_map):
            # build the element-wise relabeller once per subset
            relabel = np.vectorize(idx_to_pop_map.get)
            for key in dataset.keys():
                # distinct loop name: the subset index `i` must not be shadowed
                for individual in dataset[key]:
                    individual.maternal["anc"] = relabel(individual.maternal["anc"])
                    individual.paternal["anc"] = relabel(individual.paternal["anc"])

        # save the data
        write_output(instance_path,dataset)

def is_same(mapper):
    """Return True when mapper sends every key to itself (also True for an empty mapper)."""
    return not any(source != target for source, target in mapper.items())

In [10]:
# generate all the master sample maps

# For every experiment: filter the master sample map with the experiment's population
# list, split the result into the founder subsets named in set_names, and store the
# index mapping returned by the split for later use.
for expt in expt_to_pops_map:

    # define the data path and sample map path within the data path
    data_path = "/home/arvindsk/xgmix_expts/benchmark_data/sample_maps/{}/".format(expt)
    # exist_ok avoids the check-then-create race and makes re-runs safe
    os.makedirs(data_path, exist_ok=True)
    map_path = data_path+expt+".map"

    filter_map(master_sample_map, expt_to_pops_map[expt], map_path)

    # split into train1, train2, val, test
    samples, pop_ids = read_sample_map(map_path, population_path = data_path)
    sample_map_path = join_paths(data_path, "sample_maps")
    sample_map_paths = [sample_map_path+"/"+s+".map" for s in set_names]
    sample_map_idxs = split_sample_map(sample_ids = np.array(samples["Sample"]),
                                        populations = np.array(samples["Population"]),
                                        ratios = founders_ratios,
                                        pop_ids = pop_ids,
                                        sample_map_paths=sample_map_paths)

    # save the sample_map_idxs as it is specific to this dataset and its splits
    # to be used later; the context manager closes (and flushes) the file handle
    with open(data_path+"sample_map_idxs.dict","wb") as idx_file:
        pickle.dump(sample_map_idxs, idx_file)

path created: /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/latino/sample_maps
path created: /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/five/sample_maps
path created: /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/seven/sample_maps


In [11]:
# Passed to get_num_outs together with the sample map paths.
# NOTE(review): presumably the ratio of simulated admixed individuals to founders - confirm.
r_admixed = 3.0

# Generations handed to the simulator; the per-subset output count is divided evenly
# across them.
generations = [2, 4, 8, 16, 24, 32, 48, 64, 72, 100]

In [12]:
def get_bal_sample_weights(map_path,wt_path):
    """
    Write per-sample weights that give every ancestry the same total weight.

    map_path: tab-separated sample map without header ("sample<TAB>population" per
    line; lines starting with '#' are ignored).
    wt_path: output file; receives one "sample<TAB>weight" line per sample, grouped
    by population.

    Each ancestry gets an equal share of 1.0, and that share is divided evenly among
    the samples of the ancestry. The sample map itself is not modified.
    """
    print(map_path, wt_path)

    table = pd.read_csv(map_path, delimiter="\t", header=None, comment="#")
    table.columns = ['Sample', 'Population']
    rows = np.array(table)

    # Group sample ids under their population label (labels in np.unique order).
    members_by_pop = {label: [] for label in np.unique(rows[:, 1])}
    for sample_id, label in rows:
        members_by_pop[label].append(sample_id)

    share_per_ancestry = 1.0/len(members_by_pop)

    with open(wt_path, "w") as out:
        for members in members_by_pop.values():
            # an ancestry's share is split equally between its samples
            per_sample = share_per_ancestry / len(members)
            for sample_id in members:
                out.write("{}\t{}\n".format(sample_id, per_sample))

In [13]:
# now create all the experiments and process the train1, train2, val, test accordingly.
# then generate the data

# For every chromosome / SNP set / balance mode / experiment: create the experiment
# directory, copy in the pre-computed sample maps, rebalance them (or create sample
# weights) according to the balance mode, then simulate and write the admixed data.
for chm in [20]:

    for dataset in ["full","ukb"]:

        # read reference and genetic map
        # NOTE(review): the two datasets use different genetic map builds (b37 vs b38) -
        # confirm this matches the coordinates of each reference file.
        if dataset == "full":
            reference_file = "/home/projects/world_wide_references/ref_final_{0}/ref_final_beagle_phased_1kg_hgdp_sgdp_chr{0}_hg19.vcf.gz".format(chm)
            genetic_map_file = "/home/database/maps/rfmix/allchrs.b37.gmap"
        elif dataset == "ukb":
            reference_file = "/home/arvindsk/xgmix_expts/ukb/filtered_references/ukb_snps_chm_{0}.recode.vcf".format(chm)
            genetic_map_file = "/home/database/maps/rfmix/allchrs.b38.gmap"
        print("Using genetic map: {}".format(genetic_map_file))

        # load the reference once per dataset; main_admixture_fast accepts the loaded
        # object as well as a path
        reference_file = read_vcf(reference_file)

        for bal in ["unbal_unbal","bal_bal","bal_unbal","bal_admix"]:
            for expt in ["latino","five","seven"]:

                print("Doing: {}, {}, {}".format(dataset,bal,expt))

                sample_weights = None # default, but can change based on the expt
                # create experiment path
                data_path = "/home/arvindsk/xgmix_expts/benchmark_data/chm{}/{}/{}/{}/".format(chm, dataset, bal, expt)
                os.makedirs(data_path, exist_ok=True)

                data_path = join_paths(data_path, 'generated_data', verb=False)

                # copy populations.txt and sample maps previously generated
                expt_samples_path = "/home/arvindsk/xgmix_expts/benchmark_data/sample_maps/{}/".format(expt)
                run_shell_cmd("cp {}/populations.txt {}".format(expt_samples_path,data_path))
                run_shell_cmd("cp -r {}/sample_maps {}".format(expt_samples_path, data_path))
                # index mapping saved when the master sample map was split;
                # the context manager closes the file handle
                with open(expt_samples_path+"sample_map_idxs.dict","rb") as idx_file:
                    sample_map_idxs = pickle.load(idx_file)
                sample_map_paths = [data_path+"/sample_maps/"+s+".map" for s in set_names]

                # directory receiving the sample weight files (same for every subset)
                sample_wt_dir = data_path+"/sample_maps/"
                os.makedirs(sample_wt_dir, exist_ok=True)

                # the branches below index sample_map_paths positionally
                assert("train1" in sample_map_paths[0])
                assert("train2" in sample_map_paths[1])
                assert("val" in sample_map_paths[2])
                assert("test" in sample_map_paths[3])

                # based on bal, rebalance the copied sample maps accordingly
                if bal == "bal_bal":
                    # every subset is balanced
                    for map_file in sample_map_paths:
                        balance_map(map_file)

                elif bal == "bal_unbal":
                    # only train1 and train2 are balanced
                    balance_map(sample_map_paths[0])
                    balance_map(sample_map_paths[1])

                elif bal == "unbal_unbal":
                    pass

                elif bal == "bal_admix":
                    # sample maps stay untouched; train1 and train2 get sample weights
                    train_wt_files = [sample_wt_dir+"/train1.wts", sample_wt_dir+"/train2.wts"]
                    for map_file, wt_file in zip(sample_map_paths[:2], train_wt_files):
                        get_bal_sample_weights(map_file, wt_file)
                    sample_weights = train_wt_files + [None, None]

                else:
                    # fail loudly instead of simulating data under a mislabeled directory
                    raise ValueError("Weird balance given: {}".format(bal))

                if bal != "bal_admix":
                    assert sample_weights is None

                # number of individuals to simulate per subset, spread evenly over generations
                num_outs = get_num_outs(sample_map_paths, r_admixed)
                num_outs_per_gen = [n//len(generations) for n in num_outs]
                main_admixture_fast(chm, data_path, set_names, sample_map_paths, sample_map_idxs,
                               reference_file, genetic_map_file, num_outs_per_gen, generations, sample_weights)

Using genetic map: /home/database/maps/rfmix/allchrs.b37.gmap
File read: 516800 SNPs for 3558 individuals
Doing: full, unbal_unbal, latino
Running: cp /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/latino//populations.txt /home/arvindsk/xgmix_expts/benchmark_data/full/unbal_unbal/latino/generated_data
Running: cp -r /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/latino//sample_maps /home/arvindsk/xgmix_expts/benchmark_data/full/unbal_unbal/latino/generated_data
Sample weights given is :  [None, None, None, None]
path created: /home/arvindsk/xgmix_expts/benchmark_data/full/unbal_unbal/latino/generated_data/chm20
path created: /home/arvindsk/xgmix_expts/benchmark_data/full/unbal_unbal/latino/generated_data/chm20/simulation_output
path created: /home/arvindsk/xgmix_expts/benchmark_data/full/unbal_unbal/latino/generated_data/chm20/simulation_output/train1
Building founders
Simulating...
Simulating generation  2
Simulating generation  4
Simulating generation  8
Simulating gene

Simulating generation  8
Simulating generation  16
Simulating generation  24
Simulating generation  32
Simulating generation  48
Simulating generation  64
Simulating generation  72
Simulating generation  100
Writing generation: 0
Writing generation: 2
Writing generation: 4
Writing generation: 8
Writing generation: 16
Writing generation: 24
Writing generation: 32
Writing generation: 48
Writing generation: 64
Writing generation: 72
Writing generation: 100
path created: /home/arvindsk/xgmix_expts/benchmark_data/full/unbal_unbal/seven/generated_data/chm20/simulation_output/val
Building founders
Simulating...
Simulating generation  2
Simulating generation  4
Simulating generation  8
Simulating generation  16
Simulating generation  24
Simulating generation  32
Simulating generation  48
Simulating generation  64
Simulating generation  72
Simulating generation  100
Writing generation: 0
Writing generation: 2
Writing generation: 4
Writing generation: 8
Writing generation: 16
Writing generation:

Writing generation: 48
Writing generation: 64
Writing generation: 72
Writing generation: 100
Doing: full, bal_bal, seven
Running: cp /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/seven//populations.txt /home/arvindsk/xgmix_expts/benchmark_data/full/bal_bal/seven/generated_data
Running: cp -r /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/seven//sample_maps /home/arvindsk/xgmix_expts/benchmark_data/full/bal_bal/seven/generated_data
Sample weights given is :  [None, None, None, None]
path created: /home/arvindsk/xgmix_expts/benchmark_data/full/bal_bal/seven/generated_data/chm20
path created: /home/arvindsk/xgmix_expts/benchmark_data/full/bal_bal/seven/generated_data/chm20/simulation_output
path created: /home/arvindsk/xgmix_expts/benchmark_data/full/bal_bal/seven/generated_data/chm20/simulation_output/train1
Building founders
Simulating...
Simulating generation  2
Simulating generation  4
Simulating generation  8
Simulating generation  16
Simulating generation  24
Simulati

Simulating generation  32
Simulating generation  48
Simulating generation  64
Simulating generation  72
Simulating generation  100
Writing generation: 0
Writing generation: 2
Writing generation: 4
Writing generation: 8
Writing generation: 16
Writing generation: 24
Writing generation: 32
Writing generation: 48
Writing generation: 64
Writing generation: 72
Writing generation: 100
path created: /home/arvindsk/xgmix_expts/benchmark_data/full/bal_unbal/five/generated_data/chm20/simulation_output/val
Building founders
Simulating...
Simulating generation  2
Simulating generation  4
Simulating generation  8
Simulating generation  16
Simulating generation  24
Simulating generation  32
Simulating generation  48
Simulating generation  64
Simulating generation  72
Simulating generation  100
Writing generation: 0
Writing generation: 2
Writing generation: 4
Writing generation: 8
Writing generation: 16
Writing generation: 24
Writing generation: 32
Writing generation: 48
Writing generation: 64
Writing

path created: /home/arvindsk/xgmix_expts/benchmark_data/full/bal_admix/latino/generated_data/chm20/simulation_output/test
Building founders
Simulating...
Simulating generation  2
Simulating generation  4
Simulating generation  8
Simulating generation  16
Simulating generation  24
Simulating generation  32
Simulating generation  48
Simulating generation  64
Simulating generation  72
Simulating generation  100
Writing generation: 0
Writing generation: 2
Writing generation: 4
Writing generation: 8
Writing generation: 16
Writing generation: 24
Writing generation: 32
Writing generation: 48
Writing generation: 64
Writing generation: 72
Writing generation: 100
Doing: full, bal_admix, five
Running: cp /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/five//populations.txt /home/arvindsk/xgmix_expts/benchmark_data/full/bal_admix/five/generated_data
Running: cp -r /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/five//sample_maps /home/arvindsk/xgmix_expts/benchmark_data/full/bal_admix/

Writing generation: 4
Writing generation: 8
Writing generation: 16
Writing generation: 24
Writing generation: 32
Writing generation: 48
Writing generation: 64
Writing generation: 72
Writing generation: 100
Using genetic map: /home/database/maps/rfmix/allchrs.b38.gmap
File read: 15964 SNPs for 3558 individuals
Doing: ukb, unbal_unbal, latino
Running: cp /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/latino//populations.txt /home/arvindsk/xgmix_expts/benchmark_data/ukb/unbal_unbal/latino/generated_data
Running: cp -r /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/latino//sample_maps /home/arvindsk/xgmix_expts/benchmark_data/ukb/unbal_unbal/latino/generated_data
Sample weights given is :  [None, None, None, None]
path created: /home/arvindsk/xgmix_expts/benchmark_data/ukb/unbal_unbal/latino/generated_data/chm20
path created: /home/arvindsk/xgmix_expts/benchmark_data/ukb/unbal_unbal/latino/generated_data/chm20/simulation_output
path created: /home/arvindsk/xgmix_expts/benchma

Simulating...
Simulating generation  2
Simulating generation  4
Simulating generation  8
Simulating generation  16
Simulating generation  24
Simulating generation  32
Simulating generation  48
Simulating generation  64
Simulating generation  72
Simulating generation  100
Writing generation: 0
Writing generation: 2
Writing generation: 4
Writing generation: 8
Writing generation: 16
Writing generation: 24
Writing generation: 32
Writing generation: 48
Writing generation: 64
Writing generation: 72
Writing generation: 100
path created: /home/arvindsk/xgmix_expts/benchmark_data/ukb/unbal_unbal/seven/generated_data/chm20/simulation_output/val
Building founders
Simulating...
Simulating generation  2
Simulating generation  4
Simulating generation  8
Simulating generation  16
Simulating generation  24
Simulating generation  32
Simulating generation  48
Simulating generation  64
Simulating generation  72
Simulating generation  100
Writing generation: 0
Writing generation: 2
Writing generation: 4
W

Writing generation: 48
Writing generation: 64
Writing generation: 72
Writing generation: 100
Doing: ukb, bal_bal, seven
Running: cp /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/seven//populations.txt /home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_bal/seven/generated_data
Running: cp -r /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/seven//sample_maps /home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_bal/seven/generated_data
Sample weights given is :  [None, None, None, None]
path created: /home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_bal/seven/generated_data/chm20
path created: /home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_bal/seven/generated_data/chm20/simulation_output
path created: /home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_bal/seven/generated_data/chm20/simulation_output/train1
Building founders
Simulating...
Simulating generation  2
Simulating generation  4
Simulating generation  8
Simulating generation  16
Simulating generation  24
Simulating gen

Simulating generation  64
Simulating generation  72
Simulating generation  100
Writing generation: 0
Writing generation: 2
Writing generation: 4
Writing generation: 8
Writing generation: 16
Writing generation: 24
Writing generation: 32
Writing generation: 48
Writing generation: 64
Writing generation: 72
Writing generation: 100
path created: /home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_unbal/five/generated_data/chm20/simulation_output/val
Building founders
Simulating...
Simulating generation  2
Simulating generation  4
Simulating generation  8
Simulating generation  16
Simulating generation  24
Simulating generation  32
Simulating generation  48
Simulating generation  64
Simulating generation  72
Simulating generation  100
Writing generation: 0
Writing generation: 2
Writing generation: 4
Writing generation: 8
Writing generation: 16
Writing generation: 24
Writing generation: 32
Writing generation: 48
Writing generation: 64
Writing generation: 72
Writing generation: 100
path created:

Simulating...
Simulating generation  2
Simulating generation  4
Simulating generation  8
Simulating generation  16
Simulating generation  24
Simulating generation  32
Simulating generation  48
Simulating generation  64
Simulating generation  72
Simulating generation  100
Writing generation: 0
Writing generation: 2
Writing generation: 4
Writing generation: 8
Writing generation: 16
Writing generation: 24
Writing generation: 32
Writing generation: 48
Writing generation: 64
Writing generation: 72
Writing generation: 100
Doing: ukb, bal_admix, five
Running: cp /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/five//populations.txt /home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_admix/five/generated_data
Running: cp -r /home/arvindsk/xgmix_expts/benchmark_data/sample_maps/five//sample_maps /home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_admix/five/generated_data
/home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_admix/five/generated_data/sample_maps/train1.map /home/arvindsk/xgmix_exp

In [None]:
### some sanity checks on the data 
"""
- could check if the balanced dataset is significantly more balanced than the unbalanced.
- could check the ancestries

"""

In [6]:
# balanced data: load mat_map.npy for full / bal_bal / seven / train1 / gen_24
# NOTE(review): mat_map.npy is presumably the maternal ancestry labels - confirm.
# NOTE(review): this path has no chm<N> directory right after benchmark_data, unlike the
# experiment directories built by the generation loop in this notebook - confirm the
# file location before re-running.

data_path = "/home/arvindsk/xgmix_expts/benchmark_data/full/bal_bal/seven/generated_data/chm20/simulation_output/train1/gen_24/mat_map.npy"
aa = np.load(data_path)

In [7]:
# distinct values in the loaded array and how many entries hold each of them
np.unique(aa,return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]),
 array([3535479, 3505838, 2795708, 4197521, 3615904, 2997183, 3125167]))

In [8]:
# unbalanced data: same file as the balanced check, but from the unbal_unbal experiment

data_path = "/home/arvindsk/xgmix_expts/benchmark_data/full/unbal_unbal/seven/generated_data/chm20/simulation_output/train1/gen_24/mat_map.npy"
aa = np.load(data_path)

In [9]:
# distinct values in the loaded array and how many entries hold each of them
np.unique(aa,return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]),
 array([76457817, 97536822, 32121469, 15753892,  3333939, 34984965,
        12681496]))

In [13]:
# bal admix: same file from the bal_admix experiment (sample weights instead of pruned maps)

data_path = "/home/arvindsk/xgmix_expts/benchmark_data/full/bal_admix/seven/generated_data/chm20/simulation_output/train1/gen_24/mat_map.npy"
aa = np.load(data_path)
# display the array dimensions
aa.shape

(528, 516800)

In [11]:
# distinct values in the loaded array and how many entries hold each of them
np.unique(aa,return_counts=True)

(array([0, 1, 2, 3, 4, 5, 6]),
 array([39278169, 39093344, 38222556, 37791148, 38648160, 40893887,
        38943136]))

In [12]:
## check ukb shape: load the same file from the ukb dataset and look at its dimensions

data_path = "/home/arvindsk/xgmix_expts/benchmark_data/ukb/bal_admix/seven/generated_data/chm20/simulation_output/train1/gen_24/mat_map.npy"
aa = np.load(data_path)
# display the array dimensions
aa.shape

(528, 15964)

In [15]:
## check latino number of anc: print the shape and the distinct values of the latino array
data_path = "/home/arvindsk/xgmix_expts/benchmark_data/full/bal_admix/latino/generated_data/chm20/simulation_output/train1/gen_24/mat_map.npy"
aa = np.load(data_path)
print(aa.shape)
print(np.unique(aa))

[0 1 2]


In [16]:
## check five number of anc: print the shape and the distinct values of the five-ancestry array
data_path = "/home/arvindsk/xgmix_expts/benchmark_data/full/bal_admix/five/generated_data/chm20/simulation_output/train1/gen_24/mat_map.npy"
aa = np.load(data_path)
print(aa.shape)
print(np.unique(aa))

[0 1 2 3 4]
