# Filter Variants

This notebook further filters the rat variant data after the LD-pruning step. The filtering is similar to the approach used in Xu et al., Nature.

In [1]:
import sgkit as sg
from sgkit.io import plink
import pandas as pd
import numpy as np
import math
import pickle
import xarray as xr

## Load in Genotypes and Phenotypes Data

In [9]:
# Read the LD-pruned PLINK fileset (bed/bim/fam) into a dataset.
ds = plink.read_plink(
    bed_path='ratgenes_pruned/ratgenes_pruned.bed',
    bim_path='ratgenes_pruned/ratgenes_pruned.bim',
    fam_path='ratgenes_pruned/ratgenes_pruned.fam',
)

# Collapse the ploidy axis: -1 where any allele of a call is masked,
# otherwise the sum of the call's genotype values.
call_g_mask = ds["call_genotype_mask"].any(dim="ploidy")
call_g = xr.where(call_g_mask, -1, ds["call_genotype"].sum(dim="ploidy"))
# Materialise as a NumPy array and transpose it.
genotypes_matrix = np.transpose(call_g.values)

# Tab-separated phenotype table.
loco = pd.read_csv("pheno_loco_clean.txt", sep='\t')
ds

Unnamed: 0,Array,Chunk
Bytes,377.55 kiB,377.55 kiB
Shape,"(96654,)","(96654,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 377.55 kiB 377.55 kiB Shape (96654,) (96654,) Dask graph 1 chunks in 3 graph layers Data type int32 numpy.ndarray",96654  1,

Unnamed: 0,Array,Chunk
Bytes,377.55 kiB,377.55 kiB
Shape,"(96654,)","(96654,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,188.78 kiB,94.39 kiB
Shape,"(96654, 2)","(96654, 1)"
Dask graph,2 chunks in 13 graph layers,2 chunks in 13 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
"Array Chunk Bytes 188.78 kiB 94.39 kiB Shape (96654, 2) (96654, 1) Dask graph 2 chunks in 13 graph layers Data type |S1 numpy.ndarray",2  96654,

Unnamed: 0,Array,Chunk
Bytes,188.78 kiB,94.39 kiB
Shape,"(96654, 2)","(96654, 1)"
Dask graph,2 chunks in 13 graph layers,2 chunks in 13 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,792.54 kiB,792.54 kiB
Shape,"(13526,)","(13526,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,,
"Array Chunk Bytes 792.54 kiB 792.54 kiB Shape (13526,) (13526,) Dask graph 1 chunks in 4 graph layers Data type",13526  1,

Unnamed: 0,Array,Chunk
Bytes,792.54 kiB,792.54 kiB
Shape,"(13526,)","(13526,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,2.44 GiB,128.00 MiB
Shape,"(96654, 13526, 2)","(8192, 8192, 2)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 2.44 GiB 128.00 MiB Shape (96654, 13526, 2) (8192, 8192, 2) Dask graph 24 chunks in 2 graph layers Data type int8 numpy.ndarray",2  13526  96654,

Unnamed: 0,Array,Chunk
Bytes,2.44 GiB,128.00 MiB
Shape,"(96654, 13526, 2)","(8192, 8192, 2)"
Dask graph,24 chunks in 2 graph layers,24 chunks in 2 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.44 GiB,128.00 MiB
Shape,"(96654, 13526, 2)","(8192, 8192, 2)"
Dask graph,24 chunks in 3 graph layers,24 chunks in 3 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 2.44 GiB 128.00 MiB Shape (96654, 13526, 2) (8192, 8192, 2) Dask graph 24 chunks in 3 graph layers Data type bool numpy.ndarray",2  13526  96654,

Unnamed: 0,Array,Chunk
Bytes,2.44 GiB,128.00 MiB
Shape,"(96654, 13526, 2)","(8192, 8192, 2)"
Dask graph,24 chunks in 3 graph layers,24 chunks in 3 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.53 MiB,5.53 MiB
Shape,"(96654,)","(96654,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,,
"Array Chunk Bytes 5.53 MiB 5.53 MiB Shape (96654,) (96654,) Dask graph 1 chunks in 4 graph layers Data type",96654  1,

Unnamed: 0,Array,Chunk
Bytes,5.53 MiB,5.53 MiB
Shape,"(96654,)","(96654,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,792.54 kiB,792.54 kiB
Shape,"(13526,)","(13526,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,,
"Array Chunk Bytes 792.54 kiB 792.54 kiB Shape (13526,) (13526,) Dask graph 1 chunks in 4 graph layers Data type",13526  1,

Unnamed: 0,Array,Chunk
Bytes,792.54 kiB,792.54 kiB
Shape,"(13526,)","(13526,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,211.34 kiB,211.34 kiB
Shape,"(13526,)","(13526,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,,
"Array Chunk Bytes 211.34 kiB 211.34 kiB Shape (13526,) (13526,) Dask graph 1 chunks in 4 graph layers Data type",13526  1,

Unnamed: 0,Array,Chunk
Bytes,211.34 kiB,211.34 kiB
Shape,"(13526,)","(13526,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,211.34 kiB,211.34 kiB
Shape,"(13526,)","(13526,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,,
"Array Chunk Bytes 211.34 kiB 211.34 kiB Shape (13526,) (13526,) Dask graph 1 chunks in 4 graph layers Data type",13526  1,

Unnamed: 0,Array,Chunk
Bytes,211.34 kiB,211.34 kiB
Shape,"(13526,)","(13526,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,13.21 kiB,13.21 kiB
Shape,"(13526,)","(13526,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 13.21 kiB 13.21 kiB Shape (13526,) (13526,) Dask graph 1 chunks in 3 graph layers Data type int8 numpy.ndarray",13526  1,

Unnamed: 0,Array,Chunk
Bytes,13.21 kiB,13.21 kiB
Shape,"(13526,)","(13526,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,13.21 kiB,13.21 kiB
Shape,"(13526,)","(13526,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 13.21 kiB 13.21 kiB Shape (13526,) (13526,) Dask graph 1 chunks in 3 graph layers Data type int8 numpy.ndarray",13526  1,

Unnamed: 0,Array,Chunk
Bytes,13.21 kiB,13.21 kiB
Shape,"(13526,)","(13526,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray


## Clean phenotypes data

In [10]:
def get_rats_with_loco(ds, phenotypes, traits=None):
    """Select the locomotor phenotypes of rats present in the genotype dataset.

    Parameters
    ----------
    ds : mapping
        Genotype dataset; ``ds['sample_id'].values`` must yield the
        genotyped rat IDs.
    phenotypes : pandas.DataFrame
        Phenotype table with an ``rfid`` column and one column per trait.
        It is not modified.
    traits : sequence of str, optional
        Trait columns to keep. Defaults to ``loco_maxcent``,
        ``loco_maxdis``, ``loco_maxrear`` and ``loco_maxact``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``phenotypes`` whose ``rfid`` occurs in ``ds``, indexed
        by ``rfid`` (index named ``Sample``) and restricted to ``traits``.

    Raises
    ------
    KeyError
        If ``rfid`` or one of the requested trait columns is missing.
    """
    if traits is None:
        traits = ['loco_maxcent', 'loco_maxdis', 'loco_maxrear', 'loco_maxact']

    rat_ids = ds['sample_id'].values
    genotyped = phenotypes[phenotypes['rfid'].isin(rat_ids)]
    # rename_axis returns a new frame, so no shared Index object is mutated.
    selected = genotyped.set_index('rfid').rename_axis('Sample')[list(traits)]
    return selected

In [11]:
# Restrict the phenotype table to genotyped rats and the selected traits.
# NOTE: this rebinds `loco`; the result no longer has an 'rfid' column, so
# reload the phenotype file before running this cell a second time.
loco = get_rats_with_loco(ds=ds, phenotypes=loco)
print(loco.shape)
loco.head()

(2368, 4)


Unnamed: 0_level_0,loco_maxcent,loco_maxdis,loco_maxrear,loco_maxact
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00077E61F3,171.3,568.0,19.0,109.0
00077E6207,115.4,697.0,26.0,135.0
00077E6232,48.1,652.0,20.0,124.0
00077E6239,147.1,466.0,25.0,97.0
00077E62D2,89.4,599.0,22.0,96.0


## Merge Genotypes with Phenotypes

In [30]:
# Phenotypes as an xarray dataset keyed on the same dimension name as the genotypes.
ds_annotations = loco.to_xarray().rename({"Sample": "samples"})

# Index samples by their ID, drop repeated IDs, keep only the phenotyped rats
# (in phenotype-table order), then attach the trait variables.
ds = (
    ds.set_index({"samples": "sample_id"})
    .drop_duplicates(dim=['samples'])
    .sel(samples=list(loco.index))
    .merge(ds_annotations, join="left")
)
ds

  return self.array[key]


Unnamed: 0,Array,Chunk
Bytes,377.55 kiB,377.55 kiB
Shape,"(96654,)","(96654,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 377.55 kiB 377.55 kiB Shape (96654,) (96654,) Dask graph 1 chunks in 3 graph layers Data type int32 numpy.ndarray",96654  1,

Unnamed: 0,Array,Chunk
Bytes,377.55 kiB,377.55 kiB
Shape,"(96654,)","(96654,)"
Dask graph,1 chunks in 3 graph layers,1 chunks in 3 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,188.78 kiB,94.39 kiB
Shape,"(96654, 2)","(96654, 1)"
Dask graph,2 chunks in 13 graph layers,2 chunks in 13 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
"Array Chunk Bytes 188.78 kiB 94.39 kiB Shape (96654, 2) (96654, 1) Dask graph 2 chunks in 13 graph layers Data type |S1 numpy.ndarray",2  96654,

Unnamed: 0,Array,Chunk
Bytes,188.78 kiB,94.39 kiB
Shape,"(96654, 2)","(96654, 1)"
Dask graph,2 chunks in 13 graph layers,2 chunks in 13 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,436.55 MiB,11.95 MiB
Shape,"(96654, 2368, 2)","(8192, 765, 2)"
Dask graph,1776 chunks in 4 graph layers,1776 chunks in 4 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 436.55 MiB 11.95 MiB Shape (96654, 2368, 2) (8192, 765, 2) Dask graph 1776 chunks in 4 graph layers Data type int8 numpy.ndarray",2  2368  96654,

Unnamed: 0,Array,Chunk
Bytes,436.55 MiB,11.95 MiB
Shape,"(96654, 2368, 2)","(8192, 765, 2)"
Dask graph,1776 chunks in 4 graph layers,1776 chunks in 4 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,436.55 MiB,11.95 MiB
Shape,"(96654, 2368, 2)","(8192, 765, 2)"
Dask graph,1776 chunks in 5 graph layers,1776 chunks in 5 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 436.55 MiB 11.95 MiB Shape (96654, 2368, 2) (8192, 765, 2) Dask graph 1776 chunks in 5 graph layers Data type bool numpy.ndarray",2  2368  96654,

Unnamed: 0,Array,Chunk
Bytes,436.55 MiB,11.95 MiB
Shape,"(96654, 2368, 2)","(8192, 765, 2)"
Dask graph,1776 chunks in 5 graph layers,1776 chunks in 5 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,5.53 MiB,5.53 MiB
Shape,"(96654,)","(96654,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,,
"Array Chunk Bytes 5.53 MiB 5.53 MiB Shape (96654,) (96654,) Dask graph 1 chunks in 4 graph layers Data type",96654  1,

Unnamed: 0,Array,Chunk
Bytes,5.53 MiB,5.53 MiB
Shape,"(96654,)","(96654,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,138.75 kiB,138.75 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,
"Array Chunk Bytes 138.75 kiB 138.75 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 6 graph layers Data type",2368  1,

Unnamed: 0,Array,Chunk
Bytes,138.75 kiB,138.75 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,37.00 kiB,37.00 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,
"Array Chunk Bytes 37.00 kiB 37.00 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 6 graph layers Data type",2368  1,

Unnamed: 0,Array,Chunk
Bytes,37.00 kiB,37.00 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,37.00 kiB,37.00 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,
"Array Chunk Bytes 37.00 kiB 37.00 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 6 graph layers Data type",2368  1,

Unnamed: 0,Array,Chunk
Bytes,37.00 kiB,37.00 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,2.31 kiB,2.31 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 2.31 kiB 2.31 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 5 graph layers Data type int8 numpy.ndarray",2368  1,

Unnamed: 0,Array,Chunk
Bytes,2.31 kiB,2.31 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.31 kiB,2.31 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 2.31 kiB 2.31 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 5 graph layers Data type int8 numpy.ndarray",2368  1,

Unnamed: 0,Array,Chunk
Bytes,2.31 kiB,2.31 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray


## Begin Filtering the Genotypes Dataset

### First filter out SNPs with missing calls (or no variation) for GWAS

In [32]:
ds = sg.stats.pca.count_call_alternate_alleles(ds)

alt_counts = ds.call_alternate_allele_count
# A variant is dropped if any sample has a negative count ...
has_negative_count = (alt_counts < 0).any(dim="samples")
# ... or if its count does not vary across samples.
is_constant = alt_counts.std(dim="samples") <= 0.0
variant_mask = has_negative_count | is_constant

ds = ds.sel(variants=~variant_mask)
print(f"Samples: {len(ds.samples)}  Variants: {len(ds.variants)}")
ds

Samples: 2368  Variants: 8690


Unnamed: 0,Array,Chunk
Bytes,39.25 MiB,1.37 MiB
Shape,"(8690, 2368)","(939, 765)"
Dask graph,1776 chunks in 17 graph layers,1776 chunks in 17 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray
"Array Chunk Bytes 39.25 MiB 1.37 MiB Shape (8690, 2368) (939, 765) Dask graph 1776 chunks in 17 graph layers Data type int16 numpy.ndarray",2368  8690,

Unnamed: 0,Array,Chunk
Bytes,39.25 MiB,1.37 MiB
Shape,"(8690, 2368)","(939, 765)"
Dask graph,1776 chunks in 17 graph layers,1776 chunks in 17 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,33.95 kiB,33.95 kiB
Shape,"(8690,)","(8690,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 33.95 kiB 33.95 kiB Shape (8690,) (8690,) Dask graph 1 chunks in 4 graph layers Data type int32 numpy.ndarray",8690  1,

Unnamed: 0,Array,Chunk
Bytes,33.95 kiB,33.95 kiB
Shape,"(8690,)","(8690,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.97 kiB,8.49 kiB
Shape,"(8690, 2)","(8690, 1)"
Dask graph,2 chunks in 14 graph layers,2 chunks in 14 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
"Array Chunk Bytes 16.97 kiB 8.49 kiB Shape (8690, 2) (8690, 1) Dask graph 2 chunks in 14 graph layers Data type |S1 numpy.ndarray",2  8690,

Unnamed: 0,Array,Chunk
Bytes,16.97 kiB,8.49 kiB
Shape,"(8690, 2)","(8690, 1)"
Dask graph,2 chunks in 14 graph layers,2 chunks in 14 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.25 MiB,1.37 MiB
Shape,"(8690, 2368, 2)","(939, 765, 2)"
Dask graph,1776 chunks in 5 graph layers,1776 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 39.25 MiB 1.37 MiB Shape (8690, 2368, 2) (939, 765, 2) Dask graph 1776 chunks in 5 graph layers Data type int8 numpy.ndarray",2  2368  8690,

Unnamed: 0,Array,Chunk
Bytes,39.25 MiB,1.37 MiB
Shape,"(8690, 2368, 2)","(939, 765, 2)"
Dask graph,1776 chunks in 5 graph layers,1776 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.25 MiB,1.37 MiB
Shape,"(8690, 2368, 2)","(939, 765, 2)"
Dask graph,1776 chunks in 6 graph layers,1776 chunks in 6 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 39.25 MiB 1.37 MiB Shape (8690, 2368, 2) (939, 765, 2) Dask graph 1776 chunks in 6 graph layers Data type bool numpy.ndarray",2  2368  8690,

Unnamed: 0,Array,Chunk
Bytes,39.25 MiB,1.37 MiB
Shape,"(8690, 2368, 2)","(939, 765, 2)"
Dask graph,1776 chunks in 6 graph layers,1776 chunks in 6 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,509.18 kiB,509.18 kiB
Shape,"(8690,)","(8690,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,,
"Array Chunk Bytes 509.18 kiB 509.18 kiB Shape (8690,) (8690,) Dask graph 1 chunks in 5 graph layers Data type",8690  1,

Unnamed: 0,Array,Chunk
Bytes,509.18 kiB,509.18 kiB
Shape,"(8690,)","(8690,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,138.75 kiB,138.75 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,
"Array Chunk Bytes 138.75 kiB 138.75 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 6 graph layers Data type",2368  1,

Unnamed: 0,Array,Chunk
Bytes,138.75 kiB,138.75 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,37.00 kiB,37.00 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,
"Array Chunk Bytes 37.00 kiB 37.00 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 6 graph layers Data type",2368  1,

Unnamed: 0,Array,Chunk
Bytes,37.00 kiB,37.00 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,37.00 kiB,37.00 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,
"Array Chunk Bytes 37.00 kiB 37.00 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 6 graph layers Data type",2368  1,

Unnamed: 0,Array,Chunk
Bytes,37.00 kiB,37.00 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,2.31 kiB,2.31 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 2.31 kiB 2.31 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 5 graph layers Data type int8 numpy.ndarray",2368  1,

Unnamed: 0,Array,Chunk
Bytes,2.31 kiB,2.31 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.31 kiB,2.31 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 2.31 kiB 2.31 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 5 graph layers Data type int8 numpy.ndarray",2368  1,

Unnamed: 0,Array,Chunk
Bytes,2.31 kiB,2.31 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray


In [34]:
# Persist the filtered dataset to disk as a pickle

with open('ds_known.pkl', 'wb') as pkl_out:
    pickle.dump(ds, pkl_out)

In [2]:
# Restore the filtered dataset from the saved pickle.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.

with open('ds_known.pkl', 'rb') as pkl_in:
    ds = pickle.load(pkl_in)

### Run GWAS Linear Regression to get Pvalues

In [4]:
# Trait variables to regress on.
traits = ['loco_maxcent', 'loco_maxdis', 'loco_maxrear', 'loco_maxact']

# Dosage: call_genotype summed over the ploidy axis.
ds["call_dosage"] = ds["call_genotype"].sum(dim="ploidy")

# Linear regression with an intercept and no additional covariates.
ds_lr = sg.gwas_linear_regression(
    ds,
    dosage="call_dosage",
    covariates=[],
    traits=traits,
    add_intercept=True,
)
ds_lr

Unnamed: 0,Array,Chunk
Bytes,271.56 kiB,29.34 kiB
Shape,"(8690, 4)","(939, 4)"
Dask graph,12 chunks in 57 graph layers,12 chunks in 57 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 271.56 kiB 29.34 kiB Shape (8690, 4) (939, 4) Dask graph 12 chunks in 57 graph layers Data type float64 numpy.ndarray",4  8690,

Unnamed: 0,Array,Chunk
Bytes,271.56 kiB,29.34 kiB
Shape,"(8690, 4)","(939, 4)"
Dask graph,12 chunks in 57 graph layers,12 chunks in 57 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,271.56 kiB,29.34 kiB
Shape,"(8690, 4)","(939, 4)"
Dask graph,12 chunks in 73 graph layers,12 chunks in 73 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 271.56 kiB 29.34 kiB Shape (8690, 4) (939, 4) Dask graph 12 chunks in 73 graph layers Data type float64 numpy.ndarray",4  8690,

Unnamed: 0,Array,Chunk
Bytes,271.56 kiB,29.34 kiB
Shape,"(8690, 4)","(939, 4)"
Dask graph,12 chunks in 73 graph layers,12 chunks in 73 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,271.56 kiB,29.34 kiB
Shape,"(8690, 4)","(939, 4)"
Dask graph,12 chunks in 75 graph layers,12 chunks in 75 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 271.56 kiB 29.34 kiB Shape (8690, 4) (939, 4) Dask graph 12 chunks in 75 graph layers Data type float64 numpy.ndarray",4  8690,

Unnamed: 0,Array,Chunk
Bytes,271.56 kiB,29.34 kiB
Shape,"(8690, 4)","(939, 4)"
Dask graph,12 chunks in 75 graph layers,12 chunks in 75 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.25 MiB,1.37 MiB
Shape,"(8690, 2368)","(939, 765)"
Dask graph,1776 chunks in 17 graph layers,1776 chunks in 17 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray
"Array Chunk Bytes 39.25 MiB 1.37 MiB Shape (8690, 2368) (939, 765) Dask graph 1776 chunks in 17 graph layers Data type int16 numpy.ndarray",2368  8690,

Unnamed: 0,Array,Chunk
Bytes,39.25 MiB,1.37 MiB
Shape,"(8690, 2368)","(939, 765)"
Dask graph,1776 chunks in 17 graph layers,1776 chunks in 17 graph layers
Data type,int16 numpy.ndarray,int16 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,33.95 kiB,33.95 kiB
Shape,"(8690,)","(8690,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 33.95 kiB 33.95 kiB Shape (8690,) (8690,) Dask graph 1 chunks in 4 graph layers Data type int32 numpy.ndarray",8690  1,

Unnamed: 0,Array,Chunk
Bytes,33.95 kiB,33.95 kiB
Shape,"(8690,)","(8690,)"
Dask graph,1 chunks in 4 graph layers,1 chunks in 4 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,16.97 kiB,8.49 kiB
Shape,"(8690, 2)","(8690, 1)"
Dask graph,2 chunks in 14 graph layers,2 chunks in 14 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray
"Array Chunk Bytes 16.97 kiB 8.49 kiB Shape (8690, 2) (8690, 1) Dask graph 2 chunks in 14 graph layers Data type |S1 numpy.ndarray",2  8690,

Unnamed: 0,Array,Chunk
Bytes,16.97 kiB,8.49 kiB
Shape,"(8690, 2)","(8690, 1)"
Dask graph,2 chunks in 14 graph layers,2 chunks in 14 graph layers
Data type,|S1 numpy.ndarray,|S1 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.25 MiB,1.37 MiB
Shape,"(8690, 2368, 2)","(939, 765, 2)"
Dask graph,1776 chunks in 5 graph layers,1776 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 39.25 MiB 1.37 MiB Shape (8690, 2368, 2) (939, 765, 2) Dask graph 1776 chunks in 5 graph layers Data type int8 numpy.ndarray",2  2368  8690,

Unnamed: 0,Array,Chunk
Bytes,39.25 MiB,1.37 MiB
Shape,"(8690, 2368, 2)","(939, 765, 2)"
Dask graph,1776 chunks in 5 graph layers,1776 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.25 MiB,1.37 MiB
Shape,"(8690, 2368, 2)","(939, 765, 2)"
Dask graph,1776 chunks in 6 graph layers,1776 chunks in 6 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 39.25 MiB 1.37 MiB Shape (8690, 2368, 2) (939, 765, 2) Dask graph 1776 chunks in 6 graph layers Data type bool numpy.ndarray",2  2368  8690,

Unnamed: 0,Array,Chunk
Bytes,39.25 MiB,1.37 MiB
Shape,"(8690, 2368, 2)","(939, 765, 2)"
Dask graph,1776 chunks in 6 graph layers,1776 chunks in 6 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,509.18 kiB,509.18 kiB
Shape,"(8690,)","(8690,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,,
"Array Chunk Bytes 509.18 kiB 509.18 kiB Shape (8690,) (8690,) Dask graph 1 chunks in 5 graph layers Data type",8690  1,

Unnamed: 0,Array,Chunk
Bytes,509.18 kiB,509.18 kiB
Shape,"(8690,)","(8690,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,138.75 kiB,138.75 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,
"Array Chunk Bytes 138.75 kiB 138.75 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 6 graph layers Data type",2368  1,

Unnamed: 0,Array,Chunk
Bytes,138.75 kiB,138.75 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,37.00 kiB,37.00 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,
"Array Chunk Bytes 37.00 kiB 37.00 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 6 graph layers Data type",2368  1,

Unnamed: 0,Array,Chunk
Bytes,37.00 kiB,37.00 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,37.00 kiB,37.00 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,
"Array Chunk Bytes 37.00 kiB 37.00 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 6 graph layers Data type",2368  1,

Unnamed: 0,Array,Chunk
Bytes,37.00 kiB,37.00 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 6 graph layers,1 chunks in 6 graph layers
Data type,,

Unnamed: 0,Array,Chunk
Bytes,2.31 kiB,2.31 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 2.31 kiB 2.31 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 5 graph layers Data type int8 numpy.ndarray",2368  1,

Unnamed: 0,Array,Chunk
Bytes,2.31 kiB,2.31 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.31 kiB,2.31 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 2.31 kiB 2.31 kiB Shape (2368,) (2368,) Dask graph 1 chunks in 5 graph layers Data type int8 numpy.ndarray",2368  1,

Unnamed: 0,Array,Chunk
Bytes,2.31 kiB,2.31 kiB
Shape,"(2368,)","(2368,)"
Dask graph,1 chunks in 5 graph layers,1 chunks in 5 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,157.00 MiB,5.48 MiB
Shape,"(8690, 2368)","(939, 765)"
Dask graph,1776 chunks in 7 graph layers,1776 chunks in 7 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 157.00 MiB 5.48 MiB Shape (8690, 2368) (939, 765) Dask graph 1776 chunks in 7 graph layers Data type int64 numpy.ndarray",2368  8690,

Unnamed: 0,Array,Chunk
Bytes,157.00 MiB,5.48 MiB
Shape,"(8690, 2368)","(939, 765)"
Dask graph,1776 chunks in 7 graph layers,1776 chunks in 7 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


### Check some of the p-value stats and make sure there are no NaNs

In [36]:
# Pull the p-values into memory as a NumPy array for inspection.
pvals = np.asarray(ds_lr.variant_linreg_p_value)
print("Length of set", len(pvals))
print("Some pvalues", pvals[0:10])
# `NaN < threshold` is False, so NaN p-values would be dropped silently by the
# threshold filters; report how many there are.
print("Number of NaN pvalues", int(np.isnan(pvals).sum()))

Length of set 8690
Some pvalues [[0.83498755 0.83178252 0.39806148 0.78515391]
 [0.83498755 0.83178252 0.39806148 0.78515391]
 [0.83498755 0.83178252 0.39806148 0.78515391]
 [0.48890551 0.18730565 0.2468513  0.30857111]
 [0.48890551 0.18730565 0.2468513  0.30857111]
 [0.6104458  0.04121454 0.04754996 0.00912043]
 [0.24086775 0.22143437 0.04632651 0.32791972]
 [0.61826744 0.11577944 0.07347522 0.01856175]
 [0.61826744 0.11577944 0.07347522 0.01856175]
 [0.11371496 0.17127835 0.89635393 0.26219276]]


### Filter based on 0.05 GWAS Pvalues

In [37]:
# Keep variants whose p-value is below 0.05 for at least one trait.
passes_05 = (ds_lr.variant_linreg_p_value < 0.05).any('traits')
ds_lr_05 = ds_lr.sel(variants=passes_05)
print(f"Samples: {len(ds_lr_05.samples)}  Variants: {len(ds_lr_05.variants)}")

Samples: 2368  Variants: 3408


### Filter based on 0.01 GWAS Pvalues

In [38]:
# From the 0.05 subset, keep variants whose p-value is below 0.01 for at least one trait.
passes_01 = (ds_lr_05.variant_linreg_p_value < 0.01).any('traits')
ds_lr_01 = ds_lr_05.sel(variants=passes_01)
print(f"Samples: {len(ds_lr_01.samples)}  Variants: {len(ds_lr_01.variants)}")

Samples: 2368  Variants: 1319


### Filter based on 0.001 GWAS Pvalues

In [39]:
# From the 0.01 subset, keep variants whose p-value is below 0.001 for at least one trait.
passes_001 = (ds_lr_01.variant_linreg_p_value < 0.001).any('traits')
ds_lr_001 = ds_lr_01.sel(variants=passes_001)
print(f"Samples: {len(ds_lr_001.samples)}  Variants: {len(ds_lr_001.variants)}")

Samples: 2368  Variants: 546


### Filter based on 0.0001 GWAS Pvalues

In [40]:
# From the 0.001 subset, keep variants whose p-value is below 0.0001 for at least one trait.
passes_0001 = (ds_lr_001.variant_linreg_p_value < 0.0001).any('traits')
ds_lr_0001 = ds_lr_001.sel(variants=passes_0001)
print(f"Samples: {len(ds_lr_0001.samples)}  Variants: {len(ds_lr_0001.variants)}")

Samples: 2368  Variants: 152


### Filter based on 0.00001 GWAS Pvalues

In [41]:
# From the 0.0001 subset, keep variants whose p-value is below 0.00001 for at least one trait.
passes_00001 = (ds_lr_0001.variant_linreg_p_value < 0.00001).any('traits')
ds_lr_00001 = ds_lr_0001.sel(variants=passes_00001)
print(f"Samples: {len(ds_lr_00001.samples)}  Variants: {len(ds_lr_00001.variants)}")

Samples: 2368  Variants: 100


## Save as Numpy Arrays and pickle

In [45]:
# Encode the 0.05-threshold subset as a matrix: -1 where any allele of a call
# is masked, otherwise call_genotype summed over the ploidy axis.
call_g_mask = ds_lr_05["call_genotype_mask"].any(dim="ploidy")
call_g = xr.where(call_g_mask, -1, ds_lr_05["call_genotype"].sum(dim="ploidy"))
# Materialise as a NumPy array and transpose it.
genotypes_matrix_05 = np.transpose(call_g.values)

# Write the matrix to disk as a pickle.
with open('genotypes_matrix_05.pkl', 'wb') as pkl_out:
    pickle.dump(genotypes_matrix_05, pkl_out)