In [1]:
import hail as hl
# Initialise Hail with default settings. This brings up the Spark backend and
# starts writing a Hail log file (see the banner printed below the cell).
hl.init()

Running on Apache Spark version 2.4.4
SparkUI available at http://10.1.1.170:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.35-bd2a3a9fe07c
LOGGING: writing to /home/brendan/code/VariantSpark/dev-notebooks/hail-20210103-1502-0.2.35-bd2a3a9fe07c.log


In [2]:
from hail.plot import show
from pprint import pprint
# Send plot output to notebook cells so that `show(...)` renders figures inline.
hl.plot.output_notebook()

In [3]:
# Load the block-gzipped VCF of genotypes as a Hail MatrixTable
# (rows = variants, columns = samples).
vcf_path = '../data/hipsterIndex/hipster.vcf.bgz'
data = hl.import_vcf(vcf_path)

In [4]:
# Print the schema of the imported VCF (global, column and row fields).
data.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AA: str, 
        AC: array<int32>, 
        AF: array<float64>, 
        AFR_AF: array<float64>, 
        AMR_AF: array<float64>, 
        AN: int32, 
        CIEND: array<int32>, 
        CIPOS: array<int32>, 
        CS: str, 
        DP: int32, 
        EAS_AF: array<float64>, 
        END: int32, 
        EUR_AF: array<float64>, 
        EX_TARGET: bool, 
        IMPRECISE: bool, 
        MC: array<str>, 
        MEINFO: array<str>, 
        MEND: int32, 
        MLEN: int32, 
        MSTART: int32, 
        MULTI_ALLELIC: bool, 
        NS: int32, 
        SAS_AF: array<float64>, 
        SVLEN: array<int32>, 
        SVTYPE: str, 
        TSD: str, 
      

In [5]:
# Read the comma-delimited per-sample label file. `label` and `score` are
# parsed as numeric columns; any other column is loaded as a string.
labels_path = '../data/hipsterIndex/hipster_labels.txt'
column_types = {'label': 'int64', 'score': 'float64'}
labels_unkeyed = hl.import_table(labels_path, delimiter=',', types=column_types)

# Key the table by the sample identifier column so rows can be looked up by sample.
labels = labels_unkeyed.key_by('samples')

2021-01-03 15:02:42 Hail: INFO: Reading table with no type imputation
  Loading column 'samples' as type 'str' (type not specified)
  Loading column 'score' as type 'float64' (user-specified)
  Loading column 'label' as type 'int64' (user-specified)



In [6]:
# Print the schema and key of the labels table.
labels.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'samples': str 
    'score': float64 
    'label': int64 
----------------------------------------
Key: ['samples']
----------------------------------------


In [7]:
# Look up each sample's row in the keyed `labels` table by sample ID (`s`) and
# attach it to the matrix table as a `label` column field.
sample_labels = labels[data.s]
mt = data.annotate_cols(label=sample_labels)

# Confirm the new column field appears in the schema.
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'label': struct {
        score: float64, 
        label: int64
    }
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AA: str, 
        AC: array<int32>, 
        AF: array<float64>, 
        AFR_AF: array<float64>, 
        AMR_AF: array<float64>, 
        AN: int32, 
        CIEND: array<int32>, 
        CIPOS: array<int32>, 
        CS: str, 
        DP: int32, 
        EAS_AF: array<float64>, 
        END: int32, 
        EUR_AF: array<float64>, 
        EX_TARGET: bool, 
        IMPRECISE: bool, 
        MC: array<str>, 
        MEINFO: array<str>, 
        MEND: int32, 
        MLEN: int32, 
        MSTART: int32, 
        MULTI_ALLELIC: bool, 
        NS: int32, 
        SAS_AF: array<float64>, 
   

In [8]:
# Dataset dimensions as (number of rows i.e. variants, number of columns i.e. samples).
mt.count()

2021-01-03 15:02:43 Hail: INFO: Coerced almost-sorted dataset


(17010, 2504)

In [9]:
# Per-variant linear regression of the per-sample score on the number of
# alternate alleles carried at that variant.
phenotype = mt.label.score
alt_allele_count = mt.GT.n_alt_alleles()
intercept_only = [1.0]  # constant covariate, i.e. the intercept term

gwas = hl.linear_regression_rows(
    y=phenotype,
    x=alt_allele_count,
    covariates=intercept_only,
)

2021-01-03 15:02:45 Hail: INFO: Coerced almost-sorted dataset
2021-01-03 15:02:46 Hail: INFO: linear_regression_rows: running on 2504 samples for 1 response variable y,
    with input variable x, and 1 additional covariate...


In [10]:
# Preview the first rows of the per-variant regression results
# (beta, standard error, t-statistic and p-value per locus).
gwas.show()

locus,alleles,n,sum_x,y_transpose_x,beta,standard_error,t_stat,p_value
locus<GRCh37>,array<str>,int32,float64,float64,float64,float64,float64,float64
2:109511398,"[""G"",""A""]",2504,4.0,35.0,-0.703,1.62,-0.434,0.664
2:109511454,"[""C"",""A""]",2504,1.0,11.5,2.05,3.24,0.633,0.527
2:109511463,"[""G"",""A""]",2504,147.0,1300.0,-0.579,0.26,-2.23,0.0261
2:109511467,"[""GACTC"",""G""]",2504,547.0,5060.0,-0.207,0.142,-1.45,0.147
2:109511478,"[""C"",""T""]",2504,1.0,12.0,2.55,3.24,0.788,0.431
2:109511497,"[""G"",""T""]",2504,1.0,10.5,1.05,3.24,0.324,0.746
2:109511525,"[""G"",""GAATT""]",2504,13.0,99.5,-1.81,0.899,-2.01,0.0445
2:109511527,"[""A"",""C""]",2504,1.0,5.5,-3.95,3.24,-1.22,0.222
2:109511532,"[""A"",""G""]",2504,1.0,6.5,-2.95,3.24,-0.912,0.362
2:109511579,"[""C"",""G""]",2504,1.0,9.5,0.0481,3.24,0.0149,0.988


In [11]:
# Manhattan plot of the per-variant association p-values from the regression.
# A title is set so the figure can be understood on its own when the notebook
# is skimmed or the plot is exported.
p = hl.plot.manhattan(
    gwas.p_value,
    title='Hipster index GWAS: score ~ number of alternate alleles',
)
show(p)