In [1]:
# Root of the project tree on the shared filesystem; every path below is derived from it.
ROOT_DIR = '/gpfs/commons/groups/gursoy_lab/aelhussein/blockchain'

# Chain settings handed to the query helpers (see the getAgeGender call further down).
# NOTE(review): multichainLoc is an empty string here -- presumably a path prefix for the
# MultiChain executables; confirm against the query modules.
multichainLoc = ''
chainName = 'public_access_4'
datadir = ROOT_DIR + '/multichain'

# Directory appended to sys.path by the import cell so QueryParse / QueryClinical can be imported.
querydir = ROOT_DIR + '/public/code'

# Input data locations (not referenced directly by the cells visible in this notebook).
metafile = ROOT_DIR + '/public/data/samples/metadata.csv'
annotation_path = ROOT_DIR + '/public/data/annotations'
personPath = ROOT_DIR + '/public/data/clinical/person.csv'
dataPath = ROOT_DIR + '/public/data/clinical/'  # trailing slash is intentional

In [16]:
# Standard libraries
import pandas as pd
import json
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import sys
# Make the project's query modules (QueryParse, QueryClinical) importable.
# Guarded so that re-running this cell does not pile up duplicate sys.path entries.
if querydir not in sys.path:
    sys.path.append(querydir)
import warnings
# Suppress FutureWarning messages for the rest of the session to keep cell output readable.
warnings.simplefilter(action='ignore', category=FutureWarning)
#Network functions
from QueryParse import (
                        harmonizeMetadata,
                        removeRelated,
                        getPCs,
                        getAgeGender,
                        getPhenotype,
                        getVariantDF,
                        runGwas
                          )

from QueryClinical import ( 
                            queryGroupDemographics,
                            domainQuery,
                            personQuery,
                            parseKeys
                          )


# BUILD COHORT

### Harmonize genetic data

In [3]:
# Select patients whose variants were called with GATK and whose samples were sequenced on an
# Illumina machine. The criteria are passed to the helper as a single comma-separated string.
# NOTE(review): presumably every comma-separated term must match for a sample to be kept --
# confirm in QueryParse.harmonizeMetadata.
metadata = 'GATK,Illumina'
meta_ids = harmonizeMetadata(metadata)

654 patients meet sequencing metadata criteria


### Remove related people

In [4]:
# NOTE: this step takes ~1-2 minutes because of the calculation, not the data extraction.
# The implementation of GFAF has been vectorized but can be further optimized.
unrelated_ids = removeRelated(meta_ids)

654 remaining after removing related samples


# EXTRACT PCs

In [5]:
# kSearch is passed to getPCs here and again to runGwas in the final cell.
# NOTE(review): presumably the number of principal components to retrieve and later use as
# covariates -- confirm against QueryParse.getPCs.
kSearch = 20
pc_df = getPCs(unrelated_ids, kSearch)

{"V1":{"34272":-1.310507281,"60656":-1.2854650929,"604":-1.321331634,"51122":-1.2975354101,"68393":-1.3163465303,"37115":-1.3230902094,"55454":-1.2917574733,"64066":-1.3101966572,"68279":-1.2984455683,"57067":-1.3051600624,"38913":-1.3156043164,"81768":-1.319260963,"64009":-1.2980208957,"72577":-1.3167727968,"78089":-1.2951565579,"94551":-1.2946193407,"62814":-1.3076964757,"65605":-1.3084769161,"50434":-1.3016808684,"18692":-1.2808567988,"68672":-1.2868420284,"48761":-1.302418238,"7489":-1.29088183,"4316":-1.306589736,"41481":-1.3177935611,"111072":-1.3281136208,"109029":-1.2827567338,"30255":-1.3146412879,"59146":-1.2970322079,"50060":-1.3085235369,"74356":-1.3041109738,"10739":-1.3363613661,"2530":-1.2944502141,"7870":-1.283937206,"102825":-1.3738247217,"69585":-1.3375268265,"58795":-1.3417567999,"93451":-1.357263976,"59067":-1.3454848796,"55480":-1.2576204273,"37803":-1.3081228992,"713":-1.3411958581,"14530":-1.3388582818,"9807":-1.3338731972,"21336":-1.3467795877,"63897":-1.3286772

# Get phenotypes

### Age and gender

In [13]:
# Look up age and gender for the unrelated cohort, using the chain name, MultiChain location and
# data directory defined in the configuration cell.
# NOTE(review): the shape/columns of the returned object are not visible here -- confirm in QueryParse.
demos = getAgeGender(chainName, multichainLoc, datadir, unrelated_ids)

### Phenotype of interest

In [None]:
# The phenotype here is diabetes, but any user-defined logic can be substituted.
# NOTE(review): '201826' looks like a clinical concept ID for the diabetes condition -- confirm
# which vocabulary it comes from.
pheno_id = '201826'
# demos (the age/gender result from the previous cell) is handed to the helper;
# presumably the phenotype is attached to those records -- confirm in QueryParse.getPhenotype.
phenos = getPhenotype(pheno_id, demos)

# EXTRACT GENOTYPE INFORMATION

In [None]:
""" Extract variants of interest
    We show an example variant here to limit computation cost. In a full analysis we would pull the stored variants
    from the mapping stream and loop through them with parallelization  """
# Query parameters for the single example variant (chromosome and position are passed as strings).
chrom = '1'
variant = '111489509'
# NOTE(review): 'all' / None presumably mean "no genotype filter" / "no metadata filter" -- confirm
# against QueryParse.getVariantDF.
genotype = 'all'
# NOTE: this rebinds `metadata`, replacing the 'GATK,Illumina' filter string assigned in the
# cohort-building cell; re-run that cell before reusing the earlier value.
metadata = None
# Pass the variables declared above (instead of repeating the literals) so that editing them
# actually changes the query.
variants_df = getVariantDF(chrom, variant, genotype=genotype, metadata=metadata)

{"230135148":{"1|0":[60656,57067,64009,72577,5002,65605,18692,34884,2530,48671,55480,110758,42610,88191,61868,4152,82226,115157,47131,38141,82317,77258,7685,101068,108355,87600,104833,80397,44833,44507,30837,66284,89896,83951,15882,101184,89898,6753,61464,8622,102516,54213,74388,22642,82066,48820,90248,102644,101618,37947,64597,47381,45570,81257,109918,66970,117194,25609,68531,19015,78799,34461,25148,92165,40302,50930,12947,33005,88677,109551,92496,114782,111149,20802,41362,105156,117558,76080,109443,37047,31865,20146,9975,18697,56895,64768,39936,62879,13783,110617],"1|1":[51122,68279,78089,713,52844,72297,103114,15410,94436,14195,20873,81512,112725,69818,92821,12688],"0|0":["10169","49008","81768","62883","34769","62327","28228","24190","21929","116677","86006","52894","104660","46897","101511","27561","84983","61068","91436","93148","72504","59153","85012","66966","86387","111969","72639","30340","24896","42413","440","85580","6801","7237","61126","54873","24633","107147","116868","6

# RUN GWAS

In [None]:
# Fit the association model from the PCs, the phenotype table and the variant genotypes.
# The rendered summary below is an OLS regression table with `variant` as the dependent variable.
# NOTE(review): kSearch presumably controls how many PCs enter the model as covariates -- confirm.
mdf = runGwas(pc_df, phenos, variants_df, kSearch)
# Last expression in the cell, so the notebook displays the fitted model's summary table.
mdf.summary()

0,1,2,3
Dep. Variable:,variant,R-squared:,0.096
Model:,OLS,Adj. R-squared:,0.063
Method:,Least Squares,F-statistic:,2.915
Date:,"Wed, 01 Nov 2023",Prob (F-statistic):,7.62e-06
Time:,17:45:29,Log-Likelihood:,-150.7
No. Observations:,654,AIC:,349.4
Df Residuals:,630,BIC:,457.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1329,0.056,2.368,0.018,0.023,0.243
phenotype,-0.0745,0.062,-1.208,0.228,-0.196,0.047
age,-0.0004,0.000,-0.768,0.443,-0.001,0.001
gender,0.0492,0.025,1.974,0.049,0.000,0.098
V1,6.339e-05,0.000,0.193,0.847,-0.001,0.001
V2,0.0007,0.000,4.359,0.000,0.000,0.001
V3,0.0004,0.001,0.336,0.737,-0.002,0.002
V4,-0.0001,0.000,-0.296,0.767,-0.001,0.001
V5,-0.0002,0.001,-0.335,0.738,-0.001,0.001

0,1,2,3
Omnibus:,256.808,Durbin-Watson:,2.053
Prob(Omnibus):,0.0,Jarque-Bera (JB):,709.311
Skew:,2.041,Prob(JB):,9.44e-155
Kurtosis:,6.06,Cond. No.,712.0
