In [4]:
ROOT_DIR = '/gpfs/commons/groups/gursoy_lab/aelhussein/blockchain'
multichainLoc = ''
chainName = 'public_access_2'
datadir = f'{ROOT_DIR}/multichain'
querydir = f'{ROOT_DIR}/public/code'
metafile = f'{ROOT_DIR}/public/data/samples/metadata.csv'
annotation_path = f'{ROOT_DIR}/public/data/annotations'
personPath = f'{ROOT_DIR}/public/data/clinical/person.csv'
dataPath = f'{ROOT_DIR}/public/data/clinical/'

In [325]:
# Standard libaries
import pandas as pd
import json
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import sys
sys.path.append(f'{querydir}')
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
#Network functions
from QueryClinical import ( 
                            queryGroupDemographics,
                            domainQuery,
                            personQuery,
                            parseKeys
                          )

from QueryVariant import ( 
                            queryVariants,
                            queryPersonsChroms,
                            getPatientVariantAnnotation
                         )

from QueryCombination import (
                                extractGeneVariants,
                                queryClinicalGeneVariantRange,
                                queryVariantClinical,
                             )

from QueryAnalysis import ( 
                            queryMetadata, 
                            queryKinship,
                            querySamplePCA
                          )


## BUILD COHORT

### Harmonize genetic data

In [326]:
def harmonizeMetadata(metadata):
    """ Returns list of harmonized IDs """
    response = queryMetadata(chainName, datadir, metadata)
    metadata_list = metadata.split(',')
    ids = [value for meta in metadata_list for value in list(response[meta].values())[0]]
    meta_ids = list(set(ids))
    print(f'{len(meta_ids)} patients meet sequencing metadata criteria')
    return meta_ids

In [327]:
#Search for patients variant called with GATK and sequenced with Illumina seq machine
metadata = 'GATK,Illumina'
meta_ids = harmonizeMetadata(metadata)

654 patients meet sequencing metadata criteria


### Remove related people

In [328]:
def removeRelated(ids):
    """ Checks and removes related patients """
    if isinstance(ids, list):
        ids = ','.join(ids)
    response = queryKinship(chainName, datadir, ids)
    response_json = json.loads(response)
    kin_df = pd.DataFrame(response_json)
    unrelated = kin_df.apply(lambda col: (col == 'UR').sum() == kin_df.shape[0] - 1)
    unrelated_ids = unrelated.index.tolist()
    print(f'{len(unrelated_ids)} remaining after removing related samples')
    return unrelated_ids

In [329]:
''' THIS STEP TAKES ~1-2MINUTES BECAUSE OF THE CALCUALTION NOT DATA EXTRACTION. 
    IMPLEMENTATION OF GFAF HAS BEEN VECTORIZED BUT CAN FURTHER OPTIMIZED'''
unrelated_ids = removeRelated(meta_ids)

# EXTRACT PC'S

In [None]:
def getPCs(ids, kSearch):
    """ Get PCs for list of samples """
    kSearch = 20
    if isinstance(ids, list):
        ids = ','.join(ids)
    response = querySamplePCA(chainName, datadir, ids, kSearch)
    pc_df = pd.DataFrame(json.loads(response))
    return pc_df

In [None]:
kSearch = 20
pc_df = getPCs(unrelated_ids, kSearch)

{"V1":{"79049":-107.6406412417,"72609":-106.7401337592,"34272":-108.1433257639,"60656":-106.0768393394,"116889":-107.5028315796,"604":-109.0365535663,"51122":-107.0728843615,"28658":-107.3003521858,"68393":-108.6251817978,"37115":-109.1816715588,"55454":-106.5960878482,"64066":-108.1176930274,"20001":-108.8726405881,"96304":-108.2457738164,"68279":-107.1479907927,"5202":-108.0556257367,"57067":-107.7020722075,"77106":-106.5064513214,"29870":-109.072437311,"38913":-108.5639341595,"81768":-108.8656813728,"64009":-107.1129467284,"72577":-108.6603573982,"5002":-108.7741676457,"78089":-106.8765809933,"50636":-106.9525089115,"94551":-106.8322497175,"62814":-107.9113775425,"65605":-107.975779646,"50434":-107.4149684109,"79915":-107.741451605,"18692":-105.6965619852,"68672":-106.1904643373,"48761":-107.4758163025,"112014":-107.5495446892,"14431":-108.228587368,"83111":-107.5119780726,"7489":-106.5238295814,"4316":-107.8200491645,"41481":-108.7445910702,"111072":-109.5962044884,"92861":-107.014

# Get phenotypes

###  Age and gender

In [None]:
def getAgeGender(chainName, multichainLoc, datadir, ids):
    """ extract the age and gender of all patients """
    searchKeys = 'birth_datetime,gender_concept_id'
    demo_data = queryGroupDemographics(chainName, multichainLoc, datadir, searchKeys)
    demographics = pd.DataFrame(demo_data).T
    keys = searchKeys.split(',')
    demographics.columns = keys

    demographics['birth_datetime'] = pd.to_datetime(demographics['birth_datetime'])
    current_date = pd.to_datetime('2023-07-06')
    demographics['age'] = (current_date - demographics['birth_datetime']).dt.days / 365.25
    demo_processed = demographics[['gender_concept_id', 'age']]
    demo_processed['gender'] = demo_processed['gender_concept_id'].replace({8507:0, 8532:1})
    demo_processed.drop(columns = 'gender_concept_id', inplace = True)

    if isinstance(ids,str):
        ids = ids.split(',')

    return demo_processed.loc[ids]

In [None]:
demos = getAgeGender(chainName, multichainLoc, datadir, unrelated_ids)

### Phenotype of interest

In [None]:
def getPhenotype(pheno_id, demos):
    """ Get phenotype of interest """
    searchKeys = 'demographics' # returns basic information for patients with disease. can be changed if more complex info needed
    response = domainQuery(chainName, multichainLoc, datadir, pheno_id, searchKeys)
    data = [r['data']['json'] for r in response]
    df = pd.DataFrame(data)
    disease_ids = list(df['person_id'].unique())
    phenos = demos.copy()
    phenos['phenotype'] = 0
    phenos.loc[phenos.index.isin(disease_ids), 'phenotype']  = 1
    return phenos

In [None]:
#Phenotype here is diabetes but can be any user defined logic
pheno_id = '201826'
phenos = getPhenotype(pheno_id, demos)

# EXTRACT GENOTYPE INFORMATION

In [None]:
def getVariantDF(chrom, variants, genotype = 'all', metadata = None):
    """ Get variant in DF format """
    response = queryVariants(chainName, multichainLoc, datadir, chrom, variants, genotype, metadata)
    variants_dict = json.loads(response)
    data_for_df = []
    for variant, genotypes in variants_dict.items():
        for genotype, ids in genotypes.items():
            for id_ in ids:
                data_for_df.append({'variant': variant, 'genotype': genotype, 'id': id_})
    df = pd.DataFrame(data_for_df)
    variants_df = df.pivot(index='id', columns='variant', values='genotype').reset_index()
    variants_df.set_index('id', inplace=True)
    variants_df.replace({"0|0":0, "1|0":1, "1|1":1}, inplace = True)
    variants_df.columns = ['variant']
    variants_df.index = variants_df.index.astype(str)
    return variants_df

In [None]:
""" Extract variants of interest
    We show an example variant here to limit computation cost. In a full analysis we would pull the stored variants
    from the mapping stream and loop through them with parallelization  """
chrom = '1'
variant = '230135148'
genotype = 'all'
metadata = None
variants_df = getVariantDF(chrom, variant, genotype = 'all', metadata = None)

{"230135148":{"1|0":[60656,57067,64009,72577,5002,65605,18692,34884,2530,48671,55480,110758,42610,88191,61868,4152,82226,115157,47131,38141,82317,77258,7685,101068,108355,87600,104833,80397,44833,44507,30837,66284,89896,83951,15882,101184,89898,6753,61464,8622,102516,54213,74388,22642,82066,48820,90248,102644,101618,37947,64597,47381,45570,81257,109918,66970,117194,25609,68531,19015,78799,34461,25148,92165,40302,50930,12947,33005,88677,109551,92496,114782,111149,20802,41362,105156,117558,76080,109443,37047,31865,20146,9975,18697,56895,64768,39936,62879,13783,110617],"1|1":[51122,68279,78089,713,52844,72297,103114,15410,94436,14195,20873,81512,112725,69818,92821,12688],"0|0":["65287","63577","111969","67557","72078","70983","10581","68306","110607","67237","3631","78214","114683","22453","108457","46893","37533","108293","48473","56482","69168","45218","91234","109805","34947","64049","39382","65165","93846","58786","31346","53006","76908","94122","65293","42413","53933","71477","23684"

# RUN GWAS

In [None]:
def runGwas(pc_df, phenos, variants_df):
    #Linear mixed model with age, gender and phenotype
    covariates = pc_df.merge(phenos, left_index =True, right_index=True)
    data = covariates.merge(variants_df, left_index =True, right_index=True)
    formula = f"variant ~ phenotype + age + gender + " + ' + '.join([f'V{i}' for i in range(1,kSearch+1)])
    md = smf.ols(formula, data)
    mdf = md.fit()
    return mdf

In [None]:
mdf = runGwas(pc_df, phenos, variants_df)
mdf.summary()

0,1,2,3
Dep. Variable:,variant,R-squared:,0.096
Model:,OLS,Adj. R-squared:,0.063
Method:,Least Squares,F-statistic:,2.915
Date:,"Wed, 01 Nov 2023",Prob (F-statistic):,7.62e-06
Time:,16:24:43,Log-Likelihood:,-150.7
No. Observations:,654,AIC:,349.4
Df Residuals:,630,BIC:,457.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1329,0.056,2.368,0.018,0.023,0.243
phenotype,-0.0745,0.062,-1.208,0.228,-0.196,0.047
age,-0.0004,0.000,-0.768,0.443,-0.001,0.001
gender,0.0492,0.025,1.974,0.049,0.000,0.098
V1,6.339e-05,0.000,0.193,0.847,-0.001,0.001
V2,0.0007,0.000,4.359,0.000,0.000,0.001
V3,0.0004,0.001,0.336,0.737,-0.002,0.002
V4,-0.0001,0.000,-0.296,0.767,-0.001,0.001
V5,-0.0002,0.001,-0.335,0.738,-0.001,0.001

0,1,2,3
Omnibus:,256.808,Durbin-Watson:,2.053
Prob(Omnibus):,0.0,Jarque-Bera (JB):,709.311
Skew:,2.041,Prob(JB):,9.44e-155
Kurtosis:,6.06,Cond. No.,712.0
