Read in the exposure and outcome GWAS data, keep only the exposure SNPs whose p-value meets the significance threshold, and then join the two datasets on rsID.

In [1]:
%load_ext autoreload
%autoreload 2

import polars as pl

# Naming glossary for the renamed columns:
#   *_exp -> exposure dataset      *_out -> outcome dataset
#   ea    -> effect allele         oa    -> other allele

# Exposure file header -> harmonised column name.
exp_header_dict = dict(
    rsID='rsid',
    CHROM='chr_exp',
    ALT='ea_exp',
    REF='oa_exp',
    POOLED_ALT_AF='eaf_exp',
    EFFECT_SIZE='beta_exp',
    SE='se_exp',
    pvalue='pval_exp',
)

# Outcome file header -> harmonised column name.
out_header_dict = dict(
    markername='rsid',
    chr='chr_out',
    bp_hg19='pos_out',
    effect_allele='ea_out',
    noneffect_allele='oa_out',
    effect_allele_freq='eaf_out',
    beta='beta_out',
    se_dgc='se_out',
    p_dgc='pval_out',
)

# Significance cut-off applied to the exposure p-values only.
pthresh = 5e-8

# Both files are scanned lazily; nothing is read until .collect() below.
# Exposure: rename the columns, then keep rows with pval_exp below the cut-off.
dexp = (
    pl.scan_csv("dataset/ldlc_gwas.txt", separator="\t")
    .rename(exp_header_dict)
    .filter(pl.col('pval_exp') < pthresh)
)
# Outcome: rename only, no p-value filter.
dout = pl.scan_csv("dataset/mi_gwas.tsv", separator="\t").rename(out_header_dict)

# Allele columns whose letters are normalised to lowercase.
allele_cols = ['ea_exp', 'oa_exp', 'ea_out', 'oa_out']

combined = (
    dexp.join(dout, on='rsid')
    .with_columns(
        # Lowercase the four allele columns (not every column).
        pl.col(allele_cols).str.to_lowercase(),
        # Replace eaf_exp with 1 - eaf_exp.
        # NOTE(review): described originally as converting a minor-allele
        # frequency to the effect-allele frequency, but the source column is
        # POOLED_ALT_AF and ALT is mapped to ea_exp -- confirm the flip is
        # actually wanted.
        pl.col('eaf_exp') * -1 + 1,
    )
    .collect()
)

print(combined.shape)

(75089, 26)


We can use the provided harmonize function.

In [2]:
from MR.harmonize import harmonize

# Harmonise the joined exposure/outcome table with the project helper.
# NOTE(review): the meaning of palindromic_action=1 and the 0.08 threshold is
# defined inside MR.harmonize -- confirm there before changing either value.
harmonize_options = dict(palindromic_action=1, palindromic_threshold=0.08)
total = harmonize(combined, **harmonize_options)
print(total.shape)

# Duplicate check: if the shape below matches the one above, every rsid in
# `total` is already unique.
deduplicated = total.unique(subset=['rsid'])
print(deduplicated.shape)

(73095, 26)
(73095, 26)


Clumping the data based on Linkage Disequilibrium (LD). This filters out SNPs that are closely correlated with one another, so that the same signal is not effectively counted more than once in the final calculation.

In [3]:
from MR.ld import ld_clump

# LD-clump the harmonised SNPs, passing the rsids and their exposure p-values.
# NOTE(review): assumes ld_clump returns a frame with an 'rsid' column, since
# it is joined on that key below -- confirm against MR.ld.
pruned_rsids = ld_clump(total.get_column('rsid'), total.get_column('pval_exp'))

# Inner join: keep only the rows of `total` whose rsid survived clumping.
processed_data = total.join(pruned_rsids, on='rsid')

Calculating causal effects.

In [21]:
from MR.calculate_effect import calculate_effect, methods

# Report the causal-effect estimate from each enabled MR method.
# The original cell repeated the same four-line stanza once per method; the
# methods are now listed once and reported by a single loop. Printed output
# and the final value of `result` (the last method run) are unchanged.
print(processed_data.shape)
print(methods)
print()

# (heading to print, method key understood by calculate_effect), in report order.
# Commented-out entries were also disabled in the original cell.
mr_reports = (
    # ('MR Presso', 'presso'),  # disabled; its original stanza printed no pval
    ('Inverse Variance Weighted', 'inverse_variance_weighted'),
    # NOTE(review): in the recorded output 'wald_ratio' returns one value per
    # SNP (a length-379 Series) rather than a single pooled estimate, which
    # floods the cell output -- consider summarising or dropping it here.
    ('Wald ratio', 'wald_ratio'),
    ('Simple Median', 'simple_median'),
    ('Weighted Median', 'weighted_median'),
    ('Penalised Weighted Median', 'penalised_weighted_median'),
    ('Egger Regression', 'egger_regression'),
    ('Simple Mode', 'simple_mode'),
    ('Weighted Mode', 'weighted_mode'),
    # ('Penalised Weighted Mode', 'penalised_weighted_mode'),  # disabled
    ('Maximum Likelihood', 'maximum_likelihood'),
)

for heading, method_key in mr_reports:
    # Heading is printed before the call, matching the original ordering.
    print(heading)
    result = calculate_effect(processed_data, method_key)
    print(f'Effect: {result["effect"]}')
    print(f'se: {result["se"]}')
    print(f'pval: {result["pval"]}\n')



(379, 26)
['inverse_variance_weighted', 'wald_ratio', 'maximum_likelihood', 'simple_median', 'weighted_median', 'penalised_weighted_median', 'simple_mode', 'weighted_mode', 'penalised_weighted_mode', 'egger_regression', 'presso']

Inverse Variance Weighted
Effect: 0.41290206324176987
se: 0.046608320255201514
pval: 0.0

Wald ratio
Effect: shape: (379,)
Series: 'beta_out' [f64]
[
	0.655268
	-0.406161
	-0.343544
	-0.695233
	0.842235
	0.467457
	0.441508
	2.329176
	0.924326
	1.049014
	-1.151759
	4.073095
	…
	0.32426
	1.527697
	0.460279
	2.382986
	-1.411832
	-0.425027
	0.263853
	0.483913
	0.85838
	1.090972
	-0.254394
	0.588553
	1.771192
]
se: shape: (379,)
Series: 'se_out' [f64]
[
	1.191913
	0.451443
	0.57623
	1.227839
	0.278493
	0.600904
	0.310076
	0.900905
	1.099366
	0.545874
	0.54861
	1.150285
	…
	1.016341
	0.931797
	1.184941
	0.852642
	0.858951
	0.745924
	0.857858
	0.838885
	1.11277
	0.88085
	1.034593
	1.130289
	0.871065
]
pval: [5.82483156e-01 3.68283422e-01 5.51046890e-01 5.71240978e-0