Reading in the data and then filtering out SNPs which do not meet p value significance before joining the data together.

In [1]:
%load_ext autoreload
%autoreload 2

import polars as pl

# exp: Exposure
# out: Outcome
# ea: Exposure allele
# oa: Other allele

exp_header_dict = {
  'rsID':'rsid',
  'CHROM':'chr_exp',
  'ALT':'ea_exp',
  'REF':'oa_exp',
  'POOLED_ALT_AF':'eaf_exp',
  'EFFECT_SIZE':'beta_exp',
  'SE':'se_exp',
  'pvalue':'pval_exp'
}

out_header_dict = {
  'markername':'rsid',
  'chr':'chr_out',
  'bp_hg19':'pos_out',
  'effect_allele':'ea_out',
  'noneffect_allele':'oa_out',
  'effect_allele_freq':'eaf_out',
  'beta':'beta_out',
  'se_dgc':'se_out',
  'p_dgc':'pval_out'}

pthresh = 5e-8

# Renaming columns and filtering data to only include observations which fulfill significance threshold
dexp = (pl.scan_csv("dataset/ldlc_gwas.txt",separator="\t")
        .rename(exp_header_dict)
        .filter((pl.col('pval_exp') < pthresh)))
dout = (pl.scan_csv("dataset/mi_gwas.tsv",separator="\t")
        .rename(out_header_dict))

combined = (dexp.join(dout, on='rsid')
            # Convert all data to lowercase
            .with_columns(
                pl.col('ea_exp').str.to_lowercase(),
                pl.col('oa_exp').str.to_lowercase(),
                pl.col('ea_out').str.to_lowercase(),
                pl.col('oa_out').str.to_lowercase(),
            # Convert minor allele freq to effect allele freq
                pl.col('eaf_exp').mul(-1).add(1))
            .collect())

print(combined.shape)

(75089, 26)


Harmonizing the data.

Gathering all SNPs using fowards strand with matching effect and alternate alleles between exposure and outcome.

In [2]:
forwards_same = combined.filter(((pl.col('ea_exp') == pl.col('ea_out')) & (pl.col('oa_exp') == pl.col('oa_out'))))

Gathering all SNPs using forwards strand with flipped effect and alternate alleles between exposure and outcome. The effect is then multiplied by -1.

In [3]:
forwards_flipped = (
  combined.filter(((pl.col('ea_exp') == pl.col('oa_out')) & (pl.col('oa_exp') == pl.col('ea_out'))))
  .with_columns(
    # Flip the signs of the outcome effects
    pl.col('beta_out').mul(-1),
    # Flip the effect allele frequency
    pl.col('eaf_out').mul(-1).add(1)
  )
).rename({
    'oa_out': 'ea_out',
    'ea_out': 'oa_out',
})

Flipping the outcome alleles of the remaining SNPs since the remaining valid SNPs must use the reverse strand.

In [4]:
# Find cases where alleles don't match
reverse = (combined.filter(~(((pl.col('ea_exp') == pl.col('ea_out')) & (pl.col('oa_exp') == pl.col('oa_out'))) | 
                            (((pl.col('ea_exp') == pl.col('oa_out')) & (pl.col('oa_exp') == pl.col('ea_out'))))))
            # Flipping the alleles
            .with_columns(pl.col('ea_out').str.replace('a', 't'))
            .with_columns(pl.col('ea_out').str.replace('t', 'a'))
            .with_columns(pl.col('ea_out').str.replace('g', 'c'))
            .with_columns(pl.col('ea_out').str.replace('c', 'g'))
            .with_columns(pl.col('oa_out').str.replace('a', 't'))
            .with_columns(pl.col('oa_out').str.replace('t', 'a'))
            .with_columns(pl.col('oa_out').str.replace('g', 'c'))
            .with_columns(pl.col('oa_out').str.replace('c', 'g'))
)

Gathering SNPs from reverse strand which use the same alleles for exposure and outcome.

In [5]:
reverse_same = (
  reverse.filter(((pl.col('ea_exp') == pl.col('ea_out')) & (pl.col('oa_exp') == pl.col('oa_out'))))
)

Gathering SNPs from reverse strand which flipped the effect and alternate alleles. We then multiply the effect by -1.

In [6]:
reverse_flipped = (
  # Find all reversed cases
  reverse.filter(((pl.col('ea_exp') == pl.col('oa_out')) & (pl.col('oa_exp') == pl.col('ea_out'))))
  .with_columns(
    # Flip the signs of the outcome effects
    pl.col('beta_out').mul(-1),
    # Flip the effect allele frequency
    pl.col('eaf_out').mul(-1).add(1)
  )
).rename({
    'oa_out': 'ea_out',
    'ea_out': 'oa_out',
})

Combining all the different cases into one dataframe.

In [7]:
print(forwards_same.shape)
print(forwards_flipped.shape)
print(reverse_same.shape)
print(reverse_flipped.shape)

# Combining all SNPs
total = pl.concat([forwards_same[sorted(forwards_same.columns)],
                    forwards_flipped[sorted(forwards_flipped.columns)], 
                    reverse_same[sorted(reverse_same.columns)], 
                    reverse_flipped[sorted(reverse_flipped.columns)]])

total.write_csv('dataset/total.csv', separator='\t')

(8679, 26)
(66342, 26)
(1, 26)
(6, 26)


Dealing with palindromic SNPs. These are troublesome because we are unable to determine whether they are reverse strand or using the different effect and alternate alleles. Thus, we have to infer from the effect allele frequency.

In [8]:
threshold = 0.08

palindromic = total.filter(((pl.col('ea_exp') == 'a') & (pl.col('oa_exp') == 't')) |
  ((pl.col('ea_exp') == 't') & (pl.col('oa_exp') == 'a')) |
  ((pl.col('ea_exp') == 'g') & (pl.col('oa_exp') == 'c')) |
  ((pl.col('ea_exp') == 'c') & (pl.col('oa_exp') == 'g')))

total = total.filter(
  ~(((pl.col('ea_exp') == 'a') & (pl.col('oa_exp') == 't')) |
  ((pl.col('ea_exp') == 't') & (pl.col('oa_exp') == 'a')) |
  ((pl.col('ea_exp') == 'g') & (pl.col('oa_exp') == 'c')) |
  ((pl.col('ea_exp') == 'c') & (pl.col('oa_exp') == 'g')))
)

# For all SNPs where effect allele freq is greater than threshold + 0.5 and minor allele freq is less than 0.5 - threshold, flip the beta effect.
# For all SNPs where both effect ellele freq and minor allele freq are greater than 0.5 + threshold or less than 0.5 - threshold, change nothing.
# Otherwise, discard the SNP.

correct_palindromic = palindromic.filter((((pl.col('eaf_exp') > 0.5 + threshold) & (pl.col('eaf_out') > 0.5 + threshold)) | 
                              ((pl.col('eaf_exp') < 0.5 - threshold) & (pl.col('eaf_out') < 0.5 - threshold))))

flipped_palindromic =  (palindromic.filter(((pl.col('eaf_exp') > 0.5 + threshold) & (pl.col('eaf_out') < 0.5 - threshold)))
  .with_columns(
      # Flip the signs of the outcome effects
      pl.col('beta_out').mul(-1)
  )
)

total = pl.concat([total, correct_palindromic, flipped_palindromic])

print(total.shape)

(73098, 26)


Alternatively, we can use the provided harmonize function.

In [9]:
from MR.harmonize import harmonize

total = harmonize(combined)
print(total.shape)

(73759, 26)


Pruning the data based on Linkage Disequilibirum (LD).

In [10]:
from MR.ld import ld_clump

pruned_rsids = ld_clump(total['rsid'], total['pval_exp'])

processed_data = (total.join(pruned_rsids, on='rsid'))

Calculating causal effects.

In [36]:
from MR.calculate_effect import calculate_effect, methods

print(processed_data.shape)
print(methods)
print()

print('Inverse Variance Weighted')
result = calculate_effect(processed_data, 'inverse_variance_weighted')
print(f'Effect: {result["effect"]}')
print(f'se: {result["se"]}\n')

print('Wald ratio')
result = calculate_effect(processed_data, 'wald_ratio')
print(f'Effect: {result["effect"]}')
print(f'se: {result["se"]}\n')

print('Simple Median')
result = calculate_effect(processed_data, 'simple_median')
print(f'Effect: {result["effect"]}')
print(f'se: {result["se"]}\n')

print('Weighted Median')
result = calculate_effect(processed_data, 'weighted_median')
print(f'Effect: {result["effect"]}')
print(f'se: {result["se"]}\n')

print('Penalised Weighted Median')
result = calculate_effect(processed_data, 'penalised_weighted_median')
print(f'Effect: {result["effect"]}')
print(f'se: {result["se"]}\n')

print('Egger Regression')
result = calculate_effect(processed_data, 'egger_regression')
print(f'Effect: {result["effect"]}')
print(f'se: {result["se"]}\n')

print('Simple Mode')
result = calculate_effect(processed_data, 'simple_mode')
print(f'Effect: {result["effect"]}')
print(f'se: {result["se"]}\n')

print('Weighted Mode')
result = calculate_effect(processed_data, 'weighted_mode')
print(f'Effect: {result["effect"]}')
print(f'se: {result["se"]}\n')

# print('Penalised Weighted Mode')
# result = calculate_effect(processed_data, 'penalised_weighted_mode')
# print(f'Effect: {result["effect"]}')
# print(f'se: {result["se"]}\n')

print('Maximum Likelihood')
result = calculate_effect(processed_data, 'maximum_likelihood')
print(f'Effect: {result["effect"]}')
print(f'se: {result["se"]}\n')


(448, 26)
['inverse_variance_weighted', 'wald_ratio', 'maximum_likelihood', 'simple_median', 'weighted_median', 'penalised_weighted_median', 'simple_mode', 'weighted_mode', 'penalised_weighted_mode', 'egger_regression', 'presso']

Inverse Variance Weighted
Effect: 0.3885336590849115
se: 0.024339255184504666

Wald ratio
Effect: shape: (448,)
Series: 'beta_out' [f64]
[
	2.264208
	-1.967468
	1.962896
	0.318726
	-0.417966
	-0.137194
	1.815582
	0.922458
	0.470684
	-3.331028
	0.368439
	-0.211527
	…
	0.627907
	-1.367675
	-2.167526
	0.537657
	-0.694712
	-0.488811
	-0.32226
	-0.507765
	-0.952006
	0.712864
	-1.021774
	-0.183355
	-0.693967
]
se: shape: (448,)
Series: 'se_out' [f64]
[
	1.353386
	0.789001
	1.485588
	0.800577
	0.789399
	0.998637
	0.909066
	0.587375
	0.441421
	0.70752
	0.739068
	1.291699
	…
	0.542865
	1.086411
	0.762293
	1.093857
	0.190792
	1.11468
	0.298454
	1.180086
	0.173793
	1.568221
	0.267013
	0.787796
	0.979388
]

Simple Median
Effect: 0.5104459011676726
se: 0.05363689114768657