Reading in the data and then filtering out SNPs which do not meet p value significance before joining the data together.

In [1]:
import polars as pl

# exp: Exposure
# out: Outcome
# ea: Exposure allele
# oa: Other allele

exp_header_dict = {
  'rsID':'rsid',
  'CHROM':'chr_exp',
  'ALT':'ea_exp',
  'REF':'oa_exp',
  'POOLED_ALT_AF':'maf_exp',
  'EFFECT_SIZE':'beta_exp',
  'SE':'se_exp',
  'pvalue':'pval_exp'
}

out_header_dict = {
  'markername':'rsid',
  'chr':'chr_out',
  'bp_hg19':'pos_out',
  'effect_allele':'ea_out',
  'noneffect_allele':'oa_out',
  'effect_allele_freq':'ea_freq',
  'beta':'beta_out',
  'se_dgc':'se_out',
  'p_dgc':'pval_out'}

pthresh = 5e-8

# Renaming columns and filtering data to only include observations which fulfill significance threshold
dexp = (pl.scan_csv("dataset/ldlc_gwas.txt",separator="\t")
        .rename(exp_header_dict)
        .filter((pl.col('pval_exp') < pthresh)))
dout = (pl.scan_csv("dataset/mi_gwas.tsv",separator="\t")
        .rename(out_header_dict))

combined = (dexp.join(dout, on='rsid')
            # Convert all data to lowercase
            .with_columns(pl.col('ea_exp').str.to_lowercase())
            .with_columns(pl.col('oa_exp').str.to_lowercase())
            .with_columns(pl.col('ea_out').str.to_lowercase())
            .with_columns(pl.col('oa_out').str.to_lowercase())
            .collect())

print(combined.shape)

(75089, 26)


Harmonizing the data.

Gathering all SNPs using fowards strand with matching effect and alternate alleles between exposure and outcome.

In [2]:
forwards_same = combined.filter(((pl.col('ea_exp') == pl.col('ea_out')) & (pl.col('oa_exp') == pl.col('oa_out'))))

Gathering all SNPs using forwards strand with flipped effect and alternate alleles between exposure and outcome. The effect is then multiplied by -1.

In [3]:
forwards_flipped = (
  combined.filter(((pl.col('ea_exp') == pl.col('oa_out')) & (pl.col('oa_exp') == pl.col('ea_out'))))
  # Flip the signs of the outcome effects
  .with_columns(
    pl.col('beta_out').mul(-1),
    pl.col('ea_freq').mul(-1).add(1)
  )
)

Flipping the outcome alleles of the remaining SNPs since the remaining valid SNPs must use the reverse strand.

In [4]:
# Find cases where alleles don't match
reverse = (combined.filter(~(((pl.col('ea_exp') == pl.col('ea_out')) & (pl.col('oa_exp') == pl.col('oa_out'))) | 
                            (((pl.col('ea_exp') == pl.col('oa_out')) & (pl.col('oa_exp') == pl.col('ea_out'))))))
            # Flipping the alleles
            .with_columns(pl.col('ea_out').str.replace('a', 't'))
            .with_columns(pl.col('ea_out').str.replace('t', 'a'))
            .with_columns(pl.col('ea_out').str.replace('g', 'c'))
            .with_columns(pl.col('ea_out').str.replace('c', 'g'))
            .with_columns(pl.col('oa_out').str.replace('a', 't'))
            .with_columns(pl.col('oa_out').str.replace('t', 'a'))
            .with_columns(pl.col('oa_out').str.replace('g', 'c'))
            .with_columns(pl.col('oa_out').str.replace('c', 'g'))
)

Gathering SNPs from reverse strand which use the same alleles for exposure and outcome.

In [5]:
reverse_same = (
  reverse.filter(((pl.col('ea_exp') == pl.col('ea_out')) & (pl.col('oa_exp') == pl.col('oa_out'))))
)

Gathering SNPs from reverse strand which flipped the effect and alternate alleles. We then multiply the effect by -1.

In [6]:
reverse_flipped = (
  # Find all reversed cases
  reverse.filter(((pl.col('ea_exp') == pl.col('oa_out')) & (pl.col('oa_exp') == pl.col('ea_out'))))
  # Flip the signs of the outcome effects
  .with_columns(
    pl.col('beta_out').mul(-1),
    pl.col('ea_freq').mul(-1).add(1)
  )
)

Combining all the different cases into one dataframe.

In [7]:
print(forwards_same.shape)
print(forwards_flipped.shape)
print(reverse_same.shape)
print(reverse_flipped.shape)

# Combining all SNPs
total = pl.concat([forwards_same, forwards_flipped, reverse_same, reverse_flipped])

(8679, 26)
(66342, 26)
(1, 26)
(6, 26)


Discarding all palindromic SNPs since we are unable to determine whether the effect and outcome alleles are matching.

In [8]:
total = total.filter(
  ~(((pl.col('ea_exp') == 'a') & (pl.col('oa_exp') == 't')) |
  ((pl.col('ea_exp') == 't') & (pl.col('oa_exp') == 'a')) |
  ((pl.col('ea_exp') == 'g') & (pl.col('oa_exp') == 'c')) |
  ((pl.col('ea_exp') == 'c') & (pl.col('oa_exp') == 'g')))
)

# maf_threshold = 0.6

# palindromic = total.filter(((pl.col('ea_exp') == 'a') & (pl.col('oa_exp') == 't')) |
#   ((pl.col('ea_exp') == 't') & (pl.col('oa_exp') == 'a')) |
#   ((pl.col('ea_exp') == 'g') & (pl.col('oa_exp') == 'c')) |
#   ((pl.col('ea_exp') == 'c') & (pl.col('oa_exp') == 'g')))

# # If ea_freq is greater than threshold then we keep the SNP
# # If ea_freq is less than 1-threshold, we keep SNP after negating the effect

# print(palindromic.select(['ea_exp', 'oa_exp', 'ea_out', 'oa_out']))


Alternatively, we can use the provided harmonize function.

In [11]:
from MR.harmonize import harmonize

total = harmonize(combined)
print(total.shape)

(63712, 26)


Pruning the data based on Linkage Disequilibirum (LD).

In [12]:
from MR.ld import ld_clump

pruned_rsids = ld_clump(total['rsid'], total['pval_exp'])

processed_data = (total.join(pruned_rsids, on='rsid'))

Calculate the causal effect

In [13]:
from MR.calculate_effect import calculate_effect, methods
print(processed_data.shape)

print(methods)

print('IVW')
result = calculate_effect(processed_data, 'ivw')
print(f'Effect: {result["effect"]}')
print(f'se: {result["se"]}')

print('Wald ratio')
result = calculate_effect(processed_data, 'wald ratio')
print(f'Effect: {result["effect"]}')
print(f'se: {result["se"]}')




(444, 26)
['iwv', 'wald ratio']
IVW
Effect: 0.5666555763511523
se: 0.024475735763108256
Wald ratio
Effect: 0.5137645701874517
se: 0.8410924885983558
