In [17]:
from pysam import VariantFile
from collections import defaultdict 
from collections import Counter 
import pandas as pd 
import bioframe as bf

def search_omim(rec):
    bases = rec.info.get('CLNDISDB')
    if bases is None:
        return []
    omim_ids = []
    for b in bases:
        if ":" in b:
            name, ind = b.split(":", maxsplit=1)
            if name == 'OMIM':
                omim_name = ind.split("|")[0]
                omim_ids.append(omim_name)
    return omim_ids
    
def parse_clinvar(inpath: str):
    bcf_in = VariantFile(inpath)
    lines = []

    filter_out_stats = ('_single_submitter', '_conflicting_interpretations') # remove questionable entries  
    for rec in bcf_in.fetch():
        omim_ids = search_omim(rec)
        if len(omim_ids) == 0:
            omim_ids = None
        else:
            omim_ids = ";".join(omim_ids)
        if rec.alts is not None:
            alt = rec.alts[0] # only one alt for record in clinvar 
        else:
            alt = None
        sign = rec.info.get('CLNSIG')
        if sign:
            sign = ";".join(sign)
        gene_info = rec.info.get('GENEINFO')
        consequences =  rec.info.get('MC', [])
        consequences = ";".join(consequences)
        filter_out = False
        possible_stats = rec.info.get('CLNREVSTAT')
        if possible_stats:
            stats = ";".join(possible_stats)
            for s in possible_stats:
                if s in filter_out_stats:
                    filter_out = True
        else:
            stats = None
        if not filter_out:
            lines.append( (rec.chrom, rec.start, rec.stop, rec.ref, alt, rec.id, omim_ids, gene_info, consequences, sign, stats))
    return pd.DataFrame(lines, columns=["chrom", 
                                        "start", 
                                        "end",
                                        "ref", 
                                        "alt", 
                                        "clinvar_id", 
                                        "omim_ids", 
                                        "gene_info", 
                                        "consequences",
                                        "significance",
                                        "stats"])                

In [2]:
!rm -f clinvar_20240107.vcf
!wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/weekly/clinvar_20240107.vcf.gz
!gunzip clinvar_20240107.vcf.gz
!ls

--2024-03-28 14:55:09--  https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/weekly/clinvar_20240107.vcf.gz
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 130.14.250.10, 130.14.250.7, 2607:f220:41e:250::7, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|130.14.250.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 91330038 (87M) [application/x-gzip]
Saving to: ‘clinvar_20240107.vcf.gz.1’


2024-03-28 14:55:14 (21.4 MB/s) - ‘clinvar_20240107.vcf.gz.1’ saved [91330038/91330038]

clinvar_20240107.vcf	   clinvar.ipynb
clinvar_20240107.vcf.gz.1  clinvar_significance_mapping.txt
clinvar_cleaned.tsv


In [3]:
clinvar = parse_clinvar("clinvar_20240107.vcf")

[W::vcf_parse] Contig '1' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '2' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '3' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '4' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '5' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '6' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '7' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '8' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '9' is not defined in the header. (Quick workaround: index the file with tabix.)
[W::vcf_parse] Contig '10' is not defined in the header. (Quick workaroun

In [4]:
cnt = Counter([x for y in clinvar['stats'][~pd.isnull(clinvar['stats'])].str.split(";") for x in y])

In [5]:
cnt.most_common()

[('criteria_provided', 352265),
 ('_multiple_submitters', 352265),
 ('_no_conflicts', 352265),
 ('no_assertion_criteria_provided', 50305),
 ('reviewed_by_expert_panel', 15557),
 ('no_assertion_provided', 10289),
 ('no_interpretation_for_the_single_variant', 719),
 ('practice_guideline', 51)]

In [6]:
AUTOSOMES = set(map(str, range(1, 23)))
clinvar['chrom'] = clinvar['chrom'].apply(lambda x: f"chr{x}" if x in AUTOSOMES or x in ('X', 'Y') else "chrM" if x == "MT" else x)


In [7]:
clinvar = clinvar[~pd.isnull(clinvar['significance'])].copy()

In [8]:
clinvar.to_csv("clinvar_cleaned.tsv", sep="\t", index=None)

In [9]:
with open("clinvar_significance_mapping.txt") as inp:
     significance_mapping = dict(line.rstrip().rsplit(";", 1) for line in inp)

In [10]:
clinvar['significance'] = clinvar['significance'].apply(lambda x: significance_mapping[x])

In [14]:
clinvar = clinvar[clinvar['significance'] != "none"]

In [33]:
exons = pd.read_table("exons.bed")

In [43]:
clinvar_no_exons = bf.setdiff(clinvar, exons)

In [44]:
clinvar_no_exons.shape

(36377, 11)

In [46]:
clinvar_no_exons.to_csv("clinvar_reg.tsv", sep="\t", index=None)