In [4]:
import sys
sys.path.append('..')

In [7]:
from filter_clinvar_xml import filter_xml, pprint, iterate_cvs_from_xml

from cmat.clinvar_xml_io import *
from cmat.clinvar_xml_io.xml_parsing import *

import gzip
import os
import re
import json

In [17]:
# Working directory holding every input/output file of this notebook, taken from
# the WORK_DIR environment variable (None when the variable is not set).
work_dir = os.environ.get('WORK_DIR')

In [19]:
# Gzipped ClinVar XML files for the two releases being compared
march_clinvar, june_clinvar = (
    os.path.join(work_dir, xml_name)
    for xml_name in ('march-full-clinvar.xml.gz', 'june-full-clinvar.xml.gz')
)

# Gzipped evidence files paired with the March and June runs
march_evidence, june_evidence = (
    os.path.join(work_dir, json_name)
    for json_name in ('cttv012-2025-01-23.json.gz', 'cttv012-2025-05-12.json.gz')
)

In [36]:
# Collect the studyId (RCV accession) of every 25.06 evidence string that has
# no 'diseaseFromSourceMappedId' field, i.e. no EFO mapping.
june_rcvs_no_efo = set()
with gzip.open(june_evidence, 'rt') as june_ev_file:
    # Each line of the file is parsed as one JSON evidence string; stream them
    # so the whole file is never held in memory.
    june_rcvs_no_efo.update(
        evidence['studyId']
        for evidence in map(json.loads, june_ev_file)
        if 'diseaseFromSourceMappedId' not in evidence
    )

In [37]:
len(june_rcvs_no_efo)

1093765

In [39]:
# Drop every RCV that was already unmapped (no 'diseaseFromSourceMappedId') in
# the 25.03 evidence, leaving only RCVs that are newly unmapped in 25.06.
# NOTE: this modifies june_rcvs_no_efo in place.
with gzip.open(march_evidence, 'rt') as mar_ev_file:
    june_rcvs_no_efo.difference_update(
        evidence['studyId']
        for evidence in map(json.loads, mar_ev_file)
        if 'diseaseFromSourceMappedId' not in evidence
    )

In [40]:
len(june_rcvs_no_efo)

877745

In [42]:
# Split the remaining unmapped RCVs in two:
#   - no_efo_rcvs_present_in_march: accessions that also occur in the March ClinVar XML
#   - june_rcvs_no_efo (what is left): accessions not found there, i.e. new in June
# NOTE: june_rcvs_no_efo is shrunk in place, so re-running this cell alone
# (without rebuilding june_rcvs_no_efo first) yields an empty result.
no_efo_rcvs_present_in_march = set()
for record in ClinVarDataset(march_clinvar):
    accession = record.accession
    if accession not in june_rcvs_no_efo:
        continue
    # Move the accession from the "June-only" set to the "also in March" set
    june_rcvs_no_efo.remove(accession)
    no_efo_rcvs_present_in_march.add(accession)

In [43]:
len(june_rcvs_no_efo)

101398

In [44]:
len(no_efo_rcvs_present_in_march)

776347

In [45]:
# Persist both partitions, one accession per line.
# Accessions are sorted because set iteration order is arbitrary (and varies
# between interpreter runs for strings), which would otherwise make the output
# files differ from run to run and impossible to diff.
for filename, accessions in (
    ('new_unmapped_rcvs.txt', june_rcvs_no_efo),
    ('existing_unmapped_rcvs.txt', no_efo_rcvs_present_in_march),
):
    # Plain write mode is enough: the file is never read back through this handle.
    with open(os.path.join(work_dir, filename), 'w') as f:
        f.write('\n'.join(sorted(accessions)))

* 101,398 RCVs (`june_rcvs_no_efo` -> `new_unmapped_rcvs.txt`)
    * no EFO mapping in June evidence
    * not among the RCVs without an EFO mapping in March evidence
    * present in June but not March ClinVar
* 776,347 RCVs (`no_efo_rcvs_present_in_march` -> `existing_unmapped_rcvs.txt`)
    * no EFO mapping in June evidence
    * not among the RCVs without an EFO mapping in March evidence
    * present in June **and also** March ClinVar
    * These are the ones that moved from filtered out (fatal) to no mapping (skipped)
    
The 101K "new unmapped" RCVs are of some interest to us as a lot of these get added every month to ClinVar, but for now we'll focus on the 776K that used to be filtered out and are now included with no EFO mapping.

Example: `RCV000024525`
* present in the January ClinVar release (the one used for the March submission) with the trait name "not provided"
* [modified in Feb](https://www.ncbi.nlm.nih.gov/clinvar/RCV000024525/) to include synonyms `RECLASSIFIED - ADRA2C POLYMORPHISM; RECLASSIFIED - ADRB1 POLYMORPHISM` ([medgen](https://www.ncbi.nlm.nih.gov/medgen/C3661900))

Possibly related issue: [#384](https://github.com/EBIvariation/CMAT/issues/384)

See trait annotation code [here](https://github.com/EBIvariation/CMAT/blob/master/cmat/output_generation/clinvar_to_evidence_strings.py#L407). This operates on all `traits_with_valid_names` for a record. These "reclassified" terms got added as synonyms for [`not provided`](https://www.ncbi.nlm.nih.gov/medgen/C3661900), but aren't recognised as invalid as they're not in the nonspecific trait names list - hence they show up as unmapped in the evidence.

If this is true, we can resolve it in one of two ways:
1. Add these "reclassified" terms to the nonspecific trait names list
2. Modify the annotation to exclude an unmapped trait entirely if _any_ of its synonyms appear in the nonspecific trait names list

In [48]:
from collections import Counter

In [53]:
# For both March and June ClinVars,
#   For each RCV, get its preferred_or_other_valid_name (omits "not provided" etc.) and all_names (what we attempt to map, includes everything)

def get_preferred_and_all_names(clinvar_dataset, accessions=None):
    """Tally trait names for a selected set of records in a ClinVar dataset.

    Parameters
    ----------
    clinvar_dataset : iterable
        Yields records exposing ``accession`` and ``traits_with_valid_names``.
        Each trait must expose ``preferred_or_other_valid_name`` and ``all_names``.
    accessions : collection of str, optional
        Accessions to inspect. When omitted, the notebook-level set
        ``no_efo_rcvs_present_in_march`` is used, as before.

    Returns
    -------
    tuple
        ``(no_traits_with_valid_names, preferred_names, all_names)`` where

        * ``no_traits_with_valid_names`` is the set of selected accessions whose
          ``traits_with_valid_names`` is empty,
        * ``preferred_names`` is a Counter of ``preferred_or_other_valid_name``
          over the traits of the remaining selected records,
        * ``all_names`` is a Counter of every entry of ``all_names`` for those
          same traits.

    Iteration stops as soon as the number of matching records seen equals the
    number of requested accessions, so the rest of the dataset is not read.
    """
    if accessions is None:
        accessions = no_efo_rcvs_present_in_march
    # Loop-invariant: compute the stopping target once rather than per record.
    n_targets = len(accessions)

    preferred_names = Counter()
    all_names = Counter()
    no_traits_with_valid_names = set()
    seen = 0
    for record in clinvar_dataset:
        if record.accession in accessions:
            seen += 1
            valid_traits = record.traits_with_valid_names
            if valid_traits:
                for trait in valid_traits:
                    preferred_names[trait.preferred_or_other_valid_name] += 1
                    for name in trait.all_names:
                        all_names[name] += 1
            else:
                no_traits_with_valid_names.add(record.accession)
        # Early stopping. This check is deliberately reached on every path
        # (including records without valid traits) so that no extra record is
        # pulled from the dataset once all requested accessions were found.
        if seen == n_targets:
            break
    return no_traits_with_valid_names, preferred_names, all_names

In [50]:
# Wrap each ClinVar XML path in a ClinVarDataset (March first, then June)
march_dataset, june_dataset = (
    ClinVarDataset(xml_path) for xml_path in (march_clinvar, june_clinvar)
)

In [54]:
# Tally trait names over the March ClinVar dataset
march_results = get_preferred_and_all_names(march_dataset)
march_no_valid_traits, march_preferred_names, march_all_names = march_results

In [55]:
len(march_no_valid_traits)

775329

In [56]:
march_preferred_names

Counter({'Autoimmune lymphoproliferative syndrome, type 1b': 2,
         'Epilepsy, childhood absence 2': 524,
         'Febrile seizures, familial, 8': 443,
         'Glucocorticoid resistance, relative': 1,
         'Oculopharyngeal muscular dystrophy 2': 1,
         'Myoclonic-astatic epilepsy': 124,
         'Developmental and epileptic encephalopathy, 74': 1,
         'Distal Renal Tubular Acidosis, Recessive': 1,
         'POLR3-related leukodystrophy': 26,
         'Leukoencephalopathy-ataxia-hypodontia-hypomyelination syndrome': 301,
         'Hypogonadotropic hypogonadism 7 with or without anosmia': 3,
         'Hypomyelinating leukodystrophy 8 with or without oligodontia and-or hypogonadotropic hypogonadism': 9,
         'Charcot-Marie-Tooth disease, demyelinating, IIA 1I': 4,
         'Neonatal pseudo-hydrocephalic progeroid syndrome': 7,
         'Autism spectrum disorder': 1,
         'RECLASSIFIED - POLYMORPHISM': 1,
         'RECLASSIFIED - ADRA2C POLYMORPHISM': 30,
    

In [57]:
march_all_names

Counter({'Autoimmune lymphoproliferative syndrome, type 1b': 2,
         'Epilepsy, childhood absence 2': 524,
         'CONVULSIONS, FAMILIAL FEBRILE, 8': 443,
         'Febrile seizures, familial, 8': 443,
         'Glucocorticoid resistance, relative': 1,
         'Oculopharyngeal muscular dystrophy 2': 1,
         'Generalized myoclonic-atonic seizure': 124,
         'Myoclonic atonic seizures': 124,
         'Myoclonic-astatic epilepsy': 124,
         'Developmental and epileptic encephalopathy, 74': 1,
         'EPILEPTIC ENCEPHALOPATHY, EARLY INFANTILE, 74': 1,
         'Distal Renal Tubular Acidosis, Recessive': 1,
         '4H leukodystrophy': 26,
         'POLR-related leukodystrophy': 26,
         'POLR3-related leukodystrophy': 26,
         'Pol III-Related Leukodystrophies': 26,
         'Pol III-related leukodystrophy': 26,
         'ATAXIA, DELAYED DENTITION, AND HYPOMYELINATION': 301,
         'Hypomyelination-hypogonadotropic hypogonadism-hypodontia syndrome': 301,
   

Note: the following cells were run, but the notebook was not saved afterwards; their outputs below were copied and pasted in manually.

In [None]:
# Tally trait names over the June ClinVar dataset
june_results = get_preferred_and_all_names(june_dataset)
june_no_valid_traits, june_preferred_names, june_all_names = june_results

In [None]:
june_no_valid_traits

```
set()
```

In [None]:
june_preferred_names

```
Counter({'RECLASSIFIED - ADRA2C POLYMORPHISM': 775359,
         'GLUCOCORTICOID RESISTANCE, MILD': 1,
         'Epilepsy with myoclonic atonic seizures': 124,
         'Basal ganglia calcification, idiopathic, 10, autosomal recessive': 3,
         'EPILEPSY, CHILDHOOD ABSENCE, SUSCEPTIBILITY TO, 2': 524,
         'Febrile seizures, familial, 8': 443,
         'Developmental and epileptic encephalopathy, 74': 1,
         'RECLASSIFIED - PLA2G7 POLYMORPHISM': 1,
         'AUTOIMMUNE LYMPHOPROLIFERATIVE SYNDROME, TYPE IB': 2,
         'Autism spectrum disorder': 1,
         'Distal Renal Tubular Acidosis, Recessive': 1,
         'Leukoencephalopathy, ataxia, hypodontia, hypomyelination syndrome': 301,
         'POLR-related leukodystrophy': 26,
         'Hypogonadotropic hypogonadism 7 with or without anosmia': 3,
         'Hypomyelinating leukodystrophy 8 with or without oligodontia and-or hypogonadotropic hypogonadism': 9,
         'Neonatal pseudo-hydrocephalic progeroid syndrome': 7,
         'Charcot-Marie-Tooth disease, demyelinating, IIA 1I': 4,
         'Pseudoarylsulfatase A deficiency': 1,
         'Early onset Alzheimer disease with behavioral disturbance': 1,
         'Venous malformation': 1,
         'CPT1A POLYMORPHISM': 1,
         'Oculopharyngeal muscular dystrophy 2': 1})
```

In [None]:
june_all_names

```
Counter({'RECLASSIFIED - ADRA2C POLYMORPHISM': 775359,
         'RECLASSIFIED - ADRB1 POLYMORPHISM': 775359,
         'none provided': 775359,
         'not provided': 775359,
         'GLUCOCORTICOID RESISTANCE, MILD': 1,
         'Epilepsy with myoclonic atonic seizures': 124,
         'Generalized myoclonic-atonic seizure': 124,
         'Myoclonic atonic seizures': 124,
         'Basal ganglia calcification, idiopathic, 10, autosomal recessive': 3,
         'EPILEPSY, CHILDHOOD ABSENCE, SUSCEPTIBILITY TO, 2': 524,
         'CONVULSIONS, FAMILIAL FEBRILE, 8': 443,
         'Febrile seizures, familial, 8': 443,
         'Developmental and epileptic encephalopathy, 74': 1,
         'EPILEPTIC ENCEPHALOPATHY, EARLY INFANTILE, 74': 1,
         'RECLASSIFIED - IL4R POLYMORPHISM': 1,
         'RECLASSIFIED - MS4A2 POLYMORPHISM': 1,
         'RECLASSIFIED - MYOC POLYMORPHISM': 1,
         'RECLASSIFIED - PLA2G7 POLYMORPHISM': 1,
         'RECLASSIFIED - POLYMORPHISM': 1,
         'AUTOIMMUNE LYMPHOPROLIFERATIVE SYNDROME, TYPE IB': 2,
         'Autism spectrum disorder': 1,
         'Autism spectrum disorders': 1,
         'Distal Renal Tubular Acidosis, Recessive': 1,
         'ATAXIA, DELAYED DENTITION, AND HYPOMYELINATION': 301,
         'LEUKODYSTROPHY, HYPOMYELINATING, 7, WITH OLIGODONTIA': 301,
         'LEUKODYSTROPHY, HYPOMYELINATING, 7, WITH OLIGODONTIA AND HYPOGONADOTROPIC HYPOGONADISM': 301,
         'LEUKODYSTROPHY, HYPOMYELINATING, 7, WITH OR WITHOUT OLIGODONTIA AND/OR HYPOGONADOTROPIC HYPOGONADISM': 301,
         'LEUKODYSTROPHY, HYPOMYELINATING, 7, WITHOUT OLIGODONTIA OR HYPOGONADOTROPIC HYPOGONADISM': 301,
         'LEUKOENCEPHALOPATHY, HYPOMYELINATING, WITH ATAXIA AND DELAYED DENTITION': 301,
         'Leukodystrophy, hypomyelinating, with hypodontia and hypogonadotropic hypogonadism': 301,
         'Leukoencephalopathy, ataxia, hypodontia, hypomyelination syndrome': 301,
         '4H leukodystrophy': 26,
         'POLR-related leukodystrophy': 26,
         'HYPOGONADOTROPIC HYPOGONADISM 7 WITHOUT ANOSMIA': 3,
         'Hypogonadotropic hypogonadism 7 with or without anosmia': 3,
         'Endosteal sclerosis-cerebellar hypoplasia syndrome': 9,
         'Hypomyelinating leukodystrophy 8 with or without oligodontia and-or hypogonadotropic hypogonadism': 9,
         'Hypomyelinating leukodystrophy 8, with or without oligodontia and/or hypogonadotropic hypogonadism': 9,
         'LEUKODYSTROPHY, HYPOMYELINATING, 8, WITH HYPODONTIA AND HYPOGONADOTROPIC HYPOGONADISM': 9,
         'Neonatal pseudo-hydrocephalic progeroid syndrome': 7,
         'Wiedemann-Rautenstrauch syndrome': 7,
         'CHARCOT-MARIE-TOOTH NEUROPATHY, TYPE 1I': 4,
         'Charcot-Marie-Tooth disease, demyelinating, IIA 1I': 4,
         'Charcot-Marie-Tooth disease, demyelinating, type 1I': 4,
         'Pseudoarylsulfatase A deficiency': 1,
         'Early onset Alzheimer disease with behavioral disturbance': 1,
         'Venous malformation': 1,
         'CPT1A POLYMORPHISM': 1,
         'Oculopharyngeal muscular dystrophy 2': 1})
```