## Variant data selection and preprocessing (__Humsavar__)

In [1]:
import json
import pandas as pd
import re
import requests
import time
from Bio.Data import IUPACData
from datetime import datetime
from pathlib import Path
import warnings

warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

In [2]:
def parse_humsavar(file):
    """
    Parse humsavar.txt file content and extract the variant table into a pandas DataFrame.
    
    Parameters:
    file (str): humsavar.txt
    
    Returns:
    pandas.DataFrame: DataFrame containing the variant data with columns:
        - Main gene name
        - Swiss-Prot AC
        - FTId
        - AA change
        - Variant category
        - dbSNP
        - Disease name
    """
    # split content to lines
    lines = file.split('\n')

    # find start of the table
    table_start = None
    for i, line in enumerate(lines):
        if line.startswith('Main   '):
            table_start = i
            break
    
    if table_start is None:
        raise ValueError("Could not find the table header in the file")
    
    # skip header and underscores lines
    data_start = table_start + 2
    
    data = []
    for line in lines[data_start:]:
        if not line.strip():
            continue
            
        # split line into fields while preserving whitespace
        fields = re.split(r'\s{2,}', line.strip())
        
        # check all required fields have been correctly added
        if len(fields) >= 7:
            data.append({
                'Main gene name': fields[0],
                'Swiss-Prot AC': fields[1],
                'FTId': fields[2],
                'AA change': fields[3],
                'Variant category': fields[4],
                'dbSNP': fields[5],
                'Disease name': fields[6] if fields[6] != '-' else None
            })

    df = pd.DataFrame(data)
    return df

In [3]:
def read_and_parse_humsavar(file_path):
    """
    Read humsavar.txt file and parse it into a DataFrame.
    
    Parameters:
    file_path (str): Path to humsavar.txt
    
    Returns:
    pandas.DataFrame: DataFrame containing the data
    """
    with open(file_path, 'r') as file:
        content = file.read()
    return parse_humsavar(content)

We must read and process Humsavar files for __2025__ and __2021__.

The source column is added to track the dataset origin and to distinguish variants after merging.

In [4]:
file = '../data/humsavar/humsavar_202501.txt'
data25 = read_and_parse_humsavar(file)
data25['Source']='2025'
data25.head()

Unnamed: 0,Main gene name,Swiss-Prot AC,FTId,AA change,Variant category,dbSNP,Disease name,Source
0,A1BG,P04217,VAR_018369,p.His52Arg,LB/B,rs893184,,2025
1,A1BG,P04217,VAR_018370,p.His395Arg,LB/B,rs2241788,,2025
2,A1CF,Q9NQ94,VAR_052201,p.Val555Met,LB/B,rs9073,,2025
3,A1CF,Q9NQ94,VAR_059821,p.Ala558Ser,LB/B,rs11817448,,2025
4,A2M,P01023,VAR_000012,p.Arg704His,LB/B,rs1800434,,2025


In [5]:
len(data25)

83697

In [14]:
data25["Main gene name"].nunique()

13080

In [6]:
file = '../data/humsavar/humsavar_202102.txt'
data21 = read_and_parse_humsavar(file)
data21['Source']='2021'
data21.head()

Unnamed: 0,Main gene name,Swiss-Prot AC,FTId,AA change,Variant category,dbSNP,Disease name,Source
0,A1BG,P04217,VAR_018369,p.His52Arg,LB/B,rs893184,,2021
1,A1BG,P04217,VAR_018370,p.His395Arg,LB/B,rs2241788,,2021
2,A1CF,Q9NQ94,VAR_052201,p.Val555Met,LB/B,rs9073,,2021
3,A1CF,Q9NQ94,VAR_059821,p.Ala558Ser,LB/B,rs11817448,,2021
4,A2M,P01023,VAR_000012,p.Arg704His,LB/B,rs1800434,,2021


In [7]:
len(data21)

79192

In [15]:
data21["Main gene name"].nunique()

12743

Strip whitespaces from columns.

In [8]:
compare_columns=['Main gene name', 'Swiss-Prot AC', 'AA change']
data25[compare_columns] = data25[compare_columns].apply(lambda x: x.str.strip())
data21[compare_columns] = data21[compare_columns].apply(lambda x: x.str.strip())

Now, we proceed to remove duplicate entries (for both 2025 and 2021).

Many of the variants are already included in 2025, and as our goal is to keep the dataset with the __latest__ information, we remove possible duplicates.

In [9]:
print("Before removing duplicates: ", len(data25))
data25_1 = data25.drop_duplicates(subset=compare_columns).reset_index(drop=True)
print("After removing duplicates: ", len(data25_1))

Before removing duplicates:  83697
After removing duplicates:  82585


In [10]:
print("Before removing duplicates: ", len(data21))
data21_1 = data21.drop_duplicates(subset=compare_columns).reset_index(drop=True)
print("After removing duplicates: ", len(data21_1))

Before removing duplicates:  79192
After removing duplicates:  78192


In [11]:
# This is the difference in unique variants between 2025 and 2021.
len(data25_1) - len(data21_1)

4393

In [12]:
# how many rows are in 2025 but not in 2021 based on compare_columns
diff_data = data25_1[~data25_1[compare_columns].apply(tuple, axis=1).isin(data21_1[compare_columns].apply(tuple, axis=1))]
print(f"Rows in 2025 but NOT in 2021: {len(diff_data)}")

Rows in 2025 but NOT in 2021: 6047


Clinical significance distribution of 2025 variants not in 2021.

In [13]:
diff_data['Variant category'].value_counts()

Variant category
LP/P    2291
US      2222
LB/B    1534
Name: count, dtype: int64

We keep only LP/P (Likely Pathogenic/Pathogenic) and LB/B (Likely Benign/Benign) variants. With this we aim to remove VUS.

In [17]:
data25_2 = diff_data[diff_data['Variant category'].isin(['LP/P','LB/B'])].reset_index(drop=True)

In [18]:
data25_2['Variant category'].value_counts()

Variant category
LP/P    2291
LB/B    1534
Name: count, dtype: int64

In [19]:
# how many genes we have
data25_2['Main gene name'].nunique()

1258

In [20]:
data25_2['Swiss-Prot AC'].nunique()

1258

In [21]:
data25_2.head()

Unnamed: 0,Main gene name,Swiss-Prot AC,FTId,AA change,Variant category,dbSNP,Disease name,Source
0,AARS1,P49588,VAR_089576,p.Arg326Trp,LP/P,-,"Charcot-Marie-Tooth disease, axonal, 2N (CMT2N...",2025
1,AARS1,P49588,VAR_089577,p.Thr606Ile,LP/P,-,"Leukoencephalopathy, hereditary diffuse, with ...",2025
2,AARS1,P49588,VAR_089578,p.Ser698Phe,LP/P,-,"Charcot-Marie-Tooth disease, axonal, 2N (CMT2N...",2025
3,ABCA4,P78363,VAR_084908,p.Asp1102Tyr,LB/B,rs138641544,,2025
4,ABCA4,P78363,VAR_084916,p.Gly1203Asp,LP/P,-,Stargardt disease 1 (STGD1) [MIM:248200],2025


Exclude entries where the gene name corresponds to a dash

In [24]:
data25_2 = data25_2[data25_2['Main gene name']!='-'].reset_index(drop=True)

Check if there are accession numbers that do not have the expected length of 6.

In [19]:
[uni for uni in data25_2['Swiss-Prot AC'].unique() if len(uni)!= 6]

[]

As with ClinVar dataset, we convert Aa changes to 1 letter notation.

In [20]:
def convert_to_one_letter(aa_change):
    prefix, from_aa, position, to_aa = (
        aa_change[0:2],  # e.g., 'p.'
        aa_change[2:5],  # e.g., 'Arg'
        ''.join(filter(str.isdigit, aa_change)),  # e.g., '326'
        aa_change[-3:]  # e.g., 'Trp'
    )

    # convert to 1 letter code
    from_aa_one = IUPACData.protein_letters_3to1.get(from_aa, '?')
    to_aa_one = IUPACData.protein_letters_3to1.get(to_aa, '?')
    return f"{from_aa_one}{position}{to_aa_one}"

In [21]:
data25_2['Variant'] = data25_2['AA change'].apply(convert_to_one_letter)

And we assign binary clinical significance.

In [22]:
data25_2.loc[data25_2['Variant category']=='LB/B','BinaryClinicalSignificance'] = 'B'
data25_2.loc[data25_2['Variant category']=='LP/P','BinaryClinicalSignificance'] = 'P'

In [23]:
data25_2.BinaryClinicalSignificance.value_counts()

BinaryClinicalSignificance
P    2291
B    1533
Name: count, dtype: int64

In [24]:
data25_2['Main gene name'].nunique()

1257

Filter duplicated dbSNP entries.

In [25]:
data25_2[data25_2.duplicated(subset=['dbSNP'], keep=False)].dbSNP.unique()

array(['-', 'rs1645264815', 'rs2153228682', 'rs1705222655', 'rs752450983',
       'rs1599011050', 'rs1949512456', 'rs28642966', 'rs201552310',
       'rs200005406', 'rs2072648', 'rs1057517926', 'rs1757708758',
       'rs77834747', 'rs381427', 'rs421016', 'rs121908310', 'rs77933015',
       'rs121908308', 'rs782199122', 'rs1191455921', 'rs2071312',
       'rs104894264', 'rs7480563', 'rs7126405', 'rs2293232', 'rs2246901',
       'rs200291894', 'rs571714796', 'rs7255187', 'rs1684813071',
       'rs1554297905', 'rs1838076782', 'rs141269120', 'rs1057149',
       'rs1385657144', 'rs1965499910'], dtype=object)

In [26]:
# use first output as example
data25_2[data25_2.dbSNP=='rs1645264815']

Unnamed: 0,Main gene name,Swiss-Prot AC,FTId,AA change,Variant category,dbSNP,Disease name,Source,Variant,BinaryClinicalSignificance
119,AGO1,Q9UL18,VAR_088408,p.Leu190Pro,LP/P,rs1645264815,Neurodevelopmental disorder with language dela...,2025,L190P,P
120,AGO1,Q9UL18,VAR_088409,p.Leu190Arg,LP/P,rs1645264815,Neurodevelopmental disorder with language dela...,2025,L190R,P


In [27]:
data25_2.to_csv('../data/humsavar/humsavar_20212025.csv', index=0)

We add more information to the dataset by retrieving .json files from Uniprot.

MANE-Select (Matched Annotation from NCBI and EMBL-EBI) provides a single annotated transcript per gene, thus giving consistency between RefSeq and Ensembl. The addition of MANE-Select data helps standardize variant annotations. 

With this we aim to link variants to a reliable reference transcript.

In [28]:
def extract_mane_select_info(humsavar_df, uniprot_id_column="Swiss-Prot AC", 
                           cache_dir="uniprot_cache", 
                           rate_limit_delay=0.1,
                           batch_size=100):
    """
    Extracts MANE-Select information for each UniProt ID with caching and rate limiting.
    
    Args:
        humsavar_df (pd.DataFrame): DataFrame containing UniProt IDs
        uniprot_id_column (str): Name of the column containing UniProt IDs
        cache_dir (str): Directory to store cached responses
        rate_limit_delay (float): Delay between API calls in seconds
        batch_size (int): Number of proteins to process before saving interim results
    
    Returns:
        pd.DataFrame: DataFrame with MANE-Select information
    """

    cache_path = Path(cache_dir)
    cache_path.mkdir(parents=True, exist_ok=True)
    log_file = cache_path / f"extraction_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt"
    
    def log_message(message):
        """Write message to log file and print it"""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        log_message = f"[{timestamp}] {message}"
        print(log_message)
        with open(log_file, "a") as f:
            f.write(log_message + "\n")

    def get_cached_response(uniprot_id):
        """Get cached response for a UniProt ID"""
        cache_file = cache_path / f"{uniprot_id}.json"
        if cache_file.exists():
            try:
                with open(cache_file, "r") as f:
                    return json.load(f)
            except json.JSONDecodeError:
                return None
        return None
    
    def cache_response(uniprot_id, data):
        """Cache response for a UniProt ID"""
        cache_file = cache_path / f"{uniprot_id}.json"
        with open(cache_file, "w") as f:
            json.dump(data, f)
    
    def process_uniprot_id(uniprot_id):
        """Process a single UniProt ID"""
        cached_data = get_cached_response(uniprot_id)
        if cached_data is not None:
            data = cached_data
            log_message(f"Using cached data for {uniprot_id}")
        else:
            try:
                # Fetch JSON data from UniProt API
                json_url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
                response = requests.get(json_url)
                response.raise_for_status()
                data = response.json()
                # Cache the response
                cache_response(uniprot_id, data)
                log_message(f"Fetched and cached data for {uniprot_id}")
                # Rate limiting
                time.sleep(rate_limit_delay)
            except Exception as e:
                log_message(f"Error fetching data for {uniprot_id}: {str(e)}")
                return {
                    'Swiss-Prot AC': uniprot_id,
                    'MANE-Select ID': None,
                    'Ensembl ID': None,
                    'Protein ID': None,
                    'RefSeq Nucleotide ID': None,
                    'RefSeq Protein ID': None,
                    'Error': str(e)
                }
        
        try:
            # Search for MANE-Select information
            mane_select_info = None
            for entry in data.get('uniProtKBCrossReferences', []):
                if entry.get('database') == 'MANE-Select':
                    mane_select_info = entry
                    break
            
            if mane_select_info:
                mane_id = mane_select_info['id']
                ensembl_id = mane_select_info['id'].split('.')[0]
                protein_id = next(
                    (prop['value'] for prop in mane_select_info['properties'] 
                     if prop['key'] == 'ProteinId'), None)
                refseq_nucleotide_id = next(
                    (prop['value'] for prop in mane_select_info['properties'] 
                     if prop['key'] == 'RefSeqNucleotideId'), None)
                refseq_protein_id = next(
                    (prop['value'] for prop in mane_select_info['properties'] 
                     if prop['key'] == 'RefSeqProteinId'), None)
            else:
                mane_id = ensembl_id = protein_id = refseq_nucleotide_id = refseq_protein_id = None
            
            return {
                'Swiss-Prot AC': uniprot_id,
                'MANE-Select ID': mane_id,
                'Ensembl ID': ensembl_id,
                'Protein ID': protein_id,
                'RefSeq Nucleotide ID': refseq_nucleotide_id,
                'RefSeq Protein ID': refseq_protein_id,
                'Error': None
            }
        except Exception as e:
            log_message(f"Error processing data for {uniprot_id}: {str(e)}")
            return {
                'Swiss-Prot AC': uniprot_id,
                'MANE-Select ID': None,
                'Ensembl ID': None,
                'Protein ID': None,
                'RefSeq Nucleotide ID': None,
                'RefSeq Protein ID': None,
                'Error': str(e)
            }

    results = []
    total_ids = len(humsavar_df[uniprot_id_column].unique())
    log_message(f"Starting processing of {total_ids} Uniprot IDs")
    
    for i, uniprot_id in enumerate(humsavar_df[uniprot_id_column].unique(), 1):
        result = process_uniprot_id(uniprot_id)
        results.append(result)

        if i % batch_size == 0:
            interim_df = pd.DataFrame(results)
            interim_file = cache_path / f"interim_results_{i}.csv"
            interim_df.to_csv(interim_file, index=False)
            log_message(f"Processed {i}/{total_ids} IDs. Saved results to {interim_file}")

    mane_select_df = pd.DataFrame(results)

    final_file = cache_path/"final_results.csv"
    mane_select_df.to_csv(final_file, index=False)
    log_message(f"Final results saved to {final_file}")
    return mane_select_df

In [29]:
data25_2 = pd.read_csv('../data/humsavar/humsavar_20212025.csv')

Retrieve and add MANE-Select transcript mappings to the dataset.

In [30]:
mane_results = extract_mane_select_info(
    data25_2,
    cache_dir="uniprot_cache",  # store API responses
    rate_limit_delay=0.1,       # delay between API calls in seconds
    batch_size=100              # save results every 100 proteins
)

[2025-03-18 16:45:29] Starting processing of 1257 Uniprot IDs
[2025-03-18 16:45:29] Using cached data for P49588
[2025-03-18 16:45:29] Using cached data for P78363
[2025-03-18 16:45:29] Using cached data for Q9NP58
[2025-03-18 16:45:29] Using cached data for O14678
[2025-03-18 16:45:29] Using cached data for Q96SE0
[2025-03-18 16:45:29] Using cached data for O95870
[2025-03-18 16:45:29] Using cached data for A6QL63
[2025-03-18 16:45:29] Using cached data for P25106
[2025-03-18 16:45:29] Using cached data for A6NK06
[2025-03-18 16:45:29] Using cached data for Q3I5F7
[2025-03-18 16:45:29] Using cached data for Q9ULC5
[2025-03-18 16:45:29] Using cached data for P68133
[2025-03-18 16:45:29] Using cached data for P62736
[2025-03-18 16:45:29] Using cached data for P60709
[2025-03-18 16:45:29] Using cached data for P63267
[2025-03-18 16:45:29] Using cached data for Q8TC94
[2025-03-18 16:45:29] Using cached data for P35609
[2025-03-18 16:45:29] Using cached data for O60266
[2025-03-18 16:45:29

In [31]:
mane_results

Unnamed: 0,Swiss-Prot AC,MANE-Select ID,Ensembl ID,Protein ID,RefSeq Nucleotide ID,RefSeq Protein ID,Error
0,P49588,ENST00000261772.13,ENST00000261772,ENSP00000261772.8,NM_001605.3,NP_001596.2,
1,P78363,ENST00000370225.4,ENST00000370225,ENSP00000359245.3,NM_000350.3,NP_000341.2,
2,Q9NP58,ENST00000265316.9,ENST00000265316,ENSP00000265316.3,NM_005689.4,NP_005680.1,
3,O14678,ENST00000356924.9,ENST00000356924,ENSP00000349396.4,NM_005050.4,NP_005041.1,
4,Q96SE0,ENST00000316470.9,ENST00000316470,ENSP00000326491.4,NM_032604.4,NP_115993.3,
...,...,...,...,...,...,...,...
1252,Q96K58,ENST00000300849.5,ENST00000300849,ENSP00000300849.4,NM_024706.5,NP_078982.3,
1253,P17019,ENST00000356929.3,ENST00000356929,ENSP00000349401.2,NM_021269.3,NP_067092.2,
1254,O60290,ENST00000223210.5,ENST00000223210,ENSP00000223210.4,NM_001099220.3,NP_001092690.1,
1255,Q9BRT8,ENST00000356521.9,ENST00000356521,ENSP00000348915.4,NM_018491.5,NP_060961.3,


In [32]:
# check if there are any errors
errors = mane_results[mane_results['Error'].notna()]

if len(errors) > 0:
    print("\nEntries with errors:")
    print(errors[['Swiss-Prot AC', 'Error']].head())

Now we can combine the retrieved transcript mappings with the main dataset.

In [33]:
merged_df = data25_2.merge(mane_results, on="Swiss-Prot AC", how="left")
merged_df.rename(columns={'Swiss-Prot AC':'Uniprot'}, inplace=True)

Also, important to drop duplicate rows again!

In [34]:
merged_df=merged_df.drop_duplicates().reset_index(drop=True)
len(merged_df)

3824

In [35]:
merged_df.head()

Unnamed: 0,Main gene name,Uniprot,FTId,AA change,Variant category,dbSNP,Disease name,Source,Variant,BinaryClinicalSignificance,MANE-Select ID,Ensembl ID,Protein ID,RefSeq Nucleotide ID,RefSeq Protein ID,Error
0,AARS1,P49588,VAR_089576,p.Arg326Trp,LP/P,-,"Charcot-Marie-Tooth disease, axonal, 2N (CMT2N...",2025,R326W,P,ENST00000261772.13,ENST00000261772,ENSP00000261772.8,NM_001605.3,NP_001596.2,
1,AARS1,P49588,VAR_089577,p.Thr606Ile,LP/P,-,"Leukoencephalopathy, hereditary diffuse, with ...",2025,T606I,P,ENST00000261772.13,ENST00000261772,ENSP00000261772.8,NM_001605.3,NP_001596.2,
2,AARS1,P49588,VAR_089578,p.Ser698Phe,LP/P,-,"Charcot-Marie-Tooth disease, axonal, 2N (CMT2N...",2025,S698F,P,ENST00000261772.13,ENST00000261772,ENSP00000261772.8,NM_001605.3,NP_001596.2,
3,ABCA4,P78363,VAR_084908,p.Asp1102Tyr,LB/B,rs138641544,,2025,D1102Y,B,ENST00000370225.4,ENST00000370225,ENSP00000359245.3,NM_000350.3,NP_000341.2,
4,ABCA4,P78363,VAR_084916,p.Gly1203Asp,LP/P,-,Stargardt disease 1 (STGD1) [MIM:248200],2025,G1203D,P,ENST00000370225.4,ENST00000370225,ENSP00000359245.3,NM_000350.3,NP_000341.2,


In [46]:
merged_df.to_csv('../data/humsavar/humsavar_20212025_v2.csv',index=0)

We are ready to prepare input for the VEP tool and obtain predictions.

But there is a problem: Humsavar data is in Aa format, so there is no chromosome, genomic coordinate info etc (needed for VEPs).

Possible solutions to this:
- __Option 1__: Retrieve only those variants with dbSNP ID number
- __Option 2__: Retrieve nucleotide change information from REVEL downloads (_Fail. Revel coverage is not wide enough_)
- __Option 3__: Try generating VCF file for all those proteins from the transcript/exon information (_Time consuming_)
- __Option 4__: Select 10 predictors and get predictions separately from each tool (_Maybe/Some tools still require genomic coodinate format_)


### __Option 1__: Extract variants with a known dbSNP ID for easier mapping to genomic coordinates.

In [5]:
# load the cleaned dataset
merged_df = pd.read_csv('../data/humsavar/humsavar_20212025_v2.csv')

In [6]:
merged_df.dbSNP.nunique()

2577

For this, we must identify duplicated dbSNP entries to review possible inconsistencies

In [5]:
# keep these in mind to check after
merged_df[merged_df.duplicated(subset=['dbSNP'], keep=False)].dbSNP.unique()

array(['-', 'rs1645264815', 'rs2153228682', 'rs1705222655', 'rs752450983',
       'rs1599011050', 'rs1949512456', 'rs28642966', 'rs201552310',
       'rs200005406', 'rs2072648', 'rs1057517926', 'rs1757708758',
       'rs77834747', 'rs381427', 'rs421016', 'rs121908310', 'rs77933015',
       'rs121908308', 'rs782199122', 'rs1191455921', 'rs2071312',
       'rs104894264', 'rs7480563', 'rs7126405', 'rs2293232', 'rs2246901',
       'rs200291894', 'rs571714796', 'rs7255187', 'rs1684813071',
       'rs1554297905', 'rs1838076782', 'rs141269120', 'rs1057149',
       'rs1385657144', 'rs1965499910'], dtype=object)

And remove those entries **without** dbSNP annotations.

In [6]:
# first, we drop duplicate entries while keeping the first occurrence
tmp=merged_df.drop_duplicates(subset='dbSNP', keep='first')

# and extract the list of valid dbSNP IDs
dbsnplist=list(tmp[tmp.dbSNP!='-'].dbSNP.values)

After the filtering we analyze the dataset coverage.

In [8]:
print("Total number of variants in the dataset:", len(merged_df)) 
print("Unique dbSNP IDs after removing duplicates:", len(dbsnplist)) 
print("Variants with dbSNP IDs:", len(merged_df[merged_df.dbSNP!='-']))
print("Unique proteins that have at least 1 variant with a dbSNP ID:", merged_df[merged_df.dbSNP!='-']['Uniprot'].nunique())  # Unique proteins with dbSNP mapped variants  

Total number of variants in the dataset: 3824
Unique dbSNP IDs after removing duplicates: 2576
Variants with dbSNP IDs: 2613
Unique proteins that have at least 1 variant with a dbSNP ID: 976


Important to save the filtered dbSNP IDs so we can further process them (with VEP)

In [54]:
with open('../data/humsavar/humsavar_rsIDs.txt', 'w') as f:
    for db in dbsnplist:
        f.write(f"{db}" + "\n")

The website VEP tool was used for ease of access since we only have rsIDs. 

Same options were selected as with ClinVar.

### Read VEP output (predictions added!)

In [3]:
VEP_output = pd.read_csv('../data/humsavar/cleaned_Humsavar_dataset_outputVEP.txt', sep='\t')

In [4]:
VEP_output.head()

Unnamed: 0,#Uploaded_variation,Location,Allele,Consequence,IMPACT,SYMBOL,Gene,Feature_type,Feature,BIOTYPE,EXON,INTRON,HGVSc,HGVSp,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,REF_ALLELE,UPLOADED_ALLELE,DISTANCE,STRAND,FLAGS,SYMBOL_SOURCE,HGNC_ID,MANE,MANE_SELECT,MANE_PLUS_CLINICAL,TSL,APPRIS,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,SIFT,PolyPhen,HGVS_OFFSET,AF,AFR_AF,AMR_AF,EAS_AF,EUR_AF,SAS_AF,gnomADe_AF,gnomADe_AFR_AF,gnomADe_AMR_AF,gnomADe_ASJ_AF,gnomADe_EAS_AF,gnomADe_FIN_AF,gnomADe_MID_AF,gnomADe_NFE_AF,gnomADe_REMAINING_AF,gnomADe_SAS_AF,gnomADg_AF,gnomADg_AFR_AF,gnomADg_AMI_AF,gnomADg_AMR_AF,gnomADg_ASJ_AF,gnomADg_EAS_AF,gnomADg_FIN_AF,gnomADg_MID_AF,gnomADg_NFE_AF,gnomADg_REMAINING_AF,gnomADg_SAS_AF,CLIN_SIG,SOMATIC,PHENO,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,TRANSCRIPTION_FACTORS,REVEL,BLOSUM62,EVE_CLASS,EVE_SCORE,CADD_PHRED,CADD_RAW,am_class,am_pathogenicity,1000Gp3_AC,1000Gp3_AF,1000Gp3_AFR_AC,1000Gp3_AFR_AF,1000Gp3_AMR_AC,1000Gp3_AMR_AF,1000Gp3_EAS_AC,1000Gp3_EAS_AF,1000Gp3_EUR_AC,1000Gp3_EUR_AF,1000Gp3_SAS_AC,1000Gp3_SAS_AF,ALFA_African_AC,ALFA_African_AF,ALFA_African_AN,ALFA_African_American_AC,ALFA_African_American_AF,ALFA_African_American_AN,ALFA_African_Others_AC,ALFA_African_Others_AF,ALFA_African_Others_AN,ALFA_Asian_AC,ALFA_Asian_AF,ALFA_Asian_AN,ALFA_East_Asian_AC,ALFA_East_Asian_AF,ALFA_East_Asian_AN,ALFA_European_AC,ALFA_European_AF,ALFA_European_AN,ALFA_Latin_American_1_AC,ALFA_Latin_American_1_AF,ALFA_Latin_American_1_AN,ALFA_Latin_American_2_AC,ALFA_Latin_American_2_AF,ALFA_Latin_American_2_AN,ALFA_Other_AC,ALFA_Other_AF,ALFA_Other_AN,ALFA_Other_Asian_AC,ALFA_Other_Asian_AF,ALFA_Other_Asian_AN,ALFA_South_Asian_AC,ALFA_South_Asian_AF,ALFA_South_Asian_AN,ALFA_Total_AC,ALFA_Total_AF,ALFA_Total_AN,ALSPAC_AC,ALSPAC_AF,Aloft_Confidence,Aloft_Fraction_transcripts_affected,Aloft_pred,Aloft_prob_Dominant,Aloft_prob_Recessive,Aloft_prob_Tolerant,AltaiNeandertal,Ancestral_allele,BayesDel_addAF_pred,BayesDel_addAF_rankscore,BayesDel_addAF_score,BayesDel_noAF_pred,BayesDel_noAF_rankscore,BayesDel_noAF_score,ChagyrskayaNeandertal,DANN_rankscore,DANN_score,DEOGEN2_pred,DEOGEN2_rankscore,DEOGEN2_score,Denisova,ESM1b_pred,ESM1b_rankscore,ESM1b_score,ESP6500_AA_AC,ESP6500_AA_AF,ESP6500_EA_AC,ESP6500_EA_AF,EVE_Class10_pred,EVE_Class20_pred,EVE_Class25_pred,EVE_Class30_pred,EVE_Class40_pred,EVE_Class50_pred,EVE_Class60_pred,EVE_Class70_pred,EVE_Class75_pred,EVE_Class80_pred,EVE_Class90_pred,EVE_rankscore,EVE_score,Eigen-PC-phred_coding,Eigen-PC-raw_coding,Eigen-PC-raw_coding_rankscore,Eigen-phred_coding,Eigen-raw_coding,Eigen-raw_coding_rankscore,Ensembl_geneid,Ensembl_proteinid,Ensembl_transcriptid,ExAC_AC,ExAC_AF,ExAC_AFR_AC,ExAC_AFR_AF,ExAC_AMR_AC,ExAC_AMR_AF,ExAC_Adj_AC,ExAC_Adj_AF,ExAC_EAS_AC,ExAC_EAS_AF,ExAC_FIN_AC,ExAC_FIN_AF,ExAC_NFE_AC,ExAC_NFE_AF,ExAC_SAS_AC,ExAC_SAS_AF,ExAC_nonTCGA_AC,ExAC_nonTCGA_AF,ExAC_nonTCGA_AFR_AC,ExAC_nonTCGA_AFR_AF,ExAC_nonTCGA_AMR_AC,ExAC_nonTCGA_AMR_AF,ExAC_nonTCGA_Adj_AC,ExAC_nonTCGA_Adj_AF,ExAC_nonTCGA_EAS_AC,ExAC_nonTCGA_EAS_AF,ExAC_nonTCGA_FIN_AC,ExAC_nonTCGA_FIN_AF,ExAC_nonTCGA_NFE_AC,ExAC_nonTCGA_NFE_AF,ExAC_nonTCGA_SAS_AC,ExAC_nonTCGA_SAS_AF,ExAC_nonpsych_AC,ExAC_nonpsych_AF,ExAC_nonpsych_AFR_AC,ExAC_nonpsych_AFR_AF,ExAC_nonpsych_AMR_AC,ExAC_nonpsych_AMR_AF,ExAC_nonpsych_Adj_AC,ExAC_nonpsych_Adj_AF,ExAC_nonpsych_EAS_AC,ExAC_nonpsych_EAS_AF,ExAC_nonpsych_FIN_AC,ExAC_nonpsych_FIN_AF,ExAC_nonpsych_NFE_AC,ExAC_nonpsych_NFE_AF,ExAC_nonpsych_SAS_AC,ExAC_nonpsych_SAS_AF,FATHMM_converted_rankscore,FATHMM_pred,FATHMM_score,GERP++_NR,GERP++_RS,GERP++_RS_rankscore,GM12878_confidence_value,GM12878_fitCons_rankscore,GM12878_fitCons_score,GTEx_V8_eQTL_gene,GTEx_V8_eQTL_tissue,GTEx_V8_sQTL_gene,GTEx_V8_sQTL_tissue,Geuvadis_eQTL_target_gene,H1-hESC_confidence_value,H1-hESC_fitCons_rankscore,H1-hESC_fitCons_score,HUVEC_confidence_value,HUVEC_fitCons_rankscore,HUVEC_fitCons_score,Interpro_domain,LIST-S2_pred,LIST-S2_rankscore,LIST-S2_score,LRT_Omega,LRT_converted_rankscore,LRT_pred,LRT_score,M-CAP_pred,M-CAP_rankscore,M-CAP_score,MPC_rankscore,MPC_score,MVP_rankscore,MVP_score,MetaLR_pred,MetaLR_rankscore,MetaLR_score,MetaRNN_pred,MetaRNN_rankscore,MetaRNN_score,MetaSVM_pred,MetaSVM_rankscore,MetaSVM_score,MutPred_AAchange,MutPred_Top5features,MutPred_protID,MutPred_rankscore,MutPred_score,MutationAssessor_pred,MutationAssessor_rankscore,MutationAssessor_score,MutationTaster_AAE,MutationTaster_converted_rankscore,MutationTaster_model,MutationTaster_pred,MutationTaster_score,PROVEAN_converted_rankscore,PROVEAN_pred,PROVEAN_score,PrimateAI_pred,PrimateAI_rankscore,PrimateAI_score,Reliability_index,SiPhy_29way_logOdds,SiPhy_29way_logOdds_rankscore,SiPhy_29way_pi,TWINSUK_AC,TWINSUK_AF,UK10K_AC,UK10K_AF,Uniprot_acc,Uniprot_entry,VARITY_ER_LOO_rankscore,VARITY_ER_LOO_score,VARITY_ER_rankscore,VARITY_ER_score,VARITY_R_LOO_rankscore,VARITY_R_LOO_score,VARITY_R_rankscore,VARITY_R_score,VindijiaNeandertal,aapos,bStatistic,bStatistic_converted_rankscore,clinvar_MedGen_id,clinvar_OMIM_id,clinvar_Orphanet_id,clinvar_clnsig,clinvar_hgvs,clinvar_id,clinvar_review,clinvar_trait,clinvar_var_source,codon_degeneracy,eQTLGen_cis_or_trans,eQTLGen_gene_id,eQTLGen_gene_symbol,eQTLGen_snp_id,fathmm-MKL_coding_group,fathmm-MKL_coding_pred,fathmm-MKL_coding_rankscore,fathmm-MKL_coding_score,fathmm-XF_coding_pred,fathmm-XF_coding_rankscore,fathmm-XF_coding_score,gMVP_rankscore,gMVP_score,genename,gnomAD_exomes_AC,gnomAD_exomes_AF,gnomAD_exomes_AFR_AC,gnomAD_exomes_AFR_AF,gnomAD_exomes_AFR_AN,gnomAD_exomes_AFR_nhomalt,gnomAD_exomes_AMR_AC,gnomAD_exomes_AMR_AF,gnomAD_exomes_AMR_AN,gnomAD_exomes_AMR_nhomalt,gnomAD_exomes_AN,gnomAD_exomes_ASJ_AC,gnomAD_exomes_ASJ_AF,gnomAD_exomes_ASJ_AN,gnomAD_exomes_ASJ_nhomalt,gnomAD_exomes_EAS_AC,gnomAD_exomes_EAS_AF,gnomAD_exomes_EAS_AN,gnomAD_exomes_EAS_nhomalt,gnomAD_exomes_FIN_AC,gnomAD_exomes_FIN_AF,gnomAD_exomes_FIN_AN,gnomAD_exomes_FIN_nhomalt,gnomAD_exomes_MID_AC,gnomAD_exomes_MID_AF,gnomAD_exomes_MID_AN,gnomAD_exomes_MID_nhomalt,gnomAD_exomes_NFE_AC,gnomAD_exomes_NFE_AF,gnomAD_exomes_NFE_AN,gnomAD_exomes_NFE_nhomalt,gnomAD_exomes_POPMAX_AC,gnomAD_exomes_POPMAX_AF,gnomAD_exomes_POPMAX_AN,gnomAD_exomes_POPMAX_nhomalt,gnomAD_exomes_SAS_AC,gnomAD_exomes_SAS_AF,gnomAD_exomes_SAS_AN,gnomAD_exomes_SAS_nhomalt,gnomAD_exomes_flag,gnomAD_exomes_nhomalt,gnomAD_exomes_non_ukb_AC,gnomAD_exomes_non_ukb_AF,gnomAD_exomes_non_ukb_AFR_AC,gnomAD_exomes_non_ukb_AFR_AF,gnomAD_exomes_non_ukb_AFR_AN,gnomAD_exomes_non_ukb_AFR_nhomalt,gnomAD_exomes_non_ukb_AMR_AC,gnomAD_exomes_non_ukb_AMR_AF,gnomAD_exomes_non_ukb_AMR_AN,gnomAD_exomes_non_ukb_AMR_nhomalt,gnomAD_exomes_non_ukb_AN,gnomAD_exomes_non_ukb_ASJ_AC,gnomAD_exomes_non_ukb_ASJ_AF,gnomAD_exomes_non_ukb_ASJ_AN,gnomAD_exomes_non_ukb_ASJ_nhomalt,gnomAD_exomes_non_ukb_EAS_AC,gnomAD_exomes_non_ukb_EAS_AF,gnomAD_exomes_non_ukb_EAS_AN,gnomAD_exomes_non_ukb_EAS_nhomalt,gnomAD_exomes_non_ukb_FIN_AC,gnomAD_exomes_non_ukb_FIN_AF,gnomAD_exomes_non_ukb_FIN_AN,gnomAD_exomes_non_ukb_FIN_nhomalt,gnomAD_exomes_non_ukb_MID_AC,gnomAD_exomes_non_ukb_MID_AF,gnomAD_exomes_non_ukb_MID_AN,gnomAD_exomes_non_ukb_MID_nhomalt,gnomAD_exomes_non_ukb_NFE_AC,gnomAD_exomes_non_ukb_NFE_AF,gnomAD_exomes_non_ukb_NFE_AN,gnomAD_exomes_non_ukb_NFE_nhomalt,gnomAD_exomes_non_ukb_SAS_AC,gnomAD_exomes_non_ukb_SAS_AF,gnomAD_exomes_non_ukb_SAS_AN,gnomAD_exomes_non_ukb_SAS_nhomalt,gnomAD_exomes_non_ukb_nhomalt,gnomAD_genomes_AC,gnomAD_genomes_AF,gnomAD_genomes_AFR_AC,gnomAD_genomes_AFR_AF,gnomAD_genomes_AFR_AN,gnomAD_genomes_AFR_nhomalt,gnomAD_genomes_AMI_AC,gnomAD_genomes_AMI_AF,gnomAD_genomes_AMI_AN,gnomAD_genomes_AMI_nhomalt,gnomAD_genomes_AMR_AC,gnomAD_genomes_AMR_AF,gnomAD_genomes_AMR_AN,gnomAD_genomes_AMR_nhomalt,gnomAD_genomes_AN,gnomAD_genomes_ASJ_AC,gnomAD_genomes_ASJ_AF,gnomAD_genomes_ASJ_AN,gnomAD_genomes_ASJ_nhomalt,gnomAD_genomes_EAS_AC,gnomAD_genomes_EAS_AF,gnomAD_genomes_EAS_AN,gnomAD_genomes_EAS_nhomalt,gnomAD_genomes_FIN_AC,gnomAD_genomes_FIN_AF,gnomAD_genomes_FIN_AN,gnomAD_genomes_FIN_nhomalt,gnomAD_genomes_MID_AC,gnomAD_genomes_MID_AF,gnomAD_genomes_MID_AN,gnomAD_genomes_MID_nhomalt,gnomAD_genomes_NFE_AC,gnomAD_genomes_NFE_AF,gnomAD_genomes_NFE_AN,gnomAD_genomes_NFE_nhomalt,gnomAD_genomes_POPMAX_AC,gnomAD_genomes_POPMAX_AF,gnomAD_genomes_POPMAX_AN,gnomAD_genomes_POPMAX_nhomalt,gnomAD_genomes_SAS_AC,gnomAD_genomes_SAS_AF,gnomAD_genomes_SAS_AN,gnomAD_genomes_SAS_nhomalt,gnomAD_genomes_flag,gnomAD_genomes_nhomalt,integrated_confidence_value,integrated_fitCons_rankscore,integrated_fitCons_score,phastCons100way_vertebrate,phastCons100way_vertebrate_rankscore,phastCons17way_primate,phastCons17way_primate_rankscore,phastCons470way_mammalian,phastCons470way_mammalian_rankscore,phyloP100way_vertebrate,phyloP100way_vertebrate_rankscore,phyloP17way_primate,phyloP17way_primate_rankscore,phyloP470way_mammalian,phyloP470way_mammalian_rankscore,ClinPred
0,rs121909521,1:229433067-229433067,G,missense_variant,MODERATE,ACTA1,ENSG00000143632,Transcript,ENST00000366683.4,protein_coding,2/7,-,ENST00000366683.4:c.49G>C,ENSP00000355644.4:p.Gly17Arg,162,49,17,G/R,Ggc/Cgc,"rs121909521,CM992116",C,C/G/T,-,-1,-,HGNC,HGNC:129,-,-,-,5,-,-,ENSP00000355644,-,A6NL76.108,UPI000C755200,-,deleterious_low_confidence(0),probably_damaging(0.98),-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,"likely_pathogenic,pathogenic",1,11,1050851991851791522640717227580292742052329476420621480,-,-,-,-,-,0.937,-2,Uncertain,0.6114285422903512,29.0,5.169387,likely_pathogenic,0.9989,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,".,.,",".,.,",".,.,",".,.,",".,.,",".,.,",C/C,C,D,0.97007,0.5807,D,0.96957,0.596359,C/C,0.77505,0.99654753831684295,"D,D",0.98979,"0.936956,0.608584",C/C,"D,.",0.81140,"-11.274,.",-,-,-,-,"U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.","P,.",0.67905,"0.6114285422903512,.",8.060451,0.7158844301797,0.83587,10.34018,0.871517802988444,0.90297,"ENSG00000143632,ENSG00000143632","ENSP00000355645,ENSP00000355644","ENST00000366684,ENST00000366683",-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0.97317,"D,D","-4.36,-3.44",4.77,3.85,0.43556,0,0.19063,0.563428,-,-,-,-,-,3,0.00061,0.0,0,0.58350,0.63947,".,.","D,D",0.94055,"0.982235,0.982402",0.000000,0.84330,D,0.000000,D,0.95926,0.555963,-,".,.",0.99485,"0.994903219395,0.994903219395",D,0.98958,0.9672,"D,D",0.98680,"0.98470706,0.98470706",D,0.99324,1.0921,"G17R,G17R,","Loss_of_glycosylation_at_S16_(P_=_0.0337),Loss_of_glycosylation_at_S16_(P_=_0.0337),","P68133,P68133,",0.97816,"0.901,0.901,","H,.",0.96631,"3.98,.","G17R,G17R,G17R,G17R",0.58761,"simple_aae,simple_aae,simple_aae,simple_aae","A,A,A,A","0.788915,0.999997,0.788915,0.999997",0.67359,"D,D","-3.43,-2.73",D,0.99310,0.936415195465,9,13.3794,0.60203,0.1589:0.8411:0.0:0.0,-,-,-,-,"P68133,A6NL76","ACTS_HUMAN,A6NL76_HUMAN",0.91962,"0.85679024,.",0.90648,"0.83711636,.",0.95732,"0.94474953,.",0.94812,"0.9357127,.",C/C,1717,713,0.56348,"C5830333,C3661900",620278,-,Pathogenic/Likely_pathogenic,NC_000001.11:g.229433067C>G,18281,"criteria_provided,_multiple_submitters,_no_conflicts","Congenital_myopathy_2c,_severe_infantile,_autosomal_dominant,not_provided","ClinGen:CA128029,OMIM:102610.0003,UniProtKB:P68133#VAR_011680",00,-,-,-,-,AEFDBCI,D,0.75777,0.97594,D,0.99847,0.980561,0.99472,"0.9947553428377525,.","ACTA1,ACTA1",-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0.97744,0.740716,1.000000,0.71638,0.998000,0.85391,1.000000,0.68203,7.892000,0.85858,0.520000,0.23804,7.605000,0.61698,0.999560415744781
1,rs121909521,1:229433067-229433067,T,missense_variant,MODERATE,ACTA1,ENSG00000143632,Transcript,ENST00000366683.4,protein_coding,2/7,-,ENST00000366683.4:c.49G>A,ENSP00000355644.4:p.Gly17Ser,162,49,17,G/S,Ggc/Agc,"rs121909521,CM992116",C,C/G/T,-,-1,-,HGNC,HGNC:129,-,-,-,5,-,-,ENSP00000355644,-,A6NL76.108,UPI000C755200,-,deleterious_low_confidence(0),possibly_damaging(0.616),-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,pathogenic,1,11,1050851991851791522640717227580292742052329476420621480,-,-,-,-,-,0.829,0,Uncertain,0.6114285422903512,29.3,5.215657,likely_pathogenic,0.977,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,".,.,",".,.,",".,.,",".,.,",".,.,",".,.,",C/C,C,D,0.85812,0.339657,D,0.85626,0.250118,C/C,0.77442,0.99653432494021255,"D,D",0.98241,"0.907821,0.506088",C/C,"D,.",0.83878,"-11.803,.",-,-,-,-,"U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.",0.61118,"0.5415978678391965,.",6.02208,0.583422394626358,0.73752,6.645705,0.664133181902103,0.77286,"ENSG00000143632,ENSG00000143632","ENSP00000355645,ENSP00000355644","ENST00000366684,ENST00000366683",-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0.97292,"D,D","-4.35,-3.39",4.77,3.85,0.43556,0,0.19063,0.563428,-,-,-,-,-,3,0.00061,0.0,0,0.58350,0.63947,".,.","D,D",0.93285,"0.980002,0.980302",0.000000,0.84330,D,0.000000,D,0.96751,0.623916,-,".,.",0.99602,"0.996071408077,0.996071408077",D,0.97856,0.9349,"D,D",0.95115,"0.95741785,0.95741785",D,0.99965,1.1863,"G17S,G17S,","Loss_of_glycosylation_at_S16_(P_=_0.0243),Loss_of_glycosylation_at_S16_(P_=_0.0243),","P68133,P68133,",0.91657,"0.796,0.796,","M,.",0.58353,"2.1,.","G17S,G17S,G17S,G17S",0.58761,"simple_aae,simple_aae,simple_aae,simple_aae","D,D,D,D","0.999994,0.645566,0.999994,0.645566",0.55662,"D,N","-2.58,-2.09",D,0.97194,0.90714931488,10,13.3794,0.60203,0.1589:0.8411:0.0:0.0,-,-,-,-,"P68133,A6NL76","ACTS_HUMAN,A6NL76_HUMAN",0.90950,"0.8416897,.",0.90950,"0.8416897,.",0.95908,"0.94649154,.",0.95908,"0.94649154,.",C/C,1717,713,0.56348,C3711389,161800,98904,Pathogenic,NC_000001.11:g.229433067C>T,951759,"criteria_provided,_single_submitter",Actin_accumulation_myopathy,-,00,-,-,-,-,AEFDBCI,D,0.72930,0.97141,D,0.98935,0.966886,0.97183,"0.9719442852589366,.","ACTA1,ACTA1",-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0.97744,0.740716,1.000000,0.71638,0.998000,0.85391,1.000000,0.68203,7.892000,0.85858,0.520000,0.23804,7.605000,0.61698,0.994793236255646
2,rs121909521,1:229433067-229433067,G,missense_variant,MODERATE,ACTA1,ENSG00000143632,Transcript,ENST00000366684.7,protein_coding,2/7,-,ENST00000366684.7:c.49G>C,ENSP00000355645.3:p.Gly17Arg,152,49,17,G/R,Ggc/Cgc,"rs121909521,CM992116",C,C/G/T,-,-1,-,HGNC,HGNC:129,MANE_Select,NM_001100.4,-,1,P1,CCDS1578.1,ENSP00000355645,P68133.190,-,UPI0000000860,-,deleterious_low_confidence(0),-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,"likely_pathogenic,pathogenic",1,11,1050851991851791522640717227580292742052329476420621480,-,-,-,-,-,0.937,-2,Uncertain,0.6114285422903512,29.0,5.169387,likely_pathogenic,0.9989,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,".,.,",".,.,",".,.,",".,.,",".,.,",".,.,",C/C,C,D,0.97007,0.5807,D,0.96957,0.596359,C/C,0.77505,0.99654753831684295,"D,D",0.98979,"0.936956,0.608584",C/C,"D,.",0.81140,"-11.274,.",-,-,-,-,"U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.","P,.",0.67905,"0.6114285422903512,.",8.060451,0.7158844301797,0.83587,10.34018,0.871517802988444,0.90297,"ENSG00000143632,ENSG00000143632","ENSP00000355645,ENSP00000355644","ENST00000366684,ENST00000366683",-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0.97317,"D,D","-4.36,-3.44",4.77,3.85,0.43556,0,0.19063,0.563428,-,-,-,-,-,3,0.00061,0.0,0,0.58350,0.63947,".,.","D,D",0.94055,"0.982235,0.982402",0.000000,0.84330,D,0.000000,D,0.95926,0.555963,-,".,.",0.99485,"0.994903219395,0.994903219395",D,0.98958,0.9672,"D,D",0.98680,"0.98470706,0.98470706",D,0.99324,1.0921,"G17R,G17R,","Loss_of_glycosylation_at_S16_(P_=_0.0337),Loss_of_glycosylation_at_S16_(P_=_0.0337),","P68133,P68133,",0.97816,"0.901,0.901,","H,.",0.96631,"3.98,.","G17R,G17R,G17R,G17R",0.58761,"simple_aae,simple_aae,simple_aae,simple_aae","A,A,A,A","0.788915,0.999997,0.788915,0.999997",0.67359,"D,D","-3.43,-2.73",D,0.99310,0.936415195465,9,13.3794,0.60203,0.1589:0.8411:0.0:0.0,-,-,-,-,"P68133,A6NL76","ACTS_HUMAN,A6NL76_HUMAN",0.91962,"0.85679024,.",0.90648,"0.83711636,.",0.95732,"0.94474953,.",0.94812,"0.9357127,.",C/C,1717,713,0.56348,"C5830333,C3661900",620278,-,Pathogenic/Likely_pathogenic,NC_000001.11:g.229433067C>G,18281,"criteria_provided,_multiple_submitters,_no_conflicts","Congenital_myopathy_2c,_severe_infantile,_autosomal_dominant,not_provided","ClinGen:CA128029,OMIM:102610.0003,UniProtKB:P68133#VAR_011680",00,-,-,-,-,AEFDBCI,D,0.75777,0.97594,D,0.99847,0.980561,0.99472,"0.9947553428377525,.","ACTA1,ACTA1",-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0.97744,0.740716,1.000000,0.71638,0.998000,0.85391,1.000000,0.68203,7.892000,0.85858,0.520000,0.23804,7.605000,0.61698,0.999560415744781
3,rs121909521,1:229433067-229433067,T,missense_variant,MODERATE,ACTA1,ENSG00000143632,Transcript,ENST00000366684.7,protein_coding,2/7,-,ENST00000366684.7:c.49G>A,ENSP00000355645.3:p.Gly17Ser,152,49,17,G/S,Ggc/Agc,"rs121909521,CM992116",C,C/G/T,-,-1,-,HGNC,HGNC:129,MANE_Select,NM_001100.4,-,1,P1,CCDS1578.1,ENSP00000355645,P68133.190,-,UPI0000000860,-,deleterious_low_confidence(0),-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,pathogenic,1,11,1050851991851791522640717227580292742052329476420621480,-,-,-,-,-,0.829,0,Uncertain,0.6114285422903512,29.3,5.215657,likely_pathogenic,0.977,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,".,.,",".,.,",".,.,",".,.,",".,.,",".,.,",C/C,C,D,0.85812,0.339657,D,0.85626,0.250118,C/C,0.77442,0.99653432494021255,"D,D",0.98241,"0.907821,0.506088",C/C,"D,.",0.83878,"-11.803,.",-,-,-,-,"U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.","U,.",0.61118,"0.5415978678391965,.",6.02208,0.583422394626358,0.73752,6.645705,0.664133181902103,0.77286,"ENSG00000143632,ENSG00000143632","ENSP00000355645,ENSP00000355644","ENST00000366684,ENST00000366683",-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0.97292,"D,D","-4.35,-3.39",4.77,3.85,0.43556,0,0.19063,0.563428,-,-,-,-,-,3,0.00061,0.0,0,0.58350,0.63947,".,.","D,D",0.93285,"0.980002,0.980302",0.000000,0.84330,D,0.000000,D,0.96751,0.623916,-,".,.",0.99602,"0.996071408077,0.996071408077",D,0.97856,0.9349,"D,D",0.95115,"0.95741785,0.95741785",D,0.99965,1.1863,"G17S,G17S,","Loss_of_glycosylation_at_S16_(P_=_0.0243),Loss_of_glycosylation_at_S16_(P_=_0.0243),","P68133,P68133,",0.91657,"0.796,0.796,","M,.",0.58353,"2.1,.","G17S,G17S,G17S,G17S",0.58761,"simple_aae,simple_aae,simple_aae,simple_aae","D,D,D,D","0.999994,0.645566,0.999994,0.645566",0.55662,"D,N","-2.58,-2.09",D,0.97194,0.90714931488,10,13.3794,0.60203,0.1589:0.8411:0.0:0.0,-,-,-,-,"P68133,A6NL76","ACTS_HUMAN,A6NL76_HUMAN",0.90950,"0.8416897,.",0.90950,"0.8416897,.",0.95908,"0.94649154,.",0.95908,"0.94649154,.",C/C,1717,713,0.56348,C3711389,161800,98904,Pathogenic,NC_000001.11:g.229433067C>T,951759,"criteria_provided,_single_submitter",Actin_accumulation_myopathy,-,00,-,-,-,-,AEFDBCI,D,0.72930,0.97141,D,0.98935,0.966886,0.97183,"0.9719442852589366,.","ACTA1,ACTA1",-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,0,0.97744,0.740716,1.000000,0.71638,0.998000,0.85391,1.000000,0.68203,7.892000,0.85858,0.520000,0.23804,7.605000,0.61698,0.994793236255646
4,rs121909521,1:229433067-229433067,G,intron_variant,MODIFIER,ACTA1,ENSG00000143632,Transcript,ENST00000684723.1,protein_coding,-,1/5,ENST00000684723.1:c.-6-187G>C,-,-,-,-,-,-,"rs121909521,CM992116",C,C/G/T,-,-1,-,HGNC,HGNC:129,-,-,-,-,-,-,ENSP00000508084,-,A0A804HKV3.12,UPI0002C35578,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,"likely_pathogenic,pathogenic",1,11,1050851991851791522640717227580292742052329476420621480,-,-,-,-,-,-,-,-,-,29.0,5.169387,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-


In [5]:
VEP_output["SYMBOL"].nunique()

1469

In [6]:
VEP_output['#Uploaded_variation'].nunique()

2574

In [47]:
VEP_output.shape

(61256, 480)

In [53]:
VEP_output.columns

Index(['#Uploaded_variation', 'Location', 'Allele', 'Consequence', 'IMPACT',
       'SYMBOL', 'Gene', 'Feature_type', 'Feature', 'BIOTYPE',
       ...
       'phastCons17way_primate_rankscore', 'phastCons470way_mammalian',
       'phastCons470way_mammalian_rankscore', 'phyloP100way_vertebrate',
       'phyloP100way_vertebrate_rankscore', 'phyloP17way_primate',
       'phyloP17way_primate_rankscore', 'phyloP470way_mammalian',
       'phyloP470way_mammalian_rankscore', 'ClinPred'],
      dtype='object', length=480)

In [7]:
VEP_output[["SIFT", "PolyPhen"]].head(2)

Unnamed: 0,SIFT,PolyPhen
0,deleterious_low_confidence(0),probably_damaging(0.98)
1,deleterious_low_confidence(0),possibly_damaging(0.616)


The VEP output from dbSNP is quite messy (lot of useless columns, differing column names...), similar to ClinVar. To streamline the pipeline, I created a separate script to parse the output. The script primarily splits predictor columns into labels and scores, and renames certain columns to facilitate easier merging.

In [None]:
VEP_output_parsed = pd.read_csv('../data/humsavar/cleaned_Humsavar_dataset_parsed.txt', sep=',')

In [9]:
VEP_output_parsed.head(2)

Unnamed: 0,#Uploaded_variation,Location,Allele,Consequence,IMPACT,GeneSymbol,Gene,Feature_type,Feature,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,REF_ALLELE,HGNC_ID,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,SIFT_label,SIFT_score,PolyPhen_label,PolyPhen_score,AF,CLIN_SIG,SOMATIC,PHENO,PUBMED,TRANSCRIPTION_FACTORS,REVEL_score,BLOSUM62,EVE_label,EVE_score,CADD_PHRED_score,CADD_RAW_score,AM_label,AM_score,BayesDel_label,BayesDel_score,PrimateAI_label,PrimateAI_score,Uniprot_acc,Uniprot_entry,clinvar_id,clinvar_review,ClinPred_score
0,rs121909521,1:229433067-229433067,G,missense_variant,MODERATE,ACTA1,ENSG00000143632,Transcript,ENST00000366683.4,162,49,17,G/R,Ggc/Cgc,"rs121909521,CM992116",C,HGNC:129,ENSP00000355644,-,A6NL76.108,UPI000C755200,-,deleterious_low_confidence,0.0,probably_damaging,0.98,-,"likely_pathogenic,pathogenic",1,11,1050851991851791522640717227580292742052329476420621480,-,0.937,-2,Uncertain,0.6114285422903512,29.0,5.169387,likely_pathogenic,0.9989,D,0.5807,D,0.936415195465,"P68133,A6NL76","ACTS_HUMAN,A6NL76_HUMAN",18281,"criteria_provided,_multiple_submitters,_no_conflicts",0.999560415744781
1,rs121909521,1:229433067-229433067,T,missense_variant,MODERATE,ACTA1,ENSG00000143632,Transcript,ENST00000366683.4,162,49,17,G/S,Ggc/Agc,"rs121909521,CM992116",C,HGNC:129,ENSP00000355644,-,A6NL76.108,UPI000C755200,-,deleterious_low_confidence,0.0,possibly_damaging,0.616,-,pathogenic,1,11,1050851991851791522640717227580292742052329476420621480,-,0.829,0,Uncertain,0.6114285422903512,29.3,5.215657,likely_pathogenic,0.977,D,0.339657,D,0.90714931488,"P68133,A6NL76","ACTS_HUMAN,A6NL76_HUMAN",951759,"criteria_provided,_single_submitter",0.994793236255646


In [10]:
VEP_output_parsed[['SIFT_score', 'PolyPhen_score', 'CADD_RAW_score', 'CADD_PHRED_score', 'ClinPred_score', 'EVE_score', 'REVEL_score', 'PrimateAI_score', 'AM_score']].isna().sum()

SIFT_score          44465
PolyPhen_score      44866
CADD_RAW_score          0
CADD_PHRED_score        0
ClinPred_score          0
EVE_score               0
REVEL_score             0
PrimateAI_score         0
AM_score                0
dtype: int64

Some function to check coverage of results

In [22]:
def check_coverage(df):
    predictors = [i for i in df.columns if '_score' in i]
    adding = []
    predictor_groups = {'CADD': ['CADD_RAW_score', 'CADD_PHRED_score']}

    for predictor in predictors:
        if predictor in predictor_groups.get('CADD', []):
            coverage_type = 'RAW' if 'RAW' in predictor else 'PHRED'
            predictor_label = f'CADD ({coverage_type})'
        else:
            predictor_label = predictor.split('_')[0]
            if predictor_label == 'AM':
                predictor_label = 'AlphaMissense' 

        tmp = df[df[predictor].notna()]  
        coverage = round(100 * len(tmp) / len(df), 2)
        adding.append([predictor_label, coverage])

    table = pd.DataFrame(adding, columns=['Predictor', 'Coverage'])
    table = table.sort_values('Coverage', ascending=False).reset_index(drop=True)
    return table

In [12]:
check_coverage(VEP_output_parsed)

Unnamed: 0,Predictor,Coverage
0,EVE,100.0
1,REVEL,100.0
2,AlphaMissense,100.0
3,BayesDel,100.0
4,CADD (PHRED),100.0
5,CADD (RAW),100.0
6,PrimateAI,100.0
7,ClinPred,100.0
8,SIFT,27.41
9,PolyPhen,26.76


Finally, we need to merge the VEP output with the original Humsavar dataset. Since the input for VEP was a list of dbSNP IDs, we filter for these IDs. In cases where multiple rows match (which happens because VEP identifies several transcript IDs), we select the row with the fewest NaNs, as this is considered the most reliable.

In [77]:
#   python3 merging_Humsavar.py cleaned_Humsavar_dataset_parsed.txt humsavar_rsIDs.txt

And now we can compare how both datasets (ClinVar and Humsavar) differ in terms of variants.

In [2]:
humsavar = pd.read_csv('../data/humsavar/cleaned_Humsavar_dataset_with_preds.txt')
humsavar.head(3)

Unnamed: 0,#Uploaded_variation,Location,Allele,Consequence,IMPACT,GeneSymbol,Gene,Feature_type,Feature,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,REF_ALLELE,HGNC_ID,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,SIFT_label,SIFT_score,PolyPhen_label,PolyPhen_score,AF,CLIN_SIG,SOMATIC,PHENO,PUBMED,TRANSCRIPTION_FACTORS,REVEL_score,BLOSUM62,EVE_label,EVE_score,CADD_PHRED_score,CADD_RAW_score,AM_label,AM_score,BayesDel_label,BayesDel_score,PrimateAI_label,PrimateAI_score,Uniprot_acc,Uniprot_entry,clinvar_id,clinvar_review,ClinPred_score,NaN_count
0,rs138641544,1:94042785-94042785,A,missense_variant,MODERATE,ABCA4,ENSG00000198691,Transcript,ENST00000370225.4,3407,3304,1102,D/Y,Gat/Tat,"rs138641544,CM1213230,COSV64674089",C,HGNC:34,ENSP00000359245,P78363.219,-,UPI000012511C,-,deleterious_low_confidence,0.0,probably_damaging,0.98,-,"uncertain_significance,pathogenic",11,111,2242754220029649,-,0.973,-3,-,-,28.6,5.116382,likely_pathogenic,0.7141,D,0.325626,D,0.804868936539,P78363,ABCA4_HUMAN,1456034,"criteria_provided,_conflicting_classifications",0.938792994866256,0.0
1,rs76258939,1:94037332-94037332,G,missense_variant,MODERATE,ABCA4,ENSG00000198691,Transcript,ENST00000370225.4,3729,3626,1209,M/T,aTg/aCg,"rs76258939,HM080053",A,HGNC:34,ENSP00000359245,P78363.219,-,UPI000012511C,-,tolerated_low_confidence,0.99,benign,0.0,0.0090,"benign,likely_benign",1,11,2537794137510153,-,0.16,-1,-,-,6.791,0.635554,likely_benign,0.0833,T,-0.405625,T,0.344550430775,"F6TT59,P78363","F6TT59_HUMAN,ABCA4_HUMAN",143075,"criteria_provided,_multiple_submitters,_no_conflicts",0.0247329644211997,
2,rs185093512,1:94021904-94021904,A,missense_variant,MODERATE,ABCA4,ENSG00000198691,Transcript,ENST00000370225.4,4818,4715,1572,T/M,aCg/aTg,"rs185093512,CM020913,COSV64679706",G,HGNC:34,ENSP00000359245,P78363.219,-,UPI000012511C,-,deleterious_low_confidence,0.04,benign,0.322,0.0008,uncertain_significance,11,111,23982839,-,0.597,-1,-,-,19.07,2.517531,likely_benign,0.1055,T,0.00427494,T,0.435085892677,"F6TT59,P78363","F6TT59_HUMAN,ABCA4_HUMAN",497936,"criteria_provided,_multiple_submitters,_no_conflicts",0.0644443726981652,0.0


In [3]:
def extract_uniprot_id(Uniprot_acc, reviewed_ids):
    if pd.notna(Uniprot_acc):
        acc_ids = [acc.strip() for acc in Uniprot_acc.split(',')]
        for acc_id in acc_ids:
            if acc_id in reviewed_ids:
                return acc_id
    return None

In [4]:
with open('uniprotkb_reviewed_true_AND_organism_id_2025_04_09.list') as f:
    reviewed_ids = set(line.strip() for line in f if line.strip())

In [5]:
humsavar['UniprotID'] = humsavar.apply(lambda row: extract_uniprot_id(row['Uniprot_acc'], reviewed_ids), axis=1)

In [6]:
humsavar.head()

Unnamed: 0,#Uploaded_variation,Location,Allele,Consequence,IMPACT,GeneSymbol,Gene,Feature_type,Feature,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,REF_ALLELE,HGNC_ID,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,SIFT_label,SIFT_score,PolyPhen_label,PolyPhen_score,AF,CLIN_SIG,SOMATIC,PHENO,PUBMED,TRANSCRIPTION_FACTORS,REVEL_score,BLOSUM62,EVE_label,EVE_score,CADD_PHRED_score,CADD_RAW_score,AM_label,AM_score,BayesDel_label,BayesDel_score,PrimateAI_label,PrimateAI_score,Uniprot_acc,Uniprot_entry,clinvar_id,clinvar_review,ClinPred_score,NaN_count,UniprotID
0,rs138641544,1:94042785-94042785,A,missense_variant,MODERATE,ABCA4,ENSG00000198691,Transcript,ENST00000370225.4,3407,3304,1102,D/Y,Gat/Tat,"rs138641544,CM1213230,COSV64674089",C,HGNC:34,ENSP00000359245,P78363.219,-,UPI000012511C,-,deleterious_low_confidence,0.0,probably_damaging,0.98,-,"uncertain_significance,pathogenic",011,111,2242754220029649,-,0.973,-3,-,-,28.6,5.116382,likely_pathogenic,0.7141,D,0.325626,D,0.804868936539,P78363,ABCA4_HUMAN,1456034,"criteria_provided,_conflicting_classifications",0.938792994866256,0.0,P78363
1,rs76258939,1:94037332-94037332,G,missense_variant,MODERATE,ABCA4,ENSG00000198691,Transcript,ENST00000370225.4,3729,3626,1209,M/T,aTg/aCg,"rs76258939,HM080053",A,HGNC:34,ENSP00000359245,P78363.219,-,UPI000012511C,-,tolerated_low_confidence,0.99,benign,0.0,0.0090,"benign,likely_benign",01,11,2537794137510153,-,0.16,-1,-,-,6.791,0.635554,likely_benign,0.0833,T,-0.405625,T,0.344550430775,"F6TT59,P78363","F6TT59_HUMAN,ABCA4_HUMAN",143075,"criteria_provided,_multiple_submitters,_no_conflicts",0.0247329644211997,,P78363
2,rs185093512,1:94021904-94021904,A,missense_variant,MODERATE,ABCA4,ENSG00000198691,Transcript,ENST00000370225.4,4818,4715,1572,T/M,aCg/aTg,"rs185093512,CM020913,COSV64679706",G,HGNC:34,ENSP00000359245,P78363.219,-,UPI000012511C,-,deleterious_low_confidence,0.04,benign,0.322,0.0008,uncertain_significance,011,111,23982839,-,0.597,-1,-,-,19.07,2.517531,likely_benign,0.1055,T,0.00427494,T,0.435085892677,"F6TT59,P78363","F6TT59_HUMAN,ABCA4_HUMAN",497936,"criteria_provided,_multiple_submitters,_no_conflicts",0.0644443726981652,0.0,P78363
3,rs1571257969,1:94021390-94021390,A,missense_variant,MODERATE,ABCA4,ENSG00000198691,Transcript,ENST00000370225.4,4971,4868,1623,G/V,gGc/gTc,"rs1571257969,CM161726",C,HGNC:34,ENSP00000359245,P78363.219,-,UPI000012511C,-,deleterious_low_confidence,0.0,probably_damaging,0.995,-,likely_pathogenic,01,11,-,-,0.889,-3,-,-,28.6,5.113081,likely_pathogenic,0.9468,D,0.446068,T,0.728230714798,"F6TT59,P78363","F6TT59_HUMAN,ABCA4_HUMAN",806162,"criteria_provided,_single_submitter",0.999383926391602,0.0,P78363
4,rs201357151,1:94010822-94010822,A,missense_variant,MODERATE,ABCA4,ENSG00000198691,Transcript,ENST00000370225.4,5795,5692,1898,R/C,Cgc/Tgc,rs201357151,G,HGNC:34,ENSP00000359245,P78363.219,-,UPI000012511C,-,deleterious_low_confidence,0.0,benign,0.322,0.0002,uncertain_significance,-,1,2547434533375396,-,0.383,-3,-,-,21.5,3.137553,likely_benign,0.1062,T,-0.156535,T,0.204550027847,"F6TT59,P78363","F6TT59_HUMAN,ABCA4_HUMAN",1013243,"criteria_provided,_multiple_submitters,_no_conflicts",0.0601704447952564,0.0,P78363


In [7]:
reviewed_df = humsavar[humsavar['UniprotID'].notna()]

In [8]:
reviewed_df.shape

(2005, 51)

In [9]:
print(len(reviewed_df))
print(reviewed_df.UniprotID.nunique())
print(reviewed_df['GeneSymbol'].nunique())

2005
686
686


In [10]:
print(humsavar.groupby('GeneSymbol')['UniprotID'].nunique().eq(1).loc[lambda x: ~x])

GeneSymbol
-                False
ABHD1            False
ACTL9            False
ADH1B            False
ADRB1            False
AHI1-DT          False
AIP              False
ALDH1B1          False
ANKRD35          False
AP1B1            False
APOB             False
APOL2            False
ARHGEF25         False
ART4             False
ASCL3            False
ATRX             False
AURKA            False
AXIN2            False
B4GALNT1         False
BBS2             False
BCORL1           False
BRCA2            False
BTBD8            False
C14orf178        False
C5orf60          False
C8B              False
CALCR            False
CALCRL           False
CAPN1            False
CARD8            False
CCDC144NL-AS1    False
CCDC168          False
CCL15            False
CCL23            False
CD200R1          False
CDKL1            False
CDKN1C           False
CDSN             False
CEACAM21         False
CEACAM5          False
CEACAM6          False
CEACAM7          False
CEP126           False


These correspond to genes that are mapped to more than one Uniprot ID, which makes no sense. After, we show how reviewed_df has all its genes mapped to a single ID.

In [11]:
print(reviewed_df.groupby('GeneSymbol')['UniprotID'].nunique().eq(1).loc[lambda x: ~x])

Series([], Name: UniprotID, dtype: bool)


Finally, we proceed to remove those variants that were already retrieved on ClinVar, so we can just take new ones from Humsavar.

In [17]:
reviewed_df.shape

(2005, 51)

In [15]:
with open('/home/aitanadiaz/Desktop/TFG/Cleaning data/data/clinvar/clinvar_unique_uniprot_ids.txt') as f:
    clinvar_unique_ids = set(line.strip() for line in f if line.strip())

filtered_df = reviewed_df[~reviewed_df['UniprotID'].isin(clinvar_unique_ids)]

In [18]:
filtered_df.shape

(1108, 51)

In [20]:
#print(humsavar_no_clinvar.BinaryClinicalSignificance.value_counts())

print(len(filtered_df))
print(filtered_df.UniprotID.nunique())
print(filtered_df['GeneSymbol'].nunique())

1108
469
469


In [23]:
check_coverage(filtered_df)

Unnamed: 0,Predictor,Coverage
0,EVE,100.0
1,REVEL,100.0
2,AlphaMissense,100.0
3,BayesDel,100.0
4,CADD (PHRED),100.0
5,CADD (RAW),100.0
6,PrimateAI,100.0
7,ClinPred,100.0
8,SIFT,99.55
9,PolyPhen,98.83


In [24]:
humsavar_no_clinvar=filtered_df.reset_index(drop=True)
humsavar_no_clinvar.to_csv('../data/humsavar/humsavar_no_clinvar.csv', index=0)

As to calculate performance metrics we need the true pathogenicity values of the variants, the Variant category column must be added.

In [None]:
humsavar_withVarCat = pd.read_csv('/home/aitanadiaz/Desktop/TFG/Cleaning data/data/humsavar/humsavar_20212025_v2.csv', sep=',')

In [36]:
humsavar_withVarCat.head()

Unnamed: 0,Main gene name,Uniprot,FTId,AA change,Variant category,dbSNP,Disease name,Source,Variant,BinaryClinicalSignificance,MANE-Select ID,Ensembl ID,Protein ID,RefSeq Nucleotide ID,RefSeq Protein ID,Error
0,AARS1,P49588,VAR_089576,p.Arg326Trp,LP/P,-,"Charcot-Marie-Tooth disease, axonal, 2N (CMT2N) [MIM:613287]",2025,R326W,P,ENST00000261772.13,ENST00000261772,ENSP00000261772.8,NM_001605.3,NP_001596.2,
1,AARS1,P49588,VAR_089577,p.Thr606Ile,LP/P,-,"Leukoencephalopathy, hereditary diffuse, with spheroids 2 (HDLS2) [MIM:619661]",2025,T606I,P,ENST00000261772.13,ENST00000261772,ENSP00000261772.8,NM_001605.3,NP_001596.2,
2,AARS1,P49588,VAR_089578,p.Ser698Phe,LP/P,-,"Charcot-Marie-Tooth disease, axonal, 2N (CMT2N) [MIM:613287]",2025,S698F,P,ENST00000261772.13,ENST00000261772,ENSP00000261772.8,NM_001605.3,NP_001596.2,
3,ABCA4,P78363,VAR_084908,p.Asp1102Tyr,LB/B,rs138641544,,2025,D1102Y,B,ENST00000370225.4,ENST00000370225,ENSP00000359245.3,NM_000350.3,NP_000341.2,
4,ABCA4,P78363,VAR_084916,p.Gly1203Asp,LP/P,-,Stargardt disease 1 (STGD1) [MIM:248200],2025,G1203D,P,ENST00000370225.4,ENST00000370225,ENSP00000359245.3,NM_000350.3,NP_000341.2,


In [37]:
humsavar_withVarCat.shape

(3824, 16)

In [None]:
hum_no_clin = pd.read_csv('/home/aitanadiaz/Desktop/TFG/Cleaning data/data/humsavar/humsavar_no_clinvar.csv', sep=',')

In [38]:
hum_no_clin.head()

Unnamed: 0,#Uploaded_variation,Location,Allele,Consequence,IMPACT,GeneSymbol,Gene,Feature_type,Feature,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,REF_ALLELE,HGNC_ID,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,SIFT_label,SIFT_score,PolyPhen_label,PolyPhen_score,AF,CLIN_SIG,SOMATIC,PHENO,PUBMED,TRANSCRIPTION_FACTORS,REVEL_score,BLOSUM62,EVE_label,EVE_score,CADD_PHRED_score,CADD_RAW_score,AM_label,AM_score,BayesDel_label,BayesDel_score,PrimateAI_label,PrimateAI_score,Uniprot_acc,Uniprot_entry,clinvar_id,clinvar_review,ClinPred_score,NaN_count,UniprotID
0,rs11610050,12:107651687-107651687,A,missense_variant,MODERATE,ABTB3,ENSG00000151136,Transcript,ENST00000280758.10,3512,3005,1002,A/D,gCt/gAt,rs11610050,C,HGNC:23844,ENSP00000280758,A6QL63.129,-,UPI000051901E,A6QL63-1,deleterious_low_confidence,0.0,probably_damaging,0.972,-,-,-,-,-,-,0.649,-2,-,-,31.0,5.339176,likely_pathogenic,0.9994,D,0.356338,D,0.848108232021,"A6QL63,A6QL63-2,A6QL63-5,A6QL63-4","BTBDB_HUMAN,BTBDB_HUMAN,BTBDB_HUMAN,BTBDB_HUMAN",-,-,0.999407649040222,0.0,A6QL63
1,rs12303478,12:107657629-107657629,A,missense_variant,MODERATE,ABTB3,ENSG00000151136,Transcript,ENST00000280758.10,3733,3226,1076,G/S,Ggc/Agc,"rs12303478,COSV104383452",G,HGNC:23844,ENSP00000280758,A6QL63.129,-,UPI000051901E,A6QL63-1,tolerated_low_confidence,0.09,benign,0.049,0.1048,-,01,01,-,-,0.072,0,-,-,22.4,3.396815,likely_benign,0.0921,T,-0.583583,T,0.613067984581,"A6QL63,A6QL63-2,A6QL63-5,A6QL63-4","BTBDB_HUMAN,BTBDB_HUMAN,BTBDB_HUMAN,BTBDB_HUMAN",-,-,0.0157182390193226,0.0,A6QL63
2,rs1558781,12:107543991-107543991,A,missense_variant,MODERATE,ABTB3,ENSG00000151136,Transcript,ENST00000280758.10,1849,1342,448,G/S,Ggc/Agc,"rs1558781,COSV99774826",G,HGNC:23844,ENSP00000280758,A6QL63.129,-,UPI000051901E,A6QL63-1,tolerated_low_confidence,0.43,benign,0.022,0.1763,-,01,01,-,-,0.102,0,-,-,22.3,3.373019,likely_benign,0.0774,T,-0.626984,T,0.485189527273,"A6QL63,A6QL63-2,A6QL63-3,H0YHR1,C9JV03","BTBDB_HUMAN,BTBDB_HUMAN,BTBDB_HUMAN,H0YHR1_HUMAN,C9JV03_HUMAN",-,-,0.0154826749531858,0.0,A6QL63
3,rs200582844,2:236581237-236581237,A,missense_variant,MODERATE,ACKR3,ENSG00000144476,Transcript,ENST00000272928.4,898,772,258,V/M,Gtg/Atg,"rs200582844,COSV56021506",G,HGNC:23692,ENSP00000272928,P25106.214,-,UPI000013D989,-,deleterious,0.03,probably_damaging,0.978,-,"pathogenic,not_provided",01,11,31211835,-,0.494,1,-,-,26.3,4.697178,ambiguous,0.5384,D,0.143901,T,0.672353982925,P25106,ACKR3_HUMAN,585134,no_assertion_criteria_provided,0.863086182351898,0.0,P25106
4,rs767323284,13:76955344-76955344,T,missense_variant,MODERATE,ACOD1,ENSG00000102794,Transcript,ENST00000377462.6,338,290,97,T/M,aCg/aTg,"rs767323284,COSV66291594",C,HGNC:33904,ENSP00000366682,A6NK06.110,-,UPI000015C60D,-,tolerated,0.14,probably_damaging,0.958,-,-,01,01,31548418,-,0.613,-1,-,-,24.4,4.13592,ambiguous,0.3837,D,0.316682,T,0.677448093891,"A6NK06,H7C1Q4","IRG1_HUMAN,H7C1Q4_HUMAN",-,-,0.957577884197235,0.0,A6NK06


In [None]:
hum_no_clin["#Uploaded_variation"].nunique()

1108

In [None]:
hum_no_clin.shape

(1108, 51)

In [None]:
humsavar_withVarCat['dbSNP'] = humsavar_withVarCat['dbSNP'].astype(str)
hum_no_clin['#Uploaded_variation'] = hum_no_clin['#Uploaded_variation'].astype(str)

merged_df = pd.merge(
    humsavar_withVarCat,
    hum_no_clin,
    left_on='dbSNP',
    right_on='#Uploaded_variation',
    how='right'
)

In [None]:
merged_df.head()

Unnamed: 0,Main gene name,Uniprot,FTId,AA change,Variant category,dbSNP,Disease name,Source,Variant,BinaryClinicalSignificance,MANE-Select ID,Ensembl ID,Protein ID,RefSeq Nucleotide ID,RefSeq Protein ID,Error,#Uploaded_variation,Location,Allele,Consequence,IMPACT,GeneSymbol,Gene,Feature_type,Feature,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,REF_ALLELE,HGNC_ID,ENSP,SWISSPROT,TREMBL,UNIPARC,UNIPROT_ISOFORM,SIFT_label,SIFT_score,PolyPhen_label,PolyPhen_score,AF,CLIN_SIG,SOMATIC,PHENO,PUBMED,TRANSCRIPTION_FACTORS,REVEL_score,BLOSUM62,EVE_label,EVE_score,CADD_PHRED_score,CADD_RAW_score,AM_label,AM_score,BayesDel_label,BayesDel_score,PrimateAI_label,PrimateAI_score,Uniprot_acc,Uniprot_entry,clinvar_id,clinvar_review,ClinPred_score,NaN_count,UniprotID
0,ABTB3,A6QL63,VAR_042534,p.Ala1002Asp,LB/B,rs11610050,,2025,A1002D,B,ENST00000280758.10,ENST00000280758,ENSP00000280758.5,NM_001018072.2,NP_001018082.1,,rs11610050,12:107651687-107651687,A,missense_variant,MODERATE,ABTB3,ENSG00000151136,Transcript,ENST00000280758.10,3512,3005,1002,A/D,gCt/gAt,rs11610050,C,HGNC:23844,ENSP00000280758,A6QL63.129,-,UPI000051901E,A6QL63-1,deleterious_low_confidence,0.0,probably_damaging,0.972,-,-,-,-,-,-,0.649,-2,-,-,31.0,5.339176,likely_pathogenic,0.9994,D,0.356338,D,0.848108232021,"A6QL63,A6QL63-2,A6QL63-5,A6QL63-4","BTBDB_HUMAN,BTBDB_HUMAN,BTBDB_HUMAN,BTBDB_HUMAN",-,-,0.999407649040222,0.0,A6QL63
1,ABTB3,A6QL63,VAR_042535,p.Gly1076Ser,LB/B,rs12303478,,2025,G1076S,B,ENST00000280758.10,ENST00000280758,ENSP00000280758.5,NM_001018072.2,NP_001018082.1,,rs12303478,12:107657629-107657629,A,missense_variant,MODERATE,ABTB3,ENSG00000151136,Transcript,ENST00000280758.10,3733,3226,1076,G/S,Ggc/Agc,"rs12303478,COSV104383452",G,HGNC:23844,ENSP00000280758,A6QL63.129,-,UPI000051901E,A6QL63-1,tolerated_low_confidence,0.09,benign,0.049,0.1048,-,01,01,-,-,0.072,0,-,-,22.4,3.396815,likely_benign,0.0921,T,-0.583583,T,0.613067984581,"A6QL63,A6QL63-2,A6QL63-5,A6QL63-4","BTBDB_HUMAN,BTBDB_HUMAN,BTBDB_HUMAN,BTBDB_HUMAN",-,-,0.0157182390193226,0.0,A6QL63
2,ABTB3,A6QL63,VAR_055560,p.Gly448Ser,LB/B,rs1558781,,2025,G448S,B,ENST00000280758.10,ENST00000280758,ENSP00000280758.5,NM_001018072.2,NP_001018082.1,,rs1558781,12:107543991-107543991,A,missense_variant,MODERATE,ABTB3,ENSG00000151136,Transcript,ENST00000280758.10,1849,1342,448,G/S,Ggc/Agc,"rs1558781,COSV99774826",G,HGNC:23844,ENSP00000280758,A6QL63.129,-,UPI000051901E,A6QL63-1,tolerated_low_confidence,0.43,benign,0.022,0.1763,-,01,01,-,-,0.102,0,-,-,22.3,3.373019,likely_benign,0.0774,T,-0.626984,T,0.485189527273,"A6QL63,A6QL63-2,A6QL63-3,H0YHR1,C9JV03","BTBDB_HUMAN,BTBDB_HUMAN,BTBDB_HUMAN,H0YHR1_HUMAN,C9JV03_HUMAN",-,-,0.0154826749531858,0.0,A6QL63
3,ACKR3,P25106,VAR_085335,p.Val258Met,LP/P,rs200582844,Oculomotor-abducens synkinesis (OCABSN) [MIM:619215],2025,V258M,P,ENST00000272928.4,ENST00000272928,ENSP00000272928.3,NM_020311.3,NP_064707.1,,rs200582844,2:236581237-236581237,A,missense_variant,MODERATE,ACKR3,ENSG00000144476,Transcript,ENST00000272928.4,898,772,258,V/M,Gtg/Atg,"rs200582844,COSV56021506",G,HGNC:23692,ENSP00000272928,P25106.214,-,UPI000013D989,-,deleterious,0.03,probably_damaging,0.978,-,"pathogenic,not_provided",01,11,31211835,-,0.494,1,-,-,26.3,4.697178,ambiguous,0.5384,D,0.143901,T,0.672353982925,P25106,ACKR3_HUMAN,585134,no_assertion_criteria_provided,0.863086182351898,0.0,P25106
4,ACOD1,A6NK06,VAR_086754,p.Thr97Met,LB/B,rs767323284,,2025,T97M,B,ENST00000377462.6,ENST00000377462,ENSP00000366682.1,NM_001258406.2,NP_001245335.1,,rs767323284,13:76955344-76955344,T,missense_variant,MODERATE,ACOD1,ENSG00000102794,Transcript,ENST00000377462.6,338,290,97,T/M,aCg/aTg,"rs767323284,COSV66291594",C,HGNC:33904,ENSP00000366682,A6NK06.110,-,UPI000015C60D,-,tolerated,0.14,probably_damaging,0.958,-,-,01,01,31548418,-,0.613,-1,-,-,24.4,4.13592,ambiguous,0.3837,D,0.316682,T,0.677448093891,"A6NK06,H7C1Q4","IRG1_HUMAN,H7C1Q4_HUMAN",-,-,0.957577884197235,0.0,A6NK06


In [None]:
merged_df.shape

(1116, 67)

In [None]:
duplicated_snps = merged_df[merged_df.duplicated('dbSNP', keep=False)]
print(duplicated_snps[['dbSNP']])

             dbSNP
307      rs2072648
308      rs2072648
332   rs1757708758
333   rs1757708758
726      rs7255187
727      rs7255187
728      rs7255187
737   rs1684813071
738   rs1684813071
988      rs1057149
989      rs1057149
1011  rs1385657144
1012  rs1385657144
1096  rs1965499910
1097  rs1965499910


In [None]:
cleaned_df = merged_df.drop_duplicates(subset='dbSNP', keep='first')

In [None]:
cleaned_df.shape

(1108, 67)

In [None]:
cleaned_df.to_csv('/home/aitanadiaz/Desktop/TFG/Cleaning data/data/humsavar/cleaned_Humsavar_with_preds_FINAL.csv', index=0)