In [1]:
import pickle
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import warnings

In [2]:
warnings.filterwarnings('ignore')

def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
seed_everything(1234)

In [3]:
df_embeds = pd.read_csv('../../classical_ML/data/Orthrus_data/orthrus_features_clean.tsv.gz', sep='\t')
df_embeds.dropna(subset = ['embed0'], inplace=True)
embed_cols = [f'embed{i}' for i in range(512)]
df_embeds = df_embeds[['Transcript ID']+embed_cols]
df_embeds.rename(columns={"Transcript ID": "Transcript_ID"}, inplace=True)
df_embeds

Unnamed: 0,Transcript_ID,embed0,embed1,embed2,embed3,embed4,embed5,embed6,embed7,embed8,...,embed502,embed503,embed504,embed505,embed506,embed507,embed508,embed509,embed510,embed511
0,ENST00000263100,0.122282,0.177534,-0.048803,-0.165734,-0.034072,-0.264644,-0.057117,-0.227631,-0.529456,...,-0.041493,0.312705,0.183835,0.282550,0.097991,0.247955,-0.156811,0.038840,-0.064718,-0.006802
1,ENST00000373997,0.342975,0.134082,0.039255,-0.224912,-0.052249,-0.148360,-0.062987,-0.092132,-0.442369,...,-0.327766,0.412198,0.237658,0.315500,0.118175,0.254118,-0.152559,0.185892,-0.129714,0.020999
2,ENST00000318602,0.331549,0.211808,-0.040499,-0.144805,-0.050687,-0.184935,0.127531,-0.119810,-0.621050,...,-0.248826,0.385974,0.509238,0.150244,0.153647,0.098654,-0.023546,0.113986,-0.099528,0.030393
3,ENST00000299698,0.337860,0.260349,-0.019156,-0.144985,-0.045211,-0.194701,0.104279,-0.138256,-0.562558,...,-0.233801,0.381406,0.526234,0.110568,0.128577,0.106606,-0.047536,0.093703,-0.121776,-0.045927
4,ENST00000442999,0.120457,0.181139,0.107973,-0.104078,-0.039713,-0.133477,0.049908,-0.193681,-0.174398,...,-0.172032,0.451827,0.141332,0.319514,0.230903,0.197436,-0.121235,0.083669,-0.072020,-0.016692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18479,ENST00000371528,0.189914,0.113874,0.061029,-0.105412,-0.031373,-0.201013,-0.110164,-0.084314,-0.437392,...,-0.185700,0.380479,0.177239,0.300294,0.138531,0.178986,-0.119901,0.136907,-0.033310,-0.126740
18480,ENST00000294353,0.223297,0.086196,0.052600,-0.128977,-0.027397,-0.177775,-0.117537,-0.079050,-0.434602,...,-0.188737,0.404469,0.187927,0.321878,0.120758,0.228710,-0.140578,0.149540,0.003069,-0.073322
18481,ENST00000322764,0.254627,0.138448,0.156764,-0.182271,0.062424,-0.127684,-0.138878,-0.132653,-0.478736,...,-0.293639,0.455482,0.232366,0.347795,0.366158,0.279273,0.026741,0.107467,0.051708,-0.081295
18482,ENST00000381638,0.442461,0.263657,-0.102592,-0.080843,0.024760,-0.054935,0.484756,-0.382065,-0.816725,...,-0.153948,0.554390,1.176475,-0.155350,0.156531,0.152959,0.140080,-0.022837,-0.029495,-0.261842


In [4]:
df_unq = pd.read_csv('../data/tcga_annotated_clean4ML.tsv.gz', sep='\t')
df_unq = df_unq.merge(df_embeds, how='left')
df_unq

Unnamed: 0,CHROM,POS,REF,ALT,Transcript_ID,HGVSc,KIM_PTC_to_start_codon,KIM_upstream_exon_count,KIM_downstream_exon_count,KIM_last_exon,...,embed502,embed503,embed504,embed505,embed506,embed507,embed508,embed509,embed510,embed511
0,1,944753,C,T,ENST00000327044,ENST00000327044:c.2191C>T,2193,18,0,1,...,-0.266964,0.572462,0.137608,0.364898,0.278375,0.192540,-0.055143,0.177699,-0.033967,-0.119414
1,1,952113,G,A,ENST00000327044,ENST00000327044:c.1218G>A,1218,10,8,0,...,-0.266964,0.572462,0.137608,0.364898,0.278375,0.192540,-0.055143,0.177699,-0.033967,-0.119414
2,1,1255304,G,T,ENST00000349431,ENST00000349431:c.679G>T,681,6,0,1,...,-0.255506,0.340774,0.216285,0.367062,0.385427,0.249874,-0.126066,0.061456,-0.044280,0.038091
3,1,1338573,G,T,ENST00000378888,ENST00000378888:c.1288G>T,1290,11,3,0,...,-0.230311,0.540800,0.241533,0.349425,0.209922,0.237617,-0.086651,0.132743,-0.047608,-0.005701
4,1,1387314,C,T,ENST00000400809,ENST00000400809:c.1480C>T,1482,10,0,1,...,-0.111227,0.485855,0.080609,0.412174,0.167230,0.145487,-0.071526,0.118111,-0.018258,0.048301
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4013,X,152920717,C,T,ENST00000370268,ENST00000370268:c.622C>T,624,8,14,0,...,,,,,,,,,,
4014,X,153650171,G,T,ENST00000342782,ENST00000342782:c.1021G>T,1023,3,0,1,...,-0.196150,0.426428,0.199716,0.310246,0.163403,0.159778,-0.084737,0.111034,-0.035579,-0.101326
4015,X,154030948,C,T,ENST00000303391,ENST00000303391:c.880C>T,882,3,0,1,...,-0.123834,0.466796,0.185562,0.355276,0.198612,0.238267,-0.030854,0.173761,0.059287,-0.104817
4016,X,154354015,C,A,ENST00000369850,ENST00000369850:c.5586C>A,5586,34,13,0,...,-0.152846,0.604602,1.504313,-0.309990,0.027346,0.059884,0.179337,-0.128790,0.015879,-0.266402


In [5]:
prot_feat_df = pd.read_csv('../data/protein_AA_features.tsv.gz', sep='\t')
prot_feat_df.rename(columns={'MANE-Select':'Transcript_ID'}, inplace=True)
prot_feat_df.drop(columns=['Unnamed: 0', 'Entry'], axis=1, inplace=True)
prot_feat_df

Unnamed: 0,Transcript_ID,actual_or_pred_ACT_SITE,annotation_actual_ACT_SITE,annotation_pred_ACT_SITE,actual_or_pred_BINDING,annotation_actual_BINDING,annotation_pred_BINDING,actual_or_pred_COILED,annotation_actual_COILED,annotation_pred_COILED,...,annotation_pred_TRANSIT,actual_or_pred_TRANSMEM,annotation_actual_TRANSMEM,annotation_pred_TRANSMEM,actual_or_pred_TURN,annotation_actual_TURN,annotation_pred_TURN,actual_or_pred_ZN_FING,annotation_actual_ZN_FING,annotation_pred_ZN_FING
0,ENST00000436697,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000001100100000...,...,1111000000000000000000000000000000000000000000...,actual,0000111111111111111111111000000000000000000000...,,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...
1,ENST00000709217,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,...,0000000000000000000000000000000000000000000000...,actual,0000000000000000000000000000000000000000000000...,,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...
2,ENST00000374922,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,...,1111111000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...
3,ENST00000637218,actual,0000000000000000000000000000000000000000000000...,,actual,0000000000000000000000000000000000000000000000...,,pred,,0000000000000000000000000000000000000000000000...,...,1111111111111111111111000000000000000000000000...,actual,0000000000000000000000000000000000000000000000...,,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...
4,ENST00000469902,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,...,0000000000000000000000000000000000000000000000...,actual,0000000000000000000000000000000000001111111111...,,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18412,ENST00000295896,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,...,1111111111111111111111100000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...
18413,ENST00000295898,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,...,1111111111011001000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...
18414,ENST00000306862,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,...,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...
18415,ENST00000450660,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,...,1111111111111111111111100000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...,pred,,0000000000000000000000000000000000000000000000...


In [6]:
# Calculate the proportion of lost features upon stop-gain

df_unq = df_unq.merge(prot_feat_df, how='left')

prot_feat_list = ['ACT_SITE', 'BINDING', 'COILED', 'COMPBIAS', 'DISULFID', 'BINDING', 'DOMAIN', 
                 'HELIX', 'MOD_RES', 'MOTIF', 'PROPEP', 'REGION', 'REPEAT', 'SIGNAL', 'STRAND',
                 'TOPO_DOM', 'TRANSIT', 'TRANSMEM', 'TURN', 'ZN_FING']

for prot_feat in prot_feat_list:
    df_unq[f'prop_lost_{prot_feat}'] = None

for idx, row in df_unq.iterrows():
    
    for prot_feat_name in prot_feat_list:
        temp_pred_or_actual = row[f'actual_or_pred_{prot_feat_name}']
        
        if not pd.isna(temp_pred_or_actual):
            
            prot_feat_binary = row[f'annotation_{temp_pred_or_actual}_{prot_feat_name}']
            
            if row['Protein_position'] <= len(prot_feat_binary):
                prot_feat_binary_lost = prot_feat_binary[row['Protein_position']:]
                df_unq.loc[idx, f'prop_lost_{prot_feat_name}'] = prot_feat_binary_lost.count('1') / len(prot_feat_binary)

prot_lost_feat_list = [f'prop_lost_{x}' for x in prot_feat_list]

In [7]:
def apply_baseline_rules(df, exon_len_threshold=407, penultimate_threshold=55, start_threshold=100):

    # long exon rule
    df['long_exon'] = 0
    df.loc[df['current_exon_len'] > exon_len_threshold, 'long_exon'] = 1

    # penultimate exon rule
    df['penultimate_flag'] = 0
    df.loc[
        (df['DIST_FROM_LAST_EXON'] < penultimate_threshold) &
        (df['DIST_FROM_LAST_EXON'] >= 0), 
        'penultimate_flag'
    ] = 1

    # close to start rule
    df['close_to_start'] = 0
    df.loc[df['CDS_position'] < start_threshold, 'close_to_start'] = 1

    return df


In [8]:
# last exon rule
df_unq['last_exon'] = 0
df_unq.loc[df_unq['current_exon_number']==df_unq['total_exon_numbers'], 'last_exon'] = 1

df_unq = apply_baseline_rules(df_unq, exon_len_threshold=355, penultimate_threshold=49, start_threshold=120) #based on grid search
df_unq

Unnamed: 0,CHROM,POS,REF,ALT,Transcript_ID,HGVSc,KIM_PTC_to_start_codon,KIM_upstream_exon_count,KIM_downstream_exon_count,KIM_last_exon,...,prop_lost_STRAND,prop_lost_TOPO_DOM,prop_lost_TRANSIT,prop_lost_TRANSMEM,prop_lost_TURN,prop_lost_ZN_FING,last_exon,long_exon,penultimate_flag,close_to_start
0,1,944753,C,T,ENST00000327044,ENST00000327044:c.2191C>T,2193,18,0,1,...,0.0,0.024032,0.0,0.0,0.0,0.0,1,1,0,0
1,1,952113,G,A,ENST00000327044,ENST00000327044:c.1218G>A,1218,10,8,0,...,0.0,0.457944,0.0,0.05474,0.0,0.0,0,0,0,0
2,1,1255304,G,T,ENST00000349431,ENST00000349431:c.679G>T,681,6,0,1,...,0.0,0.046332,0.0,0.07722,0.0,0.0,1,1,0,0
3,1,1338573,G,T,ENST00000378888,ENST00000378888:c.1288G>T,1290,11,3,0,...,0.0,0.381295,0.0,0.020144,0.0,0.0,0,0,0,0
4,1,1387314,C,T,ENST00000400809,ENST00000400809:c.1480C>T,1482,10,0,1,...,0.0,0.05,0.0,0.0,0.0,0.0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4013,X,152920717,C,T,ENST00000370268,ENST00000370268:c.622C>T,624,8,14,0,...,,,,,,,0,0,0,0
4014,X,153650171,G,T,ENST00000342782,ENST00000342782:c.1021G>T,1023,3,0,1,...,0.0,0.111979,0.0,0.0,0.0,0.0,1,1,0,0
4015,X,154030948,C,T,ENST00000303391,ENST00000303391:c.880C>T,882,3,0,1,...,,,,,,,1,1,0,0
4016,X,154354015,C,A,ENST00000369850,ENST00000369850:c.5586C>A,5586,34,13,0,...,0.144314,0.296562,0.0,0.0,0.005667,0.0,0,0,0,0


In [9]:
KIM_features = ['KIM_PTC_to_start_codon', 'KIM_upstream_exon_count',
       'KIM_downstream_exon_count', 'KIM_last_exon', 'KIM_50nt_to_last_EJ',
       'KIM_PTC_exon_length', 'KIM_PTC_to_intron', 'KIM_dist_to_stop_codon',
       'KIM_mRNA_half_life', 'KIM_AF', 'KIM_LOEUF', 'KIM_5UTR_length',
       'KIM_3UTR_length', 'KIM_Transcript_length']

baseline_features = ['last_exon', 'long_exon', 'penultimate_flag', 'close_to_start'] 

new_features = [ "hl", "mrl",
         "cDNA_position", "CDS_position", "Protein_position",
         "PERCENTILE", "GERP_DIST", "BP_DIST",
         "DIST_FROM_LAST_EXON", 'single_exon',
         "current_exon_number", "total_exon_numbers",
         "utr5_len", "total_cds_len", "current_exon_len",  
         "VEST4_score", "CADD_phred", 
         "phyloP100way_vertebrate", 
         "dn_ds", "abundance", 
         "shet", "lof.oe_ci.upper", "lof.pRec",        
         "CDS_GC", "UTR3_GC", "UTR5_GC",
         "connectedness"] 

additional_features = [
    "LoF_HC",
    "gnomad41_genome_AF", "gnomad41_exome_AF",
    "NearestExonJB_dist", 
    "TF", "tau",
    "phyloP17way_primate", "phyloP470way_mammalian",
    "phastCons100way_vertebrate","phastCons17way_primate","phastCons470way_mammalian",
    "mis.z_score", "syn.z_score",
    "lof.pNull",  "lof.pLI"  ,
    "betweenness", "exp_var",
    "utr3_len", "total_exons_len",
    "fathmm-XF_coding_score",
    "Nucleus", "Exosome", "Cytosol", "Cytoplasm",
    "Ribosome", "Membrane", "Endoplasmic_reticulum"
]

my_features = baseline_features + new_features + prot_lost_feat_list + additional_features + embed_cols

### Splits

In [11]:
test_var_ids = df_unq[df_unq['CHROM'].isin(['20', '21', '22'])]['HGVSc'].values.tolist()
test_var_ids = list(set(test_var_ids))
test_var_ids[0:5], len(test_var_ids)

(['ENST00000405938:c.1554G>A',
  'ENST00000380903:c.1531C>T',
  'ENST00000398022:c.1887C>A',
  'ENST00000359568:c.8374C>T',
  'ENST00000352957:c.163C>T'],
 224)

In [12]:
val_var_ids = df_unq[df_unq['CHROM']=='19']['HGVSc'].values.tolist()
val_var_ids = list(set(val_var_ids))
val_var_ids[0:5], len(val_var_ids)

(['ENST00000546361:c.2152C>T',
  'ENST00000338128:c.1549C>T',
  'ENST00000292123:c.1640C>G',
  'ENST00000599848:c.2245C>T',
  'ENST00000248244:c.421C>T'],
 204)

In [13]:
train_var_ids = list(set(df_unq.HGVSc.values.tolist()) - set(val_var_ids) - set(test_var_ids))
train_var_ids[0:5], len(train_var_ids)

(['ENST00000264235:c.499C>T',
  'ENST00000415136:c.172G>T',
  'ENST00000360310:c.3586C>T',
  'ENST00000317147:c.1039G>T',
  'ENST00000556440:c.2152C>T'],
 3590)

### Feature selection

In [14]:
df_unq_train = df_unq[df_unq['HGVSc'].isin(train_var_ids)]
df_unq_train

Unnamed: 0,CHROM,POS,REF,ALT,Transcript_ID,HGVSc,KIM_PTC_to_start_codon,KIM_upstream_exon_count,KIM_downstream_exon_count,KIM_last_exon,...,prop_lost_STRAND,prop_lost_TOPO_DOM,prop_lost_TRANSIT,prop_lost_TRANSMEM,prop_lost_TURN,prop_lost_ZN_FING,last_exon,long_exon,penultimate_flag,close_to_start
0,1,944753,C,T,ENST00000327044,ENST00000327044:c.2191C>T,2193,18,0,1,...,0.0,0.024032,0.0,0.0,0.0,0.0,1,1,0,0
1,1,952113,G,A,ENST00000327044,ENST00000327044:c.1218G>A,1218,10,8,0,...,0.0,0.457944,0.0,0.05474,0.0,0.0,0,0,0,0
2,1,1255304,G,T,ENST00000349431,ENST00000349431:c.679G>T,681,6,0,1,...,0.0,0.046332,0.0,0.07722,0.0,0.0,1,1,0,0
3,1,1338573,G,T,ENST00000378888,ENST00000378888:c.1288G>T,1290,11,3,0,...,0.0,0.381295,0.0,0.020144,0.0,0.0,0,0,0,0
4,1,1387314,C,T,ENST00000400809,ENST00000400809:c.1480C>T,1482,10,0,1,...,0.0,0.05,0.0,0.0,0.0,0.0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4013,X,152920717,C,T,ENST00000370268,ENST00000370268:c.622C>T,624,8,14,0,...,,,,,,,0,0,0,0
4014,X,153650171,G,T,ENST00000342782,ENST00000342782:c.1021G>T,1023,3,0,1,...,0.0,0.111979,0.0,0.0,0.0,0.0,1,1,0,0
4015,X,154030948,C,T,ENST00000303391,ENST00000303391:c.880C>T,882,3,0,1,...,,,,,,,1,1,0,0
4016,X,154354015,C,A,ENST00000369850,ENST00000369850:c.5586C>A,5586,34,13,0,...,0.144314,0.296562,0.0,0.0,0.005667,0.0,0,0,0,0


In [15]:
from sklearn.feature_selection import VarianceThreshold, mutual_info_regression
import numpy as np
import pandas as pd

# Load your dataframe and feature list
X = df_unq_train[my_features]  # Feature matrix
y = df_unq_train["NMD_efficiency"]  # Target variable

### 🔹 STEP 1: Remove Near-Constant Features (Low Variance)
var_thresh = VarianceThreshold(threshold=0.01)
X_var_filtered = X.loc[:, var_thresh.fit(X).get_support()]

removed_low_var = set(my_features) - set(X_var_filtered.columns)
print(f"Removed low variance features: {removed_low_var}")

# Update feature list
my_features = list(X_var_filtered.columns)

### 🔹 STEP 2: Remove Highly Correlated Features
corr_matrix = X_var_filtered.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = []
columns = list(upper.columns)
for column in reversed(columns):  # Start from the last column
    if any(upper[column] > 0.8):
        to_drop.append(column)

X_corr_filtered = X_var_filtered.drop(columns=to_drop)
print(f"Removed highly correlated features: {to_drop}")

# Update feature list
my_features = list(X_corr_filtered.columns)


Removed low variance features: {'embed475', 'embed511', 'embed257', 'embed347', 'embed416', 'prop_lost_ACT_SITE', 'embed170', 'embed22', 'embed73', 'embed4', 'embed63', 'embed202', 'embed34', 'embed327', 'embed413', 'embed328', 'embed298', 'embed427', 'embed31', 'embed91', 'embed251', 'embed260', 'embed408', 'embed220', 'embed502', 'embed352', 'embed16', 'embed43', 'embed375', 'embed284', 'embed267', 'embed67', 'embed190', 'embed218', 'embed272', 'embed331', 'embed399', 'embed291', 'embed17', 'embed85', 'embed289', 'embed120', 'embed12', 'embed95', 'prop_lost_SIGNAL', 'embed262', 'embed402', 'embed215', 'embed369', 'embed36', 'embed216', 'single_exon', 'embed469', 'embed94', 'embed46', 'embed157', 'embed189', 'embed268', 'embed26', 'embed386', 'embed377', 'embed431', 'embed136', 'embed109', 'embed69', 'embed115', 'embed360', 'embed258', 'embed64', 'embed113', 'embed197', 'embed400', 'embed480', 'embed495', 'embed217', 'embed178', 'embed466', 'embed103', 'embed395', 'embed13', 'embed474

In [16]:
X_corr_filtered.columns


Index(['last_exon', 'long_exon', 'penultimate_flag', 'close_to_start', 'hl',
       'mrl', 'cDNA_position', 'PERCENTILE', 'GERP_DIST', 'BP_DIST',
       'utr5_len', 'current_exon_len', 'VEST4_score', 'CADD_phred',
       'phyloP100way_vertebrate', 'dn_ds', 'abundance', 'shet',
       'lof.oe_ci.upper', 'lof.pRec', 'UTR5_GC', 'connectedness',
       'prop_lost_DOMAIN', 'prop_lost_HELIX', 'prop_lost_REGION',
       'prop_lost_TOPO_DOM', 'LoF_HC', 'NearestExonJB_dist', 'TF', 'tau',
       'phyloP17way_primate', 'phyloP470way_mammalian',
       'phastCons100way_vertebrate', 'phastCons17way_primate',
       'phastCons470way_mammalian', 'mis.z_score', 'syn.z_score', 'lof.pNull',
       'exp_var', 'utr3_len', 'total_exons_len', 'fathmm-XF_coding_score',
       'Nucleus', 'Cytosol', 'Cytoplasm', 'Ribosome', 'Membrane',
       'Endoplasmic_reticulum', 'embed6', 'embed8', 'embed90', 'embed145',
       'embed205', 'embed219', 'embed230', 'embed240', 'embed254', 'embed309',
       'embed356', 'emb