In [2]:
"""
The aim of this script is to create the proof-of-concept example data for the MYH6 and CLN3 genes (Fig. 2A-B).
The figure itself is created by the One_Gene_POC_Figs.ipynb script.
"""

In [None]:
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.metrics import classification_report
import pickle
import ast

In [5]:
# Input locations, relative to the directory this notebook is run from.
data_dir = os.path.join('..', '..', 'Data')
best_params_dir = os.path.join('..', '..', 'Results', 'Best_Parameters')

# Variant table: features, gene annotation and the per-tissue
# '<tissue>_disease_causing' label columns used further down.
path = os.path.join(data_dir, 'Full_Slim_Dataset_hg37-v1.6.csv')
Variants_data = pd.read_csv(path, engine='python')
print(Variants_data)

# Stored hyper-parameters; later cells filter this table on its
# 'Dataset', 'Tissue' and 'ML_Model' columns.
path = os.path.join(best_params_dir, 'Best_Parameters_New_17.csv')
Best_param = pd.read_csv(path, engine='python')
print(Best_param)

# Feature whitelist: 'Feature' holds the raw column name and
# 'Feature Name' the display name it is renamed to after preprocessing.
path = os.path.join(data_dir, 'Relevant_Columns_Names_Edited_2.csv')
Relevant_Cols_df = pd.read_csv(path)
overlap_cols = Relevant_Cols_df['Feature'].tolist()
print(overlap_cols)
overlap_cols_names = Relevant_Cols_df['Feature Name'].tolist()
rename_dict = dict(zip(overlap_cols, overlap_cols_names))


       VariationID   OMIMs                                 Manifested_Tissues  \
0           535972  613254  brain-0,kidney,Skin - Sun Exposed (Lower leg),...   
1           535875  613254  brain-0,kidney,Skin - Sun Exposed (Lower leg),...   
2           535979  613254  brain-0,kidney,Skin - Sun Exposed (Lower leg),...   
3           567376  613254  brain-0,kidney,Skin - Sun Exposed (Lower leg),...   
4           565912  613254  brain-0,kidney,Skin - Sun Exposed (Lower leg),...   
...            ...     ...                                                ...   
67963       873299     NaN                                                NaN   
67964       873211     NaN                                                NaN   
67965       873240     NaN                                                NaN   
67966       873216     NaN                                                NaN   
67967       915562     NaN                                                NaN   

      #Chr        Pos      

In [6]:

# Label columns: every column whose name contains 'disease_causing'.
y_columns = [name for name in Variants_data.columns if 'disease_causing' in name]
cols = Variants_data.columns.tolist()

# Columns that are never used as model features.
# Open questions carried over from the original author: oAA / nAA could be
# replaced by an amino-acid substitution-matrix score, and the role of
# bStatistic is unclear.
non_relevant_columns = [
    'VariationID', 'OMIMs', 'Manifested_Tissues', '#Chr', 'Pos',
    'ConsDetail', 'motifEName', 'FeatureID', 'GeneID_y', 'GeneName',
    'CCDS', 'Intron', 'Exon', 'SIFTcat', 'PolyPhenCat', 'bStatistic',
    'targetScan', 'dbscSNV-rf_score', 'oAA', 'Ref', 'nAA', 'Alt',
]
non_relevant_columns.extend(y_columns)  # labels must not leak into the features
print(non_relevant_columns)

# Keep a column only if it is whitelisted in overlap_cols and not excluded
# above; the order follows the column order of Variants_data.
relevant_columns = [
    x for x in cols
    if x in overlap_cols and x not in non_relevant_columns
]
print(relevant_columns)
Relevant_data = Variants_data.loc[:, relevant_columns]
print(Relevant_data)

def preprocessing_new(Relevant_Data):
    """Return a one-hot encoded, NaN-free copy of the feature table.

    Steps, in order:
      1. One-hot encode the categorical columns ('Type', 'AnnoType',
         'Consequence', 'Domain', 'Dst2SplType') with ``pd.get_dummies`` and
         replace them by the resulting indicator columns (appended at the end).
      2. Fill missing values in a fixed set of columns with a
         column-specific default.
      3. Fill every remaining missing value with 0.

    Parameters
    ----------
    Relevant_Data : pandas.DataFrame
        Must contain the five categorical columns and every column that has
        a specific default below; a missing one raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.
    """
    categorical_cols = ['Type', 'AnnoType', 'Consequence', 'Domain', 'Dst2SplType']

    # Per-column replacement values for missing entries.
    column_defaults = {
        'SIFTval': 1,
        'GC': 0.42,
        'CpG': 0.02,
        'priPhCons': 0.115,
        'mamPhCons': 0.079,
        'verPhCons': 0.094,
        'priPhyloP': -0.033,
        'mamPhyloP': -0.038,
        'verPhyloP': 0.017,
        'GerpN': 1.91,
        'GerpS': -0.2,
    }

    # Step 1: swap the categorical columns for their indicator columns.
    indicators = pd.get_dummies(Relevant_Data[categorical_cols])
    encoded = Relevant_Data.drop(columns=categorical_cols).join(indicators)

    # Step 2: column-specific imputation.
    for col_name, default in column_defaults.items():
        encoded[col_name] = encoded[col_name].fillna(default)

    # Step 3: everything still missing becomes 0.
    return encoded.fillna(0)

# Encode / impute the selected features, then switch the raw column names
# to their display names ('Feature' -> 'Feature Name' via rename_dict).
Relevant_data = preprocessing_new(Relevant_data).rename(columns=rename_dict)

['VariationID', 'OMIMs', 'Manifested_Tissues', '#Chr', 'Pos', 'ConsDetail', 'motifEName', 'FeatureID', 'GeneID_y', 'GeneName', 'CCDS', 'Intron', 'Exon', 'SIFTcat', 'PolyPhenCat', 'bStatistic', 'targetScan', 'dbscSNV-rf_score', 'oAA', 'Ref', 'nAA', 'Alt', 'Lung_disease_causing', 'Muscle - Skeletal_disease_causing', 'Skin - Sun Exposed_disease_causing', 'Adipose - Subcutaneous_disease_causing', 'Artery - Aorta_disease_causing', 'Heart - Left Ventricle_disease_causing', 'Artery - Coronary_disease_causing', 'brain-0_disease_causing', 'Liver_disease_causing', 'Nerve - Tibial_disease_causing', 'Colon - Sigmoid_disease_causing', 'kidney_disease_causing', 'Heart - Atrial Appendage_disease_causing', 'Breast - Mammary Tissue_disease_causing', 'Uterus_disease_causing', 'Adipose - Visceral_disease_causing', 'Esophagus - Gastroesophageal Junction_disease_causing', 'Esophagus - Mucosa_disease_causing', 'brain-1_disease_causing', 'Skin - Not Sun Exposed_disease_causing', 'Artery - Tibial_disease_caus

In [8]:
# Leave-one-gene-out example: for each tissue, train a random forest on the
# variants of every other gene and predict the variants of `gene_name`.
#
# This cell used to begin with a column rename on `RF_Best_Parameters`. No cell
# of this notebook defines that frame (NameError on Restart & Run All) and
# nothing below reads it -- the hyper-parameters come from `Best_param` -- so
# the call is not needed here.

# Fallback seed so that re-running the cell reproduces the exported
# probabilities; a 'random_state' stored in Best_param takes precedence.
SEED = 42

# NOTE(review): the original comment called MYH6 a "disease gene only in skin",
# but the saved skin output for this gene contains no disease-causing variants;
# presumably the heart tissue is the positive one -- confirm.
gene_name = 'MYH6'
# For the second example set gene_name = 'CLN3' (described by the original
# author as a disease gene only in brain); the tissue list stays the same.
# 'Muscle - Skeletal' was deliberately left out of the list by the author.
tissues = ['Skin - Not Sun Exposed', 'Heart - Left Ventricle', 'brain',  'Whole Blood', 'kidney', 'Testis', 'Lung']

relevant_y_cols = [t + '_disease_causing' for t in tissues]
print(relevant_y_cols)

# Row mask of the held-out gene. Relevant_data keeps the row index of
# Variants_data, so the same mask splits both features and labels.
is_held_out = Variants_data['GeneName'] == gene_name
if not is_held_out.any():
    raise ValueError(f"No variants found for gene {gene_name!r}; nothing to predict.")

X_train = Relevant_data[~is_held_out]
X_test = Relevant_data[is_held_out]

pathogenicity_list = []
for tissue, y in zip(tissues, relevant_y_cols):
    print('-------------------', tissue, '--------------------------')

    # Stored hyper-parameters for this tissue (string repr of a dict).
    param_rows = Best_param.loc[
        (Best_param['Dataset'] == 'Full Trace')
        & (Best_param['Tissue'] == tissue.strip())
        & (Best_param['ML_Model'] == 'Random Forest'),
        'Best_Parameters',
    ]
    if param_rows.empty:
        raise LookupError(
            f"No 'Full Trace' / 'Random Forest' entry for tissue {tissue!r} in Best_param."
        )
    # literal_eval only parses Python literals, so the CSV cannot inject code.
    best_parameters = ast.literal_eval(param_rows.iloc[0])
    best_parameters.setdefault('random_state', SEED)
    model = RandomForestClassifier(**best_parameters)

    y_train = Variants_data.loc[~is_held_out, y]
    y_test = Variants_data.loc[is_held_out, y]
    model.fit(X_train, y_train)

    # predict_proba columns follow model.classes_; column 1 is the second
    # class, i.e. True for boolean labels.
    predictions_proba = model.predict_proba(X_test)
    pred_true = predictions_proba[:, 1]
    print(pred_true)

    y_pred = model.predict(X_test)
    clr = classification_report(y_test, y_pred, output_dict=True)

    Pred_true_df = pd.DataFrame({tissue: pred_true})
    print(Pred_true_df)
    pathogenicity_list.append(Pred_true_df)
    print(clr)

# One probability column per tissue, re-indexed with the original row labels
# of the held-out variants so it can be aligned with Variants_data below.
Pathogenicity_df = pd.concat(pathogenicity_list, axis=1)
Pathogenicity_df['NewIndex'] = X_test.index
Pathogenicity_df = Pathogenicity_df.set_index('NewIndex')
print(Pathogenicity_df)

# Attach variant annotation and the true labels; the inner join keeps only
# the rows of the held-out gene.
interesting_cols = ['VariationID', '#Chr', 'Pos', 'Ref', 'Alt', 'Type', 'Length', 'AnnoType', 'Consequence', 'GeneName', 'cDNApos', 'protPos', 'PHRED']
interesting_cols.extend(relevant_y_cols)
Pathogenicity_df = pd.concat([Variants_data[interesting_cols], Pathogenicity_df], axis=1, join='inner')
print(Pathogenicity_df)

path = os.path.join('..', '..', 'Results', 'One_Gene_Examples', gene_name + '_Predictions_Example.csv')
os.makedirs(os.path.dirname(path), exist_ok=True)  # to_csv does not create missing folders
Pathogenicity_df.to_csv(path)

['Skin - Not Sun Exposed_disease_causing', 'Heart - Left Ventricle_disease_causing', 'brain_disease_causing', 'Whole Blood_disease_causing', 'kidney_disease_causing', 'Testis_disease_causing', 'Lung_disease_causing']
------------------- Skin - Not Sun Exposed --------------------------
[0.08791667 0.08902778 0.10527778 0.01041667 0.01041667 0.01125
 0.01041667 0.01041667 0.01041667 0.01125    0.00986111 0.01263889
 0.01402778 0.01458333 0.02138889]
    Skin - Not Sun Exposed
0                 0.087917
1                 0.089028
2                 0.105278
3                 0.010417
4                 0.010417
5                 0.011250
6                 0.010417
7                 0.010417
8                 0.010417
9                 0.011250
10                0.009861
11                0.012639
12                0.014028
13                0.014583
14                0.021389
{'False': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 15}, 'accuracy': 1.0, 'macro avg': {'precis