In [1]:
import os
import gzip
import shutil
import requests

import numpy as np
import pandas as pd

import tensorflow as tf
from keras import Sequential, metrics
from keras.layers import Input, Dense, BatchNormalization, LSTM, Embedding, Bidirectional, Normalization, Conv1D, Dropout, MaxPool2D,MaxPooling1D, Flatten
from keras.models import Model

from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

from sklearn.metrics import roc_curve,roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA


In [2]:
# Notebook-wide training configuration.

# Verbosity flag forwarded to Keras (model.fit / model.evaluate).
VERBOSE=False
# Training epoch budget (same value as the epochs setting used by
# DiseaseClassifier.train_model).
EPOCHS=25
# Fraction of samples that train_test_split assigns to the training split.
TRAIN_SIZE=.75

# Metrics handed to model.compile; the `name` of each entry is the key under
# which Keras reports it (e.g. 'accuracy' / 'val_accuracy' in the history).
METRICS = [
      metrics.TruePositives(name='tp'),
      metrics.FalsePositives(name='fp'),
      metrics.TrueNegatives(name='tn'),
      metrics.FalseNegatives(name='fn'), 
      metrics.BinaryAccuracy(name='accuracy'),
      metrics.Precision(name='precision'),
      metrics.Recall(name='recall'),
      metrics.AUC(name='auc'),
      metrics.AUC(name='prc', curve='PR') # precision-recall curve
]

2022-02-28 13:41:02.658375: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Get the data

In [3]:

import gzip
import shutil

def download_resource(resource, timeout=(10, 60)):
    """Download one CTD report archive and unzip it next to the notebook.

    The archive is fetched from ``http://ctdbase.org/reports/<resource>.csv.gz``
    into ``zipped_data/`` and decompressed into ``unzipped_data/<resource>.csv``.
    If the unzipped CSV is already present nothing is downloaded.

    Args:
        resource: Report name without extension, e.g. ``'CTD_diseases'``.
        timeout: ``(connect, read)`` timeout in seconds passed to
            ``requests.get`` so a stalled server cannot hang the notebook.

    Returns:
        The path of the downloaded ``.csv.gz`` file, or ``None`` when the
        unzipped CSV already existed and the download was skipped.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url_dl_pattern = 'http://ctdbase.org/reports/{resource}.csv.gz'
    url = url_dl_pattern.format(resource=resource)

    print('downloading: {0}'.format(resource))
    archive_name = url.split('/')[-1]
    local_filename = 'zipped_data/' + archive_name
    unzipped_filename = 'unzipped_data/' + archive_name.replace('.gz', '')

    if os.path.isfile(unzipped_filename):
        print('data already exists')
        return

    # A fresh checkout has neither directory; create them instead of failing
    # with FileNotFoundError on the first open().
    for target in (local_filename, unzipped_filename):
        os.makedirs(os.path.dirname(target), exist_ok=True)

    # stream=True keeps the (large) archive out of memory; it is written to
    # disk chunk by chunk.
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

    # Decompress into a temporary name and rename only on success. Otherwise
    # an interrupted run would leave a truncated CSV that the isfile() check
    # above treats as a complete download on every later run.
    partial_filename = unzipped_filename + '.part'
    try:
        with gzip.open(local_filename, 'rb') as f_in:
            with open(partial_filename, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        os.replace(partial_filename, unzipped_filename)
    finally:
        if os.path.exists(partial_filename):
            os.remove(partial_filename)

    return local_filename


# Report names to fetch from ctdbase.org. Only the uncommented entries are
# downloaded; the commented-out names are presumably other CTD reports that
# this notebook does not currently use.
resources = [
#     'CTD_chem_gene_ixn_types',
#     'CTD_chem_pathways_enriched',
#     'CTD_genes_diseases',
#     'CTD_genes_pathways',
#     'CTD_diseases_pathways',
#     'CTD_pheno_term_ixns',
#     'CTD_exposure_studies',
#     'CTD_chemicals',
#     'CTD_genes',
    'CTD_chem_gene_ixns',
    'CTD_chemicals_diseases',
    'CTD_diseases'
]

# Runs when the cell executes: fetch every listed report.
# download_resource returns early for reports whose unzipped CSV already exists.
for res in resources:
    download_resource(res)


def get_df(resource, header_line=27, skiprows=29):
    """Load an unzipped CTD report into a DataFrame.

    The report starts with a commented preamble. One of its lines
    (``# col_a,col_b,...``) carries the column names; the data rows start a
    couple of lines later.

    Args:
        resource: Report name without extension; the file read is
            ``unzipped_data/<resource>.csv``.
        header_line: Zero-based index of the preamble line holding the
            comma-separated column names.
        skiprows: Number of leading lines to skip before the first data row.

    Returns:
        A DataFrame of the data rows, with columns named from the header line.

    Raises:
        ValueError: If the file has fewer than ``header_line + 1`` lines.
    """
    the_file = 'unzipped_data/{resource}.csv'.format(resource=resource)
    header = None
    with open(the_file, 'r') as reader:
        for i, row in enumerate(reader):
            if i == header_line:
                # Strip the line terminator first; otherwise the last column
                # would be named e.g. 'PubMedIDs\n'.
                header = row.rstrip('\r\n').replace('# ', '').split(',')
                # The header is all we need; don't scan the rest of the file.
                break

    if header is None:
        raise ValueError(
            '{0} has no line {1}; cannot read the column names'.format(
                the_file, header_line))

    df = pd.read_csv(the_file, skiprows=skiprows, names=header)
    return df

downloading: CTD_chem_gene_ixns
data already exists
downloading: CTD_chemicals_diseases
data already exists
downloading: CTD_diseases
data already exists


In [7]:
class DiseaseClassifier:
    """Train a small dense network that flags chemical-disease pairs whose
    disease lies under a chosen parent disease.

    Features are one-hot indicators of the most frequent inference genes in
    ``input_df``; the target is 1 when the pair's ``DiseaseID`` is the parent
    disease or one of its descendants in the ``CTD_diseases`` hierarchy.

    Args:
        input_df: Chemical-disease table with (at least) the columns
            ``ChemicalName``, ``DiseaseName``, ``DiseaseID``,
            ``DirectEvidence``, ``InferenceGeneSymbol`` and ``InferenceScore``.
        parent_disease: Disease ID at the top of the target subtree.
        gene_count: Number of most frequent genes to keep as features.
        show_plots: Whether ``plot_results`` draws the diagnostic figure.
        use_class_weights: Whether to pass balanced class weights to ``fit``.
        oversample: Stored on the instance; no method reads it yet.
    """

    def __init__(self, input_df, parent_disease, gene_count, show_plots, use_class_weights, oversample):
        self.input_df = input_df
        self.parent_disease = parent_disease
        self.target_diseases = self.get_diseases()
        self.gene_count = gene_count
        self.show_plots = show_plots
        self.stop_early = True
        self.use_class_weights = use_class_weights
        self.oversample = oversample
        self.top_n_genes = self.get_genes()

    def get_diseases(self, max_depth=3):
        """Collect the parent disease and its descendants.

        Args:
            max_depth: How many hierarchy levels below the parent to include.
                The default of 3 matches the previous hard-coded behaviour;
                ``None`` walks the whole subtree.

        Returns:
            List of unique disease IDs: descendants level by level, followed
            by the parent disease itself.
        """
        disease_df = get_df('CTD_diseases')

        # One row per (disease, parent) pair: ParentIDs is '|'-separated.
        hierarchy_df = disease_df.assign(
            ParentIDs=disease_df['ParentIDs'].str.split('|')
        ).explode('ParentIDs')

        top_of_tree = self.parent_disease
        seen = {top_of_tree}
        descendants = []
        frontier = [top_of_tree]
        depth = 0
        while frontier and (max_depth is None or depth < max_depth):
            # explode() leaves duplicate index labels, so use a plain boolean
            # array rather than an index-aligned mask.
            is_child = hierarchy_df['ParentIDs'].isin(frontier).to_numpy()
            children = hierarchy_df.loc[is_child, 'DiseaseID'].unique()
            # Skipping already-visited IDs keeps the result free of duplicates
            # and guarantees termination for an unlimited walk.
            frontier = [disease_id for disease_id in children if disease_id not in seen]
            seen.update(frontier)
            descendants.extend(frontier)
            depth += 1

        return descendants + [top_of_tree]

    def get_genes(self):
        """Return the ``gene_count`` most frequent ``InferenceGeneSymbol`` values."""
        gene_df = self.input_df.groupby(['InferenceGeneSymbol']).size().reset_index(name='cnt')
        top_n_genes_df = gene_df.sort_values('cnt', ascending=False).iloc[:self.gene_count]
        return top_n_genes_df['InferenceGeneSymbol'].unique()

    def prep_training_data(self):
        """Build one row per (disease, chemical) pair with gene indicator columns.

        Rows without ``DirectEvidence`` supply the gene inferences; rows with
        ``DirectEvidence`` supply the evidence type. Only pairs present in
        both sets survive the inner merge.

        Returns:
            DataFrame with the pair keys, ``InferenceScore`` (max per pair),
            ``DirectEvidence``, one indicator column per top gene seen, and
            ``label`` (the score, negated for ``'marker/mechanism'`` pairs).
        """
        pair_keys = ['ChemicalName', 'DiseaseName', 'DiseaseID']
        has_evidence = self.input_df['DirectEvidence'].notnull()

        gene_df = self.input_df.loc[
            ~has_evidence,
            ['ChemicalName', 'DiseaseName', 'InferenceGeneSymbol', 'InferenceScore', 'DiseaseID']]
        gene_df = gene_df.loc[gene_df['InferenceGeneSymbol'].isin(self.top_n_genes)]

        evidence_df = self.input_df.loc[
            has_evidence, ['ChemicalName', 'DiseaseName', 'DirectEvidence', 'DiseaseID']]
        merged_df = gene_df.merge(evidence_df, on=pair_keys)

        dummy_df = pd.get_dummies(merged_df, prefix='', prefix_sep='', columns=['InferenceGeneSymbol'])
        # Per-pair maximum: a gene indicator ends up set if any row of the
        # pair mentions the gene. Plain .max() keeps flat column names.
        gb_df = dummy_df.groupby(['DiseaseName', 'ChemicalName', 'DiseaseID']).max().reset_index()

        gb_df['label'] = np.where(gb_df['DirectEvidence'] == 'marker/mechanism',
                                  gb_df['InferenceScore'] * -1,
                                  gb_df['InferenceScore'])

        return gb_df

    def plot_results(self, history, predicted_values, y_test, accuracy):
        """Compute the test ROC AUC and, if enabled, draw a 2x2 diagnostic figure.

        Panels: accuracy curves, loss curves, ROC curve, confusion matrix at a
        0.5 threshold.

        Args:
            history: Keras ``History`` returned by ``model.fit``.
            predicted_values: Predicted probabilities for the test set.
            y_test: True binary labels for the test set.
            accuracy: Test accuracy to annotate on the accuracy panel.

        Returns:
            The ROC AUC on the test set.
        """
        # model.predict yields a column vector; flatten it for sklearn.
        scores = np.ravel(predicted_values)
        auc_score = roc_auc_score(y_test, scores)

        if self.show_plots:
            fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

            # accuracy
            axes[0][0].plot(history.history['accuracy'], label='accuracy')
            axes[0][0].plot(history.history['val_accuracy'], label='val_accuracy')
            axes[0][0].text(2, history.history['accuracy'][0] + .005, 'accuracy: {:.4f}'.format(accuracy))
            axes[0][0].set_title('Accuracy')
            axes[0][0].legend()

            # loss
            axes[0][1].plot(history.history['loss'], label='loss')
            axes[0][1].plot(history.history['val_loss'], label='val_loss')
            axes[0][1].set_title('Loss')
            axes[0][1].legend()

            # ROC curve
            fpr, tpr, thresholds = roc_curve(y_test, scores)
            axes[1][0].plot(fpr, tpr)
            axes[1][0].text(0.7, 0.9, 'auc: {:.4f}'.format(auc_score))
            axes[1][0].axis([-.05, 1.1, 0, 1.05])
            axes[1][0].set_title('ROC curve')

            # confusion matrix
            cm = confusion_matrix(y_test, np.where(scores > 0.5, 1, 0))
            labels = ["Non Target", "Target"]
            disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
            disp.plot(cmap=plt.cm.Blues, ax=axes[1][1])
            axes[1][1].set_title('Confusion matrix')

            # Lay out only after every panel has been drawn.
            fig.tight_layout()

        return auc_score

    def get_class_weights(self, labels):
        """Compute balanced class weights for binary labels.

        Each class gets ``total / (2 * class_count)``, so the weights of all
        examples sum to the number of examples and the loss keeps a similar
        magnitude.

        Args:
            labels: Array-like of non-negative integer labels (0 or 1).

        Returns:
            ``{0: weight_for_0, 1: weight_for_1}``.

        Raises:
            ValueError: If the labels are not binary or one class is absent
                (its weight would be undefined).
        """
        # minlength=2 keeps the two-way unpacking valid when no 1 is present.
        counts = np.bincount(np.asarray(labels), minlength=2)
        if len(counts) > 2:
            raise ValueError('get_class_weights only supports binary labels (0/1)')
        neg, pos = counts
        if neg == 0 or pos == 0:
            raise ValueError(
                'both classes must be present to compute class weights '
                '(negatives: {0}, positives: {1})'.format(neg, pos))
        total = neg + pos

        weight_for_0 = float((1 / neg) * (total / 2.0))
        weight_for_1 = float((1 / pos) * (total / 2.0))

        return {0: weight_for_0, 1: weight_for_1}

    def train_model(self, train_df):
        """Fit the network on ``train_df`` and evaluate it on a held-out split.

        Args:
            train_df: Output of ``prep_training_data`` with an added
                ``binary_label`` column.

        Returns:
            Dict with keys ``'history'`` (Keras History), ``'model'``,
            ``'auc'`` (test ROC AUC) and ``'metrics_info'`` (test loss and
            metrics keyed by name).
        """
        gene_columns = train_df.columns.intersection(self.top_n_genes)
        # Seeded like the split below so the same rows land in train/test on
        # every run (weight initialisation is still unseeded).
        shuffled_df = train_df.sample(frac=1, random_state=0)
        # get_dummies may yield bool/uint8 indicators; give Keras floats.
        features = shuffled_df[gene_columns].astype('float32')
        labels = shuffled_df['binary_label']
        n_features = features.shape[1]

        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, random_state=0, train_size=TRAIN_SIZE)

        model = Sequential()
        model.add(Input(shape=(n_features,)))
        model.add(Dense(60, activation='relu'))
        model.add(Dense(6, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)

        callbacks = []
        if self.stop_early:
            callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2))

        model_fit_kw = {
            'x': X_train,
            'y': y_train,
            'epochs': EPOCHS,
            'validation_split': 0.2,
            'callbacks': callbacks,
            'verbose': VERBOSE
        }
        if self.use_class_weights:
            # Weights come from the training labels only, so the test split
            # does not influence training.
            model_fit_kw['class_weight'] = self.get_class_weights(y_train)

        ## fit the model
        history = model.fit(**model_fit_kw)

        ## evaluate on the test set
        predicted_values = model.predict(X_test, verbose=VERBOSE)
        # return_dict=True keys every value by name ('loss', 'tp', ...,
        # 'accuracy', ...), so nothing depends on the order of METRICS.
        metrics_info = model.evaluate(X_test, y_test, verbose=VERBOSE, return_dict=True)
        accuracy = metrics_info.get('accuracy', float('nan'))

        auc = self.plot_results(history, predicted_values, y_test, accuracy)

        return {
            'history': history,
            'model': model,
            'auc': auc,
            'metrics_info': metrics_info
        }

    def main(self):
        """Prepare the data, label target diseases, train and evaluate.

        Returns:
            The result dict of ``train_model``; it is also kept on
            ``self.results``.
        """
        train_df = self.prep_training_data()
        # Reuse the list computed in __init__ instead of re-reading the CSV.
        train_df['binary_label'] = np.where(train_df['DiseaseID'].isin(self.target_diseases), 1, 0)

        self.results = self.train_model(train_df)
        return self.results
        

        
# Load the chemical-disease report and run the classifier on one disease subtree.
input_df = get_df('CTD_chemicals_diseases')

# Constructor arguments for DiseaseClassifier.
kw = dict(
    input_df=input_df,
    parent_disease='MESH:D019636',  # neurodegenerative diseases
    gene_count=1000,
    show_plots=True,
    use_class_weights=True,
    oversample=False,
)

dc = DiseaseClassifier(**kw)
dc.main()