In [3]:
import numpy as np
import pandas as pd
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt

import datetime
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

import tensorflow as tf


from keras import Sequential
from keras.layers import Input, Dense, BatchNormalization, LSTM, Embedding, Bidirectional, Normalization
from keras.models import Model


In [4]:
import gzip
import shutil

def download_resource(resource):
    url_dl_pattern = 'http://ctdbase.org/reports/{resource}.csv.gz'
    url = url_dl_pattern.format(resource=resource)
    
    print('downloading: {0}'.format(resource))
    local_filename = 'zipped_data/' + url.split('/')[-1]
    unzipped_filename = 'unzipped_data/' + url.split('/')[-1].replace('.gz', '')
    
    if os.path.isfile(unzipped_filename):
        print('data already exists')
        return 

    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                # If you have chunk encoded response uncomment if
                # and set chunk_size parameter to None.
                #if chunk: 
                f.write(chunk)

    with gzip.open(local_filename, 'rb') as f_in:
        with open(unzipped_filename, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    
    return local_filename

In [5]:
resources = ['CTD_chem_gene_ixns',
#     'CTD_chem_gene_ixn_types',
    'CTD_chemicals_diseases',
    'CTD_chem_pathways_enriched',
    'CTD_genes_diseases',
    'CTD_genes_pathways',
    'CTD_diseases_pathways',
    'CTD_pheno_term_ixns',
    'CTD_exposure_studies',
    'CTD_chemicals',
    'CTD_diseases',
    'CTD_genes'
]

for res in resources:
    download_resource(res)


downloading: CTD_chem_gene_ixns
data already exists
downloading: CTD_chemicals_diseases
data already exists
downloading: CTD_chem_pathways_enriched
data already exists
downloading: CTD_genes_diseases
data already exists
downloading: CTD_genes_pathways
data already exists
downloading: CTD_diseases_pathways
data already exists
downloading: CTD_pheno_term_ixns
data already exists
downloading: CTD_exposure_studies
data already exists
downloading: CTD_chemicals
data already exists
downloading: CTD_diseases
data already exists
downloading: CTD_genes
data already exists


In [6]:
def get_df(resource):

    line_number = 27
    the_file = 'unzipped_data/{resource}.csv'.format(resource=resource)
    with open(the_file, 'r') as reader:
        for i, row in enumerate(reader):
            if i == line_number:
                header = row.replace('# ', '').split(',')

    # print(header)
    df = pd.read_csv(the_file, skiprows=29, names=header)
    return df


In [7]:
disease_df = get_df('CTD_diseases')
disease_df[:4]
disease_df['ParentIDs'].str.split('|').explode()

hierarchy_df = disease_df.assign(ParentIDs=disease_df['ParentIDs'].str.split('|')).explode('ParentIDs')

top_of_tree = 'MESH:D019636' # neurodegenerative diseases
level_one = hierarchy_df.loc[hierarchy_df['ParentIDs'] == top_of_tree]
level_two = hierarchy_df.loc[hierarchy_df['ParentIDs'].isin(level_one['DiseaseID'])]
level_three = hierarchy_df.loc[hierarchy_df['ParentIDs'].isin(level_two['DiseaseID'])]

all_diseases = list(level_one['DiseaseID'].unique()) \
    + list(level_two['DiseaseID'].unique() ) \
    + list(level_three['DiseaseID'].unique() ) \

# all_diseases



In [8]:
# get the network data for visualization
df = get_df('CTD_chemicals_diseases')

park_disease_df = df.loc[df['DiseaseName'] == 'Parkinson Disease']
park_disease_df = park_disease_df.loc[park_disease_df['ChemicalName'] == 'Dopamine']
park_disease_df = park_disease_df.loc[park_disease_df['InferenceGeneSymbol'].notnull()]

park_gene_chem_network = park_disease_df[['InferenceGeneSymbol', 'ChemicalName']]


disease_df = park_gene_chem_network.copy()
disease_df['DiseaseName'] = 'Parkinson Disease'
disease_df = disease_df[['InferenceGeneSymbol', 'DiseaseName']]
disease_df.columns = ['FROM', 'TO']


park_gene_chem_network.columns = ['FROM', 'TO']
network_df = pd.concat([park_gene_chem_network, disease_df])

network_df.to_csv('parkinsons_network.csv', sep = '|', index=False)


In [11]:
def get_disease_inference_df(disease_name=None):
    """
    this will join the direct evidence to the gene network.
    """
    df = get_df('CTD_chemicals_diseases')
    # print(df[:5])
#     if disease_name:
#         df = df.loc[df['DiseaseName'] == 'Parkinson Disease']
        
    gene_df = df.loc[df['DirectEvidence'].isnull()][['ChemicalName', 'DiseaseName', 'InferenceGeneSymbol', 'InferenceScore', 'DiseaseID']]

    evidence_df = df.loc[df['DirectEvidence'].notnull()][['ChemicalName', 'DiseaseName', 'DirectEvidence', 'DiseaseID']]
    merged_df = gene_df.merge(evidence_df, on=['ChemicalName', 'DiseaseName', 'DiseaseID'])

    dummy_df = pd.get_dummies(merged_df, prefix='', prefix_sep='',columns=['InferenceGeneSymbol'])
    gb_df = dummy_df.groupby(['DiseaseName', 'ChemicalName', 'DiseaseID']).agg({np.max}).reset_index()

    gb_df.columns = gb_df.columns.droplevel(1)


    # # # dummy_df
    gb_df['label'] = np.where(gb_df['DirectEvidence'] == 'marker/mechanism',
                                               gb_df['InferenceScore'] * -1,
                                               gb_df['InferenceScore'])

    return gb_df


# park_train_df = get_disease_inference_df('Parkinsons Disease')
# whole_network_df = get_disease_inference_df()

# len(whole_network_df.columns)
# print(whole_network_df['label'])
# x = df.loc[df['DiseaseName'].str.contains('Parkinsons')]

train_df = get_disease_inference_df()

## USE THIS FOR PREDICTING THE INF SCORE
# train_df['binary_label'] = np.where(train_df['DiseaseID'].isin(all_diseases),train_df['label'], 0)

train_df['binary_label'] = np.where(train_df['DiseaseID'].isin(all_diseases),1, 0)

train_df.to_csv('disease_inf.csv')



In [24]:


# disease_inf = pd.read_csv('disease_inf.csv')
disease_inf[:5]

# import numpy as np

# non_vector_columns = ['DiseaseID','ChemicalName','InferenceScore','DirectEvidence','DiseaseName','label']
# vector_columns = [col for col in disease_inf if col not in non_vector_columns]

# # self.vector_columns = vector_columns

# X = disease_inf[vector_columns]

# # X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
# from sklearn.decomposition import NMF
# model = NMF(n_components=2, init='random', random_state=0)
# W = model.fit_transform(X)
# H = model.components_



print(len(H[0]))

6840


In [None]:
# def get_gene_network(disease_name=None):
#     """
#     this gets the gene network... doesnt look at direct evidence
#     """
#     df = get_df('CTD_chemicals_diseases')

#     gene_df = df.loc[df['DirectEvidence'].isnull()][['ChemicalName', 'DiseaseName', 'InferenceGeneSymbol', 'InferenceScore', 'DiseaseID']]

#     dummy_df = pd.get_dummies(gene_df, prefix='', prefix_sep='',columns=['InferenceGeneSymbol'])
#     gb_df = dummy_df.groupby(['DiseaseName', 'ChemicalName', 'DiseaseID']).agg({np.max}).reset_index()

#     gb_df.columns = gb_df.columns.droplevel(1)
    
#     # dummy_df
#     gb_df['label'] = gb_df['InferenceScore']

#     return gb_df

# train_df = get_gene_network()
# train_df['label'] = np.where(train_df['DiseaseID'].isin(all_diseases),1, 0)



In [27]:
# print(train_df[:5])

print(len(train_df.loc[train_df['binary_label'] == 0]))
print(len(train_df.loc[train_df['binary_label'] == 1]))

# train_df['label'] = np.where(train_df['DiseaseID'].isin(all_diseases),1, 0)
# print(train_df.groupby('label').size())
# train_df = train_df.sample(frac=1)
# train_df[train_df['DiseaseName'] == 'Parkinson Disease']

52436
472


In [35]:
x = train_df.groupby(['binary_label']).size()
print(x)

binary_label
0    52436
1      472
dtype: int64


In [None]:
# park_df = train_df.loc[train_df['DiseaseName'] == 'Parkinson Disease']
# train_df = park_df


In [None]:
train_df

In [34]:
EPOCHS = 10
class Algo:
    def __init__(self, df, vector_size=50, model_type='DNN', plot=False, lstm_units=10,\
                 stop_early=True, verbose=1, epochs=EPOCHS, dropout_pct=.02):
        self.df = df
        self.vector_size = vector_size
        self.model_type = model_type
#         self.loss_fn = 'mean_squared_error'       
        self.lstm_units = lstm_units
        self.plot = plot
        self.stop_early = stop_early
        self.experiment_name = '{0} - vec: {1}'.format(model_type, vector_size)
        self.dropout_pct = dropout_pct
        self.verbose=verbose
        self.epochs = EPOCHS

    def vectorize(self):
        non_vector_columns = ['DiseaseID','ChemicalName','InferenceScore','DirectEvidence','DiseaseName','label']

        vector_columns = [col for col in self.df.columns if col not in non_vector_columns]
        self.vector_columns = vector_columns

        gene_vectors = self.df[vector_columns]
#         pca = PCA(n_components=10)
#         principalComponents = pca.fit_transform(gene_vectors)       
#         return principalComponents
        return gene_vectors
    
    # def get_model(self, input_len, data_shape):
    def get_model(self, vectors):
        
        node_count = 8
        output_label_len = 1
        self.model = Sequential([
            Dense(input_shape=[len(vectors[0])], units = node_count, activation = tf.nn.relu),
            Dense(name = "output_layer", units = output_label_len, activation = tf.nn.softmax)
        ])
        # keras.utils.plot_model(self.model, "mnist_model.png", show_shapes=True)
        self.model.compile(optimizer='rmsprop',           
               loss = 'binary_crossentropy',
               metrics=['accuracy'])

        
#         normalizer = Normalization(input_shape=[len(vectors[0]),], axis=None)
#         normalizer.adapt(vectors)
# #         [None, ]
#         model = Sequential([
#             normalizer,
#             Dense(units=1)
#         ])
#         model.compile(
#             optimizer=tf.optimizers.Adam(learning_rate=0.1),
#             #loss='binary_crossentropy' #. mean_absolute_error
#             loss='mean_absolute_error'
#         )

        return self.model
    
    def train(self, train_vectors, train_labels):
        
        callbacks = [] 
#         if self.stop_early:
#             callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2))

        input_len = len(train_vectors[0])
        outputlen = 1
    
        # model = self.get_model(input_len, train_vectors.shape)
        model = self.get_model(train_vectors)

        history = model.fit(train_vectors, 
                        train_labels, 
                        batch_size=8, 
                        epochs=self.epochs, 
                        verbose=self.verbose, 
                        validation_split=0.3, 
                        callbacks=callbacks)
    
        return model, history
    
    def evaluate(self, model, test_vectors, test_labels):
        res = model.evaluate(test_vectors, test_labels, verbose=self.verbose)
        return res

    def plot_loss(self, history):
p
        
    def main(self):
        
        start_time = datetime.datetime.now()
        v = self.vectorize()

        print(v)
        label_vecs = self.df['binary_label']
        
        train_vectors, test_vectors, train_labels, test_labels = train_test_split(
              np.array(v), label_vecs, test_size=0.1)
        
        model, history = self.train(train_vectors, train_labels)
        # test_loss, _ = self.evaluate(model, test_vectors, test_labels)
        self.plot_loss(history)
#         predictions = self.predict(model, test_vectors, test_labels)
        return model
#        return predictions, results, model
        
#         train_loss = history.history['loss'][-1]
#         validation_loss = history.history['val_loss'][-1]
        
#         end_time = datetime.datetime.now()
#         task_duration = (end_time-start_time).total_seconds()
#         results = {
#             'experiment_name': self.experiment_name,
#             'task_duration': task_duration,
#             'test_loss': test_loss,
#             'train_loss': train_loss,
#             'validation_loss': validation_loss,
#             'epochs_completed': len(history.history['accuracy']),
#             'epochs': EPOCHS,
#             'stop_early': self.stop_early,
#             'vector_size': self.vector_size,
#             'dropout_pct': self.dropout_pct,
#             'lstm_units': self.lstm_units,
#             'model_type':self.model_type
#         }
#         if self.plot:
#             self.plot_loss(history)
        
#         return predictions, results, model

    def predict(self, model, test_vectors, test_labels):

        preds = model.predict(test_vectors).flatten()

        test_labels['predictions'] = preds
        final = self.df.merge(test_labels, left_index=True, right_index=True)
        return final



a = Algo(train_df, vector_size=50, model_type = 'DNN', plot=True)
v = a.vectorize()
# print(a.vector_columns)
print(v)
# test_predictions, results, trained_model  = a.main()
# trained_model  = a.main()
# print(results)


       A  A1BG  A2M  AAAS  AADAC  AADAT  AASS  ABAT  ABCA1  ABCA12  ...  \
0      0     0    0     0      0      0     0     0      0       0  ...   
1      0     0    0     0      0      0     0     0      0       0  ...   
2      0     0    0     0      0      0     0     0      0       0  ...   
3      0     0    0     0      0      0     0     0      0       0  ...   
4      0     0    0     0      0      0     0     0      0       0  ...   
...   ..   ...  ...   ...    ...    ...   ...   ...    ...     ...  ...   
52903  0     0    0     0      0      0     0     0      0       0  ...   
52904  0     0    0     0      0      0     0     0      0       0  ...   
52905  0     0    0     0      0      0     0     0      0       0  ...   
52906  0     0    0     0      0      0     0     0      0       0  ...   
52907  0     0    0     0      0      0     0     0      0       0  ...   

       ZPBP2  ZSCAN22  ZSCAN31  ZSWIM5  ZSWIM9  ZW10  ZWILCH  ZWINT  ZYX  \
0          0        0  

In [None]:
train_df[:-5]['label']

In [None]:
train_df['preds'] = trained_model.predict(train_df[a.vector_columns])
# train_df.sort_values('preds', ascending=False)[:5]
train_df.sort_values('preds', ascending=True)[:5]

In [None]:
park_disease_df = train_df.loc[train_df['DiseaseName'] == 'Parkinson Disease']
park_disease_df

In [None]:
inp_df = get_df('CTD_chem_gene_ixns')

In [None]:
# inp_df[:5]
# # dopamine_df = inp_df.loc[inp_df['ChemicalName'] == 'Dopamine'][['ChemicalName', 'GeneSymbol']]

# dopamine_df = inp_df[['ChemicalName', 'GeneSymbol']]

# gb_df = dopamine_df.groupby(['ChemicalName', 'GeneSymbol']).size().reset_index()
# gb_df.columns = ['ChemicalName','GeneSymbol', 'InteractionCount']

# # gb_df.columns = gb_df.columns.droplevel(1)
# gb_df[:5]

# dummy_df = pd.get_dummies(gb_df, prefix='', prefix_sep='',columns=['GeneSymbol'])

# # dummy_df.sort_values('InteractionCount', ascending=False)[:5]

# gb_df = dummy_df.groupby(['ChemicalName']).agg({np.max}).reset_index()
# gb_df.columns = gb_df.columns.droplevel(1)

# gb_df

# trained_model.predict(gb_df[a.vector_columns])

In [None]:
#