In [1]:
import pandas as pd
import numpy as np
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay

In [4]:
# Load the training CSV; its first column is used as the row index.
train_csv = './Genomes-GeneticDisorders/train_processed_encoded.csv'
df = pd.read_csv(train_csv, index_col=0)
# df.drop(['Test 1','Test 2','Test 3','Test 4','Test 5'], axis=1, inplace=True )
df.head()

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Respiratory Rate (breaths/min),...,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5,Genetic Disorder,Disorder Subclass
0,2.0,1.0,0.0,1.0,0.0,4.760603,5.0,20.0,0.0,0.0,...,1.0,9.857562,3.0,1.0,1.0,1.0,1.0,1.0,0.0,5.0
1,4.0,1.0,1.0,0.0,0.0,4.910669,5.0,23.0,1.0,1.0,...,0.0,5.52256,2.0,1.0,1.0,1.0,1.0,0.0,0.0,2.0
2,6.0,1.0,0.0,0.0,0.0,4.893297,23.0,22.0,0.0,0.0,...,1.0,7.477132,2.0,0.0,1.0,1.0,1.0,1.0,1.0,3.0
3,12.0,1.0,0.0,1.0,0.0,4.70528,3.0,20.0,1.0,1.0,...,1.0,7.919321,1.0,0.0,0.0,1.0,0.0,0.0,0.0,6.0
4,11.0,1.0,0.0,1.0,1.0,4.720703,14.0,20.0,0.0,1.0,...,0.0,4.09821,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0


In [5]:
# Define X data: the first 28 columns of df form the feature matrix.
features_raw = df.iloc[:, :28]
# Z-score each column with the mean / standard deviation taken over every row of df.
X = features_raw.sub(features_raw.mean()).div(features_raw.std())
X.head()

Unnamed: 0,Patient Age,Genes in mother's side,Inherited from father,Maternal gene,Paternal gene,Blood cell count (mcL),Mother's age,Father's age,Status,Respiratory Rate (breaths/min),...,History of anomalies in previous pregnancies,No. of previous abortion,Birth defects,White Blood cell count (thousand per microliter),Blood test result,Symptom 1,Symptom 2,Symptom 3,Symptom 4,Symptom 5
0,-1.127531,0.82473,-0.801981,0.80061,-0.874915,-0.692508,-0.850755,-1.081659,-0.996226,-0.898872,...,0.895583,-0.00206,0.905726,0.94097,1.16241,0.762643,0.821633,0.850322,1.105249,1.182666
1,-0.655954,0.82473,1.246856,-1.248992,-0.874915,0.059089,-0.850755,-0.87906,1.003743,1.112455,...,0.895583,-0.00206,-1.104036,-0.778559,0.294317,0.762643,0.821633,0.850322,1.105249,-0.845509
2,-0.184377,0.82473,-0.801981,-1.248992,-0.874915,-0.027917,0.977537,-0.946593,-0.996226,-0.898872,...,0.895583,1.489342,0.905726,-0.003256,0.294317,-1.311171,0.821633,0.850322,1.105249,1.182666
3,1.230354,0.82473,-0.801981,0.80061,-0.874915,-0.969588,-1.053899,-1.081659,1.003743,1.112455,...,0.895583,-0.747761,0.905726,0.172144,-0.573775,-1.311171,-1.217033,0.850322,-0.904733,-0.845509
4,0.994566,0.82473,-0.801981,0.80061,1.142917,-0.892347,0.063391,-1.081659,-0.996226,1.112455,...,-1.11654,1.489342,-1.104036,-1.343544,1.16241,-1.311171,-1.217033,-1.175972,-0.904733,-0.845509


In [6]:
# One-hot tables for the two label columns, each taken on its own
GD = pd.get_dummies(df["Genetic Disorder"])
DS = pd.get_dummies(df["Disorder Subclass"])

# Combined label: both codes rendered as text and concatenated, e.g. "0.06.0"
combined_text = df["Genetic Disorder"].astype(str) + df["Disorder Subclass"].astype(str)

# Integer-encode the combined strings, then one-hot the resulting ids.
# The fitted encoder is kept so class ids can be decoded back to the strings.
encoder = LabelEncoder()
combined_ids = encoder.fit_transform(combined_text)
GDDS = pd.get_dummies(pd.Series(combined_ids))

In [7]:
# Short names and their label tables, kept in matching order.
datanames = ['GD', 'DS', 'GDDS']
labelsets = [GD, DS, GDDS]
# Container for the per-label-set splits; starts empty.
data = dict()

In [8]:
# Fixed seed: every run produces the same partition, and because X is the same
# for each label set, the same rows land in train/val/test for GD, DS and GDDS.
SPLIT_SEED = 42

for name, labels in zip(datanames, labelsets):

    # Split data: 70% train, then the remaining 30% is halved into val and test
    X_train, X_rem, y_train, y_rem = train_test_split(
        X, labels, train_size=0.7, random_state=SPLIT_SEED)
    X_val, X_test, y_val, y_test = train_test_split(
        X_rem, y_rem, test_size=0.5, random_state=SPLIT_SEED)

    # Report the shape of every split (features first, then labels)
    print('-'*9, name, '-'*9)
    for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
        print(features.shape)
        print(targets.shape)

    # Store as data[name]['X' or 'y']['train' | 'val' | 'test']
    data[name] = {
        'X': {
            'train' : X_train,
            'val' : X_val,
            'test' : X_test
        },
        'y' : {
            'train': y_train,
            'val': y_val,
            'test': y_test
        }
    }

--------- GD ---------
(15458, 28)
(15458, 3)
(3312, 28)
(3312, 3)
(3313, 28)
(3313, 3)
--------- DS ---------
(15458, 28)
(15458, 9)
(3312, 28)
(3312, 9)
(3313, 28)
(3313, 9)
--------- GDDS ---------
(15458, 28)
(15458, 17)
(3312, 28)
(3312, 17)
(3313, 28)
(3313, 17)


In [9]:
def evaluate(y_pred, y_true):
    """Print accuracy, confusion matrix and classification report.

    Both inputs are two-dimensional with one row per sample; every row is
    collapsed to the index of its largest entry before scoring.

    Parameters
    ----------
    y_pred : array-like
        Per-class scores, one row per sample.
    y_true : array-like
        Reference labels in the same row/column layout (converted with
        ``np.asarray``, so a DataFrame is accepted).

    Returns
    -------
    None
        All results are printed.
    """
    predicted = np.argmax(y_pred, axis=1)
    actual = np.asarray(y_true).argmax(axis=1)

    # Performance metrics: share of rows where the two class indices agree
    print('Accuracy:', np.mean(predicted == actual))

    # Confusion matrix (called with the true labels first, predictions second)
    print(confusion_matrix(actual, predicted))

    print(classification_report(actual, predicted))
    # Disabled in the notebook: tn/fp/fn/tp unpacking, a specificity print and
    # a ConfusionMatrixDisplay plot.

In [12]:
# Baseline model
# input: X data (the standardized feature columns)
# output: GD -- a softmax over the one-hot "Genetic Disorder" classes

# Define callback for early stopping: stop once the monitored loss has failed
# to improve by at least 0.005 for 3 consecutive epochs.
# NOTE(review): this monitors the *training* loss although validation data is
# passed to fit(); confirm whether monitor='val_loss' was intended.
callback = EarlyStopping(monitor='loss', min_delta=0.005,patience=3)

# Define model
def build_baseline_model(n_features=28, n_classes=3):
    """Build and compile the baseline dense classifier.

    The builder has its own name so that assigning the trained instance to
    ``baseline_model`` no longer overwrites the function that creates it.

    Parameters
    ----------
    n_features : int
        Width of the input layer (number of feature columns).
    n_classes : int
        Width of the softmax output layer (number of one-hot label columns).

    Returns
    -------
    A compiled ``Sequential`` model: Dense(24, relu) -> Dense(18, relu) ->
    Dense(n_classes, softmax), categorical cross-entropy loss, Adam optimizer.
    """
    # create model
    model = Sequential()
    model.add(Dense(24, input_dim=n_features, activation='relu'))
    model.add(Dense(18, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Layer widths follow the stored GD split instead of being hard-coded.
gd_inputs, gd_targets = data['GD']['X'], data['GD']['y']
baseline_model = build_baseline_model(
    n_features=gd_inputs['train'].shape[1],
    n_classes=gd_targets['train'].shape[1],
)
baseline_model.fit(
    gd_inputs['train'],
    gd_targets['train'],
    validation_data=(gd_inputs['val'], gd_targets['val']),
    epochs=50,
    callbacks=[callback],
)

# Score the held-out test split
y_pred = baseline_model.predict(gd_inputs['test'])
evaluate(y_pred, gd_targets['test'])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Accuracy: 0.5922124962269846
[[1451   64  309]
 [ 145   83   94]
 [ 694   45  428]]
              precision    recall  f1-score   support

           0       0.63      0.80      0.71      1824
           1       0.43      0.26      0.32       322
           2       0.52      0.37      0.43      1167

    accuracy                           0.59      3313
   macro avg       0.53      0.47      0.49      3313
weighted avg       0.57      0.59      0.57      3313



In [13]:
# Predicted class index for every row of y_pred
y_pred.argmax(axis=1)

array([ 6, 13, 14, ..., 13,  6, 13])

In [14]:
# Display the test-split label table stored for the 'GDDS' label set
data['GDDS']['y']['test']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
10376,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
15696,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2171,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
881,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10056,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14134,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1512,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
13029,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4768,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Genetic-disorder (GD) model: 28 feature inputs -> 3-way softmax
def build_gd_model():
    """Build and compile the genetic-disorder classifier.

    The builder has its own name so that assigning the trained instance to
    ``GD_model`` does not overwrite the function that creates it.

    Returns
    -------
    A compiled ``Sequential`` model: Dense(14, relu) -> Dense(14, relu) ->
    Dense(3, softmax), categorical cross-entropy loss, Adam optimizer.
    """
    # create model
    model = Sequential()
    model.add(Dense(14, input_dim=28, activation='relu'))
    model.add(Dense(14, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

GD_model = build_gd_model()
# Targets: GD, the one-hot "Genetic Disorder" table built with pd.get_dummies;
# its 3 columns match the width of the softmax layer.
GD_model.fit(X, GD, epochs=15)
# estimator = KerasClassifier(build_fn=build_gd_model, epochs=20, batch_size=25, verbose=0)
# kfold = KFold(n_splits=2, shuffle=True)
# results = cross_val_score(estimator, X, GD, cv=kfold)
# print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))


In [None]:
# define disease subclass model
def build_ds_model():
    """Build and compile the disorder-subclass classifier.

    The builder has its own name so that assigning the trained instance to
    ``DS_model`` does not overwrite the function that creates it.

    Returns
    -------
    A compiled ``Sequential`` model taking 31 inputs (the 28 feature columns
    plus 3 genetic-disorder columns): Dense(14, relu) -> Dense(14, relu) ->
    Dense(9, softmax), categorical cross-entropy loss, Adam optimizer.
    """
    # create model
    model = Sequential()
    model.add(Dense(14, input_dim=31, activation='relu'))
    model.add(Dense(14, activation='relu'))
    model.add(Dense(9, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Training inputs: features joined with the one-hot "Genetic Disorder" table,
# then standardized column-wise. Built here so the table exists before fit()
# when the notebook is run top to bottom.
X2 = X.join(GD)
X2 = (X2 - X2.mean())/X2.std()

DS_model = build_ds_model()
# Targets: DS, the one-hot "Disorder Subclass" table (9 columns, matching the softmax width).
DS_model.fit(X2, DS, epochs=15)

In [None]:
# define a wider disease subclass model (28 and 23 hidden units)
def build_ds_model2():
    """Build and compile the wider disorder-subclass classifier.

    The builder has its own name so that assigning the trained instance to
    ``DS_model2`` does not overwrite the function that creates it.

    Returns
    -------
    A compiled ``Sequential`` model taking 31 inputs (the 28 feature columns
    plus 3 genetic-disorder columns): Dense(28, relu) -> Dense(23, relu) ->
    Dense(9, softmax), categorical cross-entropy loss, Adam optimizer.
    """
    # create model
    model = Sequential()
    model.add(Dense(28, input_dim=31, activation='relu'))
    model.add(Dense(23, activation='relu'))
    model.add(Dense(9, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Training inputs: features joined with the one-hot "Genetic Disorder" table,
# then standardized column-wise. Built here so the table exists before fit()
# when the notebook is run top to bottom.
X2 = X.join(GD)
X2 = (X2 - X2.mean())/X2.std()

DS_model2 = build_ds_model2()
# Targets: DS, the one-hot "Disorder Subclass" table (9 columns, matching the softmax width).
DS_model2.fit(X2, DS, epochs=25)

In [None]:
# First three feature rows as a plain array: a small batch for spot-checking predict()
test = np.array(X.head(3))
GD_model.predict(test)

In [None]:
# Extended inputs: the feature columns plus the one-hot "Genetic Disorder"
# columns (GD is the table built with pd.get_dummies).
X2 = X.join(GD)
# Standardize every column, including the appended one-hot columns
X2 = (X2 - X2.mean())/X2.std()

In [None]:
# Check the dimensions of the extended input table
X2.shape

In [None]:
# Class scores of the GD model for every row of X
GD_out = GD_model.predict(X)
# Accuracy against the one-hot "Genetic Disorder" table. The mean of the
# per-row match flags divides by the actual row count, not a hard-coded one.
gd_hits = np.argmax(GD_out, axis=1) == np.argmax(np.array(GD), axis=1)
print(gd_hits.mean())

In [None]:
# Features plus the GD model's predicted class scores. The prediction frame is
# given X's own index so the join pairs rows by position; with a default
# 0..n-1 index the join would fill NaN whenever X's index labels differ.
X3 = X.join(pd.DataFrame(GD_out, index=X.index))
X3.shape

In [None]:
# Integer class id of every row of GDDS, the one-hot table of the combined
# (genetic disorder + subclass) label
np.argmax(np.array(GDDS), axis=1)

In [None]:
# Subclass predictions when the GD model's predicted scores stand in for the true GD columns
outputs2 = DS_model.predict(X3)
# Number of rows whose predicted subclass matches the true one. DS_model has a
# 9-way output, so the reference is DS (one-hot "Disorder Subclass"), not the
# combined-label table, whose class ids live in a different label space.
print(np.sum(np.argmax(outputs2, axis=1) == np.argmax(np.array(DS), axis=1)))

In [None]:
# Stage 1 spot check: GD model on the small `test` batch
M1_pred = GD_model.predict(test)
# Predicted class ids ...
print(np.argmax(M1_pred, axis=1))
# ... next to the true ids of the first three rows of GD (one-hot "Genetic Disorder")
print(np.argmax(np.array(GD)[0:3,:], axis=1))


In [None]:
# Stage 2 spot check: append the stage-1 class scores to the feature batch
# column-wise and feed the result to the subclass model
X_ex = np.concatenate((test, M1_pred), axis=1)
M2_pred = DS_model.predict(X_ex)
# Predicted subclass ids ...
print(np.argmax(M2_pred, axis=1))
# ... next to the true ids of the first three rows of DS (one-hot "Disorder Subclass")
print(np.argmax(np.array(DS)[0:3,:], axis=1))

In [None]:
# Baseline spot check on the first three feature rows
baseline_pred = baseline_model.predict(X.iloc[0:3,:])
print(np.argmax(baseline_pred, axis=1))
# Reference labels: the baseline model in this notebook is fitted on the 'GD'
# split and ends in a 3-way softmax, so its class ids are compared with the
# one-hot "Genetic Disorder" table.
# NOTE(review): if the baseline is retrained on the combined labels, compare
# against GDDS here instead.
print(np.argmax(np.array(GD)[0:3,:], axis=1))

In [None]:
# Decode the predicted class ids with the fitted label encoder.
# NOTE(review): `encoder` is fitted on the combined GD+DS label strings, while
# the baseline model defined in this notebook has a 3-unit output layer --
# confirm the ids being decoded come from a model trained on the combined labels.
encoder.inverse_transform(baseline_pred.argmax(axis=1))

In [None]:
# Decode the true combined labels of the first three rows: GDDS is the one-hot
# table derived from the ids this encoder produced, so its column positions
# map back to the original label strings.
encoder.inverse_transform(np.argmax(np.array(GDDS)[0:3,:], axis=1))