# CNN model input data loading, train, and evaluate
This jupyter notebook shows how preprocessed input datas be further processed as final input form then, train and evaluate.

CNN model from https://github.com/IcarPA-TBlab/MetagenomicDC/blob/master/models/CNN.py

This model is used in the paper from Fiannaca (https://doi.org/10.1186/s12859-018-2182-6)

In [None]:
from sklearn.metrics import f1_score, matthews_corrcoef
import sys
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Convolution1D
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution1D, MaxPooling1D
from keras.utils import np_utils
from keras import backend as K
import numpy as np
import wandb
from wandb.keras import WandbCallback

wandb.init(project="CNN", entity="bachelorprojectgroup9",settings=wandb.Settings(start_method="fork"))

In [1]:
def load_data(file):
    lista = []
    records = list(open(file, "r"))
    records = records[1:]
    for seq in records:
        elements = seq.split(",")
        level = elements[-1].split("\n")
        classe = level[0]
        lista.append(classe)

    #make taxon list
    lista = set(lista)
    classes = list(lista)
    X = []
    Y = []
    for seq in records:
        elements = seq.split(",")
        X.append(elements[1:-1])
        level = elements[-1].split("\n")
        classe = level[0]
        Y.append(classes.index(classe))
    X = np.array(X, dtype=float)
    Y = np.array(Y, dtype=int)
    data_max = np.amax(X)
    X = X / data_max
    return  X, Y, classes, len(X[0])

## 1. Data loading process

input data contains k-mer matrix, k-mer frequency table, and corresponding taxon.

In [2]:
X, Y, classes, input_len =load_data('/Users/jihwanlim/Desktop/G9_CNN/Sample.txt')
print(X)
print(Y)
print(classes)
print(input_len)

[[0.22222222 0.22222222 0.         ... 0.11111111 0.11111111 0.        ]
 [0.         0.         0.11111111 ... 0.         0.         0.        ]
 [0.         0.11111111 0.11111111 ... 0.         0.22222222 0.22222222]]
[0 1 2]
['Eubacteriales', 'Rhodospirillales', 'Bacteroidales']
1024


Input data (X) is shown in above. each value of k-mer frequency table is divided with whth the maximum value of table for normalization.

(Y)refers to the labeling of corressponding each input data and classes shows the name of label, taxon name.

Input length depends on the k value of k-mer.

In [7]:
def create_model(nb_classes, input_length):
    model = Sequential()
    model.add(Convolution1D(5, 5, padding='valid', input_shape=(input_length,1)))  # input_dim
    model.add(Activation('relu'))
    model.add(MaxPooling1D(pool_size=2, padding='valid'))
    model.add(Convolution1D(10, 5, padding='valid'))
    model.add(Activation('relu'))
    model.add(MaxPooling1D(pool_size=2, padding='valid'))
    model.add(Flatten())
    ##
    ##MLP
    model.add(Dense(500))
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

## 2. Train Model



In [None]:
def train_model(model, datatr, labelstr, dataval, labelsval, nb_classes):
    datatr = datatr.reshape(datatr.shape + (1,))
    labelstr = np_utils.to_categorical(labelstr, nb_classes)

    dataval = dataval.reshape(dataval.shape + (1,))
    labelsval = np_utils.to_categorical(labelsval, nb_classes)

    print ('Fitting model...')
    model_fit = model.fit(datatr, labelstr, epochs=30, batch_size=512, verbose=1, validation_data = (dataval, labelsval),callbacks=[WandbCallback()])

    tr_scores = model.evaluate(datatr, labelstr, verbose=1)

    return model_fit

Input data is rearragend into (num_dataset, input_length, 1) numpy array shape with x.reshape(x.shape + (1,))
Labels are transformed into one hot encoding form.

In [3]:
X_reshape = X.reshape(X.shape + (1,))
print(X_reshape)

[[[0.22222222]
  [0.22222222]
  [0.        ]
  ...
  [0.11111111]
  [0.11111111]
  [0.        ]]

 [[0.        ]
  [0.        ]
  [0.11111111]
  ...
  [0.        ]
  [0.        ]
  [0.        ]]

 [[0.        ]
  [0.11111111]
  [0.11111111]
  ...
  [0.        ]
  [0.22222222]
  [0.22222222]]]


# 3. Evaluate Model

In [None]:
def evaluate_model(model, datate, labelste, nb_classes):

    labelste_bin = np_utils.to_categorical(labelste, nb_classes)
    datate = datate.reshape(datate.shape + (1,))

    preds = model.predict_classes(datate, verbose=1)

    score, acc = model.evaluate(datate, labelste_bin, verbose=1)
    print('Test loss:', score)
    print('Test accuracy:', acc)
    return preds, labelste_bin

Preds value means that predeiction of test input data by CNN model.

labelste_bin is the answer of prediction.

In [None]:
if __name__ == "__main__":
    
    print ('Loading data...')

    x_train, y_train, classes, input_length = load_data(sys.argv[2])

    x_val, y_val, classes_val, val_input_length_semi = load_data(sys.argv[3])

    x_test, y_test, classes_test, test_input_length_semi = load_data(sys.argv[4])

    nb_classes = len(classes)

    print ('Loading model...')
    
    model = create_model(nb_classes, len(x_train[0]))   
    model_fit = train_model(model, x_train, y_train, x_val, y_val, nb_classes)
    
    model.save("/home/ba3-project-9/CNN/result/CNN_aa_d3_20,256.h5")

    print(model.summary())


    pred, Y_test = evaluate_model(model, x_test, y_test, nb_classes)
 

    Y_test_decode = []
    Y_test = Y_test.tolist()
    for matrix in Y_test:
        Y_test_decode.append(matrix.index(1.0))

    f1 = f1_score(Y_test_decode, pred, average='weighted')
    mcc = matthews_corrcoef(Y_test_decode, pred)
    print('f1_score:', f1)
    print('MCC:', mcc)

    np.save("/home/ba3-project-9/CNN/result/pred_tanh", pred)
    np.save("/home/ba3-project-9/CNN/result/Y_test_tanh", Y_test)