<span style="color:black"><span style="font-size:30px">CNN model training</span></span>
&nbsp;&nbsp;&nbsp;
   
    
In this step, we build a convolutional neural network (CNN) model to predict promoter strength. We use the training dataset generated in the previous step (see the section 'Acquisition of promoter dataset').

1.	Import the Python modules required for training the CNN model and load the training dataset generated in the previous step (Acquisition of promoter dataset).

In [None]:
# 1.

from __future__ import absolute_import, division, print_function, unicode_literals
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import numpy as np
import random
import time
import kFold
import itertools
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import theano
from keras import optimizers
from tensorflow.keras import datasets, layers, models
from scipy.stats import pearsonr
from sklearn.model_selection import KFold
from scipy.stats import pearsonr
from tqdm import tqdm
# Load the promoter dataset from the Excel workbook; its 'Promoter' and 'Reads'
# columns are the ones consumed by the one-hot encoding step further down.
data = pd.read_excel('PCC6803 Promoter and reads 100bp.xlsx')

2.	Define the one-hot encoding (OHE) function (see the section 'VAE model training'). The log2-transformed read count from dRNA-seq, log2(reads + 1), is used as the strength of each promoter.

In [None]:
# 2.

def one_hot_encoding(df, seq_column, expression, dtype=None):
    """One-hot encode the sequences of a DataFrame column.

    Every sequence is written into an all-zero matrix whose width is the
    longest sequence length in ``df[seq_column]`` plus 20 columns. Sequences
    are right-aligned so that 10 all-zero columns always follow the last
    base; shorter sequences get additional zero columns on the left.

    Args:
        df: pandas DataFrame holding the sequences and expression values.
        seq_column: name of the column containing sequences made of the
            characters A, C, G and T.
        expression: name of the column containing the numeric target values.
        dtype: NumPy dtype of the returned arrays. When omitted,
            ``theano.config.floatX`` is used, as before.

    Returns:
        A tuple ``(X, y, total_width)`` where ``X`` has shape
        ``(len(df), 1, 4, total_width)`` with rows ordered A, C, G, T,
        ``y`` has shape ``(len(df), 1)`` and ``total_width`` is the padded
        sequence width.

    Raises:
        KeyError: if a sequence contains a character other than A, C, G, T.
    """
    if dtype is None:
        dtype = theano.config.floatX
    pad = 10  # zero columns reserved on each side of the longest sequence
    base_dict = {base: row for row, base in enumerate('ACGT')}
    seqs = df[seq_column].values
    n = len(df)
    total_width = df[seq_column].str.len().max() + 2 * pad
    X = np.zeros((n, 1, 4, total_width), dtype=dtype)
    for i, seq in enumerate(seqs):
        # Derive the start column from the actual width instead of assuming a
        # maximum length of 100, which indexed out of range (or wrapped to
        # negative indices) for datasets with a different longest sequence.
        offset = total_width - pad - len(seq)
        for b, base in enumerate(seq):
            X[i, 0, base_dict[base], offset + b] = 1
    y = np.asarray(df[expression].values, dtype=dtype)[:, np.newaxis]
    return X, y, total_width

# Encode every promoter sequence; the padded width returned last is not needed here.
Xtot, ytot, _ = one_hot_encoding(df=data, seq_column='Promoter', expression='Reads')
# Promoter strength is log2(reads + 1); the +1 keeps zero-read promoters finite.
ytot = np.log2(ytot + 1)

3.	Define 'hyperdict'. 'hyperdict' is the hyperparameter search space: the number of kernels ('con1_num', 'con2_num'), the kernel widths ('con1_len', 'con2_len'), the number of units in the dense layer ('den1_len'), flags indicating whether the optional second convolutional layer and the optional dense layer are added ('con2_prob', 'den_prob'), the dropout rate ('droprate'), the batch size ('batch_size') and the number of epochs ('epochs').

In [None]:
# 3.

# Candidate values for each hyperparameter; one value per key is drawn at random.
hyperdict = {
    # first convolutional layer: kernel count and kernel width
    'con1_num': [4, 8, 16, 32],
    'con1_len': [6, 12, 18, 24],
    # optional second convolutional layer (1 = add it, 0 = skip it)
    'con2_prob': [0, 1],
    'con2_num': [4, 8, 16, 32],
    'con2_len': [6, 12, 18, 24],
    # optional dense layer (1 = add it, 0 = skip it) and its unit count
    'den_prob': [0, 1],
    'den1_len': [4, 8, 16, 32],
    # regularisation and optimisation settings
    'droprate': [0.1, 0.2, 0.3, 0.5],
    'batch_size': [32, 64],
    'epochs': [50, 75, 100, 150, 200],
}

4.	Define the 'create_model' function. 'create_model' returns a model built from the hyperparameter values passed to it, which are chosen at random from 'hyperdict'.

In [None]:
# 4.

def create_model(con1n = None, con1l= None,con2p = None, con2n= None, con2l= None,denp = None,den1= None, dout = None, input_width = 120):
    """Build an (uncompiled) Keras CNN regressor for one-hot encoded sequences.

    Args:
        con1n: number of kernels in the first convolutional layer.
        con1l: kernel width of the first convolutional layer; its kernel
            height is fixed at 4, the height of the input.
        con2p: when equal to 1, a second convolutional layer is added.
        con2n: number of kernels in the optional second convolutional layer.
        con2l: kernel width of the optional second convolutional layer.
        denp: when equal to 1, a hidden dense layer is added before the output.
        den1: number of units in the optional hidden dense layer.
        dout: dropout rate used by every dropout layer.
        input_width: width of the one-hot encoded input; the expected input
            shape is ``(1, 4, input_width)`` in channels-first order.
            Defaults to 120, the previously hard-coded value.

    Returns:
        A ``Sequential`` model ending in a single linear output unit.
    """
    model = models.Sequential()
    model.add(layers.Conv2D(con1n, (4, con1l), activation = 'relu', data_format = 'channels_first',
                            input_shape = (1, 4, input_width)))
    model.add(layers.Dropout(dout))
    if con2p == 1:
        # The first layer already collapsed the height to 1, so this kernel is 1 row high.
        model.add(layers.Conv2D(con2n, (1, con2l), activation = 'relu', data_format = 'channels_first'))
        model.add(layers.Dropout(dout))
    model.add(layers.Flatten())
    model.add(layers.Dropout(dout))
    if denp == 1:
        model.add(layers.Dense(den1, activation = 'relu'))
    model.add(layers.Dense(1))
    return model

5.	Train the CNN model. This code searches for the best model, i.e. the one with the minimum loss value. In addition, to guard against overfitting during training, we use k-fold cross-validation (k = 5).

In [None]:
# 5.

# Random hyperparameter search scored by 5-fold cross-validation.
start = time.time()

total_val = 10000  # lowest mean validation loss found so far (large starting sentinel)
trial = 100        # number of random hyperparameter sets to evaluate
kfold = KFold(n_splits = 5)
oplist = []        # not used in this cell; kept so the name stays defined
for _ in range(trial):
    # Draw one hyperparameter set per trial so that all folds score the same
    # configuration; re-sampling inside the fold loop would average the loss
    # of unrelated models.
    c1n = random.choice(hyperdict['con1_num'])
    c1l = random.choice(hyperdict['con1_len'])
    c2p = random.choice(hyperdict['con2_prob'])
    c2n = random.choice(hyperdict['con2_num'])
    c2l = random.choice(hyperdict['con2_len'])
    denp = random.choice(hyperdict['den_prob'])
    den1 = random.choice(hyperdict['den1_len'])
    dout = random.choice(hyperdict['droprate'])
    ep = random.choice(hyperdict['epochs'])
    b_size = random.choice(hyperdict['batch_size'])

    instant_val = 0
    # KFold.split yields (train indices, held-out indices), not data arrays.
    for train_idx, val_idx in kfold.split(Xtot):
        model = create_model(c1n, c1l, c2p, c2n, c2l, denp, den1, dout)
        model.compile(optimizer = 'Adam', loss = 'mean_squared_error', metrics = ['mean_squared_error'])
        history = model.fit(Xtot[train_idx], ytot[train_idx],
                            validation_data = (Xtot[val_idx], ytot[val_idx]),
                            epochs = ep, verbose = 1, batch_size = b_size)
        # Score the fold on its held-out part; the per-epoch values live in
        # the History object's `history` dict.
        instant_val += history.history['val_loss'][-1]

    mean_val = instant_val / kfold.get_n_splits()
    if total_val > mean_val:
        # Remember the new best score so later, worse trials cannot overwrite the file.
        total_val = mean_val
        # The network written to disk is the one fitted on the last fold of this trial.
        model.save('CNN_model.h5')
print("time :", time.time() - start)