In [10]:
import os
import time
import datetime

import torch
import argparse
import numpy as np
import pandas as pd
import torch.nn.functional as F

from sklearn import metrics
from utils import datasets, dataset_loader_two_tfs, test_dataset_loader
from network import MTL_Model, ConvNet

# Gate the notebook on CUDA: report the installed torch version when a GPU
# is visible to PyTorch, otherwise print a notice and stop the process.
if not torch.cuda.is_available():
    print('Torch is not available. Process is terminated')
    quit()
else:
    print('Torch',torch.__version__, 'is available')

Torch 1.10.2 is available


In [11]:
# The two transcription factors handled by this run. They are fixed here
# instead of being read from the command line (formerly: tfs = args.TF).
tfs = ['ARID3A', 'ZBTB7A']
# When True, the hyperparameter cell uses a single fixed configuration and
# the settings cell prints the resolved dataset indices.
CodeTesting = True
print(' '.join(['TF Binding Prediction for', tfs[0], 'and', tfs[1]]))
print('Searching for all hyperparameter settings...')

TF Binding Prediction for ARID3A and ZBTB7A
Searching for all hyperparameter settings...


In [12]:
# Hyperparameters
# Fixed settings shared by every configuration of the search.
num_epochs = 150
reverse_mode = False
num_motif_detector = 16
motif_len = 24
batch_size = 64
# beta1/beta2/beta3 are handed to the model constructor further down.
beta1 = 2*10**-6
beta2 = 5*10**-6
beta3 = 2*10**-6

# Search space: one list of candidate values per tunable hyperparameter.
if CodeTesting:
    pool_type = ['max']
    hidden_layer_type = [True] # add one hidden layer or not
    dropout_rate_type = [0.2]
    lr_type = [0.01]
    scheduler_type = [True] # use Cosine Annealing or not
    opt_type = ['Adam'] # optimizer
else:
    # Only the CodeTesting grid is defined in this notebook. Without this
    # branch the product below would fail with a NameError on `pool_type`.
    raise NotImplementedError(
        'No hyperparameter search space is defined for CodeTesting=False; '
        'define pool_type, hidden_layer_type, dropout_rate_type, lr_type, '
        'scheduler_type and opt_type before running this cell.'
    )

# Number of distinct configurations = product of the list lengths.
total_cases = len(pool_type)*len(hidden_layer_type)*len(dropout_rate_type)*len(lr_type)*len(scheduler_type)*len(opt_type)

print('Total cases :', total_cases)

Total cases : 1


In [13]:
# Settings
# Compute device: first CUDA GPU when one is available, CPU otherwise.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

# dataset
# `datasets(path)` supplies the per-TF dataset entries; TF_to_idx maps a TF
# name to the position that is used to index into that result.
path = './data/encode/'
all_dataset_names = datasets(path)
TF_to_idx = {
    tf_label: position
    for position, tf_label in enumerate(
        ('ARID3A', 'CTCFL', 'ELK1', 'FOXA1', 'GABPA',
         'MYC', 'REST', 'SP1', 'USF1', 'ZBTB7A')
    )
}
TF1_idx, TF2_idx = TF_to_idx[tfs[0]], TF_to_idx[tfs[1]]
if CodeTesting:
    # Echo the resolved indices as a sanity check.
    print(f'{tfs[0]} idx : {TF1_idx}', f'{tfs[1]} idx : {TF2_idx}', sep='\n')

ARID3A idx : 0
ZBTB7A idx : 9


In [14]:
# For each of the two TFs, pick its entry from `all_dataset_names`; element 0
# of the entry is used as the training file and element 1 as the test file.
# The TF name is the part of the training path after `path` and before "_AC".
tf1_dataset_name = all_dataset_names[TF1_idx]
tf1_train_dataset_path, tf1_test_dataset_path = tf1_dataset_name[0], tf1_dataset_name[1]
tf1_name = tf1_train_dataset_path.split(path)[1].partition("_AC")[0]

tf2_dataset_name = all_dataset_names[TF2_idx]
tf2_train_dataset_path, tf2_test_dataset_path = tf2_dataset_name[0], tf2_dataset_name[1]
tf2_name = tf2_train_dataset_path.split(path)[1].partition("_AC")[0]

# Open question: should the order be shuffled when the two datasets are appended?
train_data_loader, valid_data_loader, all_data_loader = dataset_loader_two_tfs(
    tf1_train_dataset_path,
    tf2_train_dataset_path,
    batch_size,
    reverse_mode,
)

In [16]:
# Progress marker: everything below this cell belongs to the training stage.
print('Model Training')

Model Training


In [17]:
# specify hyperparameters
# `case_num` is treated as the 1-based number of the configuration to run
# (1 .. product of the list lengths). The previous code decoded it as if it
# were 0-based, so case_num = 1 was already out of range for a single-case
# grid and only worked because every list had length 1.
case_num = 1
if not 1 <= case_num <= (len(opt_type)*len(scheduler_type)*len(lr_type)
                         *len(dropout_rate_type)*len(hidden_layer_type)*len(pool_type)):
    raise ValueError(f'case_num={case_num} is outside the hyperparameter grid')

# Mixed-radix decoding of the 0-based case index: each divmod peels off the
# choice for one hyperparameter and passes the quotient on to the next.
share = case_num - 1
(share, remainder) = divmod(share, len(opt_type))
opt = opt_type[remainder]
(share, remainder) = divmod(share, len(scheduler_type))
scheduler = scheduler_type[remainder]
(share, remainder) = divmod(share, len(lr_type))
lr = lr_type[remainder]
(share, remainder) = divmod(share, len(dropout_rate_type))
dropout_rate = dropout_rate_type[remainder]
(share, remainder) = divmod(share, len(hidden_layer_type))
hidden_layer = hidden_layer_type[remainder]
(share, remainder) = divmod(share, len(pool_type))
pool = pool_type[remainder]

In [13]:
# Build the two-head model with the selected hyperparameters.
model = MTL_Model(num_motif_detector,motif_len,pool,hidden_layer,'training',lr,opt, dropout_rate,beta1,beta2,beta3,device,reverse_complemet_mode=False)

# optimizer
# Only the hidden-layer variant is supported; fail loudly here instead of
# leaving `optimizer` undefined and hitting a NameError further down.
if not hidden_layer:
    raise NotImplementedError('Training without the hidden layer is not supported')

# Parameters of both heads that are updated by the optimizer.
trainable_params = [
    model.net1.wConv,model.net1.wRect,model.net1.wNeu,model.net1.wNeuBias,model.net1.wHidden,model.net1.wHiddenBias,
    model.net2.wConv,model.net2.wRect,model.net2.wNeu,model.net2.wNeuBias,model.net2.wHidden,model.net2.wHiddenBias
]
# Honour the selected optimizer name. Previously opt='Adam' silently built an
# SGD optimizer and opt='SGD' built nothing at all.
if opt == 'SGD':
    optimizer = torch.optim.SGD(trainable_params, lr = lr)
elif opt == 'Adam':
    optimizer = torch.optim.Adam(trainable_params, lr = lr)
else:
    raise ValueError(f'Unsupported optimizer: {opt!r}')

# scheduler
# NOTE: `scheduler` enters this cell as the boolean flag chosen during
# hyperparameter selection and leaves it as the scheduler object, so
# re-running this cell alone takes the constant-learning-rate branch.
if scheduler == True:
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0)
else:
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1, end_factor=1) # constant learning rate

In [23]:
# Loaders used by the training loop.
train_loader = train_data_loader
valid_loader = valid_data_loader

# Best validation AUC seen so far for head 1 and head 2.
best_AUC_1 = 0
best_AUC_2 = 0

# Start the log line that the training loop appends to. The directory is
# created first so opening the file does not fail on a fresh checkout.
os.makedirs("./test/", exist_ok=True)
with open("./test/"+'2020_4_7'+'.txt', "a") as file:
    file.write('Train AUC over epochs : ')

In [24]:
def _keep_even_indexed_targets(target):
    """Return a (len(target)//2, 1) tensor holding target[0], target[2], ...

    Used when `reverse_mode` is on; the result is moved to `device`.
    """
    target_2=torch.randn(int(target.shape[0]/2),1)
    for i in range(target_2.shape[0]):
        target_2[i]=target[2*i]
    return target_2.to(device)


def _mean_batch_auc(loader):
    """Return the mean per-batch ROC AUC of head 1 and head 2 over `loader`.

    Batches for which `roc_auc_score` raises ValueError are skipped for both
    heads. If every batch is skipped the result is NaN for both heads.
    Call this under `torch.no_grad()`.
    """
    auc1 = []
    auc2 = []
    for data, target in loader:
        data = data.to(device)
        target = target.to(device)
        if reverse_mode:
            target = _keep_even_indexed_targets(target)

        # Forward pass
        output1, output2 = model.forward(data)

        pred1 = torch.sigmoid(output1).cpu().detach().numpy().reshape(output1.shape[0])
        pred2 = torch.sigmoid(output2).cpu().detach().numpy().reshape(output2.shape[0])
        labels = target.cpu().numpy().reshape(output1.shape[0])

        try:
            batch_auc1 = metrics.roc_auc_score(labels, pred1)
            batch_auc2 = metrics.roc_auc_score(labels, pred2)
        except ValueError:
            # AUC could not be computed for this batch; leave it out.
            continue
        auc1.append(batch_auc1)
        auc2.append(batch_auc2)

    if not auc1:
        return float('nan'), float('nan')
    return np.mean(auc1), np.mean(auc2)


def _save_head(net, tag):
    """Save the weights of one head to ./Models/2020_4_7/<tag>.pth."""
    state = {'conv': net.wConv,
            'rect':net.wRect,
            'wHidden':net.wHidden,
            'wHiddenBias':net.wHiddenBias,
            'wNeu':net.wNeu,
            'wNeuBias':net.wNeuBias}
    os.makedirs('./Models/' + '2020_4_7', exist_ok=True)
    torch.save(state, './Models/' + '2020_4_7'+ '/' + str(tag) + '.pth')


# The loss below needs the hidden-layer weights; refuse other configurations
# up front instead of failing inside the loop with an undefined `loss`.
if not hidden_layer:
    raise NotImplementedError('Training without the hidden layer is not supported')

for epoch in range(num_epochs):
    if epoch%10 == 0:
        print(epoch, 'th epoch over ', num_epochs)

    # Evaluation below switches the model to 'test'. Switch back to the mode
    # the model was constructed with so later epochs do not train in test
    # mode. Assumes MTL_Model reads `mode` as assigned here - TODO confirm.
    model.mode = 'training'
    for data, target in train_loader:
        data = data.to(device)
        target = target.to(device)
        if reverse_mode:
            target = _keep_even_indexed_targets(target)

        # Forward pass
        output1, output2 = model.forward(data)

        # Per head: binary cross-entropy plus weighted norms of the
        # convolution, hidden and output weights.
        loss = F.binary_cross_entropy(torch.sigmoid(output1),target)+model.net1.beta1*model.net1.wConv.norm()+model.net1.beta2*model.net1.wHidden.norm()+model.net1.beta3*model.net1.wNeu.norm()
        loss += F.binary_cross_entropy(torch.sigmoid(output2),target)+model.net2.beta1*model.net2.wConv.norm()+model.net2.beta2*model.net2.wHidden.norm()+model.net2.beta3*model.net2.wNeu.norm()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    scheduler.step()

    with torch.no_grad():
        model.mode='test'
        # for train set
        AUC_training_1, AUC_training_2 = _mean_batch_auc(train_loader)
        # for valid set
        AUC_valid_1, AUC_valid_2 = _mean_batch_auc(valid_loader)

    # Append "<train1>-<valid1>:<train2>-<valid2>, " for this epoch to the log.
    with open("./test/"+'2020_4_7'+'.txt', "a") as file:
        file.write(str(AUC_training_1))
        file.write('-')
        file.write(str(AUC_valid_1))
        file.write(':')
        file.write(str(AUC_training_2))
        file.write('-')
        file.write(str(AUC_valid_2))
        file.write(', ')

    # Checkpoint each head independently whenever its validation AUC improves.
    if AUC_valid_1 > best_AUC_1:
        best_AUC_1 = AUC_valid_1
        best_model = model  # alias of the live model, not a snapshot
        _save_head(model.net1, 1)

    if AUC_valid_2 > best_AUC_2:
        best_AUC_2 = AUC_valid_2
        best_model = model  # alias of the live model, not a snapshot
        _save_head(model.net2, 2)

# Finish the per-epoch log line and record the best validation AUC of each
# head; the `with` block closes the file on exit.
with open("./test/"+'2020_4_7'+'.txt', "a") as file:
    file.writelines([
        '\n',
        'Best Trainig(validation) AUC : ',
        str(best_AUC_1),
        ' and ',
        str(best_AUC_2),
        '\n',
    ])

print('Training Completed')

0 th epoch over  150
10 th epoch over  150
20 th epoch over  150
30 th epoch over  150
40 th epoch over  150
50 th epoch over  150
60 th epoch over  150
70 th epoch over  150
80 th epoch over  150
90 th epoch over  150
100 th epoch over  150
110 th epoch over  150
120 th epoch over  150
130 th epoch over  150
140 th epoch over  150
Training Completed


In [18]:
# Testing

print('Model Testing')

# One test loader per transcription factor.
test_loader1 = test_dataset_loader(tf1_test_dataset_path, motif_len)
test_loader2 = test_dataset_loader(tf2_test_dataset_path, motif_len)


def _load_best_convnet(tag):
    """Rebuild a ConvNet from the checkpoint ./Models/2020_4_7/<tag>.pth.

    Returns (checkpoint, net): the loaded checkpoint dict and a ConvNet on
    `device` whose weight attributes are replaced by the checkpoint entries.
    """
    # map_location puts the stored tensors on the device configured for this run.
    checkpoint = torch.load('./Models/' + '2020_4_7'+ '/' + str(tag) + '.pth', map_location=device)
    net = ConvNet(num_motif_detector,motif_len,pool,hidden_layer,'testing',lr,opt,dropout_rate,beta1,beta2,beta3, device, reverse_complemet_mode=False).to(device)
    net.wConv=checkpoint['conv']
    net.wRect=checkpoint['rect']
    net.wHidden=checkpoint['wHidden']
    net.wHiddenBias=checkpoint['wHiddenBias']
    net.wNeu=checkpoint['wNeu']
    net.wNeuBias=checkpoint['wNeuBias']
    return checkpoint, net


# using the model with best validation AUC
checkpoint1, model1 = _load_best_convnet(1)
checkpoint2, model2 = _load_best_convnet(2)

Model Testing


In [19]:
def _mean_test_auc(net, loader):
    """Return the mean per-batch ROC AUC of `net` over `loader`.

    The network is switched to 'test' mode first. Batches for which
    `roc_auc_score` raises ValueError are skipped; if every batch is skipped
    the result is NaN.
    """
    net.mode='test'
    auc=[]
    with torch.no_grad():
        for data, target in loader:
            data = data.to(device)
            target = target.to(device)
            if net.reverse_complemet_mode:
                # Keep the target of every second sample: target[0], target[2], ...
                target_2=torch.randn(int(target.shape[0]/2),1)
                for i in range(target_2.shape[0]):
                    target_2[i]=target[2*i]
                target=target_2.to(device)

            # Forward pass
            output = net(data)
            pred=torch.sigmoid(output).cpu().detach().numpy().reshape(output.shape[0])
            labels=target.cpu().numpy().reshape(output.shape[0])
            try:
                auc.append(metrics.roc_auc_score(labels, pred))
            except ValueError:
                # AUC could not be computed for this batch; leave it out.
                pass

    return np.mean(auc) if auc else float('nan')


def _log_test_auc(label, value):
    """Append '<label><value rounded to 5 decimals>' as one line to the run log."""
    with open("./test/"+'2020_4_7'+'.txt', "a") as file:
        file.write(label)
        file.write(str(round(value, 5)))
        file.write('\n')


# Head 1 on the first TF's test set.
AUC1_test = _mean_test_auc(model1, test_loader1)
_log_test_auc('AUC 1 Test : ', AUC1_test)

# Head 2 on the second TF's test set.
AUC2_test = _mean_test_auc(model2, test_loader2)
_log_test_auc('AUC 2 Test : ', AUC2_test)

print('Testing Completed')

Testing Completed
