In [1]:
import os
import torch
import numpy as np
import pandas as pd
import torch.nn.functional as F

from sklearn import metrics
from TFBP import datasets, dataset_loader_MT, test_dataset_loader_MT
from TFBP import MTL_Model,

if(torch.cuda.is_available()):
    print('Torch',torch.__version__, 'is available')
else:
    print('Torch is not available. Process is terminated')
    quit()

Torch 1.10.2 is available


In [2]:
# tfs = args.TF
tfs = ['ARID3A', 'ZBTB7A']
CodeTesting = True

print('TF Binding Prediction for', tfs[0], 'and', tfs[1])
print('Searching for all hyperparameter settings...')

TF Binding Prediction for ARID3A and ZBTB7A
Searching for all hyperparameter settings...


In [3]:
# Hyperparameters
num_epochs = 150
reverse_mode = False
num_motif_detector = 16
motif_len =24
batch_size = 64
beta1 = 2*10**-6
beta2 = 2*10**-6 # hyperparameter tuning 해야함!!
beta3 = 5*10**-6
beta4 = 2*10**-6
if CodeTesting:
    pool_type = ['max']
    dropout_rate_type = [0.2]
    lr_adam_type = [0.01]
    scheduler_type = [True] # use Cosine Annealing or not
    opt_type = ['Adam'] # optimizer

# total_cases = len(pool_type)*len(hidden_layer_type)*len(dropout_rate_type)*len(lr_type_sgd)*len(scheduler_type)*len(opt_type)
total_cases = len(pool_type)*len(dropout_rate_type)*len(lr_adam_type)*len(scheduler_type)*len(opt_type)

print('Total cases :', total_cases)

Total cases : 1


In [4]:
# Settings
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# dataset
path = './data/encode/'
all_dataset_names = datasets(path)
TF_to_idx = {'ARID3A' : 0, 'CTCFL' : 1, 'ELK1' : 2, 'FOXA1' : 3, 'GABPA' : 4, 'MYC' : 5, 'REST' : 6, 'SP1' : 7, 'USF1' : 8, 'ZBTB7A' : 9}
TF1_idx = TF_to_idx[tfs[0]]
TF2_idx = TF_to_idx[tfs[1]]
if(CodeTesting):
    print(f'{tfs[0]} idx : {TF1_idx}')
    print(f'{tfs[1]} idx : {TF2_idx}')

ARID3A idx : 0
ZBTB7A idx : 9


In [5]:
tf1_dataset_name = all_dataset_names[TF1_idx]
tf1_train_dataset_path = tf1_dataset_name[0]
tf1_test_dataset_path = tf1_dataset_name[1]
tf1_name = tf1_train_dataset_path.split(path)[1].split("_AC")[0]

tf2_dataset_name = all_dataset_names[TF2_idx]
tf2_train_dataset_path = tf2_dataset_name[0]
tf2_test_dataset_path = tf2_dataset_name[1]
tf2_name = tf2_train_dataset_path.split(path)[1].split("_AC")[0]

In [8]:
train_data_loader, valid_data_loader, all_data_loader = dataset_loader_MT(tf1_train_dataset_path, tf2_train_dataset_path, batch_size, reverse_mode)

In [7]:
print('Model Training')

Model Training


In [6]:
# specify hyperparameters
case_num = 1
(share, remainder) = divmod(case_num, len(opt_type))
opt = opt_type[remainder]
(share, remainder) = divmod(share, len(scheduler_type))
scheduler = scheduler_type[remainder]
(share, remainder) = divmod(share, len(lr_adam_type))
lr = lr_adam_type[remainder]
(share, remainder) = divmod(share, len(dropout_rate_type))
dropout_rate = dropout_rate_type[remainder]
(share, remainder) = divmod(share, len(pool_type))
pool = pool_type[remainder]

In [9]:
model = MTL_Model(num_motif_detector,motif_len,pool,'training',lr, dropout_rate, device)

In [10]:
# optimizer
if opt == 'SGD':
    print('not possible')
else:
    optimizer = torch.optim.SGD([
        model.net.wConv1, model.net.wRect1, model.net.wConv2, model.net.wRect2,
        model.net1.wNeu,model.net1.wNeuBias,model.net1.wHidden,model.net1.wHiddenBias,
        model.net2.wNeu,model.net2.wNeuBias,model.net2.wHidden,model.net2.wHiddenBias
    ], lr = lr)

# scheduler
if scheduler == True:
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs, eta_min=0)
else:
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=1, end_factor=1) # constant learning rate

In [11]:
train_loader = train_data_loader
valid_loader = valid_data_loader

loss_best = 1000

with open("./test/"+'2020_4_12'+'.txt', "a") as file:
    file.write('(train auc 1, train auc 2, train loss, valid auc 1, valid auc 2, valid loss)) : ')
    file.write('\n')
file.close()

In [10]:
def divide_batch(data, target, task, device):
    task1_data, task1_target = [], []
    task2_data, task2_target = [], []

    for i in range(len(task)):
        if task[i] == 0:
            task1_data.append(data[i].numpy())
            task1_target.append(target[i].numpy())
        elif task[i] == 1:
            task2_data.append(data[i].numpy())
            task2_target.append(target[i].numpy())
        else:
            print('?')
            quit()

    task1_data_array = np.array(task1_data)
    task2_data_array = np.array(task2_data)
    task1_data_tensor = torch.tensor(task1_data_array)
    task2_data_tensor = torch.tensor(task2_data_array)

    task1_target_array = np.array(task1_target)
    task2_target_array = np.array(task2_target)
    task1_target_tensor = torch.tensor(task1_target_array)
    task2_target_tensor = torch.tensor(task2_target_array)

    data1 = task1_data_tensor.to(device)
    data2 = task2_data_tensor.to(device)
    target1 = task1_target_tensor.to(device)
    target2 = task2_target_tensor.to(device)
    
    return data1, data2, target1, target2

In [13]:
for epoch in range(num_epochs):
    if epoch%10 == 0:
        print(epoch, 'th epoch over ', num_epochs)
    for idx, (data, target, task) in enumerate(train_loader):

        data1, data2, target1, target2 = divide_batch(data, target, task, device)

        output1 = model.forward(data1, 0)
        output2 = model.forward(data2, 1)

        # task1 loss
        loss = F.binary_cross_entropy(torch.sigmoid(output1),target1) # BCE
        + beta1*model.net.wConv1.norm() + beta2*model.net.wConv2.norm()  # Base
        + beta3*model.net1.wHidden.norm() + beta4*model.net1.wNeu.norm() # classifier head

        # task2 loss
        loss += F.binary_cross_entropy(torch.sigmoid(output2),target2) 
        + beta1*model.net.wConv1.norm() + beta2*model.net.wConv2.norm() # Base
        + beta3*model.net2.wHidden.norm() + beta4*model.net2.wNeu.norm() # classifier head

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    scheduler.step()
        
    with torch.no_grad():
        # for train set
        model.mode='test'
        auc1 = []
        auc2 = []
        train_loss = []
        for idx, (data, target, task) in enumerate(train_loader):
            data1, data2, target1, target2 = divide_batch(data, target, task, device)

            # Forward pass
            output1 = model.forward(data1, 0)
            output2 = model.forward(data2, 1)

            loss = F.binary_cross_entropy(torch.sigmoid(output1),target1) + beta1*model.net.wConv1.norm() + beta2*model.net.wConv2.norm() + beta3*model.net1.wHidden.norm() + beta4*model.net1.wNeu.norm()
            loss += F.binary_cross_entropy(torch.sigmoid(output2),target2) + beta1*model.net.wConv1.norm() + beta2*model.net.wConv2.norm() + beta3*model.net2.wHidden.norm() + beta4*model.net2.wNeu.norm()
            train_loss.append(loss.cpu())

            pred1_sig = torch.sigmoid(output1)
            pred2_sig = torch.sigmoid(output2)

            pred1 = pred1_sig.cpu().detach().numpy().reshape(output1.shape[0])
            pred2 = pred2_sig.cpu().detach().numpy().reshape(output2.shape[0])

            label1 = target1.cpu().numpy().reshape(output1.shape[0])
            label2 = target2.cpu().numpy().reshape(output2.shape[0])

            try:
                auc1.append(metrics.roc_auc_score(label1, pred1))
                auc2.append(metrics.roc_auc_score(label2, pred2))
            except ValueError:
                pass

        AUC_training_1 = np.mean(auc1)
        AUC_training_2 = np.mean(auc2)
        Loss_train = np.mean(train_loss)

        # for valid set
        model.mode='test'
        auc1 = []
        auc2 = []
        valid_loss = []
        for idx, (data, target, task) in enumerate(valid_loader):
            data1, data2, target1, target2 = divide_batch(data, target, task, device)

            # Forward pass
            output1 = model.forward(data1, 0)
            output2 = model.forward(data2, 1)

            loss = F.binary_cross_entropy(torch.sigmoid(output1),target1) + beta1*model.net.wConv1.norm() + beta2*model.net.wConv2.norm() + beta3*model.net1.wHidden.norm() + beta4*model.net1.wNeu.norm()
            loss += F.binary_cross_entropy(torch.sigmoid(output2),target2) + beta1*model.net.wConv1.norm() + beta2*model.net.wConv2.norm() + beta3*model.net2.wHidden.norm() + beta4*model.net2.wNeu.norm()
            valid_loss.append(loss.cpu())

            pred1_sig=torch.sigmoid(output1)
            pred2_sig=torch.sigmoid(output2)

            pred1 = pred1_sig.cpu().detach().numpy().reshape(output1.shape[0])
            pred2 = pred2_sig.cpu().detach().numpy().reshape(output2.shape[0])

            label1 = target1.cpu().numpy().reshape(output1.shape[0])
            label2 = target2.cpu().numpy().reshape(output2.shape[0])

            try:
                auc1.append(metrics.roc_auc_score(label1, pred1))
                auc2.append(metrics.roc_auc_score(label2, pred2))
            except ValueError:
                pass

        AUC_valid_1 = np.mean(auc1)
        AUC_valid_2 = np.mean(auc2)
        Loss_valid = np.mean(valid_loss)

        with open("./test/"+'2020_4_12'+'.txt', "a") as file:
            file.write('(')
            file.write(str(AUC_training_1))
            file.write(',')
            file.write(str(AUC_training_2))
            file.write(',')
            file.write(str(Loss_train))
            file.write(',')
            file.write(str(AUC_valid_1))
            file.write(',')
            file.write(str(AUC_valid_2))
            file.write(',')
            file.write(str(Loss_valid))
            file.write(')')
            file.write('\n')
        file.close()

        if Loss_valid < loss_best:
            loss_best = Loss_valid
            best_model = model
            state = {'conv1': model.net.wConv1,
                    'rect1':model.net.wRect1,
                    'conv2':model.net.wConv2,
                    'rect2':model.net.wRect2,
                    'wHidden1':model.net1.wHidden,
                    'wHiddenBias1':model.net1.wHiddenBias,
                    'wNeu1':model.net1.wNeu,
                    'wNeuBias1':model.net1.wNeuBias,
                    'wHidden2':model.net2.wHidden,
                    'wHiddenBias2':model.net2.wHiddenBias,
                    'wNeu2':model.net2.wNeu,
                    'wNeuBias2':model.net2.wNeuBias}

            isExist = os.path.exists('./Models/' + '2020_4_12')
            if not isExist:
                os.makedirs('./Models/' + '2020_4_12')

            torch.save(state, './Models/' + '2020_4_12'+ '/' + str(1) + '.pth')

print('Training Completed')

0 th epoch over  150
10 th epoch over  150
20 th epoch over  150
30 th epoch over  150
40 th epoch over  150
50 th epoch over  150
60 th epoch over  150
70 th epoch over  150
80 th epoch over  150
90 th epoch over  150
100 th epoch over  150
110 th epoch over  150
120 th epoch over  150
130 th epoch over  150
140 th epoch over  150
Training Completed


In [7]:
# Testing

print('Model Testing')

test_loader = test_dataset_loader_MT(tf1_test_dataset_path, tf2_test_dataset_path, motif_len)

Model Testing


In [8]:
checkpoint = torch.load('./Models/' + '2020_4_12'+ '/' + str(1) + '.pth')
model = MTL_Model(num_motif_detector, motif_len, pool, 'testing', lr, dropout_rate, device)

model.net.wConv1 = checkpoint['conv1']
model.net.wRect1 = checkpoint['rect1']
model.net.wConv2 = checkpoint['conv2']
model.net.wRect2 = checkpoint['rect2']

model.net1.wHidden = checkpoint['wHidden1']
model.net1.wHiddenBias = checkpoint['wHiddenBias1']
model.net1.wNeu = checkpoint['wNeu1']
model.net1.wNeuBias = checkpoint['wNeuBias1']

model.net2.wHidden = checkpoint['wHidden2']
model.net2.wHiddenBias = checkpoint['wHiddenBias2']
model.net2.wNeu = checkpoint['wNeu2']
model.net2.wNeuBias = checkpoint['wNeuBias2']

In [11]:
for idx, (data, target, task) in enumerate(test_loader):
    data1, data2, target1, target2 = divide_batch(data, target, task, device)
    if idx == 0:
        break

In [15]:
with torch.no_grad():
    test_auc_1 = []
    test_auc_2 = []
    
    for idx, (data, target, task) in enumerate(test_loader):
        data1, data2, target1, target2 = divide_batch(data, target, task, device)

        # Forward pass
        output1 = model.forward(data1, 0)
        output2 = model.forward(data2, 1)

        pred1_sig=torch.sigmoid(output1)
        pred2_sig=torch.sigmoid(output2)

        pred1 = pred1_sig.cpu().detach().numpy().reshape(output1.shape[0])
        pred2 = pred2_sig.cpu().detach().numpy().reshape(output2.shape[0])

        label1 = target1.cpu().numpy().reshape(output1.shape[0])
        label2 = target2.cpu().numpy().reshape(output2.shape[0])

        try:
            test_auc_1.append(metrics.roc_auc_score(label1, pred1))
            test_auc_2.append(metrics.roc_auc_score(label2, pred2))
        except ValueError:
            pass

    AUC_test_1 = np.mean(test_auc_1)
    AUC_test_2 = np.mean(test_auc_2)
    print('AUC on test data = ', AUC_test_1, AUC_test_2)

    with open("./test/"+'2020_4_12'+'.txt', "a") as file:
        file.write('AUC Test 1 : ')
        file.write(str(AUC_test_1))
        file.write(", ")
        file.write('AUC Test 2 : ')
        file.write(str(AUC_test_1))
        file.write('\n')
    file.close()

print('Testing Completed')

AUC on test data =  0.5652440000000001 0.571552
Testing Completed
