In [1]:
import os 
import random
import json
import shutil
import glob
import itertools 

import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.tensorboard as tb
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter

In [11]:
class DimaDataset(Dataset):
    def __init__(self,
                 genome_file,
                 y_file,
                 window = 501, 
                 n_sequences = 10000
                ):
        self.genome_file = genome_file
        self.y_file = y_file
        self.window = window
        self.n_sequences = n_sequences
        self.order = np.random.permutation(n_sequences)
        if window is None:
            window = 501
    def __len__(self):
        return self.n_sequences
    def __getitem__(self, idx):
        idx = self.order[idx]
        genome = np.memmap(self.genome_file, 
                           dtype='float32', mode='r', 
                           shape=(4,self.n_sequences*self.window)
                          )
        encoseq = np.zeros((4, self.window), dtype='float32')
        
        encoseq[:] = genome[:4,(self.window*idx):self.window*(idx+1)]
        ys = np.memmap(self.y_file, 
                           dtype='float32', mode='r', 
                           shape=(self.n_sequences)
                          )
        signal = float(ys[idx])
        return torch.Tensor(encoseq).view(4, -1), torch.FloatTensor([signal])





In [16]:
"""
class BeerDataset(Dataset):
    def __init__(self,
                 genome_file,
                 y_file,
                 window = 500, 
                 n_sequences = 10000
                ):
        self.genome_file = genome_file
        self.y_file = y_file
        self.window = window
        self.n_sequences = n_sequences
        self.order = np.random.permutation(n_sequences)
        if window is None:
            window = 500
    def __len__(self):
        return self.n_sequences
    def __getitem__(self, idx):
        idx = self.order[idx]
        genome = np.memmap(self.genome_file, 
                           dtype='float32', mode='r', 
                           shape=(4,self.n_sequences*self.window)
                          )
        encoseq = np.zeros((4, self.window), dtype='float32')
        
        encoseq[:] = genome[:4,(self.window*idx):self.window*(idx+1)]
        ys = np.memmap(self.y_file, 
                           dtype='float32', mode='r', 
                           shape=(self.n_sequences)
                          )
        signal = float(ys[idx])
        return torch.Tensor(encoseq).view(4, -1), torch.FloatTensor([signal])
"""

"\nclass BeerDataset(Dataset):\n    def __init__(self,\n                 genome_file,\n                 y_file,\n                 window = 500, \n                 n_sequences = 10000\n                ):\n        self.genome_file = genome_file\n        self.y_file = y_file\n        self.window = window\n        self.n_sequences = n_sequences\n        self.order = np.random.permutation(n_sequences)\n        if window is None:\n            window = 500\n    def __len__(self):\n        return self.n_sequences\n    def __getitem__(self, idx):\n        idx = self.order[idx]\n        genome = np.memmap(self.genome_file, \n                           dtype='float32', mode='r', \n                           shape=(4,self.n_sequences*self.window)\n                          )\n        encoseq = np.zeros((4, self.window), dtype='float32')\n        \n        encoseq[:] = genome[:4,(self.window*idx):self.window*(idx+1)]\n        ys = np.memmap(self.y_file, \n                           dtype='float32',

In [12]:
class Nocturn(nn.Module):
    def __init__(self):
        super().__init__()
        self.Conv1 = nn.Conv1d(in_channels=4, out_channels=16, kernel_size=8)
        self.Conv2 = nn.Conv1d(in_channels=16, out_channels=64, kernel_size=8)
        self.Conv3 = nn.Conv1d(in_channels=64, out_channels=960, kernel_size=8)
        self.Maxpool = nn.MaxPool1d(kernel_size=4, stride=4)
        self.Drop2 = nn.Dropout(p=0.5)
        self.Linear1 = nn.Linear(21120, 32)
        self.Drop1 = nn.Dropout(p=0.2)
        self.Linear2 = nn.Linear(32, 1)
        
    def forward(self, x):
        res = self.Conv1(x)
        res = F.leaky_relu(res)
        res = self.Maxpool(res)
        res = self.Drop1(res)
        res = self.Conv2(res)
        res = F.leaky_relu(res)
        res = self.Drop1(res)
        res = self.Maxpool(res)
        res = self.Conv3(res)
        res = F.leaky_relu(res)
        res = self.Drop2(res)
        res = res.view(res.shape[0], -1)
        res = self.Linear1(res)
        res = F.leaky_relu(res)
        res = self.Linear2(res)
        res = torch.sigmoid(res)
        res = res.view(-1)
        return res

In [27]:
learning_rate = 0.0001
#base_lr = max_lr / 6

In [28]:
MODEL_TAG="nocturn_final"#"model_class_4"
drpr = "NEWpreprocessed_HepG2_dataset/"
drys = "NEWys_HepG2_dataset/"

np.random.seed(777)
batch_size = 32
device = torch.device("cuda")
model = Nocturn()
model = model.to(device)
loss_fn = nn.BCELoss()
optimizer = optim.RMSprop(params=model.parameters(), lr=learning_rate, weight_decay=0.0005)
b = DimaDataset(genome_file = drpr + \
        '34959preprocessed_HepG2_positive.dat', 
        y_file = drys + '34959ys_HepG2_positive.dat', n_sequences = 34959)
dl = DataLoader(b, batch_size=32, num_workers=4, shuffle=False)

bn = DimaDataset(genome_file =  drpr + \
       '187127preprocessed_HepG2_negative.dat', 
        y_file = drys + '187127ys_HepG2_negative.dat', n_sequences = 187127)
dln = DataLoader(bn, batch_size=32, num_workers=4, shuffle=False)
DIR_PATH = 'models_tb'

In [29]:
train_end = min(len(dl), len(dln)) * 4 // 5

#scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.70)

In [30]:
del scheduler

NameError: name 'scheduler' is not defined

In [31]:
VERSION = 1

In [32]:
sm = SummaryWriter(os.path.join(DIR_PATH, f"{MODEL_TAG}_{VERSION}"))

In [33]:
GLOBAL_STEP = 0

In [34]:

print(train_end)

874


In [35]:
model_dir = os.path.join('models', MODEL_TAG + str(VERSION))
os.mkdir(model_dir)

In [None]:
for epoch in range(1, 100):
    it = iter(enumerate(zip(dl, dln)))
    for i, ((px, _), (nx, _)) in it:
        py = torch.ones(px.shape[0])
        ny = torch.zeros(nx.shape[0])
        py = py.view(ny.shape[0])
        ny = ny.view(ny.shape[0])

        x = torch.cat([px, nx]).to(device)
        y = torch.cat([py, ny]).to(device)
        optimizer.zero_grad()
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        loss.backward()

        sm.add_scalar('train', loss.item(), GLOBAL_STEP)
        optimizer.step()
        
        
        GLOBAL_STEP += 1
        #scheduler.step()

        if i == train_end:
            break  
    total_loss = 0.00
    with torch.no_grad():
        correct = 0
        n = 0
        for i, ((px, _), (nx, _)) in it:
            py = torch.ones(px.shape[0])
            ny = torch.zeros(nx.shape[0])
            py = py.view(py.shape[0])
            ny = ny.view(ny.shape[0])
            x = torch.cat([px, nx]).to(device)
            y = torch.cat([py, ny]).to(device)
            y_pred = model(x)
            y_round = y_pred.round()
            correct += y_round.eq(y.data.view_as(y)).cpu().sum().item()
            n += y_pred.shape[0]
            loss = loss_fn(y_pred,y)
            total_loss += loss.item()

    sm.add_scalar ('accuracy', correct/n, GLOBAL_STEP)
    total_loss /= (i - train_end)
    sm.add_scalar('test', total_loss, GLOBAL_STEP)


    if epoch % 5 == 0:
        path = os.path.join(model_dir, f"model_{epoch}")
        torch.save(model.state_dict(), path)

        path = os.path.join(model_dir, f"optimizer_{epoch}")
        torch.save(optimizer.state_dict(), path)