In [2]:
DATA_PATH = 'deepsea_train/'

### Data exploration

In [3]:
## VALIDATION DATA ###
valid_data_raw = scipy.io.loadmat(DATA_PATH+'valid.mat')
x_valid = torch.FloatTensor(valid_data_raw['validxdata'])
y_valid = torch.FloatTensor(valid_data_raw['validdata'])

Each training sample consists of a 1,000-bp sequence from the human GRCh37 reference genome centered on each 200-bp bin and is paired with a label vector for 919 chromatin features. 

The 1,000-bp DNA sequence is represented by a 1,000 × 4 binary matrix, with columns corresponding to A, G, C and T.

In [None]:
x_train.shape

In [7]:
x_train[0].shape

(4, 4400000)

In [9]:
y_train[0].shape

(4400000,)

# Deep Sea

In [4]:
nfeats = 4
height = 1
nkernels = [320,480,960]
dropouts = [0.2,0.5]

In [5]:
class deep_sea_nn(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=nfeats,      out_channels=nkernels[0], kernel_size=8)
        self.conv2 = nn.Conv1d(in_channels=nkernels[0], out_channels=nkernels[1], kernel_size=8)
        self.conv3 = nn.Conv1d(in_channels=nkernels[1], out_channels=nkernels[2], kernel_size=8)
        self.maxpool = nn.MaxPool1d(kernel_size=4, stride=4)
        self.drop1 = nn.Dropout(p=dropouts[0])
        self.drop2 = nn.Dropout(p=dropouts[1])
        self.linear1 = nn.Linear(53*960, 925)
        self.linear2 = nn.Linear(925, 919)
    
    def foward(self, input):
        ## convolution 1 ##
        ds = self.conv1(input)
        ds = F.relu(ds)
        ds = self.maxpool(ds)
        ds = self.drop1(ds)
        
        ## convolution 2 ##
        ds = self.conv2(ds)
        ds = F.relu(ds)
        ds = self.maxpool(ds)
        ds = self.drop1(ds)
        
        ## convolution 3 ##
        ds = self.conv3(ds)
        ds = F.relu(ds)
        ds = self.drop2(ds)
        
        ds = ds.view(-1, 53*960)
        ds = self.Linear1(ds)
        ds = F.relu(x)
        ds = self.Linear2(ds)
        
        return ds
        

In [6]:
## HYPERPARMETERS
params = {'batch_size': 100,'num_workers': 2}
device = 'cuda'

learning_rate = 0.01
batch_size = 100
epochs = 10

In [7]:
deep_sea = deep_sea_nn()
#deep_sea.to(device)
print(deep_sea)

optimizer = optim.SGD(deep_sea.parameters(), lr=learning_rate,momentum=0.9)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5,verbose=1)
loss_func = nn.BCEWithLogitsLoss()

deep_sea_nn(
  (conv1): Conv1d(4, 320, kernel_size=(8,), stride=(1,))
  (conv2): Conv1d(320, 480, kernel_size=(8,), stride=(1,))
  (conv3): Conv1d(480, 960, kernel_size=(8,), stride=(1,))
  (maxpool): MaxPool1d(kernel_size=4, stride=4, padding=0, dilation=1, ceil_mode=False)
  (drop1): Dropout(p=0.2)
  (drop2): Dropout(p=0.5)
  (linear1): Linear(in_features=50880, out_features=925, bias=True)
  (linear2): Linear(in_features=925, out_features=919, bias=True)
)


In [8]:
valid_loader = Data.DataLoader(dataset=Data.TensorDataset(x_valid, y_valid), shuffle=False,**params)

In [10]:
a = np.load(DATA_PATH+"x_train_part_0.npy")

In [11]:
a.shape

(1, 4, 4400000)

In [9]:
train_losses, valid_losses = [], []
running_loss = 0
running_val_loss = 0

for epoch in range(epochs):
    running_loss = 0
    deep_sea.train()
    ### for each part of the training set ###
    for i in range(0,10):
        x_train = torch.FloatTensor(np.load(DATA_PATH+"x_train_part_{}.npy".format(i)))
        y_train = torch.FloatTensor(np.load(DATA_PATH+"y_train_part_{}.npy".format(i)))
        
        train_loader = Data.DataLoader(dataset=Data.TensorDataset(x_train, y_train), shuffle=True, **params)
        
        for i, (inputs, labels) in enumerate(train_loader):
            #x, y = inputs.to(device), labels.to(device)
            x, y = inputs, labels
            
            optimizer.zero_grad()
            
            out = deep_sea.foward(x)
            loss = loss_func(out.to(device), y)
            loss.backward()
            optimizer.step()
            
            running_loss = loss.item()
            train_losses.append(running_loss)
            
    ## Validation ##
    for i, (inputs, labels) in enumerate(valid_loader):
        model.eval()
        with torch.no_grad():
            x, y = inputs.to(device), labels.to(device)

            val_out = deep_sea.foward(x)
            val_loss = loss_func(val_out, y)

            running_val_loss = val_loss.item()
            valid_losses(running_val_loss)

    print(f"Epoch {epoch+1}/{epochs}.. "
          f"Train loss: {train_losses}.. "
          f"Validation loss: {valid_losses}.. ")
            
            

RuntimeError: shape '[-1, 50880]' is invalid for input of size 263990400