In [1]:
import numpy as np
import pandas as pd
import torch

In [2]:
DATA_PATH = "./data/PSI_Biology_solubility_trainset.csv"

SEQ_CLIP_LEN = 20

In [3]:
data = pd.read_csv(DATA_PATH)

In [4]:
y = torch.tensor(data.solubility.values)

y

tensor([1, 1, 1,  ..., 1, 1, 0])

In [5]:
x = data.fasta.apply(lambda x: x[:SEQ_CLIP_LEN]).to_numpy(str)

x

array(['MTYKDGTYSSDGTYTSPNGL', 'MTAMNILVLGSDSRGSSDAD',
       'MKAEGNTAMNILVLGSDSRG', ..., 'MDLFPDEIYVFTPEGRIVEL',
       'MVEQEQEAITFEVVAREWHA', 'MELGLRTYSARLLGSNPKLV'], dtype='<U20')

In [1]:
def one_hot(seq):
    chars = ['A', 'C', 'D', 'E',
             'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

    mapping = dict(zip(chars, range(len(chars))))
    seq2 = [mapping[i] for i in seq]
    return np.eye(len(chars))[seq2].flatten()


def one_hot_encode(arr):
    encoding = None
    for seq in arr:
        if encoding is None:
            encoding = one_hot(seq)
        else:
            encoding = np.vstack([encoding, one_hot(seq)])
    return encoding

In [7]:
x = torch.tensor(one_hot_encode(x))

x

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.]], dtype=torch.float64)

In [3]:
from src import model_config
import torch.nn as nn

In [23]:
model = nn.Sequential(
    *sum([
        [
            nn.Conv1d(*sizes), 
            nn.SiLU()
        ] for sizes in model_config['cnn']], []),
    nn.Linear(*model_config["clf"])
)

In [25]:
sum([[nn.Conv1d(*sizes), nn.SiLU()] for sizes in model_config['cnn']], [])

[Conv1d(20, 16, kernel_size=(5,), stride=(1,)),
 SiLU(),
 Conv1d(16, 8, kernel_size=(5,), stride=(1,)),
 SiLU(),
 Conv1d(8, 8, kernel_size=(5,), stride=(1,)),
 SiLU()]

In [10]:
import torch
import torch.nn.functional as F

DATA_PATH = "./data/PSI_Biology_solubility_trainset.csv"

SEQ_CLIP_LEN = 500

from src.data import load_data

y, x = load_data(DATA_PATH, SEQ_CLIP_LEN)

In [11]:
y, y.shape

(array([1, 1, 1, ..., 1, 1, 0], dtype=int64), (11226,))

In [13]:
x = F.one_hot(torch.tensor(x).to(torch.int64), num_classes=21)

x

  x = F.one_hot(torch.tensor(x).to(torch.int64), num_classes=21)


tensor([[[[1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0],
          ...,
          [1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0]],

         [[1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0],
          ...,
          [1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0]],

         [[1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0],
          ...,
          [1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0],
          [0, 1, 0,  ..., 0, 0, 0]],

         ...,

         [[0, 1, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0],
          ...,
          [1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0],
          [1, 0, 0,  ..., 0, 0, 0]],

         [[0, 1, 0,  ..., 0, 0, 0],
          [1, 0, 

In [15]:
x.size()

torch.Size([11226, 500, 21, 21])