In [1]:
import pandas as pd
import numpy as np
import torch
import torchvision
import matplotlib.pyplot as plt

In [2]:
ls datasets

41598_2019_46420_MOESM2_ESM.xlsx   newdb_all_hxb2.fasta
aminoacids_oneletter_code.csv      newdb_ccr5.fasta
dataset_chen.csv                   newdb_ccr5.tsv
dataset_chen.fasta                 newdb_cxcr4.fasta
muscle_aligner_test_aligned.fasta  newdb_cxcr4.tsv
muscle_aligner_test.fasta          newdb_dualtropic.fasta
newdb_aligned_all_labels.tsv       newdb_dualtropic.tsv
newdb_aligned.csv                  newdb_wrangled.tsv
newdb_aligned_muscle.fasta         [0m[01;34msrep21280[0m/
newdb_all_hxb2_aligned_2.fasta     teste_1seq_to_all_aligned.fasta
newdb_all_hxb2_aligned.fasta       teste_1seq_to_all.fasta


In [136]:
df = pd.read_csv('datasets/newdb_and_hivcopred_aligned.tsv', sep='\t', names=['name', 'label','sequence'])

In [137]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5333 entries, 0 to 5332
Data columns (total 3 columns):
name        5333 non-null object
label       5333 non-null object
sequence    5333 non-null object
dtypes: object(3)
memory usage: 125.1+ KB


In [138]:
df.head(10)

Unnamed: 0,name,label,sequence
0,RKF859742,CCR5,CERPTMD-I--QDI---H--I--GP----M--A-WY---S-TYIER...
1,RAM262120,CCR5,CVRPGNN-SV-QEM---R--V--GP----M--A-WY-----S-MEL...
2,RAM262127,CCR5,CVRPGDN-SV-KEM---R--A--GP----M--A-WY-----S-MEL...
3,RAM262126,CCR5,CVRPGNN-SV-KEM---R--V--GP----M--A-LY-----S-MEL...
4,RAM262125,CCR5,CVRPGNN-TV-KEM---R--V--GP----M--A-WY-----S-MEL...
5,RAM262114,CCR5,CVRPGSN-SV-QEI---K--I--GP----M--A-WY-----S-MQL...
6,RX84327,CCR5,CVRPGNN-SV-QEI---K--I--GP----M--A-WY-----S-MQI...
7,RX84327,CCR5,CVRPGNN-SV-QEI---K--I--GP----M--A-WY-----S-MQI...
8,RU24566,CCR5,CHRPGNL-SV-QEM---K--I--GP----L--S-WY-----SMGLA...
9,RU24566,CCR5,CHRPGNL-SV-QEM---K--I--GP----L--S-WY-----SMGLA...


In [146]:
df.drop_duplicates(subset='sequence', keep='first', inplace=True)

In [147]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3239 entries, 0 to 5331
Data columns (total 4 columns):
name             3239 non-null object
label            3239 non-null object
sequence         3239 non-null object
label_numeric    3239 non-null int64
dtypes: int64(1), object(3)
memory usage: 126.5+ KB


In [148]:
seq = df.loc[0, 'sequence']

In [149]:
# Function to call labels
def tropism_label(row):
    # For CCR5
    if row.label == 'CCR5':
        return 0
    # For CXCR4
    elif row.label == 'CXCR4':
        return 1
    # For R5X4
    elif row.label == 'R5X4':
        return 1

In [150]:
df['label_numeric'] = df.apply(tropism_label, axis=1)

In [151]:
df.label.value_counts()

CCR5     2490
R5X4      465
CXCR4     284
Name: label, dtype: int64

In [152]:
df.label_numeric.value_counts()

0    2490
1     749
Name: label_numeric, dtype: int64

In [153]:
df.shape

(3239, 4)

In [154]:
df

Unnamed: 0,name,label,sequence,label_numeric
0,RKF859742,CCR5,CERPTMD-I--QDI---H--I--GP----M--A-WY---S-TYIER...,0
1,RAM262120,CCR5,CVRPGNN-SV-QEM---R--V--GP----M--A-WY-----S-MEL...,0
2,RAM262127,CCR5,CVRPGDN-SV-KEM---R--A--GP----M--A-WY-----S-MEL...,0
3,RAM262126,CCR5,CVRPGNN-SV-KEM---R--V--GP----M--A-LY-----S-MEL...,0
4,RAM262125,CCR5,CVRPGNN-TV-KEM---R--V--GP----M--A-WY-----S-MEL...,0
5,RAM262114,CCR5,CVRPGSN-SV-QEI---K--I--GP----M--A-WY-----S-MQL...,0
6,RX84327,CCR5,CVRPGNN-SV-QEI---K--I--GP----M--A-WY-----S-MQI...,0
8,RU24566,CCR5,CHRPGNL-SV-QEM---K--I--GP----L--S-WY-----SMGLA...,0
10,RX96522,CCR5,CERPGNH-TV-QEI---R--I--GP----L--A-WY-----SMGIE...,0
12,RAF009608,CCR5,CSRPE-M-DV-QEI---R--N--GP----M--A-WYSMALAKGGTT...,0


In [155]:
# Check of len of sequence is the same for all rows
set(df['sequence'].apply(len))

{60}

In [157]:
# Save the Newdb processed dataset into TSV
df.to_csv('newdb_wrangled.tsv', sep='\t')

## Converting Protein Sequence to Vectors

In [158]:
df_aa = pd.read_csv('datasets/aminoacids_oneletter_code.csv', sep='\t')

In [159]:
df_aa

Unnamed: 0,Pos_array,3-letters-code,1-letter-code,Aminoacid
0,1.0,Ala,A,Alanine
1,2.0,Asn,N,Asparagine
2,3.0,Asp,D,Aspartic acid
3,4.0,Cys,C,Cysteine
4,5.0,Gln,Q,Glutamine
5,6.0,Glu,E,Glutamic acid
6,7.0,Gly,G,Glycine
7,8.0,His,H,Histidine
8,9.0,Ile,I,Isoleucine
9,10.0,Leu,L,Leucine


In [160]:
df_aa['1-letter-code'].to_list()

['A',
 'N',
 'D',
 'C',
 'Q',
 'E',
 'G',
 'H',
 'I',
 'L',
 'K',
 'M',
 'F',
 'P',
 'O',
 'S',
 'U',
 'T',
 'W',
 'Y',
 'V',
 'B',
 'Z',
 'X',
 'J',
 '-']

In [197]:
def get_array_from_sequence(seq):
    dict_aa_pos = {
    'A':1, 'R':2, 'N':3, 'D':4, 'C':5, 'Q':6, 'E':7, 'G':8,
    'H':9, 'I':10, 'L':11, 'K':12, 'M':13, 'F':14, 'P':15, 
    'O':16, 'S':17, 'U':18, 'T':19, 'W':20, 'Y':21, 'V':22, 
    'B':23, 'Z':24, 'J':25, 'X':0}
    
    f_array = np.zeros(26)
    for aa in seq:
        arr = np.zeros(26)
        if dict_aa_pos.get(aa):
            arr[dict_aa_pos.get(aa)] = 1
        f_array = np.vstack((f_array, arr))
    f_array = np.delete(f_array, 0,0)
    
    return torch.from_numpy((f_array.flatten()).astype(float))
    

In [198]:
ex = get_array_from_sequence('CSRP-GNN-TR-TSI---PI--GP-GR--A-WF---AT--G----D--V-TGDPRKAHC')

In [199]:
sum(ex)

tensor(35., dtype=torch.float64)

In [301]:
# Create list to append data from the df
list_data = []
list_labels = []

# Convert the sequences and labels to arrays to use as data on pytorch
for index, row in df.iterrows():
    list_data.append(get_array_from_sequence(str(row.sequence)))
    list_labels.append(int(row.label_numeric))
    

In [302]:
list_data[0]

tensor([0., 0., 0.,  ..., 0., 0., 0.], dtype=torch.float64)

In [303]:
list_data[0].shape

torch.Size([1560])

In [304]:
len(list_labels)

3239

In [305]:
batch_size = 32
validation_split = .5
shuffle_dataset = True
random_seed= 5

# Creating data indices for training and validation splits:
dataset_size = len(list_data)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))

In [306]:
split

1619

In [307]:
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

In [308]:
len(val_indices)

1619

In [309]:
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

# To transform list_data and list_labels on trainloader

# For training
train_data = []
for i in train_indices:
    train_data.append([list_data[i], list_labels[i]])

train_loader = torch.utils.data.DataLoader(train_data, shuffle=True, batch_size=32)

# For validation
test_data = []
for j in val_indices:
    test_data.append([list_data[j], list_labels[j]])

test_loader = torch.utils.data.DataLoader(test_data, shuffle=True, batch_size=32)


In [278]:
import torch.nn as nn
import torch.nn.functional as F

# Define the class Net
class Net(nn.Module):
    def __init__(self):    
        # Define all the parameters of the net
        super(Net, self).__init__()
        self.fc1 = nn.Linear(1560, 200)
        self.fc2 = nn.Linear(200, 2)

    def forward(self, x):   
        # Do the forward pass
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
model = Net()
model = model.float()

In [None]:
## Creating e new Neural Network based on LeNet

In [117]:
class AminoNet(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        #self.conv1 = nn.Conv2d(3, 6, 5)
        #self.pool = nn.MaxPool2d(2, 2)
        #self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(1534, 900)
        self.fc2 = nn.Linear(300, 100)
        self.fc3 = nn.Linear(100, 2)
        self.conv1 = nn.Conv1d(900, 300, 78, 26)

    def forward(self, x):
        #x = self.pool(F.relu(self.conv1(x)))
        #x = self.pool(F.relu(self.conv2(x)))
        #x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.conv1(self.fc1(x)))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

model = LeNet()
model = model.float()

In [113]:
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()

        # input channel = 1, output channel = 6, kernel_size = 5
        # input size = (32, 32), output size = (28, 28)
        self.conv1 = nn.Conv1d(1, 6, 3)
        # input channel = 6, output channel = 16, kernel_size = 5
        # input size = (14, 14), output size = (10, 10)
        self.conv2 = nn.Conv1d(6, 16, 3)
        # input dim = 16*5*5, output dim = 120
        self.fc1 = nn.Linear(1534, 120)
        # input dim = 120, output dim = 84
        self.fc2 = nn.Linear(120, 84)
        # input dim = 84, output dim = 10
        self.fc3 = nn.Linear(84, 2)

    def forward(self, x):
        # pool size = 2
        # input size = (28, 28), output size = (14, 14), output channel = 6
        x = F.max_pool2d(F.relu(self.conv1(x)), 1)
        # pool size = 2
        # input size = (10, 10), output size = (5, 5), output channel = 16
        x = F.max_pool2d(F.relu(self.conv2(x)), 1)
        # flatten as one dimension
        x = x.view(x.size()[0], -1)
        # input dim = 16*5*5, output dim = 120
        x = F.relu(self.fc1(x))
        # input dim = 120, output dim = 84
        x = F.relu(self.fc2(x))
        # input dim = 84, output dim = 10
        x = self.fc3(x)
        return x

model = LeNet()
model = model.float()

In [228]:
class Net(nn.Module):
    
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=-1)

model = Net()
model = model.float()

In [233]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        #self.conv1 = nn.Conv2d(3, 6, 5)
        #self.pool = nn.MaxPool2d(2, 2)
        #self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(1560, 767)
        self.fc2 = nn.Linear(767, 300)
        self.fc3 = nn.Linear(300, 600)
        self.fc4 = nn.Linear(600, 200)
        self.fc5 = nn.Linear(200, 100)
        self.fc6 = nn.Linear(100, 2)

    def forward(self, x):
        #x = self.pool(F.relu(self.conv1(x)))
        #x = self.pool(F.relu(self.conv2(x)))
        #x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = self.fc6(x)
        return x


model = Net()
model = model.float()

In [310]:
model

Net(
  (fc1): Linear(in_features=1560, out_features=200, bias=True)
  (fc2): Linear(in_features=200, out_features=2, bias=True)
)

In [311]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [312]:
for epoch in range(500):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs.float())
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        #print(running_loss)
        if i % 1000 == 0:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 50))
            running_loss = 0.0

print('Finished Training')

[1,     1] loss: 0.034
[2,     1] loss: 0.006
[3,     1] loss: 0.007
[4,     1] loss: 0.004
[5,     1] loss: 0.007
[6,     1] loss: 0.009
[7,     1] loss: 0.004
[8,     1] loss: 0.004
[9,     1] loss: 0.008
[10,     1] loss: 0.002
[11,     1] loss: 0.002
[12,     1] loss: 0.004
[13,     1] loss: 0.009
[14,     1] loss: 0.003
[15,     1] loss: 0.004
[16,     1] loss: 0.003
[17,     1] loss: 0.007
[18,     1] loss: 0.004
[19,     1] loss: 0.003
[20,     1] loss: 0.004
[21,     1] loss: 0.007
[22,     1] loss: 0.008
[23,     1] loss: 0.005
[24,     1] loss: 0.003
[25,     1] loss: 0.005
[26,     1] loss: 0.009
[27,     1] loss: 0.002
[28,     1] loss: 0.007
[29,     1] loss: 0.009
[30,     1] loss: 0.005
[31,     1] loss: 0.003
[32,     1] loss: 0.008
[33,     1] loss: 0.005
[34,     1] loss: 0.007
[35,     1] loss: 0.004
[36,     1] loss: 0.006
[37,     1] loss: 0.001
[38,     1] loss: 0.004
[39,     1] loss: 0.003
[40,     1] loss: 0.007
[41,     1] loss: 0.003
[42,     1] loss: 0.004
[

In [313]:
correct = 0
total = 0
with torch.no_grad():
    for data in test_loader:
        images, labels = data
        outputs = model(images.float())
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

Accuracy of the network on the 10000 test images: 87 %


In [299]:
correct

1295

In [300]:
total

1339

# Processing CM dataset

In [164]:
df_cm = pd.read_csv('datasets/cm_aligned.tsv', sep='\t', names=['name','sequence'])

In [167]:
df_cm.head(10)

Unnamed: 0,name,sequence
0,1432.KF859742.O.CCR5,C-ERPTMD--I-QD---I-H--IGP----MA-WY--STYIER-QAK...
1,MD47.KF859744.O.CCR5,C-QRPGHQ-EI-QE---I-K--TGP-----LAWY--SMYLKE-NTT...
2,BCF02.U24562.O.CCR5,C-QRPGHQ-TV-QE---I-R--IGP-----MAWY--SM---G-LAA...
3,152.KF859743.O.CCR5,C-RRPAMK--V-QE---M-R--IGP----MA-WY--SMALEE-GNN...
4,DUR.X84327.O.CCR5,C-VRPGNN-SV-QE---I-K--IGP-----MAWY--SM-----QIE...
5,DUR.AM262121.O.CCR5,C-VRPGNN-SV-QE---M-R--VGP--M--A-WY--SM-----ELE...
6,DUR.AM262130.O.CCR5,C-VRPGNN-SV-KE---M-R--VGP--M--A-LY--SM-----ELE...
7,DUR.AM262127.O.CCR5,C-VRPGDN-SV-KE---M-R--AGP-----MAWY--SM-----ELE...
8,CA9.X96522.O.CCR5,C-ERPGNH-TV-QE---I-R--IGP-LA----WY--SM---G-IEK...
9,BCF01.U24566.O.CCR5,C-HRPGNL-SV-QE---M-K--IGP--LS---WY--SM---G-LAA...


In [185]:
def get_label(row):
    if 'CCR5' in row['name'] and 'CXCR4' in row['name']:
        return 'R5X4'
    elif 'CCR5' in row['name']:
        return 'CCR5'
    elif 'CXCR4' in row['name']:
        return 'CXCR4'

In [186]:
df_cm.name

0             1432.KF859742.O.CCR5
1             MD47.KF859744.O.CCR5
2              BCF02.U24562.O.CCR5
3              152.KF859743.O.CCR5
4                DUR.X84327.O.CCR5
                   ...            
2674    H13988_DS2.JF508074.B.CCR5
2675    H13988_DS2.JF508043.B.CCR5
2676            39.AF022258.B.CCR5
2677           122.DQ002264.B.CCR5
2678          Pat1.AF541016.B.CCR5
Name: name, Length: 2679, dtype: object

In [187]:
df.loc[0]['name']

'RKF859742'

In [193]:
df_cm['label'] = df_cm.apply(get_label, axis=1)

In [194]:
df_cm['label_numeric'] = df_cm.apply(tropism_label, axis=1)

In [196]:
df_cm.head(20)

Unnamed: 0,name,sequence,label,label_numeric
0,1432.KF859742.O.CCR5,C-ERPTMD--I-QD---I-H--IGP----MA-WY--STYIER-QAK...,CCR5,0
1,MD47.KF859744.O.CCR5,C-QRPGHQ-EI-QE---I-K--TGP-----LAWY--SMYLKE-NTT...,CCR5,0
2,BCF02.U24562.O.CCR5,C-QRPGHQ-TV-QE---I-R--IGP-----MAWY--SM---G-LAA...,CCR5,0
3,152.KF859743.O.CCR5,C-RRPAMK--V-QE---M-R--IGP----MA-WY--SMALEE-GNN...,CCR5,0
4,DUR.X84327.O.CCR5,C-VRPGNN-SV-QE---I-K--IGP-----MAWY--SM-----QIE...,CCR5,0
5,DUR.AM262121.O.CCR5,C-VRPGNN-SV-QE---M-R--VGP--M--A-WY--SM-----ELE...,CCR5,0
6,DUR.AM262130.O.CCR5,C-VRPGNN-SV-KE---M-R--VGP--M--A-LY--SM-----ELE...,CCR5,0
7,DUR.AM262127.O.CCR5,C-VRPGDN-SV-KE---M-R--AGP-----MAWY--SM-----ELE...,CCR5,0
8,CA9.X96522.O.CCR5,C-ERPGNH-TV-QE---I-R--IGP-LA----WY--SM---G-IEK...,CCR5,0
9,BCF01.U24566.O.CCR5,C-HRPGNL-SV-QE---M-K--IGP--LS---WY--SM---G-LAA...,CCR5,0
