<a href="https://colab.research.google.com/github/2019mohamed/DNA-and-NLP/blob/main/one_hot_vector_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler , BatchSampler
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim


class MLP(nn.Module):
    """Feed-forward network with ReLU hidden activations and a linear output.

    With a single layer the module degenerates to one ``nn.Linear``.
    """

    def __init__(self, num_layers, input_dim, hidden_dim, output_dim):
        """Build the stack of linear layers.

        Parameters
        ----------
        num_layers: int
            Number of linear layers (must be >= 1).
        input_dim: int
            Dimensionality of the input features.
        hidden_dim: int
            Dimensionality of every hidden layer.
        output_dim: int
            Number of output units (classes for prediction).

        Raises
        ------
        ValueError
            If ``num_layers`` is smaller than 1.
        """
        super().__init__()
        self.num_layers = num_layers
        self.output_dim = output_dim

        if num_layers < 1:
            raise ValueError("number of layers should be positive!")

        # True when the whole model is a single linear map.
        self.linear_or_not = num_layers == 1

        if self.linear_or_not:
            self.linear = nn.Linear(input_dim, output_dim)
            return

        # Layer widths from input to output: in -> hidden ... hidden -> out.
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.linears = torch.nn.ModuleList(
            [nn.Linear(fan_in, fan_out)
             for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )
        # One batch-norm module per hidden layer is registered here, but
        # forward() does not apply them.
        self.batch_norms = torch.nn.ModuleList(
            [nn.BatchNorm1d(hidden_dim) for _ in range(num_layers - 1)]
        )

    def forward(self, x):
        """Return the network output for ``x`` (no activation on the output)."""
        if self.linear_or_not:
            return self.linear(x)

        *hidden_layers, output_layer = self.linears
        activations = x
        for hidden_layer in hidden_layers:
            activations = F.relu(hidden_layer(activations))
        return output_layer(activations)

class SeqData(Dataset):
    """Dataset of one-hot encoded DNA sequences with binary class labels.

    The CSV file must provide a ``Sequence`` column (strings over the
    alphabet a/c/g/t, any case, possibly containing tab characters) and a
    ``Class`` column in which ``'+'`` marks class 0 and anything else
    class 1.
    """

    def __init__(self, csv_path='promoters.csv'):
        """Load sequences and labels from ``csv_path``.

        Parameters
        ----------
        csv_path: str
            Location of the CSV file; defaults to ``'promoters.csv'`` in the
            current working directory.
        """
        data = pd.read_csv(csv_path)
        # Remove every tab as a literal character. ``regex=False`` is passed
        # explicitly because the pandas default for ``regex`` differs
        # between versions.
        sequences = data['Sequence'].str.replace('\t', '', regex=False)
        self.seqs = list(sequences)
        # Every item is padded to the longest sequence (0 for an empty file).
        self.maxlen = max((len(seq) for seq in self.seqs), default=0)
        self.labels = list(data['Class'])
        # Column index of each nucleotide in the one-hot encoding.
        self.map = {'a': 0, 'c': 1, 'g': 2, 't': 3}

    def __len__(self):
        """Return the number of sequences."""
        return len(self.seqs)

    def __getitem__(self, index):
        """Return ``(x, label)`` for the sequence at ``index``.

        ``x`` is a ``(maxlen, 4)`` NumPy array with a single 1 per occupied
        position; rows beyond the sequence length stay all-zero. ``label``
        is 0 for class ``'+'`` and 1 otherwise.

        Raises
        ------
        KeyError
            If the sequence contains a character other than a/c/g/t.
        """
        x = np.zeros((self.maxlen, len(self.map)))
        for position, base in enumerate(self.seqs[index].lower()):
            x[position, self.map[base]] = 1
        label = 0 if self.labels[index] == '+' else 1
        return x, label
    
    
def train(net, trainloader, optimizer, criterion):
    """Run one optimisation pass over ``trainloader``.

    Each batch is a ``(inputs, targets)`` pair; inputs are cast to float
    before being fed to ``net``. The network is put into training mode.

    Returns the mean of the per-batch loss values (each batch counts
    equally, regardless of its size).
    """
    net.train()

    num_batches = len(trainloader)
    batch_losses = []

    for inputs, targets in trainloader:
        predictions = net(inputs.float())
        batch_loss = criterion(predictions, targets)
        batch_losses.append(batch_loss.item())

        # Clear stale gradients, back-propagate, then update the weights.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

    return sum(batch_losses) / num_batches


def eval_net(net, dataloader, criterion):
    """Evaluate ``net`` on every batch of ``dataloader``.

    Each batch is a ``(inputs, labels)`` pair; inputs are cast to float
    before being fed to ``net``. The network is switched to evaluation mode
    and left in that mode. The forward passes run under ``torch.no_grad()``
    so no autograd graph is built during evaluation.

    Returns
    -------
    (loss, acc): tuple of float
        ``loss`` is the sample-weighted mean of ``criterion`` over all
        samples; ``acc`` is the fraction of samples whose arg-max
        prediction equals the label.

    Raises
    ------
    ZeroDivisionError
        If ``dataloader`` yields no samples.
    """
    net.eval()

    total = 0
    total_loss = 0.0
    total_correct = 0

    with torch.no_grad():
        for x, labels in dataloader:
            batch_size = len(labels)
            outputs = net(x.float())
            _, predicted = torch.max(outputs, 1)

            total_correct += (predicted == labels).sum().item()
            # criterion returns a batch mean; re-weight by the batch size so
            # a smaller final batch does not skew the overall average.
            total_loss += criterion(outputs, labels).item() * batch_size
            total += batch_size

    return total_loss / total, total_correct / total
 
    
class Model(nn.Module):
    """LSTM sequence classifier: stacked LSTM followed by a linear layer.

    The prediction is computed from the LSTM output at the last time step.
    """

    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        """Create the LSTM and the output layer.

        Parameters
        ----------
        input_size: int
            Number of features per time step.
        output_size: int
            Number of output units produced by the final linear layer.
        hidden_dim: int
            Size of the LSTM hidden state.
        n_layers: int
            Number of stacked LSTM layers.
        """
        super().__init__()

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        # batch_first=True: inputs are (batch, time, features).
        self.rnn = nn.LSTM(input_size, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        """Return the linear layer's output for a batch of sequences ``x``.

        ``x`` has shape ``(batch, time, input_size)``; the result has shape
        ``(batch, output_size)``.
        """
        batch_size = x.size(0)

        # Fresh zero hidden and cell states for every forward pass.
        h0 = self.init_hidden(batch_size)
        c0 = self.init_hidden(batch_size)

        out, _ = self.rnn(x, (h0, c0))
        # Keep only the last time step as the sequence summary.
        return self.fc(out[:, -1, :])

    def init_hidden(self, batch_size):
        """Return a zero state of shape ``(n_layers, batch_size, hidden_dim)``.

        The tensor is created with the device and dtype of the model's
        parameters, so the model keeps working after ``.to(device)`` or
        ``.double()``.
        """
        reference = next(self.parameters())
        return torch.zeros(
            self.n_layers,
            batch_size,
            self.hidden_dim,
            device=reference.device,
            dtype=reference.dtype,
        )

def split_rand(dataset, batch_size, split_ratio=0.7, seed=42, shuffle=True):
    """Split ``dataset`` into a training and a validation ``DataLoader``.

    Parameters
    ----------
    dataset:
        Any object supporting ``len()`` that ``DataLoader`` can index.
    batch_size: int
        Batch size used by both loaders.
    split_ratio: float
        Fraction of the entries assigned to the training loader; the split
        point is ``floor(split_ratio * len(dataset))``.
    seed: int
        Seed passed to ``np.random.seed`` (this seeds NumPy's global
        generator) so the split is reproducible.
    shuffle: bool
        When True (default) the indices are permuted before splitting.
        When False the first entries go to training and the rest to
        validation. Either way the order *within* each subset is still
        randomised per epoch by ``SubsetRandomSampler``.

    Returns
    -------
    (train_loader, valid_loader): tuple of DataLoader
    """
    import math

    num_entries = len(dataset)
    indices = list(range(num_entries))
    np.random.seed(seed)
    if shuffle:
        np.random.shuffle(indices)

    split = int(math.floor(split_ratio * num_entries))
    train_idx, valid_idx = indices[:split], indices[split:]

    train_loader = DataLoader(
        dataset,
        sampler=SubsetRandomSampler(train_idx),
        batch_size=batch_size,
    )
    valid_loader = DataLoader(
        dataset,
        sampler=SubsetRandomSampler(valid_idx),
        batch_size=batch_size,
    )

    return train_loader, valid_loader
    

# LSTM classifier: 4 input features per position, 2 output classes,
# 264 hidden units, 2 stacked layers.
model = Model(input_size=4, output_size=2, hidden_dim=264, n_layers=2)

# Load the sequences and split them with split_rand's default ratio and seed.
dataset = SeqData()
train_loader, test_loader = split_rand(dataset, batch_size=16)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 300 epochs; after each one print the mean training loss followed by the
# (loss, accuracy) pair measured on the held-out loader.
for _ in range(300):
    epoch_train_loss = train(model, train_loader, optimizer, criterion)
    print(epoch_train_loss)

    held_out_metrics = eval_net(model, test_loader, criterion)
    print(held_out_metrics)
    

0.6965733051300049
(0.7009388506412506, 0.40625)
0.6927249789237976
(0.7114456593990326, 0.40625)
0.6918678402900695
(0.7113458514213562, 0.40625)
0.6882955074310303
(0.7092073559761047, 0.40625)
0.6900540828704834
(0.7120343744754791, 0.40625)
0.6872353792190552
(0.7117859125137329, 0.40625)
0.6881177186965942
(0.7116632759571075, 0.40625)
0.689265513420105
(0.7069198489189148, 0.40625)
0.6808666586875916
(0.7001826465129852, 0.40625)
0.6886512637138367
(0.6845649480819702, 0.625)
0.6861053705215454
(0.6832996308803558, 0.625)
0.6828104615211487
(0.6893715262413025, 0.5625)
0.6761790037155151
(0.6878548562526703, 0.5625)
0.6719621777534485
(0.6889884769916534, 0.53125)
0.6517516016960144
(0.651681661605835, 0.65625)
0.6851989507675171
(0.6170934438705444, 0.65625)
0.6369097352027893
(0.6626282930374146, 0.625)
0.6562268853187561
(0.6374286413192749, 0.625)
0.6038472414016723
(0.6227944791316986, 0.65625)
0.5887384653091431
(0.5791071653366089, 0.6875)
0.5691644012928009
(0.53045746684