<a href="https://colab.research.google.com/github/FAhtisham/Latext-based-EnhancerGAN/blob/main/EDGAN%20(Autoencoder%20only)%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


import numpy as np
import matplotlib.pyplot as plt

import numpy as np
import collections

from tqdm import tqdm 

# Dataset Preprocessing

In [83]:
class Nucleotides:
    """Vocabulary built from the tokens that occur in a collection of sequences.

    Tokens are numbered in sorted order starting at 1 (index 0 is never
    assigned by this class). ``encoding`` maps token -> index and
    ``decoding`` maps index -> token.
    """

    def __init__(self, seqs):
        """Build the vocabulary and both lookup tables from ``seqs``.

        Args:
            seqs: iterable of token iterables (each inner iterable is one
                sequence).
        """
        vocab = self.make_pairs(seqs)
        self.nuc_pairs = vocab
        print("total words in vocab", self.nuc_pairs)
        self.encoding = dict(zip(vocab, range(1, len(vocab) + 1)))
        self.decoding = dict(enumerate(vocab, 1))

    def make_pairs(self, seqs, clip=1):
        """Return the sorted list of tokens seen at least ``clip`` times.

        Args:
            seqs: iterable of token iterables to count.
            clip: minimum number of occurrences a token needs to be kept.

        Returns:
            Sorted list of the retained tokens.
        """
        counts = collections.Counter()
        for tokens in tqdm(seqs):
            counts.update(tokens)

        # With the default clip=1 every counted token has a count >= 1, so
        # this filter removes nothing; it only matters when clip > 1.
        frequent = [token for token, seen in counts.items() if seen >= clip]
        frequent.sort()
        return frequent

    def size(self):
        """Return the number of tokens in the vocabulary."""
        n_tokens = len(self.encoding)
        assert n_tokens == len(self.decoding)
        return n_tokens

  


class Sequences(Dataset):
    """Dataset of FASTA sequences tokenised into 3-character words.

    Each raw sequence is right-padded with ``"P"`` to a multiple of three
    characters, split into consecutive 3-character words, and kept only if it
    contains exactly ``seq_len`` words. Items are returned as 1-D integer
    tensors of vocabulary indices.
    """

    def __init__(self, seq_len=131, data_path="permissive_enhancers"):
        """Load, tokenise and filter the sequences, then build the vocabulary.

        Args:
            seq_len: number of 3-character words a sequence must have to be
                kept.
            data_path: path of the FASTA file to read. Defaults to the file
                name the notebook originally hard-coded.
        """
        self.seq_len = seq_len
        self.data_path = data_path
        print(self.seq_len)
        self.seqs = self.convert_seqs_to_words(self.load_data())
        self.seqs = self.get_size_specific_seqs(self.seqs)
        self.nucleotides = Nucleotides(self.seqs)

    def read_fasta(self, fp):
        """Yield ``(header, sequence)`` pairs from an open FASTA file.

        Args:
            fp: iterable of text lines.

        Yields:
            Tuples of the header line (including the leading ``>``) and the
            concatenation of all sequence lines that follow it.
        """
        name, seq = None, []
        for line in fp:
            line = line.rstrip()
            if line.startswith(">"):
                # A new header closes the previous record, if there was one.
                if name:
                    yield (name, ''.join(seq))
                name, seq = line, []
            else:
                seq.append(line)
        if name:
            yield (name, ''.join(seq))

    def load_data(self):
        """Read every sequence (headers discarded) from ``self.data_path``."""
        with open(self.data_path, "r") as fp:
            sequences = [seq for _, seq in self.read_fasta(fp)]
        print("Sequences Read Succesfully !!!!")
        print("Total Raw Sequences: ", len(sequences))
        return sequences

    def add_padding(self, seq, p_len):
        """Return ``seq`` with ``p_len`` padding characters ``"P"`` appended."""
        return seq + ("P" * p_len)

    def convert_seqs_to_words(self, sequences):
        """Split each sequence into consecutive 3-character words.

        Sequences whose length is not a multiple of three are first padded on
        the right with ``"P"``.

        Args:
            sequences: iterable of strings.

        Returns:
            List with one list of 3-character words per input sequence.
        """
        words_per_seq = []
        for seq in sequences:
            remainder = len(seq) % 3
            if remainder:
                seq = self.add_padding(seq, 3 - remainder)
            # Slice directly instead of building a space-joined string and
            # re-splitting it (which was quadratic in the sequence length).
            words_per_seq.append([seq[k:k + 3] for k in range(0, len(seq), 3)])
        return words_per_seq

    def encode(self, seq):
        """Map a list of words to an int64 array of vocabulary indices.

        Raises:
            KeyError: if a word is not in the vocabulary.
        """
        enc = self.nucleotides.encoding
        # Explicit int64 so the resulting tensor is a LongTensor on every
        # platform; indexing (rather than .get) fails loudly on unknown words.
        return np.array([enc[c] for c in seq], dtype=np.int64)

    def get_size_specific_seqs(self, seqs):
        """Keep only the sequences made of exactly ``self.seq_len`` words."""
        return [words for words in seqs if len(words) == self.seq_len]

    def __len__(self):
        return len(self.seqs)

    def __getitem__(self, i):
        return torch.from_numpy(self.encode(self.seqs[i]))

In [84]:
# Smoke test: build the dataset once and show the word -> index table.
# seq_len is passed to Sequences, which stores it as self.seq_len.
seq_len=131
obj= Sequences(seq_len)
print(obj.nucleotides.encoding)

131
Sequences Read Succesfully !!!!
Total Raw Sequences:  43011


100%|██████████| 400/400 [00:00<00:00, 42119.94it/s]

total words in vocab ['AAA', 'AAC', 'AAG', 'AAP', 'AAT', 'ACA', 'ACC', 'ACG', 'ACP', 'ACT', 'AGA', 'AGC', 'AGG', 'AGP', 'AGT', 'APP', 'ATA', 'ATC', 'ATG', 'ATP', 'ATT', 'CAA', 'CAC', 'CAG', 'CAP', 'CAT', 'CCA', 'CCC', 'CCG', 'CCP', 'CCT', 'CGA', 'CGC', 'CGG', 'CGP', 'CGT', 'CPP', 'CTA', 'CTC', 'CTG', 'CTP', 'CTT', 'GAA', 'GAC', 'GAG', 'GAP', 'GAT', 'GCA', 'GCC', 'GCG', 'GCP', 'GCT', 'GGA', 'GGC', 'GGG', 'GGP', 'GGT', 'GPP', 'GTA', 'GTC', 'GTG', 'GTP', 'GTT', 'TAA', 'TAC', 'TAG', 'TAP', 'TAT', 'TCA', 'TCC', 'TCG', 'TCP', 'TCT', 'TGA', 'TGC', 'TGG', 'TGP', 'TGT', 'TPP', 'TTA', 'TTC', 'TTG', 'TTT']
{'AAA': 1, 'AAC': 2, 'AAG': 3, 'AAP': 4, 'AAT': 5, 'ACA': 6, 'ACC': 7, 'ACG': 8, 'ACP': 9, 'ACT': 10, 'AGA': 11, 'AGC': 12, 'AGG': 13, 'AGP': 14, 'AGT': 15, 'APP': 16, 'ATA': 17, 'ATC': 18, 'ATG': 19, 'ATP': 20, 'ATT': 21, 'CAA': 22, 'CAC': 23, 'CAG': 24, 'CAP': 25, 'CAT': 26, 'CCA': 27, 'CCC': 28, 'CCG': 29, 'CCP': 30, 'CCT': 31, 'CGA': 32, 'CGC': 33, 'CGG': 34, 'CGP': 35, 'CGT': 36, 'CPP': 37




In [85]:
def load(batch_size, seq_len):
    """Build the sequence dataset and wrap it in a shuffling DataLoader.

    Args:
        batch_size: number of sequences per batch.
        seq_len: forwarded to ``Sequences``.

    Returns:
        Tuple ``(loader, vocabulary)`` where ``vocabulary`` is the dataset's
        ``nucleotides`` attribute.
    """
    dataset = Sequences(seq_len)
    loader = DataLoader(dataset, batch_size, shuffle=True)
    return (loader, dataset.nucleotides)

# Autoencoder Model

In [163]:
class Autoencoder(nn.Module):
    """LSTM sequence autoencoder over integer-encoded tokens.

    The encoder embeds the tokens, runs an LSTM and projects its final hidden
    state to a bottleneck vector. The decoder projects that vector back up,
    feeds it to a second LSTM at every time step (and as its initial state),
    and maps each output step to per-token logits.
    """

    def __init__(self, nuc_pair_size, embedding_dims, e_hidden_dims, bottleneck_dims, dec_hidden_dims, seq_length, dropout_size = 0.2):
        """Create the layers.

        Args:
            nuc_pair_size: number of tokens in the vocabulary; one extra index
                is added internally.
            embedding_dims: size of the token embeddings.
            e_hidden_dims: hidden size of the encoder LSTM.
            bottleneck_dims: size of the latent vector.
            dec_hidden_dims: hidden size (and input size) of the decoder LSTM.
            seq_length: number of time steps the decoder produces.
            dropout_size: dropout probability applied to the latent vector.
        """
        super().__init__()
        # One extra row/class on top of the vocabulary size (token indices
        # produced by the vocabulary start at 1).
        nuc_pair_size += 1
        self.seq_length = seq_length

        # Encoder
        self.embedding = nn.Embedding(nuc_pair_size, embedding_dims)
        self.rnn1 = nn.LSTM(input_size=embedding_dims, hidden_size=e_hidden_dims)
        self.fc1 = nn.Linear(in_features=e_hidden_dims, out_features=bottleneck_dims)
        # NOTE(review): a1 is created but never applied in encoder/decoder;
        # kept so the module's attributes and repr stay unchanged.
        self.a1 = nn.ReLU(True)
        self.dropout = nn.Dropout(dropout_size)

        # Decoder. These sizes previously read a notebook-level global
        # (d_hidden_dims) instead of the constructor argument, which raised
        # NameError unless that global happened to be defined.
        self.fc2 = nn.Linear(in_features=bottleneck_dims, out_features=dec_hidden_dims)
        self.rnn2 = nn.LSTM(input_size=dec_hidden_dims, hidden_size=dec_hidden_dims)
        self.fc3 = nn.Linear(in_features=dec_hidden_dims, out_features=nuc_pair_size)

    def encoder(self, x):
        """Encode a batch of token indices into latent vectors.

        Args:
            x: integer tensor, expected as (batch, seq).

        Returns:
            Latent tensor of shape (1, batch, bottleneck_dims).
        """
        # The LSTM is not batch_first, so move the sequence axis to the front.
        x = self.embedding(x).permute(1, 0, 2)
        _, (hidden_states, _) = self.rnn1(x)
        lv = self.fc1(hidden_states)  # latent vector
        lv = self.dropout(lv)
        return lv

    def decoder(self, lv):
        """Decode latent vectors into per-step token logits.

        Args:
            lv: latent tensor of shape (1, batch, bottleneck_dims).

        Returns:
            Logits of shape (batch, classes, seq_length).
        """
        lv = self.fc2(lv)
        # The projected latent vector is the input at every time step and also
        # the initial hidden and cell state of the decoder LSTM.
        output, _ = self.rnn2(lv.repeat(self.seq_length, 1, 1), (lv, lv))
        output = output.permute(1, 0, 2)
        logits = self.fc3(output)
        # Class axis second, as nn.CrossEntropyLoss expects for sequence targets.
        return logits.transpose(1, 2)

    def forward(self, x):
        """Return ``(latent, logits)`` for a batch of token indices."""
        lv = self.encoder(x)
        logits = self.decoder(lv)
        # Drop only the leading layer axis; a bare squeeze() would also drop
        # the batch axis when the batch holds a single sequence.
        return (lv.squeeze(0), logits)

# Check which sequence lengths are most common

In [164]:
# for i in range(len(obj.seqs)):
#   print(len(obj.seqs[i]))

In [165]:
# f_sequences = obj.seqs
# sizes=[]
# for i in range(len(f_sequences)):
#   if(len(f_sequences[i]) not in sizes):
#     sizes.append(len(f_sequences[i]))


# import numpy
# a = np.zeros(shape=(len(sizes)))

# for i in range(len(f_sequences)):
#   for j in range(len(sizes)):
#     if (len(f_sequences[i]) == sizes[j]):
#       a[j]+=1

# print(a, sizes)

# final_sequences = []
# for i in range(len(f_sequences)):
#   if (len(f_sequences[i]) == 131):
#     final_sequences.append(f_sequences[i])

# print(len(final_sequences))



# Training Loop


In [166]:
import argparse


# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = ("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: a notebook only echoes it when it is the last statement of the cell.
device

def train(epoch):
    """Run one training epoch and return the mean per-batch loss.

    Relies on the notebook-level globals ``model``, ``train_loader``,
    ``optimizer``, ``criterion``, ``device``, ``interval`` and ``batch_size``.

    Args:
        epoch: epoch number, used only in the progress messages.

    Returns:
        Sum of the batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for step, batch in enumerate(train_loader):
        optimizer.zero_grad()

        batch = batch.to(device)
        # The autoencoder reconstructs its own input, so the batch is also the target.
        _, logits = model(batch)
        loss = criterion(logits, batch)
        running_loss += loss.item()

        loss.backward()
        optimizer.step()

        # Periodic progress report (disabled when interval <= 0).
        should_log = interval > 0 and step % interval == 0
        if should_log:
            print("epoch: ", epoch, " batch: ", batch_size*step,"/", len(train_loader.dataset), " loss:", loss.item())

    train_loss = running_loss / len(train_loader)
    print('(Train) Epoch: {}|loss{:.4f}'.format(epoch, train_loss))
    return train_loss







In [167]:
# NOTE(review): the triple-quoted block below is a disabled argparse draft. It is
# a bare string expression and has no effect. If it is ever re-enabled, "--dropout"
# should use type=float (its default is 0.2) and "e-hidden-dim" needs leading
# dashes to be an optional flag like the others.
'''
parser = argparse.ArgumentParser()
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--epochs", type=int, default=10)
parser.add_argument("--batch-size", type=int, default=32)
parser.add_argument("--lr", type=float, default=5e-4)
parser.add_argument("--dropout", type=int, default=0.2)
parser.add_argument("--embedding-dim", type=int, default=200)
parser.add_argument("e-hidden-dim", type=int, default=100)

args = parser.parse_args()

print(args)
'''

# Hyper-parameters (read as globals by train() and the cells below).
seed= 0
epochs= 15
batch_size= 32
lr= 5e-04
dropout= 0.2
embedding_dims=200
e_hidden_dims= 100
d_hidden_dims=600
seq_length= 131
bottleneck_dims=200
# Print a progress line every `interval` batches inside train().
interval=10
# Seed PyTorch's RNG and request deterministic cuDNN kernels.
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

# Build the data loader and vocabulary, then the model sized from the vocabulary.
train_loader, nuc_pairs= load(batch_size, seq_length)
model = Autoencoder( nuc_pairs.size(), embedding_dims, e_hidden_dims, bottleneck_dims, d_hidden_dims, seq_length, dropout).to(device)


# Reconstruction loss over token classes, optimised with Adam.
criterion= nn.CrossEntropyLoss()
optimizer= optim.Adam(model.parameters(), lr=lr)

131
Sequences Read Succesfully !!!!
Total Raw Sequences:  43011


100%|██████████| 400/400 [00:00<00:00, 45964.98it/s]

total words in vocab ['AAA', 'AAC', 'AAG', 'AAP', 'AAT', 'ACA', 'ACC', 'ACG', 'ACP', 'ACT', 'AGA', 'AGC', 'AGG', 'AGP', 'AGT', 'APP', 'ATA', 'ATC', 'ATG', 'ATP', 'ATT', 'CAA', 'CAC', 'CAG', 'CAP', 'CAT', 'CCA', 'CCC', 'CCG', 'CCP', 'CCT', 'CGA', 'CGC', 'CGG', 'CGP', 'CGT', 'CPP', 'CTA', 'CTC', 'CTG', 'CTP', 'CTT', 'GAA', 'GAC', 'GAG', 'GAP', 'GAT', 'GCA', 'GCC', 'GCG', 'GCP', 'GCT', 'GGA', 'GGC', 'GGG', 'GGP', 'GGT', 'GPP', 'GTA', 'GTC', 'GTG', 'GTP', 'GTT', 'TAA', 'TAC', 'TAG', 'TAP', 'TAT', 'TCA', 'TCC', 'TCG', 'TCP', 'TCT', 'TGA', 'TGC', 'TGG', 'TGP', 'TGT', 'TPP', 'TTA', 'TTC', 'TTG', 'TTT']





In [168]:
# Echo the model so the notebook shows its layer summary.
model

Autoencoder(
  (embedding): Embedding(84, 200)
  (rnn1): LSTM(200, 100)
  (fc1): Linear(in_features=100, out_features=200, bias=True)
  (a1): ReLU(inplace=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=200, out_features=600, bias=True)
  (rnn2): LSTM(600, 600)
  (fc3): Linear(in_features=600, out_features=84, bias=True)
)

In [170]:
# Train for `epochs` epochs and checkpoint the weights whenever the mean epoch
# loss improves. best_loss starts at +inf: cross-entropy is never negative, so
# the previous initial value of 0 meant `loss < best_loss` was never true and
# no checkpoint was ever written.
best_loss = float("inf")
for epoch in range(epochs):
    loss = train(epoch)
    if loss < best_loss:
        best_loss = loss
        print('saved')
        torch.save(model.state_dict(), 'ae.th')

KeyboardInterrupt: ignored

# Practice

In [39]:
# permute(1, 0) swaps the two axes of a 2-D tensor: shape (3, 2) becomes (2, 3).
rows = [[1, 2], [3, 4], [5, 6]]
arr = torch.tensor(rows).permute(1, 0)
arr

tensor([[1, 3, 5],
        [2, 4, 6]])

In [38]:
# permute(2, 0, 1) moves the last axis to the front: shape (3, 5, 2) becomes (2, 3, 5).
arr2 = torch.randn(3, 5, 2)
print(arr2.size(), arr2, sep="\n")
arr2 = arr2.permute(2, 0, 1)
print(arr2.size(), arr2, sep="\n")

torch.Size([3, 5, 2])
tensor([[[ 0.2006,  1.8473],
         [ 0.1055, -1.1586],
         [ 0.1208,  0.3094],
         [-0.0632, -0.5114],
         [ 0.2881,  1.3463]],

        [[-0.3929,  1.1025],
         [ 1.2375,  0.4721],
         [-0.2026,  0.5434],
         [-0.2871, -0.9966],
         [-0.8792, -0.4815]],

        [[-1.4430, -0.9805],
         [ 0.2667,  0.6952],
         [-0.1730, -1.1442],
         [ 0.8840,  0.5541],
         [-0.0819, -0.5837]]])
torch.Size([2, 3, 5])
tensor([[[ 0.2006,  0.1055,  0.1208, -0.0632,  0.2881],
         [-0.3929,  1.2375, -0.2026, -0.2871, -0.8792],
         [-1.4430,  0.2667, -0.1730,  0.8840, -0.0819]],

        [[ 1.8473, -1.1586,  0.3094, -0.5114,  1.3463],
         [ 1.1025,  0.4721,  0.5434, -0.9966, -0.4815],
         [-0.9805,  0.6952, -1.1442,  0.5541, -0.5837]]])
