## Installing Proper Required Libraries

In [135]:
!pip install Bio
!pip install pandas
!pip install numpy
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

# download data
!wget --user=ftp ftp://ccg.epfl.ch/epd/current/epd_16K.seq

clear_output()

## Building DataFrames

In [209]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import math
import random

# Initializing lists that will contain info about each record
accessIDs = []
DNA_Sequences = []
sequence_lengths = []
parsed_subsequences = []

# All DNA sequences of the human genome are 16,000 characters long (Hence epd_16K)
# if using epd_16K. If using epd_seq, then change the sequence_length variable
# to 600
sequence_length = 16000

test = 1

# Loop through all records parsed by SeqIO.
#
# ") Hs" on the description line means that
# we've found a Homo sapien DNA sequence, so
# add the appropriate properties of seq_record
# to the appropriate arrays. This is our way
# to filter out non-homosapien sequences
for seq_record in SeqIO.parse('epd_16K.seq', 'fasta'):
  if seq_record.seq.find('N') == -1 and seq_record.description.find(') Hs') != -1:
    accessIDs.append(seq_record.id)
    DNA_Sequences.append(seq_record.seq)
    sequence_lengths.append(len(seq_record))

print("Number of sequences: " + str(len(DNA_Sequences)))

# Defining rows of PSFM and initializing all frequencies to 0
adenine_frequencies = np.zeros(sequence_length)
guanine_frequencies = np.zeros(sequence_length)
thymine_frequencies = np.zeros(sequence_length)
cytosine_frequencies = np.zeros(sequence_length)
other_frequency = np.zeros(sequence_length)

# Loop through all the DNA sequences in the DNA_Sequence list.
# For each sequence, check if the nucleobase (character) at 
# postition sequence_index is adenine, guanine, thymine, or cytosine.
# If it's any of those, then increment the count at that list's index.
for sequence in DNA_Sequences:

  # Randomly generating where the window begins,
  # and since we want each sequence to be 6,000 chars
  # long, we add 6,000 to the rng splicedStart, giving
  # us the spliced end. Then, since we moved the start of
  # the sequence up by a number of splicedStart, to get where
  # the promoter is in relation to that new start, we can
  # subtract splicedStart from 10,000
  # i.e. promoterindex = 10,000 - splicedStart
  splicedStart = random.randint(5000, 9000)
  splicedEnd = splicedStart + 6000
  sub_sequence = sequence[splicedStart : splicedEnd]
  promoterIndex = 10000 - splicedStart

  # OneHot on subsequence string to create list of lists for features
  parsed_str = str(sub_sequence).replace('A', '1000').replace('G', '0100').replace('T', '0010').replace('C', '0001')

  one_hot_sequence = [int(i) for i in parsed_str]
  one_hot_sequence = np.asarray(one_hot_sequence)
  # one_hot_sequence = np.asarray(one_hot_sequence, dtype = 'int32')
  # one_hot_sequence = pd.Series(data = one_hot_sequence)

  if test == 1:
    test = 0
    print('OHS Type: ', type(one_hot_sequence))
  
  pair = (sub_sequence, one_hot_sequence, promoterIndex, splicedStart, splicedEnd)
  parsed_subsequences.append(pair)
  

  # Reset where the index is in the
  # sequence string
  sequence_index = 0

  for nucleobase in sequence:
    if nucleobase == 'A':
      adenine_frequencies[sequence_index] += 1
    elif nucleobase == 'G':
      guanine_frequencies[sequence_index] += 1
    elif nucleobase == 'T':
      thymine_frequencies[sequence_index] += 1
    elif nucleobase == 'C':
      cytosine_frequencies[sequence_index] += 1
    else:
      other_frequency[sequence_index] += 1

    sequence_index += 1

subsequence_df = pd.DataFrame(parsed_subsequences, columns = ['Sub-Sequences', 'One-Hot', 'PromoterIndex', 'SequenceStart', 'SequenceEnd'])

# In the above loop, we calculated how many occurences M of a particular nucelobase N
# at some sequence index I. In this loop, we divide each number of occurences by the total
# number of possible occurences there could be, which would be the number of records
# because each record is guaranteed to have a nucleobase at character I in the human
# DNA sequence. I.e.
# P(sequence[i] == N) == (# of nucleobases N found at index i) / (# of possible nucleobases N at index i)
for i in range(sequence_length):
  adenine_frequencies[i] = (adenine_frequencies[i] / len(DNA_Sequences)) * 100
  guanine_frequencies[i] = (guanine_frequencies[i] / len(DNA_Sequences)) * 100
  thymine_frequencies[i] = (thymine_frequencies[i] / len(DNA_Sequences)) * 100
  cytosine_frequencies[i] = (cytosine_frequencies[i] / len(DNA_Sequences)) * 100


# Defining the series that will be fed into the pandas Data Frame, based off the lists defined above
records_df = pd.DataFrame({'IDs': accessIDs, 'Sequences': DNA_Sequences, 'Lengths': sequence_lengths},
                          columns = ['IDs', 'Sequences', 'Lengths'])

# Defining the data frame for the Position Specific Frequency Matrix (PSFM)
frequency_df = pd.DataFrame({'Adenine Frequencies': adenine_frequencies, 'Guanine Frequencies': guanine_frequencies, 'Thymine Frequencies': thymine_frequencies, 'Cytosine Frequencies': cytosine_frequencies},
                            columns = ['Adenine Frequencies', 'Guanine Frequencies', 'Thymine Frequencies', 'Cytosine Frequencies'])

Number of sequences: 1837
OHS Type:  <class 'numpy.ndarray'>


# RECORDS DATAFRAME INFORMATION

---



## Printing records dataframe

In [None]:
records_df.head(n=20)

# FREQUENCY DATAFRAME INFORMATION

---




## Printing frequency data frame

In [None]:
frequency_df.head(n=10500)

## Describing Frequency data frame

In [None]:
frequency_df.describe()

## Plotting frequency graph

In [None]:
plt.figure()
frequency_df.plot(subplots=True, figsize=(25, 15), xlabel = "Position in DNA Sequence", ylabel = "P(nucleobase)", title = "Position Specific Frequency Graph", ylim = (0, 55))

# SUBSEQUENCE DATAFRAME INFORMATION

---



## Printing Subsequence DataFrame

In [None]:
subsequence_df.head(n=20)

## Describing the subsequence dataframe

In [None]:
print('Index frequencies:')
print(subsequence_df['PromoterIndex'].value_counts())

# Histogram plot of subsequence dataframe

In [None]:
plt.figure()
subsequence_df['PromoterIndex'].plot.hist(histtype = 'bar', bins = 6000, figsize = (25, 5), title = 'Promoter Indices')

## Density plot of subsequence dataframe

In [None]:
plt.figure()
subsequence_df['PromoterIndex'].plot.density(figsize = (25, 5), title = 'Promoter Indices')

## Printing one-hot features

In [210]:
print(subsequence_df['One-Hot'].head(n = 10))

print('Type: ', type(subsequence_df['One-Hot']))

0    [0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, ...
1    [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, ...
2    [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...
3    [0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, ...
4    [0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, ...
5    [0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, ...
6    [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, ...
7    [1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, ...
8    [0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, ...
9    [0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, ...
Name: One-Hot, dtype: object
Type:  <class 'pandas.core.series.Series'>


## Arranging features and labels

In [211]:
X = subsequence_df['One-Hot'].to_numpy()
y = subsequence_df['PromoterIndex'].to_numpy()

print("Features: \n")
print(X)
print("\nFeatures type: ", type(X))

print("Labels: \n")
print(y)
print("\n")

Features: 

[array([0, 1, 0, ..., 1, 0, 0], dtype=int32)
 array([0, 0, 0, ..., 0, 0, 1], dtype=int32)
 array([1, 0, 0, ..., 0, 0, 1], dtype=int32) ...
 array([0, 0, 0, ..., 0, 0, 1], dtype=int32)
 array([0, 0, 0, ..., 0, 1, 0], dtype=int32)
 array([0, 1, 0, ..., 0, 1, 0], dtype=int32)]

Features type:  <class 'numpy.ndarray'>
Labels: 

[1739 2694 2764 ... 4746 1768 1030]




## PyTorch Dataset

In [212]:
import torch
from torch.utils.data import Dataset, random_split

class Sequences(Dataset):
    def __init__(self, X, y):
        self.sequences = torch.from_numpy(X) # convert numpy arrays to torch tensors
        self.labels = torch.from_numpy(y)

        self.sequences = self.sequences.unsqueeze(1) # add extra dimension for torch
        self.labels = self.labels.unsqueeze(1)

    def __len__(self):
        assert len(self.sequences) == len(self.labels) # ensure 1-to-1 correspondence
        return len(self.labels)

    def __getitem__(self, i):
        return self.sequences[i], self.labels[i] # return X, y pair

data = Sequences(X, y)
trainCount = int(0.8 * len(data)) # percent of our data for train
train, test = random_split(data, [trainCount, len(data) - trainCount])

TypeError: ignored