**Author**: Yichen Fang

**Credit to**: Joanne Chen for the original `generate_one_seq` and `generate_fasta` functions (they were subsequently modified).

This notebook can be used to generate random pseudo-DNA sequences.

In [None]:
import pickle
import random
import glob
import numpy as np

In [None]:
# Number of random sequences to generate.
seq_num = 5000

# Length (number of characters) of each generated sequence.
seq_len = 1000

# Full path of the file used as a buffer for the generated sequence list
# (this is a file path, not a folder).
# NOTE(review): the file name is spelled "sequnce"; presumably other code
# reads this exact name, so confirm before correcting the spelling.
path_to_buffer_file = (
    "/home/ubuntu/formatted/random_sequences/"
    + "random_sequnce_buffer.txt"
)

In [None]:
def generate_one_seq(length):
    ''' Randomly generates one pseudo sequence record.
        length: the length of the sequence
        Returns ONE string (not a tuple): a pseudo header, a newline, then the pseudo sequence.
        The header has the form "VT0000|<c>|MEMB001A|+|<n>", where <c> is a random 0 or 1
        (presumably the class indicator of the sequence -- confirm with downstream users)
        and <n> is a random integer in [1000, 9000).
        The leading ">" of a FASTA header and the trailing newline are not included.
    '''
    header = "VT0000|" + str(np.random.randint(0, 2)) + "|MEMB001A|+|" + str(np.random.randint(1000, 9000))
    # One uniformly chosen base per position, joined into a single long string
    sequence = "".join(random.choice("ACGT") for _ in range(length))
    return header + "\n" + sequence

def generate_fasta(path, num_seqs=1000, seq_length=1000):
    """
    Writes randomly generated sequences to a FASTA-style file.

    path: a string indicating the location in which random file will be created / used
          (the file is opened in "w" mode, so existing content is overwritten)
    num_seqs: the number of records to write (default 1000)
    seq_length: the length of each sequence (default 1000)

    Each record is written as ">" followed by the string returned by
    generate_one_seq (header line, then sequence line) and a newline.
    Progress is printed after every 100 records.
    """
    # "with" guarantees the file is closed even if generation or a write fails
    with open(path, "w") as ofile:
        for written in range(1, num_seqs + 1):
            ofile.write(">" + generate_one_seq(seq_length) + "\n")
            if written % 100 == 0:
                print("finished: " + str(written))

In [None]:
# Write a FASTA-style file of randomly generated sequences to the output folder
generate_fasta("/home/ubuntu/formatted/random_sequences/output/1000_raw_random_sequences.fa")

In [None]:
# Build the list of random records; each entry is [header, class indicator, sequence]
seq_lst = []
counter = 0

for _ in range(seq_num):
    # generate_one_seq returns ONE string: "<header>\n<sequence>".
    # Unpacking it directly into three names raises ValueError, so split it instead.
    seq_header, seq_detail = generate_one_seq(seq_len).split("\n", 1)
    # The second "|"-separated header field is the random 0/1 value
    # NOTE(review): assumed to be the class indicator of the sequence -- confirm
    seq_indicator = int(seq_header.split("|")[1])
    seq_lst.append([seq_header, seq_indicator, seq_detail])
    counter += 1
    if counter % 100 == 0:
        print("finished: " + str(counter))

In [None]:
# One-hot lookup table for nucleotides, in A/C/G/T column order.
# Upper- and lower-case letters map to the same vector; 'n' / 'N' match no
# column and therefore map to the all-zero vector.
# Every key gets its own list object.
base_pairs = {
    symbol: [int(symbol.upper() == base) for base in "ACGT"]
    for symbol in "ACGTacgtnN"
}

def one_hot_encoding(seq_info, seq_len):
    ''' One-hot encode a sequence record and append random motif scores.
        seq_info: a record whose element at index 2 is the sequence string
        seq_len: the number of leading positions that receive motif scores
        Returns a list with one row per character of the sequence: a copy of the
        4 one-hot values from base_pairs, followed (for the first seq_len positions)
        by 3 random floats drawn from [-1, 1).
    '''
    # Copy each lookup vector so the shared base_pairs table is never mutated
    rows = [list(base_pairs[base]) for base in seq_info[2]]
    for position in range(seq_len):
        row = rows[position]
        row.extend(np.random.uniform(low=-1, high=1) for _ in range(3))
    return rows

In [None]:
# Replace the raw sequence string (index 2) of each of the first seq_num
# records with its encoded form, reporting progress every 100 records.
counter = 0

for counter in range(1, seq_num + 1):
    record = seq_lst[counter - 1]
    record[2] = one_hot_encoding(record, seq_len)
    if counter % 100 == 0:
        print("finished: " + str(counter))

In [None]:
# Display the second record (index 1) to inspect its contents
seq_lst[1]

In [None]:
# Serialize the whole sequence list to the buffer file in binary pickle format
with open(path_to_buffer_file, "wb") as buffer_file:
    pickle.dump(seq_lst, buffer_file)