In [4]:
import numpy as np
import pandas as pd
from Bio import SeqIO

In [5]:
# Species processed by the cells below.
species = [
    "celegans",
    "hsapiens",
    "pfalciparum",
    "scerevisiae",
]

# Number of control sequences to generate per species.
# Contains entries for species that are not in `species` as well.
num_sequences = {
    "athaliana": 22703,
    "dmelanogaster": 16972,
    "celegans": 7120,
    "hsapiens": 29598,
    "pfalciparum": 5597,
    "scerevisiae": 5117,
}


In [None]:
def get_probability_dataframe(seq_array):
    """Compute per-position nucleotide frequencies for aligned sequences.

    Parameters
    ----------
    seq_array : array-like of shape (n_sequences, n_positions)
        One row per sequence, one single-character string per cell.

    Returns
    -------
    pandas.DataFrame
        Rows are indexed by 'A', 'T', 'C', 'G' (in that order) and columns by
        position (0 .. n_positions - 1). Each cell is the fraction of
        sequences carrying that nucleotide at that position. Only exact
        matches to the four uppercase letters are counted, so a column sums
        to less than 1 when other characters occur at that position.

    Raises
    ------
    ValueError
        If `seq_array` is not two-dimensional or contains no sequences.
    """
    seq_array = np.asarray(seq_array)
    if seq_array.ndim != 2 or seq_array.shape[0] == 0:
        raise ValueError(
            "seq_array must be a non-empty 2-D array of shape "
            "(n_sequences, n_positions); got shape {}".format(seq_array.shape)
        )

    # Use the row count of the argument itself rather than any notebook-level
    # variable, so the result depends only on what was passed in.
    n_sequences, n_positions = seq_array.shape
    nucleotides = ['A', 'T', 'C', 'G']
    prob_df = pd.DataFrame(0, index=nucleotides, columns=range(n_positions), dtype=float)

    for nucleotide in nucleotides:
        # Column-wise count of sequences that have this nucleotide.
        nucleotide_counts = (seq_array == nucleotide).sum(axis=0)
        prob_df.loc[nucleotide] = nucleotide_counts / n_sequences
    return prob_df

# Print the per-position nucleotide frequency table of every species.
for spec in species:
    # NOTE(review): absolute, machine-specific path; consider a configurable data directory.
    fasta_records = SeqIO.parse(
        f"/Users/tennisnyjmac/Desktop/Promoters/PromoterShape/raw_promoter_sequences/{spec}_200.fa",
        "fasta",
    )
    sequences = [str(rec.seq) for rec in fasta_records]

    # One row per sequence, one character per column.
    seq_array = np.array(list(map(list, sequences)))

    prob_df = get_probability_dataframe(seq_array)
    print(spec, prob_df)
    

In [None]:
# For every species: estimate per-position nucleotide frequencies from the
# promoter FASTA file, sample control sequences position by position from
# those frequencies, and write them to a FASTA file in the working directory.
for spec in species:
    # NOTE(review): absolute, machine-specific path; consider a configurable data directory.
    sequences = [str(record.seq) for record in SeqIO.parse(f"/Users/tennisnyjmac/Desktop/Promoters/PromoterShape/raw_promoter_sequences/{spec}_200.fa", "fasta")]
    seq_array = np.array([list(seq) for seq in sequences])

    # Per-position probability of each nucleotide (rows: nucleotides, columns: positions).
    prob_df = get_probability_dataframe(seq_array)

    # Number of control sequences to generate for this species.
    num = num_sequences[spec]
    sequence_length = seq_array.shape[1]
    nucleotides = prob_df.index.values

    # Sample one whole column (a given position across all `num` sequences)
    # per call. The position probabilities do not depend on the sequence being
    # generated, so this replaces num * sequence_length single draws with
    # sequence_length vectorised draws from the same distributions.
    sampled = np.empty((num, sequence_length), dtype=object)
    for pos in range(sequence_length):
        probabilities = prob_df.loc[:, pos].values
        # Renormalise so the weights sum to 1, as np.random.choice requires.
        probabilities = probabilities / probabilities.sum()
        sampled[:, pos] = np.random.choice(nucleotides, size=num, p=probabilities)

    # Each row of `sampled` is one generated sequence.
    generated_sequences = ["".join(row) for row in sampled]

    # Write out sequences in a fasta file
    with open(f"{spec}_probability_based_control_sequences.fa", "w") as file:
        for i, sequence in enumerate(generated_sequences):
            file.write(f">Sequence_{i+1}\n{sequence}\n")