In [None]:
import numpy as np
import pandas as pd

## Read the motifs and select the top 5

In [None]:
# Top five motifs (lowest minimum p-value) of each motif source, keyed by source name.
motifs = {}

for name, file in (('internal', 'internal.csv'), ('dnafountain', 'dnafountain.csv')):
    idf = pd.read_csv(file)
    # A motif may occur in several rows; keep its smallest p-value. The string alias
    # 'min' is used instead of np.min because pandas >= 2.1 emits a FutureWarning when
    # a NumPy callable is passed to agg; the aggregated values are the same.
    idf = idf.groupby('motif').agg({'p_value': 'min'})
    # Sort once and reuse the result for both the preview and the selection.
    top_motifs = idf.sort_values('p_value').head(5)
    display(top_motifs)
    motifs[name] = list(top_motifs.index)

motifs

## Generate 10000 random sequences of length 108

In [None]:
# Seed NumPy's global generator so that Restart & Run All regenerates the same
# sequences. Later draws that fall back to the global generator (pandas `.sample`
# called without `random_state`) become repeatable as well.
SEED = 42
np.random.seed(SEED)

# Draw a 10000 x 108 array of nucleotides and join every row into one string.
seqs = np.random.choice(['A','C','G','T'], size=(10000, 108))
seqs = [''.join(s) for s in seqs]
# All sequences must be distinct and have the expected length.
assert len(set(seqs)) == 10000 and len(seqs[0]) == 108

# Ids are "Seq" followed by the zero-padded (5 digits) position in the list.
df = pd.DataFrame({'seq_id': [f"Seq{str(i).zfill(5)}" for i in range(len(seqs))], 'seq': seqs})
df.head(5)

## Check for each sequence if it contains any of the motifs

In [None]:
def find_motif(seq, motifs):
    """Return True if at least one entry of `motifs` occurs as a substring of `seq`.

    Returns False when `motifs` is empty.
    """
    for candidate in motifs:
        if candidate in seq:
            return True
    return False

# Flag, per motif source, which sequences contain at least one of that source's motifs.
flag_columns = []
for source, source_motifs in motifs.items():
    column = f'has_motif_{source}'
    df[column] = df['seq'].apply(find_motif, args=(source_motifs,))
    flag_columns.append(column)

# A sequence counts as motif-containing when any of the per-source flags is set.
df['has_motif'] = df[flag_columns].to_numpy().any(axis=1)
print(df['has_motif'].value_counts())
df.head(5)

## Select 40 sequences without any motif

In [None]:
# Keep only the motif-free sequences, shuffle them, and take the first 40.
motif_free = df.loc[~df['has_motif']]
df_selection = motif_free.sample(frac=1).iloc[:40]

# Fails when fewer than 40 motif-free sequences were available.
assert df_selection.shape[0] == 40
df_selection.head(5)

## Overwrite part of each sequence with each motif: at the front, the back, the middle, and 5 nt in from the front and from the back

In [None]:
new_ids = []
new_seqs = []

# One pass per selected sequence; ids and sequences are read column-wise.
for seq_id, seq in zip(df_selection['seq_id'], df_selection['seq']):

    # Every motif of every source is written into this sequence at five positions.
    for name, m in motifs.items():
        for motif in m:
            k = len(motif)
            # First index of the motif when it is centred in the sequence.
            start_middle = int(len(seq) / 2 - k / 2)

            # (position label, sequence with the motif overwriting the bases at that
            # position). The motif replaces bases rather than being spliced in, so
            # the length of the sequence does not change.
            variants = (
                # motif occupies the first k bases
                ('front', motif + seq[k:]),
                # motif occupies the last k bases
                ('back', seq[:-k] + motif),
                # motif centred in the sequence
                ('middle', seq[:start_middle] + motif + seq[start_middle + k:]),
                # motif starts 5 nt after the start of the sequence
                ('5ntfront', seq[:5] + motif + seq[5 + k:]),
                # motif ends 5 nt before the end of the sequence
                ('5ntback', seq[:-(5 + k)] + motif + seq[-5:]),
            )

            # Emit the variants in the order listed above.
            for position, variant in variants:
                new_ids.append(f"{seq_id}+{name}_{motif}_{position}")
                new_seqs.append(variant)

In [None]:
# Collect the generated ids and sequences in a dataframe.
df_selection_motifinsert = pd.DataFrame({'seq_id': new_ids, 'seq': new_seqs})

# Metadata: every row here carries an inserted motif. The id is split on '+' into the
# source sequence id and the insert id; the insert id is then split on '_' into
# motif source, motif, and position.
df_selection_motifinsert['has_insertedmotif'] = True
id_parts = df_selection_motifinsert['seq_id'].str.split('+', expand=True)
df_selection_motifinsert[['source_seqid', 'insert_id']] = id_parts
insert_parts = df_selection_motifinsert['insert_id'].str.split('_', expand=True)
df_selection_motifinsert[['insert_motif_source', 'insert_motif', 'insert_motif_position']] = insert_parts

# Motif flags: one column per motif source plus a combined column.
flag_columns = []
for source, source_motifs in motifs.items():
    column = f'has_motif_{source}'
    df_selection_motifinsert[column] = df_selection_motifinsert['seq'].apply(find_motif, args=(source_motifs,))
    flag_columns.append(column)
df_selection_motifinsert['has_motif'] = df_selection_motifinsert[flag_columns].to_numpy().any(axis=1)

# Sanity checks: length unchanged by the insertion, expected number of rows
# (40*5*2*5), and every sequence now contains a motif.
assert (df_selection_motifinsert['seq'].map(len) == 108).all()
assert df_selection_motifinsert.shape[0] == 40*5*2*5
assert df_selection_motifinsert['has_motif'].all()

df_selection_motifinsert.head(5)

## Combine the collections of random sequences with those with inserted motifs

In [None]:
# The random sequences carry no inserted motif; record that before combining.
df['has_insertedmotif'] = False

# Random sequences first, motif-inserted sequences after, renumbered from 0.
frames = [df, df_selection_motifinsert]
total_df = pd.concat(frames, ignore_index=True)

# Sanity checks: expected total row count and sequence length.
assert total_df.shape[0] == 12000
assert (total_df['seq'].map(len) == 108).all()

total_df.head(5)

## Save everything

In [None]:
# Write the full table as CSV, without the index column.
total_df.to_csv('./sequence_data.csv', index=False)

# FASTA file: a ">id" header line and a sequence line per record, each record
# followed by an empty line.
with open("design_files.fasta", "w") as fasta:
    for seq_id, seq in zip(total_df['seq_id'], total_df['seq']):
        fasta.write(f">{seq_id}\n{seq}\n\n")

# Shuffle the rows so the sequences are not in any order that may affect synthesis.
total_df_shuffled = total_df.sample(frac=1)
shuffled_seqs = list(total_df_shuffled['seq'])

# Text file without primers: one shuffled sequence per line.
with open("design_files.txt", "w") as plain:
    plain.writelines(f"{seq}\n" for seq in shuffled_seqs)

# Text file with primers: same order, each sequence wrapped in the fixed flanks.
with open("design_files_w_primers.txt", "w") as primed:
    primed.writelines(
        f"ACACGACGCTCTTCCGATCT{seq}AGATCGGAAGAGCACACGTCT\n" for seq in shuffled_seqs
    )