In [2]:
import random

def generate_random_sequence(length):
    """Return a random protein string made of ``length`` residues.

    Every residue is drawn independently with ``random.choice`` from the 20
    one-letter amino-acid codes, so each letter is equally likely at each
    position. A ``length`` of zero (or less) produces an empty string.
    """
    alphabet = 'ACDEFGHIKLMNPQRSTVWY'
    residues = []
    for _ in range(length):
        residues.append(random.choice(alphabet))
    return ''.join(residues)

def create_fasta_file(filename, lengths, num_sequences):
    """Write batches of random sequences to ``filename`` in FASTA layout.

    For every entry of ``lengths`` (in the given order) ``num_sequences``
    records are written. A record is a header line
    ``>Sequence_<length>_<k>`` (``k`` counts from 1) followed by a single
    line holding the sequence. An existing file is overwritten.

    NOTE: this calls the one-argument ``generate_random_sequence(length)``.
    A later cell rebinds that name to a two-argument ``(df, n)`` version,
    so calling this function after that cell has run raises ``TypeError``.
    """
    with open(filename, 'w') as handle:
        for length in lengths:
            for serial in range(1, num_sequences + 1):
                residues = generate_random_sequence(length)
                handle.write(f">Sequence_{length}_{serial}\n{residues}\n")

# Configuration: one batch of records is written per listed length.
fasta_filename = 'random_sequences.fasta'
sequence_lengths = [10, 50, 100, 1500]
num_sequences = 5  # records generated for each length

# Generate the sequences and write them to disk.
create_fasta_file(
    filename=fasta_filename,
    lengths=sequence_lengths,
    num_sequences=num_sequences,
)

print(f"FASTA file '{fasta_filename}' created with random sequences.")


FASTA file 'random_sequences.fasta' created with random sequences.


In [7]:
import pandas as pd
import random

import pandas as pd

# Residue table: (three-letter code, one-letter code, hydrophobic?, value).
# 'value' is a numeric score per residue; generate_random_sequence(df, n)
# uses it as a relative sampling weight.
_RESIDUE_TABLE = (
    ('CYS', 'C', True, 1.660),
    ('MET', 'M', True, 2.370),
    ('PHE', 'F', True, 4.100),
    ('ILE', 'I', True, 5.810),
    ('LEU', 'L', True, 9.430),
    ('VAL', 'V', True, 6.580),
    ('TRP', 'W', True, 1.240),
    ('TYR', 'Y', True, 3.190),
    ('ALA', 'A', True, 7.580),
    ('GLY', 'G', True, 6.840),
    ('THR', 'T', False, 5.670),
    ('SER', 'S', False, 7.130),
    ('GLN', 'Q', False, 3.970),
    ('ASN', 'N', False, 4.440),
    ('GLU', 'E', False, 6.360),
    ('ASP', 'D', False, 5.270),
    ('HIS', 'H', False, 2.240),
    ('ARG', 'R', False, 5.160),
    ('LYS', 'K', False, 5.940),
    ('PRO', 'P', False, 4.920),
)

# Per-residue properties keyed by three-letter code.
amino_acids = {
    code: {'hydrophobic': is_hydrophobic, 'value': value}
    for code, _letter, is_hydrophobic, value in _RESIDUE_TABLE
}

# Three-letter code -> one-letter code.
one_letter_mapping = {
    code: letter for code, letter, _flag, _value in _RESIDUE_TABLE
}

# One row per residue, indexed by three-letter code, with the one-letter
# code attached as an extra column.
df = pd.DataFrame.from_dict(amino_acids, orient='index')
df.index.name = 'Amino Acid'
df['One_Letter'] = df.index.map(one_letter_mapping)

# Show the assembled table.
print(df)

def generate_random_sequence(df, n, hydrophobic_exit_prob=0.15, polar_exit_prob=0.05):
    """Generate an ``n``-residue sequence made of hydrophobic / non-hydrophobic runs.

    The generator is a two-state chain. The starting state is picked at
    random; before every residue the current state is left with probability
    ``hydrophobic_exit_prob`` (when in the hydrophobic state) or
    ``polar_exit_prob`` (when in the non-hydrophobic state). The residue is
    then drawn from the rows of ``df`` belonging to the current state,
    weighted by their ``'value'`` entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a boolean ``'hydrophobic'`` column, a numeric ``'value'``
        column (relative weights) and a ``'One_Letter'`` column.
    n : int
        Number of residues to generate; ``0`` gives an empty string.
    hydrophobic_exit_prob : float, optional
        Per-residue probability of leaving the hydrophobic state (default 0.15).
    polar_exit_prob : float, optional
        Per-residue probability of leaving the non-hydrophobic state
        (default 0.05).

    Returns
    -------
    str
        The generated sequence of one-letter codes.

    Notes
    -----
    Uses the global ``random`` state; the order of RNG calls is one
    ``random.choice`` up front, then one ``random.random`` and one
    ``random.choices`` per residue.
    """
    # Sampling pool (letters, normalised weights) per state. The pools do not
    # change while generating, so each one is built once, on first use,
    # instead of re-filtering the DataFrame for every residue.
    pools = {}
    residues = []
    hydrophobic_mode = random.choice([True, False])

    for _ in range(n):
        # Exactly one uniform draw per residue decides whether to switch state.
        exit_prob = hydrophobic_exit_prob if hydrophobic_mode else polar_exit_prob
        if random.random() < exit_prob:
            hydrophobic_mode = not hydrophobic_mode

        if hydrophobic_mode not in pools:
            mask = df['hydrophobic'] if hydrophobic_mode else ~df['hydrophobic']
            subset = df[mask]
            raw_weights = subset['value'].tolist()
            total = sum(raw_weights)
            pools[hydrophobic_mode] = (
                subset['One_Letter'].tolist(),
                [weight / total for weight in raw_weights],
            )

        letters, weights = pools[hydrophobic_mode]
        residues.append(random.choices(letters, weights=weights)[0])

    return ''.join(residues)

# Example usage (expects 'df', the residue DataFrame built earlier in this cell).

# Seed Python's RNG so the generated sequences are reproducible.
random.seed(42)

# Generate NUM_SEQUENCES sequences, each SEQUENCE_LENGTH residues long.
# The starting mode of every sequence is chosen at random by the generator.
NUM_SEQUENCES = 100
SEQUENCE_LENGTH = 100
sequences = [generate_random_sequence(df, SEQUENCE_LENGTH) for _ in range(NUM_SEQUENCES)]

# Write the sequences to a FASTA file; headers are numbered from 1.
fasta_filename = 'random_sequences_2.fasta'
with open(fasta_filename, 'w') as file:
    for i, seq in enumerate(sequences, start=1):
        file.write(f">Sequence{i}\n{seq}\n")

            hydrophobic  value One_Letter
Amino Acid                               
CYS                True   1.66          C
MET                True   2.37          M
PHE                True   4.10          F
ILE                True   5.81          I
LEU                True   9.43          L
VAL                True   6.58          V
TRP                True   1.24          W
TYR                True   3.19          Y
ALA                True   7.58          A
GLY                True   6.84          G
THR               False   5.67          T
SER               False   7.13          S
GLN               False   3.97          Q
ASN               False   4.44          N
GLU               False   6.36          E
ASP               False   5.27          D
HIS               False   2.24          H
ARG               False   5.16          R
LYS               False   5.94          K
PRO               False   4.92          P


In [3]:
import random
def mutate_sequence(sequence, mutation_percentage):
    """Return a copy of ``sequence`` with a given percentage of residues substituted.

    ``mutation_percentage`` percent of the positions (rounded down) are
    selected without replacement, and each selected residue is replaced by a
    different letter drawn uniformly from the 20 one-letter amino-acid codes.

    Parameters
    ----------
    sequence : str
        Sequence to mutate. A residue that is not one of the 20 standard
        letters may be replaced by any of the 20.
    mutation_percentage : int or float
        Share of positions to mutate, on a 0-100 scale.

    Returns
    -------
    str
        Mutated sequence, same length as ``sequence``.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when the computed number of mutations is
        negative or exceeds ``len(sequence)``.
    """
    # Multiply before dividing: len * (pct / 100.0) can land just below a
    # whole number (100 * (29 / 100.0) == 28.999999999999996), which int()
    # would truncate to one mutation too few.
    num_mutations = int(len(sequence) * mutation_percentage / 100)

    # Distinct positions, so exactly num_mutations residues change.
    positions_to_mutate = random.sample(range(len(sequence)), num_mutations)

    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    mutated = list(sequence)

    for position in positions_to_mutate:
        original = sequence[position]
        # Exclude the current residue so a selected position always changes.
        mutated[position] = random.choice([aa for aa in amino_acids if aa != original])

    return ''.join(mutated)

# Write one randomly mutated copy of the parent sequence per mutation level;
# each FASTA header carries the percentage that was applied.
fasta_filename = 'mutated_sequences_2.fasta'
parent_sequence = "MTDKITKKKRNETYSIYIYKVLRQVHPKIGVSSKAMNIMNSFVNDLFERLVSESYNLSNSSRSKTLTAREIQTSVRLVIPGELAKHSVSEGTKAVAKYRSSI"
mutation_levels = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100]

with open(fasta_filename, 'w') as file:
    for i in mutation_levels:
        seq = mutate_sequence(parent_sequence, i)
        file.write(f">Sequence{i}\n{seq}\n")
    

In [4]:
import random

def mutate_and_swap(sequence, mutation_percentage):
    """Return a copy of ``sequence`` with selected residues flipped to the opposite class.

    ``mutation_percentage`` percent of the positions (rounded down) are
    selected without replacement. A selected hydrophobic residue
    (G, A, Y, W, V, L, I, F, C, M) is replaced by a uniformly chosen
    non-hydrophobic one (T, S, Q, N, E, D, H, R, K, P) and vice versa.
    A selected residue that belongs to neither class is left unchanged, so
    fewer than the requested number of positions may actually differ.

    Parameters
    ----------
    sequence : str
        Sequence to mutate.
    mutation_percentage : int or float
        Share of positions to select, on a 0-100 scale.

    Returns
    -------
    str
        Mutated sequence, same length as ``sequence``.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when the computed number of positions is
        negative or exceeds ``len(sequence)``.
    """
    # Multiply before dividing: len * (pct / 100.0) can land just below a
    # whole number (100 * (29 / 100.0) == 28.999999999999996), which int()
    # would truncate to one position too few.
    num_mutations = int(len(sequence) * mutation_percentage / 100)

    # Distinct positions, chosen without replacement.
    positions_to_mutate = random.sample(range(len(sequence)), num_mutations)

    # Each class lists its ten residues exactly once so draws are uniform.
    # Cysteine belongs to the hydrophobic class, matching the residue table
    # used elsewhere in this notebook.
    hydrophobic_aa = 'GAYWVLIFCM'
    non_hydrophobic_aa = 'TSQNEDHRKP'

    mutated = list(sequence)
    for position in positions_to_mutate:
        residue = sequence[position]
        if residue in hydrophobic_aa:
            mutated[position] = random.choice(non_hydrophobic_aa)
        elif residue in non_hydrophobic_aa:
            mutated[position] = random.choice(hydrophobic_aa)

    return ''.join(mutated)

# Example usage: write one class-swapped copy of the parent sequence per
# mutation level; each FASTA header carries the percentage that was applied.
fasta_filename = 'mutated_sequences_3.fasta'
parent_sequence = "MTDKITKKKRNETYSIYIYKVLRQVHPKIGVSSKAMNIMNSFVNDLFERLVSESYNLSNSSRSKTLTAREIQTSVRLVIPGELAKHSVSEGTKAVAKYRSSI"
mutation_levels = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100]

with open(fasta_filename, 'w') as file:
    for i in mutation_levels:
        seq = mutate_and_swap(parent_sequence, i)
        file.write(f">Sequence{i}\n{seq}\n")