In [15]:
def read_fasta(filename):
    """Reads a FASTA file and returns a list of DNA sequences."""
    with open(filename, 'r') as file:
        sequences = []
        current_sequence = ""
        for line in file:
            line = line.strip()
            if line.startswith(">"):  # Skip headers
                continue
            current_sequence += line  # Add the sequence to the list
        sequences.append(current_sequence)  # Append the last sequence
    return sequences

def generate_profile_matrix(sequences):
    """Generates the profile matrix for a collection of DNA sequences."""
    length = len(sequences[0])  # Length of each sequence
    profile_matrix = {'A': [0] * length, 'C': [0] * length, 'G': [0] * length, 'T': [0] * length}
    
    # Count occurrences of each nucleotide in each position
    for seq in sequences:
        for i in range(length):
            nucleotide = seq[i]
            profile_matrix[nucleotide][i] += 1
    
    return profile_matrix

def generate_consensus_string(profile_matrix):
    """Generates the consensus string from the profile matrix."""
    consensus = []
    for i in range(len(profile_matrix['A'])):
        # Get the nucleotide with the maximum count at each position
        max_count = -1
        max_nucleotide = ""
        for nucleotide in ['A', 'C', 'G', 'T']:
            if profile_matrix[nucleotide][i] > max_count:
                max_count = profile_matrix[nucleotide][i]
                max_nucleotide = nucleotide
        consensus.append(max_nucleotide)
    
    return ''.join(consensus)

def print_profile_matrix(profile_matrix):
    """Prints the profile matrix."""
    for nucleotide in ['A', 'C', 'G', 'T']:
        print(f"{nucleotide}: {' '.join(map(str, profile_matrix[nucleotide]))}")

# Example usage
filename = "Datasets/rosalind_cons.txt"  # Replace with the actual file path
sequences = read_fasta(filename)
profile_matrix = generate_profile_matrix(sequences)
consensus_string = generate_consensus_string(profile_matrix)

# Print the results
print(consensus_string)
print_profile_matrix(profile_matrix)


AATACGGTCAAACCACCCTGCAACATTGGCTTTTAGATATGGTGAGTGGCGCCCCACTGCCGTCGTTCTGTGCTCGAGCTTGTGGACCCCAGAAAATCTCTCGCTTAGACTAACGAGAAACATTCCCGAAATACGGCGCGAATGGATCCGAATTTCCATCAACGCCGTAGGGCCTTCTCGAGGAGAGGACATTAGAGTAACAGACGTGGGCTTAGCAATGACATCCAGGGTGGTACGAAAAATGTTCGGACAAACCTGTGGTATATCTTTGGTATTCTCGACGGTTTAATACGTCACCCTCACGACGTGCATCATAATCGATTTTTGGATAGTGGTTGCCCATCACCATTTTGAGAAACTTAATGCGGATGTAGTGAGTTCTACCTTCCAATAAGTCCGCCATTGACGCACCCCTGCCAGCGATTTTCCGGGAAGCGCAGATAGTGAGTCACCAGACAGCTCCTGGAACGTATAGCTTAGGATTCATGCCCCTGGTATCGATACTGATTCTAAGTAGTAACTATCCCCGAGCGCCAAATATGCCGCTTACTACCATTGGAACTGCTTTCTTTACGTCGGTAGCTAACTTTACCGGAAACGATGGATTTATTAGCTACCGGCCCTTGGGACTGTGATATTTGAAGCACAAAGGTGGTTGGTTTAACCAGAGAGTACACATAACCTTTGCACGACTGGCGTCTAAGATCGTTGCCGCTACATTCACAGGCTGATTTCCGATTCCGAAGTCAGCTCTGTTCCGTAAGGCCAGCAACGCTTAACCGTGCATCGAGAACGGGTATCAATAAGTTTAGATAGGCATTACATGTATGTTCAAATCACATAACTACGGGGACCTCTTTTAGGTATGGACGGCTAGTAGACTGTGCGCCCCTGGGCTAAGTTCAAAACGGTAAAAATCCGAGGCGGGGCACGGGCTGATAGAAGCTTTAAAGAGAGGCCCTTTTCTTGGAAAATGAGTAGTACCTCACAGTGTCGCGGA