In [1]:
### CLEAN FASTA FUNCTION ###

# Author: Raquel Alfonso Rodriguez
# Date: 16/04/25 (created)
# Variables:
    # input_file: path of the file that needs to be cleaned, with extension .fasta or .txt
    # output_file: path of the cleaned file, also with extension .fasta or .txt
    # line_length: the number of characters per sequence line in the output file
# Description:

# Takes fasta files that need to be cleaned up, like this:

# b'>Phy000D0LN_SCHPO\n--------------------------------------------------------------------------------\n-----MG-------
# LEQFKKYPLTFG--PTPITSMK-RLSKTLGGKVEIFAKREDCNSGLAFGGNKIRKLEYLIPEAIDG\nG--YDTLVSIGGIQSNQTRQVAAVAAHLGLDCVLIQEDWV-DY-K--DT
# ------MYDRVGNIELSRIVNADVRLDS-SK\nF--DI----------------------------GIRP-SFKNALEELTKKGKKPFPIPAGCSEHPYGGLGFVGCVEEIYE\nQE
# K-Q-LGFKFDKIVVCTVTGSSFAGIIVG-----MALTGR-Q-KDVIGID-ASATPEKTKAQVLRIAQNT---AKLIG-\n---L------------------EKELTESDVN-IDTR
# FAHPAYGIPNEGTIEAIKLCGATEGVLTDPVYEGKSMQGLIHL\n-V-RNNEI-AEGSKVLYIHLGGAPALSAYSAYFKNT------------------------------------
# --------\n----------\n>Phy000FV3R_ASPFU\n----------------------------------------------------------------------------
# ----\n------MAVPLPAAFAQIPRYPLLYP-HPSPIHPLTLNNSRNNTPYISLYAKREDHSSPLACAGNKYRKLEYIIPDIRAS\nHSAPPTLVTEGAIQSNHTVQVAAVAKRLG
# LEAVVILHKGTG-G-ASTKV------SFLRTGNPQVVRLLGAEVRMLES-S\nTVPAD----------------------------GDKD-PIPGILEELRAQGKAPYWIPSGASLH
# PLGGLGYARCAFEIAI\nQEQ-A-QGLRFDYIFVACGSGSTVGGLIAGLKLLE-KKEGR-KPRQVIGITSPTKPQEYHEERVLTFARRA---AALIG-\n---L-------------
# -----EKDISMDDVR-LDDRFAGTAYGVLDAQTSTALDLMAKKEAVMLDPVYTSKVVRGLLHW\n-V-QQEELAPDCMNVLFIHTGGQAALGAYADMVEL-----------------
# ----------------------------\n----------\n

# and turns them into proper fasta formats, like this:

# >Phy000D0LN_SCHPO
# MGLEQFKKYPLTFGPTPITSMKRLSKTLGGKVEIFAKREDCNSGLAFGGNKIRKLEYLIPEAIDGGYDTLVSIGGIQSNQ
# TRQVAAVAAHLGLDCVLIQEDWVDYKDTMYDRVGNIELSRIVNADVRLDSSKFDIGIRPSFKNALEELTKKGKKPFPIPA
# GCSEHPYGGLGFVGCVEEIYEQEKQLGFKFDKIVVCTVTGSSFAGIIVGMALTGRQKDVIGIDASATPEKTKAQVLRIAQ
# NTAKLIGLEKELTESDVNIDTRFAHPAYGIPNEGTIEAIKLCGATEGVLTDPVYEGKSMQGLIHLVRNNEIAEGSKVLYI
# HLGGAPALSAYSAYFKNT
# >Phy000FV3R_ASPFU
# MAVPLPAAFAQIPRYPLLYPHPSPIHPLTLNNSRNNTPYISLYAKREDHSSPLACAGNKYRKLEYIIPDIRASHSAPPTL
# VTEGAIQSNHTVQVAAVAKRLGLEAVVILHKGTGGASTKVSFLRTGNPQVVRLLGAEVRMLESSTVPADGDKDPIPGILE
# ELRAQGKAPYWIPSGASLHPLGGLGYARCAFEIAIQEQAQGLRFDYIFVACGSGSTVGGLIAGLKLLEKKEGRKPRQVIG
# ITSPTKPQEYHEERVLTFARRAAALIGLEKDISMDDVRLDDRFAGTAYGVLDAQTSTALDLMAKKEAVMLDPVYTSKVVR
# GLLHWVQQEELAPDCMNVLFIHTGGQAALGAYADMVEL

In [1]:
from openpyxl import load_workbook
import os

In [2]:
def fix_fasta_nodash(input_file, output_file, line_length=80):
    """Clean a raw FASTA dump and write it as gap-free, fixed-width FASTA.

    The input may be ordinary FASTA text or the text form of a Python bytes
    literal (a line starting with ``b'``) in which line breaks are spelled as
    the two characters backslash + "n".  The function:

    1. strips the ``b'`` prefix (and the closing quote, when present),
    2. turns every literal backslash-n into a real line break,
    3. removes ``-`` and spaces from sequence lines (header lines starting
       with ``>`` are written unchanged),
    4. re-wraps each sequence to ``line_length`` characters per line.

    Text that appears before the first ``>`` header is discarded.  Everything
    is processed in memory, so no scratch file is created.

    Parameters
    ----------
    input_file : str
        Path of the file to clean.
    output_file : str
        Path of the cleaned file to write (overwritten if it exists).
    line_length : int, optional
        Maximum number of characters per sequence line (default 80).

    Raises
    ------
    ValueError
        If ``line_length`` is smaller than 1.
    """
    if line_length < 1:
        # A negative width would make range() empty and silently drop every sequence.
        raise ValueError("line_length must be a positive integer")

    # Each record is (header, [sequence fragments]) in input order.
    records = []

    with open(input_file, 'r') as infile:
        for raw in infile:
            raw = raw.strip()
            if raw.startswith("b'"):
                raw = raw[2:]
                # Only drop the closing quote if it is really there; otherwise
                # the last residue of a truncated dump would be lost.
                if raw.endswith("'"):
                    raw = raw[:-1]

            # Escaped line breaks become real ones; handle each piece separately.
            for piece in raw.replace('\\n', '\n').split('\n'):
                piece = piece.strip()
                if not piece:
                    continue  # Skip empty lines
                if piece.startswith('>'):
                    # Headers are kept verbatim: dashes in identifiers are meaningful.
                    records.append((piece, []))
                elif records:
                    records[-1][1].append(piece.replace('-', '').replace(' ', ''))

    fixed_fasta = []
    for header, fragments in records:
        fixed_fasta.append(header)
        sequence = ''.join(fragments)
        fixed_fasta.extend(
            sequence[i:i + line_length]
            for i in range(0, len(sequence), line_length)
        )

    # Write output file
    with open(output_file, 'w') as outfile:
        outfile.write('\n'.join(fixed_fasta) + '\n')

In [3]:
def fix_fasta_keepdash(input_file, output_file, line_length=80):
    """Clean a raw FASTA dump and write it as fixed-width FASTA, keeping gaps.

    The input may be ordinary FASTA text or the text form of a Python bytes
    literal (a line starting with ``b'``) in which line breaks are spelled as
    the two characters backslash + "n".  The function:

    1. strips the ``b'`` prefix (and the closing quote, when present),
    2. turns every literal backslash-n into a real line break,
    3. removes spaces from sequence lines but keeps ``-`` characters,
    4. re-wraps each sequence to ``line_length`` characters per line.

    Header lines (starting with ``>``) are written unchanged.  Text that
    appears before the first header is discarded.  Everything is processed in
    memory, so no scratch file is created.

    Parameters
    ----------
    input_file : str
        Path of the file to clean.
    output_file : str
        Path of the cleaned file to write (overwritten if it exists).
    line_length : int, optional
        Maximum number of characters per sequence line (default 80).

    Raises
    ------
    ValueError
        If ``line_length`` is smaller than 1.
    """
    if line_length < 1:
        # A negative width would make range() empty and silently drop every sequence.
        raise ValueError("line_length must be a positive integer")

    fixed_fasta = []
    pending = []        # sequence fragments collected for the current header
    have_header = False

    def flush():
        # Emit the sequence gathered so far, wrapped to the requested width.
        sequence = ''.join(pending).replace(' ', '')
        for start in range(0, len(sequence), line_length):
            fixed_fasta.append(sequence[start:start + line_length])
        del pending[:]

    with open(input_file, 'r') as infile:
        for raw in infile:
            raw = raw.strip()
            if raw.startswith("b'"):
                raw = raw[2:]
                # Only drop the closing quote if it is really there; otherwise
                # the last character of a truncated dump would be lost.
                if raw.endswith("'"):
                    raw = raw[:-1]

            # Escaped line breaks become real ones; handle each piece separately.
            for piece in raw.replace('\\n', '\n').split('\n'):
                piece = piece.strip()
                if not piece:
                    continue  # Skip empty lines
                if piece.startswith('>'):
                    if have_header:
                        flush()  # finish the previous record first
                    fixed_fasta.append(piece)
                    have_header = True
                elif have_header:
                    pending.append(piece)

    # Write the last sequence
    if have_header:
        flush()

    # Write output file
    with open(output_file, 'w') as outfile:
        outfile.write('\n'.join(fixed_fasta) + '\n')

In [7]:
# NO DASH
# Batch run: every "*.clean.fasta" file in the folder below is converted to a
# gap-free "*_nd.fasta" file written next to it.

directory = os.fsencode(r'C:\Users\alfonsorodrr\OneDrive - FUJITSU\Escritorio\FUJITSU\QCare - Árboles filogenéticos\Github\Folders\Jacobo\Jacobo 10-50-100-150\150')
# The bare file names returned by os.listdir() are used as paths below, so the
# working directory has to be the data folder itself.
os.chdir(directory)

for file in os.listdir(directory):
    # listdir() was given bytes, so it yields bytes; decode back to str.
    input_file = os.fsdecode(file)
    if not input_file.endswith(".clean.fasta"):
        continue  # not a raw export, leave it alone
    output_file = input_file.replace(".clean.fasta", "_nd.fasta")
    fix_fasta_nodash(input_file, output_file, line_length=80)
    # os.remove(input_file) # WARNING!! deletes .clean.fasta to make things easier for batch

In [8]:
# KEEP DASH
# Batch run: every "*.clean.fasta" file in the folder below is converted to a
# re-wrapped "*_kd.fasta" file (alignment gaps preserved) written next to it.

directory = os.fsencode(r'C:\Users\alfonsorodrr\OneDrive - FUJITSU\Escritorio\FUJITSU\Github\Folders\New_benchmarking\Puccinia_graminis\Selection branches')
# The bare file names returned by os.listdir() are used as paths below, so the
# working directory has to be the data folder itself.
os.chdir(directory)

for file in os.listdir(directory):
    # listdir() was given bytes, so it yields bytes; decode back to str.
    input_file = os.fsdecode(file)
    if not input_file.endswith(".clean.fasta"):
        continue  # not a raw export, leave it alone
    output_file = input_file.replace(".clean.fasta", "_kd.fasta")
    fix_fasta_keepdash(input_file, output_file, line_length=80)
    # os.remove(input_file) # WARNING!! deletes .clean.fasta to make things easier for batch