In [3]:
def count_headers(file_path):
    header_count = 0
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('>'):
                header_count += 1
    return header_count

# AtPRRs.fasta protein number
atprrs_file = 'AtPRRs.fasta'
atprrs_header_count = count_headers(atprrs_file)
print(f"AtPRRs.fasta protein number: {atprrs_header_count}")

# extracted_sequences.fasta protein number
extracted_file = 'extracted_sequences.fasta'
extracted_header_count = count_headers(extracted_file)
print(f"extracted_sequences.fasta protein number: {extracted_header_count}")


AtPRRs.fasta protein number: 274
extracted_sequences.fasta protein number: 247


In [5]:
def count_proteins(fasta_file, list_file):
    with open(list_file, 'r') as list_file:
        proteins_list = [line.strip() for line in list_file]

    protein_count = 0

    # read FASTA
    with open(fasta_file, 'r') as fasta_file:
        current_protein = None
        for line in fasta_file:
            if line.startswith('>'):
                current_protein = line.strip()[1:].split('.')[0]  # extract AGI code from header
                if current_protein in proteins_list:
                    protein_count += 1

    return protein_count

# Count protein number of AtPRRs.fasta exist in Final_receptor_list.txt
atprrs_file = 'AtPRRs.fasta'
Final_receptor_list = 'Final_receptor_list.txt'
atprrs_protein_count = count_proteins(atprrs_file, Final_receptor_list)
print(f"Protein number of AtPRRs.fasta exist in final_sequences.txt: {atprrs_protein_count}")

# Count protein number of extracted_sequences.fasta exist in Final_receptor_list.txt
extracted_file = 'extracted_sequences.fasta'
extracted_protein_count = count_proteins(extracted_file, Final_receptor_list)
print(f"Protein number of extracted_sequences.fasta exist in final_sequences.txt: {extracted_protein_count}")


Protein number of AtPRRs.fasta exist in final_sequences.txt: 265
Protein number of extracted_sequences.fasta exist in final_sequences.txt: 247


In [9]:
def find_duplicate_headers(fasta_files):
    headers_set = set()

    for fasta_file in fasta_files:
        with open(fasta_file, 'r') as file:
            for line in file:
                if line.startswith('>'):
                    header = line.strip()
                    if header in headers_set:
                        return header 
                    headers_set.add(header)

    return None 

fasta_files = ['AtPRRs.fasta', 'extracted_sequences.fasta']

duplicate_header = find_duplicate_headers(fasta_files)

if duplicate_header:
    print(f"Duplicated protein: {duplicate_header}")
else:
    print("No duplicated protein")


No duplicated protein


In [10]:
import os

def read_fasta(filename):
    """Reads a FASTA file and returns a dictionary with headers as keys and sequences as values."""
    if not os.path.exists(filename):
        print(f"File not found: {filename}")
        return {}

    with open(filename, 'r') as file:
        fasta_dict = {}
        header = None
        sequence = []

        for line in file:
            line = line.strip()
            if line.startswith('>'):
                if header:
                    fasta_dict[header] = ''.join(sequence)
                header = line[1:].split()[0]  # Only keep the AGI code
                sequence = []
            else:
                sequence.append(line)

        if header:
            fasta_dict[header] = ''.join(sequence)

    return fasta_dict

def write_fasta(fasta_dict, output_filename):
    """Writes a dictionary with headers as keys and sequences as values to a FASTA file."""
    with open(output_filename, 'w') as output_file:
        for header, sequence in fasta_dict.items():
            output_file.write(f">{header}\n")
            # Write sequence in lines of 80 characters
            for i in range(0, len(sequence), 80):
                output_file.write(sequence[i:i+80] + '\n')

def merge_fasta_files(input_filenames, output_filename):
    """Merges multiple FASTA files into one and standardizes the format."""
    merged_fasta = {}

    for filename in input_filenames:
        fasta_dict = read_fasta(filename)
        for header, sequence in fasta_dict.items():
            # Remove any '*' characters from the sequence
            standardized_sequence = sequence.replace('*', '')
            merged_fasta[header] = standardized_sequence

    write_fasta(merged_fasta, output_filename)
    print(f"Merged and standardized sequences have been saved to {output_filename}")

def count_headers(filename):
    """Counts the number of headers (lines starting with '>') in a FASTA file."""
    with open(filename, 'r') as file:
        headers = [line for line in file if line.startswith('>')]
    return len(headers)

def check_duplicate_headers(filename):
    """Checks for duplicate headers (lines starting with '>') in a FASTA file."""
    seen_headers = set()
    duplicate_headers = set()

    with open(filename, 'r') as file:
        for line in file:
            if line.startswith('>'):
                header = line.strip()
                if header in seen_headers:
                    duplicate_headers.add(header)
                else:
                    seen_headers.add(header)

    return duplicate_headers

# define file names
input_filenames = [
    'AtPRRs.fasta',
    'extracted_sequences.fasta'
]
output_filename = 'merged_sequences.fasta'

# Formatting fasta
merge_fasta_files(input_filenames, output_filename)

# check duplicated results
duplicate_headers = check_duplicate_headers(output_filename)

if duplicate_headers:
    print("Duplicate headers found:")
    for header in duplicate_headers:
        print(header)
else:
    print("No duplicate headers found in the merged FASTA file.")

# count protein number
header_count = count_headers(output_filename)
print(f"Total number of proteins in the merged FASTA file: {header_count}")


Merged and standardized sequences have been saved to merged_sequences.fasta
No duplicate headers found in the merged FASTA file.
Total number of proteins in the merged FASTA file: 521


In [12]:
def read_fasta(filename):
    """Reads a FASTA file and returns a dictionary with headers as keys and sequences as values."""
    fasta_dict = {}
    with open(filename, 'r') as file:
        header = None
        sequence = []
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                if header:
                    fasta_dict[header] = ''.join(sequence)
                header = line[1:]  # Keep the header as is
                sequence = []
            else:
                sequence.append(line)
        if header:
            fasta_dict[header] = ''.join(sequence)
    return fasta_dict

def write_fasta(fasta_dict, output_filename):
    """Writes a dictionary with headers as keys and sequences as values to a FASTA file."""
    with open(output_filename, 'w') as output_file:
        for header, sequence in fasta_dict.items():
            output_file.write(f">{header}\n")
            for i in range(0, len(sequence), 80):
                output_file.write(sequence[i:i+80] + '\n')

def standardize_fasta(fasta_dict):
    """Standardizes the format of a FASTA dictionary by removing '*' and wrapping sequences to 80 characters per line."""
    standardized_dict = {}
    for header, sequence in fasta_dict.items():
        standardized_sequence = sequence.replace('*', '')
        standardized_dict[header] = standardized_sequence
    return standardized_dict

def merge_fasta_files(fasta_dict1, fasta_dict2):
    """Merges two FASTA dictionaries into one."""
    merged_dict = {**fasta_dict1, **fasta_dict2}
    return merged_dict

def count_headers(fasta_dict):
    """Counts the number of headers in a FASTA dictionary."""
    return len(fasta_dict)

# 파일 이름 정의
merged_filename = 'merged_sequences.fasta'
still_missing_filename = 'still_missing_genes.fasta'
final_merged_filename = 'final_sequences.fasta'

# Read and standardize the still_missing_4genes.fasta file
still_missing_fasta_dict = read_fasta(still_missing_filename)
standardized_still_missing_fasta_dict = standardize_fasta(still_missing_fasta_dict)

# Read the merged_sequences.fasta file
merged_fasta_dict = read_fasta(merged_filename)

# Merge the two FASTA dictionaries
final_fasta_dict = merge_fasta_files(merged_fasta_dict, standardized_still_missing_fasta_dict)

# Write the merged FASTA dictionary to the final_sequences.fasta file
write_fasta(final_fasta_dict, final_merged_filename)

# Count the headers in the final_sequences.fasta file
header_count = count_headers(final_fasta_dict)
print(f"Total number of headers in the final FASTA file: {header_count}")


Total number of headers in the final FASTA file: 527


In [13]:
import os

def read_fasta(filename):
    """Reads a FASTA file and returns a dictionary with headers as keys and sequences as values."""
    fasta_dict = {}
    with open(filename, 'r') as file:
        header = None
        sequence = []
        for line in file:
            line = line.strip()
            if line.startswith('>'):
                if header:
                    fasta_dict[header] = ''.join(sequence)
                header = line[1:]  # Keep the header as is
                sequence = []
            else:
                sequence.append(line)
        if header:
            fasta_dict[header] = ''.join(sequence)
    return fasta_dict

def write_fasta(fasta_dict, output_filename):
    """Writes a dictionary with headers as keys and sequences as values to a FASTA file."""
    with open(output_filename, 'w') as output_file:
        for header, sequence in fasta_dict.items():
            output_file.write(f">{header}\n")
            for i in range(0, len(sequence), 80):
                output_file.write(sequence[i:i+80] + '\n')

def read_protein_list(filename):
    """Reads a file containing a list of protein headers and returns a set of headers."""
    with open(filename, 'r') as file:
        headers = {line.strip().lower() for line in file}  # Convert to lowercase for case insensitivity
    return headers

def normalize_header(header):
    """Normalizes header to match format in the protein list."""
    return header.split('.')[0].lower()

# Define file names
final_sequences_filename = 'final_sequences.fasta'
protein_list_filename = 'Final_receptor_list.txt'
output_filename = 'final_selected_sequences.fasta'

# Check if the protein list file exists
if not os.path.isfile(protein_list_filename):
    print(f"File not found: {protein_list_filename}")
else:
    # Read the protein list
    protein_list = read_protein_list(protein_list_filename)

    # Read the final_sequences.fasta file
    final_fasta_dict = read_fasta(final_sequences_filename)

    # Filter the sequences based on the protein list and track missing proteins
    filtered_fasta_dict = {}
    missing_proteins = protein_list.copy()  # Start with all proteins, remove as we find them

    for header, seq in final_fasta_dict.items():
        normalized_header = normalize_header(header)
        if normalized_header in protein_list:
            filtered_fasta_dict[header] = seq
            missing_proteins.discard(normalized_header)

    # Write the filtered sequences to the output file
    write_fasta(filtered_fasta_dict, output_filename)

    # Output the number of selected proteins
    print(f"Total number of selected proteins in the final FASTA file: {len(filtered_fasta_dict)}")

    # Print the missing proteins
    if missing_proteins:
        print("The following proteins from Final_receptor_list.txt were not found in final_sequences.fasta:")
        for protein in missing_proteins:
            print(protein)
    else:
        print("All proteins from Final_receptor_list.txt were found in final_sequences.fasta.")


Total number of selected proteins in the final FASTA file: 518
The following proteins from Final_receptor_list.txt were not found in final_sequences.fasta:
at4g23231
at1g10860
at1g62090
at4g20790
at4g21370
at1g07655
at4g32710
at1g54470
at2g41890
at3g45920
at1g11140
at1g16140
at3g24400
at4g11500
