In [13]:
# read missing seq lst
with open('missing_list.txt', 'r') as file:
    lines = file.readlines()

# remove 0 and #N/A
filtered_lines = [line for line in lines if line.strip() not in ('0', '#N/A')]

# write on txt
with open('missing_list_filtered.txt', 'w') as file:
    file.writelines(filtered_lines)


In [14]:
def read_agi_list(filename):
    """Reads a list of AGI codes from a file."""
    with open(filename, 'r') as file:
        agi_list = [line.strip() for line in file if line.strip()]
    return agi_list

def extract_sequences(agi_list, protein_db_filename, output_filename):
    """Extracts sequences from the protein database for the given AGI codes in FASTA format."""
    with open(protein_db_filename, 'r') as db_file:
        lines = db_file.readlines()

    extracted_sequences = []
    current_sequence = []
    recording = False
    extracted_count = 0

    for line in lines:
        if line.startswith('>'):
            if recording:
                extracted_sequences.append(''.join(current_sequence))
                current_sequence = []
                recording = False
                extracted_count += 1
            agi_code = line.split()[0][1:]  # Extract AGI code from header
            if agi_code in agi_list:
                # Edit header to only contain AGI code
                line = f">{agi_code}\n"
                extracted_sequences.append(line)  # Add edited header to result
                recording = True
        elif recording:
            current_sequence.append(line)

    if recording:
        extracted_sequences.append(''.join(current_sequence))
        extracted_count += 1

    with open(output_filename, 'w') as output_file:
        output_file.writelines(extracted_sequences)

    print(f"Total {extracted_count} sequences have been extracted and saved to {output_filename}")

    # find missing AGI codes
    missing_agi_codes = set(agi_list) - {line.split()[0][1:] for line in extracted_sequences if line.startswith('>')}
    if missing_agi_codes:
        print("The following AGI codes were not found in the protein database:")
        for agi_code in missing_agi_codes:
            print(agi_code)

agi_list_filename = 'missing_list_filtered.txt'
protein_db_filename = 'TAIR10_pep_20101214'
output_filename = 'extracted_sequences.fasta'

# read AGI code
agi_list = read_agi_list(agi_list_filename)

# extract sequence according to AGI code
extract_sequences(agi_list, protein_db_filename, output_filename)


Total 247 sequences have been extracted and saved to extracted_sequences.fasta
The following AGI codes were not found in the protein database:
AT2G32660.2
At3g25020.1
At2g42800.1
At4g13900.1
At2g33080.1
At4g04220.1


In [9]:
import csv

def check_csv_headers(file_path):
    with open(file_path, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        headers = next(reader)  
        print("CSV Headers:", headers)

check_csv_headers('missing_check.csv')


CSV Headers: ['\ufeffGene_ID', 'Major_splicing_variants', 'Seq_info', 'missing_seq_index']


In [10]:
import csv

def main():
    input_file = 'missing_check.csv'
    output_file = 'sequence_not_available_list.txt'
    
    gene_ids = []

    # Read csv file
    with open(input_file, 'r', newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # change header name accoring to csv
            if row['Major_splicing_variants'] == '#N/A' and row['Seq_info'] == '#N/A' and row['missing_seq_index'] == '#N/A':
                gene_ids.append(row['\ufeffGene_ID'])
    
    # save results
    with open(output_file, 'w') as file:
        for gene_id in gene_ids:
            file.write(gene_id + '\n')
    
    # print results
    print(f"Gene IDs saved in {output_file}: {len(gene_ids)}")

if __name__ == "__main__":
    main()


Gene IDs saved in sequence_not_available_list.txt: 14
