In [5]:
import csv

def parse_tmr_gff3(filename):
    """Read a TMRs.gff3 file and build one summary record per protein.

    Lines starting with '#' are treated as comments; every such line that
    contains 'Length' is counted as one analysed protein. A line starting
    with '//' closes the current protein. Any other line is split on tabs
    and, if it has at least four fields, interpreted as
    ``<protein id>, <feature type>, <start>, <end>``.

    Args:
        filename: Path of the file to read.

    Returns:
        A 7-tuple ``(data, protein_count, signal_count, outside_count,
        tmhelix_count, inside_count, proteins_without_tmhelix)``.
        ``data`` is a list of dicts keyed by 'Protein', 'Signal peptide',
        'ECD start', 'ECD end', 'TM' and 'Cytoplasm' (0 where the feature
        never appeared). The four ``*_count`` values count feature lines
        across the whole file, and the last element lists the IDs of
        proteins that had no 'TMhelix' line.

    Raises:
        ValueError: If the start or end field of a feature line is not an
            integer.
    """
    records = []
    n_length_headers = 0
    feature_totals = {'signal': 0, 'outside': 0, 'TMhelix': 0, 'inside': 0}

    with open(filename, 'r') as handle:
        active_id = None
        record = None

        for raw_line in handle:
            if raw_line.startswith('#'):
                if 'Length' in raw_line:
                    n_length_headers += 1
                continue
            if raw_line.startswith('//'):
                # Block terminator: the next feature line opens a new record,
                # even if it carries the same ID again.
                active_id = None
                continue

            fields = raw_line.strip().split('\t')
            if len(fields) < 4:
                continue

            seq_id, kind = fields[0], fields[1]
            first, last = int(fields[2]), int(fields[3])

            if seq_id != active_id:
                active_id = seq_id
                record = {'Protein': seq_id, 'Signal peptide': 0, 'ECD start': 0,
                          'ECD end': 0, 'TM': 0, 'Cytoplasm': 0}
                # The record is registered once, on first sight, and filled in
                # place afterwards. Records with an empty ID are never reported.
                if seq_id:
                    records.append(record)

            # A repeated feature type overwrites the earlier value, so only the
            # last segment of each type is kept per protein.
            if kind == 'signal':
                record['Signal peptide'] = last
            elif kind == 'outside':
                record['ECD start'] = first
                record['ECD end'] = last
            elif kind == 'TMhelix':
                record['TM'] = f"{first}-{last}"
            elif kind == 'inside':
                record['Cytoplasm'] = f"{first}-{last}"
            else:
                continue
            feature_totals[kind] += 1

    # 'TM' stays at its falsy default of 0 only when no TMhelix line was seen.
    tm_free_ids = [row['Protein'] for row in records if not row['TM']]

    return (
        records,
        n_length_headers,
        feature_totals['signal'],
        feature_totals['outside'],
        feature_totals['TMhelix'],
        feature_totals['inside'],
        tm_free_ids,
    )

def write_to_csv(data, output_filename):
    """Save the per-protein records as a CSV file with a header row.

    Args:
        data: Iterable of dicts keyed by the column names listed below.
        output_filename: Path of the CSV file to create or overwrite.
    """
    columns = ['Protein', 'Signal peptide', 'ECD start', 'ECD end', 'TM', 'Cytoplasm']

    with open(output_filename, 'w', newline='') as out_handle:
        table = csv.DictWriter(out_handle, fieldnames=columns)
        table.writeheader()
        table.writerows(data)

# Parse TMRs.gff3 and collect the per-protein records plus the summary counts.
(
    data,
    protein_count,
    signal_count,
    outside_count,
    tmhelix_count,
    inside_count,
    proteins_without_tmhelix,
) = parse_tmr_gff3("TMRs.gff3")

# Save the per-protein table as domain_prediction.csv.
write_to_csv(data, "domain_prediction.csv")

# Print the summary; a single call with a newline separator emits the same lines.
print(
    f"Number of analyzed proteins: {protein_count}",
    f"Signal peptide count: {signal_count}",
    f"Outside count: {outside_count}",
    f"TMhelix count: {tmhelix_count}",
    f"Inside count: {inside_count}",
    f"Proteins without TMhelix: {', '.join(proteins_without_tmhelix)}",
    sep="\n",
)


Number of analyzed proteins: 518
Signal peptide count: 459
Outside count: 500
TMhelix count: 504
Inside count: 522
Proteins without TMhelix: AT1G80080.1, AT1G70450.1, AT1G70740.1, AT1G51620.1, AT1G55200.1, AT1G16670.1, AT1G28390.1, AT2G41140.1, AT2G32800.1, AT3G46410.1, AT3G09010.1, AT3G13690.1, AT3G46760.1, AT3G57120.1, AT3G51990.1, AT4G23240.1, AT4G00960.1, AT4G11890.3, AT5G56790.1, AT5G23170.1, AT5G41680.1, AT2G33080.1, AT2G42800.1


In [6]:
import csv

def parse_tmr_gff3(filename):
    """Parse a TMRs.gff3 file into per-protein records and missing-feature tallies.

    Lines starting with '#' are treated as comments; every such line that
    contains 'Length' is counted as one analysed protein. A line starting
    with '//' closes the current protein. Any other line is split on tabs
    and, if it has at least four fields, interpreted as
    ``<protein id>, <feature type>, <start>, <end>``.

    Args:
        filename: Path of the file to read.

    Returns:
        A 6-tuple ``(data, protein_count, proteins_without_signal,
        proteins_without_outside, proteins_without_tmhelix,
        proteins_without_inside)``. ``data`` is a list of dicts keyed by
        'Protein', 'Signal peptide', 'ECD start', 'ECD end', 'TM' and
        'Cytoplasm' (0 where the feature never appeared). Each
        ``proteins_without_*`` value is the number of records in ``data``
        for which no line of that feature type was seen.

    Raises:
        ValueError: If the start or end field of a feature line is not an
            integer.
    """
    tracked_features = ('signal', 'outside', 'TMhelix', 'inside')
    data = []
    # seen_features[i] is the set of tracked feature types found for data[i].
    seen_features = []
    protein_count = 0

    with open(filename, 'r') as file:
        current_protein = None
        protein_data = None
        seen = None

        for line in file:
            if line.startswith('#'):
                if 'Length' in line:
                    protein_count += 1
                continue
            if line.startswith('//'):
                # Block terminator: the next feature line opens a new record,
                # even if it carries the same ID again.
                current_protein = None
                continue

            parts = line.strip().split('\t')
            if len(parts) < 4:
                continue

            header, feature_type = parts[0], parts[1]
            start, end = int(parts[2]), int(parts[3])

            if current_protein != header:
                current_protein = header
                protein_data = {'Protein': header, 'Signal peptide': 0, 'ECD start': 0,
                                'ECD end': 0, 'TM': 0, 'Cytoplasm': 0}
                seen = set()
                # Register the record once, on first sight; it is filled in
                # place afterwards. Records with an empty ID are never reported.
                if header:
                    data.append(protein_data)
                    seen_features.append(seen)

            # A repeated feature type overwrites the earlier value, so only the
            # last segment of each type is kept per protein.
            if feature_type == 'signal':
                protein_data['Signal peptide'] = end
            elif feature_type == 'outside':
                protein_data['ECD start'] = start
                protein_data['ECD end'] = end
            elif feature_type == 'TMhelix':
                protein_data['TM'] = f"{start}-{end}"
            elif feature_type == 'inside':
                protein_data['Cytoplasm'] = f"{start}-{end}"
            else:
                continue
            seen.add(feature_type)

    # Presence is tracked separately from the stored values because a stored
    # coordinate of 0 would be indistinguishable from the "absent" default.
    missing = {
        kind: sum(1 for found in seen_features if kind not in found)
        for kind in tracked_features
    }

    return (
        data,
        protein_count,
        missing['signal'],
        missing['outside'],
        missing['TMhelix'],
        missing['inside'],
    )

def write_to_csv(data, output_filename):
    """Save the per-protein records as a CSV file with a header row.

    Args:
        data: Iterable of dicts keyed by the column names listed below.
        output_filename: Path of the CSV file to create or overwrite.
    """
    columns = ['Protein', 'Signal peptide', 'ECD start', 'ECD end', 'TM', 'Cytoplasm']

    with open(output_filename, 'w', newline='') as out_handle:
        table = csv.DictWriter(out_handle, fieldnames=columns)
        table.writeheader()
        table.writerows(data)

# Parse TMRs.gff3 and collect the per-protein records plus missing-feature tallies.
(
    data,
    protein_count,
    proteins_without_signal,
    proteins_without_outside,
    proteins_without_tmhelix,
    proteins_without_inside,
) = parse_tmr_gff3("TMRs.gff3")

# Save the per-protein table as domain_prediction.csv.
write_to_csv(data, "domain_prediction.csv")

# Print the summary; a single call with a newline separator emits the same lines.
print(
    f"Number of analyzed proteins: {protein_count}",
    f"Proteins without signal peptide: {proteins_without_signal}",
    f"Proteins without outside: {proteins_without_outside}",
    f"Proteins without TMhelix: {proteins_without_tmhelix}",
    f"Proteins without inside: {proteins_without_inside}",
    sep="\n",
)


Number of analyzed proteins: 518
Proteins without signal peptide: 59
Proteins without outside: 20
Proteins without TMhelix: 23
Proteins without inside: 3


In [3]:
import csv

def parse_tmr_gff3(filename):
    """Parse a TMRs.gff3 file into per-protein records, tallies and a non-TM ID list.

    Lines starting with '#' are treated as comments; every such line that
    contains 'Length' is counted as one analysed protein. A line starting
    with '//' closes the current protein. Any other line is split on tabs
    and, if it has at least four fields, interpreted as
    ``<protein id>, <feature type>, <start>, <end>``.

    Args:
        filename: Path of the file to read.

    Returns:
        A 7-tuple ``(data, protein_count, proteins_without_signal,
        proteins_without_outside, proteins_without_tmhelix,
        proteins_without_inside, non_tm_proteins)``. ``data`` is a list of
        dicts keyed by 'Protein', 'Signal peptide', 'ECD start', 'ECD end',
        'TM' and 'Cytoplasm' (0 where the feature never appeared). Each
        ``proteins_without_*`` value is the number of records in ``data``
        for which no line of that feature type was seen, and
        ``non_tm_proteins`` lists the IDs of the records without a
        'TMhelix' line, in file order.

    Raises:
        ValueError: If the start or end field of a feature line is not an
            integer.
    """
    tracked_features = ('signal', 'outside', 'TMhelix', 'inside')
    data = []
    # seen_features[i] is the set of tracked feature types found for data[i].
    seen_features = []
    protein_count = 0

    with open(filename, 'r') as file:
        current_protein = None
        protein_data = None
        seen = None

        for line in file:
            if line.startswith('#'):
                if 'Length' in line:
                    protein_count += 1
                continue
            if line.startswith('//'):
                # Block terminator: the next feature line opens a new record,
                # even if it carries the same ID again.
                current_protein = None
                continue

            parts = line.strip().split('\t')
            if len(parts) < 4:
                continue

            header, feature_type = parts[0], parts[1]
            start, end = int(parts[2]), int(parts[3])

            if current_protein != header:
                current_protein = header
                protein_data = {'Protein': header, 'Signal peptide': 0, 'ECD start': 0,
                                'ECD end': 0, 'TM': 0, 'Cytoplasm': 0}
                seen = set()
                # Register the record once, on first sight; it is filled in
                # place afterwards. Records with an empty ID are never reported.
                if header:
                    data.append(protein_data)
                    seen_features.append(seen)

            # A repeated feature type overwrites the earlier value, so only the
            # last segment of each type is kept per protein.
            if feature_type == 'signal':
                protein_data['Signal peptide'] = end
            elif feature_type == 'outside':
                protein_data['ECD start'] = start
                protein_data['ECD end'] = end
            elif feature_type == 'TMhelix':
                protein_data['TM'] = f"{start}-{end}"
            elif feature_type == 'inside':
                protein_data['Cytoplasm'] = f"{start}-{end}"
            else:
                continue
            seen.add(feature_type)

    # Presence is tracked separately from the stored values because a stored
    # coordinate of 0 would be indistinguishable from the "absent" default.
    missing = {
        kind: sum(1 for found in seen_features if kind not in found)
        for kind in tracked_features
    }
    non_tm_proteins = [
        row['Protein']
        for row, found in zip(data, seen_features)
        if 'TMhelix' not in found
    ]

    return (
        data,
        protein_count,
        missing['signal'],
        missing['outside'],
        missing['TMhelix'],
        missing['inside'],
        non_tm_proteins,
    )

def write_to_csv(data, output_filename):
    """Save the per-protein records as a CSV file with a header row.

    Args:
        data: Iterable of dicts keyed by the column names listed below.
        output_filename: Path of the CSV file to create or overwrite.
    """
    columns = ['Protein', 'Signal peptide', 'ECD start', 'ECD end', 'TM', 'Cytoplasm']

    with open(output_filename, 'w', newline='') as out_handle:
        table = csv.DictWriter(out_handle, fieldnames=columns)
        table.writeheader()
        table.writerows(data)

def write_non_tm_list(non_tm_proteins, output_filename):
    """Write one protein ID per line to a text file.

    Args:
        non_tm_proteins: Iterable of protein ID strings (proteins without a
            TMhelix feature).
        output_filename: Path of the text file to create or overwrite.
    """
    with open(output_filename, 'w') as out_handle:
        out_handle.writelines(name + '\n' for name in non_tm_proteins)

# Parse TMRs.gff3 and collect the records, missing-feature tallies and non-TM IDs.
(
    data,
    protein_count,
    proteins_without_signal,
    proteins_without_outside,
    proteins_without_tmhelix,
    proteins_without_inside,
    non_tm_proteins,
) = parse_tmr_gff3("TMRs.gff3")

# Save the per-protein table as domain_prediction.csv.
write_to_csv(data, "domain_prediction.csv")

# Save the IDs of proteins without a TMhelix as non_TM_list.txt.
write_non_tm_list(non_tm_proteins, "non_TM_list.txt")

# Print the summary; a single call with a newline separator emits the same lines.
print(
    f"Number of analyzed proteins: {protein_count}",
    f"Proteins without signal peptide: {proteins_without_signal}",
    f"Proteins without outside: {proteins_without_outside}",
    f"Proteins without TMhelix: {proteins_without_tmhelix}",
    f"Proteins without inside: {proteins_without_inside}",
    sep="\n",
)


Number of analyzed proteins: 518
Proteins without signal peptide: 59
Proteins without outside: 20
Proteins without TMhelix: 23
Proteins without inside: 3


In [7]:
import csv

def extract_long_ecd_proteins(input_filename, output_filename, min_length=50):
    """Write the IDs of proteins whose ECD is longer than ``min_length`` residues.

    The 'ECD start' / 'ECD end' columns are treated as 1-based, inclusive
    coordinates, so the ECD length is ``end - start + 1``. The earlier
    ``end - start`` computation under-counted by one residue and dropped
    ECDs of exactly ``min_length + 1`` residues.

    Args:
        input_filename: CSV file with 'Protein', 'ECD start' and 'ECD end'
            columns.
        output_filename: Text file to create or overwrite; receives one
            protein ID per line, in input order.
        min_length: Exclusive lower bound on the ECD length in residues.
            Defaults to 50.

    Raises:
        KeyError: If a required column is missing from the CSV.
        ValueError: If an 'ECD start' / 'ECD end' value is not an integer.
    """
    long_ecd_proteins = []

    with open(input_filename, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            ecd_start = int(row['ECD start'])
            ecd_end = int(row['ECD end'])
            # A start of 0 cannot be a real 1-based coordinate; such rows (and
            # inverted ranges) are treated as having no ECD at all.
            has_ecd = ecd_start > 0 and ecd_end >= ecd_start
            ecd_length = ecd_end - ecd_start + 1 if has_ecd else 0
            if ecd_length > min_length:
                long_ecd_proteins.append(row['Protein'])

    with open(output_filename, 'w') as file:
        file.writelines(protein + '\n' for protein in long_ecd_proteins)

# Read domain_prediction.csv and write the IDs of proteins with a long ECD
# (cutoff of 50) to ECD_list.txt.
extract_long_ecd_proteins(
    input_filename="domain_prediction.csv",
    output_filename="ECD_list.txt",
)


In [8]:
from Bio import SeqIO
import csv

def extract_ecd_sequences(input_fasta, ecd_list, domain_csv, output_fasta):
    """Cut the ECD region out of each selected protein and write it as FASTA.

    Args:
        input_fasta: FASTA file holding the protein sequences.
        ecd_list: Text file with one selected protein ID per line.
        domain_csv: CSV file with 'Protein', 'ECD start' and 'ECD end'
            columns; the coordinates are used as 1-based, inclusive
            positions.
        output_fasta: FASTA file to create or overwrite. Each written
            record is headed by the protein ID alone.

    Prints the number of FASTA records for which at least one ECD range
    was available.
    """
    # IDs to keep, stripped of surrounding whitespace.
    with open(ecd_list, 'r') as id_handle:
        wanted_ids = {entry.strip() for entry in id_handle}

    # Map protein ID -> list of (start, end) ranges, in CSV order.
    ranges_by_protein = {}
    with open(domain_csv, 'r') as table_handle:
        for entry in csv.DictReader(table_handle):
            name = entry['Protein']
            # Coordinates are converted for every row, selected or not.
            span = (int(entry['ECD start']), int(entry['ECD end']))
            if name not in wanted_ids:
                continue
            ranges_by_protein.setdefault(name, []).append(span)

    matched = 0
    with open(output_fasta, 'w') as fasta_out:
        for entry in SeqIO.parse(input_fasta, 'fasta'):
            name = entry.id.split()[0]
            spans = ranges_by_protein.get(name)
            if spans is None:
                continue
            matched += 1
            # start - 1 converts the 1-based inclusive start to a slice index.
            fasta_out.writelines(
                f'>{name}\n{entry.seq[first - 1:last]}\n' for first, last in spans
            )

    print(f'Total number of proteins extracted: {matched}')

# Files consumed and produced by the ECD extraction step.
input_fasta = 'final_selected_sequences.fasta'
ecd_list = 'ECD_list.txt'
domain_csv = 'domain_prediction.csv'
output_fasta = 'ECD_sequences.fasta'

# Cut out the ECD of every listed protein and write the result to output_fasta.
extract_ecd_sequences(
    input_fasta=input_fasta,
    ecd_list=ecd_list,
    domain_csv=domain_csv,
    output_fasta=output_fasta,
)


Total number of proteins extracted: 488


In [2]:
from Bio import SeqIO

# Paths: FASTA to filter, list of wanted IDs, and destination FASTA.
fasta_file = "ECD_sequences.fasta"
ecd_list_file = "ECD_list.txt"
output_file = "ECD_containing_full_sequences.fasta"
# NOTE(review): the output name says "full sequences", but the input above is
# the ECD-only FASTA - confirm whether the full-length FASTA was intended.

# Load the wanted sequence IDs, one per line, surrounding whitespace removed.
with open(ecd_list_file, "r") as file:
    ecd_list = set(map(str.strip, file))

# Keep only the records whose ID appears in the list, preserving file order.
filtered_sequences = []
for record in SeqIO.parse(fasta_file, "fasta"):
    if record.id not in ecd_list:
        continue
    filtered_sequences.append(record)

# Write the kept records to the new FASTA file.
SeqIO.write(filtered_sequences, output_file, "fasta")

# Report how many records were kept.
print(f"{len(filtered_sequences)} filtered ECD proteins")


488 filtered ECD proteins
