In [1]:
from Bio import Phylo, SeqIO

# Load the phylogenetic tree (Newick format).
tree = Phylo.read("datafiles/muscle-I20240610-105518-0257-60243362-p1m.phylotree", "newick")

# Load the protein sequences, keyed by record ID.
protein_seqs = SeqIO.to_dict(SeqIO.parse("datafiles/muscle-I20240610-105518-0257-60243362-p1m.fa", "fasta"))

# Collect the sequences in the order the tree yields its leaves.
# Leaves that have no matching FASTA record are remembered so they can be
# reported below instead of disappearing from the output without a trace.
ordered_proteins = []
missing_ids = []
for leaf in tree.get_terminals():
    seq_id = leaf.name
    if seq_id in protein_seqs:
        ordered_proteins.append(protein_seqs[seq_id])
    else:
        missing_ids.append(seq_id)

# Save the reordered sequences as FASTA.
output_file = "datafiles/ordered_proteins.fasta"
with open(output_file, "w") as output_handle:
    SeqIO.write(ordered_proteins, output_handle, "fasta")

print(f"Ordered protein sequences have been written to {output_file}")

# Only printed when something was skipped, so the normal output is unchanged.
if missing_ids:
    print(f"Warning: {len(missing_ids)} tree leaves have no sequence in the FASTA file and were skipped:")
    for missing_id in missing_ids:
        print(missing_id)


Ordered protein sequences have been written to datafiles/ordered_proteins.fasta


In [2]:
from Bio import Phylo

# Parse the Newick-format tree file.
tree = Phylo.read("datafiles/muscle-I20240610-105518-0257-60243362-p1m.phylotree", "newick")

# Leaf names, in the order get_terminals() yields them.
ordered_proteins = [terminal.name for terminal in tree.get_terminals()]

# Write the names to a text file, one per line.
output_file = "datafiles/ordered_proteins.txt"
with open(output_file, "w") as output_handle:
    output_handle.writelines(name + "\n" for name in ordered_proteins)

print(f"Ordered protein names have been written to {output_file}")


Ordered protein names have been written to datafiles/ordered_proteins.txt


In [3]:
import csv

# Load the protein names (one per line, surrounding whitespace removed).
input_file = "datafiles/ordered_proteins.txt"
with open(input_file, "r") as file:
    protein_names = list(map(str.strip, file))

# Write one CSV row per protein, numbered from 1 in file order.
# The Gene value is the part of the protein name before the first '.'.
output_file = "datafiles/ordered_proteins.csv"
fieldnames = ["Tree_order", "Gene", "Protein"]
with open(output_file, "w", newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(
        {"Tree_order": rank, "Gene": name.split('.')[0], "Protein": name}
        for rank, name in enumerate(protein_names, start=1)
    )

print(f"CSV file has been written to {output_file}")


CSV file has been written to datafiles/ordered_proteins.csv


In [4]:
import csv

# Gene IDs from the RLK list, stripped and upper-cased for comparison.
rlk_list_file = "datafiles/RLK_list.txt"
with open(rlk_list_file, "r") as file:
    rlk_genes = {entry.strip().upper() for entry in file}

# Protein names, one per line.
input_file = "datafiles/ordered_proteins.txt"
with open(input_file, "r") as file:
    protein_names = [entry.strip() for entry in file]

# Build the rows first: a gene found in the RLK list is typed "RLK",
# every other gene is typed "RLP".
rows = []
for position, protein_name in enumerate(protein_names, start=1):
    gene = protein_name.split('.')[0].upper()
    gene_type = "RLK" if gene in rlk_genes else "RLP"
    rows.append({"Tree_order": position, "Gene": gene, "Protein": protein_name, "Type": gene_type})

# Tally the two types from the finished rows.
rlk_count = sum(1 for row in rows if row["Type"] == "RLK")
rlp_count = len(rows) - rlk_count

# Write the CSV.
output_file = "datafiles/ordered_proteins.csv"
fieldnames = ["Tree_order", "Gene", "Protein", "Type"]
with open(output_file, "w", newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

print(f"CSV file has been written to {output_file}")
print(f"RLK count: {rlk_count}")
print(f"RLP count: {rlp_count}")


CSV file has been written to datafiles/ordered_proteins.csv
RLK count: 436
RLP count: 52


In [5]:
import pandas as pd

# Input tables: the ordered protein list and the gene-to-class table.
ordered_proteins_file = "datafiles/ordered_proteins.csv"
class_file = "datafiles/class.csv"
ordered_proteins_df = pd.read_csv(ordered_proteins_file)
class_df = pd.read_csv(class_file)

# Left-join on Gene so every ordered protein is kept, then label the
# rows that found no class as "Unknown".
merged_df = ordered_proteins_df.merge(class_df, on="Gene", how="left")
merged_df = merged_df.assign(Class=merged_df["Class"].fillna("Unknown"))

# Write the result without the pandas index column.
output_file = "datafiles/ordered_proteins_1.csv"
merged_df.to_csv(output_file, index=False)

print(f"Result saved to {output_file}")


Result saved to datafiles/ordered_proteins_1.csv


In [45]:
import pandas as pd

# Load the merged protein table.
file_path = "datafiles/ordered_proteins_1.csv"
df = pd.read_csv(file_path)

# Rows whose Class is the "Unknown" placeholder.
is_unknown = df["Class"].eq("Unknown")
unknown_count = is_unknown.sum()

print(f"Number of 'Unknown' classes: {unknown_count}")

# List each affected gene once.
print("Genes with 'Unknown' class:")
unknown_genes = df.loc[is_unknown, "Gene"].unique()
for gene in unknown_genes:
    print(gene)


Number of 'Unknown' classes: 0
Genes with 'Unknown' class:


In [6]:
from Bio import SeqIO

# Read the protein IDs of interest (one per line).
with open("datafiles/ordered_proteins.txt") as f:
    protein_list = [line.strip() for line in f]

# Set copy of the IDs: membership is tested once per FASTA record, and a
# set lookup avoids rescanning the whole list each time.
wanted_ids = set(protein_list)

# Parse the FASTA file and keep the full header (description) of every
# record whose ID is wanted. Headers are collected in the order the records
# appear in the FASTA file, which is not necessarily the order of
# ordered_proteins.txt.
fasta_file = "datafiles/TAIR10_pep_20110103_representative_gene_model"
header_data = [
    record.description
    for record in SeqIO.parse(fasta_file, "fasta")
    if record.id in wanted_ids
]

# Save the collected headers, one per line.
output_file = "datafiles/TAIR_header_ordered_proteins.txt"
with open(output_file, "w") as f:
    f.writelines(f"{header}\n" for header in header_data)

print(f"Header data saved to {output_file}")


Header data saved to datafiles/TAIR_header_ordered_proteins.txt


In [38]:
import csv

# extract protein ID from ordered_proteins.txt
def get_protein_ids(file_path):
    """Return every line of ``file_path`` as a list of whitespace-stripped strings.

    Blank lines are kept and appear as empty strings.
    """
    with open(file_path, 'r') as handle:
        return list(map(str.strip, handle))

# extract symbol from TAIR_header_ordered_proteins.txt
def get_symbols(file_path, protein_ids):
    """Map protein IDs to the symbol lists parsed from a header file.

    Each line is split on ``' | '``. The first field is the protein ID; the
    second field is split on ``': '`` and the text after the label is split
    on ``', '`` to give the list of symbols.

    Args:
        file_path: Path of the header text file, one header per line.
        protein_ids: Iterable of protein IDs to look up; other lines are ignored.

    Returns:
        dict mapping each requested protein ID found in the file to its list
        of symbol strings. A header whose symbol text is empty maps to
        ``['']``. Requested IDs whose header has no second field, or whose
        second field contains no ``': '``, are left out of the dict rather
        than raising ``IndexError``.
    """
    # Set lookup instead of scanning the caller's list once per line.
    wanted = set(protein_ids)
    symbols_dict = {}
    with open(file_path, 'r') as f:
        for line in f:
            line_parts = line.strip().split(' | ')
            protein_id = line_parts[0]
            if protein_id not in wanted:
                continue
            if len(line_parts) < 2:
                # Header carries nothing but the ID.
                continue
            label_and_symbols = line_parts[1].split(': ')
            if len(label_and_symbols) < 2:
                # No ': ' separator, e.g. the line ended right after the label.
                continue
            symbols_dict[protein_id] = label_and_symbols[1].split(', ')
    return symbols_dict

# Protein IDs, in the order they appear in ordered_proteins.txt.
protein_ids = get_protein_ids('datafiles/ordered_proteins.txt')

# Symbols parsed from TAIR_header_ordered_proteins.txt, keyed by protein ID.
symbols_dict = get_symbols('datafiles/TAIR_header_ordered_proteins.txt', protein_ids)

# Write one CSV row per protein; a protein with no entry in symbols_dict
# gets 'N/A' in the Symbols column.
with open('datafiles/TAIR_Symbol_ordered_proteins.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Gene', 'Protein', 'Symbols'])  # header row
    writer.writerows(
        # The first 9 characters of the protein ID go in the Gene column.
        [pid[:9], pid, ', '.join(symbols_dict.get(pid, ['N/A']))]
        for pid in protein_ids
    )

print("csv file saved")


csv file saved


In [39]:
# Merge Batch_Symbols.csv with TAIR_Symbols and save as TAIR_Batch ~
import pandas as pd

# Base table and the table of additional symbols.
df_main = pd.read_csv("datafiles/TAIR_Symbol_ordered_proteins.csv")
df_additional = pd.read_csv("datafiles/Batch_Symbols.csv")

# Left-join on Gene so every row of the base table is kept.
df_merged = df_main.merge(df_additional, on='Gene', how='left')

# Keep only the first occurrence of any repeated column name.
first_occurrence = ~df_merged.columns.duplicated()
df_merged = df_merged.loc[:, first_occurrence]

df_merged.to_csv("datafiles/TAIR_Batch_symbol_ordered_proteins.csv", index=False)

print("New file 'TAIR_Batch_symbol_ordered_proteins.csv' created successfully!")


New file 'TAIR_Batch_symbol_ordered_proteins.csv' created successfully!


In [40]:
# Merge Expression_Symbols.csv with TAIR_Batch_Symbols and save as TAIR_Batch_Ex ~
import pandas as pd

# Table from the previous merge and the expression symbol table.
df_merged = pd.read_csv("datafiles/TAIR_Batch_symbol_ordered_proteins.csv")
df_expression = pd.read_csv("datafiles/Expression_Symbols.csv")

# Left-join on Gene, keep the first occurrence of any repeated column name,
# then rename Expression_name to Expression_Symbol.
df_final = df_merged.merge(df_expression, on='Gene', how='left')
df_final = (
    df_final.loc[:, ~df_final.columns.duplicated()]
    .rename(columns={'Expression_name': 'Expression_Symbol'})
)

df_final.to_csv("datafiles/TAIR_Batch_Ex_symbol_ordered_proteins.csv", index=False)

print("New file 'TAIR_Batch_Ex_symbol_ordered_proteins.csv' created successfully!")


New file 'TAIR_Batch_Ex_symbol_ordered_proteins.csv' created successfully!


In [41]:
import pandas as pd

# The table is read from and written back to the same path under datafiles/.
# Writing to the bare file name would leave datafiles/ordered_proteins_1.csv
# without the Protein_name column, although the later domain-prediction merge
# reads that file and selects Protein_name from it.
ordered_proteins_1_path = "datafiles/ordered_proteins_1.csv"

df_ordered_proteins = pd.read_csv(ordered_proteins_1_path)

df_annotation = pd.read_csv("datafiles/Final_annotation.csv")

# If this cell already ran, the file has a Protein_name column; drop it so the
# merge does not produce Protein_name_x / Protein_name_y on a re-run.
df_ordered_proteins = df_ordered_proteins.drop(columns=["Protein_name"], errors="ignore")

# Left-join so every ordered protein is kept even without an annotation.
df_merged = pd.merge(df_ordered_proteins, df_annotation[['Gene', 'Protein_name']], on='Gene', how='left')

df_merged.to_csv(ordered_proteins_1_path, index=False)

print("Protein_name has been added to ordered_proteins_1.csv file.")


Protein_name has been added to ordered_proteins_1.csv file.


In [13]:
# Sort domain_prediction.csv by ordered_proteins.txt

import pandas as pd

ordered_proteins_path = 'datafiles/ordered_proteins.txt'
domain_prediction_path = 'datafiles/domain_prediction.csv'
output_path = 'datafiles/sorted_domain_prediction.csv'

# Read the target protein order, one name per line.
with open(ordered_proteins_path, 'r') as file:
    ordered_proteins = file.read().splitlines()

# Read the domain prediction table.
domain_df = pd.read_csv(domain_prediction_path)

protein_column_name = 'Protein'

# Protein names as they appear in the CSV, in row order.
proteins_in_csv = domain_df[protein_column_name].tolist()

# Proteins present in the CSV but absent from the TXT file.
# The set is only used for membership; reporting and ordering walk the CSV
# rows instead, because set iteration order for strings can differ between
# interpreter runs and would make the output file order irreproducible.
missing_proteins = set(proteins_in_csv) - set(ordered_proteins)
extra_proteins = [name for name in dict.fromkeys(proteins_in_csv) if name in missing_proteins]
if missing_proteins:
    print("The following proteins are in the CSV but not in the TXT file:")
    for protein in extra_proteins:
        print(protein)

# Proteins present in the TXT file but absent from the CSV, reported in TXT order.
missing_in_csv = set(ordered_proteins) - set(proteins_in_csv)
if missing_in_csv:
    print("The following proteins are in the TXT file but not in the CSV file:")
    for protein in dict.fromkeys(ordered_proteins):
        if protein in missing_in_csv:
            print(protein)

# Reorder the rows to follow ordered_proteins.txt, then append the CSV-only
# proteins in their original CSV order. Proteins listed in the TXT file but
# missing from the CSV become rows whose other columns are empty.
sorted_domain_df = domain_df.set_index(protein_column_name).reindex(ordered_proteins + extra_proteins).reset_index()

# Write the sorted table.
sorted_domain_df.to_csv(output_path, index=False, sep=',')


Index(['Protein', 'Signal peptide', 'ECD start', 'ECD end', 'TM', 'Cytoplasm'], dtype='object')
The following proteins are in the CSV but not in the TXT file:
AT3G13690.1
AT1G70740.1
AT3G46760.1
AT4G23240.1
AT4G00960.1
AT1G51620.1
AT1G55200.1
AT5G15730.1
AT3G25490.1
AT5G41680.1
AT5G07620.1
AT5G54590.2
AT3G51990.1
AT5G61570.1
AT5G23170.1
AT5G42440.1
AT1G54820.1
AT2G32800.1
AT3G58690.1
AT5G56790.1
AT3G46410.1
AT1G70450.1
AT3G09010.1
AT2G41140.1
AT1G28390.1
AT3G57120.1
AT5G13290.1
AT4G11890.3
AT1G78530.1
AT1G16670.1


In [16]:
import pandas as pd

# set the file path
sorted_domain_prediction_path = 'datafiles/sorted_domain_prediction.csv'
ordered_proteins_1_path = 'datafiles/ordered_proteins_1.csv'
output_path = 'datafiles/merged_sorted_domain_prediction.csv'

# read CSV
sorted_domain_df = pd.read_csv(sorted_domain_prediction_path, sep=',')
ordered_proteins_1_df = pd.read_csv(ordered_proteins_1_path, sep=',')

# The domain prediction table has been seen with space-separated headers
# ('Signal peptide', 'ECD start', 'ECD end'), while the column selection below
# uses underscores. Normalising here makes both spellings work; headers that
# already use underscores are left unchanged.
sorted_domain_df = sorted_domain_df.rename(columns=lambda name: name.replace(' ', '_'))

# merge two dataframes by Protein column; rows of the domain table are kept
# even when they have no match in ordered_proteins_1.csv
merged_df = pd.merge(sorted_domain_df, ordered_proteins_1_df, on='Protein', how='left')

# reorder columns, failing with an explicit list when an input lacks a column
column_order = ['Tree_order','Gene', 'Protein', 'Protein_name', 'Type', 'Class', 'Signal_peptide', 'ECD_start', 'ECD_end', 'TM', 'Cytoplasm']
missing_columns = [col for col in column_order if col not in merged_df.columns]
if missing_columns:
    raise KeyError(f"Columns missing from the merged table: {missing_columns}")
merged_df = merged_df[column_order]

# save csv
merged_df.to_csv(output_path, index=False, sep=',')
