## **Library** ##

In [None]:
from Bio import SeqIO
from Bio.Seq import translate
import os

## **All GBK Files** ##

In [None]:
# Folder holding the GenBank (.gbk) files that the functions below parse.
input_dir_gbk = "/media/preonath/Pharokka_result_gbk_files"

# Keep only the directory entries whose name ends with ".gbk".
# os.listdir returns entries in arbitrary order.
gbk_files = list(
    filter(lambda entry_name: entry_name.endswith(".gbk"), os.listdir(input_dir_gbk))
)
print(gbk_files)

## **Extract the translated sequence of the tail fiber protein gene** ##

In [None]:
def extract_translation_seq_of_tail_fiber_gene(input_dir_gbk, gbk_files):
    """Collect the tail fiber protein translation for each phage.

    Each file in ``gbk_files`` is parsed with ``SeqIO.parse(..., "gb")`` and
    every feature is checked for a qualifier whose value contains
    ``"tail fiber protein"``.

    Args:
        input_dir_gbk: Directory that contains the GenBank files.
        gbk_files: File names, joined onto ``input_dir_gbk``, to scan.

    Returns:
        A dict mapping phage name to the translated sequence as a plain
        string, with keys in sorted order. The phage name is the file's base
        name without its extension, cut at the first ``"_pharokka"``. When a
        file has several matching features the last one wins. Phages with no
        matching feature that carries a translation are left out.
    """
    target_product = "tail fiber protein"
    translations = {}
    for gbk_file in gbk_files:
        file_path = os.path.join(input_dir_gbk, gbk_file)
        file_name = os.path.basename(file_path)
        phage_name = os.path.splitext(file_name)[0].split('_pharokka')[0]

        for rec in SeqIO.parse(file_path, "gb"):
            for feature in rec.features:
                qualifiers = feature.qualifiers
                if not any(target_product in val for val in qualifiers.values()):
                    continue

                # A matching feature without a translation has no sequence to
                # report; skip it instead of raising KeyError.
                translation = qualifiers.get('translation')
                if not translation:
                    continue

                # Qualifier values are normally lists of strings. Store the
                # sequence itself rather than the repr of that list.
                if not isinstance(translation, str):
                    translation = "".join(translation)
                translations[phage_name] = translation

    return {name: translations[name] for name in sorted(translations)}

# Run the tail fiber extraction over every listed GenBank file.
# The returned dict is not stored in a variable.
extract_translation_seq_of_tail_fiber_gene(input_dir_gbk, gbk_files)

## **Extract the translated sequence of the terminase large subunit gene** ##

In [None]:
def extract_translation_seq_of_terminase_gene(input_dir_gbk, gbk_files):
    """Collect the terminase large subunit translation for each phage.

    Each file in ``gbk_files`` is parsed with ``SeqIO.parse(..., "gb")`` and
    every feature is checked for a qualifier whose value contains
    ``"terminase large subunit"``.

    Args:
        input_dir_gbk: Directory that contains the GenBank files.
        gbk_files: File names, joined onto ``input_dir_gbk``, to scan.

    Returns:
        A dict mapping phage name to the translated sequence as a plain
        string, with keys in sorted order. The phage name is the file's base
        name without its extension, cut at the first ``"_pharokka"``. When a
        file has several matching features the last one wins. Phages with no
        matching feature that carries a translation are left out.
    """
    target_product = "terminase large subunit"
    translations = {}
    for gbk_file in gbk_files:
        file_path = os.path.join(input_dir_gbk, gbk_file)
        file_name = os.path.basename(file_path)
        phage_name = os.path.splitext(file_name)[0].split('_pharokka')[0]

        for rec in SeqIO.parse(file_path, "gb"):
            for feature in rec.features:
                qualifiers = feature.qualifiers
                if not any(target_product in val for val in qualifiers.values()):
                    continue

                # A matching feature without a translation has no sequence to
                # report; skip it instead of raising KeyError.
                translation = qualifiers.get('translation')
                if not translation:
                    continue

                # Qualifier values are normally lists of strings. Store the
                # sequence itself rather than the repr of that list.
                if not isinstance(translation, str):
                    translation = "".join(translation)
                translations[phage_name] = translation

    return {name: translations[name] for name in sorted(translations)}


# Run the terminase extraction over every listed GenBank file.
# The returned dict is not stored in a variable.
extract_translation_seq_of_terminase_gene(input_dir_gbk, gbk_files)

## **Join the tail fiber and terminase sequences** ##

In [None]:
def join_tail_sequence_terminase_sequence(input_dir_gbk, gbk_files,output_file):
    """Write tail fiber + terminase translations per phage to a FASTA-style file.

    Calls ``extract_translation_seq_of_tail_fiber_gene`` and
    ``extract_translation_seq_of_terminase_gene`` on the same inputs. Only
    phages present in both results are written. For each one the tail fiber
    sequence is followed directly by the terminase sequence.

    Args:
        input_dir_gbk: Directory that contains the GenBank files.
        gbk_files: File names inside ``input_dir_gbk`` to scan.
        output_file: Path of the file to create or overwrite.

    Returns:
        None. The records are written to ``output_file`` sorted by phage name.
    """
    tail_by_phage = extract_translation_seq_of_tail_fiber_gene(input_dir_gbk, gbk_files)
    terminase_by_phage = extract_translation_seq_of_terminase_gene(input_dir_gbk, gbk_files)

    # The extraction helpers may return the repr of a one-element list
    # ("['SEQ']"). Strip those characters from the sequence only, so that a
    # phage name containing a bracket or quote is not altered in the header.
    list_repr_chars = str.maketrans("", "", "[]'")

    shared_phages = sorted(tail_by_phage.keys() & terminase_by_phage.keys())
    with open(output_file, 'w') as f:
        for phage_name in shared_phages:
            joined = f"{tail_by_phage[phage_name]}{terminase_by_phage[phage_name]}"
            sequence = joined.translate(list_repr_chars)
            # NOTE(review): the blank line between header and sequence is kept
            # for compatibility with existing output; confirm downstream tools
            # accept it.
            f.write(f">{phage_name}\n\n{sequence}\n\n")

# Build the combined file in the current working directory.
join_tail_sequence_terminase_sequence(input_dir_gbk, gbk_files,"join_tail_sequence_terminase_sequence.fa")
