In [4]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqFeature import FeatureLocation
from Bio.Seq import UndefinedSequenceError
import os

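# SECURITY: a GitHub personal access token was pasted here in plain text and has been redacted. Revoke that token and load credentials from an environment variable (e.g. os.environ) instead of storing them in the notebook.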

# Parsing host annotations

In [2]:
# Load the INPHARED host annotations and switch to the project's column names
# in a single chained expression.
hosts = (
    pd.read_csv(
        "../data/raw/inphared_8Sep2023/8Sep2023_vConTACT2_host_annotations.tsv",
        sep="\t",
    )
    .rename(
        columns={
            "Node_ID": "id_inphared",
            "Description": "isolation_host_inphared",
            "Host": "host_inphared",
        }
    )
)

# Last expression of the cell: render the table to confirm the renamed columns
hosts

Unnamed: 0,id_inphared,isolation_host_inphared,host_inphared,Colour (Host)
0,GU339467,Mycobacterium phage RedRock,Mycobacterium,#8C1292
1,MF417929,Uncultured Caudovirales phage clone 2F_1,Unspecified,#D3D3D3
2,MH616963,crAssphage sp. isolate ctbg_1,Unspecified,#D3D3D3
3,MH552500,crAssphage sp. isolate ctcc615,Unspecified,#D3D3D3
4,BK010471,Carjivirus communis,Unspecified,#D3D3D3
...,...,...,...,...
20885,AY526908,Bordetella phage BMP-1,Bordetella,#1479C4
20886,AJ270057,Chlamydia phage 2,Chlamydia,#9A4AD6
20887,AB016282,Bacillus phage phi105,Bacillus,#74AF9B
20888,AF109874,Lactococcus phage Tuc2009,Lactococcus,#395C1F


In [3]:
def engineer_features(folder):
    """Build a one-row feature table for the phage annotated in one Pharokka output folder.

    The INPHARED host annotations are restricted to the record ids found in
    ``{folder}/pharokka.gbk`` and combined with three per-contig Pharokka tables
    (length/GC/coding density, CDS function counts, strand counts).

    Parameters
    ----------
    folder : str
        Pharokka output directory containing ``pharokka.gbk``,
        ``pharokka_length_gc_cds_density.tsv``, ``pharokka_cds_functions.tsv`` and
        ``pharokka_cds_final_merged_output.tsv``.

    Returns
    -------
    pandas.DataFrame
        All merged rows collapsed into one with ``groupby(...).first()`` and
        restricted to the columns listed in ``columns`` below.
        NOTE(review): the collapse assumes one genome per folder - confirm.

    Raises
    ------
    KeyError
        From the final column selection when an expected column is missing.
    """
    jumbo_min_length = 200000  # genomes at least this long are flagged as jumbophages

    # Accession and sequence of every record in the GenBank file
    ids = []
    sequences = []
    genbank_file = f'{folder}/pharokka.gbk'
    for record in SeqIO.parse(genbank_file, "genbank"):
        ids.append(record.id)
        sequences.append(str(record.seq))

    df = pd.read_csv("../data/raw/inphared_8Sep2023/8Sep2023_vConTACT2_host_annotations.tsv", sep = "\t")
    df = df.rename(columns={
        "Node_ID": "id_inphared",
        "Description": "isolation_host_inphared",
        "Host": "host_inphared"
    })

    # FIX: keep only the annotation rows of the genomes in this folder. Merging the
    # whole table and collapsing it with first() used to return the id, host and
    # jumbophage flag of an unrelated phage.
    df = df[df['id_inphared'].isin(ids)]

    # Outer merge so that a genome missing from the annotation table still keeps
    # its id and sequence (its host columns stay NaN).
    sequence_df = pd.DataFrame({'id_inphared': ids, 'sequence': sequences})
    df = pd.merge(df, sequence_df, on='id_inphared', how='outer')
    del(sequence_df)

    ids_longest_first = sorted(set(ids), key=len, reverse=True)

    def contig_to_accession(contig):
        # FIX: a fixed 8-character slice truncates longer accessions such as
        # 'NC_001423'. Prefer the GenBank record id the contig name starts with and
        # fall back to the legacy 8-character prefix otherwise.
        # NOTE(review): assumes Pharokka contig names start with the record id - confirm.
        contig = str(contig)
        for accession in ids_longest_first:
            if contig.startswith(accession):
                return accession
        return contig[:8]

    # Length, GC content, translation table and coding density per contig
    df_length_gc_cds_density = pd.read_csv(f"{folder}/pharokka_length_gc_cds_density.tsv", sep="\t")
    df_length_gc_cds_density['contig'] = df_length_gc_cds_density['contig'].map(contig_to_accession, na_action='ignore')
    df = pd.merge(df, df_length_gc_cds_density, left_on='id_inphared', right_on='contig', how='outer')
    del(df_length_gc_cds_density)

    # CDS function counts: one column per (shortened) category name
    df_cds = pd.read_csv(f"{folder}/pharokka_cds_functions.tsv", sep="\t")
    replacements = {
        "DNA, RNA and nucleotide metabolism":"nucleotide_metabolism",
        "head and packaging": "head_packaging",
        "moron, auxiliary metabolic gene and host takeover": "host_takeover",
        "transcription regulation": "transcription",
        # 'unkown_function' (sic) is the column name written to the output CSV
        "unknown function": "unkown_function"
    }
    df_cds['Description'] = df_cds['Description'].replace(replacements)
    df_cds = df_cds.pivot(index='contig', columns='Description', values='Count').reset_index()
    df_cds['contig'] = df_cds['contig'].map(contig_to_accession, na_action='ignore')
    df = pd.merge(df, df_cds, left_on='id_inphared', right_on='contig', how='outer')
    del(df_cds)

    # Number of CDS on each strand per contig
    df_frame = pd.read_csv(f"{folder}/pharokka_cds_final_merged_output.tsv", sep="\t",  low_memory=False)
    frame_counts = df_frame.groupby('contig')['frame'].value_counts().unstack(fill_value=0)

    # Ensure both '+' and '-' columns are present
    frame_counts['+'] = frame_counts.get('+', 0)
    frame_counts['-'] = frame_counts.get('-', 0)
    frame_counts = frame_counts.rename(columns={'+': 'frame_positive', '-': 'frame_negative'})
    # A single reset_index is enough; the second one only added a throw-away 'index' column
    frame_counts = frame_counts.reset_index().rename_axis(None, axis=1)
    frame_counts['contig'] = frame_counts['contig'].map(contig_to_accession, na_action='ignore')
    df = pd.merge(df, frame_counts, left_on='id_inphared', right_on='contig', how='outer')

    # Collapse everything into one row: first non-null value of every column
    df = df.rename(columns={'id_inphared': 'id'})
    df['dummy_index'] = 0
    df = df.groupby('dummy_index').first()
    df = df.reset_index(drop=True)

    # Derived features are computed after the collapse so they always use the
    # values that end up in the returned row, whatever the row order was.
    df['jumbophage'] = (df['length'] >= jumbo_min_length).astype(int)
    df['positive_strand_%'] = round(df['frame_positive'] / df['CDS'] * 100,2)
    df['negative_strand_%'] = round(df['frame_negative'] / df['CDS'] * 100,2)

    columns = ['id','host_inphared','isolation_host_inphared','length', 'jumbophage', 'gc_perc',
        'CDS','frame_positive', 'frame_negative', 'cds_coding_density',  'positive_strand_%',
        "negative_strand_%",
        'CARD_AMR_Genes',
        'CRISPRs', 'VFDB_Virulence_Factors', 'connector',
       'head_packaging', 'host_takeover', 'integration and excision', 'lysis',
       'nucleotide_metabolism', 'other', 'tRNAs', 'tail', 'tmRNAs',
       'transcription', 'unkown_function', 'transl_table', 'sequence']

    df = df[columns]
    return df


In [4]:

def staining_feature(staining_df, features_df):
    """Attach the ``staining`` label to every row of ``features_df``.

    ``staining_df`` is passed straight to ``pd.read_csv`` (first column used as
    the index); only its ``Accession`` and ``staining`` columns are kept, with
    ``Accession`` renamed to ``id`` so it can be left-joined onto ``features_df``.
    Rows of ``features_df`` without a match get NaN in ``staining``.
    """
    stain_lookup = (
        pd.read_csv(staining_df, index_col=0)
        .loc[:, ['Accession', 'staining']]
        .rename(columns={'Accession': 'id'})
    )
    return features_df.merge(stain_lookup, on='id', how='left')

In [5]:
# Code to generate features of the phages in a given folder 
def process_pharokka_output(args_data, output_csv):
    """Run ``engineer_features`` on every Pharokka folder and append the rows to a CSV.

    Parameters
    ----------
    args_data : str
        Directory whose sub-directories are Pharokka output folders.
    output_csv : str
        CSV file to append to. It is created empty when missing; the header is
        written only while the file is still empty.

    Sub-directories without a ``pharokka.gbk`` file are reported and skipped;
    entries that are not directories are skipped silently.
    ``engineer_features`` must already be defined in the notebook.
    """
    # Make sure the output file exists even when nothing gets processed
    if not os.path.exists(output_csv):
        with open(output_csv, 'w'):
            pass

    # sorted(): os.listdir returns entries in arbitrary order, which made the
    # row order of the CSV differ between runs and machines
    for folder in sorted(os.listdir(args_data)):
        folder_path = os.path.join(args_data, folder)

        if not os.path.isdir(folder_path):
            continue

        if not os.path.isfile(os.path.join(folder_path, "pharokka.gbk")):
            # This branch is only reached for directories, so the message no
            # longer claims the path might not be a directory
            print(f"Skipped {folder_path}: 'pharokka.gbk' file not found")
            continue

        print(f"Processing {folder_path}...")
        df = engineer_features(folder_path)

        # newline='' lets the csv machinery control line endings on every platform
        with open(output_csv, 'a', newline='') as f:
            df.to_csv(f, header=f.tell()==0, index=False)

args_data = '../data/interim/pharokka/10_folders/'  # Replace with your main folder path
output_csv = '../data/test/new_engineer_features.csv'

print("-> Starting Feature Engineering ----------------------")
if not os.path.isdir(args_data):
    # Previously the old output was deleted and "completed" was printed even
    # though nothing had been processed
    print(f"Input directory not found, nothing was processed: {args_data}")
else:
    # process_pharokka_output appends, so start from a clean file
    if os.path.exists(output_csv):
        print("Output file already exists. Overwriting it...")
        os.remove(output_csv)

    print(f"Processing Pharokka phages in directory: {args_data}")
    process_pharokka_output(args_data, output_csv)

    # Print completion message
    print(f"Feature engineering completed and data stored to CSV in {output_csv}.")

-> Starting Feature Engineering ----------------------
Output file already exists. Overwriting it...
Processing Pharokka phages in directory: ../data/interim/pharokka/10_folders/
Processing ../data/interim/pharokka/10_folders/sequence_16443.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16444.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16445.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16446.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16447.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16448.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16449.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16450.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16451.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16452.fasta.pharokka...
Processing ../data/in

In [6]:
# Reload the feature table written above and list its columns as a sanity check
df = pd.read_csv(filepath_or_buffer='../data/test/new_engineer_features.csv')
df.columns

Index(['id', 'host_inphared', 'isolation_host_inphared', 'length',
       'jumbophage', 'gc_perc', 'CDS', 'frame_positive', 'frame_negative',
       'cds_coding_density', 'positive_strand_%', 'negative_strand_%',
       'CARD_AMR_Genes', 'CRISPRs', 'VFDB_Virulence_Factors', 'connector',
       'head_packaging', 'host_takeover', 'integration and excision', 'lysis',
       'nucleotide_metabolism', 'other', 'tRNAs', 'tail', 'tmRNAs',
       'transcription', 'unkown_function', 'transl_table', 'sequence'],
      dtype='object')

# Fixing engineering

New engineering code

In [9]:
annotations = pd.read_csv("../data/raw/inphared_8Sep2023/8Sep2023_vConTACT2_host_annotations.tsv", sep = "\t")
ids_not_in_annotations = []

# Renamed copy captured once when this cell runs. engineer_features used to re-read
# the TSV on every call; a private name is used so that other cells rebinding
# `annotations` cannot change what the function sees.
_renamed_annotations = annotations.rename(columns={
    "Node_ID": "id_inphared",
    "Description": "isolation_host_inphared",
    "Host": "host_inphared"
})

def engineer_features(folder):
    """Build a one-row feature table for the phage annotated in one Pharokka output folder.

    Parameters
    ----------
    folder : str
        Pharokka output directory containing ``pharokka.gbk``,
        ``pharokka_length_gc_cds_density.tsv``, ``pharokka_cds_functions.tsv`` and
        ``pharokka_cds_final_merged_output.tsv``.

    Returns
    -------
    tuple (pandas.DataFrame, list)
        The feature row (all merged rows collapsed with ``groupby(...).first()``)
        and the module-level list ``ids_not_in_annotations``, to which every
        record id that has no row in the annotation table is appended.
        NOTE(review): the collapse assumes one genome per folder - confirm.

    Raises
    ------
    ValueError
        If ``pharokka.gbk`` contains no records (previously an UnboundLocalError).
    KeyError
        From the final column selection when an expected column is missing.
    """
    jumbo_min_length = 200000  # genomes at least this long are flagged as jumbophages

    # Accession and sequence of every record in the GenBank file
    record_ids = []
    sequences = []
    genbank_file = f'{folder}/pharokka.gbk'
    for record in SeqIO.parse(genbank_file, "genbank"):
        record_ids.append(record.id)
        sequences.append(str(record.seq))
    if not record_ids:
        raise ValueError(f"No GenBank records found in {genbank_file}")

    df = _renamed_annotations[_renamed_annotations['id_inphared'].isin(record_ids)]

    # Remember every record (not just the last one) that has no annotation row
    annotated_ids = set(df['id_inphared'])
    for record_id in record_ids:
        if record_id not in annotated_ids:
            ids_not_in_annotations.append(record_id)

    # Outer merge so that a genome missing from the annotation table still keeps
    # its id and sequence (its host columns stay NaN).
    sequence_df = pd.DataFrame({'id_inphared': record_ids, 'sequence': sequences})
    df = pd.merge(df, sequence_df, on='id_inphared', how='outer')
    del(sequence_df)

    ids_longest_first = sorted(set(record_ids), key=len, reverse=True)

    def contig_to_accession(contig):
        # A fixed 8-character slice truncates longer accessions such as
        # 'NC_001423'. Prefer the GenBank record id the contig name starts with and
        # fall back to the legacy 8-character prefix otherwise.
        # NOTE(review): assumes Pharokka contig names start with the record id - confirm.
        contig = str(contig)
        for accession in ids_longest_first:
            if contig.startswith(accession):
                return accession
        return contig[:8]

    # Length, GC content, translation table and coding density per contig
    df_length_gc_cds_density = pd.read_csv(f"{folder}/pharokka_length_gc_cds_density.tsv", sep="\t")
    df_length_gc_cds_density['contig'] = df_length_gc_cds_density['contig'].map(contig_to_accession, na_action='ignore')
    df = pd.merge(df, df_length_gc_cds_density, left_on='id_inphared', right_on='contig', how='outer')
    del(df_length_gc_cds_density)

    # CDS function counts: one column per (shortened) category name
    df_cds = pd.read_csv(f"{folder}/pharokka_cds_functions.tsv", sep="\t")
    replacements = {
        "DNA, RNA and nucleotide metabolism":"nucleotide_metabolism",
        "head and packaging": "head_packaging",
        "moron, auxiliary metabolic gene and host takeover": "host_takeover",
        "transcription regulation": "transcription",
        # 'unkown_function' (sic) is the column name written to the output CSV
        "unknown function": "unkown_function"
    }
    df_cds['Description'] = df_cds['Description'].replace(replacements)
    df_cds = df_cds.pivot(index='contig', columns='Description', values='Count').reset_index()
    df_cds['contig'] = df_cds['contig'].map(contig_to_accession, na_action='ignore')
    df = pd.merge(df, df_cds, left_on='id_inphared', right_on='contig', how='outer')
    del(df_cds)

    # Number of CDS on each strand per contig
    df_frame = pd.read_csv(f"{folder}/pharokka_cds_final_merged_output.tsv", sep="\t",  low_memory=False)
    frame_counts = df_frame.groupby('contig')['frame'].value_counts().unstack(fill_value=0)

    # Ensure both '+' and '-' columns are present
    frame_counts['+'] = frame_counts.get('+', 0)
    frame_counts['-'] = frame_counts.get('-', 0)
    frame_counts = frame_counts.rename(columns={'+': 'frame_positive', '-': 'frame_negative'})
    # A single reset_index is enough; the second one only added a throw-away 'index' column
    frame_counts = frame_counts.reset_index().rename_axis(None, axis=1)
    frame_counts['contig'] = frame_counts['contig'].map(contig_to_accession, na_action='ignore')
    df = pd.merge(df, frame_counts, left_on='id_inphared', right_on='contig', how='outer')

    # Collapse everything into one row: first non-null value of every column
    df = df.rename(columns={'id_inphared': 'id'})
    df['dummy_index'] = 0
    df = df.groupby('dummy_index').first()
    df = df.reset_index(drop=True)

    # Derived features are computed after the collapse so they always use the
    # values that end up in the returned row, whatever the row order was.
    df['jumbophage'] = (df['length'] >= jumbo_min_length).astype(int)
    df['positive_strand_%'] = round(df['frame_positive'] / df['CDS'] * 100,2)
    df['negative_strand_%'] = round(df['frame_negative'] / df['CDS'] * 100,2)

    columns = ['id','host_inphared','isolation_host_inphared','length', 'jumbophage', 'gc_perc',
        'CDS','frame_positive', 'frame_negative', 'cds_coding_density',  'positive_strand_%',
        "negative_strand_%",'CARD_AMR_Genes', 'CRISPRs', 'VFDB_Virulence_Factors', 'connector',
       'head_packaging', 'host_takeover', 'integration and excision', 'lysis',
       'nucleotide_metabolism', 'other', 'tRNAs', 'tail', 'tmRNAs',
       'transcription', 'unkown_function', 'transl_table', 'sequence']

    df = df[columns]
    return df, ids_not_in_annotations




Processing files that contain NaNs in the final dataset

In [10]:
# Code to generate features of the phages in a given folder 
def process_pharokka_output(args_data, output_csv):
    """Run ``engineer_features`` on every Pharokka folder and append the rows to a CSV.

    Parameters
    ----------
    args_data : str
        Directory whose sub-directories are Pharokka output folders.
    output_csv : str
        CSV file to append to. It is created empty when missing; the header is
        written only while the file is still empty.

    Sub-directories without a ``pharokka.gbk`` file are reported and skipped;
    entries that are not directories are skipped silently.
    ``engineer_features`` must already be defined in the notebook and return a
    ``(DataFrame, list)`` pair; only the DataFrame is written here.
    """
    # Make sure the output file exists even when nothing gets processed
    if not os.path.exists(output_csv):
        with open(output_csv, 'w'):
            pass

    # sorted(): os.listdir returns entries in arbitrary order, which made the
    # row order of the CSV differ between runs and machines
    for folder in sorted(os.listdir(args_data)):
        folder_path = os.path.join(args_data, folder)

        if not os.path.isdir(folder_path):
            continue

        if not os.path.isfile(os.path.join(folder_path, "pharokka.gbk")):
            # This branch is only reached for directories, so the message no
            # longer claims the path might not be a directory
            print(f"Skipped {folder_path}: 'pharokka.gbk' file not found")
            continue

        print(f"Processing {folder_path}...")
        df, ids_not_in_annotations = engineer_features(folder_path)

        # newline='' lets the csv machinery control line endings on every platform
        with open(output_csv, 'a', newline='') as f:
            df.to_csv(f, header=f.tell()==0, index=False)

args_data = '../data/interim/pharokka/nan_folders/'  # Replace with your main folder path
output_csv = '../data/test/nan_engineer_features.csv'
# args_data = '../data/interim/pharokka/10_folders/'  # Replace with your main folder path
# output_csv = '../data/test/test10_engineer_features.csv'

# args_data = '../data/interim/pharokka/pharokka_full_output/'  # Replace with your main folder path
# output_csv = '../data/test/fixing_engineer_features.csv'

print("-> Starting Feature Engineering ----------------------")
if not os.path.isdir(args_data):
    # Previously the old output was deleted and "completed" was printed even
    # though nothing had been processed
    print(f"Input directory not found, nothing was processed: {args_data}")
else:
    # process_pharokka_output appends, so start from a clean file
    if os.path.exists(output_csv):
        print("Output file already exists. Overwriting it...")
        os.remove(output_csv)

    print(f"Processing Pharokka phages in directory: {args_data}")
    process_pharokka_output(args_data, output_csv)

    # Print completion message
    print(f"Feature engineering completed and data stored to CSV in {output_csv}.")

-> Starting Feature Engineering ----------------------
Output file already exists. Overwriting it...
Processing Pharokka phages in directory: ../data/interim/pharokka/nan_folders/
Processing ../data/interim/pharokka/nan_folders/sequence_10925.fasta.pharokka...
Processing ../data/interim/pharokka/nan_folders/sequence_10926.fasta.pharokka...
Processing ../data/interim/pharokka/nan_folders/sequence_10927.fasta.pharokka...
Processing ../data/interim/pharokka/nan_folders/sequence_10928.fasta.pharokka...
Processing ../data/interim/pharokka/nan_folders/sequence_10929.fasta.pharokka...
Processing ../data/interim/pharokka/nan_folders/sequence_10930.fasta.pharokka...
Processing ../data/interim/pharokka/nan_folders/sequence_10931.fasta.pharokka...
Processing ../data/interim/pharokka/nan_folders/sequence_10932.fasta.pharokka...
Processing ../data/interim/pharokka/nan_folders/sequence_10933.fasta.pharokka...
Processing ../data/interim/pharokka/nan_folders/sequence_10934.fasta.pharokka...
Processing

Some analysis of the hosts annotation data to see what is different between two rows in the engineered features that look the same

In [None]:
# Look up the two accessions under investigation in the 1Feb2024 annotation table
ann = pd.read_csv("../data/raw/inphared_8Sep2023/1Feb2024_vConTACT2_host_annotations.tsv.gz", sep = "\t")
for node_id in ("PZACG", "NC_001423"):
    display(ann[ann["Node_ID"] == node_id])


Finding the ids that are in the GenBank files but not in the annotation file, and saving them in a list. For that, I am writing a small standalone script derived from `engineer_features`.

In [38]:
# Host annotation table that get_ids checks the GenBank record ids against
annotations = pd.read_csv(
    "../data/raw/inphared_8Sep2023/8Sep2023_vConTACT2_host_annotations.tsv",
    sep="\t",
)

def get_ids(folder, annotations):
    """Return the GenBank record ids of a Pharokka folder that have no annotation row.

    Parameters
    ----------
    folder : str
        Directory containing ``pharokka.gbk``.
    annotations : pandas.DataFrame
        Annotation table; only its ``Node_ID`` column is used.

    Returns
    -------
    list
        Ids of the records in ``pharokka.gbk`` that do not occur in
        ``annotations["Node_ID"]``, in file order. Empty when every record is
        annotated or the file holds no records. Each missing id is also printed.
    """
    # Set lookup instead of recomputing .unique() and scanning it for every record
    known_ids = set(annotations["Node_ID"])

    ids_not_in_annotations = []
    genbank_file = os.path.join(folder, 'pharokka.gbk')
    for record in SeqIO.parse(genbank_file, 'genbank'):
        # FIX: the check is now inside the loop. It used to run once after the
        # loop, so only the last record was tested and an empty file raised
        # UnboundLocalError.
        if record.id not in known_ids:
            ids_not_in_annotations.append(record.id)
            print(record.id, "is not in the annotations file")

    return ids_not_in_annotations

args_data ='../data/interim/pharokka/pharokka_full_output/' 

output_file_path = '../data/test/ids_with_nan.txt'  # Define the output file path outside the loop

# Accumulate the missing ids of all folders in one list. Before, the variable
# held only the last folder's result and the file was opened in append mode
# without ever being cleared, so re-running the cell duplicated its content.
ids_not_in_annotations = []
for folder in sorted(os.listdir(args_data)):  # sorted for a reproducible order
    folder_path = os.path.join(args_data, folder)
    if os.path.isdir(folder_path) and "pharokka.gbk" in os.listdir(folder_path):
        print(f"Processing {folder_path}...")
        ids_not_in_annotations.extend(get_ids(folder_path, annotations))

# Write the complete list once, replacing any previous file
with open(output_file_path, 'w') as file:
    for missing_id in ids_not_in_annotations:
        file.write(missing_id + '\n')

Processing ../data/interim/pharokka/pharokka_full_output/sequence_001.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_002.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_003.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_004.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_005.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_006.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_007.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_008.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_009.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_010.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_011.fasta.pharokka...
Processing ../data/interim/pharo

In [None]:
def save_list_to_txt(lst, file_path):
    """Write the items of ``lst`` to ``file_path``, one ``str(item)`` per line.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(file_path, 'w') as handle:
        handle.writelines(str(entry) + '\n' for entry in lst)

# Example usage: write the ids currently held in `ids_not_in_annotations` to a text
# file. The file is opened in write mode, so its previous content is replaced.
output_csv = '../data/test/ids_with_nan.txt'
save_list_to_txt(lst=ids_not_in_annotations, file_path=output_csv)

Comparing with hosts annotations

In [26]:
# Load the INPHARED host annotations and switch to the project's column names
# in a single chained expression.
hosts = (
    pd.read_csv(
        "../data/raw/inphared_8Sep2023/8Sep2023_vConTACT2_host_annotations.tsv",
        sep="\t",
    )
    .rename(
        columns={
            "Node_ID": "id_inphared",
            "Description": "isolation_host_inphared",
            "Host": "host_inphared",
        }
    )
)

# Last expression of the cell: render the table to confirm the renamed columns
hosts

['NC_001423']

## Yet another engineering code
I think I was using the wrong input file to extract the phage ids. I am now going to use the iTOL txt file instead.

In [46]:
# Parsing the itol text file
import csv
import re

# Define the input and output file paths
input_file_path = "../data/raw/inphared_8Sep2023/1Jan2024_itol_host_annotations.txt"
output_file_path = "../data/raw/inphared_8Sep2023/1Jan2024_itol_host_annotations.csv"

# Number of leading lines of the iTOL file that are skipped before the data rows
ITOL_HEADER_LINES = 24

# Open the input and output files
with open(input_file_path, "r") as input_file, open(output_file_path, "w", newline='') as output_file:
    csv_writer = csv.writer(output_file)

    # Write the header to the CSV file
    csv_writer.writerow(["id", "host"])

    for line_number, line in enumerate(input_file, start=1):
        # Skip the header block. Counting lines instead of calling next() also
        # copes with files shorter than the header (no StopIteration).
        if line_number <= ITOL_HEADER_LINES:
            continue

        # Split on a tab or a single space
        # NOTE(review): a host name containing spaces would be cut at its first
        # word by this split - confirm against the iTOL file.
        columns = re.split(r'\t| ', line.strip())

        # Blank or incomplete lines have no third field; skip them instead of
        # failing with IndexError
        if len(columns) < 3:
            continue

        # First field is the id, third field is the host
        csv_writer.writerow([columns[0], columns[2]])

print("CSV file has been created successfully.")

CSV file has been created successfully.


In [50]:
# Read back the CSV written above and check that a known accession is present
annotations = pd.read_csv(filepath_or_buffer=output_file_path)
annotations.loc[annotations["id"] == "OQ818694"]

Unnamed: 0,id,host
26927,OQ818694,Erwinia


Now that I have the csv file, I am going to engineer the features using that file

In [55]:
def engineer_features(folder, annotations_file):
    """Build a one-row feature table for the phage annotated in one Pharokka output folder.

    Parameters
    ----------
    folder : str
        Pharokka output directory containing ``pharokka.gbk``,
        ``pharokka_length_gc_cds_density.tsv``, ``pharokka_cds_functions.tsv`` and
        ``pharokka_cds_final_merged_output.tsv``.
    annotations_file : pandas.DataFrame
        Host annotations with at least the columns ``id_inphared`` and
        ``host_inphared``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        All merged rows collapsed into one with ``groupby(...).first()`` and
        restricted to the columns listed in ``columns`` below. A genome without an
        annotation row keeps its id and sequence; its host is NaN.
        NOTE(review): the collapse assumes one genome per folder - confirm.

    Raises
    ------
    ValueError
        If ``pharokka.gbk`` contains no records (previously an UnboundLocalError).
    KeyError
        From the final column selection when an expected column is missing.
    """
    jumbo_min_length = 200000  # genomes at least this long are flagged as jumbophages

    # Accession and sequence of every record in the GenBank file
    record_ids = []
    sequences = []
    genbank_file = f'{folder}/pharokka.gbk'
    for record in SeqIO.parse(genbank_file, "genbank"):
        record_ids.append(record.id)
        sequences.append(str(record.seq))
    if not record_ids:
        raise ValueError(f"No GenBank records found in {genbank_file}")

    df = annotations_file[annotations_file['id_inphared'].isin(record_ids)]

    # Outer merge so that a genome missing from the annotation table still keeps
    # its id and sequence instead of ending up with a NaN id.
    sequence_df = pd.DataFrame({'id_inphared': record_ids, 'sequence': sequences})
    df = pd.merge(df, sequence_df, on='id_inphared', how='outer')
    del(sequence_df)

    ids_longest_first = sorted(set(record_ids), key=len, reverse=True)

    def contig_to_accession(contig):
        # A fixed 8-character slice truncates longer accessions such as
        # 'NC_001423'. Prefer the GenBank record id the contig name starts with and
        # fall back to the legacy 8-character prefix otherwise.
        # NOTE(review): assumes Pharokka contig names start with the record id - confirm.
        contig = str(contig)
        for accession in ids_longest_first:
            if contig.startswith(accession):
                return accession
        return contig[:8]

    # Length, GC content, translation table and coding density per contig
    df_length_gc_cds_density = pd.read_csv(f"{folder}/pharokka_length_gc_cds_density.tsv", sep="\t")
    df_length_gc_cds_density['contig'] = df_length_gc_cds_density['contig'].map(contig_to_accession, na_action='ignore')
    df = pd.merge(df, df_length_gc_cds_density, left_on='id_inphared', right_on='contig', how='outer')
    del(df_length_gc_cds_density)

    # CDS function counts: one column per (shortened) category name
    df_cds = pd.read_csv(f"{folder}/pharokka_cds_functions.tsv", sep="\t")
    replacements = {
        "DNA, RNA and nucleotide metabolism":"nucleotide_metabolism",
        "head and packaging": "head_packaging",
        "moron, auxiliary metabolic gene and host takeover": "host_takeover",
        "transcription regulation": "transcription",
        # 'unkown_function' (sic) is the column name written to the output CSV
        "unknown function": "unkown_function"
    }
    df_cds['Description'] = df_cds['Description'].replace(replacements)
    df_cds = df_cds.pivot(index='contig', columns='Description', values='Count').reset_index()
    df_cds['contig'] = df_cds['contig'].map(contig_to_accession, na_action='ignore')
    df = pd.merge(df, df_cds, left_on='id_inphared', right_on='contig', how='outer')
    del(df_cds)

    # Number of CDS on each strand per contig
    df_frame = pd.read_csv(f"{folder}/pharokka_cds_final_merged_output.tsv", sep="\t",  low_memory=False)
    frame_counts = df_frame.groupby('contig')['frame'].value_counts().unstack(fill_value=0)

    # Ensure both '+' and '-' columns are present
    frame_counts['+'] = frame_counts.get('+', 0)
    frame_counts['-'] = frame_counts.get('-', 0)
    frame_counts = frame_counts.rename(columns={'+': 'frame_positive', '-': 'frame_negative'})
    # A single reset_index is enough; the second one only added a throw-away 'index' column
    frame_counts = frame_counts.reset_index().rename_axis(None, axis=1)
    frame_counts['contig'] = frame_counts['contig'].map(contig_to_accession, na_action='ignore')
    df = pd.merge(df, frame_counts, left_on='id_inphared', right_on='contig', how='outer')

    # Collapse everything into one row: first non-null value of every column
    df = df.rename(columns={'id_inphared': 'id'})
    df['dummy_index'] = 0
    df = df.groupby('dummy_index').first()
    df = df.reset_index(drop=True)

    # Derived features are computed after the collapse so they always use the
    # values that end up in the returned row, whatever the row order was.
    df['jumbophage'] = (df['length'] >= jumbo_min_length).astype(int)
    df['positive_strand_%'] = round(df['frame_positive'] / df['CDS'] * 100,2)
    df['negative_strand_%'] = round(df['frame_negative'] / df['CDS'] * 100,2)

    columns = ['id','host_inphared','length', 'jumbophage', 'gc_perc',
        'CDS','frame_positive', 'frame_negative', 'cds_coding_density',  'positive_strand_%',
        "negative_strand_%",'CARD_AMR_Genes', 'CRISPRs', 'VFDB_Virulence_Factors', 'connector',
       'head_packaging', 'host_takeover', 'integration and excision', 'lysis',
       'nucleotide_metabolism', 'other', 'tRNAs', 'tail', 'tmRNAs',
       'transcription', 'unkown_function', 'transl_table', 'sequence']

    df = df[columns]
    return df




In [57]:
# Code to generate features of the phages in a given folder 
def process_pharokka_output(args_data, output_csv, annotations_file):
    """Run ``engineer_features`` on every Pharokka folder and append the rows to a CSV.

    Parameters
    ----------
    args_data : str
        Directory whose sub-directories are Pharokka output folders.
    output_csv : str
        CSV file to append to. It is created empty when missing; the header is
        written only while the file is still empty.
    annotations_file : pandas.DataFrame
        Host annotation table, passed unchanged to ``engineer_features``.

    Sub-directories without a ``pharokka.gbk`` file are reported and skipped;
    entries that are not directories are skipped silently.
    ``engineer_features`` must already be defined in the notebook.
    """
    # Make sure the output file exists even when nothing gets processed
    if not os.path.exists(output_csv):
        with open(output_csv, 'w'):
            pass

    # sorted(): os.listdir returns entries in arbitrary order, which made the
    # row order of the CSV differ between runs and machines
    for folder in sorted(os.listdir(args_data)):
        folder_path = os.path.join(args_data, folder)

        if not os.path.isdir(folder_path):
            continue

        if not os.path.isfile(os.path.join(folder_path, "pharokka.gbk")):
            # This branch is only reached for directories, so the message no
            # longer claims the path might not be a directory
            print(f"Skipped {folder_path}: 'pharokka.gbk' file not found")
            continue

        print(f"Processing {folder_path}...")
        df = engineer_features(folder_path, annotations_file)

        # newline='' lets the csv machinery control line endings on every platform
        with open(output_csv, 'a', newline='') as f:
            df.to_csv(f, header=f.tell()==0, index=False)

# args_data = '../data/interim/pharokka/nan_folders/'  # Replace with your main folder path
# output_csv = '../data/test/nan_engineer_features.csv'
# args_data = '../data/interim/pharokka/10_folders/'  # Replace with your main folder path
# output_csv = '../data/test/test10_engineer_features.csv'

args_data = '../data/interim/pharokka/pharokka_full_output/'  # Replace with your main folder path
output_csv = '../data/test/final_hopefully_engineer_features.csv'


# Host annotations parsed from the iTOL file, renamed to the column names
# engineer_features filters and selects on
annotations_file = pd.read_csv("../data/raw/inphared_8Sep2023/1Jan2024_itol_host_annotations.csv")
annotations_file = annotations_file.rename(columns={
    "id": "id_inphared",
    "host": "host_inphared"
})
print("-> Starting Feature Engineering ----------------------")
if not os.path.isdir(args_data):
    # Previously the old output was deleted and "completed" was printed even
    # though nothing had been processed
    print(f"Input directory not found, nothing was processed: {args_data}")
else:
    # process_pharokka_output appends, so start from a clean file
    if os.path.exists(output_csv):
        print("Output file already exists. Overwriting it...")
        os.remove(output_csv)

    print(f"Processing Pharokka phages in directory: {args_data}")
    process_pharokka_output(args_data, output_csv, annotations_file)

    # Print completion message
    print(f"Feature engineering completed and data stored to CSV in {output_csv}.")

-> Starting Feature Engineering ----------------------
Processing Pharokka phages in directory: ../data/interim/pharokka/pharokka_full_output/
Processing ../data/interim/pharokka/pharokka_full_output/sequence_001.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_002.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_003.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_004.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_005.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_006.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_007.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_008.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence_009.fasta.pharokka...
Processing ../data/interim/pharokka/pharokka_full_output/sequence