In [2]:
import pandas as pd
from Bio import SeqIO
from Bio.SeqFeature import FeatureLocation
from Bio.Seq import UndefinedSequenceError
import os

# Parsing host annotations

In [7]:
# Load the INPHARED vConTACT2 host annotation table and, in the same step,
# rename its key columns to the names used in the rest of this notebook.
hosts = pd.read_csv(
    "../data/raw/inphared_8Sep2023/8Sep2023_vConTACT2_host_annotations.tsv",
    sep="\t",
).rename(
    columns={
        "Node_ID": "id_inphared",
        "Description": "isolation_host_inphared",
        "Host": "host_inphared",
    }
)

# Show the table so the renamed columns can be checked by eye
hosts

Unnamed: 0,id_inphared,isolation_host_inphared,host_inphared,Colour (Host)
0,GU339467,Mycobacterium phage RedRock,Mycobacterium,#8C1292
1,MF417929,Uncultured Caudovirales phage clone 2F_1,Unspecified,#D3D3D3
2,MH616963,crAssphage sp. isolate ctbg_1,Unspecified,#D3D3D3
3,MH552500,crAssphage sp. isolate ctcc615,Unspecified,#D3D3D3
4,BK010471,Carjivirus communis,Unspecified,#D3D3D3
...,...,...,...,...
20885,AY526908,Bordetella phage BMP-1,Bordetella,#1479C4
20886,AJ270057,Chlamydia phage 2,Chlamydia,#9A4AD6
20887,AB016282,Bacillus phage phi105,Bacillus,#74AF9B
20888,AF109874,Lactococcus phage Tuc2009,Lactococcus,#395C1F


In [41]:
def engineer_features(folder, host_annotations="../data/raw/inphared_8Sep2023/8Sep2023_vConTACT2_host_annotations.tsv"):
    """Build the feature table for the Pharokka output stored in ``folder``.

    The function reads four files from ``folder`` (``pharokka.gbk``,
    ``pharokka_length_gc_cds_density.tsv``, ``pharokka_cds_functions.tsv`` and
    ``pharokka_cds_final_merged_output.tsv``), joins them on the contig key and
    then attaches the host annotation of that contig.

    The result is anchored on the contigs listed in the length/GC table, so
    every returned row describes a contig that is actually present in
    ``folder``. (Collapsing the full host table with a constant-key
    ``groupby(...).first()`` would instead return the id/host of whichever host
    row happens to come first, paired with another genome's features.)

    Parameters
    ----------
    folder : str
        Path of one Pharokka output directory.
    host_annotations : str, optional
        Path of the tab-separated host annotation table. It must contain the
        columns ``Node_ID``, ``Description`` and ``Host``.

    Returns
    -------
    pandas.DataFrame
        One row per contig key, with the columns listed in ``columns`` below.
        Host columns are NaN for contigs missing from the host table.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the Pharokka tables.
    """
    # Every Pharokka table is keyed by the first 8 characters of the contig
    # name; the same normalisation is applied to the GenBank record ids so that
    # all sources share one join key.
    # NOTE(review): assumes accessions are at most 8 characters long - confirm.
    accession_len = 8

    # --- Host annotations (one row per accession) -------------------------
    hosts = (
        pd.read_csv(host_annotations, sep="\t")
        .rename(columns={
            "Node_ID": "id",
            "Description": "isolation_host_inphared",
            "Host": "host_inphared",
        })
        .loc[:, ["id", "host_inphared", "isolation_host_inphared"]]
        .drop_duplicates(subset="id")  # keep the join one-to-one
    )

    # --- Sequences from the GenBank file ------------------------------------
    # Topology is not part of the returned columns, so it is not extracted.
    genbank_df = pd.DataFrame(
        [
            (record.id[:accession_len], str(record.seq))
            for record in SeqIO.parse(f"{folder}/pharokka.gbk", "genbank")
        ],
        columns=["id", "sequence"],
    ).drop_duplicates(subset="id")

    # --- Length / GC / coding density: the anchor table ---------------------
    features = pd.read_csv(f"{folder}/pharokka_length_gc_cds_density.tsv", sep="\t")
    features["id"] = features["contig"].str.slice(0, accession_len)
    features = features.drop(columns="contig")

    # --- Per-category CDS counts --------------------------------------------
    cds_counts = pd.read_csv(f"{folder}/pharokka_cds_functions.tsv", sep="\t")

    # Shorten the verbose category names so they can be used as column names.
    # The spelling "unkown_function" is kept as-is because it is the column
    # name written to the output CSV.
    replacements = {
        "DNA, RNA and nucleotide metabolism": "nucleotide_metabolism",
        "head and packaging": "head_packaging",
        "moron, auxiliary metabolic gene and host takeover": "host_takeover",
        "transcription regulation": "transcription",
        "unknown function": "unkown_function",
    }
    cds_counts["Description"] = cds_counts["Description"].replace(replacements)

    # Long -> wide: one column per category, one row per contig
    cds_counts = (
        cds_counts.pivot(index="contig", columns="Description", values="Count")
        .rename_axis(None, axis=1)
        .reset_index()
    )
    cds_counts["id"] = cds_counts["contig"].str.slice(0, accession_len)
    cds_counts = cds_counts.drop(columns="contig")

    # --- Number of CDS on each strand ---------------------------------------
    cds_table = pd.read_csv(
        f"{folder}/pharokka_cds_final_merged_output.tsv", sep="\t", low_memory=False
    )
    frame_counts = cds_table.groupby("contig")["frame"].value_counts().unstack(fill_value=0)

    # A genome may have CDS on one strand only; make sure both columns exist
    for strand in ("+", "-"):
        if strand not in frame_counts.columns:
            frame_counts[strand] = 0

    frame_counts = (
        frame_counts.rename(columns={"+": "frame_positive", "-": "frame_negative"})
        .rename_axis(None, axis=1)
        .reset_index()
    )
    frame_counts["id"] = frame_counts["contig"].str.slice(0, accession_len)
    frame_counts = frame_counts[["id", "frame_positive", "frame_negative"]]

    # --- Join everything onto the contigs of this folder --------------------
    features = (
        features
        .merge(cds_counts, on="id", how="left")
        .merge(frame_counts, on="id", how="left")
        .merge(genbank_df, on="id", how="left")
        .merge(hosts, on="id", how="left")
    )

    # --- Derived features ----------------------------------------------------
    # Genomes of at least 200 kb are flagged as jumbophages (1/0)
    features["jumbophage"] = (features["length"] >= 200000).astype(int)

    # Share of CDS on each strand, in percent
    features["positive_strand_%"] = (features["frame_positive"] / features["CDS"] * 100).round(2)
    features["negative_strand_%"] = (features["frame_negative"] / features["CDS"] * 100).round(2)

    columns = [
        'id', 'host_inphared', 'isolation_host_inphared', 'length', 'jumbophage', 'gc_perc',
        'CDS', 'frame_positive', 'frame_negative', 'cds_coding_density', 'positive_strand_%',
        "negative_strand_%",

        'CARD_AMR_Genes',
        'CRISPRs', 'VFDB_Virulence_Factors', 'connector',
        'head_packaging', 'host_takeover', 'integration and excision', 'lysis',
        'nucleotide_metabolism', 'other', 'tRNAs', 'tail', 'tmRNAs',
        'transcription', 'unkown_function', 'transl_table', 'sequence',
    ]

    # Fixed column order: callers append the result to a header-less CSV
    return features[columns]


In [42]:

def staining_feature(staining_df, features_df):
    """Attach the ``staining`` column to a feature table.

    Parameters
    ----------
    staining_df : path or file-like
        CSV source handed to ``pandas.read_csv``; its first column is used as
        the index and it must contain ``Accession`` and ``staining`` columns.
    features_df : pandas.DataFrame
        Feature table with an ``id`` column to match against ``Accession``.

    Returns
    -------
    pandas.DataFrame
        ``features_df`` left-joined with ``staining`` on ``id``; rows without a
        matching accession get NaN.
    """
    staining_lookup = (
        pd.read_csv(staining_df, index_col=0)
        .loc[:, ['Accession', 'staining']]
        .rename(columns={'Accession': 'id'})
    )

    # Left join: every feature row is kept, in its original order
    return features_df.merge(staining_lookup, how='left', on='id')

In [43]:
# Code to generate features of the phages in a given folder 
def process_pharokka_output(args_data, output_csv):
    """Run ``engineer_features`` on every Pharokka folder and append to a CSV.

    Parameters
    ----------
    args_data : str
        Directory whose sub-directories are individual Pharokka outputs.
    output_csv : str
        CSV file the feature rows are appended to. It is created when missing;
        the header is written only while the file is still empty.

    Notes
    -----
    Entries of ``args_data`` that are not directories are ignored silently.
    Sub-directories without a ``pharokka.gbk`` file are reported and skipped.
    """
    # Make sure the output file exists, even if no folder ends up being processed
    if not os.path.exists(output_csv):
        with open(output_csv, 'w'):
            pass

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the CSV reproducible between runs and machines.
    for folder in sorted(os.listdir(args_data)):
        folder_path = os.path.join(args_data, folder)

        # Only sub-directories can hold a Pharokka run
        if not os.path.isdir(folder_path):
            continue

        # The GenBank file marks a completed Pharokka run
        if not os.path.isfile(os.path.join(folder_path, "pharokka.gbk")):
            print(f"Skipped {folder_path}: 'pharokka.gbk' file not found or not a directory")
            continue

        print(f"Processing {folder_path}...")
        df = engineer_features(folder_path)

        # Append after each folder so partial results survive an interrupted run.
        # newline='' is what pandas expects for a file object passed to to_csv;
        # without it, Windows writes an empty line after every row.
        with open(output_csv, 'a', newline='') as f:
            # In append mode the position is the current file size, so the
            # header is only written into an empty file.
            df.to_csv(f, header=f.tell() == 0, index=False)

args_data = '../data/interim/pharokka/10_folders/'  # Replace with your main folder path
output_csv = '../data/test/new_engineer_features.csv'

print("-> Starting Feature Engineering ----------------------")

# Validate the input first: a previous result must not be deleted when there
# is nothing to replace it with.
if os.path.isdir(args_data):
    # Delete the output CSV file if it already exists
    if os.path.exists(output_csv):
        print("Output file already exists. Overwriting it...")
        os.remove(output_csv)

    print(f"Processing Pharokka phages in directory: {args_data}")
    process_pharokka_output(args_data, output_csv)

    # Print completion message only when the processing actually ran
    print(f"Feature engineering completed and data stored to CSV in {output_csv}.")
else:
    print(f"Input directory not found: {args_data}. No features were generated.")

-> Starting Feature Engineering ----------------------
Output file already exists. Overwriting it...
Processing Pharokka phages in directory: ../data/interim/pharokka/10_folders/
Processing ../data/interim/pharokka/10_folders/sequence_16443.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16444.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16445.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16446.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16447.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16448.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16449.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16450.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16451.fasta.pharokka...
Processing ../data/interim/pharokka/10_folders/sequence_16452.fasta.pharokka...
Processing ../data/in

In [46]:
# Reload the generated feature table from disk and list its columns as a
# quick check of what was written.
df = pd.read_csv(filepath_or_buffer='../data/test/new_engineer_features.csv')
df.columns

Index(['id', 'host_inphared', 'isolation_host_inphared', 'length',
       'jumbophage', 'gc_perc', 'CDS', 'frame_positive', 'frame_negative',
       'cds_coding_density', 'positive_strand_%', 'negative_strand_%',
       'CARD_AMR_Genes', 'CRISPRs', 'VFDB_Virulence_Factors', 'connector',
       'head_packaging', 'host_takeover', 'integration and excision', 'lysis',
       'nucleotide_metabolism', 'other', 'tRNAs', 'tail', 'tmRNAs',
       'transcription', 'unkown_function', 'transl_table', 'sequence'],
      dtype='object')