In [2]:
import numpy as np
import pandas as pd
import os
from Bio import SeqIO
from Bio.SeqFeature import FeatureLocation
from Bio.Seq import UndefinedSequenceError
os.getcwd()

'/Users/Alvaro/Library/Mobile Documents/com~apple~CloudDocs/DTU/Autumn 2023/Phage project/phage/notebooks'

In [3]:
code = "MZ079855__Klebsiella_phage_vB_Kpn_3" # different genome lengths annotated
entry = f"../data/interim/pharokka/tmp_pharokka/{code}.fna_pharokka/"
entry

'../data/interim/pharokka/tmp_pharokka/MZ079855__Klebsiella_phage_vB_Kpn_3.fna_pharokka/'

Genbank file

In [4]:
def engineer_features(genbank_file):
    # Lists to hold data
    ids = []
    genome_lengths = []
    gc_contents = []
    sequences = []
    reverse_complements = []
    cds_numbers = []
    positive_strands = []
    negative_strands = []
    coding_capacities = []
    molecule_types = []
    topologies = []
    trna_counts = []

    
    # Read the GenBank file
    for record in SeqIO.parse(genbank_file, "genbank"):
        try:
            # Attempt to access the sequence, which may raise UndefinedSequenceError
            sequence = str(record.seq)
            # print(record.id)
        except UndefinedSequenceError:
            # print(f"Skipping record {record.id} as sequence is undefined.")
            continue  # Skip this record

        # Calculate genome length and GC content
        total_length = len(sequence)
        gc_content = round((sequence.count('C') + sequence.count('G')) / total_length * 100, 3)

        # Initialize counters
        plus = 0
        minus = 0
        coding_count = 0
        trna_count = 0
        seen = set()  # Store seen barcodes

        for feature in record.features:
            start = feature.location.start
            end = feature.location.end
            length = len(FeatureLocation(start, end))
            barcode = f"{start}_{end}_{length}"

            if feature.type != 'source' and barcode not in seen:
                coding_count += length
                seen.add(barcode)

            if feature.type == 'CDS':
                if feature.location.strand == 1:
                    plus += 1
                elif feature.location.strand == -1:
                    minus += 1
            elif feature.type == 'tRNA':
                trna_count += 1

        
        # Calculate total number of CDS
        total_CDS = plus + minus

        # Calculate strand usage as a percentage
        per_plus = round((plus / total_CDS) * 100, 2) if total_CDS != 0 else 0
        per_minus = round((minus / total_CDS) * 100, 2) if total_CDS != 0 else 0

        # Calculate coding capacity as a percentage
        coding_capacity = (coding_count / total_length) * 100

        # Extract molecule_type and topology
        molecule_type = record.annotations.get('molecule_type', 'N/A')
        topology = record.annotations.get('topology', 'N/A')

        # Append data to lists
        ids.append(record.id)
        genome_lengths.append(total_length)
        gc_contents.append(gc_content)
        sequences.append(sequence)
        reverse_complements.append(str(sequence[::-1]))
        cds_numbers.append(total_CDS)
        positive_strands.append(per_plus)
        negative_strands.append(per_minus)
        coding_capacities.append(coding_capacity)
        molecule_types.append(molecule_type)
        topologies.append(topology)
        trna_counts.append(trna_count)
    
    print("Processing the entries...")
    # Convert lists to pandas DataFrame
    df = pd.DataFrame({
        'id': ids,
        'genome_length': genome_lengths,
        'gc_%': gc_contents,
        'sequence': sequences,
        'reverse_complement': reverse_complements,
        'cds_number': cds_numbers,
        'positive_strand_%': positive_strands,
        'negative_strand_%': negative_strands,
        'coding_capacity': coding_capacities,
        'molecule_type': molecule_types,
        'topology': topologies,
        'trna_count': trna_counts
    })


    df['id'] = df['id'].str[:-2]

    # Check for unexpected molecule types
    expected_molecule_types = ['ss-DNA', 'DNA', 'RNA', 'ss-RNA']

    # Check and correct 'cRNA' entries
    cRNA_entries = df[df['molecule_type'] == 'cRNA']
    if not cRNA_entries.empty:
        for entry_id in cRNA_entries['id']:
            print(f"Info: Entry with id '{entry_id}' has molecule type 'cRNA'. Changing it to 'RNA'.")
        df.loc[df['molecule_type'] == 'cRNA', 'molecule_type'] = 'RNA'

    # Check and correct 'cDNA' entries
    cDNA_entries = df[df['molecule_type'] == 'cDNA']
    if not cDNA_entries.empty:
        for entry_id in cDNA_entries['id']:
            print(f"Info: Entry with id '{entry_id}' has molecule type 'cDNA'. Changing it to 'DNA'.")
        df.loc[df['molecule_type'] == 'cDNA', 'molecule_type'] = 'DNA'

    unexpected_types = set(df['molecule_type']) - set(expected_molecule_types)

    if unexpected_types:
        for utype in unexpected_types:
            # Get ids of entries with the unexpected molecule type
            ids_to_exclude = df[df['molecule_type'] == utype]['id'].tolist()
            for entry_id in ids_to_exclude:
                print(f"Warning: Entry with id '{entry_id}' has unrecognized molecule type '{utype}'. It will not be considered.")
            df = df[df['molecule_type'] != utype]
            
    df = pd.get_dummies(df, columns=['molecule_type'])

    expected_columns = ['jumbophage', 'molecule_type_ss-DNA', 'molecule_type_DNA', 'molecule_type_RNA', 'molecule_type_ss-RNA']
    for col in expected_columns:
        if col not in df.columns:
            df[col] = 0  # Filling with zeros
        df[col] = df[col].astype(bool)  # Convert to boolean

    df['jumbophage'] = df['genome_length'].apply(lambda x: x >= 200000)
    df['jumbophage'] = df['jumbophage'].astype(int)  # Convert True/False to 1/0
    df = pd.get_dummies(df, columns=['topology'])
    return df

In [5]:
df = engineer_features(f"{entry}pharokka.gbk")
df

Processing the entries...


Unnamed: 0,id,genome_length,gc_%,sequence,reverse_complement,cds_number,positive_strand_%,negative_strand_%,coding_capacity,trna_count,molecule_type_DNA,jumbophage,molecule_type_ss-DNA,molecule_type_RNA,molecule_type_ss-RNA,topology_linear
0,MZ079855__Klebsiella_phage_vB_Kpn,112003,41.361,TGTCACCGTGAGTTCGCTTGTTTTTTAGAGCCACAGGAATTTTTTC...,TTTTAAAATAATCGTTTTACCCCGTTTTATCGAACTGTTCACTTTT...,186,71.51,28.49,84.663804,17,True,0,False,False,False,True


CDS final merged output

In [6]:
df = pd.read_csv(f"{entry}pharokka_cds_final_merged_output.tsv", sep="\t")
df['gene'].value_counts().sum()
print(df.columns)
df.head()

Index(['gene', 'start', 'stop', 'frame', 'contig', 'score', 'mmseqs_phrog',
       'mmseqs_alnScore', 'mmseqs_seqIdentity', 'mmseqs_eVal',
       'mmseqs_top_hit', 'pyhmmer_phrog', 'pyhmmer_bitscore', 'pyhmmer_evalue',
       'custom_hmm_id', 'custom_hmm_bitscore', 'custom_hmm_evalue', 'phrog',
       'Method', 'Region', 'color', 'annot', 'category', 'vfdb_hit',
       'vfdb_alnScore', 'vfdb_seqIdentity', 'vfdb_eVal', 'vfdb_short_name',
       'vfdb_description', 'vfdb_species', 'CARD_hit', 'CARD_alnScore',
       'CARD_seqIdentity', 'CARD_eVal', 'CARD_species', 'ARO_Accession',
       'CARD_short_name', 'Protein_Accession', 'DNA_Accession',
       'AMR_Gene_Family', 'Drug_Class', 'Resistance_Mechanism',
       'transl_table'],
      dtype='object')


Unnamed: 0,gene,start,stop,frame,contig,score,mmseqs_phrog,mmseqs_alnScore,mmseqs_seqIdentity,mmseqs_eVal,...,CARD_eVal,CARD_species,ARO_Accession,CARD_short_name,Protein_Accession,DNA_Accession,AMR_Gene_Family,Drug_Class,Resistance_Mechanism,transl_table
0,DHKIYHKJ_CDS_0001,175,2,-,MZ079855__Klebsiella_phage_vB_Kpn_3,4.2,No_PHROG,No_PHROG,No_PHROG,No_PHROG,...,,,,,,,,,,11
1,DHKIYHKJ_CDS_0002,730,930,+,MZ079855__Klebsiella_phage_vB_Kpn_3,4.5,No_PHROG,No_PHROG,No_PHROG,No_PHROG,...,,,,,,,,,,11
2,DHKIYHKJ_CDS_0003,1074,904,-,MZ079855__Klebsiella_phage_vB_Kpn_3,20.8,26718,148.0,1.0,6.179e-46,...,,,,,,,,,,11
3,DHKIYHKJ_CDS_0004,1187,1074,-,MZ079855__Klebsiella_phage_vB_Kpn_3,3.8,No_PHROG,No_PHROG,No_PHROG,No_PHROG,...,,,,,,,,,,11
4,DHKIYHKJ_CDS_0005,1332,1670,+,MZ079855__Klebsiella_phage_vB_Kpn_3,45.5,24947,285.0,1.0,3.171e-92,...,,,,,,,,,,11


In [21]:
df[['gene', 'start', 'stop', 'frame', 'Region', 'annot', 'category']]

Unnamed: 0,gene,start,stop,frame,Region,annot,category
0,DHKIYHKJ_CDS_0001,175,2,-,CDS,hypothetical protein,unknown function
1,DHKIYHKJ_CDS_0002,730,930,+,CDS,hypothetical protein,unknown function
2,DHKIYHKJ_CDS_0003,1074,904,-,CDS,hypothetical protein,unknown function
3,DHKIYHKJ_CDS_0004,1187,1074,-,CDS,hypothetical protein,unknown function
4,DHKIYHKJ_CDS_0005,1332,1670,+,CDS,hypothetical protein,unknown function
...,...,...,...,...,...,...,...
181,DHKIYHKJ_CDS_0182,108400,108798,+,CDS,hypothetical protein,unknown function
182,DHKIYHKJ_CDS_0183,108854,109255,+,CDS,hypothetical protein,unknown function
183,DHKIYHKJ_CDS_0184,109268,109777,+,CDS,hypothetical protein,unknown function
184,DHKIYHKJ_CDS_0185,109790,110110,+,CDS,hypothetical protein,unknown function


In [14]:
df["Region"].value_counts()

Region
CDS    186
Name: count, dtype: int64

In [9]:
df["annot"].value_counts().sum()

annot
hypothetical protein                                          97
tail length tape measure protein                               4
anaerobic ribonucleoside reductase large subunit               3
DNA polymerase                                                 3
replication origin binding                                     3
NAD-dependent DNA ligase                                       3
central tail fiber J                                           3
NrdD-like anaerobic ribonucleotide reductase large subunit     2
ribonucleoside diphosphate reductase small subunit             2
endolysin                                                      2
Rz-like spanin                                                 2
DnaB-like replicative helicase                                 2
DNA primase                                                    2
NinI-like serine-threonine phosphatase                         2
DNA helicase                                                   2
DNA binding protein

In [17]:
df["category"].value_counts()

category
unknown function                                     98
DNA, RNA and nucleotide metabolism                   38
tail                                                 21
other                                                 8
head and packaging                                    7
lysis                                                 5
moron, auxiliary metabolic gene and host takeover     4
connector                                             4
transcription regulation                              1
Name: count, dtype: int64

In [24]:
df["frame"].value_counts()


frame
+    133
-     53
Name: count, dtype: int64

CDS functions

In [46]:
df = pd.read_csv(f"{entry}pharokka_cds_functions.tsv", sep="\t")
df

Unnamed: 0,Description,Count,contig
0,CDS,186,MZ079855__Klebsiella_phage_vB_Kpn_3
1,connector,4,MZ079855__Klebsiella_phage_vB_Kpn_3
2,"DNA, RNA and nucleotide metabolism",38,MZ079855__Klebsiella_phage_vB_Kpn_3
3,head and packaging,7,MZ079855__Klebsiella_phage_vB_Kpn_3
4,integration and excision,0,MZ079855__Klebsiella_phage_vB_Kpn_3
5,lysis,5,MZ079855__Klebsiella_phage_vB_Kpn_3
6,"moron, auxiliary metabolic gene and host takeover",4,MZ079855__Klebsiella_phage_vB_Kpn_3
7,other,8,MZ079855__Klebsiella_phage_vB_Kpn_3
8,tail,21,MZ079855__Klebsiella_phage_vB_Kpn_3
9,transcription regulation,1,MZ079855__Klebsiella_phage_vB_Kpn_3


Length gc CDS density

In [47]:
df = pd.read_csv("../data/interim/pharokka/tmp_pharokka/MZ223858__Enterobacter_phage_EV136_1.fna_pharokka/pharokka_length_gc_cds_density.tsv", sep="\t")
df

Unnamed: 0,contig,length,gc_perc,transl_table,cds_coding_density
0,MZ223858__Enterobacter_phage_EV136_1,39701,52.07,11,91.05


Top hits mash inphared

In [48]:
df = pd.read_csv(f"{entry}pharokka_top_hits_mash_inphared.tsv", sep="\t")
df

Unnamed: 0,contig,Accession,mash_distance,mash_pval,mash_matching_hashes,Description,Classification,Genome_Length_(bp),Jumbophage,molGC_(%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore_Group,Genbank_Division,Isolation_Host_(beware_inconsistent_and_nonsense_values)
0,MZ079855__Klebsiella_phage_vB_Kpn_3,MZ079855,0.0,0.0,1000/1000,Klebsiella phage vB_Kpn_3,Klebsiella phage vB_Kpn_3 Sugarlandvirus Demer...,112003,False,41.361,...,Unclassified,Demerecviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,MDR-Klebsiella pneumoniae


In [49]:
feature_columns = ['Accession',
       'Genome_Length_(bp)', 'Jumbophage', 'molGC_(%)', 'Molecule',
       'Modification_Date', 'Number_CDS', 'Positive_Strand_(%)',
       'Negative_Strand_(%)', 'Coding_Capacity_(%)',
       'Low_Coding_Capacity_Warning', 'tRNAs']
for column in df.columns:
    if column in feature_columns:
        print(f"Column {column}: ",df[column].to_list())

Column Accession:  ['MZ079855']
Column Genome_Length_(bp):  [112003]
Column Jumbophage:  [False]
Column molGC_(%):  [41.361]
Column Molecule:  ['DNA']
Column Modification_Date:  ['16-JUL-2021']
Column Number_CDS:  [182]
Column Positive_Strand_(%):  [70.8791208791209]
Column Negative_Strand_(%):  [29.1208791208791]
Column Coding_Capacity_(%):  [83.2638411471121]
Column tRNAs:  [19]


Top hits card

In [50]:
df = pd.read_csv(f"{entry}top_hits_card.tsv", sep="\t")
df

Unnamed: 0,contig,gene,card_hit,card_alnScore,card_seqIdentity,start,stop,frame


Top hits vfdb

In [39]:
df = pd.read_csv(f"{entry}top_hits_vfdb.tsv", sep="\t")
df

Unnamed: 0,contig,gene,vfdb_hit,vfdb_alnScore,vfdb_seqIdentity,start,stop,frame


Genbank file

In [33]:
genbank_file_path = f'{entry}pharokka.gbk'

# Lists to hold data
ids = []
genome_lengths = []
gc_contents = []
sequences = []
reverse_complements = []
cds_numbers = []
positive_strands = []
negative_strands = []
coding_capacities = []
molecule_types = []
topologies = []
trna_counts = []

# Read the GenBank file
for record in SeqIO.parse(genbank_file_path, "genbank"):
    try:
        # Attempt to access the sequence, which may raise UndefinedSequenceError
        sequence = str(record.seq)
        # print(record.id)
    except UndefinedSequenceError:
        # print(f"Skipping record {record.id} as sequence is undefined.")
        continue  # Skip this record

    # Calculate genome length and GC content
    total_length = len(sequence)
    gc_content = round((sequence.count('C') + sequence.count('G')) / total_length * 100, 3)

    # Initialize counters
    plus = 0
    minus = 0
    coding_count = 0
    trna_count = 0
    seen = set()  # Store seen barcodes

    for feature in record.features:
        start = feature.location.start
        end = feature.location.end
        length = len(FeatureLocation(start, end))
        barcode = f"{start}_{end}_{length}"

        if feature.type != 'source' and barcode not in seen:
            coding_count += length
            seen.add(barcode)

        if feature.type == 'CDS':
            if feature.location.strand == 1:
                plus += 1
            elif feature.location.strand == -1:
                minus += 1
        elif feature.type == 'tRNA':
            trna_count += 1


    # Calculate total number of CDS
    total_CDS = plus + minus

    # Calculate strand usage as a percentage
    per_plus = round((plus / total_CDS) * 100, 2) if total_CDS != 0 else 0
    per_minus = round((minus / total_CDS) * 100, 2) if total_CDS != 0 else 0

    # Calculate coding capacity as a percentage
    coding_capacity = (coding_count / total_length) * 100

    # Extract molecule_type and topology
    molecule_type = record.annotations.get('molecule_type', 'N/A')
    topology = record.annotations.get('topology', 'N/A')

    # Append data to lists
    ids.append(record.id)
    genome_lengths.append(total_length)
    gc_contents.append(gc_content)
    sequences.append(sequence)
    reverse_complements.append(str(sequence[::-1]))
    cds_numbers.append(total_CDS)
    positive_strands.append(per_plus)
    negative_strands.append(per_minus)
    coding_capacities.append(coding_capacity)
    molecule_types.append(molecule_type)
    topologies.append(topology)
    trna_counts.append(trna_count)

# Convert lists to pandas DataFrame
df = pd.DataFrame({
    'id': ids,
    'genome_length': genome_lengths,
    'gc_%': gc_contents,
    'sequence': sequences,
    'reverse_complement': reverse_complements,
    'cds_number': cds_numbers,
    'positive_strand_%': positive_strands,
    'negative_strand_%': negative_strands,
    'coding_capacity': coding_capacities,
    'molecule_type': molecule_types,
    'topology': topologies,
    'trna_count': trna_counts
})


# Filter df based on previously known sequences with a staining class
accession_values = pd.read_csv('../data/processed/model_data.csv')
accession_values = accession_values['Accession'].tolist()
df['id'] = df['id'].str[:-2]
# df = df[df['id'].isin(accession_values)]


# Check for unexpected molecule types
expected_molecule_types = ['ss-DNA', 'DNA', 'RNA', 'ss-RNA']

# Check and correct 'cRNA' entries
cRNA_entries = df[df['molecule_type'] == 'cRNA']
if not cRNA_entries.empty:
    for entry_id in cRNA_entries['id']:
        print(f"Info: Entry with id '{entry_id}' has molecule type 'cRNA'. Changing it to 'RNA'.")
    df.loc[df['molecule_type'] == 'cRNA', 'molecule_type'] = 'RNA'

# Check and correct 'cDNA' entries
cDNA_entries = df[df['molecule_type'] == 'cDNA']
if not cDNA_entries.empty:
    for entry_id in cDNA_entries['id']:
        print(f"Info: Entry with id '{entry_id}' has molecule type 'cDNA'. Changing it to 'DNA'.")
    df.loc[df['molecule_type'] == 'cDNA', 'molecule_type'] = 'DNA'

unexpected_types = set(df['molecule_type']) - set(expected_molecule_types)

if unexpected_types:
    for utype in unexpected_types:
        # Get ids of entries with the unexpected molecule type
        ids_to_exclude = df[df['molecule_type'] == utype]['id'].tolist()
        for entry_id in ids_to_exclude:
            print(f"Warning: Entry with id '{entry_id}' has unrecognized molecule type '{utype}'. It will not be considered.")
        df = df[df['molecule_type'] != utype]

        
df = pd.get_dummies(df, columns=['molecule_type'])

df = pd.get_dummies(df, columns=['topology'])

expected_columns = ['topology_linear', 'topology_circular', 'molecule_type_ss-DNA', 'molecule_type_DNA', 'molecule_type_RNA', 'molecule_type_ss-RNA']
for col in expected_columns:
    if col not in df.columns:
        df[col] = 0  # Filling with zeros
    df[col] = df[col].astype(bool)  # Convert to boolean



df['jumbophage'] = df['genome_length'].apply(lambda x: x >= 200000)
df['jumbophage'] = df['jumbophage'].astype(int)  # Convert True/False to 1/0

# Change order of entries
df = df[['id', 'genome_length', 'jumbophage', 'topology_linear', 'topology_circular', 'gc_%', 'trna_count','cds_number', 'coding_capacity','positive_strand_%','negative_strand_%', 'molecule_type_ss-DNA', 'molecule_type_DNA', 'molecule_type_RNA', 'molecule_type_ss-RNA']]
# df = df[['id', 'genome_length', 'jumbophage', 'molecule_type', 'topology', 'gc_%', 'trna_count','cds_number', 'coding_capacity','positive_strand_%','negative_strand_%' ,'sequence', 'reverse_complement']]

df

Unnamed: 0,id,genome_length,jumbophage,topology_linear,topology_circular,gc_%,trna_count,cds_number,coding_capacity,positive_strand_%,negative_strand_%,molecule_type_ss-DNA,molecule_type_DNA,molecule_type_RNA,molecule_type_ss-RNA
0,MZ079855__Klebsiella_phage_vB_Kpn,112003,0,True,False,41.361,17,186,84.663804,71.51,28.49,False,True,False,False


In [34]:
df_inphared = pd.read_csv(f"{entry}pharokka_top_hits_mash_inphared.tsv", sep="\t")
df_inphared[['molGC_(%)', 'Molecule', 'Number_CDS', 'Positive_Strand_(%)',
       'Negative_Strand_(%)', 'Coding_Capacity_(%)']]

Unnamed: 0,molGC_(%),Molecule,Number_CDS,Positive_Strand_(%),Negative_Strand_(%),Coding_Capacity_(%)
0,41.361,DNA,182,70.879121,29.120879,83.263841


In [37]:
feature_columns = ['molGC_(%)', 'Molecule', 'Number_CDS', 'Positive_Strand_(%)',
       'Negative_Strand_(%)', 'Coding_Capacity_(%)']

print("Number of CDS annotated by PHAROKKA")


print("Information gathered by INPHARED")
for column in df_inphared.columns:
    if column in feature_columns:
        print(f"Column {column}: ",df_inphared[column].to_list())


print

Information gathered by INPHARED
Column molGC_(%):  [41.361]
Column Molecule:  ['DNA']
Column Number_CDS:  [182]
Column Positive_Strand_(%):  [70.8791208791209]
Column Negative_Strand_(%):  [29.1208791208791]
Column Coding_Capacity_(%):  [83.2638411471121]
