In [59]:
import numpy as np
import pandas as pd
import os
os.getcwd()

'/mnt/c/Users/Alvaro/Desktop/projects/phage/notebooks'

# Cleaning the phage dataset

In [60]:
df = pd.read_csv("../data/raw/inphared_8Sep2023/8Sep2023_data.tsv", sep="\t")
df.rename(columns={'Isolation Host (beware inconsistent and nonsense values)': 'Isolation Host'}, inplace=True)
print("Shape before filtering", df.shape)
# df = df[['Accession','Host','Isolation Host']]

Shape before filtering (26287, 27)


In [61]:
# This will be saved for testing
discarded = df[(df['Host'] == 'Unspecified') | (df['Isolation Host'] == 'Unspecified')]

# Keep only the 'Host' and 'Isolation Host' columns
discarded = discarded[['Accession','Host', 'Isolation Host']]

# discarded.to_csv("../data/interim/discarded_phages.csv")
print(discarded.shape)

(11946, 3)


Removing Unspecified hosts and isolation hosts

In [62]:
df = df[df['Host'] != "Unspecified"] 
print("Shape after filtering unspecified hosts", df.shape)
df = df[df['Isolation Host'] != "Unspecified"]
print("Shape after filtering unspecified isolation hosts", df.shape)
df.head()

Shape after filtering unspecified hosts (23367, 27)
Shape after filtering unspecified isolation hosts (14341, 27)


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
31,MN335248,Vibrio phage XacF13,Vibrio phage XacF13 Xylivirus XacF13 Xylivirus...,7045,False,60.298,ss-DNA,21-OCT-2019,13,84.615385,...,Unclassified,Inoviridae,Tubulavirales,Faserviricetes,Hofneiviricota,Loebvirae,Monodnaviria,Group II,ENV,Xanthomonas vesicatoria
32,MK250029,Prevotella phage Lak-C1,Prevotella phage Lak-C1 Caudoviricetes Uroviri...,540217,True,25.796,DNA,13-JAN-2019,830,47.108434,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,ENV,Prevotella sp.
33,MK250028,Prevotella phage Lak-B9,Prevotella phage Lak-B9 Caudoviricetes Uroviri...,550053,True,26.012,DNA,13-JAN-2019,859,52.270081,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,ENV,Prevotella sp.
34,MK250027,Prevotella phage Lak-B8,Prevotella phage Lak-B8 Caudoviricetes Uroviri...,551627,True,26.022,DNA,13-JAN-2019,860,53.023256,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,ENV,Prevotella sp.
35,MK250026,Prevotella phage Lak-B7,Prevotella phage Lak-B7 Caudoviricetes Uroviri...,550702,True,26.02,DNA,13-JAN-2019,859,53.201397,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,ENV,Prevotella sp.


Reduce Isolation Host naming

In [63]:
# Only keep first two strings in each row of Isolation Host to reduce name incompatibilities
# Split each value on space and select the first two elements
df['Isolation Host'] = df['Isolation Host'].str.split().str[:2].str.join(' ')

## Fixing misspelled genera, wrong taxonomy, etc

In [64]:
df['Host'] = df['Host'].replace('Enteroccous', 'Enterococcus')

Modifying the Enterobacteria in Salmonella cases

In [65]:
print("Before")
display(df[df['Isolation Host'] == 'Salmonella enterica'].head(7))

print("After (Salmonella enterica no longer appears on the dataset as Enterobacteria)")
# Update the "Host" column based on the condition
df.loc[df['Isolation Host'] == 'Salmonella enterica', 'Host'] = 'Salmonella'
display(df[df['Host'] == 'Enterobacteria'].head(7))

Before


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
69,OR413347,Salmonella phage EH7,Salmonella phage EH7 Viruses,110344,False,39.504,DNA,03-SEP-2023,157,32.484076,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Salmonella enterica
70,OR413345,Salmonella phage EH4,Salmonella phage EH4 Viruses,108554,False,39.84,DNA,03-SEP-2023,154,29.220779,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Salmonella enterica
71,OR413344,Salmonella phage EH3,Salmonella phage EH3 Viruses,115218,False,40.127,DNA,03-SEP-2023,162,30.864198,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Salmonella enterica
72,OR413342,Salmonella phage EH1,Salmonella phage EH1 Viruses,109346,False,39.552,DNA,03-SEP-2023,153,31.372549,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Salmonella enterica
2301,OR067837,Salmonella phage KKP 3953,Salmonella phage KKP 3953 Epseptimavirus Marka...,115522,False,39.9,DNA,23-AUG-2023,179,26.256983,...,Markadamsvirinae,Demerecviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella enterica
2303,OR067835,Salmonella phage KKP 3828,Salmonella phage KKP 3828 Koutsourovirus Slope...,43099,False,51.962,DNA,23-AUG-2023,66,100.0,...,Slopekvirinae,Autographiviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella enterica
2338,OQ927978,Salmonella phage SEA1,Salmonella phage SEA1 Caudoviricetes Urovirico...,162022,False,36.996,DNA,23-AUG-2023,262,18.320611,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella enterica


After (Salmonella enterica no longer appears on the dataset as Enterobacteria)


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
2994,NC_075033,Enterobacteria phage f1,Enterobacteria phage f1 Inovirus M13 Inovirus ...,6407,False,40.503,DNA,08-MAY-2023,8,87.5,...,Unclassified,Inoviridae,Tubulavirales,Faserviricetes,Hofneiviricota,Loebvirae,Monodnaviria,Group II,PHG,Escherichia coli
2995,NC_075032,Enterobacteria phage f1,Enterobacteria phage f1 Inovirus M13 Inovirus ...,6407,False,40.581,DNA,08-MAY-2023,7,100.0,...,Unclassified,Inoviridae,Tubulavirales,Faserviricetes,Hofneiviricota,Loebvirae,Monodnaviria,Group II,PHG,Escherichia coli
2996,NC_075031,Enterobacteria phage f1,Enterobacteria phage f1 Inovirus M13 Inovirus ...,6407,False,40.393,DNA,08-MAY-2023,8,87.5,...,Unclassified,Inoviridae,Tubulavirales,Faserviricetes,Hofneiviricota,Loebvirae,Monodnaviria,Group II,PHG,Escherichia coli
5601,NC_055742,Enterobacteria phage vB_EcoM_IME341,Enterobacteria phage vB_EcoM_IME341 Dhakavirus...,172379,False,39.515,DNA,10-JAN-2023,279,84.946237,...,Tevenvirinae,Straboviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
5602,NC_055741,Enterobacteria phage vB_EcoM_IME339,Enterobacteria phage vB_EcoM_IME339 Tequatrovi...,164366,False,35.64,DNA,10-JAN-2023,260,84.230769,...,Tevenvirinae,Straboviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
5603,NC_055740,Enterobacteria phage vB_EcoM_IME281,Enterobacteria phage vB_EcoM_IME281 Dhakavirus...,170531,False,39.442,DNA,10-JAN-2023,277,84.476534,...,Tevenvirinae,Straboviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
5691,NC_054904,Enterobacteria phage vB_EcoM_IME340,Enterobacteria phage vB_EcoM_IME340 Tequatrovi...,165549,False,35.549,DNA,10-JAN-2023,260,84.230769,...,Tevenvirinae,Straboviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli


In [66]:
print("Before")
display(df[df['Isolation Host'] == 'Salmonella typhimurium'].head(7))

print("After")
# Update the "Host" column based on the condition
df.loc[df['Isolation Host'] == 'Salmonella typhimurium', 'Host'] = 'Salmonella'
display(df[df['Isolation Host'] == 'Salmonella typhimurium'].head(7))

Before


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
5465,NC_004831,Salmonella phage SP6,Salmonella phage SP6 Zindervirus SP6 Zindervir...,43769,False,47.227,DNA,11-JAN-2023,50,100.0,...,Molineuxvirinae,Autographiviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium
7008,NC_031026,Salmonella phage phSE-2,Salmonella phage phSE-2 Tlsvirus phSE2 Tlsviru...,49167,False,42.889,DNA,09-JAN-2023,81,76.54321,...,Tempevirinae,Drexlerviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium
7392,NC_031042,Salmonella phage NR01,Salmonella phage NR01 Tequintavirus NR01 Tequi...,111325,False,38.838,DNA,08-JAN-2023,152,32.894737,...,Markadamsvirinae,Demerecviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium
8038,NC_022768,Salmonella phage Maynard,Salmonella phage Maynard Kuttervirus maynard K...,154701,False,45.552,DNA,08-JAN-2023,196,64.795918,...,Cvivirinae,Ackermannviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium
8458,NC_019545,Salmonella phage SPN3UB,Salmonella phage SPN3UB Caudoviricetes Uroviri...,47355,False,49.613,DNA,08-JAN-2023,68,85.294118,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium
8466,NC_019530,Salmonella phage Sh19,Salmonella phage Sh19 Kuttervirus SH19 Kutterv...,157785,False,44.685,DNA,08-JAN-2023,206,34.951456,...,Cvivirinae,Ackermannviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium
8884,NC_006949,Enterobacteria phage ES18,Enterobacteria phage ES18 Caudoviricetes Urovi...,46900,False,48.593,DNA,07-JAN-2023,73,68.493151,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium


After


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
5465,NC_004831,Salmonella phage SP6,Salmonella phage SP6 Zindervirus SP6 Zindervir...,43769,False,47.227,DNA,11-JAN-2023,50,100.0,...,Molineuxvirinae,Autographiviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium
7008,NC_031026,Salmonella phage phSE-2,Salmonella phage phSE-2 Tlsvirus phSE2 Tlsviru...,49167,False,42.889,DNA,09-JAN-2023,81,76.54321,...,Tempevirinae,Drexlerviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium
7392,NC_031042,Salmonella phage NR01,Salmonella phage NR01 Tequintavirus NR01 Tequi...,111325,False,38.838,DNA,08-JAN-2023,152,32.894737,...,Markadamsvirinae,Demerecviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium
8038,NC_022768,Salmonella phage Maynard,Salmonella phage Maynard Kuttervirus maynard K...,154701,False,45.552,DNA,08-JAN-2023,196,64.795918,...,Cvivirinae,Ackermannviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium
8458,NC_019545,Salmonella phage SPN3UB,Salmonella phage SPN3UB Caudoviricetes Uroviri...,47355,False,49.613,DNA,08-JAN-2023,68,85.294118,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium
8466,NC_019530,Salmonella phage Sh19,Salmonella phage Sh19 Kuttervirus SH19 Kutterv...,157785,False,44.685,DNA,08-JAN-2023,206,34.951456,...,Cvivirinae,Ackermannviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium
8884,NC_006949,Enterobacteria phage ES18,Enterobacteria phage ES18 Caudoviricetes Urovi...,46900,False,48.593,DNA,07-JAN-2023,73,68.493151,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella typhimurium


Modifying the Enterobacteria in Escherichia cases

In [67]:
print("Before")
display(df[df['Isolation Host'] == 'Escherichia coli'].head(10))

print("After")
# Update the "Host" column based on the condition
df.loc[df['Isolation Host'] == 'Escherichia coli', 'Host'] = 'Escherichia'
display(df[df['Isolation Host'] == 'Escherichia coli'].head(7))

Before


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
57,HM208303,Escherichia phage vB_EcoP_24B,Escherichia phage vB_EcoP_24B Traversvirus tv2...,57677,False,49.68,DNA,20-AUG-2012,76,67.105263,...,Sepvirinae,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,SYN,Escherichia coli
65,OR204652,Escherichia phage pO111,Escherichia phage pO111 Caudoviricetes Uroviri...,40444,False,48.65,DNA,03-SEP-2023,51,1.960784,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
66,OR204651,Escherichia phage pO103,Escherichia phage pO103 Caudoviricetes Uroviri...,39837,False,48.329,DNA,03-SEP-2023,48,100.0,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
67,OR204650,Escherichia phage pO91,Escherichia phage pO91 Caudoviricetes Uroviric...,39650,False,49.788,DNA,03-SEP-2023,52,100.0,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
76,MZ892903,Escherichia phage ELT1,Escherichia phage ELT1 Viruses,170727,False,35.43,DNA,31-AUG-2023,274,15.693431,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Escherichia coli
99,ON470617,Escherichia phage vB_EcoM-813R1,Escherichia phage vB_EcoM-813R1 Punavirus Caud...,99607,False,47.338,DNA,06-SEP-2023,132,50.757576,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
2280,OR062947,Escherichia phage 348Ecol098PP,Escherichia phage 348Ecol098PP Mosigvirus Teve...,170844,False,37.547,DNA,26-AUG-2023,266,84.210526,...,Tevenvirinae,Straboviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
2281,OR062946,Escherichia phage 310Ecol104PP,Escherichia phage 310Ecol104PP Tequatrovirus T...,167023,False,35.567,DNA,26-AUG-2023,262,16.030534,...,Tevenvirinae,Straboviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
2282,OR062945,Escherichia phage 308Ecol101PP,Escherichia phage 308Ecol101PP Mosigvirus Teve...,169543,False,37.705,DNA,26-AUG-2023,272,84.558824,...,Tevenvirinae,Straboviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
2283,OR062944,Escherichia phage 303Ecol101PP,Escherichia phage 303Ecol101PP Tequatrovirus T...,166904,False,35.391,DNA,26-AUG-2023,267,84.644195,...,Tevenvirinae,Straboviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli


After


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
57,HM208303,Escherichia phage vB_EcoP_24B,Escherichia phage vB_EcoP_24B Traversvirus tv2...,57677,False,49.68,DNA,20-AUG-2012,76,67.105263,...,Sepvirinae,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,SYN,Escherichia coli
65,OR204652,Escherichia phage pO111,Escherichia phage pO111 Caudoviricetes Uroviri...,40444,False,48.65,DNA,03-SEP-2023,51,1.960784,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
66,OR204651,Escherichia phage pO103,Escherichia phage pO103 Caudoviricetes Uroviri...,39837,False,48.329,DNA,03-SEP-2023,48,100.0,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
67,OR204650,Escherichia phage pO91,Escherichia phage pO91 Caudoviricetes Uroviric...,39650,False,49.788,DNA,03-SEP-2023,52,100.0,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
76,MZ892903,Escherichia phage ELT1,Escherichia phage ELT1 Viruses,170727,False,35.43,DNA,31-AUG-2023,274,15.693431,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Escherichia coli
99,ON470617,Escherichia phage vB_EcoM-813R1,Escherichia phage vB_EcoM-813R1 Punavirus Caud...,99607,False,47.338,DNA,06-SEP-2023,132,50.757576,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli
2280,OR062947,Escherichia phage 348Ecol098PP,Escherichia phage 348Ecol098PP Mosigvirus Teve...,170844,False,37.547,DNA,26-AUG-2023,266,84.210526,...,Tevenvirinae,Straboviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli


In [68]:
print("Before")
display(df[df['Isolation Host'] == 'E. coli'].head(10))

print("After")
# Update the "Host" column based on the condition
df.loc[df['Isolation Host'] == 'E. coli', 'Host'] = 'Escherichia'
display(df[df['Isolation Host'] == 'E. coli'].head(7))

Before


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
7162,NC_049943,Enterobacteria phage 2851,Enterobacteria phage 2851 Pankowvirus pv2851 P...,57248,False,51.053,DNA,08-JAN-2023,81,65.432099,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,E. coli
25798,FM180578,Enterobacteria phage 2851,Enterobacteria phage 2851 Pankowvirus pv2851 P...,57248,False,51.053,DNA,03-DEC-2008,81,65.432099,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,E. coli
25825,AJ298298,Enterobacteria phage phiP27,Enterobacteria phage phiP27 Caudoviricetes Uro...,42575,False,49.351,DNA,15-APR-2005,63,79.365079,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,E. coli


After


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
7162,NC_049943,Enterobacteria phage 2851,Enterobacteria phage 2851 Pankowvirus pv2851 P...,57248,False,51.053,DNA,08-JAN-2023,81,65.432099,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,E. coli
25798,FM180578,Enterobacteria phage 2851,Enterobacteria phage 2851 Pankowvirus pv2851 P...,57248,False,51.053,DNA,03-DEC-2008,81,65.432099,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,E. coli
25825,AJ298298,Enterobacteria phage phiP27,Enterobacteria phage phiP27 Caudoviricetes Uro...,42575,False,49.351,DNA,15-APR-2005,63,79.365079,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,E. coli


In [69]:
print("Before")
display(df[df['Isolation Host'] == 'Escherichia coli;'].head(10))
display(df[df['Isolation Host'] == 'Escherichia coli,'].head(10))

print("After")
# Update the "Host" column based on the condition
df.loc[df['Isolation Host'] == 'Escherichia coli,', 'Host'] = 'Escherichia'
df.loc[df['Isolation Host'] == 'Escherichia coli;', 'Host'] = 'Escherichia'
display(df[df['Isolation Host'] == 'E. coli'].head(7))
display(df[df['Isolation Host'] == 'Escherichia coli;'].head(10))
display(df[df['Isolation Host'] == 'Escherichia coli,'].head(10))


Before


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
5317,NC_016570,Escherichia phage Cba120,Escherichia phage Cba120 Kuttervirus CBA120 Ku...,157304,False,44.499,DNA,11-JAN-2023,203,35.46798,...,Cvivirinae,Ackermannviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli;
6072,NC_048731,Escherichia phage fp01,Escherichia phage fp01 Tequintavirus fp01 Tequ...,109515,False,39.003,DNA,10-JAN-2023,152,29.605263,...,Markadamsvirinae,Demerecviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli;
13973,JN593240,Escherichia phage Cba120,Escherichia phage Cba120 Kuttervirus CBA120 Ku...,157304,False,44.499,DNA,28-OCT-2021,203,35.46798,...,Cvivirinae,Ackermannviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli;
17601,MH745368,Escherichia phage fp01,Escherichia phage fp01 Tequintavirus fp01 Tequ...,109515,False,39.003,DNA,03-MAR-2020,55,61.818182,...,Markadamsvirinae,Demerecviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli;
19687,MK562504,Enterobacteria phage CHB7,Enterobacteria phage CHB7 Suspvirus Ounavirina...,87998,False,40.049,DNA,02-APR-2019,134,28.358209,...,Ounavirinae,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli;


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
25812,AJ556162,Enterobacteria phage BP-4795,Enterobacteria phage BP-4795 Marienburgvirus B...,57930,False,50.608,DNA,08-DEC-2005,83,69.879518,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,"Escherichia coli,"


After


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
7162,NC_049943,Enterobacteria phage 2851,Enterobacteria phage 2851 Pankowvirus pv2851 P...,57248,False,51.053,DNA,08-JAN-2023,81,65.432099,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,E. coli
25798,FM180578,Enterobacteria phage 2851,Enterobacteria phage 2851 Pankowvirus pv2851 P...,57248,False,51.053,DNA,03-DEC-2008,81,65.432099,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,E. coli
25825,AJ298298,Enterobacteria phage phiP27,Enterobacteria phage phiP27 Caudoviricetes Uro...,42575,False,49.351,DNA,15-APR-2005,63,79.365079,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,E. coli


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
5317,NC_016570,Escherichia phage Cba120,Escherichia phage Cba120 Kuttervirus CBA120 Ku...,157304,False,44.499,DNA,11-JAN-2023,203,35.46798,...,Cvivirinae,Ackermannviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli;
6072,NC_048731,Escherichia phage fp01,Escherichia phage fp01 Tequintavirus fp01 Tequ...,109515,False,39.003,DNA,10-JAN-2023,152,29.605263,...,Markadamsvirinae,Demerecviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli;
13973,JN593240,Escherichia phage Cba120,Escherichia phage Cba120 Kuttervirus CBA120 Ku...,157304,False,44.499,DNA,28-OCT-2021,203,35.46798,...,Cvivirinae,Ackermannviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli;
17601,MH745368,Escherichia phage fp01,Escherichia phage fp01 Tequintavirus fp01 Tequ...,109515,False,39.003,DNA,03-MAR-2020,55,61.818182,...,Markadamsvirinae,Demerecviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli;
19687,MK562504,Enterobacteria phage CHB7,Enterobacteria phage CHB7 Suspvirus Ounavirina...,87998,False,40.049,DNA,02-APR-2019,134,28.358209,...,Ounavirinae,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Escherichia coli;


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
25812,AJ556162,Enterobacteria phage BP-4795,Enterobacteria phage BP-4795 Marienburgvirus B...,57930,False,50.608,DNA,08-DEC-2005,83,69.879518,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,"Escherichia coli,"


Fixing it in Shigella flexneri

In [70]:
print("After")
# Update the "Host" column based on the condition
df.loc[df['Isolation Host'] == 'Shigella flexneri', 'Host'] = 'Shigella'
display(df[df['Isolation Host'] == 'Shigella flexneri'])

After


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
2958,OQ223306,Shigella phage S2_02,Shigella phage S2_02 Tunavirus Tunavirinae Dre...,50597,False,45.271,DNA,16-MAY-2023,76,76.315789,...,Tunavirinae,Drexlerviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Shigella flexneri
2959,OP137232,Shigella phage S2_01,Shigella phage S2_01 Tequatrovirus Tevenvirina...,168102,False,35.317,DNA,16-MAY-2023,264,84.469697,...,Tevenvirinae,Straboviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Shigella flexneri
4100,NC_070988,Shigella phage SGF2,Shigella phage SGF2 Kuravirus Caudoviricetes U...,76964,False,42.298,DNA,08-FEB-2023,118,20.338983,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Shigella flexneri
4235,NC_070850,Shigella virus Moo19,Shigella virus Moo19 Enquatrovirinae Schitovir...,72458,False,44.640,DNA,08-FEB-2023,86,77.906977,...,Enquatrovirinae,Schitoviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Shigella flexneri
4967,NC_049831,Shigella phage Sfin-3,Shigella phage Sfin-3 Tunavirus Sfin3 Tunaviru...,50309,False,45.352,DNA,11-JAN-2023,78,75.641026,...,Tunavirinae,Drexlerviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Shigella flexneri
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24825,KJ603229,Shigella phage POCJ13,Shigella phage POCJ13 Diegovirus POCJ13 Diegov...,62699,False,49.347,DNA,05-SEP-2014,76,69.736842,...,Sepvirinae,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Shigella flexneri
25044,KC710998,Shigella phage pSf-1,Shigella phage pSf-1 Hanrivervirus pSf1 Hanriv...,51821,False,44.023,DNA,13-DEC-2013,82,70.731707,...,Tempevirinae,Drexlerviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Shigella flexneri
25068,KC814930,Shigella phage SfIV,Shigella phage SfIV Caudoviricetes Uroviricota...,39758,False,50.297,DNA,29-OCT-2013,58,72.413793,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Shigella flexneri
25301,JX509734,Enterobacteria phage SfI,Enterobacteria phage SfI Caudoviricetes Urovir...,38389,False,50.116,DNA,01-MAR-2013,58,67.241379,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Shigella flexneri


In [38]:
#df.to_csv("../data/interim/manipulation/phage_df_clean_unassigned.csv")

# Loading data to assign classification

In [26]:
df = pd.read_csv("../data/interim/manipulation/phage_df_clean_unassigned.csv")
df = df[['Accession','Host','Isolation Host']]
df.head()

Unnamed: 0,Accession,Host,Isolation Host
0,MN335248,Vibrio,Xanthomonas vesicatoria
1,MK250029,Prevotella,Prevotella sp.
2,MK250028,Prevotella,Prevotella sp.
3,MK250027,Prevotella,Prevotella sp.
4,MK250026,Prevotella,Prevotella sp.


In [71]:
all_gram = pd.read_csv("../data/interim/gram_staining/exploration/all_gram_stains_fixed_species.csv")
all_gram["species"]

0               Vallitalea guaymasensis
1                Acaricomes phytoseiuli
2         Acetanaerobacterium elongatum
3       Acetilactobacillus jinshanensis
4                Acetivibrio mesophilus
                     ...               
7536                    Zoogloea oryzae
7537            Zooshikella ganghwensis
7538           Zunongwangia endophytica
7539              Zunongwangia mangrovi
7540              Zunongwangia profunda
Name: species, Length: 7541, dtype: object

# Excluding genera and species whcih appear in both gram stainings

In [72]:
exclude_genus = ['Clostridium', 'Neobacillus', 'Alteribacter', 'Desulfotomaculum', 'Caloramator', 'Desulforamulus', 'Heyndrickxia', 'Peptoclostridium', 'Thermoanaerobacter', 'Thermoanaerobacterium', 'Aureimonas', 'Actinomadura', 'Alkalibacterium', 'Deinococcus', 'Tepidibacillus', 'Sphingomonas', 'Lysinibacillus', 'Ruminiclostridium', 'Caldicellulosiruptor', 'Pseudomonas', 'Streptococcus', 'Microlunatus', 'Streptomyces', 'Butyricimonas', 'Halalkalibacter', 'Chelativorans', 'Natrinema', 'Ureibacillus', 'Clostridioides', 'Desulfosporosinus', 'Lacibacter', 'Nocardioides', 'Siminovitchia', 'Belliella', 'Tistlia', 'Actinoplanes', 'Paenibacillus', 'Vallitalea', 'Actinotalea', 'Cohnella', 'Rhizobium', 'Anaerotignum', 'Cellulomonas', 'Flavobacterium', 'Bacillus', 'Nesterenkonia']
exclude_species = ['Ureibacillus massiliensis', 'Tistlia consotensis', 'Clostridioides difficile', 'Vallitalea guaymasensis', 'Belliella pelovolcani', 'Actinotalea ferrariae']

In [73]:
# Save the discarded dataframe
excluded_rows = df[df['Host'].isin(exclude_genus) | df['Isolation Host'].isin(exclude_species)]

# Append the excluded rows to the discarded DataFrame
discarded = pd.concat([discarded, excluded_rows], ignore_index=True)

# discarded.to_csv("../data/interim/discarded_phages.csv")
print(discarded.shape)


(14913, 27)


In [74]:
print("Shape of the dataframe before deleting the common genera: ", df.shape)
df = df[~df['Host'].isin(exclude_genus)]
print("Shape of the dataframe after deleting the common genera: ", df.shape)
df = df[~df['Isolation Host'].isin(exclude_species)]
shape1 = df.shape
print("Shape of the dataframe after deleting the common species: ", shape1)


Shape of the dataframe before deleting the common genera:  (14341, 27)
Shape of the dataframe after deleting the common genera:  (11374, 27)
Shape of the dataframe after deleting the common species:  (11374, 27)


# First mapping: species to isolation host

In [75]:
# Create a mapping dictionary from species to Gram stain
species_to_stain = dict(zip(all_gram['species'], all_gram['Gram stain']))
species_to_stain

{'Vallitalea guaymasensis': 'positive',
 'Acaricomes phytoseiuli': 'positive',
 'Acetanaerobacterium elongatum': 'positive',
 'Acetilactobacillus jinshanensis': 'positive',
 'Acetivibrio mesophilus': 'positive',
 'Acetivibrio saccincola': 'positive',
 'Acidimicrobium ferrooxidans': 'positive',
 'Acidipropionibacterium olivae': 'positive',
 'Acidipropionibacterium virtanenii': 'positive',
 'Acrocarpospora phusangensis': 'positive',
 'Actinoallomurus acaciae': 'positive',
 'Actinoallomurus acanthiterrae': 'positive',
 'Actinoallomurus amamiensis': 'positive',
 'Actinoallomurus caesius': 'positive',
 'Actinoallomurus fulvus': 'positive',
 'Actinoallomurus iriomotensis': 'positive',
 'Actinoallomurus luridus': 'positive',
 'Actinoallomurus oryzae': 'positive',
 'Actinoallomurus purpureus': 'positive',
 'Actinoallomurus spadix': 'positive',
 'Actinoallomurus yoronensis': 'positive',
 'Actinoalloteichus spitiensis': 'positive',
 'Actinobaculum massiliense': 'positive',
 'Actinobaculum suis':

In [76]:
# Use the mapping to create the 'staining' column in df
df_mapped = df.copy(deep=True)
df_mapped['staining'] = df_mapped['Isolation Host'].map(species_to_stain)
drop = df_mapped.dropna(subset=['staining'])
shape2 = drop.shape
print(shape2)
print("Percentage of phages assigned a class based solely on species: ", str(shape2[0]/shape1[0]*100))
print("Percentage of the all_gram dataset used to assign a class based solely on species: ", str(shape2[0]/all_gram.shape[0]*100))

(2559, 28)
Percentage of phages assigned a class based solely on species:  22.4986812027431
Percentage of the all_gram dataset used to assign a class based solely on species:  33.934491446757725


# Second mapping: using genus and Host

In [77]:
# Create a mapping dictionary from species to Gram stain
genus_to_stain = dict(zip(all_gram['Genus'], all_gram['Gram stain']))
genus_to_stain

{'Vallitalea': 'negative',
 'Acaricomes': 'positive',
 'Acetanaerobacterium': 'positive',
 'Acetilactobacillus': 'positive',
 'Acetivibrio': 'positive',
 'Acidimicrobium': 'positive',
 'Acidipropionibacterium': 'positive',
 'Acrocarpospora': 'positive',
 'Actinoallomurus': 'positive',
 'Actinoalloteichus': 'positive',
 'Actinobaculum': 'positive',
 'Actinocatenispora': 'positive',
 'Actinocorallia': 'positive',
 'Actinocrinis': 'positive',
 'Actinocrispum': 'positive',
 'Actinokineospora': 'positive',
 'Actinomadura': 'negative',
 'Actinomyces': 'positive',
 'Actinomycetospora': 'positive',
 'Actinophytocola': 'positive',
 'Actinoplanes': 'negative',
 'Actinopolymorpha': 'positive',
 'Actinopolyspora': 'positive',
 'Actinorectispora': 'positive',
 'Actinorhabdospora': 'positive',
 'Actinospica': 'positive',
 'Actinosynnema': 'positive',
 'Actinotalea': 'positive',
 'Actinotignum': 'positive',
 'Acutalibacter': 'positive',
 'Adlercreutzia': 'positive',
 'Aeribacillus': 'positive',
 'Aer

In [78]:
# Use the mapping to create the 'staining' column in df
df_mapped['staining'] = df_mapped['Host'].map(genus_to_stain)
drop = df_mapped.dropna(subset=['staining'])
shape3 = drop.shape
print(shape3)
print("Percentage of phages assigned a class using species+genus: ", str(shape3[0]/shape1[0]*100))
#print("Percentage of the all_gram dataset used to assign a class based on species+genus: ", str(shape3[0]/all_gram.shape[0]*100))

(9491, 28)
Percentage of phages assigned a class using species+genus:  83.44469843502725


# Exploring the remaining phages with no assignation

In [79]:
remaining = df_mapped[(df_mapped["staining"] != "positive") & (df_mapped["staining"] != "negative")]
print(remaining.shape)
remaining.head()

(1883, 28)


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host,staining
69,OR413347,Salmonella phage EH7,Salmonella phage EH7 Viruses,110344,False,39.504,DNA,03-SEP-2023,157,32.484076,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Salmonella enterica,
70,OR413345,Salmonella phage EH4,Salmonella phage EH4 Viruses,108554,False,39.84,DNA,03-SEP-2023,154,29.220779,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Salmonella enterica,
71,OR413344,Salmonella phage EH3,Salmonella phage EH3 Viruses,115218,False,40.127,DNA,03-SEP-2023,162,30.864198,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Salmonella enterica,
72,OR413342,Salmonella phage EH1,Salmonella phage EH1 Viruses,109346,False,39.552,DNA,03-SEP-2023,153,31.372549,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Salmonella enterica,
2301,OR067837,Salmonella phage KKP 3953,Salmonella phage KKP 3953 Epseptimavirus Marka...,115522,False,39.9,DNA,23-AUG-2023,179,26.256983,...,Demerecviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella enterica,


In [80]:
print(len(remaining["Host"].unique().tolist()))
print(remaining["Host"].unique().tolist())

88
['Salmonella', 'Synechococcus', 'Pseudanabaena', 'Mammaliicoccus', 'Mycobacteroides', 'Nostoc', 'Microcystis', 'Methanosarcina', 'Actinomycetia', 'Saccharolobus', 'Cylindrospermopsis', 'Sulfolobus', 'Metallosphaera', 'Liberibacter', 'Megaira', 'Dipodfec', 'Thermoproteus', 'Pyrobaculum', 'Methanocaldococcus', 'Acidianus', 'Spiroplasma', 'Elizabethkingia', 'Myxococcus', 'Janthinobacterium', 'Pelagibacter', 'Brochothrix', 'Sinorhizobium', 'Butyrivibrio', 'Lentibacter', 'Salinibacter', 'Eggerthella', 'Phormidium', 'Mycoplasma', 'Chlamydia', 'Hamiltonella', 'Edwardsiella', 'Nodularia', 'Raoultella', 'Faecalibacterium', 'Leclercia', 'Azobacteroides', 'Curvibacter', 'Morganella', 'Natrialba', 'Verrucomicrobia', 'Prochlorococcus', 'Hydrogenobaculum', 'Anabaena', 'Croceibacter', 'Oenococcus', 'Puniceispirillum', 'Tetrasphaera', 'Bdellovibrio', 'Thermococcus', 'Planktothrix', 'Sodalis', 'Silicibacter', 'Pyrococcus', 'Listonella', 'Kluyvera', 'Hungatella', 'Amedibacillus', 'Leptolyngbya', 'Huj

In [81]:
print(len(remaining["Isolation Host"].unique().tolist()))
print(remaining["Isolation Host"].unique().tolist())

153
['Salmonella enterica', 'Parasynechococcus marenigrum', 'Synechococcus sp.', 'Salmonella sp.', 'Pseudanabaena sp.', 'Mammaliicoccus sciuri', 'Mycobacteroides abscessus', 'Nostoc sp.', 'Microcystis sp.', 'Methanosarcina mazei', 'Actinomycetia bacterium', 'Saccharolobus shibatae', 'Cylindrospermopsis raciborskii', 'Sulfolobus beitou', 'Metallosphaera prunae', 'Candidatus Liberibacter', 'Candidatus Megaira', 'Dipodomys merriami', 'Thermoproteus sp.', 'Pyrobaculum arsenaticum', 'Methanocaldococcus fervens', 'Saccharolobus solfataricus', 'Metallosphaera sedula', 'Acidianus brierleyi', 'Circulifer haematoceps', 'Salmonella gallinarum', 'Sulfolobus tengchongensis', 'Elizabethkingia anophelis', 'Myxococcus xanthus', 'Janthinobacterium lividum', 'Sulfolobus sp.', 'Sulfolobus', 'Candidadtus Pelagibacter', 'Dipodomys merriami;', 'Brochothrix thermosphacta', 'Sinorhizobium meliloti', 'Butyrivibrio fibrisolvens', 'Lentibacter sp.', 'Synechococcus', 'Salinibacter ruber', 'Eggerthella lenta', 'Le

In [84]:
remaining

Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host,staining
69,OR413347,Salmonella phage EH7,Salmonella phage EH7 Viruses,110344,False,39.504,DNA,03-SEP-2023,157,32.484076,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Salmonella enterica,
70,OR413345,Salmonella phage EH4,Salmonella phage EH4 Viruses,108554,False,39.840,DNA,03-SEP-2023,154,29.220779,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Salmonella enterica,
71,OR413344,Salmonella phage EH3,Salmonella phage EH3 Viruses,115218,False,40.127,DNA,03-SEP-2023,162,30.864198,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Salmonella enterica,
72,OR413342,Salmonella phage EH1,Salmonella phage EH1 Viruses,109346,False,39.552,DNA,03-SEP-2023,153,31.372549,...,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,Unclassified,PHG,Salmonella enterica,
2301,OR067837,Salmonella phage KKP 3953,Salmonella phage KKP 3953 Epseptimavirus Marka...,115522,False,39.900,DNA,23-AUG-2023,179,26.256983,...,Demerecviridae,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Salmonella enterica,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26228,DQ785801,Sodalis phage phiSG1,Sodalis phage phiSG1 Caudoviricetes Uroviricot...,52162,False,50.826,DNA,20-MAR-2007,64,70.312500,...,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,PHG,Sodalis glossinidius,
26263,AJ854042,Acidianus filamentous virus 2,Acidianus filamentous virus 2 Deltalipothrixvi...,31787,False,35.656,DNA,14-NOV-2006,50,54.000000,...,Lipothrixviridae,Ligamenvirales,Tokiviricetes,Taleaviricota,Zilligvirae,Adnaviria,Group I,PHG,Acidianus pozzuoliensis,
26274,AJ414696,Sulfolobus islandicus rod-shaped virus 1,Sulfolobus islandicus rod-shaped virus 1 Iceru...,32308,False,25.269,DNA,18-MAR-2002,47,53.191489,...,Rudiviridae,Ligamenvirales,Tokiviricetes,Taleaviricota,Zilligvirae,Adnaviria,Group I,PHG,Sulfolobus islandicus,
26275,AJ344259,Sulfolobus islandicus rod-shaped virus 2,Sulfolobus islandicus rod-shaped virus 2 Iceru...,35450,False,25.199,DNA,15-APR-2005,54,55.555556,...,Rudiviridae,Ligamenvirales,Tokiviricetes,Taleaviricota,Zilligvirae,Adnaviria,Group I,PHG,Sulfolobus islandicus,


# Obtaining the remaining classification with Thomas pickle dict

In [55]:
import pickle

with open('../data/external/gram_pos_neg_dict.pkl', 'rb') as file:
    phage_dict = pickle.load(file)

file.close()

# Create a new genus_to_stain with updated values
phage_dict = {key: 'negative' if value == 'neg' else 'positive' for key, value in phage_dict.items()}

for key, value in list(phage_dict.items())[:3]:
    print(f'{key}: {value}')


Actinobacteria: positive
Actinomycetales: positive
Bifidobacterium: positive


Update the dataframe with Thomas dict

In [58]:
# Iterate through the DataFrame
for index, row in df_mapped.iterrows():
    if pd.isna(row['staining']):  # Check if 'staining' column is NaN
        host = row['Host']
        if host in phage_dict:
            df_mapped.at[index, 'staining'] = phage_dict[host]

drop = df_mapped.dropna(subset=['staining'])
shape4 = drop.shape
print(shape4)
print("Percentage of phages assigned a class using species+genus+thomas dict: ", str(shape4[0]/shape1[0]*100))

(10938, 28)
Percentage of phages assigned a class using species+genus+thomas dict:  96.16669597327238


In [57]:
all_gram

Unnamed: 0,species,Gram stain,Genus
0,Vallitalea guaymasensis,positive,Vallitalea
1,Acaricomes phytoseiuli,positive,Acaricomes
2,Acetanaerobacterium elongatum,positive,Acetanaerobacterium
3,Acetilactobacillus jinshanensis,positive,Acetilactobacillus
4,Acetivibrio mesophilus,positive,Acetivibrio
...,...,...,...
7536,Zoogloea oryzae,negative,Zoogloea
7537,Zooshikella ganghwensis,negative,Zooshikella
7538,Zunongwangia endophytica,negative,Zunongwangia
7539,Zunongwangia mangrovi,negative,Zunongwangia


In [40]:
remaining = df_mapped[df_mapped['staining'].isna()]
print(len(remaining["Host"].unique().tolist()))
print(remaining["Host"].unique().tolist())

57
['Pseudanabaena', 'Mammaliicoccus', 'Mycobacteroides', 'Nostoc', 'Methanosarcina', 'Actinomycetia', 'Saccharolobus', 'Cylindrospermopsis', 'Sulfolobus', 'Metallosphaera', 'Liberibacter', 'Megaira', 'Dipodfec', 'Thermoproteus', 'Pyrobaculum', 'Methanocaldococcus', 'Acidianus', 'Spiroplasma', 'Myxococcus', 'Janthinobacterium', 'Butyrivibrio', 'Salinibacter', 'Mycoplasma', 'Chlamydia', 'Faecalibacterium', 'Leclercia', 'Azobacteroides', 'Prochlorococcus', 'Hydrogenobaculum', 'Anabaena', 'Puniceispirillum', 'Thermococcus', 'Silicibacter', 'Pyrococcus', 'Listonella', 'Kluyvera', 'Hungatella', 'Amedibacillus', 'Leptolyngbya', 'Hujiaoplasma', 'Poriferisphaera', 'Plectonema', 'Cedecea', 'Citromicrobium', 'Kurthia', 'Lokiarchaeota', 'Pararheinheimera', 'Lentisphaerae', 'Solobacterium', 'Arthronema', 'Mastigocladus', 'Enterobacteria', 'Acholeplasma', 'Aphanizomenon', 'Xenohaliotis', 'Pelagibaca', 'Nitrososphaera']


In [41]:
print(len(remaining["Isolation Host"].unique().tolist()))
print(remaining["Isolation Host"].unique().tolist())

88
['Pseudanabaena sp.', 'Mammaliicoccus sciuri', 'Mycobacteroides abscessus', 'Nostoc sp.', 'Methanosarcina mazei', 'Actinomycetia bacterium', 'Saccharolobus shibatae', 'Cylindrospermopsis raciborskii', 'Sulfolobus beitou', 'Metallosphaera prunae', 'Candidatus Liberibacter', 'Candidatus Megaira', 'Dipodomys merriami', 'Thermoproteus sp.', 'Pyrobaculum arsenaticum', 'Methanocaldococcus fervens', 'Saccharolobus solfataricus', 'Metallosphaera sedula', 'Acidianus brierleyi', 'Circulifer haematoceps', 'Sulfolobus tengchongensis', 'Myxococcus xanthus', 'Janthinobacterium lividum', 'Sulfolobus sp.', 'Sulfolobus', 'Dipodomys merriami;', 'Butyrivibrio fibrisolvens', 'Salinibacter ruber', 'Mycoplasmopsis pulmonis', 'Chlamydophila pneumoniae', 'Faecalibacterium prausnitzii', 'Leclercia adecarboxylata', 'Candidatus Azobacteroides', 'Metallosphaera sp.', 'Sulfolobus islandicus', 'Acidianus sp.', 'Sulfolobales', 'Prochlorococcus sp.', 'Hydrogenobaculum', 'Anabaena variabilis', 'Sulfolobus solfatari

## Figuring out the remaining hosts

In [42]:
# List of genera with possible misspellings
genera_list = ['Pseudanabaena', 'Mammaliicoccus', 'Enteroccous', 'Mycobacteroides', 'Nostoc', 'Methanosarcina', 'Actinomycetia', 'Saccharolobus', 'Cylindrospermopsis', 'Sulfolobus', 'Metallosphaera', 'Liberibacter', 'Enterobacteria', 'Megaira', 'Dipodfec', 'Thermoproteus', 'Pyrobaculum', 'Methanocaldococcus', 'Acidianus', 'Spiroplasma', 'Myxococcus', 'Janthinobacterium', 'Butyrivibrio', 'Salinibacter', 'Mycoplasma', 'Chlamydia', 'Faecalibacterium', 'Leclercia', 'Azobacteroides', 'Prochlorococcus', 'Hydrogenobaculum', 'Anabaena', 'Puniceispirillum', 'Thermococcus', 'Silicibacter', 'Pyrococcus', 'Listonella', 'Kluyvera', 'Hungatella', 'Amedibacillus', 'Leptolyngbya', 'Hujiaoplasma', 'Poriferisphaera', 'Plectonema', 'Cedecea', 'Citromicrobium', 'Kurthia', 'Lokiarchaeota', 'Pararheinheimera', 'Lentisphaerae', 'Solobacterium', 'Arthronema', 'Mastigocladus', 'Acholeplasma', 'Aphanizomenon', 'Xenohaliotis', 'Pelagibaca', 'Nitrososphaera']

# Reference list of correctly spelled genera (you can create your own or use a reliable source)
reference_genera = ['Pseudanabaena', 'Mammaliicoccus', 'Enterococcus', 'Mycobacteroides', 'Nostoc', 'Methanosarcina', 'Actinomycetia', 'Sulfolobus', 'Metallosphaera', 'Liberibacter', 'Enterobacter', 'Megaira', 'Thermoproteus', 'Pyrobaculum', 'Methanocaldococcus', 'Acidianus', 'Spiroplasma', 'Myxococcus', 'Janthinobacterium', 'Butyrivibrio', 'Salinibacter', 'Mycoplasma', 'Chlamydia', 'Faecalibacterium', 'Leclercia', 'Azobacteroides', 'Prochlorococcus', 'Hydrogenobaculum', 'Anabaena', 'Puniceispirillum', 'Thermococcus', 'Silicibacter', 'Pyrococcus', 'Listonella', 'Kluyvera', 'Hungatella', 'Amedibacillus', 'Leptolyngbya', 'Hujiaoplasma', 'Poriferisphaera', 'Plectonema', 'Cedecea', 'Citromicrobium', 'Kurthia', 'Lokiarchaeota', 'Pararheinheimera', 'Lentisphaerae', 'Solobacterium', 'Arthronema', 'Mastigocladus', 'Acholeplasma', 'Aphanizomenon', 'Xenohaliotis', 'Pelagibaca', 'Nitrososphaera']

# Find misspelled genera
misspelled_genera = set(genera_list) - set(reference_genera)

# Print misspelled genera
print("Misspelled Genera:")
print(misspelled_genera)


Misspelled Genera:
{'Cylindrospermopsis', 'Dipodfec', 'Enterobacteria', 'Saccharolobus', 'Enteroccous'}


In [43]:
print(genera_list)

['Pseudanabaena', 'Mammaliicoccus', 'Enteroccous', 'Mycobacteroides', 'Nostoc', 'Methanosarcina', 'Actinomycetia', 'Saccharolobus', 'Cylindrospermopsis', 'Sulfolobus', 'Metallosphaera', 'Liberibacter', 'Enterobacteria', 'Megaira', 'Dipodfec', 'Thermoproteus', 'Pyrobaculum', 'Methanocaldococcus', 'Acidianus', 'Spiroplasma', 'Myxococcus', 'Janthinobacterium', 'Butyrivibrio', 'Salinibacter', 'Mycoplasma', 'Chlamydia', 'Faecalibacterium', 'Leclercia', 'Azobacteroides', 'Prochlorococcus', 'Hydrogenobaculum', 'Anabaena', 'Puniceispirillum', 'Thermococcus', 'Silicibacter', 'Pyrococcus', 'Listonella', 'Kluyvera', 'Hungatella', 'Amedibacillus', 'Leptolyngbya', 'Hujiaoplasma', 'Poriferisphaera', 'Plectonema', 'Cedecea', 'Citromicrobium', 'Kurthia', 'Lokiarchaeota', 'Pararheinheimera', 'Lentisphaerae', 'Solobacterium', 'Arthronema', 'Mastigocladus', 'Acholeplasma', 'Aphanizomenon', 'Xenohaliotis', 'Pelagibaca', 'Nitrososphaera']


In [44]:
remaining_genera_dict = {'Pseudanabaena': 'negative',
 'Mammaliicoccus': 'positive',
 'Mycobacteroides': 'positive',
 'Nostoc': 'negative',
 'Methanosarcina': 'remove',
 'Actinomycetia': 'positive',
 'Saccharolobus': 'remove',
 'Cylindrospermopsis': 'positive',
 'Sulfolobus': 'negative',
 'Metallosphaera': 'remove',
 'Liberibacter': 'negative',
 'Enterobacteria': 'remove',
 'Megaira': 'negative',
 'Dipodfec': 'remove',
 'Thermoproteus': 'remove',
 'Pyrobaculum': 'remove',
 'Methanocaldococcus': 'remove',
 'Acidianus': 'remove',
 'Spiroplasma': 'remove',
 'Myxococcus': 'negative',
 'Janthinobacterium': 'negative',
 'Butyrivibrio': 'remove',
 'Salinibacter': 'negative',
 'Mycoplasma': 'remove',
 'Chlamydia': 'negative',
 'Faecalibacterium': 'positive',
 'Leclercia': 'negative',
 'Azobacteroides': 'positive',
 'Prochlorococcus': 'negative',
 'Hydrogenobaculum': 'remove',
 'Anabaena': 'negative',
 'Puniceispirillum': 'remove',
 'Thermococcus': 'remove',
 'Silicibacter': 'negative',
 'Pyrococcus': 'negative',
 'Listonella': 'negative',
 'Kluyvera': 'negative',
 'Hungatella': 'positive',
 'Amedibacillus': 'positive',
 'Leptolyngbya': 'negative',
 'Hujiaoplasma': 'remove',
 'Poriferisphaera': 'remove',
 'Plectonema': 'negative',
 'Cedecea': 'negative',
 'Citromicrobium': 'negative',
 'Kurthia': 'positive',
 'Lokiarchaeota': 'remove',
 'Pararheinheimera': 'negative',
 'Lentisphaerae': 'negative',
 'Solobacterium': 'positive',
 'Arthronema': 'negative',
 'Mastigocladus': 'negative',
 'Acholeplasma': 'remove',
 'Aphanizomenon': 'negative',
 'Xenohaliotis': 'negative',
 'Pelagibaca': 'negative',
 'Nitrososphaera': 'remove'}

In [45]:
# Iterate through the DataFrame
for index, row in df_mapped.iterrows():
    if pd.isna(row['staining']):  # Check if 'staining' column is NaN
        host = row['Host']
        if host in remaining_genera_dict:
            df_mapped.at[index, 'staining'] = remaining_genera_dict[host]

drop = df_mapped.dropna(subset=['staining'])
shape5 = drop.shape
print(shape5)
print("Percentage of phages assigned a class using species+genus+thomas dict+remaining dict: ", str(shape5[0]/shape1[0]*100))

(11374, 4)
Percentage of phages assigned a class using species+genus+thomas dict+remaining dict:  100.0


In [46]:
# Remove the genera which should be removed
removed = df_mapped[df_mapped['staining'] == "remove"]
df_mapped = df_mapped[df_mapped['staining'] != "remove"]
print(removed)
print("Percentage of phages removed due to unclear genera in the last step: ", str(removed.shape[0]/shape1[0]*100))

      Accession            Host           Isolation Host staining
322    OR220913  Methanosarcina     Methanosarcina mazei   remove
323    OR220912  Methanosarcina     Methanosarcina mazei   remove
324    OR220911  Methanosarcina     Methanosarcina mazei   remove
382    OP999015   Saccharolobus   Saccharolobus shibatae   remove
580    OQ473815   Saccharolobus   Saccharolobus shibatae   remove
...         ...             ...                      ...      ...
14203  EF432053       Acidianus     Acidianus convivator   remove
14206  AF074945      Mycoplasma   Mycoplasma arthritidis   remove
14294  EU545650       Acidianus     Acidianus uzoniensis   remove
14305  SVU28974     Spiroplasma        Spiroplasma citri   remove
14335  AJ854042       Acidianus  Acidianus pozzuoliensis   remove

[218 rows x 4 columns]
Percentage of phages removed due to unclear genera in the last step:  1.916652013363812


In [47]:
# df_mapped.to_csv("../data/interim/gram_staining/staining_assignation.csv")

In [48]:
# Append the removed rows to the discarded DataFrame
discarded = pd.concat([discarded, removed.drop('staining', axis=1)], ignore_index=True)
print(discarded.shape)

# discarded.to_csv("../data/interim/discarded_phages.csv")

(15131, 3)


# Merging staining to full dataset

In [50]:
df = pd.read_csv("../data/interim/manipulation/phage_df_clean_unassigned.csv",index_col=0)
print(df.shape)
df.head()

(14341, 27)


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Sub-family,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host
31,MN335248,Vibrio phage XacF13,Vibrio phage XacF13 Xylivirus XacF13 Xylivirus...,7045,False,60.298,ss-DNA,21-OCT-2019,13,84.615385,...,Unclassified,Inoviridae,Tubulavirales,Faserviricetes,Hofneiviricota,Loebvirae,Monodnaviria,Group II,ENV,Xanthomonas vesicatoria
32,MK250029,Prevotella phage Lak-C1,Prevotella phage Lak-C1 Caudoviricetes Uroviri...,540217,True,25.796,DNA,13-JAN-2019,830,47.108434,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,ENV,Prevotella sp.
33,MK250028,Prevotella phage Lak-B9,Prevotella phage Lak-B9 Caudoviricetes Uroviri...,550053,True,26.012,DNA,13-JAN-2019,859,52.270081,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,ENV,Prevotella sp.
34,MK250027,Prevotella phage Lak-B8,Prevotella phage Lak-B8 Caudoviricetes Uroviri...,551627,True,26.022,DNA,13-JAN-2019,860,53.023256,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,ENV,Prevotella sp.
35,MK250026,Prevotella phage Lak-B7,Prevotella phage Lak-B7 Caudoviricetes Uroviri...,550702,True,26.02,DNA,13-JAN-2019,859,53.201397,...,Unclassified,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,ENV,Prevotella sp.


In [51]:
stain = pd.read_csv("../data/interim/gram_staining/staining_assignation.csv",index_col=0)
stain = stain[['Accession', 'staining']]
print(stain.shape)
stain.head()

(11156, 2)


Unnamed: 0,Accession,staining
0,MN335248,negative
1,MK250029,negative
2,MK250028,negative
3,MK250027,negative
4,MK250026,negative


In [52]:
merged_df = df.merge(stain, on='Accession', how= 'inner')
print(merged_df.shape)
merged_df.head()

(11156, 28)


Unnamed: 0,Accession,Description,Classification,Genome Length (bp),Jumbophage,molGC (%),Molecule,Modification Date,Number CDS,Positive Strand (%),...,Family,Order,Class,Phylum,Kingdom,Realm,Baltimore Group,Genbank Division,Isolation Host,staining
0,MN335248,Vibrio phage XacF13,Vibrio phage XacF13 Xylivirus XacF13 Xylivirus...,7045,False,60.298,ss-DNA,21-OCT-2019,13,84.615385,...,Inoviridae,Tubulavirales,Faserviricetes,Hofneiviricota,Loebvirae,Monodnaviria,Group II,ENV,Xanthomonas vesicatoria,negative
1,MK250029,Prevotella phage Lak-C1,Prevotella phage Lak-C1 Caudoviricetes Uroviri...,540217,True,25.796,DNA,13-JAN-2019,830,47.108434,...,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,ENV,Prevotella sp.,negative
2,MK250028,Prevotella phage Lak-B9,Prevotella phage Lak-B9 Caudoviricetes Uroviri...,550053,True,26.012,DNA,13-JAN-2019,859,52.270081,...,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,ENV,Prevotella sp.,negative
3,MK250027,Prevotella phage Lak-B8,Prevotella phage Lak-B8 Caudoviricetes Uroviri...,551627,True,26.022,DNA,13-JAN-2019,860,53.023256,...,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,ENV,Prevotella sp.,negative
4,MK250026,Prevotella phage Lak-B7,Prevotella phage Lak-B7 Caudoviricetes Uroviri...,550702,True,26.02,DNA,13-JAN-2019,859,53.201397,...,Unclassified,Unclassified,Caudoviricetes,Uroviricota,Heunggongvirae,Duplodnaviria,Group I,ENV,Prevotella sp.,negative


In [53]:
merged_df.columns

Index(['Accession', 'Description', 'Classification', 'Genome Length (bp)',
       'Jumbophage', 'molGC (%)', 'Molecule', 'Modification Date',
       'Number CDS', 'Positive Strand (%)', 'Negative Strand (%)',
       'Lowest Taxa', 'Genus', 'Sub-family', 'Family', 'Order', 'Class',
       'Phylum', 'Kingdom', 'Realm', 'Baltimore Group', 'Genbank Division',
       'Isolation Host', 'staining'],
      dtype='object')

In [54]:
# List of column names to access
desired_columns = ['Accession', 'staining', 'Genome Length (bp)', 'Jumbophage', 'molGC (%)', 'Molecule', 'Number CDS', 'Positive Strand (%)', 'Negative Strand (%)', 'Coding Capacity (%)', 'tRNAs']

# Access the desired columns using the correct syntax (list, not tuple)
merged_df = merged_df[desired_columns]

merged_df

Unnamed: 0,Accession,staining,Genome Length (bp),Jumbophage,molGC (%),Molecule,Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs
0,MN335248,negative,7045,False,60.298,ss-DNA,13,84.615385,15.384615,88.828957,0
1,MK250029,negative,540217,True,25.796,DNA,830,47.108434,52.891566,68.324951,30
2,MK250028,negative,550053,True,26.012,DNA,859,52.270081,47.729919,69.188424,29
3,MK250027,negative,551627,True,26.022,DNA,860,53.023256,46.976744,69.318761,33
4,MK250026,negative,550702,True,26.020,DNA,859,53.201397,46.798603,69.363285,33
...,...,...,...,...,...,...,...,...,...,...,...
11151,EF380009,negative,5386,False,44.653,ss-DNA,6,100.000000,0.000000,65.837356,0
11152,DQ490056,positive,26537,False,37.099,DNA,48,72.916667,27.083333,92.892942,0
11153,AJ414696,negative,32308,False,25.269,DNA,47,53.191489,46.808511,81.425653,0
11154,AJ344259,negative,35450,False,25.199,DNA,54,55.555556,44.444444,78.854725,0


In [55]:
#merged_df.to_csv("../data/processed/inphared_phage_data_clean.csv")

# Updating Thomas Dictionary

Load Thomas dict

In [58]:
import pickle

with open('../data/external/gram_pos_neg_dict.pkl', 'rb') as file:
    thomas_dict = pickle.load(file)

file.close()

# Create a new genus_to_stain with updated values
thomas_dict = {key: 'negative' if value == 'neg' else 'positive' for key, value in phage_dict.items()}

for key, value in list(thomas_dict.items())[:3]:
    print(f'{key}: {value}')

Actinobacteria: positive
Actinomycetales: positive
Bifidobacterium: positive


In [59]:
# Assuming staining_dict is your dictionary
dict_length = len(thomas_dict)

print("Length of the dictionary:", dict_length)

Length of the dictionary: 268


Load dict from staining

In [65]:
df_dict = pd.read_csv("../data/interim/gram_staining/staining_assignation.csv", index_col=0)

# Remove duplicate staining labels for each host
unique_staining = df_dict.drop_duplicates(subset=['Host', 'staining'])

# Group by 'Host' and get the first 'staining' label as a single string
staining_dict = unique_staining.groupby('Host')['staining'].first().to_dict()

staining_dict

{'Acaryochloris': 'negative',
 'Acetobacter': 'negative',
 'Achromobacter': 'negative',
 'Acidithiobacillus': 'negative',
 'Acidovorax': 'negative',
 'Acinetobacter': 'negative',
 'Actinomyces': 'positive',
 'Actinomycetia': 'positive',
 'Aeribacillus': 'positive',
 'Aerococcus': 'positive',
 'Aeromonas': 'negative',
 'Aeropyrum': 'negative',
 'Agathobaculum': 'positive',
 'Aggregatibacter': 'negative',
 'Agrobacterium': 'negative',
 'Akkermansia': 'negative',
 'Alcaligenes': 'negative',
 'Alicyclobacillus': 'positive',
 'Alishewanella': 'negative',
 'Alistipes': 'negative',
 'Alteromonas': 'negative',
 'Amedibacillus': 'positive',
 'Aminobacter': 'negative',
 'Anabaena': 'negative',
 'Anaerostipes': 'positive',
 'Anoxybacillus': 'positive',
 'Aphanizomenon': 'negative',
 'Aquamicrobium': 'negative',
 'Arcanobacterium': 'positive',
 'Arthrobacter': 'positive',
 'Arthronema': 'negative',
 'Aurantimonas': 'negative',
 'Azobacteroides': 'positive',
 'Azospirillum': 'negative',
 'Bacteroid

In [66]:
# Assuming staining_dict is your dictionary
dict_length = len(staining_dict)

print("Length of the dictionary:", dict_length)

Length of the dictionary: 248


# Update Thomas Dict with data coming from staining dataframe as well as raw all_gram data 

## Thomas dict vs staining dict

Seeing which elements one list of genera has that the other does not and vice versa

In [67]:
thomas_df = pd.DataFrame(thomas_dict.items(), columns=['Genus', 'Label'])
# thomas_list = thomas_df['Genus'].to_list()
# thomas_list

In [68]:
phage_df = pd.DataFrame(staining_dict.items(), columns=['Genus', 'Label'])
# phage_list = phage_df['Genus'].to_list()
# phage_list

In [69]:
# Find keys in dict1 that are not in dict2
keys_in_dict1_not_in_dict2 = set(thomas_dict.keys()) - set(staining_dict.keys())
print("Keys in thomas_dict not in staining_dict:", keys_in_dict1_not_in_dict2)
print(len(keys_in_dict1_not_in_dict2))
keys_in_dict2_not_in_dict1 = set(staining_dict.keys()) - set(thomas_dict.keys())
print("Keys in staining_dict not in thomas_dict:", keys_in_dict2_not_in_dict1)
print(len(keys_in_dict2_not_in_dict1))

Keys in thomas_dict not in staining_dict: {'Chromobacterium', 'Thermoactinomyces', 'Anaplasmataceae', 'Lawsonia', 'Frankia', 'Fibrobacter', 'Alphaproteobacteria', 'Betaproteobacteria', 'Pedobacter', 'Brachyspira', 'Rickettsiales', 'Bacillales', 'Streptomycetaceae', 'Bacillaceae', 'Chyrseobacterium', 'Actinomycetales', 'Piscirickettsiaceae', 'Tenericutes', 'Propionibacteriaceae', 'Pasteuria', 'Entercoccus', 'Faecalibacterium prausnitzii', 'Enterobacteriaceae', 'Gram-Positive', 'Peptoclostridium', 'Moritella', 'Flavobacterium', 'Zymomonas', 'Saccharibacteria', 'Catenulispora', 'Saccharopolyspora', 'Nocardiopsis', 'Brevibacillus', 'Planococcaceae', 'Gilliamella', 'Bartonellaceae', 'Sphingomonas', 'Gallionellaceae', 'Spirillum', 'Nocardiaceae', 'Enterococcaceae', 'Eikenella', 'Mycobacteriaceae', 'Gram-Negative', 'Micromonosporaceae', 'Yersini', 'Chlamydiaceae', 'Capnocytophaga', 'Sphingobacterium', 'Epsilonproteobacteria', 'Pseudonocardia', 'Salicola', 'Pasteurellaceae', 'Bartonella', 'Noc

In [70]:
# Merge the dictionaries
merged_dict = thomas_dict.copy()  # Make a copy of dict1 to avoid modifying it directly
merged_dict.update(staining_dict)  # Update with elements from dict2

print("Merged dictionary:", merged_dict)
print(len(merged_dict))

Merged dictionary: {'Actinobacteria': 'positive', 'Actinomycetales': 'positive', 'Bifidobacterium': 'positive', 'Brevibacterium': 'positive', 'Cellulomonas': 'positive', 'Clavibacter': 'positive', 'Corynebacterium': 'positive', 'Dermatophilus': 'positive', 'Frankia': 'positive', 'Gardnerella': 'negative', 'Gordonia': 'positive', 'Microbacterium': 'positive', 'Micrococcaceae': 'positive', 'Micromonosporaceae': 'positive', 'Mycobacteriaceae': 'positive', 'Nocardiaceae': 'positive', 'Nocardioides': 'positive', 'Nocardiopsis': 'positive', 'Propionibacteriaceae': 'positive', 'Pseudonocardia': 'positive', 'Saccharopolyspora': 'positive', 'Streptomycetaceae': 'positive', 'Tropheryma': 'positive', 'Bacillales': 'positive', 'Alicyclobacillus': 'positive', 'Bacillaceae': 'positive', 'Brevibacillus': 'positive', 'Brochothrix': 'positive', 'Gemella': 'positive', 'Listeria': 'positive', 'Paenibacillus': 'positive', 'Pasteuria': 'positive', 'Staphylococcaceae': 'positive', 'Thermoactinomyces': 'posi

In [71]:
# Replace 'positive' with 'pos' and 'negative' with 'neg'
for key, value in merged_dict.items():
    if value == 'positive':
        merged_dict[key] = 'pos'
    elif value == 'negative':
        merged_dict[key] = 'neg'

print("Merged dictionary:", merged_dict)
print(len(merged_dict))

Merged dictionary: {'Actinobacteria': 'pos', 'Actinomycetales': 'pos', 'Bifidobacterium': 'pos', 'Brevibacterium': 'pos', 'Cellulomonas': 'pos', 'Clavibacter': 'pos', 'Corynebacterium': 'pos', 'Dermatophilus': 'pos', 'Frankia': 'pos', 'Gardnerella': 'neg', 'Gordonia': 'pos', 'Microbacterium': 'pos', 'Micrococcaceae': 'pos', 'Micromonosporaceae': 'pos', 'Mycobacteriaceae': 'pos', 'Nocardiaceae': 'pos', 'Nocardioides': 'pos', 'Nocardiopsis': 'pos', 'Propionibacteriaceae': 'pos', 'Pseudonocardia': 'pos', 'Saccharopolyspora': 'pos', 'Streptomycetaceae': 'pos', 'Tropheryma': 'pos', 'Bacillales': 'pos', 'Alicyclobacillus': 'pos', 'Bacillaceae': 'pos', 'Brevibacillus': 'pos', 'Brochothrix': 'pos', 'Gemella': 'pos', 'Listeria': 'pos', 'Paenibacillus': 'pos', 'Pasteuria': 'pos', 'Staphylococcaceae': 'pos', 'Thermoactinomyces': 'pos', 'Gram-Positive': 'pos', 'Aerococcus': 'pos', 'Deinococcus': 'pos', 'Oenococcus': 'pos', 'Peptococcaceae': 'pos', 'Peptostreptococcus': 'pos', 'Planococcaceae': 'po

In [72]:
'''with open('../data/interim/gram_staining/updated_gram_pos_neg_dict.pkl', 'wb') as file:
    pickle.dump(merged_dict, file)'''

## Thomas dict vs all gram datast

In [62]:
import pickle

with open('../data/external/gram_pos_neg_dict.pkl', 'rb') as file:
    thomas_dict = pickle.load(file)

file.close()


thomas_dict

{'Actinobacteria': 'pos',
 'Actinomycetales': 'pos',
 'Bifidobacterium': 'pos',
 'Brevibacterium': 'pos',
 'Cellulomonas': 'pos',
 'Clavibacter': 'pos',
 'Corynebacterium': 'pos',
 'Dermatophilus': 'pos',
 'Frankia': 'pos',
 'Gardnerella': 'pos',
 'Gordonia': 'pos',
 'Microbacterium': 'pos',
 'Micrococcaceae': 'pos',
 'Micromonosporaceae': 'pos',
 'Mycobacteriaceae': 'pos',
 'Nocardiaceae': 'pos',
 'Nocardioides': 'pos',
 'Nocardiopsis': 'pos',
 'Propionibacteriaceae': 'pos',
 'Pseudonocardia': 'pos',
 'Saccharopolyspora': 'pos',
 'Streptomycetaceae': 'pos',
 'Tropheryma': 'pos',
 'Bacillales': 'pos',
 'Alicyclobacillus': 'pos',
 'Bacillaceae': 'pos',
 'Brevibacillus': 'pos',
 'Brochothrix': 'pos',
 'Gemella': 'pos',
 'Listeria': 'pos',
 'Paenibacillus': 'pos',
 'Pasteuria': 'pos',
 'Staphylococcaceae': 'pos',
 'Thermoactinomyces': 'pos',
 'Gram-Positive': 'pos',
 'Aerococcus': 'pos',
 'Deinococcus': 'pos',
 'Oenococcus': 'pos',
 'Peptococcaceae': 'pos',
 'Peptostreptococcus': 'pos',
 

In [66]:
all_gram = pd.read_csv('../data/interim/all_gram_stains_fixed_species.csv')

# Create a mapping dictionary from species to Gram stain
genus_to_stain = dict(zip(all_gram['Genus'], all_gram['Gram stain']))

# Create a new genus_to_stain with updated values
genus_to_stain = {key: 'neg' if value == 'negative' else 'pos' for key, value in genus_to_stain.items()}

genus_to_stain

{'Vallitalea': 'neg',
 'Acaricomes': 'pos',
 'Acetanaerobacterium': 'pos',
 'Acetilactobacillus': 'pos',
 'Acetivibrio': 'pos',
 'Acidimicrobium': 'pos',
 'Acidipropionibacterium': 'pos',
 'Acrocarpospora': 'pos',
 'Actinoallomurus': 'pos',
 'Actinoalloteichus': 'pos',
 'Actinobaculum': 'pos',
 'Actinocatenispora': 'pos',
 'Actinocorallia': 'pos',
 'Actinocrinis': 'pos',
 'Actinocrispum': 'pos',
 'Actinokineospora': 'pos',
 'Actinomadura': 'neg',
 'Actinomyces': 'pos',
 'Actinomycetospora': 'pos',
 'Actinophytocola': 'pos',
 'Actinoplanes': 'neg',
 'Actinopolymorpha': 'pos',
 'Actinopolyspora': 'pos',
 'Actinorectispora': 'pos',
 'Actinorhabdospora': 'pos',
 'Actinospica': 'pos',
 'Actinosynnema': 'pos',
 'Actinotalea': 'pos',
 'Actinotignum': 'pos',
 'Acutalibacter': 'pos',
 'Adlercreutzia': 'pos',
 'Aeribacillus': 'pos',
 'Aeriscardovia': 'pos',
 'Aerococcus': 'pos',
 'Aeromicrobium': 'pos',
 'Aestuariimicrobium': 'pos',
 'Agathobaculum': 'pos',
 'Agrilactobacillus': 'pos',
 'Agrococ

In [67]:
# Find keys in dict1 that are not in dict2
keys_in_dict1_not_in_dict2 = set(thomas_dict.keys()) - set(genus_to_stain.keys())
print("Keys in thomas_dict not in genus_to_stain:", keys_in_dict1_not_in_dict2)
print(len(keys_in_dict1_not_in_dict2))
keys_in_dict2_not_in_dict1 = set(genus_to_stain.keys()) - set(thomas_dict.keys())
print("Keys in genus_to_stain not in thomas_dict:", keys_in_dict2_not_in_dict1)
print(len(keys_in_dict2_not_in_dict1))

Keys in thomas_dict not in genus_to_stain: {'Cyanobacteria', 'Pasteuria', 'Micrococcaceae', 'Nocardiaceae', 'Cyanophage', 'Planococcaceae', 'Streptomycetaceae', 'Cellulophag', 'Gallionellaceae', 'Entomoplasmatales', 'Bacteroidaceae', 'Verrucomicrobia', 'Methylophilales', 'Piscirickettsiaceae', 'Phormidium', 'Alphaproteobacteria', 'Cardiobacteriaceae', 'Pelagibacter', 'Saccharibacteria', 'Bacillaceae', 'TM7', 'CrAssphage', 'Thiobacimonas', 'Mycobacteriaceae', 'Aeromonadaceae', 'Methylosinus', 'Spirillum', 'Bacillales', 'Zymomonas', 'Rhizobi', 'Hamiltonella', 'Gammaproteobacteria', 'Entercoccus', 'Ornithobacterium', 'Alteromonadaceae', 'Planktothrix', 'Aerococcaceae', 'Enterobacteriaceae', 'Acidobacteria', 'Curvibacter', 'Planctomycetales', 'Thermoactinomyces', 'Buttiauxella', 'Oenococcus', 'Chlamydiales', 'Desulfovibrionaceae', 'Methylophilaceae', 'Epsilonproteobacteria', 'Micromonosporaceae', 'Tenericutes', 'Gram-Positive', 'Sphaerotilus', 'Chyrseobacterium', 'Synechococcus', 'Chroococ

In [68]:
print("Length of dictionary before adding mine: ",len(thomas_dict))

# Merge the dictionaries
merged_dict = thomas_dict.copy()  # Make a copy of dict1 to avoid modifying it directly
merged_dict.update(genus_to_stain)  # Update with elements from dict2

# Get the shape of dict2 before the addition
print("Length of dictionary after adding mine: ",len(merged_dict))

print("Merged dictionary:", merged_dict)


Length of dictionary before adding mine:  268
Length of dictionary after adding mine:  2265
Merged dictionary: {'Actinobacteria': 'pos', 'Actinomycetales': 'pos', 'Bifidobacterium': 'pos', 'Brevibacterium': 'pos', 'Cellulomonas': 'neg', 'Clavibacter': 'pos', 'Corynebacterium': 'pos', 'Dermatophilus': 'pos', 'Frankia': 'pos', 'Gardnerella': 'neg', 'Gordonia': 'pos', 'Microbacterium': 'pos', 'Micrococcaceae': 'pos', 'Micromonosporaceae': 'pos', 'Mycobacteriaceae': 'pos', 'Nocardiaceae': 'pos', 'Nocardioides': 'neg', 'Nocardiopsis': 'pos', 'Propionibacteriaceae': 'pos', 'Pseudonocardia': 'pos', 'Saccharopolyspora': 'pos', 'Streptomycetaceae': 'pos', 'Tropheryma': 'pos', 'Bacillales': 'pos', 'Alicyclobacillus': 'pos', 'Bacillaceae': 'pos', 'Brevibacillus': 'pos', 'Brochothrix': 'pos', 'Gemella': 'pos', 'Listeria': 'pos', 'Paenibacillus': 'neg', 'Pasteuria': 'pos', 'Staphylococcaceae': 'pos', 'Thermoactinomyces': 'pos', 'Gram-Positive': 'pos', 'Aerococcus': 'pos', 'Deinococcus': 'neg', 'Oen

In [69]:
'''with open('../data/interim/full_updated_gram_pos_neg_dict.pkl', 'wb') as file:
    pickle.dump(merged_dict, file)'''