In [43]:
import numpy as np
import pandas as pd
import os
os.getcwd()

'/Users/Alvaro/Library/Mobile Documents/com~apple~CloudDocs/DTU/Autumn 2023/Phage project/phage/notebooks'

In [44]:
df = pd.read_csv("../data/raw/inphared_8Sep2023/8Sep2023_data.tsv", sep="\t")
df.rename(columns={'Isolation Host (beware inconsistent and nonsense values)': 'Isolation Host'}, inplace=True)
print("Shape before filtering", df.shape)
df = df[['Accession','Host','Isolation Host']]
df = df[df['Host'] != "Unspecified"] 
print("Shape after filtering unspecified hosts", df.shape)
df = df[df['Isolation Host'] != "Unspecified"]
print("Shape after filtering unspecified isolation hosts", df.shape)
df.head()

Shape before filtering (26287, 27)
Shape after filtering unspecified hosts (23367, 3)
Shape after filtering unspecified isolation hosts (14341, 3)


Unnamed: 0,Accession,Host,Isolation Host
31,MN335248,Vibrio,Xanthomonas vesicatoria
32,MK250029,Prevotella,Prevotella sp.
33,MK250028,Prevotella,Prevotella sp.
34,MK250027,Prevotella,Prevotella sp.
35,MK250026,Prevotella,Prevotella sp.


In [45]:
all_gram = pd.read_csv("../data/interim/all_gram_stains_fixed_species.csv")
all_gram.head()

Unnamed: 0,species,Gram stain,Genus
0,Vallitalea guaymasensis,positive,Vallitalea
1,Acaricomes phytoseiuli,positive,Acaricomes
2,Acetanaerobacterium elongatum,positive,Acetanaerobacterium
3,Acetilactobacillus jinshanensis,positive,Acetilactobacillus
4,Acetivibrio mesophilus,positive,Acetivibrio


# Excluding duplicated genera and species

In [46]:
exclude_genus = ['Clostridium', 'Neobacillus', 'Alteribacter', 'Desulfotomaculum', 'Caloramator', 'Desulforamulus', 'Heyndrickxia', 'Peptoclostridium', 'Thermoanaerobacter', 'Thermoanaerobacterium', 'Aureimonas', 'Actinomadura', 'Alkalibacterium', 'Deinococcus', 'Tepidibacillus', 'Sphingomonas', 'Lysinibacillus', 'Ruminiclostridium', 'Caldicellulosiruptor', 'Pseudomonas', 'Streptococcus', 'Microlunatus', 'Streptomyces', 'Butyricimonas', 'Halalkalibacter', 'Chelativorans', 'Natrinema', 'Ureibacillus', 'Clostridioides', 'Desulfosporosinus', 'Lacibacter', 'Nocardioides', 'Siminovitchia', 'Belliella', 'Tistlia', 'Actinoplanes', 'Paenibacillus', 'Vallitalea', 'Actinotalea', 'Cohnella', 'Rhizobium', 'Anaerotignum', 'Cellulomonas', 'Flavobacterium', 'Bacillus', 'Nesterenkonia']
exclude_species = ['Ureibacillus massiliensis', 'Tistlia consotensis', 'Clostridioides difficile', 'Vallitalea guaymasensis', 'Belliella pelovolcani', 'Actinotalea ferrariae']

print("Shape of the dataframe before deleting the common genera: ", df.shape)
df = df[~df['Host'].isin(exclude_genus)]
print("Shape of the dataframe after deleting the common genera: ", df.shape)
df = df[~df['Isolation Host'].isin(exclude_species)]
shape1 = df.shape
print("Shape of the dataframe after deleting the common species: ", shape1[1])


Shape of the dataframe before deleting the common genera:  (14341, 3)
Shape of the dataframe after deleting the common genera:  (11373, 3)
Shape of the dataframe after deleting the common species:  3


# First mapping: species to isolation host

In [47]:
# Only keep first two strings in each row of Isolation Host to reduce name incompatibilities
# Split each value on space and select the first two elements
df['Isolation Host'] = df['Isolation Host'].str.split().str[:2].str.join(' ')
df

Unnamed: 0,Accession,Host,Isolation Host
31,MN335248,Vibrio,Xanthomonas vesicatoria
32,MK250029,Prevotella,Prevotella sp.
33,MK250028,Prevotella,Prevotella sp.
34,MK250027,Prevotella,Prevotella sp.
35,MK250026,Prevotella,Prevotella sp.
...,...,...,...
26263,AJ854042,Acidianus,Acidianus pozzuoliensis
26266,DQ490056,Lactococcus,Lactococcus lactis
26274,AJ414696,Sulfolobus,Sulfolobus islandicus
26275,AJ344259,Sulfolobus,Sulfolobus islandicus


In [48]:
# Create a mapping dictionary from species to Gram stain
species_to_stain = dict(zip(all_gram['species'], all_gram['Gram stain']))
species_to_stain

{'Vallitalea guaymasensis': 'positive',
 'Acaricomes phytoseiuli': 'positive',
 'Acetanaerobacterium elongatum': 'positive',
 'Acetilactobacillus jinshanensis': 'positive',
 'Acetivibrio mesophilus': 'positive',
 'Acetivibrio saccincola': 'positive',
 'Acidimicrobium ferrooxidans': 'positive',
 'Acidipropionibacterium olivae': 'positive',
 'Acidipropionibacterium virtanenii': 'positive',
 'Acrocarpospora phusangensis': 'positive',
 'Actinoallomurus acaciae': 'positive',
 'Actinoallomurus acanthiterrae': 'positive',
 'Actinoallomurus amamiensis': 'positive',
 'Actinoallomurus caesius': 'positive',
 'Actinoallomurus fulvus': 'positive',
 'Actinoallomurus iriomotensis': 'positive',
 'Actinoallomurus luridus': 'positive',
 'Actinoallomurus oryzae': 'positive',
 'Actinoallomurus purpureus': 'positive',
 'Actinoallomurus spadix': 'positive',
 'Actinoallomurus yoronensis': 'positive',
 'Actinoalloteichus spitiensis': 'positive',
 'Actinobaculum massiliense': 'positive',
 'Actinobaculum suis':

In [49]:
# Use the mapping to create the 'staining' column in df
df_mapped = df.copy(deep=True)
df_mapped['staining'] = df_mapped['Isolation Host'].map(species_to_stain)
drop = df_mapped.dropna(subset=['staining'])
shape2 = drop.shape
print(shape2)
print("Percentage of phages assigned a class based solely on species: ", str(shape2[1]/shape1[1]*100))
print("Percentage of the all_gram dataset used to assign a class based solely on species: ", str(shape2[1]/all_gram.shape[1]*100))

(2558, 4)
Percentage of phages assigned a class based solely on species:  133.33333333333331
Percentage of the all_gram dataset used to assign a class based solely on species:  133.33333333333331


# Second mapping: using genus and Host

In [50]:
# Create a mapping dictionary from species to Gram stain
genus_to_stain = dict(zip(all_gram['Genus'], all_gram['Gram stain']))
genus_to_stain

{'Vallitalea': 'negative',
 'Acaricomes': 'positive',
 'Acetanaerobacterium': 'positive',
 'Acetilactobacillus': 'positive',
 'Acetivibrio': 'positive',
 'Acidimicrobium': 'positive',
 'Acidipropionibacterium': 'positive',
 'Acrocarpospora': 'positive',
 'Actinoallomurus': 'positive',
 'Actinoalloteichus': 'positive',
 'Actinobaculum': 'positive',
 'Actinocatenispora': 'positive',
 'Actinocorallia': 'positive',
 'Actinocrinis': 'positive',
 'Actinocrispum': 'positive',
 'Actinokineospora': 'positive',
 'Actinomadura': 'negative',
 'Actinomyces': 'positive',
 'Actinomycetospora': 'positive',
 'Actinophytocola': 'positive',
 'Actinoplanes': 'negative',
 'Actinopolymorpha': 'positive',
 'Actinopolyspora': 'positive',
 'Actinorectispora': 'positive',
 'Actinorhabdospora': 'positive',
 'Actinospica': 'positive',
 'Actinosynnema': 'positive',
 'Actinotalea': 'positive',
 'Actinotignum': 'positive',
 'Acutalibacter': 'positive',
 'Adlercreutzia': 'positive',
 'Aeribacillus': 'positive',
 'Aer

In [56]:
# Use the mapping to create the 'staining' column in df
df_mapped['staining'] = df_mapped['Host'].map(genus_to_stain)
drop = df_mapped.dropna(subset=['staining'])
shape3 = drop.shape
print(shape3)
print("Percentage of phages assigned a class based using species+genus: ", str(shape3[0]/shape1[0]*100))
print("Percentage of the all_gram dataset used to assign a class based on species+genus: ", str(shape3[0]/all_gram.shape[0]*100))

(9049, 4)
Percentage of phages assigned a class based using species+genus:  79.56563791435856
Percentage of the all_gram dataset used to assign a class based on species+genus:  119.99734783185254


# Exploring the remaining phages with no assignation

In [52]:
remaining = df_mapped[(df_mapped["staining"] != "positive") & (df_mapped["staining"] != "negative")]
remaining.head()

Unnamed: 0,Accession,Host,Isolation Host,staining
69,OR413347,Salmonella,Salmonella enterica,
70,OR413345,Salmonella,Salmonella enterica,
71,OR413344,Salmonella,Salmonella enterica,
72,OR413342,Salmonella,Salmonella enterica,
2301,OR067837,Salmonella,Salmonella enterica,


In [58]:
print(len(remaining["Host"].unique().tolist()))
print(remaining["Host"].unique().tolist())

89
['Salmonella', 'Synechococcus', 'Pseudanabaena', 'Mammaliicoccus', 'Enteroccous', 'Mycobacteroides', 'Nostoc', 'Microcystis', 'Methanosarcina', 'Actinomycetia', 'Saccharolobus', 'Cylindrospermopsis', 'Sulfolobus', 'Metallosphaera', 'Liberibacter', 'Enterobacteria', 'Megaira', 'Dipodfec', 'Thermoproteus', 'Pyrobaculum', 'Methanocaldococcus', 'Acidianus', 'Spiroplasma', 'Elizabethkingia', 'Myxococcus', 'Janthinobacterium', 'Pelagibacter', 'Brochothrix', 'Sinorhizobium', 'Butyrivibrio', 'Lentibacter', 'Salinibacter', 'Eggerthella', 'Phormidium', 'Mycoplasma', 'Chlamydia', 'Hamiltonella', 'Edwardsiella', 'Nodularia', 'Raoultella', 'Faecalibacterium', 'Leclercia', 'Azobacteroides', 'Curvibacter', 'Morganella', 'Natrialba', 'Verrucomicrobia', 'Prochlorococcus', 'Hydrogenobaculum', 'Anabaena', 'Croceibacter', 'Oenococcus', 'Puniceispirillum', 'Tetrasphaera', 'Bdellovibrio', 'Thermococcus', 'Planktothrix', 'Sodalis', 'Silicibacter', 'Pyrococcus', 'Listonella', 'Kluyvera', 'Hungatella', 'Ame

In [59]:
print(len(remaining["Isolation Host"].unique().tolist()))
print(remaining["Isolation Host"].unique().tolist())

159
['Salmonella enterica', 'Parasynechococcus marenigrum', 'Synechococcus sp.', 'Salmonella sp.', 'Pseudanabaena sp.', 'Mammaliicoccus sciuri', 'Enterococcus faecalis', 'Mycobacteroides abscessus', 'Nostoc sp.', 'Microcystis sp.', 'Methanosarcina mazei', 'Actinomycetia bacterium', 'Saccharolobus shibatae', 'Cylindrospermopsis raciborskii', 'Sulfolobus beitou', 'Metallosphaera prunae', 'Candidatus Liberibacter', 'Escherichia coli', 'Candidatus Megaira', 'Dipodomys merriami', 'Thermoproteus sp.', 'Pyrobaculum arsenaticum', 'Methanocaldococcus fervens', 'Saccharolobus solfataricus', 'Metallosphaera sedula', 'Acidianus brierleyi', 'Circulifer haematoceps', 'Salmonella gallinarum', 'Sulfolobus tengchongensis', 'Elizabethkingia anophelis', 'Myxococcus xanthus', 'Janthinobacterium lividum', 'Sulfolobus sp.', 'Sulfolobus', 'Candidadtus Pelagibacter', 'Dipodomys merriami;', 'Brochothrix thermosphacta', 'Sinorhizobium meliloti', 'Butyrivibrio fibrisolvens', 'Lentibacter sp.', 'Synechococcus', '

In [55]:
all_gram[all_gram['species'] == "Salmonella"]

Unnamed: 0,species,Gram stain,Genus
