In [None]:
#### Eugenio Perez Molphe Montoya ####
#### 28.10.20204 ####
#### Get the df with the accesion numbers and the presence/absence of the genes ####

In [27]:
### Packages
import pandas as pd
from Bio import Entrez
import time
import numpy as np

In [28]:
### Functions

def fetch_genbank_assembly_from_sample(biosample_codes):
    """
    Get the Genbank accession number based on the NCBI code
    """
    Entrez.email = "eugenio.perez@mls.uzh.ch"
    genbank_assemblies = {}
    failedSearches = []
    max_retries = 50
    retry_delay = 5
    for sample_code in biosample_codes:
        attempt = 0
        while attempt < max_retries:
            try:
                # Step 1: Search the Assembly database linked to the BioSample
                handle = Entrez.esearch(db="assembly", term=f"{sample_code}[biosample]")
                record = Entrez.read(handle)
                handle.close()

                # Check if any assembly IDs were found
                if record['IdList']:
                    assembly_id = record['IdList'][0]

                    # Step 2: Fetch the GenBank assembly accession using the assembly ID
                    handle = Entrez.esummary(db="assembly", id=assembly_id, report="full")
                    summary_record = Entrez.read(handle)
                    handle.close()
                    print(summary_record['DocumentSummarySet']['DocumentSummary'][0]['Synonym']['Genbank'])
                    genbank_assembly = summary_record['DocumentSummarySet']['DocumentSummary'][0]['Synonym']['Genbank']
                    genbank_assemblies[sample_code] = genbank_assembly

                    attempt = max_retries
                else:
                    attempt += 1
                    if attempt < max_retries:
                        print(f"No assembly found for BioSample {sample_code}. Retrying in {retry_delay} seconds...")
                        time.sleep(retry_delay)
                    else:
                        genbank_assemblies[sample_code] = sample_code
                        failedSearches.append(sample_code)
                        print(f"No assembly found for BioSample {sample_code}")

            except Exception as e:
                attempt += 1
                if attempt < max_retries:
                    print(f"Error fetching data for BioSample {sample_code}: {e}. Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                else:
                    print(f"Error fetching data for BioSample {sample_code}: {e}")
                    genbank_assemblies[sample_code] = sample_code
                    failedSearches.append(sample_code)

    print(f"Failed searches: {failedSearches}")
    return genbank_assemblies

In [29]:
### Paths

# The original table 6 with the books concatenated
enteroPath = '/mnt/mnemo5/eugenio/IroN_project/Files/Enterobacteriaceae_iron_genes.tsv'

# The sample numbers of the Enterobacteriaceae genomes
taxEnteroPath = '/mnt/mnemo5/eugenio/IroN_project/Files/04_Blast_annotations/enterobacteriaceae.identities.tsv'

# The accession codes of the Salmonella genomes
salmoPath = '/mnt/mnemo5/eugenio/IroN_project/Files/04_Blast_annotations/salmonella_blasted_sequences.ncbi_codes.txt'

# The NCBI Genbank accession numbers of the Escherichia genomes
eColiPath = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/genomes_w_double_checked_iroN.txt'

# The list with the E. coli genomes that have a double checked annotations of iroN
eColiDoubleCheck = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/genomes_w_double_checked_iroN.txt'

# The promoters of the Enterobacteriaceae genomes
promEnteroPath = '/mnt/mnemo5/eugenio/IroN_project/Files/11_DFs_promoters/enterobacteriaceae_110.tsv'

# The promoters of the Salmonella genomes
promSalmoPath = '/mnt/mnemo5/eugenio/IroN_project/Files/11_DFs_promoters/salmonella_110.tsv'

# The NCBI codes for the salmonella genomes
pathNCBIcodesSalmo = '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/Salmonella_header_NCBIcode.csv'

# The promoters of the Escherichia genomes
promEscherichiaPath = '/mnt/mnemo5/eugenio/IroN_project/Files/11_DFs_promoters/escherichia_110.tsv'


In [30]:
#### Read the file with the names of the genomes that were double checked to effectively have iroN, I'll use it to filter out the genomes that we're still doubtful ####
#### This step is important, because it incorporates if the iroN hit is trustworthy and if it is, will be included as such ####

pathFilteredPromoters =  '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/filtered_promoters.txt'

with open(pathFilteredPromoters, 'r') as f:
    promoters = f.readlines()

for idx in range(len(promoters)):
    promoters[idx] = promoters[idx].strip()

headerToNCBIcode = {}
with open(pathNCBIcodesSalmo, 'r') as f:
    lines = f.readlines()
    for line in lines:
        header, NCBIcode = line.strip().split(',')
        headerToNCBIcode[header] = NCBIcode

for idx in range(len(promoters)):
    if promoters[idx] in headerToNCBIcode:
        promoters[idx] = headerToNCBIcode[promoters[idx]]

# And transform it into a set
qcSet = set(promoters)
print(promoters)

# The one with the E. coli genomes that have a double checked annotations of iroN
with open(eColiDoubleCheck, 'r') as f:
    eColiDouble = f.readlines()
eColiDouble = [x.strip() for x in eColiDouble]

# Get the NCBI code translator for salmonella
headerToNCBIcodeSalmo = pd.read_csv(pathNCBIcodesSalmo, header = None)

['SAMN13494091', 'SAMN15566989', 'SAMN07690113', 'GCA_014244715.1', 'SAMD00020672', 'SAMN15589939', 'GCA_014244955.1', 'SAMN11855534', 'SAMN09210776', 'SAMN14410566', 'GCA_014165615.1', 'SAMN10435575', 'GCA_014244595.1', 'SAMEA7313125', 'SAMEA2273685', 'GCA_014297915.1', 'SAMN05770934', 'GCA_013703345.1', 'GCA_014042825.1', 'SAMN10644675', 'SAMN16233353', 'GCA_013702675.1', 'GCA_013555915.1', 'SAMN15148584', 'SAMN15148721', 'SAMN14082844', 'SAMEA6657193', 'GCA_014067055.1', 'SAMN11505997', 'SAMN16233093', 'SAMN15907161', 'SAMN11928082', 'SAMN08395900', 'SAMN13943464', 'SAMN14257459', 'SAMN16205307', 'SAMN13621907', 'SAMN14640086', 'GCA_013870775.1', 'GCA_014154475.1', 'GCA_013703645.1', 'SAMN11506044', 'SAMN03733752', 'GCA_013679235.1', 'GCA_014165235.1', 'SAMN05770933', 'SAMN09650196', 'SAMN10743647', 'SAMN10435593', 'GCA_013907745.1', 'GCA_013702145.1', 'SAMN05441416', 'SAMN15148564', 'GCA_013482925.1', 'SAMN15148588', 'GCA_014062545.1', 'SAMN16233300', 'GCA_014153405.1', 'GCA_013703

In [31]:
##### Read the files (the original table and the lists of genomes with iroN) #####

# The original table 6 with the books concatenated
df = pd.read_csv(enteroPath, sep = '\t')
print(df)


# The sample numbers of the Enterobacteriaceae genomes in df with taxonomy code / sample code
enteroTaxInfo = pd.read_csv(taxEnteroPath, header=None, sep = '\t')
enteroTaxInfo = enteroTaxInfo.rename(columns = {0: 'Taxonomy_numbers', 1: 'Sample'})
print(enteroTaxInfo)

# The salmonella accession numbers with iroN
with open(salmoPath) as f:
    salmonellaTaxInfo = f.readlines()
salmonellaTaxInfo = [i.strip() for i in salmonellaTaxInfo]

print(salmonellaTaxInfo[0:10])

# The E. coli genomes
with open(eColiPath, 'r') as f:
    eColi = f.readlines()

eColi = [i.strip() for i in eColi]
eColi = [i.split('_')[0] + '_' + i.split('_')[1] for i in eColi]
print(eColi[0:10])

             accession              family        genus             species
0      GCA_000167875.2  Enterobacteriaceae  Escherichia    Escherichia coli
1      GCA_000167915.2  Enterobacteriaceae  Escherichia    Escherichia coli
2      GCA_000181735.1  Enterobacteriaceae  Escherichia    Escherichia coli
3      GCA_000183005.2  Enterobacteriaceae  Escherichia    Escherichia coli
4      GCA_000184185.1  Enterobacteriaceae  Escherichia    Escherichia coli
...                ...                 ...          ...                 ...
19170  GCA_002033765.1  Enterobacteriaceae   Salmonella  Salmonella bongori
19171  GCA_008485645.2  Enterobacteriaceae   Salmonella  Salmonella bongori
19172  GCA_013693605.1  Enterobacteriaceae   Salmonella  Salmonella bongori
19173  GCA_013138145.1  Enterobacteriaceae   Salmonella  Salmonella bongori
19174  GCA_013693595.1  Enterobacteriaceae   Salmonella  Salmonella bongori

[19175 rows x 4 columns]
     Taxonomy_numbers          Sample
0             1128983   

In [32]:
#### The Enterobacteriaceae accession numbers are actually sample codes, not GenBank accession numbers, so I'll get the later ####

samples = enteroTaxInfo['Sample'].tolist()
print(samples)
genbankCodes = fetch_genbank_assembly_from_sample(samples)

# Add the genbank codes to the dataframe
enteroTaxInfo['NCBI_codes'] = enteroTaxInfo['Sample'].map(genbankCodes)

# Obtain the repeated genomes
repeatedGenomes = enteroTaxInfo[enteroTaxInfo.duplicated(subset = 'Sample', keep = False)]
print(repeatedGenomes)

# Get a list with the NCBI codes of the genomes
enteroIroN = enteroTaxInfo['NCBI_codes'].tolist()

['SAMN01814043', 'SAMD00093683', 'SAMEA104567719', 'SAMEA2273639', 'SAMEA7847578', 'SAMN04521924', 'SAMN07319176', 'SAMN08383930', 'SAMN12212157', 'SAMN12212295', 'SAMN16729814', 'SAMN02603118', 'SAMN02927897', 'SAMN14987543', 'SAMN02952934', 'SAMN02367601', 'SAMN11879372', 'SAMN02597163', 'SAMN00990756', 'SAMN03700895', 'SAMN02367598', 'SAMN10686380', 'SAMN02470245', 'SAMN02603391', 'SAMN02472168', 'SAMN04160809', 'SAMN03283677', 'SAMN03283682', 'SAMN06218067', 'SAMN15680779', 'SAMN15680854', 'SAMN17371885', 'SAMN19316154', 'SAMN01933082', 'SAMN01933083', 'SAMN02261881', 'SAMN03081475', 'SAMN02356630', 'SAMN02641225', 'SAMN19340354', 'SAMEA104567865', 'SAMEA3672999', 'SAMN07291408', 'SAMN13677595', 'SAMN19797024', 'SAMD00020672', 'SAMD00020673', 'SAMN03159353', 'SAMN03431277', 'SAMN03031216', 'SAMN17372131', 'SAMD00089491', 'SAMEA104225364', 'SAMEA104567255', 'SAMEA104567451', 'SAMEA1920554', 'SAMEA2053723', 'SAMEA2054051', 'SAMEA2054767', 'SAMEA2054832', 'SAMEA2054917', 'SAMEA2273148

In [33]:
#### Now, let's read the promoters and process them ####

# Read the promoters dfs
promEntero = pd.read_csv(promEnteroPath, sep = '\t')
promSalmo = pd.read_csv(promSalmoPath, sep = '\t')
promEscherichia = pd.read_csv(promEscherichiaPath, sep = '\t')

# Rename the column with the genome names
promEntero = promEntero.rename(columns = {'Unnamed: 0': 'Genome'})
promSalmo = promSalmo.rename(columns = {'Unnamed: 0': 'Genome'})
promEscherichia = promEscherichia.rename(columns={'Unnamed: 0' : 'Genome'})

# So I don't care about promoter's scores and I'll just replace the values with 1
promEntero.replace(2, 1, inplace=True)
promSalmo.replace(2, 1, inplace=True)
promEscherichia.replace(2, 1, inplace=True)

# Replace the taxonomy numbers with the NCBI codes in Enterobacteriaceae
NCBI_codes = dict(zip(enteroTaxInfo['Sample'], enteroTaxInfo['NCBI_codes']))
promEntero['Genome'] = promEntero['Genome'].map(NCBI_codes)

### Time to create a new column for the promSalmo in which it will have all the genomes with iroN, and later I'll change their names for NCBI codes ###
promSalmo['Genome'] = promSalmo['Genome'].map(headerToNCBIcodeSalmo.set_index(0)[1])

In [34]:
#### Final preparation: read the table with the E. coli strains ####

# Read the table with the E. coli strains
path = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/Results/Escherichia_w_strains.csv'
eColiStrains = pd.read_csv(path, sep = ',')
eColiStrains.fillna('', inplace=True)
eColiStrains['Strain'] = eColiStrains['Strain'].astype(str)
print(eColiStrains)

             accession              family        genus           species  \
0      GCA_000167875.2  Enterobacteriaceae  Escherichia  Escherichia coli   
1      GCA_000167915.2  Enterobacteriaceae  Escherichia  Escherichia coli   
2      GCA_000181735.1  Enterobacteriaceae  Escherichia  Escherichia coli   
3      GCA_000183005.2  Enterobacteriaceae  Escherichia  Escherichia coli   
4      GCA_000184185.1  Enterobacteriaceae  Escherichia  Escherichia coli   
...                ...                 ...          ...               ...   
10638  GCA_907164025.1  Enterobacteriaceae  Escherichia  Escherichia coli   
10639  GCA_907164265.1  Enterobacteriaceae  Escherichia  Escherichia coli   
10640  GCA_907164285.1  Enterobacteriaceae  Escherichia  Escherichia coli   
10641  GCA_907172405.1  Enterobacteriaceae  Escherichia  Escherichia coli   
10642  GCA_910573655.1  Enterobacteriaceae  Escherichia  Escherichia coli   

                                                Taxon   Strain  
0         

In [35]:
# The salmonella accession numbers with iroN
with open(salmoPath) as f:
    salmonellaTaxInfo = f.readlines()
salmonellaTaxInfo = [i.strip() for i in salmonellaTaxInfo]

print(salmonellaTaxInfo[0:10])

['GCA_000278545.1', 'GCA_000380325.1', 'GCA_000411815.1', 'GCA_000442415.1', 'GCA_000487415.1', 'GCA_000494505.1', 'GCA_000623695.2', 'GCA_000624495.2', 'GCA_000625715.2', 'GCA_000626055.2']


In [36]:
#### Now, let's get who has the iroN gene ####
df = pd.read_csv(enteroPath, sep = '\t')
print(df)


# First let's concatenate our lists of genomes (enterobacteriaceae, salmonella and escherichia)
df['accession'] = df['accession'].astype(str).str.strip()
allGenomes = enteroIroN.copy()
allGenomes.extend(salmonellaTaxInfo)
allGenomes.extend(eColi)
print(len(allGenomes))

# Create the new column based on presence in the list
df['iroN'] = df['accession'].apply(lambda x: 'Probable presence' if x in allGenomes else '')
print(df)

### Now let's replace 'Probable presence' with 'Present' if the genome is in the list of corroborate genomes
# Obtain the Entero genomes that have a double checked iroN
enteroTaxInfo['Taxonomy_numbers'] = enteroTaxInfo['Taxonomy_numbers'].astype(str)
qcEntero = enteroTaxInfo[enteroTaxInfo['Sample'].isin(qcSet)]

# The salmonella genomes that have a double checked iroN
filteredEnteroIroN = qcEntero['NCBI_codes'].tolist()
qcSalmonellaTaxInfo = [i for i in salmonellaTaxInfo if i in qcSet]

# The E. coli genomes that have a double checked iroN
qcEcoli = [i for i in eColi if i in eColiDouble]

# Now let's replace the 'Probable presence' with 'Present' if the genome is in the list of corroborate genomes
df['iroN'] = df.apply(lambda row: 'Present' if row['accession'] in filteredEnteroIroN or row['accession'] in qcSalmonellaTaxInfo or row['accession'] in qcEcoli else row['iroN'], axis=1)

# Now let's get the number of 'Present' or 'Probable presence' in the iroN column
print(df['iroN'].value_counts())


             accession              family        genus             species
0      GCA_000167875.2  Enterobacteriaceae  Escherichia    Escherichia coli
1      GCA_000167915.2  Enterobacteriaceae  Escherichia    Escherichia coli
2      GCA_000181735.1  Enterobacteriaceae  Escherichia    Escherichia coli
3      GCA_000183005.2  Enterobacteriaceae  Escherichia    Escherichia coli
4      GCA_000184185.1  Enterobacteriaceae  Escherichia    Escherichia coli
...                ...                 ...          ...                 ...
19170  GCA_002033765.1  Enterobacteriaceae   Salmonella  Salmonella bongori
19171  GCA_008485645.2  Enterobacteriaceae   Salmonella  Salmonella bongori
19172  GCA_013693605.1  Enterobacteriaceae   Salmonella  Salmonella bongori
19173  GCA_013138145.1  Enterobacteriaceae   Salmonella  Salmonella bongori
19174  GCA_013693595.1  Enterobacteriaceae   Salmonella  Salmonella bongori

[19175 rows x 4 columns]
9899
             accession              family        genus  

In [37]:
### Time to add the promoters to the dataframe, and the E. coli strains ###

# Concatenate the promoters dfs
promDf = pd.concat([promEntero, promSalmo, promEscherichia], axis = 0, ignore_index=True)
promDf = promDf.drop_duplicates(subset = 'Genome', keep = 'first')

# Now, let's register the argR and argR2 genes as one
conditions = [
    ((promDf['argR'] > 0) | (promDf['argR2'] > 0)),
    ((promDf['argR'] == 0) & (promDf['argR2'] == 0))
]
choices = ['Present', 'Absent']
promDf['ArgR'] = np.select(conditions, choices, default='')
promDf.drop(columns = ['argR', 'argR2'], inplace=True)

# Now let's merge the dataframes to get the promoters
mergedDf = df.merge(promDf, left_on='accession', right_on='Genome', how='left', suffixes=('', '_dup'))

# And add the strains to the dataframe
finalDf = mergedDf.merge(eColiStrains, left_on='accession', right_on='accession', how='left', suffixes=('', '_dup'))

# Fill the NaN with '', and make the dataframes into strings, replace the '1' with 'Present', besides I'll eliminate the duplicated columns
finalDf.fillna('', inplace=True)
finalDf = finalDf.astype(str)
finalDf.replace('1.0', 'Present', inplace=True)
finalDf.replace('0.0', 'Absent', inplace=True)
finalDf = finalDf.loc[:, ~finalDf.columns.str.endswith('_dup')]

print(finalDf)

# Save the Df
path = '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/table_6.csv'
finalDf.to_csv(path, sep = ',', index=False)

  finalDf.fillna('', inplace=True)


             accession              family        genus             species  \
0      GCA_000167875.2  Enterobacteriaceae  Escherichia    Escherichia coli   
1      GCA_000167915.2  Enterobacteriaceae  Escherichia    Escherichia coli   
2      GCA_000181735.1  Enterobacteriaceae  Escherichia    Escherichia coli   
3      GCA_000183005.2  Enterobacteriaceae  Escherichia    Escherichia coli   
4      GCA_000184185.1  Enterobacteriaceae  Escherichia    Escherichia coli   
...                ...                 ...          ...                 ...   
19170  GCA_002033765.1  Enterobacteriaceae   Salmonella  Salmonella bongori   
19171  GCA_008485645.2  Enterobacteriaceae   Salmonella  Salmonella bongori   
19172  GCA_013693605.1  Enterobacteriaceae   Salmonella  Salmonella bongori   
19173  GCA_013138145.1  Enterobacteriaceae   Salmonella  Salmonella bongori   
19174  GCA_013693595.1  Enterobacteriaceae   Salmonella  Salmonella bongori   

          iroN Genome arcA crp deoR fis  ... dnaA g

In [38]:
finalDf = finalDf.drop_duplicates(subset = 'accession', keep = 'first')
print(finalDf)

             accession              family        genus             species  \
0      GCA_000167875.2  Enterobacteriaceae  Escherichia    Escherichia coli   
1      GCA_000167915.2  Enterobacteriaceae  Escherichia    Escherichia coli   
2      GCA_000181735.1  Enterobacteriaceae  Escherichia    Escherichia coli   
3      GCA_000183005.2  Enterobacteriaceae  Escherichia    Escherichia coli   
4      GCA_000184185.1  Enterobacteriaceae  Escherichia    Escherichia coli   
...                ...                 ...          ...                 ...   
19170  GCA_002033765.1  Enterobacteriaceae   Salmonella  Salmonella bongori   
19171  GCA_008485645.2  Enterobacteriaceae   Salmonella  Salmonella bongori   
19172  GCA_013693605.1  Enterobacteriaceae   Salmonella  Salmonella bongori   
19173  GCA_013138145.1  Enterobacteriaceae   Salmonella  Salmonella bongori   
19174  GCA_013693595.1  Enterobacteriaceae   Salmonella  Salmonella bongori   

          iroN Genome arcA crp deoR fis  ... dnaA g

In [39]:
#### Let's cleanup the table ####
finalDf = finalDf.drop_duplicates(subset = 'accession', keep = 'first')
print(finalDf)

### Now let's create three dfs, one with the Enterobacteriaceae, one with the Salmonella and one with the E. coli
# finalDf = finalDf.drop(columns=['Genome'])

# E. Coli Df
eColiDf = finalDf[finalDf['genus'] == 'Escherichia']
eColiDf = eColiDf.loc[:, (eColiDf != '').any(axis=0)]
eColiDf.drop(columns=['Taxon'], inplace=True)
# Double check the number of E. Coli genomes

print(eColiDf.columns)
print(eColiDf)

# Salmonella Df
salmoDf = finalDf[finalDf['genus'] == 'Salmonella']
salmoDf = salmoDf.loc[:, (salmoDf != '').any(axis=0)]
print(salmoDf.columns)
print(salmoDf)

# Enterobacteriaceae Df
enteroDf = finalDf[~finalDf['genus'].isin(['Escherichia', 'Salmonella'])]
enteroDf = enteroDf.loc[:, (enteroDf != '').any(axis=0)]
print(enteroDf.columns)
print(enteroDf)

             accession              family        genus             species  \
0      GCA_000167875.2  Enterobacteriaceae  Escherichia    Escherichia coli   
1      GCA_000167915.2  Enterobacteriaceae  Escherichia    Escherichia coli   
2      GCA_000181735.1  Enterobacteriaceae  Escherichia    Escherichia coli   
3      GCA_000183005.2  Enterobacteriaceae  Escherichia    Escherichia coli   
4      GCA_000184185.1  Enterobacteriaceae  Escherichia    Escherichia coli   
...                ...                 ...          ...                 ...   
19170  GCA_002033765.1  Enterobacteriaceae   Salmonella  Salmonella bongori   
19171  GCA_008485645.2  Enterobacteriaceae   Salmonella  Salmonella bongori   
19172  GCA_013693605.1  Enterobacteriaceae   Salmonella  Salmonella bongori   
19173  GCA_013138145.1  Enterobacteriaceae   Salmonella  Salmonella bongori   
19174  GCA_013693595.1  Enterobacteriaceae   Salmonella  Salmonella bongori   

          iroN Genome arcA crp deoR fis  ... dnaA g

In [40]:
print(finalDf['genus'].value_counts())

genus
Escherichia                          10643
Salmonella                            7561
Citrobacter                            100
Klebsiella/Raoultella group            100
Serratia                               100
Enterobacter                           100
Cronobacter                            100
Yersinia                               100
Leclercia                               79
unclassified Enterobacteriaceae         74
Kosakonia                               44
Rahnella                                39
Shigella                                32
Lelliottia                              31
Kluyvera                                23
Pluralibacter                           22
Enterobacteriaceae incertae sedis       16
Cedecea                                 11
Name: count, dtype: int64


In [41]:
### Save the dataframes ###
path = '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/Sup_table/enterobacteriaceae_table_4.csv'
enteroDf.to_csv(path, sep = ',', index=False)

path = '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/Sup_table/salmonella_table_4.csv'
salmoDf.to_csv(path, sep = ',', index=False)

path = '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/Sup_table/e_coli_table_4.csv'
eColiDf.to_csv(path, sep = ',', index=False)

Create table 5

In [42]:
# Read the table with the E. coli strains
path = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/Results/Escherichia_w_strains.csv'
eColiStrains = pd.read_csv(path, sep = ',')
eColiStrains.fillna('', inplace=True)
eColiStrains['Strain'] = eColiStrains['Strain'].astype(str)
print(eColiStrains)

             accession              family        genus           species  \
0      GCA_000167875.2  Enterobacteriaceae  Escherichia  Escherichia coli   
1      GCA_000167915.2  Enterobacteriaceae  Escherichia  Escherichia coli   
2      GCA_000181735.1  Enterobacteriaceae  Escherichia  Escherichia coli   
3      GCA_000183005.2  Enterobacteriaceae  Escherichia  Escherichia coli   
4      GCA_000184185.1  Enterobacteriaceae  Escherichia  Escherichia coli   
...                ...                 ...          ...               ...   
10638  GCA_907164025.1  Enterobacteriaceae  Escherichia  Escherichia coli   
10639  GCA_907164265.1  Enterobacteriaceae  Escherichia  Escherichia coli   
10640  GCA_907164285.1  Enterobacteriaceae  Escherichia  Escherichia coli   
10641  GCA_907172405.1  Enterobacteriaceae  Escherichia  Escherichia coli   
10642  GCA_910573655.1  Enterobacteriaceae  Escherichia  Escherichia coli   

                                                Taxon   Strain  
0         

In [43]:
# Let's add the Escherichia promoters
ecoliPath = '/mnt/mnemo5/eugenio/IroN_project/Files/11_DFs_promoters/escherichia_110.tsv'

# Read the dataframe
dfEscherichia = pd.read_csv(ecoliPath, sep='\t')
dfEscherichia = dfEscherichia.rename(columns={'Unnamed: 0' : 'Genome'})
print(dfEscherichia)

               Genome  araC  arcA  argR  cpxR  farR  lexA  lrp  phoB  rpoD16  \
0     GCA_012832335.1     0     0     0     0     0     1    0     0       0   
1     GCA_017580805.1     0     0     0     0     0     1    0     0       0   
2     GCA_014271505.1     0     0     0     0     0     1    0     0       0   
3     GCA_001463455.1     0     0     0     0     0     1    0     0       0   
4     GCA_003903315.1     0     0     0     0     0     1    0     0       0   
...               ...   ...   ...   ...   ...   ...   ...  ...   ...     ...   
1636  GCA_016946115.1     0     0     0     0     0     1    0     0       0   
1637  GCA_007844635.1     0     0     0     0     0     1    0     0       0   
1638  GCA_003779805.1     0     0     0     0     0     1    0     0       0   
1639  GCA_902828165.1     0     0     0     0     0     1    0     0       0   
1640  GCA_019577055.1     0     0     0     0     0     1    0     0       0   

      rpoD17  soxS  
0          0     0

In [44]:
# Merge the dataframes on the 'Genome' and 'accession' columns
dfEscherichia = dfEscherichia.merge(eColiStrains[['accession', 'Strain']], left_on='Genome', right_on='accession', how='left')

# Drop the 'accession' column as it is no longer needed
dfEscherichia.drop(columns=['accession'], inplace=True)

print(dfEscherichia)

               Genome  araC  arcA  argR  cpxR  farR  lexA  lrp  phoB  rpoD16  \
0     GCA_012832335.1     0     0     0     0     0     1    0     0       0   
1     GCA_017580805.1     0     0     0     0     0     1    0     0       0   
2     GCA_014271505.1     0     0     0     0     0     1    0     0       0   
3     GCA_001463455.1     0     0     0     0     0     1    0     0       0   
4     GCA_003903315.1     0     0     0     0     0     1    0     0       0   
...               ...   ...   ...   ...   ...   ...   ...  ...   ...     ...   
1636  GCA_016946115.1     0     0     0     0     0     1    0     0       0   
1637  GCA_007844635.1     0     0     0     0     0     1    0     0       0   
1638  GCA_003779805.1     0     0     0     0     0     1    0     0       0   
1639  GCA_902828165.1     0     0     0     0     0     1    0     0       0   
1640  GCA_019577055.1     0     0     0     0     0     1    0     0       0   

      rpoD17  soxS         Strain  
0  

In [45]:
#### Group the df by strains
dfMergedByStrain = dfEscherichia.groupby('Strain', as_index=False).sum()

# Let's create a dictionary that counts the ocurrences of a promoter in the df
generaCounts = dfEscherichia['Strain'].value_counts()
dfMergedByStrain['Total hits'] = [generaCounts[i] for i in dfMergedByStrain['Strain']]
dfMergedByStrain = dfMergedByStrain.drop(columns='Genome')

# Time to get the proportion of hits in a promoter with certain strain
for col in dfMergedByStrain.columns[1:]:
    if col != 'Total hits':
        dfMergedByStrain[col] = dfMergedByStrain[col] / dfMergedByStrain['Total hits'] * 100

# Delete the column total genomes
dfMergedByStrain = dfMergedByStrain.drop(columns=['Total hits'])

#### Let's add iroN presence:
# Create the column
dfMergedByStrain['iroN'] = dfMergedByStrain['Strain'].apply(lambda x: 1)

# The E. coli genomes that have a double checked annotations of iroN
eColiDoubleCheck = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/genomes_w_double_checked_iroN.txt'
with open(eColiDoubleCheck, 'r') as f:
    eColiDouble = f.readlines()
eColiDouble = [x.strip() for x in eColiDouble]

# Add the strains present in 'Strain' in eColiStrains dataframe with 'Present' value in the column 'iroN'
for _, row in eColiStrains.iterrows():
    newRow = {
        'Strain': row['Strain'],
        'iroN': 1 if row['accession'] in eColiDouble else 0
    }
    dfMergedByStrain = pd.concat([dfMergedByStrain, pd.DataFrame([newRow])], ignore_index=True)

# Let's collapse the dataframe by strain again to get also a clean version of the strains that we missed in BProm
dfMergedByStrain['Strain'] = dfMergedByStrain['Strain'].astype(str)
collapsedDf = dfMergedByStrain.groupby('Strain', as_index=False).sum()
collapsedDf.fillna('', inplace=True)
collapsedDf['iroN'] = collapsedDf['iroN'].apply(lambda x: 'Present' if x > 0 else '')
collapsedDf = collapsedDf.astype(str)

# And let's remove the 'strain' that is an empty string
collapsedDf = collapsedDf[dfMergedByStrain['Strain'] != '']

print(collapsedDf)

# Save the dataframe
path = '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/Sup_table/table_5.csv'
collapsedDf.to_csv(path, sep = ',', index=False)

         Strain araC arcA argR cpxR farR   lexA  lrp phoB rpoD16 rpoD17 soxS  \
1         #5522  0.0  0.0  0.0  0.0  0.0    0.0  0.0  0.0    0.0    0.0  0.0   
2        0.2610  0.0  0.0  0.0  0.0  0.0    0.0  0.0  0.0    0.0    0.0  0.0   
3       00-3279  0.0  0.0  0.0  0.0  0.0    0.0  0.0  0.0    0.0    0.0  0.0   
4     002PP2015  0.0  0.0  0.0  0.0  0.0  100.0  0.0  0.0    0.0    0.0  0.0   
5           004  0.0  0.0  0.0  0.0  0.0    0.0  0.0  0.0    0.0    0.0  0.0   
...         ...  ...  ...  ...  ...  ...    ...  ...  ...    ...    ...  ...   
8640    upec-87  0.0  0.0  0.0  0.0  0.0    0.0  0.0  0.0    0.0    0.0  0.0   
8641         wt  0.0  0.0  0.0  0.0  0.0  100.0  0.0  0.0    0.0    0.0  0.0   
8642      xz013  0.0  0.0  0.0  0.0  0.0    0.0  0.0  0.0    0.0    0.0  0.0   
8643      xz027  0.0  0.0  0.0  0.0  0.0    0.0  0.0  0.0    0.0    0.0  0.0   
8644      xz028  0.0  0.0  0.0  0.0  0.0  100.0  0.0  0.0    0.0    0.0  0.0   

         iroN  
1              
2      

  collapsedDf = collapsedDf[dfMergedByStrain['Strain'] != '']
