In [1]:
#### Eugenio Perez Molphe Montoya ####
#### 28.10.20204 ####
#### Get the df with the accesion numbers and the presence/absence of the genes ####

In [1]:
### Packages
import pandas as pd
from Bio import Entrez
import time

In [2]:
### Functions

def fetch_genbank_assembly_from_sample(biosample_codes):
    """
    Get the Genbank accession number based on the NCBI code
    """
    Entrez.email = "eugenio.perez@mls.uzh.ch"
    genbank_assemblies = {}
    failedSearches = []
    max_retries = 10
    retry_delay = 5
    for sample_code in biosample_codes:
        attempt = 0
        while attempt < max_retries:
            try:
                # Step 1: Search the Assembly database linked to the BioSample
                handle = Entrez.esearch(db="assembly", term=f"{sample_code}[biosample]")
                record = Entrez.read(handle)
                handle.close()

                # Check if any assembly IDs were found
                if record['IdList']:
                    assembly_id = record['IdList'][0]

                    # Step 2: Fetch the GenBank assembly accession using the assembly ID
                    handle = Entrez.esummary(db="assembly", id=assembly_id, report="full")
                    summary_record = Entrez.read(handle)
                    handle.close()
                    print(summary_record['DocumentSummarySet']['DocumentSummary'][0]['Synonym']['Genbank'])
                    genbank_assembly = summary_record['DocumentSummarySet']['DocumentSummary'][0]['Synonym']['Genbank']
                    genbank_assemblies[sample_code] = genbank_assembly

                    attempt = max_retries
                else:
                    attempt += 1
                    if attempt < max_retries:
                        print(f"No assembly found for BioSample {sample_code}. Retrying in {retry_delay} seconds...")
                        time.sleep(retry_delay)
                    else:
                        genbank_assemblies[sample_code] = sample_code
                        failedSearches.append(sample_code)
                        print(f"No assembly found for BioSample {sample_code}")

            except Exception as e:
                attempt += 1
                if attempt < max_retries:
                    print(f"Error fetching data for BioSample {sample_code}: {e}. Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                else:
                    print(f"Error fetching data for BioSample {sample_code}: {e}")
                    genbank_assemblies[sample_code] = sample_code
                    failedSearches.append(sample_code)

    print(f"Failed searches: {failedSearches}")
    return genbank_assemblies

In [51]:
### Paths

# The original table 6 with the books concatenated
enteroPath = '/mnt/mnemo5/eugenio/IroN_project/Files/Enterobacteriaceae_iron_genes.tsv'

# The sample numbers of the Enterobacteriaceae genomes
taxEnteroPath = '/mnt/mnemo5/eugenio/IroN_project/Files/04_Blast_annotations/enterobacteriaceae.identities.tsv'

# The accession codes of the Salmonella genomes
salmoPath = '/mnt/mnemo5/eugenio/IroN_project/Files/04_Blast_annotations/salmonella_blasted_sequences.ncbi_codes.txt'

# The NCBI Genbank accession numbers of the Escherichia genomes
eColiPath = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/eColi_iroN.txt'

# The promoters of the Enterobacteriaceae genomes
promEnteroPath = '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/promoters_enterobacteriaceae_sample_codes.tsv'

# The promoters of the Salmonella genomes
promSalmoPath = '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/salmonella_matrix_all.tsv'

# The NCBI codes for the salmonella genomes
pathNCBIcodesSalmo = '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/Salmonella_header_NCBIcode.csv'

# The promoters of the Escherichia genomes
promEscherichiaPath = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/promoters_df_with_immunity_genes.csv'


In [57]:
#### Read the file with the names of the genomes that were double checked to effectively have iroN, I'll use it to filter out the genomes that we're still doubtful ####
#### This step is important, because it incorporates if the iroN hit is trustworthy and if it is, will be included as such ####

pathFilteredPromoters =  '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/filtered_promoters.txt'

with open(pathFilteredPromoters, 'r') as f:
    promoters = f.readlines()

for idx in range(len(promoters)):
    promoters[idx] = promoters[idx].strip()

headerToNCBIcode = {}
with open(pathNCBIcodesSalmo, 'r') as f:
    lines = f.readlines()
    for line in lines:
        header, NCBIcode = line.strip().split(',')
        header = header.split('.')[0]
        headerToNCBIcode[header] = NCBIcode

for idx in range(len(promoters)):
    if promoters[idx] in headerToNCBIcode:
        promoters[idx] = headerToNCBIcode[promoters[idx]]

# And transform it into a set
qcSet = set(promoters)
print(promoters)

# Get the NCBI code translator for salmonella
headerToNCBIcodeSalmo = pd.read_csv(pathNCBIcodesSalmo, header = None)

['54291', '54291', '54291', 'GCA_014244715.1', '1560357', '546', 'GCA_014244955.1', '546', '546', '67827', 'GCA_014165615.1', '546', 'GCA_014244595.1', '546', '573', 'GCA_014297915.1', '546', 'GCA_013703345.1', 'GCA_014042825.1', '546', '546', 'GCA_013702675.1', 'GCA_013555915.1', '546', '546', '546', '546', 'GCA_014067055.1', '546', '546', '546', '57706', '57706', '573', '546', '546', '546', '546', 'GCA_013870775.1', 'GCA_014154475.1', 'GCA_013703645.1', '546', '546', 'GCA_013679235.1', 'GCA_014165235.1', '546', '546', '546', '546', 'GCA_013907745.1', 'GCA_013702145.1', '546', '546', 'GCA_013482925.1', '546', 'GCA_014062545.1', '546', 'GCA_014153405.1', 'GCA_013703435.1', '546', '546', '546', '546', 'GCA_014052025.1', 'GCA_013587435.1', '573', '546', 'GCA_014472815.1', '573', '158836', '546', 'GCA_013874735.1', 'GCA_014387265.1', 'GCA_014065805.1', '546', '573', 'GCA_013849385.1', '546', '573', '57706', 'GCA_013875075.1', 'GCA_014455335.1', '158836', '1812934', '158836', '158836', '15

In [None]:
##### Read the files (the original table and the lists of genomes with iroN) #####

# The original table 6 with the books concatenated
df = pd.read_csv(enteroPath, sep = '\t')
print(df)


# The sample numbers of the Enterobacteriaceae genomes in df with taxonomy code / sample code
enteroTaxInfo = pd.read_csv(taxEnteroPath, header=None, sep = '\t')
enteroTaxInfo = enteroTaxInfo.rename(columns = {0: 'Taxonomy_numbers', 1: 'Sample'})
print(enteroTaxInfo)

# The salmonella accession numbers with iroN
with open(salmoPath) as f:
    salmonellaTaxInfo = f.readlines()
salmonellaTaxInfo = [i.strip() for i in salmonellaTaxInfo]

print(salmonellaTaxInfo[0:10])

# The E. coli genomes
with open(eColiPath, 'r') as f:
    eColi = f.readlines()

eColi = [i.strip() for i in eColi]
eColi = [i.split('_')[0] + '_' + i.split('_')[1] for i in eColi]
print(eColi[0:10])

In [None]:
#### The Enterobacteriaceae accession numbers are actually sample codes, not GenBank accession numbers, so I'll get the later ####

samples = enteroTaxInfo['Sample'].tolist()
print(samples)
genbankCodes = fetch_genbank_assembly_from_sample(samples)

# Add the genbank codes to the dataframe
enteroTaxInfo['NCBI_codes'] = enteroTaxInfo['Sample'].map(genbankCodes)

# Obtain the repeated genomes
repeatedGenomes = enteroTaxInfo[enteroTaxInfo.duplicated(subset = 'Sample', keep = False)]
print(repeatedGenomes)

# Get a list with the NCBI codes of the genomes
enteroIroN = enteroTaxInfo['NCBI_codes'].tolist()

In [48]:
#### Now, let's read the promoters and process them ####

# Read the promoters dfs
promEntero = pd.read_csv(promEnteroPath, sep = '\t')
promSalmo = pd.read_csv(promSalmoPath, sep = '\t')
promEscherichia = pd.read_csv(promEscherichiaPath, sep = '\t')

# Rename the column with the genome names
promEntero = promEntero.rename(columns = {'Unnamed: 0': 'Genome'})
promSalmo = promSalmo.rename(columns = {'Unnamed: 0': 'Genome'})
promEscherichia = promEscherichia.rename(columns={'Unnamed: 0' : 'Genome'})

# So I don't care about promoter's scores and I'll just replace the values with 1
promEntero.replace(2, 1, inplace=True)
promSalmo.replace(2, 1, inplace=True)
promEscherichia.replace(2, 1, inplace=True)

# Replace the taxonomy numbers with the NCBI codes in Enterobacteriaceae
NCBI_codes = dict(zip(enteroTaxInfo['Sample'], enteroTaxInfo['NCBI_codes']))
promEntero['Genome'] = promEntero['Genome'].map(NCBI_codes)

### Time to create a new column for the promSalmo in which it will have all the genomes with iroN, and later I'll change their names for NCBI codes ###
promSalmo['Genome'] = promSalmo['Genome'].map(headerToNCBIcodeSalmo.set_index(0)[1])

In [None]:
#### Final preparation: read the table with the E. coli strains ####

# Read the table with the E. coli strains
path = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/Results/Escherichia_w_strains.csv'
eColiStrains = pd.read_csv(path, sep = ',')
eColiStrains.fillna('', inplace=True)
eColiStrains['Strain'] = eColiStrains['Strain'].astype(str)
print(eColiStrains)

In [None]:
# The salmonella accession numbers with iroN
with open(salmoPath) as f:
    salmonellaTaxInfo = f.readlines()
salmonellaTaxInfo = [i.strip() for i in salmonellaTaxInfo]

print(salmonellaTaxInfo[0:10])

In [None]:
#### Now, let's get who has the iroN gene ####
df = pd.read_csv(enteroPath, sep = '\t')

# First let's concatenate our lists of genomes (enterobacteriaceae, salmonella and escherichia)
df['accession'] = df['accession'].astype(str).str.strip()
allGenomes = enteroIroN.copy()
allGenomes.extend(salmonellaTaxInfo)
allGenomes.extend(eColi)
print(len(allGenomes))

# Create the new column based on presence in the list
df['iroN'] = df['accession'].apply(lambda x: 'Probable presence' if x in allGenomes else '')
print(df)

# Now let's replace 'Probable presence' with 'Present' if the genome is in the list of corroborate genomes
# Obtain the Entero genomes that have a double checked iroN
enteroTaxInfo['Taxonomy_numbers'] = enteroTaxInfo['Taxonomy_numbers'].astype(str)
qcEntero = enteroTaxInfo[enteroTaxInfo['Taxonomy_numbers'].isin(qcSet)]
filteredEnteroIroN = qcEntero['NCBI_codes'].tolist()
salmonellaTaxInfo = [i for i in salmonellaTaxInfo if i in qcSet]


# Now let's get the number of 'Present' in the iroN column
print(df['iroN'].value_counts())


In [None]:
### Time to add the promoters to the dataframe, and the E. coli strains ###

# Concatenate the promoters dfs
promDf = pd.concat([promEntero, promSalmo, promEscherichia], axis = 0, ignore_index=True)
promDf = promDf.drop_duplicates(subset = 'Genome', keep = 'first')

# Now let's merge the dataframes to get the promoters
mergedDf = df.merge(promDf, left_on='accession', right_on='Genome', how='left', suffixes=('', '_dup'))

# And add the strains to the dataframe
finalDf = mergedDf.merge(eColiStrains, left_on='accession', right_on='accession', how='left', suffixes=('', '_dup'))

# Fill the NaN with '', and make the dataframes into strings, replace the '1' with 'Present', besides I'll eliminate the duplicated columns
finalDf.fillna('', inplace=True)
finalDf = finalDf.astype(str)
finalDf.replace('1.0', 'Present', inplace=True)
finalDf.replace('0.0', 'Absent', inplace=True)
finalDf = finalDf.loc[:, ~finalDf.columns.str.endswith('_dup')]

print(finalDf)

# Save the Df
path = '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/table_6.csv'
finalDf.to_csv(path, sep = ',', index=False)