In [None]:
#### Eugenio Perez Molphe Montoya ####
#### 17.06.2024 ####
#### Create a file of taxonomy_identity_numbers for the Salmonella species ####

In [1]:
### Import packages ###
import os
import pandas as pd

In [2]:
#### Write a duplicate-free copy of the Salmonella metadata ####

# The source files live in Nico's directory, where I have no write access,
# so the filtered table is saved under my own project directory instead.
pathMetadata = '/mnt/mnemo6/nnaepf/for_eugenio/data/salmonella_genomes/salmonella_metadata.txt'
pathDuplicates = '/mnt/mnemo6/nnaepf/for_eugenio/data/salmonella_genomes/duplicate_genomes.txt'
out = '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/salmonella_metadata.txt'

# Load the metadata table (parsed with the pandas default separator)
metadata = pd.read_csv(pathMetadata)

# Each line of the duplicates file is treated as one accession;
# surrounding whitespace (including the newline) is trimmed
with open(pathDuplicates, 'r') as f:
    duplicates = [entry.strip() for entry in f]

# Keep only the rows whose accession is NOT in the duplicates list
isDuplicate = metadata['accession'].isin(duplicates)
metadata = metadata.loc[~isDuplicate]

# Store the filtered table as comma-separated text without the index column
metadata.to_csv(out, sep=',', index=False)

In [None]:
### Create a dictionary with the following structure: ###
# {header_code: NCBI_code}
# where NCBI_code is the first two '_'-separated fields of the name of the
# FASTA file in which the header was found.

# Directory that holds the genome FASTA files
directory = '/mnt/mnemo6/nnaepf/for_eugenio/data/salmonella_genomes/ncbi_dataset/genomes'

allFastaFiles = os.listdir(directory)

# Create an empty dictionary to hold the mappings
headerToNCBIcode = {}

# Iterate through each FASTA file
for file_name in allFastaFiles:
    file_path = os.path.join(directory, file_name)

    # os.listdir also returns sub-directories; opening one would raise,
    # so anything that is not a regular file is skipped.
    if not os.path.isfile(file_path):
        continue

    # Compute the code once per file instead of once per header.
    # '_'.join(parts[:2]) equals parts[0] + '_' + parts[1] when the name has
    # at least two fields, and falls back to the whole name when it contains
    # no underscore (indexing [1] would raise IndexError in that case).
    ncbiCode = '_'.join(file_name.split('_')[:2])

    with open(file_path, 'r') as file:
        # Stream the file line by line: only the header lines are needed, so
        # there is no reason to hold a whole genome in memory.
        for line in file:
            if line.startswith('>'):
                # Drop '>', trim whitespace, keep the text before the first space
                header = line[1:].strip().split(' ')[0]
                headerToNCBIcode[header] = ncbiCode

# Print the results
print(headerToNCBIcode)

In [5]:
## Create a file with the header code and NCBI taxonomic code for the Salmonella species ##
# The dictionary is written as one "header,code" row per entry, without a column-name row
out = '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/Salmonella_header_NCBIcode.csv'
with open(out, 'w') as f:
    f.writelines(f'{key},{code}\n' for key, code in headerToNCBIcode.items())

In [2]:
## In case I have already the header code and NCBI code file, I can read it ##
# Rebuild the dictionary from the saved "header,code" rows
path = '/mnt/mnemo5/eugenio/IroN_project/Files/07_Final_Report/Salmonella_header_NCBIcode.csv'
headerToNCBIcode = {}
with open(path, 'r') as f:
    # Each row must split into exactly two comma-separated fields:
    # the first becomes the key, the second the value
    headerToNCBIcode.update(row.strip().split(',') for row in f)

In [None]:
### Read the fasta file of the Salmonella species and extract the headers ###

# The path of the fasta file
pathFasta = '/mnt/mnemo5/eugenio/IroN_project/Files/04_Blast_annotations/salmonella_all_hits.fa'
pathListofTaxonomyCodes = '/mnt/mnemo5/eugenio/IroN_project/Files/04_Blast_annotations/salmonella_blasted_sequences.ncbi_codes.txt' # Output file

# Literal suffix that has to be removed from the sequence IDs
upstreamSuffix = '_upstream_seq'

# Collect the cleaned ID of every header line, streaming the file line by line
headersFastaFile = []
with open(pathFasta, 'r') as file:
    for line in file:
        if not line.startswith('>'):
            continue
        # Drop '>', trim whitespace, keep the text before the first space
        header = line[1:].strip().split(' ')[0]
        # str.strip('_upstream_seq') would treat its argument as a SET of
        # characters and eat any of them from BOTH ends of the ID (e.g.
        # 'contig_a_upstream_seq' -> 'contig'). Remove the exact suffix instead.
        if header.endswith(upstreamSuffix):
            header = header[:-len(upstreamSuffix)]
        headersFastaFile.append(header)

# Time to replace the headers with the taxonomic code.
# Headers that are missing from headerToNCBIcode are left out of the result.
salmonellaAccesions = [
    headerToNCBIcode[header]
    for header in headersFastaFile
    if header in headerToNCBIcode
]

print(len(salmonellaAccesions))

# Save the list as a file, one code per line
with open(pathListofTaxonomyCodes, 'w') as file:
    for accession in salmonellaAccesions:
        file.write(accession + '\n')

I already ran this code, so everything is ready for the next notebook: get_taxonomic_information.ipynb
9.07.2024

In [11]:
#### 15.10.2024 ####
#### I do the same for the E. coli species ####

# FASTA file whose header lines are collected below
pathFasta = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/Escherichia_iroN.fa'
# Directory with the genome FASTA files; it is searched later to find the
# assembly a header belongs to when the header itself is not an assembly code
pathDirectory = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/Genomes'
# The output file
pathListofTaxonomyCodes = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/Results/Escherichia_w_iroN.txt'

# For every header line: drop the leading '>', trim whitespace (newline
# included) and keep only the text before the first space
with open(pathFasta, 'r') as file:
    headersFastaFile = [
        line[1:].strip().split(' ')[0]
        for line in file
        if line.startswith('>')
    ]

# Now, we are in a bit of a sticky situation: some headers are actually the name of the contig, not the assembly code, so I need to search for the contig in the fasta files
# The functions for this thing
def get_fasta_files(directory):
    """List the entries of `directory` whose names end in '.fna' or '.fa'.

    Only the bare entry names are returned (not joined with `directory`), in
    the order produced by os.listdir.
    """
    fasta_suffixes = ('.fna', '.fa')
    matches = []
    for entry in os.listdir(directory):
        if entry.endswith(fasta_suffixes):
            matches.append(entry)
    return matches

def is_gca_string(s):
    """Return True when `s` begins with the prefix 'GCA_', False otherwise."""
    assembly_prefix = 'GCA_'
    return s.startswith(assembly_prefix)

def find_string_in_fasta(fasta_file, search_string):
    """Report whether `search_string` occurs as a substring of any line of `fasta_file`.

    Every line is checked (not only header lines) and reading stops at the
    first line that contains the string.
    """
    with open(fasta_file, 'r') as handle:
        return any(search_string in row for row in handle)

def replace_with_fasta_name(strings, fasta_directory):
    """Map every entry of `strings` to an assembly-style code.

    Entries that already start with 'GCA_' are taken as they are. Any other
    entry (e.g. a contig header) is searched for in the FASTA files of
    `fasta_directory`; the first file that contains it replaces the entry and
    the replacement is printed. Finally each entry is reduced to its first two
    '_'-separated fields.

    Parameters:
        strings: list of str to resolve. The list is NOT modified.
        fasta_directory: path of the directory holding the FASTA files.

    Returns:
        A new list with one code per input entry, in the same order. An entry
        that could not be resolved and contains no underscore is returned
        unchanged.
    """
    fasta_files = get_fasta_files(fasta_directory)

    # Work on a copy so the caller's list is left untouched
    resolved = list(strings)

    for i, s in enumerate(resolved):
        if is_gca_string(s):
            continue  # Already an assembly code, nothing to look up
        for fasta_file in fasta_files:
            fasta_path = os.path.join(fasta_directory, fasta_file)
            if find_string_in_fasta(fasta_path, s):
                resolved[i] = fasta_file  # Replace header with FASTA file name
                print(f"Replaced {s} with {fasta_file}")
                break

    # Keep only the first two '_'-separated fields (drops the rest of a file name).
    # '_'.join(parts[:2]) gives parts[0] + '_' + parts[1] when both exist and
    # leaves an underscore-free string as it is, whereas indexing [1] would raise
    # IndexError for a header that was not found in any file.
    return ['_'.join(code.split('_')[:2]) for code in resolved]

# Resolve every header to an assembly code (headers that are not assembly codes
# are looked up in the genome FASTA files)
updatedCodes = replace_with_fasta_name(headersFastaFile, pathDirectory)

# Write the resolved codes to the output file, one per line
with open(pathListofTaxonomyCodes, 'w') as file:
    file.writelines(code + '\n' for code in updatedCodes)

KeyboardInterrupt: 

In [13]:
# Change the name of the E. coli genomes of the promoters df to the assembly codes

# The input and output paths
pathPromoters = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/promoters_df.csv'
outputPath = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/Results/promoters_w_accession_codes_df.csv'

# Load the tab-separated promoters table and pull out the genome names,
# which are stored in the column labelled 'Unnamed: 0'
promoters = pd.read_csv(pathPromoters, sep='\t')
originalNames = promoters['Unnamed: 0'].tolist()

# Names that are not assembly codes are looked up in the genome FASTA files
updatedNames = replace_with_fasta_name(originalNames, pathDirectory)

# A code without a '.' is treated as incomplete: swap it for the first entry of
# updatedCodes that starts with it, or keep it as it is when nothing matches
updatedNames = [
    code if '.' in code else next((item for item in updatedCodes if item.startswith(code)), code)
    for code in updatedNames
]

print(updatedNames)

# Put the new names back into the table and save it (written with the
# default comma separator, without the index column)
promoters['Unnamed: 0'] = updatedNames
promoters.to_csv(outputPath, index=False)

Replaced RODF01000039 with GCA_003794675.1_PDT000395251.1_genomic.fna
Replaced AAZTQD010000042 with GCA_017753105.1_PDT000997500.1_genomic.fna
Replaced AASWKY010000074 with GCA_012609475.1_PDT000639540.1_genomic.fna
Replaced AASVZO010000056 with GCA_012602615.1_PDT000644926.1_genomic.fna
Replaced AASTQI010000050 with GCA_012567285.1_PDT000156806.2_genomic.fna
Replaced AATLXQ010000021 with GCA_012898485.1_PDT000215551.2_genomic.fna
Replaced DACYOK010000047 with GCA_016108845.1_PDT000847298.1_genomic.fna
Replaced AASEDK010000025 with GCA_012359775.1_PDT000497308.1_genomic.fna
Replaced AAZIGU010000025 with GCA_017253655.1_PDT000976836.1_genomic.fna
Replaced AAZZEP010000040 with GCA_017924855.1_PDT001004631.1_genomic.fna
Replaced DAEVMD010000009 with GCA_019883145.1_PDT001123324.1_genomic.fna
Replaced DABSCJ010000037 with GCA_014485755.1_PDT000829345.1_genomic.fna
Replaced AARMNU010000106 with GCA_012045625.1_PDT000420662.1_genomic.fna
Replaced DADVUR010000108 with GCA_017947765.1_PDT00100

In [None]:
# Now let's report in our E. coli df the genomes that contain the iroN gene and which promoters do they have

# The input and output paths
pathPromoters = '/mnt/mnemo5/eugenio/IroN_project/Files/10_Escherichia/Results/promoters_w_accession_codes_df.csv'