In [1]:
import os
import pandas as pd
import numpy as np
import pytaxonkit
from tqdm import tqdm

In [2]:
def process_prokaryote_taxon(data, threads=14):
    """
    Annotates a genome table with taxonomic information resolved through PyTaxonKit.

    Parameters:
    - data: DataFrame with 'TaxID' column.
    - threads: Number of threads passed to `pytaxonkit.lineage`
      (default 14, the value that used to be hard-coded).

    Returns:
    - `data` left-merged on 'TaxID' with the columns 'Organism', 'Lineage',
      'LineageTaxIDs', 'Clade', 'Order', 'Class', 'Phylum' and 'Genus' added,
      with exact duplicate rows removed. Rows whose TaxID comes back without
      lineage text are kept; their derived rank columns are None.
    """
    # Resolve every distinct TaxID once; the merge below fans the result back out.
    data_taxonkit = pytaxonkit.lineage(data["TaxID"].unique(), formatstr="{s}", threads=threads)

    taxonomic_info = []
    for _, row in tqdm(data_taxonkit.iterrows(), total=data_taxonkit.shape[0], desc="Processing taxonomic information"):
        ranks_text = row['FullLineageRanks']
        names_text = row['FullLineage']
        # A TaxID that could not be resolved yields NaN (a float) instead of a
        # ';'-separated string; calling .split on it would raise AttributeError.
        if isinstance(ranks_text, str) and isinstance(names_text, str):
            lineage_ranks = ranks_text.split(';')
            full_lineage = names_text.split(';')
        else:
            lineage_ranks, full_lineage = [], []

        # rank label -> name; if a rank label repeats, the last occurrence wins.
        taxonomy = dict(zip(lineage_ranks, full_lineage))
        taxonomic_info.append({
            'TaxID': row['TaxID'],
            # Second entry of the full lineage (e.g. 'Bacteria' in the sample output).
            'Organism': full_lineage[1] if len(full_lineage) > 1 else None,
            'Lineage' : row['Lineage'],
            'LineageTaxIDs' : row['LineageTaxIDs'],
            'Clade': taxonomy.get('clade'),
            'Order': taxonomy.get('order'),
            'Class': taxonomy.get('class'),
            'Phylum': taxonomy.get('phylum'),
            'Genus': taxonomy.get('genus')
        })
    taxonomic_df = pd.DataFrame(taxonomic_info)
    return pd.merge(data, taxonomic_df, on='TaxID', how='left').drop_duplicates()

def load_and_prepare_prokaryotes(main_dir):
    """
    Loads prokaryotes data and prepares it for processing.

    Parameters:
    - main_dir: Directory containing the tab-separated prokaryotes.txt file.

    Returns:
    - DataFrame of the rows that have an FTP path (missing values and the
      placeholder '-' are dropped), with four columns added:
      'GenomePath' / 'CDSPath' (remote paths ending in '_genomic.fna.gz' /
      '_cds_from_genomic.fna.gz') and 'GenomeFile' / 'CDSFile' (the matching
      bare file names without '.gz').
    """
    prokaryotes_path = os.path.join(main_dir, "prokaryotes.txt")
    prokaryotes = pd.read_csv(prokaryotes_path, sep='\t', low_memory=False)

    # Keep only rows with a usable FTP path; '-' marks "no path" in the input.
    ftp_column = prokaryotes['FTP Path']
    prokaryotes = prokaryotes[ftp_column.notna() & (ftp_column != '-')].copy()

    # The last path component is the assembly name that prefixes every file in
    # the directory. Trailing slashes are stripped first, otherwise that last
    # component would be empty and every derived path would be wrong.
    ftp_dirs = [str(path).rstrip('/') for path in prokaryotes['FTP Path']]
    assembly_names = [ftp_dir.split('/')[-1] for ftp_dir in ftp_dirs]
    remote_prefixes = [f"{ftp_dir}/{name}" for ftp_dir, name in zip(ftp_dirs, assembly_names)]

    # Add columns for genome and CDS file paths
    prokaryotes["GenomePath"] = [prefix + "_genomic.fna.gz" for prefix in remote_prefixes]
    prokaryotes["CDSPath"] = [prefix + "_cds_from_genomic.fna.gz" for prefix in remote_prefixes]

    # Bare file names, without the '.gz' suffix
    prokaryotes["GenomeFile"] = [name + "_genomic.fna" for name in assembly_names]
    prokaryotes["CDSFile"] = [name + "_cds_from_genomic.fna" for name in assembly_names]

    return prokaryotes

def filter_and_process_taxon(prokaryotes):
    """
    Keeps the assemblies whose 'Status' marks them as complete or
    chromosome-level, then annotates them with taxonomic information.

    Parameters:
    - prokaryotes: DataFrame of prokaryotes with a 'Status' column.

    Returns:
    - DataFrame returned by process_prokaryote_taxon for the kept rows.
    """
    # Status labels accepted as finished assemblies
    accepted_statuses = ['Complete Genome', 'Chromosome', 'Complete', 'Chromosome(s)']
    is_finished = prokaryotes['Status'].isin(accepted_statuses)

    # Attach taxonomy to the surviving rows
    return process_prokaryote_taxon(prokaryotes[is_finished])

def save_output(prokaryotes_genomes, dir_path):
    """
    Writes the processed genome table and its two FTP path lists to disk.

    Parameters:
    - prokaryotes_genomes: DataFrame with at least 'CDSPath' and 'GenomePath' columns.
    - dir_path: Directory that receives 'prokaryotes_processed.csv',
      'ncbi_cds_ftp.txt' and 'ncbi_genomes_ftp.txt'.
    """
    # Full table, without the index column
    csv_target = os.path.join(dir_path, "prokaryotes_processed.csv")
    prokaryotes_genomes.to_csv(csv_target, index=False)

    # One path per line: CDS list first, then genome list
    ftp_lists = (
        ('ncbi_cds_ftp.txt', 'CDSPath'),
        ('ncbi_genomes_ftp.txt', 'GenomePath'),
    )
    for list_name, column in ftp_lists:
        with open(os.path.join(dir_path, list_name), 'w') as handle:
            handle.writelines(f"{ftp}\n" for ftp in prokaryotes_genomes[column])

In [3]:
# Data directory holding prokaryotes.txt. Set the PROKARYOTES_DATA_DIR
# environment variable to run on another machine; without it the original
# workstation path is used, so existing runs behave exactly as before.
main_dir = os.environ.get("PROKARYOTES_DATA_DIR", "/Users/akshayonly/Work/Updated/Data/01/")

# Load the assembly table, then keep finished assemblies and attach taxonomy.
prokaryotes = load_and_prepare_prokaryotes(main_dir)
prokaryotes_genomes = filter_and_process_taxon(prokaryotes)

Processing taxonomic information: 100%|████████████████████████████████████████| 15591/15591 [00:00<00:00, 75819.66it/s]


In [4]:
# Summarise columns, non-null counts and dtypes of the taxon-annotated table.
prokaryotes_genomes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45976 entries, 0 to 45975
Data columns (total 35 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   #Organism/Name        45976 non-null  object 
 1   TaxID                 45976 non-null  int64  
 2   BioProject Accession  45976 non-null  object 
 3   BioProject ID         45976 non-null  int64  
 4   Group                 45976 non-null  object 
 5   SubGroup              45976 non-null  object 
 6   Size (Mb)             45976 non-null  float64
 7   GC%                   45976 non-null  object 
 8   Replicons             45976 non-null  object 
 9   WGS                   45976 non-null  object 
 10  Scaffolds             45976 non-null  object 
 11  Genes                 45976 non-null  object 
 12  Proteins              45976 non-null  object 
 13  Release Date          45976 non-null  object 
 14  Modify Date           45976 non-null  object 
 15  Status             

In [5]:
# Number of distinct 'LineageTaxIDs' values among the annotated genomes.
prokaryotes_genomes['LineageTaxIDs'].nunique()

11669

In [6]:
# Discard genomes that have no phylum assignment
has_phylum = prokaryotes_genomes['Phylum'].notna()
prokaryotes_genomes = prokaryotes_genomes[has_phylum]

# Discard phyla whose name begins with 'Candidatus' or 'candidate'
phylum_text = prokaryotes_genomes['Phylum'].str
provisional_phylum = phylum_text.startswith('Candidatus') | phylum_text.startswith('candidate')
prokaryotes_genomes = prokaryotes_genomes[~provisional_phylum]

# Retain only phyla that still have two or more genomes
phylum_counts = prokaryotes_genomes['Phylum'].value_counts()
selected_phyla = phylum_counts.loc[phylum_counts.ge(2)].index
in_selected_phylum = prokaryotes_genomes['Phylum'].isin(selected_phyla)
prokaryotes_genomes = prokaryotes_genomes.loc[in_selected_phylum]

# Require a genus, and one whose name does not begin with 'Candidatus'
has_genus = prokaryotes_genomes['Genus'].notna()
prokaryotes_genomes = prokaryotes_genomes.loc[has_genus]

candidatus_genus = prokaryotes_genomes['Genus'].str.startswith('Candidatus')
prokaryotes_genomes = prokaryotes_genomes.loc[~candidatus_genus]

In [7]:
# Genome count per genus, as a two-column frame
genus = prokaryotes_genomes['Genus'].value_counts().reset_index()

# For every row, the number of distinct 'LineageTaxIDs' found in its genus
lineages_in_genus = prokaryotes_genomes.groupby('Genus')['LineageTaxIDs'].transform('nunique')

# Genera with at least 2 distinct 'LineageTaxIDs', in order of first appearance
genus_to_keep = prokaryotes_genomes.loc[lineages_in_genus >= 2, 'Genus'].unique()

# Keep only the genomes that belong to those genera
belongs_to_kept_genus = prokaryotes_genomes['Genus'].isin(genus_to_keep)
prokaryotes_genomes = prokaryotes_genomes[belongs_to_kept_genus]

In [8]:
# Drop exact duplicate rows and renumber the index from 0 after the filtering above.
prokaryotes_genomes = prokaryotes_genomes.drop_duplicates().reset_index(drop=True)

In [9]:
# Write prokaryotes_processed.csv plus the CDS and genome FTP lists into main_dir.
save_output(prokaryotes_genomes, main_dir)

In [10]:
# Preview the first rows of the filtered, taxon-annotated table.
prokaryotes_genomes.head()

Unnamed: 0,#Organism/Name,TaxID,BioProject Accession,BioProject ID,Group,SubGroup,Size (Mb),GC%,Replicons,WGS,...,GenomeFile,CDSFile,Organism,Lineage,LineageTaxIDs,Clade,Order,Class,Phylum,Genus
0,Campylobacter jejuni subsp. jejuni NCTC 11168 ...,192222,PRJNA8,8,Campylobacterota,Epsilonproteobacteria,1.64148,30.5,chromosome:NC_002163.1/AL111168.1,-,...,GCA_000009085.1_ASM908v1_genomic.fna,GCA_000009085.1_ASM908v1_cds_from_genomic.fna,Bacteria,Campylobacter jejuni,197,,Campylobacterales,Epsilonproteobacteria,Campylobacterota,Campylobacter
1,Pseudomonas fluorescens,294,PRJEB22404,401622,Pseudomonadota,Gammaproteobacteria,6.51155,60.0,chromosome I:NZ_LT907842.1/LT907842.1,-,...,GCA_900215245.1_IMG-taxon_2617270901_annotated...,GCA_900215245.1_IMG-taxon_2617270901_annotated...,Bacteria,Pseudomonas fluorescens,294,,Pseudomonadales,Gammaproteobacteria,Pseudomonadota,Pseudomonas
2,Xanthomonas campestris pv. raphani,359385,PRJNA641237,641237,Pseudomonadota,Gammaproteobacteria,4.94204,65.3,chromosome:NZ_CP058243.1/CP058243.1,-,...,GCA_013388375.1_ASM1338837v1_genomic.fna,GCA_013388375.1_ASM1338837v1_cds_from_genomic.fna,Bacteria,Xanthomonas campestris,339,,Lysobacterales,Gammaproteobacteria,Pseudomonadota,Xanthomonas
3,Salmonella enterica subsp. enterica serovar Ty...,99287,PRJNA241,241,Pseudomonadota,Gammaproteobacteria,4.95138,52.2171,chromosome:NC_003197.2/AE006468.2; plasmid pSL...,-,...,GCA_000006945.2_ASM694v2_genomic.fna,GCA_000006945.2_ASM694v2_cds_from_genomic.fna,Bacteria,Salmonella enterica,28901,,Enterobacterales,Gammaproteobacteria,Pseudomonadota,Salmonella
4,Yersinia pestis A1122,1035377,PRJNA67155,67155,Pseudomonadota,Gammaproteobacteria,4.65841,47.6472,chromosome:CP002956.1; plasmid unnamed:CP00295...,-,...,GCA_000222975.1_ASM22297v1_genomic.fna,GCA_000222975.1_ASM22297v1_cds_from_genomic.fna,Bacteria,Yersinia pestis,632,,Enterobacterales,Gammaproteobacteria,Pseudomonadota,Yersinia


In [11]:
# Proteome file name per genome: the genome file name with '_genomic.fna'
# swapped for '_cds_proteins.faa'.
prokaryotes_genomes['ProteomeFile'] = [
    genome_file.replace('_genomic.fna', '_cds_proteins.faa')
    for genome_file in prokaryotes_genomes['GenomeFile']
]

In [12]:
# Directory that is scanned for proteome files.
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
proteome_dir = '/Users/akshayonly/Work/Sequence-Data/Proteomes'

# Names of every entry in proteome_dir, and the proteome file names expected
# for the selected genomes (sets, so membership checks are cheap).
all_proteome_files = set(os.listdir(proteome_dir))
selected_proteome_files = set(prokaryotes_genomes['ProteomeFile'])

In [13]:
# Summarise the filtered table again, now including the 'ProteomeFile' column.
prokaryotes_genomes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42593 entries, 0 to 42592
Data columns (total 36 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   #Organism/Name        42593 non-null  object 
 1   TaxID                 42593 non-null  int64  
 2   BioProject Accession  42593 non-null  object 
 3   BioProject ID         42593 non-null  int64  
 4   Group                 42593 non-null  object 
 5   SubGroup              42593 non-null  object 
 6   Size (Mb)             42593 non-null  float64
 7   GC%                   42593 non-null  object 
 8   Replicons             42593 non-null  object 
 9   WGS                   42593 non-null  object 
 10  Scaffolds             42593 non-null  object 
 11  Genes                 42593 non-null  object 
 12  Proteins              42593 non-null  object 
 13  Release Date          42593 non-null  object 
 14  Modify Date           42593 non-null  object 
 15  Status             

In [14]:
# Genome FTP paths of the genomes whose proteome file is not present in proteome_dir
proteome_missing = ~prokaryotes_genomes['ProteomeFile'].isin(all_proteome_files)
remaining_genomes = prokaryotes_genomes.loc[proteome_missing, 'GenomePath'].values

In [15]:
# Write the outstanding genome FTP paths, one per line
remaining_list_path = os.path.join('/Users/akshayonly/Work', 'remaining_genomes_ftp.txt')
with open(remaining_list_path, 'w') as handle:
    handle.writelines(f"{ftp}\n" for ftp in remaining_genomes)