In [1]:
import os
import re
import pandas as pd
from Bio import SeqIO
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore")

In [2]:
def extract_information_from_header(header, patterns):
    """Extract named fields from a header string using regular expressions.

    Args:
        header: The text to search (e.g. a FASTA record description).
        patterns: Mapping of field name -> regex pattern. Each pattern must
            contain at least one capture group; the first group is returned.

    Returns:
        A dict with the same keys (and key order) as ``patterns``. Each value
        is the first capture group of the first match in ``header``, or
        ``None`` when the pattern does not match.
    """
    extracted = {}
    for key, pattern in patterns.items():
        # Search once per pattern and reuse the match object instead of
        # running the same regex twice (once to test, once to extract).
        match = re.search(pattern, header)
        extracted[key] = match.group(1) if match else None
    return extracted

def format_gene_symbol(gene_symbol):
    """Normalise a gene symbol to its upper-case suffix.

    Steps, applied in this order:
      1. lower-case the input;
      2. delete every occurrence of '[h' and '[c';
      3. strip surrounding whitespace;
      4. delete every occurrence of 'nuo';
      5. upper-case the result;
      6. delete the characters '-', '_' and '/'.

    Args:
        gene_symbol: Raw gene symbol string.

    Returns:
        The cleaned, upper-cased symbol.
    """
    symbol = gene_symbol.lower()
    for fragment in ('[h', '[c'):
        symbol = symbol.replace(fragment, '')
    # Whitespace is stripped before 'nuo' is removed, so whitespace that
    # follows the prefix inside the symbol is left in place.
    symbol = symbol.strip().replace('nuo', '').upper()
    separators = str.maketrans('', '', '-_/')
    return symbol.translate(separators)

def parse_fasta_files(cds_dir, patterns):
    """Collect every 'gene=nuo' CDS record found in a directory of FASTA files.

    Args:
        cds_dir: Directory containing the CDS FASTA files.
        patterns: Mapping of field name -> regex passed to
            ``extract_information_from_header``. Its values are placed
            positionally into the 'Accession', 'GeneName' and 'ProteinName'
            columns, so it must hold exactly three patterns in that order.

    Returns:
        A DataFrame with one row per matching record and the columns
        'CDSFile', 'Header', 'Accession', 'GeneName', 'ProteinName' and
        'ProteinLength' (length of the translated sequence).
    """
    columns = ['CDSFile', 'Header', 'Accession', 'GeneName', 'ProteinName', 'ProteinLength']

    # os.listdir() yields entries in arbitrary order and also returns hidden
    # files (e.g. .DS_Store) and sub-directories. Sort for a reproducible row
    # order and keep only visible regular files so a stray entry cannot abort
    # a long parsing run.
    fasta_files = sorted(
        name for name in os.listdir(cds_dir)
        if not name.startswith('.') and os.path.isfile(os.path.join(cds_dir, name))
    )

    data = []
    for fasta in tqdm(fasta_files, desc="Parsing FASTA files"):
        for record in SeqIO.parse(os.path.join(cds_dir, fasta), "fasta"):
            header = record.description
            # Case-sensitive substring filter on the record description.
            if "gene=nuo" not in header:
                continue
            # Translate with translation table 11, stopping at the first stop
            # codon; only the resulting protein length is stored.
            prot_seq = record.seq.translate(table=11, to_stop=True)
            extracted = extract_information_from_header(header, patterns)
            data.append([fasta, header, *extracted.values(), len(prot_seq)])

    return pd.DataFrame(data, columns=columns)

In [3]:
# Directory holding the CDS FASTA files to scan.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
cds_dir = "/Users/akshayonly/Work/Sequence-Data/CDS"

# Header regexes; insertion order matters because parse_fasta_files maps the
# extracted values positionally onto Accession / GeneName / ProteinName.
patterns = dict(
    accession_number=r"lcl\|(.*?)_cds",
    gene=r"\[gene=(.*?)\]",
    protein=r"\[protein=(.*?)\]",
)

# Scan every file in cds_dir and collect the 'gene=nuo' records.
nuo_cds = parse_fasta_files(cds_dir, patterns)

Parsing FASTA files: 100%|█| 39893/39893 [08:24<00:00, 79.0


In [4]:
# Normalise gene symbols (missing ones become 'NONE') and keep the first record
# per (file, gene symbol) pair. Reassign instead of mutating in place so the
# step does not depend on hidden state of the previous frame.
nuo_cds = (
    nuo_cds
    .assign(GeneName=lambda frame: frame['GeneName'].fillna('NONE').apply(format_gene_symbol))
    .drop_duplicates(subset=['CDSFile', 'GeneName'])
)

# Subunit label = "Nuo" + gene symbol with all digits removed.
nuo_cds['Subunit'] = "Nuo" + nuo_cds['GeneName'].str.replace(r'\d', '', regex=True)

# Discard labels that are excluded from the analysis. The explicit .copy()
# makes the result an independent frame, so the .loc assignment below is not a
# chained assignment on a filtered selection (which raises
# SettingWithCopyWarning, hidden here by the global warnings filter).
excluded_subunits = ['NuoBC', 'Nuo', 'NuoII', 'NuoLM', 'NuoP']
nuo_cds = nuo_cds[~nuo_cds['Subunit'].isin(excluded_subunits)].copy()

# Protein-name fragments that mark a fused C/D subunit. Each fragment is
# escaped so it is matched literally when the alternatives are joined into a
# single regular expression.
# NOTE(review): the last fragment ('... chain C') is also a substring of
# unfused chain C names, so such NuoC entries are relabelled NuoCD as well;
# confirm this is intended.
fused_name_variants = [
    "subunit C,D", 'subunit C/D', 'subunit C/ D', 'chain C,D', 'chain C/D', 'chain C/ D', 'chain C; chain D',
    "chain C, D", "chain CD", "subunit CD", 'NADH-ubiquinone oxidoreductase chain C / NADH-ubiquinone oxidoreductase chain D',
    'NADH-quinone oxidoreductase, C/D subunit', 'NADH:ubiquinone oxidoreductase, fused CD subunit', 'NADH-ubiquinone oxidoreductase chain C',
]
fused_names = "|".join(re.escape(name) for name in fused_name_variants)

# Relabel NuoC rows whose protein name (case-insensitive) contains a fused-name fragment.
is_fused_cd = nuo_cds['Subunit'].eq('NuoC') & nuo_cds['ProteinName'].str.contains(fused_names, case=False, na=False)
nuo_cds.loc[is_fused_cd, 'Subunit'] = "NuoCD"

In [6]:
# Metadata tables that are joined onto nuo_cds in a later cell:
# prok_data is joined on 'GenomeFile', genome_data on 'Accession'.
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
prok_data = pd.read_csv('/Users/akshayonly/Work/Updated/Data/01/prokaryotes_processed.csv')

genome_data = pd.read_csv('/Users/akshayonly/Work/Updated/Data/01/genome_information.csv')

In [13]:
# Attach genome information (matched on 'Accession'), then the prokaryote
# table (matched on 'GenomeFile'). Both are default inner joins, so rows
# without a match in either table are dropped.
nuo_cds = (
    nuo_cds
    .merge(genome_data, on='Accession')
    .merge(prok_data, on='GenomeFile')
)

In [42]:
# Keep only the columns needed downstream, then remove fully identical rows.
nuo_cds = nuo_cds.loc[:, [
    'Lineage', 'LineageTaxIDs', 'Strain', 'Accession', 'Replicon',
    'CDSFile', 'Header', 'ProteinLength', 'Subunit',
]].drop_duplicates()

In [43]:
# Preview the first rows of the column-reduced, de-duplicated table.
nuo_cds.head()

Unnamed: 0,Lineage,LineageTaxIDs,Strain,Accession,Replicon,CDSFile,Header,ProteinLength,Subunit
0,Salmonella enterica,28901,CRSE-01,NZ_CP126166.1,Chromosome,GCF_030168445.1_ASM3016844v1_cds_from_genomic.fna,lcl|NZ_CP126166.1_cds_WP_000062993.1_2864 [gen...,147,NuoA
1,Salmonella enterica,28901,CRSE-01,NZ_CP126166.1,Chromosome,GCF_030168445.1_ASM3016844v1_cds_from_genomic.fna,lcl|NZ_CP126166.1_cds_WP_000386728.1_2865 [gen...,220,NuoB
2,Salmonella enterica,28901,CRSE-01,NZ_CP126166.1,Chromosome,GCF_030168445.1_ASM3016844v1_cds_from_genomic.fna,lcl|NZ_CP126166.1_cds_WP_000247855.1_2866 [gen...,600,NuoCD
3,Salmonella enterica,28901,CRSE-01,NZ_CP126166.1,Chromosome,GCF_030168445.1_ASM3016844v1_cds_from_genomic.fna,lcl|NZ_CP126166.1_cds_WP_000545038.1_2867 [gen...,166,NuoE
4,Salmonella enterica,28901,CRSE-01,NZ_CP126166.1,Chromosome,GCF_030168445.1_ASM3016844v1_cds_from_genomic.fna,lcl|NZ_CP126166.1_cds_WP_000800048.1_2868 [gen...,445,NuoF


In [50]:
# Reference table of UniProt Nuo protein lengths; the 'Subunit' and
# 'MinLength' columns are used for the length screen in a later cell.
uniprot = pd.read_csv('/Users/akshayonly/Work/Updated/Data/Misc/Uniprot/uniprot_reviewed_nuo_proteins_length.csv')

In [51]:
# Attach the per-subunit minimum length from the UniProt table. The left join
# keeps every nuo_cds row; subunits without a reference get MinLength = NaN.
merged_data = pd.merge(
    nuo_cds,
    uniprot.loc[:, ['Subunit', 'MinLength']],
    on='Subunit',
    how='left',
)

# A row counts as "Reviewed" when its protein is at least as long as the
# reference minimum. Comparisons against NaN are False, so rows without a
# reference keep the default "To-Review".
meets_min_length = merged_data['ProteinLength'].ge(merged_data['MinLength'])
merged_data['ReviewStatus'] = "To-Review"
merged_data.loc[meets_min_length, 'ReviewStatus'] = "Reviewed"

In [52]:
# Preview the table with the MinLength reference and the derived ReviewStatus.
merged_data.head()

Unnamed: 0,Lineage,LineageTaxIDs,Strain,Accession,Replicon,CDSFile,Header,ProteinLength,Subunit,MinLength,ReviewStatus
0,Salmonella enterica,28901,CRSE-01,NZ_CP126166.1,Chromosome,GCF_030168445.1_ASM3016844v1_cds_from_genomic.fna,lcl|NZ_CP126166.1_cds_WP_000062993.1_2864 [gen...,147,NuoA,116.0,Reviewed
1,Salmonella enterica,28901,CRSE-01,NZ_CP126166.1,Chromosome,GCF_030168445.1_ASM3016844v1_cds_from_genomic.fna,lcl|NZ_CP126166.1_cds_WP_000386728.1_2865 [gen...,220,NuoB,158.0,Reviewed
2,Salmonella enterica,28901,CRSE-01,NZ_CP126166.1,Chromosome,GCF_030168445.1_ASM3016844v1_cds_from_genomic.fna,lcl|NZ_CP126166.1_cds_WP_000247855.1_2866 [gen...,600,NuoCD,519.0,Reviewed
3,Salmonella enterica,28901,CRSE-01,NZ_CP126166.1,Chromosome,GCF_030168445.1_ASM3016844v1_cds_from_genomic.fna,lcl|NZ_CP126166.1_cds_WP_000545038.1_2867 [gen...,166,NuoE,160.0,Reviewed
4,Salmonella enterica,28901,CRSE-01,NZ_CP126166.1,Chromosome,GCF_030168445.1_ASM3016844v1_cds_from_genomic.fna,lcl|NZ_CP126166.1_cds_WP_000800048.1_2868 [gen...,445,NuoF,417.0,Reviewed


In [57]:
# MinLength was only needed to derive ReviewStatus; remove it from the final table.
nuo_cds = merged_data.drop('MinLength', axis=1)

In [59]:
# Write the pre-screened entries to CSV; index=False leaves the DataFrame index out of the file.
nuo_cds.to_csv('/Users/akshayonly/Work/Updated/Data/02/prescreened_nuo_entries_from_cds.csv', index=False)