In [1]:
import os
import re
import pandas as pd
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from tqdm import tqdm

import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, including pandas warnings raised by the cells below. Consider
# narrowing the filter to the specific category/module that is noisy.
warnings.filterwarnings("ignore")

In [2]:
# Load the pre-screened Nuo CDS entries table into `data`.
prescreened_csv = '/Users/akshayonly/Work/Updated/Data/02/prescreened_nuo_entries_from_cds.csv'
data = pd.read_csv(prescreened_csv)

In [3]:
# Regex pattern to extract numbers
pattern = re.compile(r'\d+')

# Second-to-last space-separated token of each header.
# NOTE(review): presumably the "[location=...]" field of the CDS header -- confirm.
location_tokens = data['Header'].apply(lambda x: x.split(' ')[-2])

# Keep the first two numbers of every token as Start/End. Each row is padded
# with None so the frame always has exactly two columns, regardless of how many
# numbers the longest token contains (the previous `.iloc[:, :-2]` only produced
# two columns when the longest token held exactly four numbers).
extracted_numbers = pd.DataFrame(
    [(pattern.findall(token) + [None, None])[:2] for token in location_tokens],
    columns=['Start', 'End'],
    index=data.index,
)

# Drop Start/End left over from a previous run of this cell so that re-running
# it replaces the columns instead of appending duplicates.
data = pd.concat(
    [data.drop(columns=['Start', 'End'], errors='ignore'), extracted_numbers],
    axis=1,
)

In [4]:
# Convert the extracted coordinate columns to integers.
for coord_col in ('Start', 'End'):
    data[coord_col] = data[coord_col].astype(int)

In [5]:
def cluster_genes(group, threshold=250):
    """
    Assign a positional cluster ID to every row of one group of genes.

    Rows are sorted by 'Start'. A new cluster begins whenever the gap between a
    gene's 'Start' and the previous gene's 'End' is at least `threshold`.

    :param group: DataFrame with numeric 'Start' and 'End' columns
    :param threshold: Minimum Start-to-previous-End gap that opens a new cluster
    :return: Copy of `group` sorted by 'Start' (original index labels kept) with
             an integer 'Cluster' column starting at 0
    """
    # sort_values returns a new frame, so the caller's frame is not modified
    group = group.sort_values('Start')

    # Gap between each gene and its predecessor. The first row has no
    # predecessor, so its gap is NaN and `NaN >= threshold` is False, which
    # keeps it in cluster 0.
    gaps = group['Start'] - group['End'].shift()

    # Whole-column assignment instead of the chained `group['Cluster'][1:] = ...`,
    # which does not write back to the frame under pandas copy-on-write.
    group['Cluster'] = (gaps >= threshold).cumsum().astype(int)

    return group

In [6]:
# Cluster the genes of each accession separately and stack the results.
# Iterating over the groups (instead of groupby.apply + reset_index(drop=True))
# keeps the 'Accession' column in the result on pandas versions where
# groupby.apply no longer passes the grouping column to the applied function.
clustered_groups = [cluster_genes(group) for _, group in data.groupby('Accession')]

if clustered_groups:
    data_cls = pd.concat(clustered_groups, ignore_index=True)
else:
    # pd.concat rejects an empty list; fall back to an empty frame that still
    # carries the expected columns.
    data_cls = data.iloc[0:0].assign(Cluster=pd.Series(dtype=int))

In [7]:
# Alphabetically ordered list of every subunit label present in the data
all_subunits = sorted(data_cls['Subunit'].unique())

# Count rows per (Accession, Cluster) and subunit, guarantee one column per
# subunit, then turn the index levels back into ordinary columns.
pivot = (
    data_cls
    .pivot_table(index=['Accession', 'Cluster'],
                 columns='Subunit',
                 aggfunc='size',
                 fill_value=0)
    .reindex(columns=all_subunits, fill_value=0)
    .reset_index()
)

# Count table with flat, explicit column labels
nuo_count = pd.DataFrame(pivot)
nuo_count.columns = ['Accession', 'Cluster'] + all_subunits  # Adjust if specific names are needed

# Presence/absence version: True where a subunit occurs at least once
nuo_bool = nuo_count.copy()
nuo_bool[all_subunits] = nuo_bool[all_subunits].ge(1)

# Number of distinct subunits present in each (Accession, Cluster)
nuo_bool['Count'] = nuo_bool[all_subunits].sum(axis=1)

In [8]:
# Preview the first rows of the clustered table (includes the 'Cluster' column).
data_cls.head()

Unnamed: 0,Lineage,LineageTaxIDs,Strain,Accession,Replicon,CDSFile,Header,ProteinLength,Subunit,ReviewStatus,Start,End,Cluster
0,Escherichia coli,562,2020CK-00188,ABACVG020000001.1,Chromosome,GCA_018071945.2_ASM1807194v2_cds_from_genomic.fna,lcl|ABACVG020000001.1_cds_MCZ6945865.1_824 [ge...,485,NuoN,Reviewed,876770,878227,0
1,Escherichia coli,562,2020CK-00188,ABACVG020000001.1,Chromosome,GCA_018071945.2_ASM1807194v2_cds_from_genomic.fna,lcl|ABACVG020000001.1_cds_MCZ6945866.1_825 [ge...,509,NuoM,Reviewed,878234,879763,0
2,Escherichia coli,562,2020CK-00188,ABACVG020000001.1,Chromosome,GCA_018071945.2_ASM1807194v2_cds_from_genomic.fna,lcl|ABACVG020000001.1_cds_MCZ6945867.1_826 [ge...,613,NuoL,Reviewed,879994,881835,0
3,Escherichia coli,562,2020CK-00188,ABACVG020000001.1,Chromosome,GCA_018071945.2_ASM1807194v2_cds_from_genomic.fna,lcl|ABACVG020000001.1_cds_MCZ6945868.1_827 [ge...,100,NuoK,Reviewed,881832,882134,0
4,Escherichia coli,562,2020CK-00188,ABACVG020000001.1,Chromosome,GCA_018071945.2_ASM1807194v2_cds_from_genomic.fna,lcl|ABACVG020000001.1_cds_MCZ6945869.1_828 [ge...,184,NuoJ,Reviewed,882131,882685,0


In [9]:
# Preview the first row of `data`, which has Start/End but no 'Cluster' column.
data.head(1)

Unnamed: 0,Lineage,LineageTaxIDs,Strain,Accession,Replicon,CDSFile,Header,ProteinLength,Subunit,ReviewStatus,Start,End
0,Salmonella enterica,28901,CRSE-01,NZ_CP126166.1,Chromosome,GCF_030168445.1_ASM3016844v1_cds_from_genomic.fna,lcl|NZ_CP126166.1_cds_WP_000062993.1_2864 [gen...,147,NuoA,Reviewed,3108466,3108909


In [10]:
# Global configurations
DATA_DIR = "/Users/akshayonly/Work/Sequence-Data/CDS"
OUTPUT_DIR = "/Users/akshayonly/Work/Sequence-Data/Nuo-Sequences"

# exist_ok=True creates the directory only when it is missing, without the
# separate exists() check (and the race between checking and creating).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Function to translate the selected CDS records and write one FASTA per subunit
def process_sequences(cds, cds_dir, output_dir=None):
    """
    Translate matching CDS records and save them as per-subunit protein FASTA files.

    Only rows with Replicon == 'Chromosome' and ReviewStatus == 'Reviewed' are
    used. For every file named in 'CDSFile', each FASTA record whose description
    contains one of that file's 'Header' values is translated (table 11, up to
    the first stop codon) and collected under the corresponding subunit. One
    file "<subunit lowercased>_cds.faa" is written per subunit that has at
    least one sequence.

    :param cds: DataFrame with 'Replicon', 'ReviewStatus', 'CDSFile', 'Subunit'
                and 'Header' columns
    :param cds_dir: Directory containing the CDS FASTA files named in 'CDSFile'
    :param output_dir: Directory for the output files; defaults to the
                       module-level OUTPUT_DIR when None
    """
    if output_dir is None:
        output_dir = OUTPUT_DIR
    # Make sure the destination exists before spending time on parsing
    os.makedirs(output_dir, exist_ok=True)

    # Keep only reviewed chromosomal entries
    chr_cds = cds[(cds['Replicon']=='Chromosome') & (cds['ReviewStatus']=='Reviewed')]
    # Map each CDS file to its list of [subunit, header] pairs
    cds_to_info = chr_cds.groupby('CDSFile')[['Subunit', 'Header']].apply(lambda x: x.values.tolist()).to_dict()
    # One (possibly empty) bucket per subunit seen anywhere in the input table
    sequence_data = {subunit: [] for subunit in cds['Subunit'].unique()}

    # Process each CDS file in order of first appearance; this order determines
    # the numbering of the output records
    for fasta in tqdm(chr_cds['CDSFile'].unique()):
        fasta_fullpath = os.path.join(cds_dir, fasta)
        for record in SeqIO.parse(fasta_fullpath, "fasta"):
            for subunit, header in cds_to_info[fasta]:
                # Substring match of the stored header against the record description
                if header in record.description:
                    sequence_data[subunit].append(record.seq.translate(table=11, to_stop=True))

    # Write sequences to files, skipping subunits without any sequence
    for subunit, sequences in sequence_data.items():
        records = [SeqRecord(seq, id=f"{subunit}_{i+1}", description=f"{subunit} Subunit") for i, seq in enumerate(sequences)]
        if records:
            SeqIO.write(records, os.path.join(output_dir, f"{subunit.lower()}_cds.faa"), "fasta")

In [11]:
# Translate and export the sequences for all entries in `data`.
# NOTE(review): this passes `data`, not `data_cls`, so the cluster labels
# computed above are not used by the export -- confirm this is intended.
process_sequences(data, DATA_DIR)

100%|██████████| 24696/24696 [06:56<00:00, 59.26it/s]
