In [1]:
import pandas as pd
import numpy as np
import os
import statistics as st

from tqdm import tqdm
from glob import glob

import re
import seaborn as sns
import matplotlib.pyplot as plt

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides any pandas deprecation or
# chained-assignment warnings the cells below may emit — consider narrowing it.
warnings.filterwarnings('ignore')  # Ignore all warnings

In [2]:
def cluster_genes(group, threshold=250):
    """Assign positional cluster IDs to the rows of one group of gene hits.

    Rows are ordered by 'Start'. A new cluster begins whenever the distance
    between a row's 'Start' and the previous row's 'End' is at least
    ``threshold``; otherwise the row joins the previous row's cluster.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with numeric 'Start' and 'End' columns (in this notebook, all
        hits that share one 'Accession').
    threshold : int, default 250
        Minimum Start-to-previous-End distance that separates two clusters,
        in the same units as 'Start'/'End' (presumably base pairs — confirm).

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` sorted by 'Start' with an added int64 'Cluster'
        column numbered from 0. The input frame is not modified.
    """
    # sort_values returns a new frame, so the caller's data is left untouched
    ordered = group.sort_values('Start')

    starts = ordered['Start'].to_numpy()
    ends = ordered['End'].to_numpy()

    # The first row always opens cluster 0; empty and single-row groups
    # therefore need no distance computation at all.
    labels = np.zeros(len(ordered), dtype=np.int64)
    if len(ordered) > 1:
        # Distance between each gene's start and the previous gene's end
        distances = starts[1:] - ends[:-1]
        # Every sufficiently large distance increments the running cluster ID
        labels[1:] = np.cumsum(distances >= threshold)

    # Assign the whole column in one step. The previous chained form
    # (group['Cluster'][1:] = ...) never writes back to the frame when pandas
    # Copy-on-Write is active, leaving every row in cluster 0.
    ordered['Cluster'] = labels

    return ordered

In [3]:
def generate_subunit_data(hmmer):
    """Summarise hits per (Accession, Cluster) as subunit counts and flags.

    Parameters
    ----------
    hmmer : pandas.DataFrame
        Hit table with at least 'Accession', 'Cluster' and 'Subunit' columns.

    Returns
    -------
    nuo_count : pandas.DataFrame
        One row per (Accession, Cluster) with the number of hits for each of
        the 16 subunit labels listed below (0 when a label has no hit).
        'Subunit' values outside that list are not represented.
    nuo_bool : pandas.DataFrame
        Same layout with each count replaced by ``count >= 1``, plus a
        'Count' column holding the number of labels present in the row.
    """
    key_columns = ['Accession', 'Cluster']
    subunit_order = ['NuoA', 'NuoB', 'NuoBCD', 'NuoC',
                     'NuoCD', 'NuoD', 'NuoE', 'NuoF',
                     'NuoG', 'NuoH', 'NuoI', 'NuoJ',
                     'NuoK', 'NuoL', 'NuoM', 'NuoN']

    # Count hits per subunit label, force the fixed column set (absent labels
    # become all-zero columns), then move the keys back into ordinary columns.
    wide = (
        hmmer.pivot_table(index=['Accession', 'Cluster'],
                          columns='Subunit',
                          aggfunc='size',
                          fill_value=0)
        .reindex(columns=subunit_order, fill_value=0)
        .reset_index()
    )

    # Plain column labels (this also drops the 'Subunit' columns-axis name)
    nuo_count = pd.DataFrame(wide)
    nuo_count.columns = key_columns + subunit_order

    # Presence/absence view of the same table
    nuo_bool = nuo_count.copy()
    nuo_bool[subunit_order] = nuo_bool[subunit_order].ge(1)

    # Number of distinct subunit labels found in each cluster
    nuo_bool['Count'] = nuo_bool[subunit_order].sum(axis=1)

    return nuo_count, nuo_bool

In [4]:
# Load the processed prokaryote table; later cells use its GenomeFile, Organism,
# Lineage, LineageTaxIDs, Strain and TaxID columns.
# NOTE(review): absolute, machine-specific path — will not resolve elsewhere.
prok_info = pd.read_csv('/Users/akshayonly/Work/Updated/Data/01/prokaryotes_processed.csv')

In [5]:
# Load the processed HMMER hit table and the per-genome information table.
# low_memory=False makes pandas read the hit file in one pass rather than in
# chunks when inferring column dtypes.
# NOTE(review): absolute, machine-specific paths — will not resolve elsewhere.
hmmer = pd.read_csv('/Users/akshayonly/Work/Updated/Data/hmmer_search_results_processed.csv', low_memory=False)
metadata = pd.read_csv('/Users/akshayonly/Work/Updated/Data/01/genome_information.csv')

In [6]:
# Attach organism/lineage annotations to the genome information, matching rows
# on 'GenomeFile' (inner join: genomes missing from either table are dropped).
# NOTE(review): this overwrites `metadata`, so re-running the cell without
# reloading merges the already-merged table again.
annotation_columns = ['GenomeFile', 'Organism', 'Lineage', 'LineageTaxIDs', 'Strain', 'TaxID']
metadata = prok_info[annotation_columns].merge(metadata, on='GenomeFile')

In [7]:
# Remove rows that are exact duplicates across all columns
metadata = metadata.drop_duplicates()

In [8]:
# Apply the clustering function to each subset of hits sharing an 'Accession'.
# The groups are iterated explicitly instead of using GroupBy.apply: iteration
# always hands cluster_genes the full sub-frame including the 'Accession'
# column, whereas GroupBy.apply operating on the grouping column is deprecated
# in recent pandas; without it, reset_index(drop=True) would discard
# 'Accession' and the merge below would fail.
clustered_groups = [cluster_genes(group) for _, group in hmmer.groupby('Accession')]

if clustered_groups:
    # Same row order as before: groups in sorted key order, rows sorted by
    # 'Start' within each group, fresh 0..n-1 index.
    hmmer = pd.concat(clustered_groups, ignore_index=True)
else:
    # No groups (empty table): keep the schema and add an empty 'Cluster' column
    hmmer = hmmer.iloc[0:0].assign(Cluster=0)

# Attach the proteome file name of each accession (inner join on 'Accession')
hmmer = pd.merge(metadata[['Accession', 'ProteomeFile']], hmmer, on='Accession')

In [9]:
# Inspect columns, dtypes and non-null counts of the clustered hit table
hmmer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756835 entries, 0 to 756834
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Accession         756835 non-null  object 
 1   ProteomeFile      756835 non-null  object 
 2   ProteinAccession  756835 non-null  object 
 3   Subunit           756835 non-null  object 
 4   evalue            756835 non-null  float64
 5   BitScore          756835 non-null  float64
 6   Bias              756835 non-null  float64
 7   SequenceDesc      756835 non-null  object 
 8   Start             756835 non-null  int64  
 9   End               756835 non-null  int64  
 10  Cluster           756835 non-null  int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 63.5+ MB


In [10]:
# Peek at the first row of the merged metadata to check its columns
metadata.head(1)

Unnamed: 0,GenomeFile,Organism,Lineage,LineageTaxIDs,Strain,TaxID,Accession,Replicon,ProteomeFile,SequenceLength(Mb)
0,GCA_000009085.1_ASM908v1_genomic.fna,Bacteria,Campylobacter jejuni,197,NCTC 11168,192222,AL111168.1,Chromosome,GCA_000009085.1_ASM908v1_cds_proteins.faa,1.641481


In [11]:
# Per-(Accession, Cluster) subunit hit counts and their presence/absence view
nuo_count, nuo_bool = generate_subunit_data(hmmer)

In [12]:
# Number of distinct accessions in the hit table
hmmer['Accession'].nunique()

41277

In [13]:
# Sanity check: the pivoted table should cover the same number of accessions
nuo_bool['Accession'].nunique()

41277

In [14]:
# Column order for the per-cluster boolean table: identifiers and the subunit
# tally first, then one indicator column per subunit label.
rearrange_columns = (
    ['Accession', 'Cluster', 'Count']
    + ['NuoA', 'NuoB', 'NuoBCD', 'NuoC', 'NuoCD', 'NuoD', 'NuoE', 'NuoF']
    + ['NuoG', 'NuoH', 'NuoI', 'NuoJ', 'NuoK', 'NuoL', 'NuoM', 'NuoN']
)

In [15]:
# Reorder the columns so 'Count' sits before the subunit indicators; the
# classification cell below relies on the subunit columns coming last.
nuo_bool = nuo_bool[rearrange_columns]

In [16]:
# Defining the combinations: each entry maps a label to the exact set of
# subunit columns that must be present. The commented-out variants are
# currently disabled and therefore never assigned.
combinations = {
    'Complete 14 Subunits': ['NuoA', 'NuoB', 'NuoC', 'NuoD', 'NuoE', 'NuoF', 'NuoG', 'NuoH', 'NuoI', 'NuoJ', 'NuoK', 'NuoL', 'NuoM', 'NuoN'],
    '13 Subunits with Fused CD': ['NuoA', 'NuoB', 'NuoCD', 'NuoE', 'NuoF', 'NuoG', 'NuoH', 'NuoI', 'NuoJ', 'NuoK', 'NuoL', 'NuoM', 'NuoN'],
    # '12 Subunits with Fused BCD': ['NuoA', 'NuoBCD', 'NuoE', 'NuoF', 'NuoG', 'NuoH', 'NuoI', 'NuoJ', 'NuoK', 'NuoL', 'NuoM', 'NuoN'],
    # '12 Subunits Excluding NuoEF': ['NuoA', 'NuoB', 'NuoC', 'NuoD', 'NuoG', 'NuoH', 'NuoI', 'NuoJ', 'NuoK', 'NuoL', 'NuoM', 'NuoN'],
    # '11 Subunits with Fused CD Excluding NuoEF': ['NuoA', 'NuoB', 'NuoCD', 'NuoG', 'NuoH', 'NuoI', 'NuoJ', 'NuoK', 'NuoL', 'NuoM', 'NuoN'],
    # '11 Subunits Excluding NuoEFG': ['NuoA', 'NuoB', 'NuoC', 'NuoD', 'NuoH', 'NuoI', 'NuoJ', 'NuoK', 'NuoL', 'NuoM', 'NuoN'],
    # '11 Subunits with Fused CD Excluding NuoEFG': ['NuoA', 'NuoB', 'NuoCD', 'NuoH', 'NuoI', 'NuoJ', 'NuoK', 'NuoL', 'NuoM', 'NuoN']
}

# Subunit indicator columns, selected by name rather than by position. A
# positional slice (iloc[:, 3:]) would also pick up 'ComplexType' once this
# cell has run, and its non-empty strings would make the "no other subunit
# present" test below fail for every row on a re-run.
all_subunits = [column for column in nuo_bool.columns if column.startswith('Nuo')]

# Default label for every row. Because only the combinations above are active,
# this is a catch-all: any cluster that matches none of them keeps this label,
# whatever its actual subunit count.
nuo_bool['ComplexType'] = 'Less than 11 Subunits'

# Checking for each variation
results = {}  # currently unused in this cell
# Iterating over combinations to update the new column
for name, true_subunits in tqdm(combinations.items()):
    # Every subunit column outside the combination must be absent
    false_subunits = [subunit for subunit in all_subunits if subunit not in true_subunits]
    
    # Identifying rows that match the current combination exactly
    match_mask = (nuo_bool[true_subunits].all(axis=1)) & (~nuo_bool[false_subunits].any(axis=1))
    
    # Updating the 'ComplexType' column for rows that match the combination
    nuo_bool.loc[match_mask, 'ComplexType'] = name

100%|███████████████████████| 2/2 [00:00<00:00, 146.22it/s]


In [17]:
# Metadata fields carried into the detailed per-cluster table
selected_columns = ['Accession', 'Replicon', 'ProteomeFile', 'Organism', 'Lineage', 'LineageTaxIDs', 'Strain', 'TaxID']

# Inner join on 'Accession': every classified cluster row gains its genome
# metadata; merge builds a new frame, so nuo_bool itself is left unchanged.
nuo_bool_detailed = metadata[selected_columns].merge(nuo_bool, on='Accession')

In [18]:
# Metadata rows whose proteome file does not occur anywhere in the detailed table
metadata_leftout = metadata.loc[
    ~metadata['ProteomeFile'].isin(nuo_bool_detailed['ProteomeFile']),
    selected_columns,
]

# Flag every subunit as missing, label the rows 'Absent' and leave the
# cluster undefined.
leftout_values = {col: False for col in all_subunits}
leftout_values['ComplexType'] = 'Absent'
leftout_values['Cluster'] = None
metadata_leftout = metadata_leftout.assign(**leftout_values)

# Shared column layout for both tables ('Count' is not part of it)
rearrange_columns = [
    'Accession', 'Replicon', 'ProteomeFile', 'Organism',
    'Lineage', 'LineageTaxIDs', 'Strain', 'TaxID',
    'ComplexType', 'Cluster',
    'NuoA', 'NuoB', 'NuoBCD', 'NuoC', 'NuoCD', 'NuoD', 'NuoE', 'NuoF',
    'NuoG', 'NuoH', 'NuoI', 'NuoJ', 'NuoK', 'NuoL', 'NuoM', 'NuoN',
]

# Stack the classified clusters on top of the genomes without hits; the
# original row labels are kept here.
nuo_bool_detailed = pd.concat(
    [nuo_bool_detailed[rearrange_columns], metadata_leftout[rearrange_columns]]
)

In [19]:
# LineageTaxIDs values that have at least one incomplete or absent row
incomplete_set = set(
    nuo_bool_detailed.loc[
        nuo_bool_detailed['ComplexType'].isin(['Less than 11 Subunits', 'Absent']),
        'LineageTaxIDs',
    ]
)

# LineageTaxIDs values that have at least one full complex (either combination)
complete_set = set(
    nuo_bool_detailed.loc[
        nuo_bool_detailed['ComplexType'].isin(['Complete 14 Subunits', '13 Subunits with Fused CD']),
        'LineageTaxIDs',
    ]
)

# Rows to discard: incomplete/absent rows whose LineageTaxIDs value also
# appears among the complete ones.
mask = (
    nuo_bool_detailed['LineageTaxIDs'].isin(incomplete_set & complete_set)
    & nuo_bool_detailed['ComplexType'].isin(['Less than 11 Subunits', 'Absent'])
)

# Keep everything else and renumber the rows from 0
nuo_bool_detailed = nuo_bool_detailed[~mask].reset_index(drop=True)

In [20]:
# For each label (alphabetical order) print: label, number of distinct
# LineageTaxIDs values, number of distinct proteome files.
for complex_type in sorted(nuo_bool_detailed['ComplexType'].unique()):
    rows_of_type = nuo_bool_detailed[nuo_bool_detailed['ComplexType'] == complex_type]
    no_lineages = rows_of_type['LineageTaxIDs'].nunique()
    no_genomes = rows_of_type['ProteomeFile'].nunique()
    print(complex_type, no_lineages, no_genomes)

13 Subunits with Fused CD 1345 12726
Absent 801 5426
Complete 14 Subunits 2610 7201
Less than 11 Subunits 5160 15239


In [21]:
# Display the clusters labelled as a full complex (either accepted combination)
nuo_bool_detailed[(nuo_bool_detailed['ComplexType'].isin(['Complete 14 Subunits', '13 Subunits with Fused CD']))]

Unnamed: 0,Accession,Replicon,ProteomeFile,Organism,Lineage,LineageTaxIDs,Strain,TaxID,ComplexType,Cluster,...,NuoE,NuoF,NuoG,NuoH,NuoI,NuoJ,NuoK,NuoL,NuoM,NuoN
5,LT907842.1,Chromosome,GCA_900215245.1_IMG-taxon_2617270901_annotated...,Bacteria,Pseudomonas fluorescens,294,ATCC 13525,294,13 Subunits with Fused CD,3,...,True,True,True,True,True,True,True,True,True,True
6,CP058243.1,Chromosome,GCA_013388375.1_ASM1338837v1_cds_proteins.faa,Bacteria,Xanthomonas campestris,339,MAFF106181,359385,Complete 14 Subunits,3,...,True,True,True,True,True,True,True,True,True,True
7,CP002956.1,Chromosome,GCA_000222975.1_ASM22297v1_cds_proteins.faa,Bacteria,Yersinia pestis,632,A1122,1035377,13 Subunits with Fused CD,6,...,True,True,True,True,True,True,True,True,True,True
31,CP018019.1,Chromosome,GCA_022175585.1_ASM2217558v1_cds_proteins.faa,Bacteria,Mycobacterium avium,1764,OCU889s_P11_4s,439334,Complete 14 Subunits,2,...,True,True,True,True,True,True,True,True,True,True
32,NZ_CP039850.1,Chromosome,GCF_005156105.1_ASM515610v1_cds_proteins.faa,Bacteria,Mycobacterium tuberculosis,1773,Danish 1331,1765,Complete 14 Subunits,6,...,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114499,CP133106.1,Chromosome,GCA_031236885.1_ASM3123688v1_cds_proteins.faa,Bacteria,Escherichia coli,562,8523,562,13 Subunits with Fused CD,4,...,True,True,True,True,True,True,True,True,True,True
114500,CP085372.1,Chromosome,GCA_020673295.1_ASM2067329v1_cds_proteins.faa,Bacteria,Escherichia coli,562,NDM7,562,13 Subunits with Fused CD,6,...,True,True,True,True,True,True,True,True,True,True
114501,CP092707.1,Chromosome,GCA_022488325.1_ASM2248832v1_cds_proteins.faa,Bacteria,Escherichia coli,562,S-P-N-065.01,562,13 Subunits with Fused CD,8,...,True,True,True,True,True,True,True,True,True,True
114502,CP092704.1,Chromosome,GCA_022488265.1_ASM2248826v1_cds_proteins.faa,Bacteria,Escherichia coli,562,S-P-N-045.01,562,13 Subunits with Fused CD,7,...,True,True,True,True,True,True,True,True,True,True


In [22]:
# Accessions that still carry a 'Less than 11 Subunits' row, and their hits
partial_comp_one_accession = nuo_bool_detailed.loc[
    nuo_bool_detailed['ComplexType'] == 'Less than 11 Subunits', 'Accession'
].unique()
partial_comp_one_hmmer = hmmer[hmmer['Accession'].isin(partial_comp_one_accession)]

# Accessions with at least one full complex, and all of their hits
# (every cluster of those accessions, not only the complete one)
complete_comp_one_accession = nuo_bool_detailed.loc[
    nuo_bool_detailed['ComplexType'].isin(['Complete 14 Subunits', '13 Subunits with Fused CD']),
    'Accession',
].unique()
complete_hmmer = hmmer[hmmer['Accession'].isin(complete_comp_one_accession)]

In [23]:
# Spot-check a single accession (U00096.3)
nuo_bool_detailed[nuo_bool_detailed['Accession']=='U00096.3']

Unnamed: 0,Accession,Replicon,ProteomeFile,Organism,Lineage,LineageTaxIDs,Strain,TaxID,ComplexType,Cluster,...,NuoE,NuoF,NuoG,NuoH,NuoI,NuoJ,NuoK,NuoL,NuoM,NuoN
48,U00096.3,Chromosome,GCA_000005845.2_ASM584v2_cds_proteins.faa,Bacteria,Escherichia coli,562,K-12 substr. MG1655,511145,13 Subunits with Fused CD,8,...,True,True,True,True,True,True,True,True,True,True


In [24]:
# Rows of the detailed table that were labelled as a full complex
nuo_filtered = nuo_bool_detailed.loc[
    nuo_bool_detailed['ComplexType'].isin(['Complete 14 Subunits', '13 Subunits with Fused CD'])
]

# Keep only the hits whose (Accession, Cluster) pair is one of those complete
# clusters, then renumber the rows from 0.
complete_hmmer_filtered = pd.merge(
    complete_hmmer,
    nuo_filtered[['Accession', 'Cluster']].drop_duplicates(),
    on=['Accession', 'Cluster'],
    how='inner',
).reset_index(drop=True)

In [25]:
# Distinct accessions in the full hit table (baseline for the counts below)
hmmer['Accession'].nunique()

41277

In [26]:
# Distinct accessions contributing hits from a complete cluster
complete_hmmer_filtered['Accession'].nunique()

19918

In [27]:
# Distinct accessions among the hits of the 'Less than 11 Subunits' group
partial_comp_one_hmmer['Accession'].nunique()

17206

In [28]:
# saving_dir = "/home/arglab/Work/Projects/04-Complex-I-Spread/Scripts-Data/MAIN-ANALYSIS/Secondary"

# nuo_count.to_csv(os.path.join(saving_dir, 'complex_one_count.csv'), index=False)

# nuo_bool_detailed.to_csv(os.path.join(saving_dir, 'complex_one_boolean.csv'), index=False)

# partial_comp_one_hmmer.to_csv("/home/arglab/Work/Projects/04-Complex-I-Spread/Scripts-Data/MAIN-ANALYSIS/processed_hmmer_search_results_02_partial.csv", index=False)

# complete_hmmer_filtered.to_csv("/home/arglab/Work/Projects/04-Complex-I-Spread/Scripts-Data/MAIN-ANALYSIS/processed_hmmer_search_results_02_complete.csv", index=False)

# hmmer.to_csv("/home/arglab/Work/Projects/04-Complex-I-Spread/Scripts-Data/MAIN-ANALYSIS/processed_hmmer_search_results_02_clustered.csv", index=False)

In [29]:
# def split_dataframe(df, n):
#     # Calculate the number of subsets
#     num_subsets = len(df) // n + (1 if len(df) % n != 0 else 0)
#     subsets = []
#     for i in range(num_subsets):
#         subset = df.iloc[i*n:(i+1)*n]
#         subsets.append(subset)
#     return subsets

# partial_comp_one_hmmer = partial_comp_one_hmmer.sort_values('ProteinAccession')

# hmmer_subsets = split_dataframe(partial_comp_one_hmmer, 55000)

# for i, subset in enumerate(hmmer_subsets):
#     sequence_records = []
#     subset = subset.groupby('ProteomeFile')
#     for ProteomeFile, data in tqdm(subset, desc=f"Fasta: {i+1}"):
#         ProteomeFile_path = os.path.join('/home/arglab/Work/Sequence-Data/ProteomeFiles', ProteomeFile)
#         for record in SeqIO.parse(ProteomeFile_path, 'fasta'):
#             if record.id in data['ProteinAccession'].unique().tolist():
#                 sequence_record = SeqRecord(seq=Seq(str(record.seq).replace('*', '')), id=record.id, description=record.description)
#                 sequence_records.append(sequence_record)

#     SeqIO.write(sequence_records, os.path.join('/home/arglab/Work/Sequence-Data/', f"potential_nuo_sequences_0{i}.faa"), "fasta")