In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter

from tqdm import tqdm

from plot_evalue_distributions import plot_evalue_histograms, plot_evalue_kde
from complex_i_analysis import cluster_hits_with_strand, generate_subunit_data, classify_complex_types

import warnings
warnings.filterwarnings('ignore')

In [2]:
colors = ["#a5b1c2", "#f7b731", "#20bf6b", "#45aaf2", "#3867d6", "#a55eea", "#0fb9b1", '#4b6584']
labels = ['Nuo-Partial', 'Nuo13', 'Nuo14', 'Nuo14-EFG', 'Nuo13-EFG', 'Nuo14-EF', 'Nuo12', 'Existing Annotation']

complex_colors = dict(zip(labels, colors))

In [3]:
results = pd.read_csv("/Users/akshayonly/Work/04-Complex-I/Data/07-Figures/regenerate/results_unfil.csv")

In [4]:
results['EstProtLength'] = np.int16(round(abs(results['Start'] - results['End'])/ 3))

In [5]:
genomes_dataset = pd.read_csv('/Users/akshayonly/Work/04-Complex-I/Data/02-Genomic-Records-and-Metadata/02-Genome-Metadata/genomes_dataset.csv')
genomes_metadata = pd.read_csv('/Users/akshayonly/Work/04-Complex-I/Data/02-Genomic-Records-and-Metadata/02-Genome-Metadata/genomes_metadata.csv')
results = pd.merge(genomes_metadata.merge(results), genomes_dataset[['GenomeFile', 'Species', 'Organism', 'Assembly Accession']], on='GenomeFile')

In [6]:
taxonomy = pd.read_csv('/Users/akshayonly/Work/04-Complex-I/Data/02-Genomic-Records-and-Metadata/03-Taxonomy-Metadata/taxonomy.csv')

In [7]:
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1078643 entries, 0 to 1078642
Data columns (total 17 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   Accession           1078643 non-null  object 
 1   Replicon            1078643 non-null  object 
 2   GenomeFile          1078643 non-null  object 
 3   SequenceLength(Mb)  1078643 non-null  float64
 4   ProteinAccession    1078643 non-null  object 
 5   evalue              1078643 non-null  float64
 6   BitScore            1078643 non-null  float64
 7   Bias                1078643 non-null  float64
 8   SequenceDesc        1078643 non-null  object 
 9   Subunit             1078643 non-null  object 
 10  Start               1078643 non-null  int64  
 11  End                 1078643 non-null  int64  
 12  log10evalue         1078643 non-null  float64
 13  EstProtLength       1078643 non-null  int16  
 14  Species             1078643 non-null  object 
 15  Organism       

In [8]:
results['Species'].nunique()

10460

In [9]:
hits_summaries = (
    results.groupby('Subunit')
    .agg(
        Species=('Species', 'nunique'),
        NoProteinHits=('ProteinAccession', 'nunique'),
        NoGenomes=('GenomeFile', 'nunique')
    )
    .assign(AvgHits=lambda df: df['NoProteinHits'] / df['NoGenomes'])
    .reset_index()
    .sort_values('AvgHits', ascending=False)
)


In [10]:
# Total Hits
hits_summaries

Unnamed: 0,Subunit,Species,NoProteinHits,NoGenomes,AvgHits
8,NuoG,9631,277362,40320,6.879018
10,NuoI,9884,231130,38772,5.961261
13,NuoL,9236,77059,37879,2.034346
7,NuoF,7981,63132,33122,1.906044
5,NuoD,6565,41155,23987,1.715721
14,NuoM,8729,62367,36982,1.686415
1,NuoB,7761,52851,31669,1.668856
9,NuoH,7723,51020,31576,1.615784
4,NuoCD,1936,26062,16881,1.543866
15,NuoN,8408,41766,34629,1.206099


In [11]:
# Define e-value cutoffs for filtering
E_VALUE_CUTOFF = {
    'NuoA': -20, 'NuoB': -58, 'NuoBCD': -200, 'NuoC': -27,
    'NuoCD': -110, 'NuoD': -100, 'NuoE': -41, 'NuoF': -74,
    'NuoG': -80, 'NuoH': -75, 'NuoI': -38, 'NuoJ': -30,
    'NuoK': -24, 'NuoL': -145, 'NuoM': -95, 'NuoN': -78
}

# Define protein length thresholds
LENGTH_THRESHOLDS = {
    'NuoA': 245, 'NuoB': 289, 'NuoC': 311, 'NuoE': 450,
    'NuoF': 540, 'NuoG': 966, 'NuoH': 549, 'NuoI': 301,
    'NuoJ': 408, 'NuoM': 1087, 'NuoN': 664
}

# Function to filter results based on e-value and protein length
def filter_results(df):
    df = df[df['log10evalue'] <= df['Subunit'].map(E_VALUE_CUTOFF).fillna(float('inf'))]
    df = df[df['EstProtLength'] <= df['Subunit'].map(LENGTH_THRESHOLDS).fillna(float('inf'))]
    return df.reset_index(drop=True)

# Apply filtering
filtered_results = filter_results(results)

# Sort and get unique subunits
subunits = sorted(filtered_results['Subunit'].unique())

# Apply clustering to group hits efficiently
filtered_results = (
    filtered_results
    .groupby('Accession', group_keys=False)
    .apply(cluster_hits_with_strand, intergenic_distance=250)
    .reset_index(drop=True)
)

# Generate count and boolean tables
nuo_count, nuo_bool = generate_subunit_data(filtered_results, subunits)

# Remove duplicates from boolean table
nuo_bool.drop_duplicates(inplace=True)

# Define Complex I subunit variations
COMPLEX_VARIANTS = {
    'Nuo14': ['NuoA', 'NuoB', 'NuoC', 'NuoD', 'NuoE', 'NuoF', 'NuoG', 'NuoH', 
              'NuoI', 'NuoJ', 'NuoK', 'NuoL', 'NuoM', 'NuoN'],
    'Nuo13': ['NuoA', 'NuoB', 'NuoCD', 'NuoE', 'NuoF', 'NuoG', 'NuoH', 'NuoI', 
              'NuoJ', 'NuoK', 'NuoL', 'NuoM', 'NuoN'],
    'Nuo12': ['NuoA', 'NuoBCD', 'NuoE', 'NuoF', 'NuoG', 'NuoH', 'NuoI', 'NuoJ', 
              'NuoK', 'NuoL', 'NuoM', 'NuoN'],
    'Nuo14-EF': ['NuoA', 'NuoB', 'NuoC', 'NuoD', 'NuoG', 'NuoH', 'NuoI', 'NuoJ', 
                 'NuoK', 'NuoL', 'NuoM', 'NuoN'],
    'Nuo14-EFG': ['NuoA', 'NuoB', 'NuoC', 'NuoD', 'NuoH', 'NuoI', 'NuoJ', 'NuoK', 
                  'NuoL', 'NuoM', 'NuoN'],
    'Nuo11-EF': ['NuoA', 'NuoB', 'NuoCD', 'NuoG', 'NuoH', 'NuoI', 'NuoJ', 'NuoK', 
                 'NuoL', 'NuoM', 'NuoN'],
    'Nuo11-EFG': ['NuoA', 'NuoB', 'NuoCD', 'NuoH', 'NuoI', 'NuoJ', 'NuoK', 'NuoL', 
                  'NuoM', 'NuoN'],
}

# Classify complex types
nuo_bool = classify_complex_types(nuo_bool, COMPLEX_VARIANTS, subunits)

# Merge species and accession information
nuo_bool = nuo_bool.merge(filtered_results[['Species', 'Accession']].drop_duplicates(), on="Accession", how="left")

# Merge metadata while preserving unique records
metadata_columns = ['Accession', 'Assembly Accession', 'Organism', 'Species', 'Replicon']
nuo_bool = filtered_results[metadata_columns].merge(nuo_bool).drop_duplicates()
nuo_count = filtered_results[metadata_columns].merge(nuo_count).drop_duplicates()

In [12]:
filtered_hits_summaries = (
    filtered_results.groupby('Subunit')
    .agg(
        Species=('Species', 'nunique'),
        NoProteinHits=('ProteinAccession', 'nunique'),
        NoGenomes=('GenomeFile', 'nunique')
    )
    .assign(AvgHits=lambda df: df['NoProteinHits'] / df['NoGenomes'])
    .reset_index()
    .sort_values('AvgHits', ascending=False)
)


In [13]:
nuo_bool['Variation'].value_counts()

Variation
Nuo-Partial    45087
Nuo13          13590
Nuo14           7620
Nuo14-EFG        116
Nuo12             11
Nuo14-EF           3
Name: count, dtype: int64

In [14]:
# Summarize subunits and drop unnecessary columns
nuo_bool_partial = nuo_bool[nuo_bool['Variation']=='Nuo-Partial']

aggregated_nuo_bool = (
    nuo_bool_partial.groupby(['Accession'])
    .sum()
    .reset_index()
    .drop(columns=['Organism', 'Species', 'Variation', 'Cluster', 'Strand', 'Replicon', 'Assembly Accession'], errors='ignore')  # Drop columns if they exist
)

In [15]:
# Convert subunit counts to boolean (True if count >= 1)
aggregated_nuo_bool[subunits] = aggregated_nuo_bool[subunits].ge(1)

# Classify complex types
aggregated_nuo_bool = classify_complex_types(aggregated_nuo_bool, COMPLEX_VARIANTS, subunits)

aggregated_nuo_bool = nuo_bool[['Accession', 'Assembly Accession', 'Organism', 'Species', 'Replicon']].merge(aggregated_nuo_bool)
aggregated_nuo_bool.drop_duplicates(inplace=True)

aggregated_nuo_bool.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16420 entries, 0 to 49638
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Accession           16420 non-null  object
 1   Assembly Accession  16420 non-null  object
 2   Organism            16420 non-null  object
 3   Species             16420 non-null  object
 4   Replicon            16420 non-null  object
 5   NuoA                16420 non-null  bool  
 6   NuoB                16420 non-null  bool  
 7   NuoBCD              16420 non-null  bool  
 8   NuoC                16420 non-null  bool  
 9   NuoCD               16420 non-null  bool  
 10  NuoD                16420 non-null  bool  
 11  NuoE                16420 non-null  bool  
 12  NuoF                16420 non-null  bool  
 13  NuoG                16420 non-null  bool  
 14  NuoH                16420 non-null  bool  
 15  NuoI                16420 non-null  bool  
 16  NuoJ                16420 n

In [16]:
plasmid_species_single_cluster = set(nuo_bool[(nuo_bool['Replicon']=='Plasmid')]['Species'])

plasmid_species_multi_cluster = set(aggregated_nuo_bool[(aggregated_nuo_bool['Replicon']=='Plasmid')]['Species'])

plasmid_species = plasmid_species_multi_cluster | plasmid_species_single_cluster

In [17]:
len(plasmid_species)

173

In [18]:
nuo_bool[(nuo_bool['Variation']!='Nuo-Partial') & (nuo_bool['Replicon']=='Plasmid')][['Species', 'Variation']].drop_duplicates().reset_index(drop=True)

Unnamed: 0,Species,Variation
0,Citrobacter freundii,Nuo13
1,Paenibacillus cellulosilyticus,Nuo14-EFG
2,Ralstonia solanacearum,Nuo14
3,Burkholderia vietnamiensis,Nuo14
4,Klebsiella aerogenes,Nuo13
5,Legionella adelaidensis,Nuo14
6,Tsukamurella tyrosinosolvens,Nuo14
7,Mycobacterium intracellulare,Nuo14
8,Komagataeibacter saccharivorans,Nuo13
9,Acinetobacter baumannii,Nuo13


In [19]:
aggregated_nuo_bool[(aggregated_nuo_bool['Variation']!='Nuo-Partial') & (aggregated_nuo_bool['Replicon']=='Plasmid')][['Species', 'Variation']].drop_duplicates().reset_index(drop=True)

Unnamed: 0,Species,Variation
0,Sinorhizobium meliloti,Nuo14
1,Komagataeibacter saccharivorans,Nuo13
2,Salmonella enterica,Nuo13
3,Sinorhizobium sp. M103,Nuo14
4,Sinorhizobium sp. K101,Nuo14
5,Sinorhizobium sp. C101,Nuo14


In [20]:
chromosome_species_single_cluster = set(nuo_bool[(nuo_bool['Replicon']=='Chromosome')]['Species'])

chromosome_species_multi_cluster = set(aggregated_nuo_bool[(aggregated_nuo_bool['Replicon']=='Chromosome')]['Species'])

chromosome_species = chromosome_species_multi_cluster | chromosome_species_single_cluster

In [21]:
len(chromosome_species_single_cluster)

7953

In [22]:
len(chromosome_species_multi_cluster)

5875

In [23]:
len(chromosome_species_multi_cluster.intersection(chromosome_species_single_cluster))

5875

In [24]:
len(chromosome_species_single_cluster.difference(chromosome_species_multi_cluster))

2078

In [25]:
undefined_species_single_cluster = set(nuo_bool[(nuo_bool['Replicon']=='Undefined')]['Species'])

undefined_species_multi_cluster = set(aggregated_nuo_bool[(aggregated_nuo_bool['Replicon']=='Undefined')]['Species'])

undefined_species = undefined_species_multi_cluster | undefined_species_single_cluster

In [26]:
len(undefined_species_single_cluster)

27

In [27]:
len(undefined_species_multi_cluster)

17

In [28]:
len(undefined_species)

27

In [29]:
len(undefined_species_multi_cluster.intersection(undefined_species_single_cluster))

17

In [30]:
len(undefined_species_single_cluster.difference(undefined_species_multi_cluster))

10

In [32]:
# Optimized approach to processing species variations
species_variations = pd.concat([
    nuo_bool[['Species', 'Variation', 'Accession']],
    aggregated_nuo_bool[['Species', 'Variation', 'Accession']]
]).drop_duplicates().reset_index(drop=True)

# Splitting into partial and complete variations
partial = species_variations[species_variations['Variation'] == 'Nuo-Partial']
complete = species_variations[species_variations['Variation'] != 'Nuo-Partial']

# Retaining only partial variations not found in complete variations
partial = partial.loc[~partial['Species'].isin(complete['Species'])]

# Final combined dataset
species_variations = pd.concat([complete, partial]).reset_index(drop=True)

In [33]:
species_variations_counts = species_variations['Species'].value_counts().reset_index()
accession_variations_counts = species_variations['Accession'].value_counts().reset_index()

In [34]:
accession_variations_counts[accession_variations_counts['count']==2]['Accession'].nunique()

137

In [35]:
species_variations[species_variations['Accession'].isin(accession_variations_counts[accession_variations_counts['count']==2]['Accession'])]['Species'].nunique()

122

In [38]:
species_variations[species_variations['Accession'].isin(accession_variations_counts[accession_variations_counts['count']==2]['Accession'])]['Species'].value_counts()

Species
Micromonospora zamorensis           12
Micromonospora aurantiaca            8
Gluconacetobacter diazotrophicus     4
Micromonospora maris                 4
Streptantibioticus cattleyicolor     4
                                    ..
Dyadobacter sp. NIV53                2
Chitinophaga sp. HK235               2
Dactylosporangium vinaceum           2
Micromonospora craniellae            2
uncultured Desulfobulbus sp.         2
Name: count, Length: 122, dtype: int64

**NOTE**

- Dehalobacter sp. DCM have complete subunits to form Complex I. EFG are present elsewhere in its genome.
- 173 species have atleast one subunits present, 16 of which have all the subunits required to form Complex I and 01 has EFG missing Complex I.
- 122 species have two variations within their genomes.