Goal: Create a hierarchical clustering built upon both genotypic and phenotypic information. 

One proposal (40 total features): 
- 20 features for normalized abundance at T0 across native pHs
- 11 features for normalized abundance at T9 across perturbed pHs
- 9 features for structural and sequence-based information:
  - (3) Alpha helix, Beta sheet, Coil proportions
  - (2) Solvent-Accessible surface area (both total and average per residue)
  - (1) Radius of Gyration (compactness)
  - (1) Length
  - (1) Instability Index
  - (1) Isoelectric Point (pI) — pH at which the protein is net neutral




First, I need the ORFs for each of the 60 for which we have a complete sequence after the first clustering. Then, I will use a cluster map to reduce redundancy and gather the relevant data for each one.

In [13]:
import numpy as np
import pandas as pd
from mgsa.helpers import plot

import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning)


In [None]:
# Run configuration: these values select which input files are read below.
drug = 'None'
# NOTE: `map` shadows the Python builtin; the name is kept because the
# following cell also interpolates it into a file path.
map = '09'
KO = 'K00370'

soils = ['Soil3', 'Soil5', 'Soil6', 'Soil9', 'Soil11', 'Soil12', 'Soil14', 'Soil15', 'Soil16', 'Soil17']

cluster_ids = pd.read_csv(f'../out/orf_ids/cluster_ids_{map}_{KO}.tsv', header=None)
cluster_ids = cluster_ids.values
clustered_data = pd.read_csv(f'../out/{KO}abundances/T0data_{map}_{drug}_{KO}.tsv', sep='\t', header=None)
data = clustered_data.values
# Normalize each row to sum to 1. A row that sums to zero yields NaN here.
T0data = data / data.sum(axis=1, keepdims=True)

id_list = []

# One row per cluster id, 11 columns (the T9 feature count named in the notes).
# The shape must be the tuple (n_rows, 11); the previous `len(cluster_ids, 11)`
# passed two arguments to len() and raised TypeError.
T9data = np.zeros((len(cluster_ids), 11))
# Add all of the T9 data arrays together to condition out native environment
for soil in soils:
    filename = f"../out/{KO}abundances/{soil}data_{map}_{drug}_{KO}.tsv"  # Construct the filename
    df = pd.read_csv(filename, sep='\t', header=None)
    df = df.values
    # Row-normalize this soil's table before accumulating it.
    df = df / df.sum(axis=1, keepdims=True)
    T9data += df

# NOTE(review): only the raw `data` array is cleaned here; T0data and T9data
# keep any NaN produced by zero-sum rows -- confirm that is intended.
data = np.nan_to_num(data, nan=0)

for i, cluster_id in enumerate(cluster_ids):
    print(i, 'id:', cluster_id[0], 'data: ', data[i])

0 id: Soil5.scaffold_179483344_c1_1 data:  [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.09768614 0.10319309 0.0777248  0.0931439
 0.12365392 0.08824463 0.10877037 0.07958209 0.0956287  0.07426448
 0.05810787 0.13833559 0.03362902 0.08633067 0.10679672 0.08914242
 0.05728007 0.14755465 0.11665029 0.         0.11678714 0.10749344
 0.12537183 0.10800691 0.10348742 0.10347976 0.08242526 0.11648492
 0.07876508 0.0891374  0.09283272 0.0598138  0.04019489 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.12741618 0.10453624
 0.12694704 0.1229733  0.05814278 0.11834239 0.08094608 0.09169381
 0.07791559 0.05940344 0.03168316 0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.   

In [34]:
# Restrict the abundance rows to the round_1 ids which have complete sequences.

KO = 'K00370'

# Keep only the first column of each id file, as a plain Python list.
complete_seq_ids = [
    row[0]
    for row in pd.read_csv('../out/aaseqs/long_complete_orf_ids.txt', header=None).values
]
cluster_ids = [
    row[0]
    for row in pd.read_csv(f'../out/orf_ids/cluster_ids_{map}_{KO}.tsv', header=None).values
]
for id_listing in (complete_seq_ids, cluster_ids):
    print(id_listing)
for id_listing in (complete_seq_ids, cluster_ids):
    print(len(id_listing))


# Map every cluster id to the position of its first occurrence, which is the
# position list.index() would report.
first_position = {}
for position, orf_id in enumerate(cluster_ids):
    first_position.setdefault(orf_id, position)

# Positions of the complete-sequence ids, in the order of complete_seq_ids;
# ids absent from cluster_ids are skipped.
indices = [
    first_position[orf_id]
    for orf_id in complete_seq_ids
    if orf_id in first_position
]

# Pull the matching rows out of `data` as nested plain lists.
data_subset = [data[row_index].tolist() for row_index in indices]

print(data_subset)

['Soil11.scaffold_431547323_c1_2', 'Soil11.scaffold_587233302_c1_1', 'Soil11.scaffold_65427810_c1_20', 'Soil11.scaffold_211583530_c1_6', 'Soil12.scaffold_266712765_c1_2', 'Soil14.scaffold_576820813_c1_40', 'Soil14.scaffold_280216268_c1_12', 'Soil14.scaffold_465502786_c1_2', 'Soil14.scaffold_596019752_c1_119', 'Soil15.scaffold_116822102_c1_19', 'Soil15.scaffold_175810641_c1_3', 'Soil15.scaffold_16321085_c1_6', 'Soil15.scaffold_1255266399_c1_142', 'Soil16.scaffold_436666653_c1_2', 'Soil16.scaffold_804923086_c1_3', 'Soil16.scaffold_313969256_c1_10', 'Soil16.scaffold_1312607544_c1_3', 'Soil17.scaffold_1045002370_c1_23', 'Soil17.scaffold_918557319_c1_2', 'Soil17.scaffold_1202428475_c1_2', 'Soil17.scaffold_274056314_c1_61', 'Soil17.scaffold_332687915_c1_2', 'Soil3.scaffold_414071996_c1_8', 'Soil3.scaffold_333288240_c1_8', 'Soil3.scaffold_530221301_c1_2', 'Soil3.scaffold_408070428_c1_2', 'Soil3.scaffold_285743490_c1_2', 'Soil5.scaffold_133269568_c1_66', 'Soil5.scaffold_215117222_c1_59', 'Soil

In [None]:
#find alpha/beta/coil proportion from a pdb file

from Bio.PDB import PDBParser
from collections import defaultdict

# Path of the single PDB structure file passed to
# get_secondary_structure_proportions at the end of this cell.
pdb_file = '../out/structure/nar/Soil3.scaffold_285743490_c1_2.pdb'

def _read_secondary_structure_ranges(pdb_file):
    """Collect HELIX and SHEET residue ranges from the raw text of a PDB file.

    Returns a pair ``(helix_ranges, sheet_ranges)``; each is a list of
    ``(chain_id, first_res_num, last_res_num)`` tuples. Records whose start
    and end chain differ, or whose residue numbers are not integers, are
    skipped.
    """
    helix_ranges = []
    sheet_ranges = []
    with open(pdb_file) as handle:
        for line in handle:
            if line.startswith('HELIX '):
                # Fixed columns: chain 20 / 32, residue number 22-25 / 34-37.
                start_chain, start_res = line[19:20], line[21:25]
                end_chain, end_res = line[31:32], line[33:37]
                target = helix_ranges
            elif line.startswith('SHEET '):
                # Fixed columns: chain 22 / 33, residue number 23-26 / 34-37.
                start_chain, start_res = line[21:22], line[22:26]
                end_chain, end_res = line[32:33], line[33:37]
                target = sheet_ranges
            else:
                continue
            if start_chain != end_chain:
                # Only ranges that stay within one chain are used.
                continue
            try:
                target.append((start_chain, int(start_res), int(end_res)))
            except ValueError:
                # Malformed or truncated record: ignore it rather than abort.
                continue
    return helix_ranges, sheet_ranges


def _in_any_range(chain_id, res_num, ranges):
    """Return True if (chain_id, res_num) falls inside any of the ranges."""
    return any(
        chain_id == range_chain and first <= res_num <= last
        for range_chain, first, last in ranges
    )


def get_secondary_structure_proportions(pdb_file):
    """
    Calculate secondary structure proportions from PDB HELIX and SHEET records.

    The HELIX/SHEET records are read directly from the file text. The
    previous implementation looked them up in ``structure.header``, where
    Biopython does not store them, so every residue was reported as coil.

    Parameters:
        pdb_file: path to a PDB-format file.

    Returns:
        dict with keys 'helix', 'sheet' and 'coil' holding the fraction of
        residues in each class. Every residue of every model and chain is
        counted; a residue covered by both a helix and a sheet range counts
        as helix. All three values are 0 when the structure has no residues.

    NOTE(review): a file that carries no HELIX/SHEET records will still be
    reported as 100% coil; such files would need secondary structure
    assigned by a separate tool (e.g. DSSP) first.
    """
    helix_ranges, sheet_ranges = _read_secondary_structure_ranges(pdb_file)

    parser = PDBParser()
    structure = parser.get_structure("protein", pdb_file)

    counts = {'helix': 0, 'sheet': 0, 'coil': 0}
    total_residues = 0

    # Single pass: classify each residue by its chain id and residue number.
    for model in structure:
        for chain in model:
            for residue in chain:
                total_residues += 1
                res_num = residue.id[1]
                if _in_any_range(chain.id, res_num, helix_ranges):
                    counts['helix'] += 1
                elif _in_any_range(chain.id, res_num, sheet_ranges):
                    counts['sheet'] += 1
                else:
                    counts['coil'] += 1

    if total_residues == 0:
        return {'helix': 0, 'sheet': 0, 'coil': 0}
    return {label: count / total_residues for label, count in counts.items()}
    
# Compute the proportions for the example file and report each class as a
# percentage with two decimals.
proportions = get_secondary_structure_proportions(pdb_file)
for ss_class in ('helix', 'sheet', 'coil'):
    print(f"{ss_class.capitalize()}: {proportions[ss_class]:.2%}")

Helix: 0.00%
Sheet: 0.00%
Coil: 100.00%
