In [1]:
import MDAnalysis as mda
import numpy as np
import pandas as pd
import sys
sys.path.append('/Users/dburns/Library/CloudStorage/Box-Box/my_scripts')
from ChACRA.ContactAnalysis.ContactFrequencies import *
import os

In [2]:
u = mda.Universe('/Users/dburns/Library/CloudStorage/Box-Box/1_GROUP_data/Enzymes/atcase/from_pronto/1q95/1q95_minimized_nohoh.pdb')

In [3]:
protein = u.select_atoms('protein')

In [4]:
u = mda.Merge(protein)
u

<Universe with 43566 atoms>

In [7]:
universe = u.copy()
residues = {seg.segid: seg.residues.resnames for seg in universe.segments}
segids = list(residues.keys())
# make a square matrix that will be filled with True values for identical subunits
array = np.zeros((len(segids),len(segids)),dtype=np.bool_)
# every subunit is identical with itself
np.fill_diagonal(array,True)
# work with it as a df
identical_table = pd.DataFrame(array, columns=segids, index=segids)



In [8]:
contact_dir = '/Users/dburns/Library/CloudStorage/Box-Box/1_GROUP_data/Enzymes/atcase/from_nova/contacts/freqs'
contact_files = [f'{contact_dir}/{file}' for file in os.listdir(contact_dir)]
contact_files.sort()
cont = make_contact_frequency_dictionary(contact_files)
cont = ContactFrequencies(pd.DataFrame(cont))

In [9]:
identical_table

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L
A,True,False,False,False,False,False,False,False,False,False,False,False
B,False,True,False,False,False,False,False,False,False,False,False,False
C,False,False,True,False,False,False,False,False,False,False,False,False
D,False,False,False,True,False,False,False,False,False,False,False,False
E,False,False,False,False,True,False,False,False,False,False,False,False
F,False,False,False,False,False,True,False,False,False,False,False,False
G,False,False,False,False,False,False,True,False,False,False,False,False
H,False,False,False,False,False,False,False,True,False,False,False,False
I,False,False,False,False,False,False,False,False,True,False,False,False
J,False,False,False,False,False,False,False,False,False,True,False,False


In [10]:
# Go through all pairwise combinations of subunits
for combo in combinations(segids,2):
    # not identical if lengths are different
    if len(residues[combo[0]]) != len(residues[combo[1]]):
         identical_table[combo[1]][combo[0]], identical_table[combo[0]][combo[1]] = False, False
    else:
        # catch anything that might have same number of residues but different sequences
        bool = np.all(np.equal(residues[combo[0]],residues[combo[1]]))
        # Enter True or False in both the upper tri and lower tri  
        identical_table[combo[1]][combo[0]], identical_table[combo[0]][combo[1]] =  bool, bool

# Only keep one representative row for each unique chain that gives the identical sets of subunits
identical_table.drop_duplicates(inplace=True)

identical_subunits = {}
# Add the lists of identical subunits to the dictionary
# You can deal with different sets of identical subunits in complex situations\
for i, segid in enumerate(identical_table.index):
    subunits = identical_table.T[identical_table.loc[segid]==True].index
    # if there is only one, it's just identical with itself
    if len(subunits) >= 2:
        identical_subunits[i] = list(subunits)

In [11]:
identical_table

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L
A,True,True,True,True,True,True,False,False,False,False,False,False
G,False,False,False,False,False,False,True,True,True,True,True,True


In [12]:
identical_subunits

{0: ['A', 'B', 'C', 'D', 'E', 'F'], 1: ['G', 'H', 'I', 'J', 'K', 'L']}

In [11]:
len(identical_subunits)

2

In [15]:
combos = list(combinations(identical_subunits.keys(),2))

In [21]:
cont.freqs

Unnamed: 0,B:ALA:112-B:ASN:126,C:GLN:146-C:LEU:152,D:THR:26-D:VAL:309,E:LEU:192-E:PRO:189,E:ARG:65-E:GLN:297,B:ARG:56-B:SER:74,C:ARG:105-C:PHE:49,C:ALA:175-C:PHE:179,A:GLU:60-A:SER:62,L:ASP:19-L:PHE:65,...,D:ARG:269-F:THR:91,E:ASN:256-E:HIS:282,B:ASN:13-H:CYS:138,H:CYS:114-H:GLU:119,E:LEU:254-E:PRO:281,F:ILE:9-F:THR:116,E:ASP:278-E:MET:227,B:GLU:109-H:GLU:119,E:GLU:109-E:HIS:106,D:ALA:274-D:THR:275
0,0.528,0.505,0.234,0.997,0.764,0.982,0.999,0.864,1.0,0.264,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.486,0.509,0.252,0.999,0.77,0.972,0.998,0.88,1.0,0.311,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.481,0.5,0.264,0.998,0.758,0.963,0.999,0.883,1.0,0.362,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.457,0.512,0.273,0.998,0.732,0.954,0.999,0.884,1.0,0.332,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.415,0.506,0.256,0.997,0.685,0.929,0.999,0.884,0.999,0.307,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.374,0.514,0.266,0.997,0.659,0.928,0.998,0.88,0.999,0.265,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.381,0.511,0.271,0.998,0.655,0.926,0.998,0.876,0.999,0.217,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.395,0.5,0.273,0.997,0.654,0.922,0.997,0.865,0.999,0.174,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.417,0.503,0.319,0.997,0.647,0.894,0.997,0.868,0.998,0.143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.442,0.5,0.314,0.998,0.636,0.88,0.997,0.864,0.999,0.134,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
df = cont.freqs
non_identical_subunit_sets = {}
combos = combinations(identical_subunits.keys(),2)
for combo in combos:
    regex1 = f"[{'|'.join(identical_subunits[combo[0]])}]:[A-Z]+:[1-9]+-[{'|'.join(identical_subunits[combo[1]])}]:[A-Z]+:[1-9]+"
    regex2 = f"[{'|'.join(identical_subunits[combo[1]])}]:[A-Z]+:[1-9]+-[{'|'.join(identical_subunits[combo[0]])}]:[A-Z]+:[1-9]+"
    regex = f"{regex1}|{regex2}"
#this will have the identical_subunit key pairs and corresponding column names
# you can create the regex using these keys and identical_subunits dictionary
non_identical_subunit_sets[combo] = list(df.filter(regex=regex, axis=1).columns)

In [36]:
regex1

'[A|B|C|D|E|F]:[A-Z]+:[1-9]+-[G|H|I|J|K|L]:[A-Z]+:[1-9]+'

In [37]:
len(non_identical_subunit_sets[(0, 1)])

1156

In [42]:
identical_subunits

{0: ['A', 'B', 'C', 'D', 'E', 'F'], 1: ['G', 'H', 'I', 'J', 'K', 'L']}

In [43]:
u.select_atoms('chainID A')

<AtomGroup with 4837 atoms>

In [52]:
from MDAnalysis.analysis.distances import distance_array
# can make a table of the equivalent contacts...
sorted_distances = {}

#for identical_subunits_key in identical_subunits
# find distances between identical and non identical and for all non identical, choose the closest one to priority as the rep for those
for key in identical_subunits:
    chain_distances = {}
    priority = identical_subunits[key][0]
    sel1 = u.select_atoms(f'chainID {priority}')
    for chain in identical_subunits[key][1:]:
        sel2 = u.select_atoms(f'chainID {chain}')

        min_dist = distance_array(sel1.atoms, sel2.atoms,).min()
        chain_distances[(priority, chain)] = min_dist
    sorted_distances[key]={k:v for k, v in sorted(chain_distances.items(), key=lambda x:x[1])}

# then get mixed subunit distances
# this should possibly go after determining the priority chain
# but before determing the other identical subunit priority naming since it will find which of the 
# non identical subunits is closest....
for combo in combinations(identical_subunits.keys(),2):
    chain_distances = {}
    priority = identical_subunits[combo[0]][0]
    sel1 = u.select_atoms(f'chainID {priority}')
    for chain in identical_subunits[combo[1]]:
        sel2 = u.select_atoms(f'chainID {chain}')

        min_dist = distance_array(sel1.atoms, sel2.atoms,).min()
        chain_distances[(priority, chain)] = min_dist
    sorted_distances[combo]={k:v for k, v in sorted(chain_distances.items(), key=lambda x:x[1])}

# After determining the priority nameing scheme, contact distances can be checked to find the best partner subunit with allclose(2 decimals)
# in this case - intra is going to be A-A
# adjacent will be A-C, if it also occurs AB use all close to determine if it's equivalent 



In [53]:
sorted_distances

{0: {('A', 'C'): 1.7065504023539435,
  ('A', 'B'): 1.7316963647164045,
  ('A', 'E'): 2.276290121014368,
  ('A', 'D'): 14.695149666859432,
  ('A', 'F'): 23.72274977485476},
 1: {('G', 'J'): 1.7411957335158068,
  ('G', 'I'): 27.48132477068823,
  ('G', 'H'): 28.549628162117425,
  ('G', 'K'): 30.40205853744378,
  ('G', 'L'): 44.52822064542099},
 (0, 1): {('A', 'G'): 1.6783015165014643,
  ('A', 'H'): 2.3374572559516813,
  ('A', 'K'): 10.352570156109394,
  ('A', 'I'): 11.368680308402194,
  ('A', 'J'): 19.089641017069628,
  ('A', 'L'): 27.828149816818694}}

In [None]:
sel1 = self.u.select_atoms(sel1)
        sel2 = self.u.select_atoms(sel2)
        
        return distance_array(sel1.atoms, sel2.atoms,).min()

In [5]:
segids = set(u.residues.segids)
# hold the coms for each protomer 
seg_sels = {seg: u.aselect_atoms(f'segid {seg}') for seg in set(u.residues.segids)} # if segid in subunits

In [6]:
# access a subunit com
from itertools import combinations, permutations
triples = [[a,b,c] for a,b,c in permutations(segids, 3)]
coms = {segid: seg_sels[segid].center_of_mass() for segid in segids}


In [7]:
import numpy as np
# triple selections without duplicating angle created by vertex and permutations of the 2 points
trips = []
for segid in segids:
    trips.append([[ch1,segid,ch2] for ch1,ch2 in combinations([ch for ch in segids if ch != segid],2)])

In [8]:
trips

[[['A', 'C', 'D'], ['A', 'C', 'B'], ['D', 'C', 'B']],
 [['C', 'A', 'D'], ['C', 'A', 'B'], ['D', 'A', 'B']],
 [['C', 'D', 'A'], ['C', 'D', 'B'], ['A', 'D', 'B']],
 [['C', 'B', 'A'], ['C', 'B', 'D'], ['A', 'B', 'D']]]

In [9]:
all_seg_sel_string = ''.join([f'segid {seg} or ' for seg in list(segids)[:-1]])
all_seg_sel_string += f'segid {list(segids)[-1]}'
all_com = u.select_atoms(all_seg_sel_string).center_of_mass()

In [10]:
def get_angle(a,b,c):
    ba = a - b
    bc = c - b

    cosine_angle = np.dot(ba, bc) / (np.linalg.norm(ba) * np.linalg.norm(bc))
    angle = np.arccos(cosine_angle)
    return angle

In [11]:
opposing_subunits = []
check = []
for seg in segids:
    if seg not in check:
        for seg2 in [segid for segid in segids if segid != seg]:
            if np.abs(180-np.rad2deg(get_angle(coms[seg],all_com,coms[seg2]))) <= 2:
                opposing_subunits.append((seg,seg2))
                check.append(seg)
                break




In [12]:
opposing_subunits

[('C', 'B'), ('A', 'D'), ('D', 'A'), ('B', 'C')]

In [80]:
np.rad2deg(get_angle(coms['A'],all_com,coms['D']))

179.92590933219583

### Use contact df to determine which chains actually make contact

In [14]:
df = cont.freqs

In [16]:
from ChACRA.ContactAnalysis.contact_functions import _parse_id

In [23]:
partner_chains = {}

for contact in df.columns:
    data = _parse_id(contact)
    if data['chaina'] not in partner_chains:
        partner_chains[data['chaina']] = set(data['chainb'])
    else:
        partner_chains[data['chaina']].add(data['chainb'])
    if data['chainb'] not in partner_chains:
        partner_chains[data['chainb']] = set(data['chaina'])
    else:
        partner_chains[data['chainb']].add(data['chaina'])

In [24]:
partner_chains

{'B': {'A', 'B', 'C', 'E', 'F', 'H', 'I'},
 'C': {'A', 'B', 'C', 'D', 'F', 'G', 'I', 'J'},
 'D': {'C', 'D', 'E', 'F', 'I', 'J', 'L'},
 'E': {'A', 'B', 'D', 'E', 'F', 'J', 'K'},
 'A': {'A', 'B', 'C', 'E', 'G', 'H'},
 'L': {'D', 'F', 'I', 'L'},
 'J': {'C', 'D', 'E', 'G', 'J'},
 'I': {'B', 'C', 'D', 'I', 'L'},
 'G': {'A', 'C', 'G', 'J'},
 'H': {'A', 'B', 'H', 'K'},
 'F': {'B', 'C', 'D', 'E', 'F', 'K', 'L'},
 'K': {'E', 'F', 'H', 'K'}}