# Check of the NOE quantitative reliability

## Todo:
- H1: the most intense peak per spin system is HA(i)
- H2: the 2nd most intense is HA(i-1) and the 3rd is HN(i-1)
- H3: the most intense inter-residual HN peak per spin system is HN(i-1)

In [1]:
import pandas as pd
import numpy as np

from functions import *

In [2]:
PDB_ID=['2LEA']

## Tidying up the table

In [3]:
path = f'~/Sparky/Lists/{PDB_ID}.list' # set correctly

# Reading the data
df = pd.read_csv(path, header=0, index_col=None, sep='\s+')

# Restructuring 

df.drop(columns='Height', inplace=True)
df.rename({
    'Assignment': 'label',
    'Data': 'height',
    'w1': 'N',
    'w2': 'Hn',
    'w3': 'H'
}, axis=1, inplace=True)

df.insert(0, 'noe', df.label.apply(lambda s: s.split('-')[-1]))
df.insert(0, 'res', df.label.apply(lambda s: s.split('-')[0]))

df['noe_res'] = df.noe.apply(lambda s: s.split('H')[0])
df.loc[df.noe_res == '', 'noe_res'] = df.loc[df.noe_res == ''].res
df['noe_res'] = df.noe_res.str.removesuffix('N')
df['res'] = df.res.str.removesuffix('N')
df.drop(columns='label', inplace=True)
df['inter'] = df.noe_res != df.res


In [4]:
df

Unnamed: 0,res,noe,N,Hn,H,height,noe_res,inter
0,S1,H,116.240,8.111,8.111,1571,S1,False
1,S1,HA,116.240,8.111,4.389,1756,S1,False
2,S1,HB2,116.240,8.111,3.750,2457,S1,False
3,S1,HB3,116.240,8.111,3.750,2457,S1,False
4,S1,Y2H,116.240,8.111,8.062,554,Y2,True
...,...,...,...,...,...,...,...,...
1789,S100,H99HB3,123.154,8.119,3.245,1350,S100,False
1790,S100,H,123.154,8.119,8.119,69967,S100,False
1791,S100,HA,123.154,8.119,4.258,2956,S100,False
1792,S100,HB2,123.154,8.119,3.849,3318,S100,False


In [5]:
# optional: drop the residue types and leave only numbers

df['resnum'] = df['res'].str.extract('(\d+)', expand=False).fillna(0).astype(int)
df['noe_resnum'] = df['noe'].str.extract('(\d+)', expand=False).fillna(0).astype(int)
df['inter'] = df.noe_res != df.res

df

Unnamed: 0,res,noe,N,Hn,H,height,noe_res,inter,resnum,noe_resnum
0,S1,H,116.240,8.111,8.111,1571,S1,False,1,0
1,S1,HA,116.240,8.111,4.389,1756,S1,False,1,0
2,S1,HB2,116.240,8.111,3.750,2457,S1,False,1,2
3,S1,HB3,116.240,8.111,3.750,2457,S1,False,1,3
4,S1,Y2H,116.240,8.111,8.062,554,Y2,True,1,2
...,...,...,...,...,...,...,...,...,...,...
1789,S100,H99HB3,123.154,8.119,3.245,1350,S100,False,100,99
1790,S100,H,123.154,8.119,8.119,69967,S100,False,100,0
1791,S100,HA,123.154,8.119,4.258,2956,S100,False,100,0
1792,S100,HB2,123.154,8.119,3.849,3318,S100,False,100,2


# How many $H^{i-1}_{\alpha}$s are stronger than $H^{i}_{\alpha}$?

In [6]:
def get_n_deviations(df_strong, df_weak):
    """Calcaulates how many interresidual NOEs are 
    more intense than the correlation Hn-N-Hnoe
    
    Arguments:
        - df_strong (DataFrame):  list of intra-residual NOE peaks, which are supposed to be the most intense (filtered by the NOE atom type)
        - df_weak (DataFrame): list of inter-residual list of NOE peaks, which are assumed to be less intense (filtered by the NOE atom type, must be the same as in df_i)
    Returns:
        None
        """
    noes_strong = df_strong[['height', 'res']].groupby('res', as_index=True).max('height')
    noes_weak = df_weak[['height', 'res']].groupby('res', as_index=True).max('height')

    noe_compare = noes_strong.join(noes_weak, how='left',
                       lsuffix='_noe_intra', rsuffix='_noe_inter').fillna(0)
    n_devs = (noe_compare['height_noe_intra'].apply(np.abs) 
              < noe_compare['height_noe_inter'].apply(np.abs)).sum()
    print(n_devs)
    if n_devs == 0:
        print("Brilliant result, we have 0 deviations from the theory!")
    else:
        print("There are some deviations, check those most prominent deviations closer:")
        print(noe_compare.loc[noe_compare['height_noe_intra'].apply(np.abs) < noe_compare['height_noe_inter'].apply(np.abs)])

In [7]:
df_i_a = df[~df.inter & df.noe.str.contains('HA')]
df_im1_a = df[df.inter & df.noe.str.contains('HA')]

df_i_a

Unnamed: 0,res,noe,N,Hn,H,height,noe_res,inter,resnum,noe_resnum
1,S1,HA,116.240,8.111,4.389,1756,S1,False,1,0
10,Y2,HA,121.776,8.062,4.545,5887,Y2,False,2,0
22,G3,HA2,110.102,8.221,3.860,11348,G3,False,3,2
23,G3,HA3,110.102,8.221,3.860,11348,G3,False,3,3
29,R4,HA,121.307,7.967,4.582,8560,R4,False,4,0
...,...,...,...,...,...,...,...,...,...,...
1757,G92,HA2,110.707,7.969,3.642,848,G92,False,92,2
1758,G92,HA3,110.707,7.969,3.782,457,G92,False,92,3
1764,R93,HA,121.504,7.930,4.465,7823,R93,False,93,0
1776,S97,HA,115.594,8.128,4.348,2611,S97,False,97,0


In [8]:
df_im1_a

Unnamed: 0,res,noe,N,Hn,H,height,noe_res,inter,resnum,noe_resnum
6,Y2,S1HA,121.776,8.062,4.389,13351,S1,True,2,1
17,G3,Y2HA,110.102,8.221,4.545,7299,Y2,True,3,2
26,R4,G3HA2,121.307,7.967,3.860,28898,G3,True,4,3
27,R4,G3HA3,121.307,7.967,3.860,28898,G3,True,4,3
33,D8,P7HA,119.234,8.260,4.331,32052,P7,True,8,7
...,...,...,...,...,...,...,...,...,...,...
1739,Y91,A89HA,121.757,8.182,4.449,4359,A89,True,91,89
1742,Y91,R90HA,121.757,8.182,4.319,9894,R90,True,91,90
1753,G92,Y91HA,110.707,7.969,4.466,631,Y91,True,92,91
1761,R93,G92HA2,121.504,7.930,3.642,10568,G92,True,93,92


In [9]:
get_n_deviations(df_i_a, df_im1_a)

46
There are some deviations, check those most prominent deviations closer:
        height_noe_intra  height_noe_inter
res                                       
A57                 2905            5071.0
A89                 4225            9204.0
D18                 2494            7194.0
D8                 14410           32052.0
D80                 3434            6573.0
E10                14086           19548.0
E83                 5532           13671.0
F56                 2881            4228.0
F58                 2703            6544.0
F61                 1983            3418.0
G40                 4891            6003.0
H62                 2809            3601.0
I44                 2259            8803.0
K16                 1130            3833.0
K51                 3258            3595.0
K64                 5124           13873.0
L15                 1551            3180.0
L20                 4660            7067.0
L79                 3865            9412.0
L84                 3

Where is the maximum intensity of all interresidual $H_a$ crosspeaks is higher than residue's very own $H_a$?