In [None]:
import sys
# !{sys.executable} -m pip install git+https://github.com/zotko/xyz2graph.git
import shap
from collections import Counter

from tqdm.notebook import trange, tqdm
from time import sleep, perf_counter

import os
from glob import glob
import pandas as pd
import numpy as np
import networkx as nx
from scipy.spatial import distance_matrix

# Scikit-learn
from sklearn.model_selection import KFold
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error,mean_absolute_percentage_error
from sklearn.preprocessing import normalize, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


# Reps
from alchemical_cms import genpaddedCMs
from dscribe.descriptors import SOAP
from dscribe.kernels import REMatchKernel
from mendeleev.fetch import fetch_table
from ase.io import read
from Element_PI import VariancePersist
from Element_PI import VariancePersistv1

# from rdkit import DataStructs
# from rdkit import Chem
# from rdkit.Chem import AllChem
# from rdkit.Chem import Draw, MACCSkeys
# from openbabel import openbabel as ob
# from openbabel import pybel

from xyz2graph import MolGraph, to_networkx_graph, to_plotly_figure
# from plotly.offline import offline
# Plotting
import seaborn as sns
# Called without arguments, so no named seaborn style is selected here.
sns.set_style()
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default resolution for every figure created below.
mpl.rcParams['figure.dpi'] = 200


In [None]:
# Single-amino-acid table: no header row, first spreadsheet column used as the index.
AA_df=pd.read_excel('20_AA_CO2_interaction_energy_DFT_PBE0D3BJ_def2TZVPP.xlsx',index_col=0,header=None)
# Normalise the labels: drop a " acid" suffix where present, then capitalise.
# (replace() is a no-op on labels without " acid", so no conditional is needed.)
AA_df.index=[label.replace(' acid','').capitalize() for label in AA_df.index]

# rdkit.Chem.rdMolDescriptors.CalcNumRotatableBonds((Mol)mol, (bool)strict) → int


In [None]:
# Per-dipeptide table from site_data.xlsx; the first spreadsheet column becomes the index.
df=pd.read_excel('site_data.xlsx',index_col=0)
# Names of the eight site columns used throughout the notebook.
sitelist=['A_carboxy', 'A_amine', 'B_carboxy', 'B_amine', 'A_link','B_link', 'A_side', 'B_side']
# Site columns only, keeping rows that have no missing site value.
inter_df=df[sitelist].dropna()
# inter_df[inter_df!=0]=1

In [None]:
# Summary statistics of the dipeptide interaction energies.
df['Interaction_Energy'].describe()

In [None]:


# Dipeptide interaction energies: spreadsheet columns 1 and 2, the first of them as index.
y=pd.read_excel('400_dipeptides_interaction_energy_UPDATED2.xlsx',usecols=[1,2],index_col=0)
# Key built from the first three "_"-separated fields of every index label.
y['keypair']=['_'.join(i.split('_')[:3]) for i in y.index]

# Sheet2 holds one "key:name" string per row. Spaces and the substring "acid"
# are removed from the name part; the key becomes the index, the name column 1.
pair_rows=[]
for row in pd.read_excel('400_dipeptides_interaction_energy_UPDATED2.xlsx',sheet_name='Sheet2',header=None).values:
    fields=row[0].split(":")
    pair_rows.append((fields[0],fields[1].replace(" ","").replace('acid','')))
pairs=pd.DataFrame(pair_rows).set_index(0)


# Find set of files: map each dipeptide name to its complex (.xyz with CO2)
# and monomer (.xyz) coordinate file, keeping only names that have both.
monomerfiles={}
co2files={}
for k,v in pairs.to_dict()[1].items():
    matches=glob(f'./dipeptides_co2_coordinates/{k}_*.xyz')
    if not matches:
        # No complex file for this key. Report the key (as the former catch-all
        # handler did) without swallowing unrelated exceptions.
        print(k)
        continue
    # First match as returned by glob; glob does not guarantee an ordering.
    path=matches[0]
    monpath=f'./dipeptides_coordinates/{k}.xyz'
    if os.path.exists(path) and os.path.exists(monpath):
        co2files[v]=path
        monomerfiles[v]=monpath

# Name check
# NOTE(review): split('/')[-2] selects the directory component of each path, not
# the file name, so both lists contain the same folder-derived string for every
# entry. Confirm whether [-1] (the file name) was intended.
co2check=['_'.join(i.split('/')[-2].split('_')[:3]) for i in co2files.values()]
moncheck=['_'.join(i.split('/')[-2].split('_')[:4]) for i in monomerfiles.values()]

In [None]:
# Sorted unique first-residue names taken from the pair names ("acid" removed).
AAs=sorted({row[0].split('-')[0].replace('acid','') for row in pairs.values})

In [None]:
# Rows of inter_df in which both carboxy columns are exactly zero.
_both_carboxy_zero=inter_df[['A_carboxy','B_carboxy']].eq(0).all(axis=1)
noncarboyx=inter_df.loc[_both_carboxy_zero]

In [None]:
# Display the rows where both carboxy columns are zero.
noncarboyx

In [None]:

def _load_mol_graph(xyz_path):
    """Create a MolGraph and fill it from the .xyz file at ``xyz_path``."""
    mol_graph = MolGraph()
    mol_graph.read_xyz(xyz_path)
    return mol_graph


# Dipeptide + CO2 files: parsed MolGraph objects and their NetworkX conversions,
# keyed like co2files.
co2mgs={name:_load_mol_graph(xyz) for name,xyz in co2files.items()}
co2dipepgraphs={name:to_networkx_graph(mol_graph) for name,mol_graph in co2mgs.items()}

# Same two dictionaries for the monomer files, keyed like monomerfiles.
mgs={name:_load_mol_graph(xyz) for name,xyz in monomerfiles.items()}
dipepgraphs={name:to_networkx_graph(mol_graph) for name,mol_graph in mgs.items()}


In [None]:
# FLAG!
# df.loc['Asparagine-Alanine']
# fig = to_plotly_figure(co2mgs['Asparagine-Alanine'])
# offline.plot(fig)

# FLAG THIS ONE!!
# fig = to_plotly_figure(co2mgs['Alanine-Lysine'])
# offline.plot(fig)

In [None]:
# Sanity check: print the split name list once for every component that still
# contains the substring "acid".
for full_name in pairs.to_dict()[1].values():
    names=full_name.split('-')
    for _ in [part for part in names if 'acid' in part]:
        print(names)

In [None]:
# Display the sorted list of residue names.
AAs

In [None]:
# Interaction energy of the Glycine-Proline row (single .loc lookup instead of chained indexing).
df.loc['Glycine-Proline','Interaction_Energy']

In [None]:
# Value of the 'label' column for the Glycine-Proline row.
df.loc['Glycine-Proline','label']

In [None]:
# Rows of the single-amino-acid table for Glycine and Proline, shown as a tuple.
tuple(AA_df.loc[residue] for residue in ('Glycine','Proline'))

In [None]:
# pca = PCA(n_components=2)
# pca_result = pca.fit_transform(X)

# pcadf['pca-one'] = pca_result[:,0]
# pcadf['pca-two'] = pca_result[:,1] 
# # pcadf['pca-three'] = pca_result[:,2]
# print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))
# print(sum(pca.explained_variance_ratio_))
# plt.figure(figsize=(16,10))
# sns.scatterplot(
#     x="pca-one", y="pca-two",
#     # hue='A_side',
#     palette=sns.color_palette("hls", 8),
#     data=pcadf,
#     legend="full"
# )

# # sns.scatterplot(
# #     x="pca-one", y="pca-three",
# #     hue='Interaction_Energy',
# #     palette=sns.color_palette("hls", 8),
# #     data=pcadf,
# #     legend="full"
# # )


In [None]:
# Build residue-by-residue matrices from the dipeptide energies.
# Layout for all three frames: column = first residue A, row = second residue B,
# i.e. frame.loc[B, A] refers to the dipeptide "A-B".
#   all_energies : energy of A-B
#   IE_df        : energy of A-B minus energy of A-A
#   glygly_df    : energy of A-B minus energy of Glycine-Glycine
# sorted_IE keeps the raw energies as a nested dict {A: {B: energy}}.

# Convert the column to a dict once; the lookup is reused for every pair.
ie_lookup=df['Interaction_Energy'].to_dict()

sorted_IE={}
all_energies=pd.DataFrame(columns=AAs,index=AAs)
IE_df=pd.DataFrame(columns=AAs,index=AAs)
glygly_df=pd.DataFrame(columns=AAs,index=AAs)
for A in AAs:
    sorted_IE[A]={B:ie_lookup['-'.join((A,B))] for B in AAs}
    for B in AAs:
        energy=sorted_IE[A][B]
        # Single .loc[row, column] assignment: the former frame[A].loc[B]=...
        # form is chained assignment, which does not write back to the frame
        # when pandas Copy-on-Write is active.
        IE_df.loc[B,A]=energy-ie_lookup['-'.join((A,A))]
        glygly_df.loc[B,A]=energy-ie_lookup['-'.join(('Glycine','Glycine'))]
        all_energies.loc[B,A]=energy

In [None]:
# Write the summary statistics of the dipeptide energies to an Excel file.
df['Interaction_Energy'].describe().to_excel('describe_dipep_IE.xlsx')

In [None]:
# For every all_energies column: the row label and the value of its MAXIMUM energy.
# NOTE(review): the columns are nevertheless named 'idxmin'/'min' — presumably
# "min" refers to the weakest interaction rather than the smallest number; confirm.
_energies_float=all_energies.astype(float)
_per_column_max=pd.DataFrame([_energies_float.idxmax(),_energies_float.max()])
minDF=_per_column_max.T.rename(columns={0:'idxmin',1:'min'})

In [None]:
# Entries of minDF['idxmin'] whose value is 'Glycine'.
minDF['idxmin'][minDF['idxmin']=='Glycine']

In [None]:
# Column sums of df over the "<index>-Glycine" rows selected from minDF['idxmin'].
df.loc[[f'{k}-{v}' for k,v in minDF['idxmin'][minDF['idxmin']=='Glycine'].to_dict().items()]].sum()

In [None]:
# For every all_energies column: the row label and the value of its MINIMUM energy.
# NOTE(review): the columns are nevertheless named 'idxmax'/'max' — presumably
# "max" refers to the strongest interaction rather than the largest number; confirm.
_energies_float=all_energies.astype(float)
_per_column_min=pd.DataFrame([_energies_float.idxmin(),_energies_float.min()])
maxDF=_per_column_min.T.rename(columns={0:'idxmax',1:'max'})

In [None]:
# Display the per-column minimum-energy table.
maxDF

In [None]:
# One "<statistic> <value> kcal/mol" string per summary statistic of maxDF['max'].
_max_summary=maxDF['max'].astype(float).describe().to_dict()
[f'{k} {v:.4f} kcal/mol' for k,v in _max_summary.items()]

In [None]:
# One 'Paired' palette colour per label of AA_df.index.
_aa_palette=sns.color_palette('Paired',len(AA_df))
aa_color_dct={residue:colour for residue,colour in zip(AA_df.index,_aa_palette)}

In [None]:
# Summary statistics of Interaction_Energy over the "<index>-<idxmin>" rows listed in minDF.
df.loc[[f"{k}-{v}" for k,v in minDF['idxmin'].to_dict().items()]]['Interaction_Energy'].describe()

In [None]:
# Summary statistics of Interaction_Energy over the "<index>-<idxmax>" rows listed in maxDF.
df.loc[[f"{k}-{v}" for k,v in maxDF['idxmax'].to_dict().items()]]['Interaction_Energy'].describe()

In [None]:
# Display the full per-dipeptide table.
df

In [None]:
# Column sums of df over the rows named "<minDF index>-<minDF idxmin>".
df.loc[[i[0]+'-'+i[1] for i in minDF.reset_index().to_numpy()]].sum()

In [None]:
# Column sums of df over the rows named "<maxDF index>-<maxDF idxmax>".
df.loc[[i[0]+'-'+i[1] for i in maxDF.reset_index().to_numpy()]].sum()

In [None]:
# Two pie charts: how often each label occurs in minDF['idxmin'] (left) and
# in maxDF['idxmax'] (right). The figure is saved to dehydrogenated_dis.png.
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(12,5))

# Left pie: frequency of each label in minDF['idxmin'], one 'Paired' colour per wedge.
cntidmin=minDF['idxmin'].value_counts()
mincmap=sns.color_palette('Paired',len(cntidmin))
ax1.pie(cntidmin.values, labels = cntidmin.index, autopct='%.0f%%',colors=mincmap)
ax1.set_title('Weakest Dehydrogenated Distribution')

# Right pie: frequency of each label in maxDF['idxmax'].
cntidmax=maxDF['idxmax'].value_counts()
# Palette has one colour fewer than wedges, plus the second colour of a 2-colour
# 'Paired' palette for the last wedge.
# NOTE(review): presumably done to control the colour of the last wedge — confirm.
maxcmap=sns.color_palette('Paired',len(cntidmax)-1)+[sns.color_palette('Paired',2)[1]]
ax2.pie(cntidmax.values, labels = cntidmax.index, autopct='%.0f%%',colors=maxcmap)
ax2.set_title('Strongest Dehydrogenated Distribution')
plt.tight_layout()
plt.savefig('dehydrogenated_dis.png',dpi=300,bbox_inches='tight')
plt.show()




In [None]:
# Site-column sums (ascending) over the "<index>-Glycine" rows taken from minDF['idxmin'].
df.loc[['-'.join(i) for i in minDF['idxmin'].reset_index().to_numpy() if i[1]=='Glycine']][sitelist].sum().sort_values()

In [None]:
# Row names "<index>-Glycine" for the minDF['idxmin'] entries equal to 'Glycine'.
['-'.join(i) for i in minDF['idxmin'].reset_index().to_numpy() if i[1]=='Glycine']

In [None]:
# Site-column sums (ascending) over the rows "<index>-<idxmax>" whose maxDF['idxmax']
# value is Histidine, Glutamic or Glutamine (the three selections are stacked in that order).
df[sitelist].loc[["-".join(i) for i in np.vstack([maxDF['idxmax'].reset_index().to_numpy()[maxDF['idxmax'].reset_index().to_numpy()[:,1]==B] for B in ["Histidine","Glutamic","Glutamine"]])]].sum().sort_values()

In [None]:
# Site-column sums (ascending) over the "<index>-Asparagine" rows taken from maxDF['idxmax'].
df.loc[['-'.join(i) for i in maxDF['idxmax'].reset_index().to_numpy() if i[1]=='Asparagine']][sitelist].sum().sort_values()


In [None]:
# Row names "<index>-Asparagine" for the maxDF['idxmax'] entries equal to 'Asparagine'.
['-'.join(i) for i in maxDF['idxmax'].reset_index().to_numpy() if i[1]=='Asparagine']


In [None]:
# Index labels of the low-side outliers: energies below Q1 - 1.5 * (Q3 - Q1),
# with the quartiles read from describe().
_ie=df['Interaction_Energy']
_ie_stats=_ie.describe()
_lower_fence=_ie_stats['25%']-1.5*(_ie_stats['75%']-_ie_stats['25%'])
outliers=_ie[_ie<_lower_fence].index

In [None]:
# Energies of the low-side outliers.
df['Interaction_Energy'].loc[outliers]

In [None]:
# Label and value (2 decimals) of the lowest interaction energy.
df['Interaction_Energy'].idxmin(),df['Interaction_Energy'].min().round(2)

In [None]:
# Label and value (2 decimals) of the highest interaction energy.
df['Interaction_Energy'].idxmax(),df['Interaction_Energy'].max().round(2)

In [None]:
# Summary statistics of the dipeptide energies, rounded to 2 decimals.
df['Interaction_Energy'].describe().round(2)

In [None]:
# Summary statistics of the single-amino-acid table, rounded to 2 decimals.
AA_df.describe().round(2)

In [None]:
# Summary statistics of the per-column maxima of all_energies.
all_energies.astype(float).describe().loc['max'].describe().round(2)

In [None]:
# Summary statistics of the per-column minima of all_energies.
all_energies.astype(float).describe().loc['min'].describe().round(2)

In [None]:
# Two box-plot panels sharing the y axis: AA_df stacked with the dipeptide
# energies (left) and one box per all_energies column (right).
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5),sharey=True)
# NOTE(review): the rename expects the stacked frame to carry the columns
# "Interaction_Energy" and 1 — confirm that pd.concat keeps the Series name
# as a column label in the pandas version in use.
sns.boxplot(data=pd.concat([AA_df,df['Interaction_Energy']]).rename(columns={"Interaction_Energy":'Dipeptides',1:'Amino Acids'}),ax=ax1,palette=sns.color_palette('Paired'))

ax1.set_ylim(-12,0)
ax1.set_ylabel('Interaction Energies (kcal/mol)')
sns.boxplot(data=all_energies,ax=ax2,palette=sns.color_palette('Paired'))
# Rotate the column names so they stay readable.
ax2.set_xticklabels(ax2.get_xticklabels(), rotation = 90, ha="center")

plt.tight_layout()
plt.savefig('boxplot_IEs_vs_AAs.png',dpi=300,bbox_inches='tight')

In [None]:

# Heat map of every dipeptide energy, saved to pair_heat.png.
# all_energies is filled as all_energies[A].loc[B] for the dipeptide "A-B", so
# its columns (x axis) are residue A and its rows (y axis) are residue B.
ax=sns.heatmap(data=all_energies.astype(float),cmap=sns.cm.rocket, linewidth=.01,cbar_kws={'label': 'Interaction Energy (kcal/mol)'}, vmin=-12, vmax=0,square=True)
ax.invert_yaxis()
# The axis labels were previously swapped relative to the data layout (and to
# the deviation heat maps, which label x as A and y as B).
ax.set_xlabel('Peptide A')
ax.set_ylabel('Peptide B')
plt.savefig('pair_heat.png',dpi=300,bbox_inches='tight')
plt.show()

In [None]:
# Twenty 'Paired' colours and five matplotlib marker codes.
palette=sns.color_palette('Paired',20)
markers=['o','x','^','1','*']

In [None]:
from matplotlib import cm
from matplotlib.ticker import LinearLocator


# One row per ordered residue pair (A, B):
# [first AA_df value of A, first AA_df value of B, all_energies[A][B]].
data_3D=np.array([
    [AA_df.loc[A].values[0],AA_df.loc[B].values[0],all_energies[A][B]]
    for A in AA_df.index
    for B in AA_df.index
])

# Split into the two 1-D coordinate arrays and a column vector of energies.
X,Y=data_3D[:,0],data_3D[:,1]
Z=data_3D[:,2].reshape(-1,1)



# fig, ax = plt.subplots(subplot_kw={"projection": "3d"})
# ax.scatter(X, Y, Z)
# # X, Y = np.meshgrid(X, Y)

# # # Plot the surface.
# # surf = ax.plot_surface(X, Y, Z, cmap=cm.coolwarm,
# #                        linewidth=0, antialiased=False)

# # # Customize the z axis.
# # # ax.set_zlim(-1.01, 1.01)
# # ax.zaxis.set_major_locator(LinearLocator(10))
# # # A StrMethodFormatter is used automatically
# # ax.zaxis.set_major_formatter('{x:.02f}')

# # # Add a color bar which maps values to colors.
# # fig.colorbar(surf, shrink=0.5, aspect=5)

# # Rotate the axes and update
# for angle in range(0, 360*4 + 1):
#     # Normalize the angle to the range [-180, 180] for display
#     angle_norm = (angle + 180) % 360 - 180

#     # Cycle through a full rotation of elevation, then azimuth, roll, and all
#     elev = azim = roll = 0
#     if angle <= 360:
#         elev = angle_norm
#     elif angle <= 360*2:
#         azim = angle_norm
#     elif angle <= 360*3:
#         roll = angle_norm
#     else:
#         elev = azim = roll = angle_norm

#     # Update the axis view and title
#     ax.view_init(elev, azim, roll)
#     plt.title('Elevation: %d°, Azimuth: %d°, Roll: %d°' % (elev, azim, roll))

#     plt.draw()
#     plt.pause(.001)

# # plt.show()


In [None]:
def _count_not_above_reference(energies):
    """Per residue in AA_df.index, count the True/False outcomes of
    ``energies[residue] <= first AA_df value of that residue`` (one column each)."""
    return pd.concat(
        [(energies[A]<=AA_df.loc[A].values[0]).value_counts() for A in AA_df.index],
        axis=1,
    )


# Column-wise comparison (all_energies) and row-wise comparison (all_energies.T).
greaterdehydroxy=_count_not_above_reference(all_energies)
greaterdehydro=_count_not_above_reference(all_energies.T)
# Totals of True / False over all residues.
greaterdehydroxy_int=greaterdehydroxy.T.sum()
greaterdehydro_int=greaterdehydro.T.sum()

In [None]:
# Site-column sums (ascending) over the "Leucine-<B>" rows whose all_energies['Leucine'] value is above Leucine's AA_df value.
df[sitelist].loc[[f'Leucine-{i}' for i in all_energies['Leucine'][AA_df.loc['Leucine'].values[0]<all_energies['Leucine']].index]].sum().sort_values()

In [None]:
# Site-column sums (ascending) over the "Arginine-<B>" rows whose all_energies['Arginine'] value is above Arginine's AA_df value.
df[sitelist].loc[[f'Arginine-{i}' for i in all_energies['Arginine'][AA_df.loc['Arginine'].values[0]<all_energies['Arginine']].index]].sum().sort_values()

In [None]:
# Rows of all_energies for Leucine and Arginine.
all_energies.loc[['Leucine','Arginine']]

In [None]:
# Columns of all_energies for Leucine and Arginine.
all_energies[['Leucine','Arginine']]

In [None]:
# Per residue: the all_energies column entries that are >= the residue's AA_df value, paired with that value.
[[all_energies[A].loc[(all_energies[A]>=AA_df.loc[A].values[0])],AA_df.loc[A].values[0]] for idx,A in enumerate(AA_df.index)]

In [None]:
# Per-residue count of False outcomes in greaterdehydro (missing counts dropped), ascending.
greaterdehydro.T[False].dropna().sort_values()

In [None]:
# For each residue, print the row entries of all_energies that are below the
# residue's AA_df value at positions where the matching column entry is below it too.
for i in AA_df.index:
    reference=AA_df.loc[i].values[0]
    as_row=all_energies.T[i]
    as_column=all_energies[i]
    print(as_row[(reference>as_row)&(reference>as_column)])

In [None]:





def _partners_by_strength(energies, stronger):
    """For each residue in AA_df.index, list the labels of ``energies[residue]``
    whose value is below (``stronger=True``) or above (``stronger=False``) the
    residue's first AA_df value. Returns one list per residue."""
    partner_lists=[]
    for residue in AA_df.index:
        reference=AA_df.loc[residue].values[0]
        series=energies[residue]
        selected=(reference>series) if stronger else (reference<series)
        partner_lists.append(list(series[selected].index))
    return partner_lists


def _composition_pie(ax, partner_lists, title):
    """Draw a pie of how often each label occurs across ``partner_lists`` and
    return the (label, count) array sorted by ascending count."""
    counted=np.array(sorted(Counter(sum(partner_lists,[])).items(),key=lambda x: x[1]))
    ax.pie(counted[:,1].astype(int), labels = counted[:,0], autopct='%.0f%%',
           colors=sns.color_palette('Paired',len(counted)), pctdistance=0.9)
    ax.set_title(title)
    return counted


# 3x2 figure: overall stronger/weaker split (top row) and the composition of the
# stronger (middle row) and weaker (bottom row) groups. Saved to interaction_compare.png.
fig,((ax1,ax2),(ax3,ax4),(ax5,ax6))=plt.subplots(3,2,figsize=(12,15))

cmap=sns.color_palette('Paired',2)
# True means "energy <= amino-acid value", which this notebook labels "Stronger".
# The order of the True/False totals comes from value_counts and therefore
# depends on the data; pin it so the fixed labels always match their wedge.
strength_labels=['Stronger Interaction (S)','Weaker Interaction (W)']
for pie_ax,totals,title in (
    (ax1,greaterdehydroxy_int,'Dehydroxylated Peptide vs. Amino Acid'),
    (ax2,greaterdehydro_int,'Dehydrogenated Peptide vs. Amino Acid'),
):
    ordered=totals.reindex([True,False],fill_value=0)
    pie_ax.pie(ordered.values, labels = strength_labels, autopct='%.0f%%',colors=cmap)
    pie_ax.set_title(title)

# Column-wise comparison (all_energies).
gtIE_dehydroxy=_partners_by_strength(all_energies,stronger=True)
gtIE_dehydroxy_arr=_composition_pie(ax3,gtIE_dehydroxy,'Composition of S Dehydroxylated Peptide')

ltIE_dehydroxy=_partners_by_strength(all_energies,stronger=False)
ltIE_dehydroxy_arr=_composition_pie(ax5,ltIE_dehydroxy,'Composition of W Dehydroxylated Peptide')

# Row-wise comparison (all_energies.T).
gtIE_dehydrogen=_partners_by_strength(all_energies.T,stronger=True)
gtIE_dehydrogen_arr=_composition_pie(ax4,gtIE_dehydrogen,'Composition of S Dehydrogenated Peptide')

ltIE_dehydrogen=_partners_by_strength(all_energies.T,stronger=False)
ltIE_dehydrogen_arr=_composition_pie(ax6,ltIE_dehydrogen,'Composition of W Dehydrogenated Peptide')

plt.tight_layout()

plt.savefig('interaction_compare.png',dpi=300,bbox_inches='tight')
plt.show()





In [None]:
# For each residue, print the all_energies row labels whose column value is below the residue's AA_df value.
for i in AA_df.index:
    column=all_energies[i]
    below_reference=AA_df.loc[i].values[0]>column
    print(list(column[below_reference].index))

In [None]:
# Per-label occurrence counts in the two "lt" lists (ltIE_dehydroxy + ltIE_dehydrogen), summed and sorted ascending.
pd.concat([pd.DataFrame.from_dict(dict(Counter(sum(ltIE_dehydroxy,[]))),orient='index'),pd.DataFrame.from_dict(dict(Counter(sum(ltIE_dehydrogen,[]))),orient='index')],axis=1).sum(axis=1).sort_values()

In [None]:
# Per-label occurrence counts in the two "gt" lists (gtIE_dehydroxy + gtIE_dehydrogen), summed and sorted ascending.
pd.concat([pd.DataFrame.from_dict(dict(Counter(sum(gtIE_dehydroxy,[]))),orient='index'),pd.DataFrame.from_dict(dict(Counter(sum(gtIE_dehydrogen,[]))),orient='index')],axis=1).sum(axis=1).sort_values()




In [None]:
# LinearSegmentedColormap is only referenced by the commented-out colour-map lines below.
from matplotlib.colors import LinearSegmentedColormap

# Side-by-side heat maps of the two deviation matrices (IE_df left, glygly_df right).
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(15,6))
cmap = sns.cm.rocket_r
# cmap = LinearSegmentedColormap.from_list('', np.vstack([sns.color_palette('rocket'),sns.color_palette('rocket_r')]))
# Left: IE_df, colour scale fixed to [-5, 9].
sns.heatmap(data=IE_df.astype(float),ax=ax1,cmap=cmap, linewidth=.01,cbar_kws={'label': 'Deviation (kcal/mol)'}, vmin=-5, vmax=9,square=True)
ax1.set_title('Deviation Between AA')
ax1.set_xlabel('Amino Acid A')
ax1.set_ylabel('Amino Acid B')
ax1.invert_yaxis()

# cmap = LinearSegmentedColormap.from_list('', np.vstack([sns.color_palette('rocket',8),sns.color_palette('rocket_r',3)]))
# Right: glygly_df, colour scale fixed to [-8, 1].
sns.heatmap(data=glygly_df.astype(float),ax=ax2,cmap=cmap, linewidth=.01,cbar_kws={'label': 'Deviation (kcal/mol)'}, vmin=-8, vmax=1,square=True)
ax2.set_title('Deviation Between Glycine-Glycine')
ax2.set_xlabel('Amino Acid A')
ax2.set_ylabel('Amino Acid B')
ax2.invert_yaxis()
plt.tight_layout()
# plt.savefig(rdeviations.png',dpi=300,bbox_inches='tight')
plt.show()

In [None]:
# Site columns (alphabetical) plus Interaction_Energy, rows ordered by ascending energy.
site_df=df[sorted(sitelist)+['Interaction_Energy']].sort_values(by='Interaction_Energy')

In [None]:
# Replace every zero in site_df with NaN, in place. The mask covers all columns,
# Interaction_Energy included. Later cells rely on dropna()/isna() to pick non-zero entries.
site_df[site_df==0]=np.nan

In [None]:
# Number of site columns.
len(sitelist)

In [None]:
# Alternating 1/0 pattern over eight positions, starting with 1.
[(i+1)%2 for i in range(8)]

In [None]:
# Axis-label text for each site column; insertion order is A_* first, then B_*.
interdict_conv=dict(
    A_carboxy='A Carboxyl',
    A_amine='A Secondary Amine',
    A_link='A Linking Hydrogen',
    A_side='A Side-Chain',
    B_carboxy='B Carboxylic Acid',
    B_amine='B Primary Amine',
    B_link='B Linking Hydrogen',
    B_side='B Side-Chain',
)

In [None]:


# For every site kind, print the energy statistics of the rows that have a
# value in the A_<kind> column and, separately, in the B_<kind> column.
# The kind is taken as the text after the first "_". str.strip('A_') removes
# any leading/trailing 'A' or '_' characters rather than a prefix, so it would
# mangle a site name that starts or ends with one of those characters.
for i in sorted({name.split('_',1)[1] for name in sitelist}):
    print(i)
    print(site_df.loc[site_df[f'A_{i}'].dropna().index]['Interaction_Energy'].describe())
    print(site_df.loc[site_df[f'B_{i}'].dropna().index]['Interaction_Energy'].describe())
    print()

In [None]:
# Energy statistics of the rows with a non-missing A_amine value.
site_df.loc[site_df['A_amine'].dropna().index]['Interaction_Energy'].describe()

In [None]:
# Energy statistics of the rows with a non-missing B_amine value.
site_df.loc[site_df['B_amine'].dropna().index]['Interaction_Energy'].describe()

In [None]:
# Display the site table.
site_df

In [None]:
# Display the site column names.
sitelist

In [None]:
# One describe() column per site, computed over the rows where that site is
# not missing; written to interaction_site_dist.xlsx.
_per_site_stats=[
    site_df.loc[site_df[i].notna(),'Interaction_Energy'].describe().rename(i)
    for i in sitelist
]
pd.concat(_per_site_stats,axis=1).to_excel('interaction_site_dist.xlsx')

In [None]:
# Mean energy (from describe(), rounded to 2 decimals) of the rows with a non-missing A_amine value.
site_df.loc[site_df['A_amine'].notna(),'Interaction_Energy'].describe().round(2).loc['mean']

In [None]:
# Per site kind, print the mean energy of the rows with a non-missing A_<kind>
# value and then of those with a non-missing B_<kind> value.
for sl in sorted({name.split('_')[1] for name in sitelist}):
    for prefix in ("A_","B_"):
        column=prefix+sl
        print(column,site_df.loc[site_df[column].notna(),'Interaction_Energy'].describe().round(2).loc['mean'])
    print()

In [None]:
# The ten rows with the lowest Interaction_Energy.
site_df.sort_values(by='Interaction_Energy').nsmallest(10,columns=['Interaction_Energy'])

In [None]:
# Column sums of the site columns only, ascending.
site_df.sum().drop('Interaction_Energy').sort_values()

In [None]:
# Column sums of the two carboxy columns.
site_df.sum().loc[['A_carboxy','B_carboxy']]

In [None]:
# Combined sum of the two side columns.
site_df.sum().loc[['A_side','B_side']].sum()

In [None]:
# Display the site table. The former name `site_df6` is not defined anywhere
# in the notebook and raised NameError on a fresh run.
site_df

In [None]:
# Combined sum of the two amine columns.
site_df.sum().loc[['A_amine','B_amine']].sum()

In [None]:
# Sum of the B_side column.
site_df.sum().loc['B_side']

In [None]:
# Sum of the A_side column.
site_df.sum().loc['A_side']

In [None]:
# Combined sum of the two carboxy columns.
site_df.sum().loc[['A_carboxy','B_carboxy']].sum()

##### Counter(site_df['A_side'].dropna())

In [None]:
# Frequency of each non-missing B_side value.
Counter(site_df['B_side'].dropna())

In [None]:
# Sum of the non-missing A_side values plus the sum of the non-missing B_amine values.
site_df['A_side'].dropna().sum()+site_df['B_amine'].dropna().sum()

In [None]:
# Display names per site kind (only used by the commented-out bar plots further down).
bardict={'amine':'Amine','carboxy':'Carboxy','link':'Linker Hydrogen','side':'Side'}
# fig,ax=plt.subplots(3,len(sitelist)//2,figsize=(15,10))
fig,ax=plt.subplots(2,len(sitelist)//2,figsize=(14,8))
# One box plot per site column, in alphabetical order: the first four columns
# fill the top row, the remaining ones the bottom row.
for idx,i in enumerate(sorted(sitelist)):
    panel=ax[0 if idx<=3 else 1,idx%4]
    sns.boxplot(data=site_df,x=i,y='Interaction_Energy',ax=panel,palette=sns.color_palette('Paired',3))
    # Tick positions 0..2 are shown with the labels 1..3.
    panel.set_xticks([0,1,2])
    panel.set_xticklabels([1,2,3])
    panel.set_xlabel(interdict_conv[i])
    panel.set_ylim(-12,0)
    panel.set_xlim(-1,3)
    panel.set_ylabel('Interaction Energy (kcal/mol)')

    
# for idxx,j in enumerate(sorted(set([i.split('_')[1] for i in sitelist]))):
#     melted=pd.concat([site_df[f'A_{j}'].value_counts(),site_df[f'B_{j}'].value_counts()],axis=1).reset_index().melt(id_vars='index')
#     ax[2,idxx%4]=sns.barplot(data=melted,x='index',y='value',hue='variable',ax=ax[2,idxx%4],palette=sns.color_palette('Paired',2))
#     ax[2,idxx%4].set_xticks([0,1,2])
#     ax[2,idxx%4].set_xticklabels([1,2,3])    
#     ax[2,idxx%4].set_xlabel(j)
#     ax[2,idxx%4].set_ylim(0,160)
#     ax[2,idxx%4].set_xlim(-1,3)    
#     ax[2,idxx%4].set_ylabel('Count')
#     ax[2,idxx%4].set_xlabel(bardict[j])
    
# Shared x label for the whole figure, then lay out, save to boxplot.png and show.
fig.supxlabel('Interaction Sites',fontsize=10)
plt.tight_layout()
plt.savefig('boxplot.png',dpi=300,bbox_inches='tight')
plt.show()

In [None]:
# Pair plot with Interaction_Energy on the y axis; x variables are left to seaborn's default.
g=sns.pairplot(data=site_df,y_vars=['Interaction_Energy'])

# Apply the same ticks and limits to the first len(sitelist) panels.
# NOTE(review): assumes those panels are the site columns — confirm against the pair plot's column order.
for i in range(len(sitelist)):
    ax=g.axes[0,i]
    ax.set_xticks([1,2,3])
    ax.set_xticklabels([1,2,3])
    ax.set_xlim(0,4)
    ax.set_ylim(-12,0)
plt.tight_layout()
# plt.savefig('pairplot.png',dpi=300,bbox_inches='tight')
plt.show()

In [None]:
# Rows whose label contains "Serine-".
SerDF=site_df.loc[[i for i in site_df.index if 'Serine-' in i]]

# SerList : [site combination, energy] per row, where the combination is the
#           "/"-joined names of the site columns that are not missing.
# SerDict : energies grouped by site combination.
# Grouping happens in the same pass, in first-seen order. The earlier version
# iterated a set of the combinations, so the column order of SerInt could
# change between runs, and it failed when no row matched.
SerList=[]
SerDict={}
for i in SerDF.index:
    subdf=SerDF[sitelist].loc[i]
    combination='/'.join(subdf.dropna().index)
    energy=SerDF['Interaction_Energy'].loc[i]
    SerList.append([combination,energy])
    SerDict.setdefault(combination,[]).append(energy)

# One column per site combination; shorter groups are padded with NaN.
SerInt=pd.DataFrame.from_dict(SerDict,orient='index').T


In [None]:
# Rows whose label contains "Asparagine-".
AspDF=site_df.loc[[i for i in site_df.index if 'Asparagine-' in i]]

# AspList : [site combination, energy] per row, where the combination is the
#           "/"-joined names of the site columns that are not missing.
# AspDict : energies grouped by site combination.
# Grouping happens in the same pass, in first-seen order. The earlier version
# iterated a set of the combinations, so the column order of AspInt could
# change between runs, and it failed when no row matched.
AspList=[]
AspDict={}
for i in AspDF.index:
    subdf=AspDF[sitelist].loc[i]
    combination='/'.join(subdf.dropna().index)
    energy=AspDF['Interaction_Energy'].loc[i]
    AspList.append([combination,energy])
    AspDict.setdefault(combination,[]).append(energy)

# One column per site combination; shorter groups are padded with NaN.
AspInt=pd.DataFrame.from_dict(AspDict,orient='index').T



In [None]:
# Box plots of the two deviation matrices, one box per column, each with a
# dashed red line at the mean over all entries of that matrix.
fig,(ax1,ax2)=plt.subplots(1,2,figsize=(10,5))

# (axes, matrix, title, y-limits). The left panel's limits used to be applied
# to ax2 by mistake and were then overwritten, so ax1 never received them.
deviation_panels=(
    (ax1,IE_df,'Deviation Between AA',(-5,9)),
    (ax2,glygly_df,'Deviation Between Glycine-Glycine',(-8,1)),
)
for panel,deviations,title,ylimits in deviation_panels:
    sns.boxplot(data=deviations.astype(float),ax=panel)
    panel.hlines(np.mean(deviations.values.flatten()),-2,20,color='red',linestyles='--',label='Mean Deviation')
    panel.legend()
    panel.set_xlim(-1,20)
    panel.set_ylim(*ylimits)
    # Rotate the column names so they stay readable.
    panel.set_xticklabels(panel.get_xticklabels(), rotation = 90, ha="center")
    panel.set_title(title)
    panel.set_xlabel('Amino Acid A')
    panel.set_ylabel('Deviation (kcal/mol)')
# plt.savefig('deviation_boxplots.png',dpi=300,bbox_inches='tight')
plt.tight_layout()

In [None]:
# from mpl_toolkits.mplot3d import Axes3D
# from matplotlib.colors import ListedColormap
# # axes instance
# fig, ax = plt.subplots(subplot_kw={"projection": "3d"})


# # plot
# # sc = ax.scatter(unX['level_0'].values, unX['level_1'].values, unX[0].values, s=40,c=unX[0].values, marker='o', cmap=cmap, alpha=1)
# X=Y=range(len(AAs))
# X, Y = np.meshgrid(X, Y)
# sc = ax.plot_surface(X,Y, unX[0].values.reshape(20,20), cmap=sns.cm.rocket_r)
# fig.colorbar(sc, shrink=0.5, aspect=5)

# ax.set_xlabel('Amino Acid A')
# ax.set_xticks(range(len(AAs)))
# ax.set_xticklabels(AAs)
# ax.set_yticks(range(len(AAs)))
# ax.set_yticklabels(AAs)
# ax.set_ylabel('Amino Acid B')
# ax.set_zlabel('Interaction Energy (kcal/mol)')
# plt.tight_layout()
# plt.show()

In [None]:
# # Create the MolGraph object
# mg = MolGraph()

# # Read the data from the .xyz file
# mg.read_xyz(monomerfiles[0])


# # Convert the molecular graph to the NetworkX graph
# G = to_networkx_graph(mg)

# # G.nodes(data=True),G.edges(data=True)

In [None]:
# from rdkit.Chem import rdFingerprintGenerator
# mols=[Chem.MolFromSmiles(list(pybel.readfile('xyz',m))[0].write().split('\t')[0]) for m in monomerfiles]
# fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=6)

# # info={}
# # X = np.vstack([fpgen.GetFingerprint(mol) for mol in mols ])

# fps = [MACCSkeys.GenMACCSKeys(x) for x in mols]
# X=np.vstack([f.ToList() for f in fps])
# Y=y.values

# keys=pd.read_excel('MACCS_keys_example.xlsx',index_col='Key').drop(columns=['Unnamed: 0'])

# mol_keys=[idx for idx, i in enumerate(X[0]) if i==1]

# {str(keys.loc[idx].values[0]):i for idx, i in enumerate(np.count_nonzero(X,axis=0)) if i!=0}

In [None]:

# sns.heatmap([[DataStructs.TanimotoSimilarity(i,j) for i in fps] for j in fps],vmin=0,vmax=1,cmap=sns.cm.rocket_r)
# plt.show()

In [None]:
# Draw.MolsToGridImage(mols,molsPerRow=10, subImgSize=(300,300))