In [None]:
import sys
# !{sys.executable} -m pip install shap
import lightgbm as lgb
import xgboost
import shap
from xgboost import XGBRegressor
from collections import Counter

from tqdm.notebook import trange, tqdm
from time import sleep, perf_counter

import os
from glob import glob
import pandas as pd
import numpy as np
import networkx as nx
from scipy.spatial import distance_matrix

# Scikit-learn
from sklearn.model_selection import KFold
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error,mean_absolute_percentage_error
from sklearn.preprocessing import normalize, MinMaxScaler


# Torch
import torch
print(torch.__version__,torch.__path__)
import torch.nn as nn

# Reps
from alchemical_cms import genpaddedCMs
from dscribe.descriptors import SOAP
from dscribe.kernels import REMatchKernel
from mendeleev.fetch import fetch_table
from ase.io import read
from Element_PI import VariancePersist
from Element_PI import VariancePersistv1

from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw, MACCSkeys
from openbabel import openbabel as ob
from openbabel import pybel

from xyz2graph import MolGraph, to_networkx_graph, to_plotly_figure
from plotly.offline import offline
#Plotting
import seaborn as sns
sns.set_style()
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.dpi'] = 200


In [None]:
# Site table read from the spreadsheet; its first column becomes the index.
df = pd.read_excel('site_data.xlsx', index_col=0)
# Columns of `df` that are extracted into `inter_df`.
sitelist = [
    'A_carboxy', 'A_amine', 'B_carboxy', 'B_amine',
    'A_link', 'B_link', 'A_side', 'B_side',
]
inter_df = df.loc[:, sitelist]
# inter_df[inter_df!=0]=1

In [None]:


# Interaction energies indexed by the spreadsheet's label column; 'keypair'
# keeps the first three underscore-separated tokens of each label so rows can
# be matched against the index of `pairs`.
y=pd.read_excel('400_dipeptides_interaction_energy.xlsx',usecols=[1,2],index_col=0)
y['keypair']=['_'.join(i.split('_')[:3]) for i in y.index]

# Each Sheet2 row is a single "<key>:<name>" string; the name has spaces and
# the substring 'acid' stripped. Column 0 (the key) becomes the index.
sheet2=pd.read_excel('400_dipeptides_interaction_energy.xlsx',sheet_name='Sheet2',header=None)
pair_rows=[]
for row in sheet2.values:
    fields=row[0].split(":")
    pair_rows.append((fields[0],fields[1].replace(" ","").replace('acid','')))
pairs=pd.DataFrame(pair_rows).set_index(0)


# Find set of files
monomerfiles={}
co2files={}
for k,v in pairs.to_dict()[1].items():
    # sorted() makes the pick deterministic when several files match the key.
    matches=sorted(glob(f'./dipeptides_co2_coordinates/{k}_*.xyz'))
    if not matches:
        # No CO2 structure for this key: report it (as before) and move on,
        # without hiding unrelated errors behind a bare except.
        print(k)
        continue
    path=matches[0]
    monpath=f'./dipeptides_coordinates/{k}.xyz'
    if os.path.exists(path) and os.path.exists(monpath):
        co2files[v]=path
        monomerfiles[v]=monpath

# Name check
# NOTE(review): with the paths built above, split('/')[-2] is the parent
# directory component rather than the file name - confirm this is intended.
co2check=['_'.join(i.split('/')[-2].split('_')[:3]) for i in co2files.values()]
moncheck=['_'.join(i.split('/')[-2].split('_')[:4]) for i in monomerfiles.values()]

In [None]:
# Sorted unique first components of the pair names, with 'acid' removed.
AAs = sorted({row[0].split('-')[0].replace('acid', '') for row in pairs.values})

In [None]:
# Rows of inter_df where both the A_carboxy and B_carboxy entries equal zero.
noncarboyx = inter_df.loc[inter_df['A_carboxy'].eq(0) & inter_df['B_carboxy'].eq(0)]

In [None]:
noncarboyx

In [None]:

def _read_graphs(xyz_by_name):
    """Read every .xyz path into a MolGraph and convert it to a NetworkX graph.

    Returns two dicts that share the keys of `xyz_by_name`: the MolGraph
    objects, and the graphs returned by `to_networkx_graph`.
    """
    molgraphs, graphs = {}, {}
    for name, xyz_path in xyz_by_name.items():
        molgraph = MolGraph()
        molgraph.read_xyz(xyz_path)
        molgraphs[name] = molgraph
        graphs[name] = to_networkx_graph(molgraph)
    return molgraphs, graphs


# Structures listed in co2files, then those listed in monomerfiles.
co2mgs, co2dipepgraphs = _read_graphs(co2files)
mgs, dipepgraphs = _read_graphs(monomerfiles)


In [None]:
# FLAG!
# df.loc['Asparagine-Alanine']
# fig = to_plotly_figure(co2mgs['Asparagine-Alanine'])
# offline.plot(fig)

# FLAG THIS ONE!!
# fig = to_plotly_figure(co2mgs['Alanine-Lysine'])
# offline.plot(fig)

In [None]:
# Print the split name once for every component that still contains 'acid'.
for k, v in pairs.to_dict()[1].items():
    names = v.split('-')
    for _hit in (part for part in names if 'acid' in part):
        print(names)

In [None]:
AAs

In [None]:
# Square table of interaction energies over the residue names in AAs.
# Column = first name of the pair, row = second name (same orientation as the
# previous dfpairE[first][second] assignment).
dfpairE=pd.DataFrame(np.zeros((len(AAs),len(AAs))),index=AAs,columns=AAs)
# Build the keypair -> energy lookup once instead of on every iteration.
energy_by_key=y.set_index('keypair')['Interaction_Energy']
for k,v in pairs.to_dict()[1].items():
    names=v.split('-')
    # Single .loc write: chained indexing (df[col][row]=...) assigns to a
    # temporary and is dropped under pandas Copy-on-Write.
    dfpairE.loc[names[1],names[0]]=energy_by_key.loc[k]

In [None]:
# For each pair (i, j) with i earlier than j in AAs: |dfpairE[i][j] - dfpairE[j][i]|.
devpairs = pd.DataFrame(
    [
        ('-'.join((i, j)), abs(dfpairE.at[j, i] - dfpairE.at[i, j]))
        for idxj, j in enumerate(AAs)
        for i in AAs[:idxj]
    ],
    columns=['Pairs', 'Deviation'],
)

In [None]:
# Fraction of pairs whose deviation is at most 1.
len(devpairs.loc[devpairs['Deviation'].le(1)]) / len(devpairs)

In [None]:
# Interaction energy of every row relative to the 'Glycine-Glycine' row.
df['dev_gly'] = df['Interaction_Energy'] - df['Interaction_Energy'].loc['Glycine-Glycine']

In [None]:
# Rows with dev_gly above zero.
df.loc[df['dev_gly'].gt(0)]

In [None]:
# Rows with -2 <= dev_gly < 0.
df.loc[df['dev_gly'].lt(0) & df['dev_gly'].ge(-2)]

In [None]:
# Rows with -4 <= dev_gly <= -2 (both bounds inclusive).
df.loc[df['dev_gly'].between(-4, -2)]

In [None]:
# Rows with -6 <= dev_gly <= -4 (both bounds inclusive).
df.loc[df['dev_gly'].between(-6, -4)]

In [None]:
# Rows with dev_gly at or below -6.
df.loc[df['dev_gly'].le(-6)]

In [None]:
# Entry of `pairs` whose name column equals 'Asparagine-Asparagine'.
pairs.loc[pairs[1].eq('Asparagine-Asparagine')]

In [None]:
# Histogram of the dev_gly column; the Axes is kept in `ax`.
ax = sns.histplot(x='dev_gly', data=df)

In [None]:
# Energies relative to Glycine-Glycine, sorted ascending. The tick labels are
# taken from the SORTED series so each label sits under its own point
# (previously the labels followed the unsorted order of df).
dev_sorted=(df['Interaction_Energy']-df['Interaction_Energy']['Glycine-Glycine']).sort_values()
positions=range(len(dev_sorted))

plt.figure(figsize=(30,5))
plt.scatter(positions,dev_sorted)
plt.xticks(positions,dev_sorted.index,rotation=90, fontsize = 6)
plt.xlim(-1,len(dev_sorted)+1)
plt.show()

In [None]:
n_pairs = len(devpairs['Pairs'])
pair_positions = range(n_pairs)

plt.figure(figsize=(25,5))
# sns.lineplot(data=devpairs,x='Pairs',y='Deviation')
plt.plot(pair_positions, devpairs['Deviation'], 'o--')

# Grey band between y=0 and y=1, extending one position past each end.
band_x = np.arange(-1, n_pairs + 1)
plt.fill_between(band_x, np.zeros(n_pairs + 2), np.ones(n_pairs + 2), color='gray')

plt.xticks(pair_positions, devpairs['Pairs'], rotation=90, fontsize = 8)
plt.xlim(-1, len(devpairs) + 1)
plt.ylim(0, 7)
plt.xlabel('Pairs')
plt.ylabel('Deviation (kcal/mol)')
plt.title('Deviation Between Pairs AB and BA')
plt.tight_layout()
plt.savefig('absolute_pair_dev.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Heatmap of the pair-energy table, colour scale fixed to [-12, 0].
heatmap_options = {
    'vmin': -12,
    'vmax': 0,
    'cbar_kws': {'label': 'Interaction Energy (kcal/mol)'},
    'linewidths': 0.1,
    'square': True,
}
sns.heatmap(data=dfpairE, **heatmap_options)
plt.xlabel('Amino Acid')
plt.ylabel('Amino Acid')

plt.tight_layout()
plt.savefig('pair_heat.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Shape of the transposed site table.
inter_df.T.to_numpy().shape

In [None]:
# Number of non-zero site columns per row of inter_df, keyed by row label.
# Counting `!= 0` (rather than `== 1`) gives the same answer whether or not
# inter_df has already been binarised by the later `inter_df[inter_df!=0]=1`
# cell, so the result no longer depends on cell execution order.
dict_count=dict(zip(inter_df.index,np.count_nonzero(inter_df.values!=0,axis=1)))

In [None]:
y

In [None]:
# Pair each row's site count with the first energy whose keypair equals the
# row's label, then order the records by count.
label_by_pair = df['label'].to_dict()
intvsE = sorted(
    (
        (dict_count[k], y.loc[y['keypair'] == v, 'Interaction_Energy'].values[0])
        for k, v in label_by_pair.items()
    ),
    key=lambda rec: rec[0],
)
intvsE = np.array(intvsE)

dfintvsE = pd.DataFrame(intvsE, columns=['Count', 'Interaction_Energy'])

In [None]:
# Energy histogram split by site count, with a KDE overlay.
sns.histplot(
    data=dfintvsE,
    x='Interaction_Energy',
    hue='Count',
    stat='count',
    kde=True,
)

In [None]:
# Summary statistics of the energies, rounded to two decimals.
stats = y['Interaction_Energy'].describe().to_frame().round(2)

# Histogram with the summary table drawn underneath it.
plt.figure(figsize=(5,5))
sns.histplot(data=y, x='Interaction_Energy')
table = plt.table(
    cellText=stats.values,
    rowLabels=stats.index,
    colLabels=stats.columns,
    cellLoc='center',
    rowLoc='center',
    loc='bottom',
    bbox=[0.25, -0.5, 0.5, 0.3],
)
table.auto_set_font_size(False)
table.set_fontsize(8)

plt.subplots_adjust(left=0, bottom=0.5)
plt.tight_layout()
plt.savefig('spread.png', dpi=300, bbox_inches='tight')
plt.show()

# Keep only energies within 1.5*IQR of the quartiles (bounds inclusive).
# Note that this overwrites `y` with the filtered frame.
Q1, Q3 = (y['Interaction_Energy'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
upper = Q3 + 1.5*IQR
lower = Q1 - 1.5*IQR

y = y.loc[y['Interaction_Energy'].between(lower, upper)].dropna()
sns.histplot(data=y, x='Interaction_Energy')
plt.show()

In [None]:
# Position of each residue name within AAs.
bitkey = {name: position for position, name in enumerate(AAs)}

In [None]:
bitkey

In [None]:
# samples=len(y)
# X=np.zeros((samples,len(AAs)))
# Y=np.zeros((samples,1))

# for idx,(k,v) in enumerate(y.set_index('keypair').to_dict()['Interaction_Energy'].items()):
#     aa=pairs.loc[k].values[0].split('-')
#     a1=aa[0]
#     a2=aa[1]
#     if a1!=a2:
#         X[idx,bitkey[a1]]=1
#         X[idx,bitkey[a2]]=1
#     else:
#         X[idx,bitkey[a1]]=2
#     Y[idx]=v

In [None]:
# # Create the MolGraph object
# mg = MolGraph()

# # Read the data from the .xyz file
# mg.read_xyz(monomerfiles[0])


# # Convert the molecular graph to the NetworkX graph
# G = to_networkx_graph(mg)

# # G.nodes(data=True),G.edges(data=True)

In [None]:
# from rdkit.Chem import rdFingerprintGenerator
# mols=[Chem.MolFromSmiles(list(pybel.readfile('xyz',m))[0].write().split('\t')[0]) for m in monomerfiles]
# fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=6)

# # info={}
# # X = np.vstack([fpgen.GetFingerprint(mol) for mol in mols ])

# fps = [MACCSkeys.GenMACCSKeys(x) for x in mols]
# X=np.vstack([f.ToList() for f in fps])
# Y=y.values

# keys=pd.read_excel('MACCS_keys_example.xlsx',index_col='Key').drop(columns=['Unnamed: 0'])

# mol_keys=[idx for idx, i in enumerate(X[0]) if i==1]

# {str(keys.loc[idx].values[0]):i for idx, i in enumerate(np.count_nonzero(X,axis=0)) if i!=0}

In [None]:

# sns.heatmap([[DataStructs.TanimotoSimilarity(i,j) for i in fps] for j in fps],vmin=0,vmax=1,cmap=sns.cm.rocket_r)
# plt.show()

In [None]:
# Draw.MolsToGridImage(mols,molsPerRow=10, subImgSize=(300,300))

In [None]:
# help(Draw.MolsToGridImage)

In [None]:
stats

In [None]:
# # Intersection of the two sets
# intersection=set(['_'.join(i.split('_')[:3]) for i in list(y.index)]) & set(co2check)&set(moncheck)
# X=genpaddedCMs(intersection)
# Y=y.loc[sum([[i for i in y.index if j=='_'.join(i.split('_')[:3])] for j in intersection],[])].to_numpy().reshape(-1)

In [None]:


# ptable = fetch_table('elements').set_index('symbol')
# Z=ptable['atomic_number']

# atomLabels = {v:k for k,v in Z.to_dict().items()}

# Elementlist=list(Z.index)


# def speciescheck(files):
#     return np.unique(np.hstack([np.genfromtxt(m,skip_header=2,dtype=str,usecols=0) for m in files]))

# # species = speciescheck(foldername)
# rcut = 8.0
# nmax = 6
# lmax = 6
# species=speciescheck(monomerfiles)
# # Setting up the SOAP descriptor
# soap = SOAP(
#     species=species,
#     periodic=False,
#     rcut=rcut,
#     nmax=nmax,
#     lmax=lmax,
#     sigma=1.5
# )

# soaplist=list()



# files=glob('./dipeptides_coordinates/AA_AA_*/final.xyz')
# samples=len(y)
# Y=np.zeros((samples,1))

# for idx,i in enumerate(y.index):
#     nam='_'.join(i.split('_')[:3])
#     filename=f"./dipeptides_coordinates/{nam}/final.xyz"
#     atoms=read(filename)
#     soaplist.append(normalize(soap.create(atoms)))
#         #soaplist.append(normalize(soap.create(atoms)))
#     Y[idx,:]=y.loc[i].values
    
    

# re = REMatchKernel(metric="rbf", gamma=2, alpha=1.2, threshold=1e-8, normalize_kernel=False)
# X = re.create(soaplist)

In [None]:
# Multitarget classification
# X: one flattened VariancePersist vector (pixelsx*pixelsy values) per row of df.
# Y: the binarised site columns of that row.
pixelsx=150
pixelsy=150
spread=.01
Max=2.5



sitelist=['A_carboxy', 'A_amine', 'B_carboxy', 'B_amine', 'A_link','B_link', 'A_side', 'B_side']
samples=len(df)
X=np.zeros((samples,pixelsx*pixelsy))
Y=np.zeros((samples,len(sitelist)))
# .copy() so the binarisation below writes to an independent frame instead of
# a slice of df (avoids SettingWithCopyWarning / Copy-on-Write surprises).
inter_df=df[sitelist].copy()
inter_df[inter_df!=0]=1

missing_files=[]
for idx,(k,v) in enumerate(df['label'].to_dict().items()):
    # Directory name = first three underscore-separated tokens of the label.
    nam='_'.join(v.split('_')[:3])
    filename=f"./dipeptides_coordinates/{nam}/final.xyz"
    if os.path.exists(os.path.join(os.getcwd(),filename)):
        X[idx,:]=VariancePersist(Filename = filename, pixelx=pixelsx, pixely=pixelsy, myspread=spread, myspecs={"maxBD": Max, "minBD":-0.1}, showplot=False)
        Y[idx,:]=inter_df.loc[k].values
    else:
        missing_files.append(filename)

# Rows are kept (so X/Y stay aligned with df) but the gap is made visible.
if missing_files:
    print(f"{len(missing_files)} structure file(s) not found; their rows of X and Y stay all-zero: {missing_files}")

In [None]:
# 70/30 split with a fixed seed; shapes are printed before and after.
print(X.shape, Y.shape)
split_parts = train_test_split(X, Y, random_state=42, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts
print(y_train.shape, y_test.shape)

In [None]:
from sklearn.linear_model import RidgeClassifier
# Seven alpha values, log-spaced from 1e-6 to 1e6.
parameters = {'alpha':np.logspace(-6,6,7)}
# NOTE(review): 'r2' is a regression metric, yet the estimator is a classifier
# and the next cell reports model.score; confirm which metric should drive the
# choice of alpha.
GridSearch = GridSearchCV(RidgeClassifier(),param_grid=parameters,cv=5,verbose=0,n_jobs=-1,scoring='r2').fit(X_train,y_train)
# With the default refit=True, best_estimator_ has already been retrained on
# all of (X_train, y_train); fitting it again was redundant.
model=GridSearch.best_estimator_

# model=RidgeClassifier(alpha=1e-3).fit(X_train,y_train)

In [None]:
# (train, test) values of model.score for the fitted estimator.
model.score(X_train,y_train),model.score(X_test,y_test)

In [None]:
from sklearn.decomposition import PCA

# Project the label matrix Y onto its first two principal components.
pca = PCA(n_components=2)
Y_pca = pca.fit_transform(Y)

In [None]:
from sklearn.cluster import KMeans

# Cluster the 2-D projection into n_clusters groups (seeded).
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(Y_pca)

In [None]:
# Projected coordinates ('x', 'y') with the cluster label of each row ('l').
pca_df = pd.DataFrame(
    {
        'x': Y_pca[:, 0],
        'y': Y_pca[:, 1],
        'l': kmeans.labels_,
    }
)

In [None]:
# Sorted distinct cluster labels assigned by kmeans.
unique_clusters=np.unique(kmeans.labels_)

In [None]:
# Y with the cluster label appended as an extra last column.
clusY = np.column_stack([Y, kmeans.labels_])

In [None]:
# Distinct rows of Y (axis=0 compares whole rows).
uniqueY=np.unique(Y,axis=0)

In [None]:
# For every distinct label row: the positions in Y that match it ('idx') and
# the site names whose entry is non-zero ('inter').
site_names = np.array(sitelist)
groupuniq = {}
for idun, uY in enumerate(uniqueY):
    matching_rows = [ridx for ridx, row in enumerate(Y) if (uY == row).all()]
    groupuniq[idun] = {'idx': matching_rows, 'inter': site_names[uY.astype(bool)]}

In [None]:
# group id -> (number of member rows, site names), inserted in order of
# increasing member count.
cnt_inter = {}
for group_id, group in sorted(groupuniq.items(), key=lambda item: len(item[1]['idx'])):
    cnt_inter[group_id] = (len(group['idx']), group['inter'])

In [None]:
# Distinct group sizes that occur in cnt_inter.
np.unique(np.array([size for size, _sites in cnt_inter.values()]).astype(int))

In [None]:
# Distinct numbers of sites per group.
{len(entry[1]) for entry in sorted(cnt_inter.values(), key=lambda e: len(e[1]))}

In [None]:
# One figure per site count (1, 2, 3): an energy KDE for every group that has
# exactly that many sites, labelled with the site names.
for n_sites in (1, 2, 3):
    ordered_groups = sorted(cnt_inter.items(), key=lambda item: len(item[1][1]))
    for idx, (cnt, inter) in ordered_groups:
        if len(inter) != n_sites:
            continue
        group_energies = df['Interaction_Energy'].iloc[groupuniq[idx]['idx']]
        sns.kdeplot(group_energies, label=inter, warn_singular=False)
    plt.legend()
    plt.show()

In [None]:
# cnt_inter=pd.DataFrame(np.zeros((len(sitelist),len(sitelist))),index=sitelist,columns=sitelist)

In [None]:
cnt_inter

In [None]:
# Energy KDE per cluster; rows are selected by the label in clusY's last column.
cluster_column = clusY[:, -1]
for c in range(n_clusters):
    member_rows = np.flatnonzero(cluster_column == c)
    sns.kdeplot(df['Interaction_Energy'].iloc[member_rows], label=c)
plt.legend()
plt.tight_layout()
plt.savefig(f'cluster_n{n_clusters}_kde.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# For each cluster, print the site names of every distinct label row it
# contains, followed by the cluster id and the shape of that set of rows.
site_labels = np.array(sitelist)
for i in unique_clusters:
    uni = np.unique(clusY[clusY[:, -1] == i], axis=0)
    for j in uni:
        # The last column is the cluster label, so it is dropped before masking.
        print(site_labels[j[:-1].astype(bool)])
    # print(uni)
    print(i, uni.shape)

In [None]:
# Scatter of the 2-D projection coloured by cluster label.
cluster_palette = sns.color_palette('tab10', n_clusters)
sns.scatterplot(data=pca_df, x='x', y='y', hue='l', palette=cluster_palette)
plt.xlim(-1, 1.75)
plt.savefig(f'cluster_n{n_clusters}_cluster.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Single-target regression: VariancePersist features -> Interaction_Energy
from sklearn.model_selection import cross_val_score

pixelsx=50
pixelsy=50
spread=.01
Max=2.5



samples=len(df)
X=np.zeros((samples,pixelsx*pixelsy))
Y=np.zeros((samples,1))

missing_files=[]
for idx,(k,v) in enumerate(df['label'].to_dict().items()):
    # Directory name = first three underscore-separated tokens of the label.
    nam='_'.join(v.split('_')[:3])
    filename=f"./dipeptides_coordinates/{nam}/final.xyz"
    if os.path.exists(os.path.join(os.getcwd(),filename)):
        perX=VariancePersist(Filename = filename, pixelx=pixelsx, pixely=pixelsy, myspread=spread, myspecs={"maxBD": Max, "minBD":-0.1}, showplot=False)
        X[idx,:]=perX
        # X[idx,:]=np.concatenate([perX,inter_df.loc[k].values])
        Y[idx,:]=df['Interaction_Energy'].loc[k]
    else:
        missing_files.append(filename)

# Rows are kept (so X/Y stay aligned with df) but the gap is made visible:
# an all-zero row would otherwise be trained on as a real sample.
if missing_files:
    print(f"{len(missing_files)} structure file(s) not found; their rows of X and Y stay all-zero: {missing_files}")

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=34)
print(X_train.shape,X_test.shape)



model=XGBRegressor(device='cpu',reg_lambda=1e-3,reg_alpha=1e-3,max_depth=10,min_child_weight=10)
# 4-fold CV on the full data; for a regressor the default score is R2.
scores = cross_val_score(model, X, Y, cv=4)
print(scores)
print("Mean R2 of %0.4f with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

model.fit(X_train,y_train)
y_pred_train=model.predict(X_train)
y_pred_test=model.predict(X_test)
# RMSE via sqrt(MSE): the `squared=False` keyword is gone from recent
# scikit-learn releases, while this form works on every version.
rmse_train=np.sqrt(mean_squared_error(y_train,y_pred_train))
rmse_test=np.sqrt(mean_squared_error(y_test,y_pred_test))
print(f"R2: {r2_score(y_train,y_pred_train):.4f},{r2_score(y_test,y_pred_test):.4f}")
print(f"RMSE (kcal/mol): {rmse_train:.4f},{rmse_test:.4f}")
plt.scatter(y_train,y_pred_train)
plt.scatter(y_test,y_pred_test)
plt.show()

In [None]:
# Site values of the 'Alanine-Alanine' row as an array.
inter_df.loc['Alanine-Alanine'].to_numpy()

In [None]:
# Three groups built from AAs x AAs:
#   AA - both names equal;
#   AB - name i comes later in AAs than name j, keyed 'i-j';
#   BA - the same pair in the opposite order, keyed 'j-i'.
# For each group: a dict of {'x': site values, 'y': energy}, the list of
# energies, and the list of VariancePersist outputs.
AB, BA, AA = {}, {}, {}
AB_y, BA_y, AA_y = [], [], []
AB_PI, BA_PI, AA_PI = [], [], []

label_of = df['label'].to_dict()


def _site_record(pair_name):
    """Site values and interaction energy stored for `pair_name`."""
    return {'x': inter_df.loc[pair_name].values, 'y': df['Interaction_Energy'].loc[pair_name]}


def _pi_features(pair_name):
    """VariancePersist output for the final.xyz stored under this pair's label."""
    return VariancePersist(Filename = f"./dipeptides_coordinates/{label_of[pair_name]}/final.xyz", pixelx=pixelsx, pixely=pixelsy, myspread=spread, myspecs={"maxBD": Max, "minBD":-0.1}, showplot=False)


for idxi, i in enumerate(AAs):
    for idxj, j in enumerate(AAs):
        if i == j:
            same = f'{i}-{j}'
            AA[same] = _site_record(same)
            AA_y.append(df['Interaction_Energy'].loc[same])
            AA_PI.append(_pi_features(same))
        elif idxi >= idxj:
            forward, reverse = f'{i}-{j}', f'{j}-{i}'
            AB[forward] = _site_record(forward)
            BA[reverse] = _site_record(reverse)
            AB_y.append(df['Interaction_Energy'].loc[forward])
            BA_y.append(df['Interaction_Energy'].loc[reverse])
            AB_PI.append(_pi_features(forward))
            BA_PI.append(_pi_features(reverse))
            

In [None]:
# One row per pair: the site values followed by the energy as the last column.
AB_dat, BA_dat, AA_dat = (
    np.vstack([np.hstack([record['x'], record['y']]) for record in group.values()])
    for group in (AB, BA, AA)
)

In [None]:
# Shapes of the stacked VariancePersist outputs for AA, AB and BA.
tuple(np.vstack(group).shape for group in (AA_PI, AB_PI, BA_PI))

In [None]:
def try_(X,Y):
    """Fit an XGBRegressor on a 70/30 split of (X, Y) and report the fit.

    Prints the array shapes, train/test R2 and RMSE, the model's own ``score``
    on both splits and the 4-fold cross-validation scores over the full data,
    then shows a predicted-versus-reference scatter plot.

    Parameters
    ----------
    X : array with one row per sample.
    Y : array with one target per sample; flattened to 1-D before use.

    Returns
    -------
    None
    """
    # Imported locally so the function does not rely on another cell having
    # imported cross_val_score first.
    from sklearn.model_selection import cross_val_score

    Y=Y.reshape(-1,)
    print(X.shape,Y.shape)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
    print(X_train.shape,X_test.shape)

    model=XGBRegressor(device='cpu',reg_lambda=1e-3,reg_alpha=1e-3,max_depth=10,min_child_weight=10)
    model.fit(X_train,y_train)
    y_pred_train=model.predict(X_train)
    y_pred_test=model.predict(X_test)

    # RMSE via sqrt(MSE): the `squared=False` keyword is gone from recent
    # scikit-learn releases, while this form works on every version.
    rmse_train=np.sqrt(mean_squared_error(y_train,y_pred_train))
    rmse_test=np.sqrt(mean_squared_error(y_test,y_pred_test))
    print(f"R2: {r2_score(y_train,y_pred_train):.4f},{r2_score(y_test,y_pred_test):.4f}")
    print(f"RMSE (kcal/mol): {rmse_train:.4f},{rmse_test:.4f}")
    print(model.score(X_train, y_train),model.score(X_test, y_test))

    # For a regressor the default cross-validation score is R2, not accuracy.
    scores = cross_val_score(model, X, Y, cv=4)
    print(scores)
    print("Mean R2 of %0.4f with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

    plt.scatter(y_train,y_pred_train)
    # Identity line through the training targets, for reference.
    plt.plot(y_train,y_train)
    plt.scatter(y_test,y_pred_test)
    plt.show()

In [None]:
# AA group: VariancePersist outputs as features, energies as target.
try_(np.vstack(AA_PI),np.array(AA_y))

In [None]:
# AB group: VariancePersist outputs as features, energies as target.
try_(np.vstack(AB_PI),np.array(AB_y))

In [None]:
# BA group: VariancePersist outputs as features, energies as target.
try_(np.vstack(BA_PI),np.array(BA_y))

In [None]:
# AA group: site columns as features, last column (energy) as target.
try_(AA_dat[:,:-1],AA_dat[:,-1])

In [None]:
# AB group: site columns as features, last column (energy) as target.
try_(AB_dat[:,:-1],AB_dat[:,-1])

In [None]:
# BA group: site columns as features, last column (energy) as target.
try_(BA_dat[:,:-1],BA_dat[:,-1])