In [27]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.5


In [33]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Descriptors

from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score

In [2]:
# Lookup table: one-letter amino-acid code -> SMILES string of the free amino acid.
monomer_smiles = dict(
    # The 20 standard proteinogenic amino acids
    A='CC(C(=O)O)N',                        # alanine
    R='C(CC(C(=O)O)N)CN=C(N)N',             # arginine
    N='C(C(C(=O)O)N)C(=O)N',                # asparagine
    D='C(C(C(=O)O)N)C(=O)O',                # aspartic acid
    C='C(C(C(=O)O)N)S',                     # cysteine
    Q='C(CC(=O)N)C(C(=O)O)N',               # glutamine
    E='C(CC(=O)O)C(C(=O)O)N',               # glutamic acid
    G='C(C(=O)O)N',                         # glycine
    H='C1=C(NC=N1)CC(C(=O)O)N',             # histidine
    I='CCC(C)C(C(=O)O)N',                   # isoleucine
    L='CC(C)CC(C(=O)O)N',                   # leucine
    K='C(CCN)CC(C(=O)O)N',                  # lysine
    M='CSCCC(C(=O)O)N',                     # methionine
    F='C1=CC=C(C=C1)CC(C(=O)O)N',           # phenylalanine
    P='C1CC(NC1)C(=O)O',                    # proline
    S='C(C(C(=O)O)N)O',                     # serine
    T='CC(C(C(=O)O)N)O',                    # threonine
    W='C1=CC=C2C(=C1)C(=CN2)CC(C(=O)O)N',   # tryptophan
    Y='C1=CC(=CC=C1CC(C(=O)O)N)O',          # tyrosine
    V='CC(C)C(C(=O)O)N',                    # valine
    # Non-standard amino acids
    O='CC1CC=NC1C(=O)NCCCCC(C(=O)O)N',      # pyrrolysine
    # NOTE(review): the bracket atom `[Se]` specifies no hydrogens, i.e. it is
    # not a selenol `[SeH]` -- confirm this is intended.
    U='C(C(C(=O)O)N)[Se]',                  # selenocysteine
)

In [20]:
# Load the per-amino-acid interaction energies (columns: Aminoacid plus the
# Ca/Mg/Ba interaction energies). The path is relative to the working directory.
df = pd.read_csv('energy_set.csv')
df

Unnamed: 0,Aminoacid,Ca interaction energy,Mg interaction energy,Ba interaction energy
0,A,-18544.0042,-5440.008381,-225137.251898
1,R,-18545.713481,-5442.382717,-225138.576911
2,N,-18545.245935,-5441.559292,-225138.145427
3,D,-18552.439422,-5449.157982,-225145.101781
4,C,-18544.412396,-5440.689088,-225137.41113
5,Q,-18545.571978,-5442.1506,-225138.410463
6,E,-18552.407734,-5446.024302,-225145.090525
7,G,-18543.524531,-5439.394483,-225136.789473
8,H,-18544.820821,-5440.731788,-225138.067971
9,I,-18544.540097,-5441.150706,-225137.559706


In [21]:
# Attach a SMILES string to every row by looking up its one-letter code;
# a code that is missing from `monomer_smiles` would end up as NaN.
df['smiles'] = df['Aminoacid'].map(monomer_smiles)

In [22]:
# Inspect the frame with its 'smiles' column.
df


Unnamed: 0,Aminoacid,Ca interaction energy,Mg interaction energy,Ba interaction energy,smiles
0,A,-18544.0042,-5440.008381,-225137.251898,CC(C(=O)O)N
1,R,-18545.713481,-5442.382717,-225138.576911,C(CC(C(=O)O)N)CN=C(N)N
2,N,-18545.245935,-5441.559292,-225138.145427,C(C(C(=O)O)N)C(=O)N
3,D,-18552.439422,-5449.157982,-225145.101781,C(C(C(=O)O)N)C(=O)O
4,C,-18544.412396,-5440.689088,-225137.41113,C(C(C(=O)O)N)S
5,Q,-18545.571978,-5442.1506,-225138.410463,C(CC(=O)N)C(C(=O)O)N
6,E,-18552.407734,-5446.024302,-225145.090525,C(CC(=O)O)C(C(=O)O)N
7,G,-18543.524531,-5439.394483,-225136.789473,C(C(=O)O)N
8,H,-18544.820821,-5440.731788,-225138.067971,C1=C(NC=N1)CC(C(=O)O)N
9,I,-18544.540097,-5441.150706,-225137.559706,CCC(C)C(C(=O)O)N


In [23]:
# Rows for the two extra amino acids (O and U) whose energies are not known yet.
# The placeholders are np.nan rather than None so the three energy columns are
# float64 instead of object; the later row-wise concat then does not depend on
# how pandas treats all-NA object columns when choosing the result dtype.
additional_aa = [
    ['O', np.nan, np.nan, np.nan, 'CC1CC=NC1C(=O)NCCCCC(C(=O)O)N'],
    ['U', np.nan, np.nan, np.nan, 'C(C(C(=O)O)N)[Se]'],
]
# Name the columns at construction time instead of renaming 0..4 afterwards.
df_new = pd.DataFrame(
    additional_aa,
    columns=['Aminoacid', 'Ca interaction energy', 'Mg interaction energy', 'Ba interaction energy', 'smiles'],
)
df_new

Unnamed: 0,Aminoacid,Ca interaction energy,Mg interaction energy,Ba interaction energy,smiles
0,O,,,,CC1CC=NC1C(=O)NCCCCC(C(=O)O)N
1,U,,,,C(C(C(=O)O)N)[Se]


In [24]:
# Stack the extra amino-acid rows underneath the rows loaded from the CSV.
# Row-wise concatenation is the default; the original index labels are kept.
data = pd.concat((df, df_new))
data

Unnamed: 0,Aminoacid,Ca interaction energy,Mg interaction energy,Ba interaction energy,smiles
0,A,-18544.0042,-5440.008381,-225137.251898,CC(C(=O)O)N
1,R,-18545.713481,-5442.382717,-225138.576911,C(CC(C(=O)O)N)CN=C(N)N
2,N,-18545.245935,-5441.559292,-225138.145427,C(C(C(=O)O)N)C(=O)N
3,D,-18552.439422,-5449.157982,-225145.101781,C(C(C(=O)O)N)C(=O)O
4,C,-18544.412396,-5440.689088,-225137.41113,C(C(C(=O)O)N)S
5,Q,-18545.571978,-5442.1506,-225138.410463,C(CC(=O)N)C(C(=O)O)N
6,E,-18552.407734,-5446.024302,-225145.090525,C(CC(=O)O)C(C(=O)O)N
7,G,-18543.524531,-5439.394483,-225136.789473,C(C(=O)O)N
8,H,-18544.820821,-5440.731788,-225138.067971,C1=C(NC=N1)CC(C(=O)O)N
9,I,-18544.540097,-5441.150706,-225137.559706,CCC(C)C(C(=O)O)N


In [25]:
# Re-derive the SMILES column from the lookup table and renumber the rows
# 0..n-1, since concatenating the two frames left duplicate index labels.
data = (
    data
    .assign(smiles=data['Aminoacid'].map(monomer_smiles))
    .reset_index(drop=True)
)
data

Unnamed: 0,Aminoacid,Ca interaction energy,Mg interaction energy,Ba interaction energy,smiles
0,A,-18544.0042,-5440.008381,-225137.251898,CC(C(=O)O)N
1,R,-18545.713481,-5442.382717,-225138.576911,C(CC(C(=O)O)N)CN=C(N)N
2,N,-18545.245935,-5441.559292,-225138.145427,C(C(C(=O)O)N)C(=O)N
3,D,-18552.439422,-5449.157982,-225145.101781,C(C(C(=O)O)N)C(=O)O
4,C,-18544.412396,-5440.689088,-225137.41113,C(C(C(=O)O)N)S
5,Q,-18545.571978,-5442.1506,-225138.410463,C(CC(=O)N)C(C(=O)O)N
6,E,-18552.407734,-5446.024302,-225145.090525,C(CC(=O)O)C(C(=O)O)N
7,G,-18543.524531,-5439.394483,-225136.789473,C(C(=O)O)N
8,H,-18544.820821,-5440.731788,-225138.067971,C1=C(NC=N1)CC(C(=O)O)N
9,I,-18544.540097,-5441.150706,-225137.559706,CCC(C)C(C(=O)O)N


# Adding rdkit descriptors
To fill in the missing data we want to use ML algorithms, so we need more descriptors to measure the distance between data points. That's why we are adding new descriptors here; later we will drop them.

In [30]:
descriptor_names = list(rdMolDescriptors.Properties.GetAvailableProperties())
get_descriptors = rdMolDescriptors.Properties(descriptor_names)
num_descriptors = len(descriptor_names)

# Collect exactly one descriptor row per row of `data` and stack them once at
# the end (np.append in a loop re-allocates the whole matrix on every step).
descriptor_rows = []
for smiles in data['smiles']:
    # Non-string entries (e.g. NaN from an unmapped amino-acid code) cannot be parsed.
    molecule = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None

    if molecule is not None:
        descriptor_rows.append(list(get_descriptors.ComputeProperties(molecule)))
    else:
        # Keep a NaN placeholder: skipping the row would leave fewer descriptor
        # rows than data rows, and the column-wise concat below would then pair
        # descriptors with the wrong amino acids.
        descriptor_rows.append([np.nan] * num_descriptors)

# The reshape keeps the (0, num_descriptors) shape when `data` has no rows.
descriptors_set = np.array(descriptor_rows, dtype=float).reshape((-1, num_descriptors))

# Creating DataFrame with descriptors, indexed like `data` so rows line up on concat
df_descriptors = pd.DataFrame(descriptors_set, columns=descriptor_names, index=data.index)

# Drop descriptor columns left over from a previous run of this cell so that
# re-running it does not append a second copy of every descriptor.
data = pd.concat([data.drop(columns=descriptor_names, errors='ignore'), df_descriptors], axis=1)

In [31]:
# Inspect the frame with the RDKit descriptor columns appended.
data

Unnamed: 0,Aminoacid,Ca interaction energy,Mg interaction energy,Ba interaction energy,smiles,exactmw,amw,lipinskiHBA,lipinskiHBD,NumRotatableBonds,...,chi0n,chi1n,chi2n,chi3n,chi4n,hallKierAlpha,kappa1,kappa2,kappa3,Phi
0,A,-18544.0042,-5440.008381,-225137.251898,CC(C(=O)O)N,89.047678,89.094,3.0,3.0,1.0,...,3.510162,1.62709,0.389528,0.389528,0.0,-0.57,5.43,1.767634,1.721545,1.599709
1,R,-18545.713481,-5442.382717,-225138.576911,C(CC(C(=O)O)N)CN=C(N)N,174.111676,174.204,6.0,7.0,5.0,...,6.733397,3.575173,1.284817,1.284817,0.642832,-1.3,10.7,5.363379,5.48223,4.782347
2,N,-18545.245935,-5441.559292,-225138.145427,C(C(C(=O)O)N)C(=O)N,132.053492,132.119,5.0,5.0,3.0,...,4.702868,2.30434,0.738295,0.738295,0.304056,-1.1,7.9,3.032307,3.47971,2.661691
3,D,-18552.439422,-5449.157982,-225145.101781,C(C(C(=O)O)N)C(=O)O,133.037508,133.103,5.0,4.0,3.0,...,4.572731,2.239272,0.711731,0.711731,0.275438,-1.1,7.9,3.032307,3.47971,2.661691
4,C,-18544.412396,-5440.689088,-225137.41113,C(C(C(=O)O)N)S,121.019749,121.161,3.0,3.0,2.0,...,3.664483,1.774215,0.513894,0.513894,0.078093,-0.22,6.78,2.872925,2.472042,2.782633
5,Q,-18545.571978,-5442.1506,-225138.410463,C(CC(=O)N)C(C(=O)O)N,146.069142,146.146,5.0,5.0,4.0,...,5.409975,2.80434,1.018939,1.018939,0.421236,-1.1,8.9,3.837557,3.848566,3.415426
6,E,-18552.407734,-5446.024302,-225145.090525,C(CC(=O)O)C(C(=O)O)N,147.053158,147.13,5.0,4.0,4.0,...,5.279838,2.739272,0.986405,0.986405,0.402453,-1.1,8.9,3.837557,3.848566,3.415426
7,G,-18543.524531,-5439.394483,-225136.789473,C(C(=O)O)N,75.032028,75.067,3.0,3.0,1.0,...,2.639919,1.189533,0.17462,0.17462,0.0,-0.57,4.43,1.721545,3.43,1.525289
8,H,-18544.820821,-5440.731788,-225138.067971,C1=C(NC=N1)CC(C(=O)O)N,155.069477,155.157,5.0,4.0,3.0,...,5.819183,3.155289,1.311877,1.311877,0.720551,-1.36,7.743734,3.156504,2.047487,2.222102
9,I,-18544.540097,-5441.150706,-225137.559706,CCC(C)C(C(=O)O)N,131.094629,131.175,3.0,3.0,3.0,...,5.794619,3.075778,1.542155,1.542155,0.497328,-0.57,8.43,3.454517,2.463571,3.235731


# Filling 'Ca interaction energy' column

In [34]:
# Define the descriptors (every column that is neither an identifier nor an energy)
# and the target column
descriptors = data.columns.difference(['Aminoacid', 'smiles', 'Ca interaction energy', 'Mg interaction energy', 'Ba interaction energy'])
target_column = 'Ca interaction energy'

# Split on whether the target value is known, instead of assuming that the
# gaps are exactly the last two rows
missing_target = data[target_column].isna()
train_data = data.loc[~missing_target]
test_data = data.loc[missing_target]

# Nothing to impute when the column is already complete (e.g. the cell is run twice)
if missing_target.any():
    # Create a random forest model to predict missing values; the seed is
    # fixed so the imputed numbers are the same on every run
    model = RandomForestRegressor(random_state=0)

    # Train the model on data without missing values in the target column
    model.fit(train_data[descriptors], train_data[target_column])

    # Predict the missing values
    predicted_values = model.predict(test_data[descriptors])

    # Fill in the missing values in the original dataset
    data.loc[missing_target, target_column] = predicted_values

# Show the updated dataset (rich display instead of print)
data

   Aminoacid  Ca interaction energy  Mg interaction energy  \
0          A          -18544.004200           -5440.008381   
1          R          -18545.713481           -5442.382717   
2          N          -18545.245935           -5441.559292   
3          D          -18552.439422           -5449.157982   
4          C          -18544.412396           -5440.689088   
5          Q          -18545.571978           -5442.150600   
6          E          -18552.407734           -5446.024302   
7          G          -18543.524531           -5439.394483   
8          H          -18544.820821           -5440.731788   
9          I          -18544.540097           -5441.150706   
10         L          -18544.416363           -5441.044820   
11         K          -18544.677974           -5441.118804   
12         M          -18544.760238           -5441.086101   
13         F          -18544.994381           -5441.389603   
14         P          -18544.117336           -5440.577089   
15      

# Filling 'Mg interaction energy' column

In [36]:
# Define the descriptors (every column that is neither an identifier nor an energy)
# and the target column
descriptors = data.columns.difference(['Aminoacid', 'smiles', 'Ca interaction energy', 'Mg interaction energy', 'Ba interaction energy'])
target_column = 'Mg interaction energy'

# Split on whether the target value is known, instead of assuming that the
# gaps are exactly the last two rows
missing_target = data[target_column].isna()
train_data = data.loc[~missing_target]
test_data = data.loc[missing_target]

# Nothing to impute when the column is already complete (e.g. the cell is run twice)
if missing_target.any():
    # Create a random forest model to predict missing values; the seed is
    # fixed so the imputed numbers are the same on every run
    model = RandomForestRegressor(random_state=0)

    # Train the model on data without missing values in the target column
    model.fit(train_data[descriptors], train_data[target_column])

    # Predict the missing values
    predicted_values = model.predict(test_data[descriptors])

    # Fill in the missing values in the original dataset
    data.loc[missing_target, target_column] = predicted_values

# Show the updated dataset (rich display instead of print)
data

   Aminoacid  Ca interaction energy  Mg interaction energy  \
0          A          -18544.004200           -5440.008381   
1          R          -18545.713481           -5442.382717   
2          N          -18545.245935           -5441.559292   
3          D          -18552.439422           -5449.157982   
4          C          -18544.412396           -5440.689088   
5          Q          -18545.571978           -5442.150600   
6          E          -18552.407734           -5446.024302   
7          G          -18543.524531           -5439.394483   
8          H          -18544.820821           -5440.731788   
9          I          -18544.540097           -5441.150706   
10         L          -18544.416363           -5441.044820   
11         K          -18544.677974           -5441.118804   
12         M          -18544.760238           -5441.086101   
13         F          -18544.994381           -5441.389603   
14         P          -18544.117336           -5440.577089   
15      

# Filling 'Ba interaction energy' column

In [37]:
# Define the descriptors (every column that is neither an identifier nor an energy)
# and the target column
descriptors = data.columns.difference(['Aminoacid', 'smiles', 'Ca interaction energy', 'Mg interaction energy', 'Ba interaction energy'])
target_column = 'Ba interaction energy'

# Split on whether the target value is known, instead of assuming that the
# gaps are exactly the last two rows
missing_target = data[target_column].isna()
train_data = data.loc[~missing_target]
test_data = data.loc[missing_target]

# Nothing to impute when the column is already complete (e.g. the cell is run twice)
if missing_target.any():
    # Create a random forest model to predict missing values; the seed is
    # fixed so the imputed numbers are the same on every run
    model = RandomForestRegressor(random_state=0)

    # Train the model on data without missing values in the target column
    model.fit(train_data[descriptors], train_data[target_column])

    # Predict the missing values
    predicted_values = model.predict(test_data[descriptors])

    # Fill in the missing values in the original dataset
    data.loc[missing_target, target_column] = predicted_values

# Show the updated dataset (rich display instead of print)
data

   Aminoacid  Ca interaction energy  Mg interaction energy  \
0          A          -18544.004200           -5440.008381   
1          R          -18545.713481           -5442.382717   
2          N          -18545.245935           -5441.559292   
3          D          -18552.439422           -5449.157982   
4          C          -18544.412396           -5440.689088   
5          Q          -18545.571978           -5442.150600   
6          E          -18552.407734           -5446.024302   
7          G          -18543.524531           -5439.394483   
8          H          -18544.820821           -5440.731788   
9          I          -18544.540097           -5441.150706   
10         L          -18544.416363           -5441.044820   
11         K          -18544.677974           -5441.118804   
12         M          -18544.760238           -5441.086101   
13         F          -18544.994381           -5441.389603   
14         P          -18544.117336           -5440.577089   
15      

In [38]:
# Keep only the amino-acid code and the three energy columns; the SMILES and
# descriptor columns are not carried into the final table.
# Columns are selected by name (not by position) so the result stays correct if
# the column order of `data` changes, and copied so that `final_df` is
# independent of `data`.
final_df = data[['Aminoacid', 'Ca interaction energy', 'Mg interaction energy', 'Ba interaction energy']].copy()
final_df

Unnamed: 0,Aminoacid,Ca interaction energy,Mg interaction energy,Ba interaction energy
0,A,-18544.0042,-5440.008381,-225137.251898
1,R,-18545.713481,-5442.382717,-225138.576911
2,N,-18545.245935,-5441.559292,-225138.145427
3,D,-18552.439422,-5449.157982,-225145.101781
4,C,-18544.412396,-5440.689088,-225137.41113
5,Q,-18545.571978,-5442.1506,-225138.410463
6,E,-18552.407734,-5446.024302,-225145.090525
7,G,-18543.524531,-5439.394483,-225136.789473
8,H,-18544.820821,-5440.731788,-225138.067971
9,I,-18544.540097,-5441.150706,-225137.559706


In [39]:
# Write the final table to CSV; index=False leaves the row index out of the file.
final_df.to_csv('final_energy_set.csv', index=False)