In [1]:
import pandas as pd

In [2]:
! pip install rdkit -q

In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
import pandas as pd
from catboost import CatBoostRegressor


from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, rdMolDescriptors
import pandas as pd
from catboost import CatBoostRegressor

class CatLgKPredictor:
    
    def __init__(self, model_path) -> None:
        self.reg = CatBoostRegressor().load_model(model_path)

    def calculate_descriptors_and_fingerprints(self, mol):
        if not mol:
            descriptors = {desc[0]: None for desc in Descriptors.descList}
            fingerprints = [None] * 1024
        else:
            descriptors = {desc_name: desc_func(mol) for desc_name, desc_func in Descriptors.descList}
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
            fingerprints = list(map(int, fp))
        return {**descriptors, "fingerprints": fingerprints}
        
    def preprocess_data(self, data: pd.DataFrame):
        data = data.copy()
        # Convert SMILES to molecules once
        data['Molecules'] = data['smiles'].apply(Chem.MolFromSmiles)
        
        # Calculate descriptors and fingerprints together
        descriptors_list = data['Molecules'].apply(self.calculate_descriptors_and_fingerprints).tolist()
        
        # Split fingerprints from descriptors for DataFrame creation
        descriptors_df = pd.DataFrame([d for d in descriptors_list])
        fingerprints_df = pd.DataFrame(descriptors_df.pop('fingerprints').tolist())
        
        # Combine everything
        combined_data = pd.concat([data, descriptors_df, fingerprints_df], axis=1).drop(columns=["smiles", "Molecules"])
        
        return combined_data
        
    def predict_lgK(self, data: pd.DataFrame):
        prep_data = self.preprocess_data(data=data)
    
        preds = self.reg.predict(prep_data)
        
        return preds



In [37]:
df = pd.read_csv('/kaggle/input/afadadfs/data_to_razduplit_1.csv')
df.head()

Unnamed: 0,smiles,lgK
0,O=P(O)(O)CN(Cc1cccc(CN(CP(=O)(O)O)CP(=O)(O)O)n...,29.016023
1,O=C(O)N1CCN(CP(=O)(O)O)CCN(C(=O)O)CCN(C(=O)O)CC1,28.399766
2,N=P(O)(O)CN1CCN(C(=O)O)CCN(C(=O)O)CCN(C(=O)O)CC1,27.039173
3,O=C(O)N1CCN(CS(O)(O)O)CCN(C(=O)O)CCN(C(=O)O)CC1,26.844821
4,O=C(O)N1CCN(CP(=O)(O)CO)CCN(C(=O)O)CCN(C(=O)O)CC1,26.655937


In [38]:
df.sort_values(by='lgK', ascending=False).iloc[:15]['smiles'].iloc[0]

'O=P(O)(O)CN(Cc1cccc(CN(CP(=O)(O)O)CP(=O)(O)O)n1)CP(=O)(O)O'

In [39]:
! pip install selfies -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [40]:
import selfies as sf
sf.encoder('O=P(O)(O)CN(Cc1cccc(CN(CP(=O)(O)O)CP(=O)(O)O)n1)CP(=O)(O)O')

'[O][=P][Branch1][C][O][Branch1][C][O][C][N][Branch2][Ring2][Ring2][C][C][=C][C][=C][C][Branch2][Ring1][#Branch1][C][N][Branch1][#Branch2][C][P][=Branch1][C][=O][Branch1][C][O][O][C][P][=Branch1][C][=O][Branch1][C][O][O][=N][Ring2][Ring1][C][C][P][=Branch1][C][=O][Branch1][C][O][O]'

In [41]:
! pip install selfies accelerate bitsandbytes -q

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [42]:
import selfies as sf

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("zjunlp/MolGen-large-opt")
model = AutoModelForSeq2SeqLM.from_pretrained("zjunlp/MolGen-large-opt")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [43]:
chelates = ['CC(=O)NC1=C2SSC=C2NC1=O', 'N[C@@H](CCN[C@@H](CCN1CC[C@H]1C(O)=O)C(O)=O)C(O)=O', 'OC(CS)C(O)CS','[C@@]1(OC=2C=C(O)C=C(O)C2C[C@H]1O)([H])C3=CC(=O)C(=C4C(=C3)C([C@]5(OC=6C=C(O)C=C(O)C6C[C@H]5O)[H])=CC(O)=C4O)O' 'OC(=O)CC(O)(CC(O)=O)C(O)=O', 'OC[C@@H](O)[C@@H](O)[C@H](O)[C@@H](O)C(O)=O', 'OC(=O)[C@@H](S)[C@@H](S)C(O)=O', 'Oc1c(I)cc(Cl)c2cccnc12', 'O=C(O)CN(CC1)CCN(CC(=O)O)CCN(CC(=O)O)CCN1CC(=O)O', 'OCC(S)CS', 'C1CSCCSCCS1', 'CC(C)(S)[C@@H](N)C(O)=O', 'C1=CN=C(C=C1)C1=CC=CC=N1', 'CCCCC(O)(Cn1cncn1)c1ccc(Cl)cc1Cl', '[H+].[Cl-].C[C@@H](CN1CC(=O)NC(=O)C1)N1CC(=O)NC(=O)C1', 'C(=S)(S)N(CC)CC', 'Cc1cc(-c2ccc(cc2)S(O)(=O)=O)c2ccc3c(cc(C)nc3c2n1)-c1ccc(cc1)S(O)(=O)=O', 'OC(=O)CN(CCOCCOCCN(CC(O)=O)CC(O)=O)CC(O)=O', 'OC(=O)CN(CCN(CC(O)=O)CC(O)=O)CC(O)=O', 'CC(O)(P(O)(O)=O)P(O)(O)=O', 'OC(=O)CNCC(O)=O', 'OP(O)(=O)CP(O)(O)=O', 'N1=C(C=CC2=CC=C3C=CC(=NC3=C12)C)C', 'OC(=O)CN1CCCN(CCN(CCCN(CC1)CC(O)=O)CC(O)=O)CC(O)=O', 'CN1C(=O)C(NC(C)=O)=C2SSC=C12', 'CCCCCCCc1cc(O)c2ccccc2n1']

In [44]:
chelates += list(df.sort_values(by='lgK', ascending=False).iloc[:40]['smiles'].values)
#chelates = list(df.sort_values(by='lgK', ascending=False).iloc[:15]['smiles'].values)

In [45]:
#additional_chelates = ['O=C(O)CN1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC(=O)O)CCN(C(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC(=O)O)CCN(CC(O)O)CCN(CC(=O)O)CC1', 'C=C(O)CN1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC[CH]O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC(=O)O)CCN(C[C+](=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1C[CH]N(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC=NO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN(CC(=O)O)C1CCCCC1N(CC(=O)O)CC(=O)O', 'O=C(O)CN(CC(=O)O)[C@@H]1CCCC[C@H]1N(CC(=O)O)CC(=O)O', 'O=C(O)CN1CCNCCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1[CH]CN(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC=PO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)[CH]N1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1[CH]CN(CC(=O)O)CCN(CCCO)CCN(CC(=O)O)CC1', 'O=C(O)CN1[CH]CN(CC(=O)O)CCN(CC(=O)O)CCN(CCCO)CC1', 'O=C(O)CN1[CH]CN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)[C]N1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CCCO)CCN(C(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)[CH]N1CCN(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CCCO)CCN(CC(=O)O)CCN(C[C+](=O)O)CC1', 'CCC[C@@H](CN(CC(=O)O)CC(=O)O)N(CC(=O)O)CC(=O)O', 'O=C(O)CN1CCN(CCCO)CCN([N]C(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC(=O)O)CCN(CC(=O)CO)CCN(CC(=O)O)CC1', 'O=C(O)CN1[C]CN(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=COCN1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CCCO)CCN(OC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1[C]CN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'CCCCCC(CN(CC(=O)O)CC(=O)O)N(CC(=O)O)CC(=O)O', 'O=C(O)[C]N1CCN(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1[C]CN(CC(=O)O)CCN(CC(=O)O)CCN(CCCO)CC1', 'O=C(O)CN1CCN(CCCO)CCN(NC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN(C[C]1CCCCC1N(CC(=O)O)CC(=O)O)CC(=O)O', 'CC1N(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CCN1CC(=O)O', 'O=C(O)CN1CCN(CCCO)CCN([O+]C(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CCCO)C[N]N(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(C=C(O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'CC=CCCC(CN(CC(=O)O)CC(=O)O)N(CC(=O)O)CC(=O)O', 'O=C(O)CN1CCN(CCCO)CCN(CNC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CNN1CCN(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN(CC(=O)O)[C@@H]1CC=CC[C@H]1N(CC(=O)O)CC(=O)O', 'O=C(O)CN1CCN(CC(=O)O)CCN(CCCO)OCN(CC(=O)O)CC1', 'O=C(O)COCN1CCN(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'CC1N(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CCN1CC(=O)O', 'O=C(O)[CH-]N1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN(CCN(CC(=O)O)CC(=O)O)CC(=O)O', 'O=C(O)CN1C[CH]NCCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCNCCN(NC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)[C-]N1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1']

In [46]:
#additional_chelates += ['O=C(O)CN1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC(=O)O)CCN(C(=O)O)CCN(CC(=O)O)CC1', 'N=C(O)CN1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC(=O)O)CCN(CC(O)O)CCN(CC(=O)O)CC1', 'C=C(O)CN1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC[CH]O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC[N]O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CCNO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1C[CH]N(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC=NO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC[N+]O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC[C]O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN(CC(=O)O)[C@@H]1CCCC[C@H]1N(CC(=O)O)CC(=O)O', 'O=C(O)CN1CCN(CCOO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCNCCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1[CH]CN(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC=PO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC#CO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)[CH]N1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CCSO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1[CH]CN(CC(=O)O)CCN(CCCO)CCN(CC(=O)O)CC1', 'O=C(O)CN1[CH]CN(CC(=O)O)CCN(CC(=O)O)CCN(CCCO)CC1', 'O=C(O)CN1[CH]CN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)[C]N1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1C[C]N(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1C[CH]N(C(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CCCO)CCN(C(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC#[PH]O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)[CH]N1CCN(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC=[S]O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=CNCN1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC[P]O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC[CH-]O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC[SH]O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC[P@H]O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'CCCC(CN(CC(=O)O)CC(=O)O)N(CC(=O)O)CC(=O)O', 'CCC[C@@H](CN(CC(=O)O)CC(=O)O)N(CC(=O)O)CC(=O)O', 'O=C(O)CN1CCN(NC(=O)O)CCN(CCCO)CCN(C(=O)O)CC1', 'O=C(O)CN1CCN(CCCO)CCN([N]C(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CNC(=O)O)CCN(CCCO)CCN(C(=O)O)CC1', 'O=C(O)CN1CCN(CC(=O)O)CCN(CC(=O)CO)CCN(CC(=O)O)CC1', 'O=C(O)CN1[C]CN(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=COCN1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CC=[N+]O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1CCN(CCCO)CCN(OC(=O)O)CCN(CC(=O)O)CC1', 'O=C(O)CN1[C]CN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 'CCCCCC(CN(CC(=O)O)CC(=O)O)N(CC(=O)O)CC(=O)O', 'O=C(O)[C]N1CCN(CCCO)CCN(CC(=O)O)CCN(CC(=O)O)CC1']

In [47]:
#chelates = chelates + list(set(additional_chelates))

In [15]:
len(chelates)

65

N1CCN(CCN(CCN(CCN1CC(=O)O)CC(=O)O)CC(=O)O)CC(=O)O

In [48]:
chelates = df['smiles'].unique()

In [None]:
print(chelates[len(chelates)//2:])
chelates = chelates[:len(chelates)//2]

In [49]:
import numpy as np

mo = CatLgKPredictor('/kaggle/input/catboost-model/mymodel')

preds1 = mo.predict_lgK(pd.DataFrame({'smiles': chelates}))
print(np.mean(preds1), max(preds1))

23.274513912212075 29.016022672675394


In [50]:
chelates = pd.DataFrame({'smiles': chelates, 'lgK': preds1})
chelates = chelates[chelates['lgK'] > 20]['smiles'].tolist()
len(chelates)

261

In [51]:
chelate_selfies = [sf.encoder(smiles) for smiles in chelates]

In [52]:
len(chelate_selfies)

261

In [53]:
from tqdm.auto import tqdm

In [55]:
out = []

model = model.to('cuda')
k = 0

for selfie in tqdm(chelate_selfies):
  print(k)
  k += 1
  sf_input = tokenizer(selfie, return_tensors="pt").to('cuda')
  # beam search
  molecules = model.generate(input_ids=sf_input["input_ids"],
                            attention_mask=sf_input["attention_mask"],
                                max_length=80,
                                min_length=10,
                                num_return_sequences=100,
                                num_beams=210)
  sf_output = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True).replace(" ","") for g in molecules]

  out += sf_output


  0%|          | 0/261 [00:00<?, ?it/s]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260


In [56]:
smiles_out = [sf.decoder(selfie) for selfie in out]

In [57]:
sf.decoder(out[1])

'O=P(O)(O)CN(CC1=CC=CC(CN(CP(=O)(O)O)CP(=O)(O)O)=N1)CP(=O)(O)O'

In [58]:
smiles_out = list(set(smiles_out))
len(smiles_out)

20679

In [59]:
sub_df = pd.Series(smiles_out)
sub_df.sample(10)

18729    O=C(O)N1CCN(COCC\[N+1](=O)O)CCN(C(=O)O)CCN(C(=...
4117     O=C(O)CN1CCN(C/[S@@](=O)(O)O)CCN(C(=O)O)CCN(C(...
13209        O=C(O)N1CCN(CC(O)O)C=NN(C(=O)O)CCN(C(=O)O)CC1
16519    CP(=O)(O)CN(CCNC(=O)O)NC1CN(C(=O)O)NCCN(C(=O)O...
4000     O=CC(O)\[C@@]N1CCN(C(=O)O)CCN(C(=O)O)CCN(C(=O)...
8171     NP(=O)(O)NN(CC1=CC=CC(CN(CP(=O)(O)O)CP(=O)(O)O...
17571    O=C(O)NC1CN(CN(CCNCP(=O)(O)O)C(=O)O)CCN(C(=O)O...
20080       O=C(O)NC1CN(NCP(=O)O)OCCN(N(O)O)CCN(C(=O)O)CC1
13584    O=C(O)N1CCN(C\[P@@](=O)O)CCN(C(=O)O)CCN(C(=O)O...
7454      O=C(O)NCP(=O)(O)CNCCN(C(=O)O)CCN(C(=O)O)C(=O)OCC
dtype: object

In [60]:
sub_df.to_csv('./sub.csv', index=False)

In [61]:
sub_df.shape

(20679,)

In [73]:
part_1 = pd.read_csv('/kaggle/input/razdupl-1/pppppooooo (1).csv')

In [62]:
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from rdkit.Chem.Descriptors import ExactMolWt

In [63]:
from collections import Counter
from rdkit.Contrib.SA_Score import sascorer

def check_molecule(smiles):
    # Разрешенные элементы и элементы для подсчета атомов
    allowed_elements = {'C', 'H', 'O', 'N', 'P', 'S'}
    count_elements = {'O', 'N', 'P', 'S'}

    # Счетчик элементов в молекуле
    elements_counter = Counter()
    temp_element = '' # Временная переменная для хранения элементов более одного символа

    for char in smiles:
        if char.isalpha():
            # Проверяем, является ли элемент двухбуквенным (например, Cl) - не применимо к данным элементам, но полезно для общности
            if temp_element:
                # Предыдущий элемент был начат, завершаем его
                elements_counter[temp_element] += 1
                temp_element = ''
            if char.isupper():
                # Начало нового элемента
                elements_counter[char] += 1
            else:
                # Продолжение текущего элемента
                temp_element += char
        elif temp_element:
            # Если был накоплен временный элемент, добавляем его в счетчик
            elements_counter[temp_element] += 1
            temp_element = '' # Сброс временного элемента

    # Проверяем, входит ли последний элемент в счетчик, если цикл завершился на букве
    if temp_element:
        elements_counter[temp_element] += 1

    # Проверка условий
    # 1. Все элементы в молекуле должны быть из разрешенного списка
    if not all(element in allowed_elements for element in elements_counter):
        return False

    # 2. Молекула должна включать не менее трех разных элементов из списка
    if sum(element in allowed_elements for element in elements_counter) < 3:
        return False

    # 3. Суммарно не более 12 атомов элементов O, N, P, S
    if sum(count for element, count in elements_counter.items() if element in count_elements) > 12:
        return False
    if smiles in df["smiles"].tolist():
      return False

    weight = ExactMolWt(Chem.MolFromSmiles(smiles))
    if weight > 500: return False

    if sascorer.calculateScore(Chem.MolFromSmiles(smiles))>5: return False


    return True


In [66]:
sub = sub_df

sub.head()

0       O=C(O)NC1CN(CP(=O)COOCC)N(C(O)O)CCN(C(=O)O)CC1
1        CP(O)CN1C/SN(C(=O)O)CCN(C(=O)O)CCN(C(=O)O)CC1
2    OOCOCN(CC1=CC=CC(CN(CP(=O)(O)O)NP(=O)(O)O)=N1)...
3    CN(O1)CCN(C(=O)O)CCN(C(=O)O)CCN(C(=O)O)CCOP1(=O)O
4    NP(=O)(CC[C+1](=O)O)CN1CCN(C(=O)O)CCN(C(=O)O)C...
dtype: object

In [67]:
#checked = [check_molecule(smiles) for smiles in sub]

#checked = [smiles for smiles in sub.values]
#sub = sub.iloc[checked]

p = []

for sm in sub.tolist():
    try:
        Chem.MolToSmiles(Chem.MolFromSmiles(sm), isomericSmiles=True, canonical=True)
        p.append(sm)
    except:
        pass
sub = p
can_smiles = [Chem.MolToSmiles(Chem.MolFromSmiles(m), isomericSmiles=True, canonical=True) for m in sub]

can_smiles = list(set(can_smiles))
sub = pd.DataFrame({"smiles": can_smiles})

[09:29:42] Explicit valence for atom # 5 Sn, 5, is greater than permitted
[09:29:42] Explicit valence for atom # 12 F, 2, is greater than permitted
[09:29:42] Explicit valence for atom # 8 Sn, 7, is greater than permitted
[09:29:42] Explicit valence for atom # 1 Br, 3, is greater than permitted
[09:29:42] Explicit valence for atom # 15 Br, 3, is greater than permitted
[09:29:43] Explicit valence for atom # 16 F, 3, is greater than permitted
[09:29:43] Explicit valence for atom # 15 F, 2, is greater than permitted
[09:29:43] Explicit valence for atom # 14 Sn, 5, is greater than permitted
[09:29:43] Explicit valence for atom # 15 Sn, 6, is greater than permitted
[09:29:43] Explicit valence for atom # 1 F, 3, is greater than permitted
[09:29:43] Explicit valence for atom # 12 F, 3, is greater than permitted
[09:29:43] Explicit valence for atom # 11 F, 3, is greater than permitted
[09:29:43] Explicit valence for atom # 11 Sn, 6, is greater than permitted
[09:29:43] Explicit valence for ato

In [68]:
len(sub)

16060

In [69]:
mo = CatLgKPredictor('/kaggle/input/catboost-model/mymodel')
sub["lgK"] = mo.predict_lgK(pd.DataFrame({'smiles': sub["smiles"].values}))

In [70]:
sub.reset_index(drop=True, inplace=True)

sub.shape

(16060, 2)

In [76]:
part_1 = part_1.rename(columns={'lgk': 'lgK'})

In [78]:
sub = sub.dropna(subset=['lgK'])
sub

Unnamed: 0,smiles,lgK,lgk
0,CP(O)CN1CCN(C(=O)O)C[C-]N(C(=O)O)CCN(C(=O)O)CC1,23.361290,
1,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCN(CC...,9.897617,
2,C[C@H](O)NC1CCN(C(=O)O)CCON(CC=O)CCN(C(=O)O)C1,16.024089,
3,NCCN1CCN(C(=O)O)CCN(C(=O)S)CCOP(=O)(O)CC(=O)O1,17.253234,
4,CP(=O)(O)CN(CCNCP(=O)(O)O)NCCN(CCN(CCC=O)C(=O)...,19.669421,
...,...,...,...
16055,O=C(O)CN1CCC(=O)CN(CCNCNO)CCN(CC(=O)O)CC1,14.611856,
16056,O=C(COO)CN(CC1=NC(=CCN(CP(=O)(O)O)CP(=O)(O)O)C...,14.148499,
16057,C=CC1CN(N(O)O)CCN(C(=O)O)CCN(C(=O)O)CCN1CPN,17.687677,
16058,O=CON1CSN(CP(=O)(O)O)CC(NC(=O)O)CCN(C(=O)O)CCO1,17.180552,


In [79]:
sub = sub.drop(columns=['lgk'])

In [80]:
sub = pd.concat([sub, part_1])

In [81]:
sub

Unnamed: 0,smiles,lgK
0,CP(O)CN1CCN(C(=O)O)C[C-]N(C(=O)O)CCN(C(=O)O)CC1,23.361290
1,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCN(CC...,9.897617
2,C[C@H](O)NC1CCN(C(=O)O)CCON(CC=O)CCN(C(=O)O)C1,16.024089
3,NCCN1CCN(C(=O)O)CCN(C(=O)S)CCOP(=O)(O)CC(=O)O1,17.253234
4,CP(=O)(O)CN(CCNCP(=O)(O)O)NCCN(CCN(CCC=O)C(=O)...,19.669421
...,...,...
1708,O=C(O)CN[O-],3.527904
1709,[N-]=NCC(=O)O,3.297806
1710,NN=CC(=O)O,3.101087
1711,C=NCC(=O)O,2.940261


In [82]:
#sub = sub.drop(columns=['lgk'])

In [83]:
sub

Unnamed: 0,smiles,lgK
0,CP(O)CN1CCN(C(=O)O)C[C-]N(C(=O)O)CCN(C(=O)O)CC1,23.361290
1,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCN(CC...,9.897617
2,C[C@H](O)NC1CCN(C(=O)O)CCON(CC=O)CCN(C(=O)O)C1,16.024089
3,NCCN1CCN(C(=O)O)CCN(C(=O)S)CCOP(=O)(O)CC(=O)O1,17.253234
4,CP(=O)(O)CN(CCNCP(=O)(O)O)NCCN(CCN(CCC=O)C(=O)...,19.669421
...,...,...
1708,O=C(O)CN[O-],3.527904
1709,[N-]=NCC(=O)O,3.297806
1710,NN=CC(=O)O,3.101087
1711,C=NCC(=O)O,2.940261


In [93]:
sub1 = sub[:-5000]

Unnamed: 0,smiles,lgK
0,CP(O)CN1CCN(C(=O)O)C[C-]N(C(=O)O)CCN(C(=O)O)CC1,23.361290
1,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCN(CC...,9.897617
2,C[C@H](O)NC1CCN(C(=O)O)CCON(CC=O)CCN(C(=O)O)C1,16.024089
3,NCCN1CCN(C(=O)O)CCN(C(=O)S)CCOP(=O)(O)CC(=O)O1,17.253234
4,CP(=O)(O)CN(CCNCP(=O)(O)O)NCCN(CCN(CCC=O)C(=O)...,19.669421
...,...,...
1708,O=C(O)CN[O-],3.527904
1709,[N-]=NCC(=O)O,3.297806
1710,NN=CC(=O)O,3.101087
1711,C=NCC(=O)O,2.940261


In [96]:
sub.sort_values(by='lgK', ascending=False).to_csv('./ssssssss.csv', index=False)

In [95]:
sub.head()

Unnamed: 0,smiles,lgK
0,CP(O)CN1CCN(C(=O)O)C[C-]N(C(=O)O)CCN(C(=O)O)CC1,23.36129
1,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCN(CC...,9.897617
2,C[C@H](O)NC1CCN(C(=O)O)CCON(CC=O)CCN(C(=O)O)C1,16.024089
3,NCCN1CCN(C(=O)O)CCN(C(=O)S)CCOP(=O)(O)CC(=O)O1,17.253234
4,CP(=O)(O)CN(CCNCP(=O)(O)O)NCCN(CCN(CCC=O)C(=O)...,19.669421


In [97]:
sub_copy = sub.copy()

In [140]:
sub = sub_copy.sort_values(by='lgK', ascending=False)[:-5000]

In [141]:
sub.head()

Unnamed: 0,smiles,lgK
1004,O=P(O)(O)CN(Cc1cccc(CN(CP(=O)(O)O)CP(=O)(O)O)n...,29.016023
14205,O=C(O)N1CCN(CP(=O)(O)O)CCN(C(=O)O)CCN(C(=O)O)CC1,28.399766
2641,N=P(O)(O)CN1CCN(C(=O)O)CCN(C(=O)O)CCN(C(=O)O)CC1,27.039173
9656,O=C(O)N1CCN(CS(O)(O)O)CCN(C(=O)O)CCN(C(=O)O)CC1,26.844821
8991,O=C(O)N1CCN(CP(=O)(O)CO)CCN(C(=O)O)CCN(C(=O)O)CC1,26.655937


In [142]:
sub = sub.reset_index(drop=True)
sub.head()

Unnamed: 0,smiles,lgK
0,O=P(O)(O)CN(Cc1cccc(CN(CP(=O)(O)O)CP(=O)(O)O)n...,29.016023
1,O=C(O)N1CCN(CP(=O)(O)O)CCN(C(=O)O)CCN(C(=O)O)CC1,28.399766
2,N=P(O)(O)CN1CCN(C(=O)O)CCN(C(=O)O)CCN(C(=O)O)CC1,27.039173
3,O=C(O)N1CCN(CS(O)(O)O)CCN(C(=O)O)CCN(C(=O)O)CC1,26.844821
4,O=C(O)N1CCN(CP(=O)(O)CO)CCN(C(=O)O)CCN(C(=O)O)CC1,26.655937


In [143]:
from tqdm.auto import tqdm

In [None]:
from rdkit import DataStructs
ms = [Chem.MolFromSmiles(sm) for sm in sub['smiles']]
fpgen = AllChem.GetRDKitFPGenerator()
fps = [fpgen.GetFingerprint(x) for x in ms]
sim = 0
for i in tqdm(range(len(fps))):
  for j in range(i+1, len(fps)):
    try:
      if DataStructs.TanimotoSimilarity(fps[i], fps[j]) > 0.6:
        if sub["lgK"][i]>=sub["lgK"][j]: #and sub["lgK"][j]<sub['lgK'].mean():
          sub.drop(labels=[j], inplace=True)
        elif sub["lgK"][i]<sub["lgK"][j]: #and sub["lgK"][i]<sub['lgK'].mean():
          sub.drop(labels=[i], inplace=True)
    except KeyError:
      pass

  0%|          | 0/12773 [00:00<?, ?it/s]

In [None]:
sub.sort_values(by="lgK", ascending=False)["lgK"][:100].mean()

In [None]:
sub.sort_values(by="lgK", ascending=False)["lgK"][:10]

In [None]:
subb = sub.sort_values(by="lgK", ascending=False).drop(columns=["lgK"])

can_smiles = [Chem.MolToSmiles(Chem.MolFromSmiles(m), isomericSmiles=True, canonical=True) for m in subb['smiles']]

can_smiles = list(set(can_smiles))

pd.Series(can_smiles[:100]).to_csv('opopopopopo.csv', index=False, header=None)

In [None]:
sup = sub.sort_values(by="lgK", ascending=False).drop(columns=["lgK"])
sub.sort_values(by="lgK", ascending=False)[:100]

In [138]:
sup.head()

Unnamed: 0,smiles
0,O=P(O)(O)CN(Cc1cccc(CN(CP(=O)(O)O)CP(=O)(O)O)n...
1,O=C(O)N1CCN(CP(=O)(O)O)CCN(C(=O)O)CCN(C(=O)O)CC1
12,O=C(O)N1CCN(C[SH](=O)(O)O)CCN(C(=O)O)CCN(C(=O)...
30,O=C(O)N1CCN(SP(=O)(O)O)CCN(C(=O)O)CCN(C(=O)O)CC1
38,CP(=O)(O)NN1CCN(C(=O)O)CCN(C(=O)O)CCN(C(=O)O)CC1


In [118]:
sup = sup['smiles'].iloc[:100]

In [None]:
add = list(set(['O=P(O)(O)CN(CC1=CC=CC(CN(CP(=O)(O)O)CP(=O)(O)O)=N1)CP(=O)(O)O', 
 'O=P(O)(O)CN(CC1=CC=CC(CN(CP(=O)(O)O)CP(=O)(O)O)=N1)CP(=O)(O)O', 
 'O=C(O)CCP(=O)(O)CN1CCN(C(=O)O)CCN(C(=O)O)CCN(C(=O)O)CC1', 
 'O=P(O)(O)CN(CC1=CC=CC(CN(CP(=O)(O)O)CP(=O)(O)O)=N1)CP(=O)(O)O', 
 'O=C(O)CCP(=O)(O)CN1CCN(C(=O)O)CCN(C(=O)O)CCN(C(=O)O)CC1', 
 'O=P(O)(O)CN(CC1=CC=CC(CN(CP(=O)(O)O)CP(=O)(O)O)=N1)CP(=O)(O)O', 
 'O=C(O)NCCN(CP(=O)(O)C)N1CCN(C(=O)O)CCN(C(=O)O)CCN(C(=O)O)CC1', 
 'O=P(O)(O)CN(CC1=CC=CC(CN(CP(=O)(O)O)CP(=O)(O)O)=N1)CP(=O)(O)O', 
 'O=C(O)CN1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 
 'O=P(O)(O)CN(CC1=CC=CC(CN(CP(=O)(O)O)CP(=O)(O)O)=N1)CP(=O)(O)O', 
 'O=P(O)(O)CN(CC1=CC=CC(CN(CP(=O)(O)O)CP(=O)(O)O)=N1)CP(=O)(O)O', 
 'O=C(O)N1CCN(CP(=O)(O)O)CCN(C(=O)O)CCN(C(=O)O)CC1', 
 'O=P(O)(O)CN1CCN(CP(=O)(O)O)CCN(CP(=O)(O)O)CCN(CP(=O)(O)O)CC1', 
 'O=P(O)(O)CN(CC1=CC=CC(CN(CP(=O)(O)O)CP(=O)(O)O)=N1)CP(=O)(O)O', 
 'O=C(O)N1CCN(CP(=O)(O)O)CCN(C(=O)O)CCN(C(=O)O)CC1', 
 'O=P(O)(O)CN1CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CCN(CC(=O)O)CC1', 
 'O=P(O)(O)CN(CC1=CC=CC(CN(CP(=O)(O)O)CP(=O)(O)O)=N1)CP(=O)(O)O', 
 'O=P(O)(O)CN1CCN(CP(=O)(O)O)CCN(CP(=O)(O)O)CCN(CP(=O)(O)O)CC1', 
 'O=P(O)(O)CN(CC1=CC=CC(CN(CP(=O)(O)O)CP(=O)(O)O)=N1)CP(=O)(O)O', 
 'O=P(O)(O)CN(CC1=CC=CC(CN(CP(=O)(O)O)CP(=O)(O)O)=N1)CP(=O)(O)O']))

In [None]:
sup.iloc[-1] = 'O=P(O)(O)CN1CCN(C(=O)O)CCN(C(=O)O)CCN(C(=O)O)CC1'

In [131]:
sup.to_csv("sup.csv", header=None, index=None)

In [None]:
best_sub = pd.read_csv('/kaggle/input/best-sub/sup (20).csv', names=['smiles'])
best_sub.head()

In [None]:
'O=C(O)CN(CCN(CC(=O)O)CC(=O)O)CC(=O)O' in best_sub['smiles']

In [None]:
sup

In [None]:
bbb = best_sub
bbb

In [None]:
bbb = bbb.drop_duplicates()

In [None]:
bbb

In [None]:
bbb = pd.DataFrame({'smiles': bbb['smiles'].tolist() + add}).drop_duplicates()

In [None]:
pp = mo.predict_lgK(bbb.reset_index(drop=False))
print(min(pp), max(pp))

In [None]:
bbb['lgK'] = pp
bbb.head()

In [None]:
bbb

In [None]:
sub

In [None]:
bbb = bbb.sort_values(by='lgK', ascending=False)
bbb['lgK'].min()

In [None]:
sub = bbb['smiles']

checked = [check_molecule(smiles) for smiles in sub]
sub = sub.iloc[checked]

can_smiles = [Chem.MolToSmiles(Chem.MolFromSmiles(m), isomericSmiles=True, canonical=True) for m in sub.tolist()]

can_smiles = list(set(can_smiles))
sub = pd.DataFrame({"smiles": can_smiles})

In [None]:
sub.iloc[:100].to_csv('ooooo.csv', index=False, header=None)

In [None]:
bbb.to_csv('asdasd.csv', index=False, header=None)

In [None]:
ssss = bbb.iloc[-100:]
ssss

In [None]:
ssss['smiles'].to_csv('suup.csv', index=False, header=None)

In [None]:
sub.sort_values(by="lgk", ascending=False).iloc[:20]

In [None]:
pp[-100:]