<a href="https://colab.research.google.com/github/Bodya-collab/De-Novo-Drug-Design-with-Generative-AI-LSTM-/blob/main/De-Novo-Drug-Design-with-Generative-AI-LST_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing Workspace

In [None]:
# Importing tools
!pip install rdkit

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from rdkit import Chem
from rdkit.Chem import Draw
import matplotlib.pyplot as plt
import sys

print('Done')

## Importing DataFrame

In [None]:

import pandas as pd
import os


# 2. 250K mol from ZINC
!wget https://raw.githubusercontent.com/aspuru-guzik-group/chemical_vae/master/models/zinc_properties/250k_rndm_zinc_drugs_clean_3.csv -O zinc_250k.csv

data = pd.read_csv('zinc_250k.csv')

# Smiles column
smiles = data['smiles'].tolist()

# using only 50000 mol
smiles = smiles[:50000]


print("Example:", smiles[0])

## Tokenization


In [None]:
# Character-level vocabulary: every distinct symbol that appears in the corpus,
# sorted so that the symbol <-> index mapping is reproducible between runs.
chars = sorted(set("".join(smiles)))
char_to_int = {symbol: index for index, symbol in enumerate(chars)}
int_to_char = dict(enumerate(chars))

vocab_size = len(chars)
# The longest SMILES fixes the number of time steps of every encoded sequence
max_len = max(len(s) for s in smiles)

print(f"Vocabulary size: {vocab_size}")
print(f"Max length: {max_len}")
print("Characters:", chars)

# One-hot encoding of SMILES strings into model inputs and next-character targets
def vectorize(smiles_list):
    """One-hot encode SMILES strings as (input, target) arrays.

    Both arrays have shape ``(len(smiles_list), max_len, vocab_size)`` and
    dtype ``bool``. ``X[i, t]`` marks the character at position ``t`` of
    string ``i``; ``Y[i, t]`` marks the character at position ``t + 1``, i.e.
    the symbol the model should predict after seeing position ``t``.
    Positions past the end of a string stay all-zero in both arrays.

    Args:
        smiles_list: sequence of strings whose characters are all keys of
            ``char_to_int`` and whose lengths do not exceed ``max_len``.

    Returns:
        Tuple ``(X, Y)`` of the two boolean arrays.
    """
    shape = (len(smiles_list), max_len, vocab_size)
    X = np.zeros(shape, dtype=bool)
    Y = np.zeros(shape, dtype=bool)
    for row, smile in enumerate(smiles_list):
        codes = np.fromiter(
            (char_to_int[symbol] for symbol in smile),
            dtype=np.intp,
            count=len(smile),
        )
        steps = np.arange(len(smile))
        X[row, steps, codes] = True
        # Target at step t is the character found at step t + 1
        Y[row, steps[:-1], codes[1:]] = True
    return X, Y


X, Y = vectorize(smiles)

## Brain of code (LSTM)

In [None]:
# Layers are named individually so that each one can carry its own explanation.
sequence_input = layers.Input(shape=(max_len, vocab_size))
# Recurrent memory; return_sequences=True yields an output at every time step
recurrent = layers.LSTM(128, return_sequences=True)
regularizer = layers.Dropout(0.2)
# Softmax over the vocabulary: probability of each character at each step
char_probabilities = layers.Dense(vocab_size, activation='softmax')

model = keras.Sequential([
    sequence_input,
    recurrent,
    regularizer,
    char_probabilities,
])

model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

## Learning part

In [None]:
# Train for 50 epochs in mini-batches of 128, holding out 10% of the data
# for validation; the returned History object is kept for later inspection.
history = model.fit(
    X,
    Y,
    batch_size=128,
    epochs=50,
    validation_split=0.1,
)

## Generator

In [None]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability vector after temperature scaling.

    The probabilities are turned into log-space scores, divided by
    ``temperature`` and re-normalised with a softmax before a single
    multinomial draw. Temperatures below 1 sharpen the distribution,
    temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of probabilities.
        temperature: scaling factor. A value of 0 or less disables sampling
            and returns the most probable index (greedy decoding).

    Returns:
        Index of the selected entry of ``preds``.
    """
    preds = np.asarray(preds).astype('float64')

    # Dividing by a zero temperature is undefined; its limit is plain argmax.
    if temperature <= 0:
        return np.argmax(preds)

    scores = np.log(preds + 1e-7) / temperature
    # Shift so the best score is 0. Without this, a small temperature makes
    # every exp() underflow to 0 and the normalisation becomes 0/0 = NaN.
    scores -= np.max(scores)
    exp_scores = np.exp(scores)
    probabilities = exp_scores / np.sum(exp_scores)

    probas = np.random.multinomial(1, probabilities, 1)
    return np.argmax(probas)

def generate_molecule(temperature=0.2, seed_len=5):
    """Generate one SMILES-like string character by character.

    A randomly chosen training string supplies the first ``seed_len``
    characters. The trained ``model`` then extends this prefix: at each step
    the prediction for the last filled position is passed to ``sample`` and
    the selected character is appended.

    Args:
        temperature: sampling temperature forwarded to ``sample``.
        seed_len: number of leading characters copied from the training
            string; must be at least 1 because the model needs a prefix.

    Returns:
        The generated string, at most ``max_len`` characters long.

    Raises:
        ValueError: if ``seed_len`` is smaller than 1.
    """
    if seed_len < 1:
        raise ValueError("seed_len must be at least 1")

    # randint's upper bound is exclusive, so len(smiles) (not len(smiles)-1)
    # is required for the last training string to be eligible as a seed.
    start_index = np.random.randint(0, len(smiles))
    generated = smiles[start_index][:seed_len]

    # The one-hot buffer is built once; each step only writes the new symbol.
    x_pred = np.zeros((1, max_len, vocab_size))
    for t, char in enumerate(generated):
        if char in char_to_int:
            x_pred[0, t, char_to_int[char]] = 1.

    # Stop at max_len characters: the input buffer has max_len positions, and
    # vectorize() never sets a target for the last position, so the output at
    # position max_len - 1 is untrained and must not be sampled from.
    while len(generated) < max_len:
        preds = model.predict(x_pred, verbose=0)[0][len(generated) - 1]
        next_index = sample(preds, temperature=temperature)
        next_char = int_to_char[next_index]

        # A line break cannot occur inside a SMILES string. If the vocabulary
        # contains one (e.g. the source strings end with "\n" -- not verified
        # here), it can only mean "end of molecule", so generation stops.
        if next_char == "\n":
            break

        x_pred[0, len(generated), next_index] = 1.
        generated += next_char

    return generated

# Generate 20 candidates and keep those RDKit is able to parse
valid_mols = []
for _ in range(20):
    smi = generate_molecule()
    mol = Chem.MolFromSmiles(smi)
    if not mol:  # RDKit could not build a molecule from the string
        print(f"❌ Invalid: {smi}")
        continue
    valid_mols.append(mol)
    print(f"✅ Valid: {smi}")

# Drug-likeness filter based on Lipinski-style descriptor limits
from rdkit.Chem import Descriptors
from rdkit.Chem import QED  # QED drug-likeness score

def check_lipinski(mol):
    """Test a molecule against the descriptor limits used by this notebook.

    A molecule passes when its molecular weight lies in [150, 500], its logP
    is at most 5, it has at most 5 hydrogen-bond donors and at most 10
    hydrogen-bond acceptors.

    Args:
        mol: RDKit molecule object.

    Returns:
        ``(True, legend)`` for a passing molecule, where ``legend`` is a short
        text with the molecular weight, logP and QED score;
        ``(False, "Failed")`` otherwise.
    """
    mol_weight = Descriptors.MolWt(mol)
    log_p = Descriptors.MolLogP(mol)
    donors = Descriptors.NumHDonors(mol)
    acceptors = Descriptors.NumHAcceptors(mol)

    # QED is only reported in the legend; it does not take part in the filter
    qed_value = QED.qed(mol)

    within_limits = (
        150 <= mol_weight <= 500
        and log_p <= 5
        and donors <= 5
        and acceptors <= 10
    )
    if not within_limits:
        return False, "Failed"

    # Legend shown under the molecule image, with the QED score at the end
    return True, f"MW:{mol_weight:.0f} LogP:{log_p:.1f} QED:{qed_value:.2f}"

# Collect up to 9 generated molecules that pass the drug-likeness filter,
# giving up after 150 generation attempts.
good_mols, legends = [], []

print("Mining for drug-like molecules with QED...")

attempts = 0
for attempts in range(1, 151):
    smi = generate_molecule()
    mol = Chem.MolFromSmiles(smi)
    if not mol:
        continue  # not parseable by RDKit

    passed, stats = check_lipinski(mol)
    if not passed:
        continue

    good_mols.append(mol)
    legends.append(stats)
    print(f" Found: {smi} -> {stats}")

    # Checked right after an append so `attempts` keeps the count at which
    # the ninth molecule was found.
    if len(good_mols) >= 9:
        break

# Draw the accepted molecules in a 3-column grid, captioned with their stats
if not good_mols:
    print("No drug-like molecules found.")
else:
    print(f"\nSuccess! Found {len(good_mols)} drug-like candidates after {attempts} attempts.")
    img = Draw.MolsToGridImage(good_mols, molsPerRow=3, subImgSize=(300, 300), legends=legends)
    display(img)

### This part was generated with AI assistance in order to address low productivity and increase computational potential

# Task
Increase the number of training epochs in the LSTM model from 20 to 50 to improve the model's ability to generate valid molecules.

## Increase training epochs

### Subtask:
Increase the number of training epochs for the LSTM model to allow it to learn the SMILES grammar more effectively, aiming to generate more valid molecules.


## Summary:

### Data Analysis Key Findings
The provided information describes a subtask to increase the training epochs of an LSTM model from 20 to 50, aiming to improve its ability to learn SMILES grammar and generate more valid molecules. However, the execution or results of this subtask are not included in the provided solving process. Therefore, no data analysis key findings can be reported at this time.

### Insights or Next Steps
*   Execute the described subtask of increasing the LSTM model's training epochs to 50 and evaluate its impact on the validity of generated molecules.
*   Analyze the model's performance (e.g., loss, perplexity) during and after the increased training epochs to understand the effectiveness of the change.
