<a href="https://colab.research.google.com/github/DilshadFayiz/AI-project-Biotecnika/blob/main/Al_Assisted_Generation_of_novel_anticancer_candidates.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🧬 AI-Assisted Generation of Novel Anticancer Compounds
This Google Colab notebook trains a generative model to design novel anticancer compounds using machine learning and SELFIES representations.

In [None]:
!pip install rdkit selfies tensorflow scikit-learn matplotlib pandas tqdm

Collecting rdkit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Collecting selfies
  Downloading selfies-2.2.0-py3-none-any.whl.metadata (14 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m36.2/36.2 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading selfies-2.2.0-py3-none-any.whl (36 kB)
Installing collected packages: selfies, rdkit
Successfully installed rdkit-2025.9.1 selfies-2.2.0


In [None]:
import pandas as pd, selfies as sf, numpy as np, random, tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from rdkit import Chem
from rdkit.Chem import Descriptors, Draw
from tqdm import tqdm
import matplotlib.pyplot as plt

# Upload your dataset first
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # Nothing was selected (e.g. the dialog was cancelled): fail with a clear
    # message instead of an IndexError on the lookup below.
    raise ValueError('No file was uploaded; re-run this cell and choose the dataset CSV.')

# Only the first uploaded file is used as the dataset.
file_path = next(iter(uploaded))
df = pd.read_csv(file_path, encoding='latin1')

# Every later cell reads df['SMILES'], so fail early if the column is absent.
if 'SMILES' not in df.columns:
    raise KeyError(f"Expected a 'SMILES' column in {file_path}; found {list(df.columns)}")

print('Data loaded:', df.shape)
df.head()

Saving Anticancer_compounds@1994.csv to Anticancer_compounds@1994.csv
Data loaded: (1994, 5)


Unnamed: 0,Compound_Name,MOL_WEIGHT,IC50 (¬µM),Target Name,SMILES
0,CHEMBL170406,438.42,0.5,Inhibitor of nuclear factor kappa-B kinase sub...,N#Cc1c(-c2ccc(NC(=O)CCCC(=O)[O-])cc2)cc(-c2ccc...
1,CHEMBL3980848,478.89,1.0,Inhibitor of nuclear factor kappa-B kinase sub...,NC(=O)c1nn(-c2ccc(F)cc2F)c2c1CCc1ccc(NC(=O)c3c...
2,(-)-Kurarinone,438.52,0.6,PTGS1 - prostaglandin-endoperoxide synthase 1,CC(=CCC(CC1=C2C(=C(C=C1O)OC)C(=O)C[C@H](O2)C3=...
3,"1,4-Naphthoquinone",158.156,0.34,MAP2K1 - mitogen-activated protein kinase kina...,C1=CC=C2C(=O)C=CC(=O)C2=C1
4,13-DEOXYDOXORUBICIN,529.542,0.891,U2OS,COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)C[C@@](O)...


In [None]:
# Clean and validate SMILES
def valid_smiles(s):
    """Return the RDKit canonical SMILES for `s`, or None if `s` is not usable.

    Args:
        s: Candidate SMILES value (may be NaN/None when read from a CSV).

    Returns:
        The string produced by Chem.MolToSmiles for the parsed molecule, or
        None when `s` is not a non-blank string, cannot be parsed, or RDKit
        raises while processing it.
    """
    # Missing CSV cells arrive as float NaN; blank strings carry no molecule.
    if not isinstance(s, str) or not s.strip():
        return None
    try:
        mol = Chem.MolFromSmiles(s)
        # MolFromSmiles signals an unparseable SMILES by returning None.
        if mol is None:
            return None
        return Chem.MolToSmiles(mol)
    except Exception:
        # Best-effort cleaning: any RDKit failure marks the row as invalid,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return None

# Canonicalise every SMILES; rows that valid_smiles rejects come back as None.
canonical_smiles = df['SMILES'].apply(valid_smiles)
df['SMILES'] = canonical_smiles

# Remove the rejected rows and renumber the index from 0.
df = df.dropna(subset=['SMILES'])
df = df.reset_index(drop=True)
print('Valid molecules:', len(df))

Valid molecules: 1994


In [None]:
# Convert to SELFIES
def _encode_or_none(smiles):
    """Return the SELFIES encoding of `smiles`, or None if selfies cannot encode it."""
    try:
        return sf.encoder(smiles)
    except sf.EncoderError:
        # Leave a missing value so one unsupported molecule does not abort the
        # whole conversion; rows with missing SELFIES are dropped before training.
        return None

df['SELFIES'] = [_encode_or_none(s) for s in df['SMILES']]

n_failed = int(df['SELFIES'].isna().sum())
if n_failed:
    print(f'Could not encode {n_failed} molecule(s); their SELFIES value is missing.')
print('SELFIES conversion complete.')

SELFIES conversion complete.


In [None]:
# ==========================================
# ✅ Fixed SELFIES → RNN generation block
# ==========================================
import numpy as np
import selfies as sf
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

# 1. Keep only rows whose SELFIES value is a non-empty string
df = df.dropna(subset=['SELFIES'])
is_usable_selfies = df['SELFIES'].apply(lambda x: isinstance(x, str) and len(x) > 0)
df = df[is_usable_selfies].reset_index(drop=True)

# 2. Build vocabulary + index maps
# Sort the symbols so token ids are the same in every kernel session. Iteration
# order of a set of strings can change between Python processes, which would
# make a reloaded selfies_generator_rnn.keras disagree with a rebuilt vocabulary.
alphabet = sorted(sf.get_alphabet_from_selfies(df['SELFIES']))

# '.' is the SELFIES separator between disconnected fragments, not padding:
# padding is index 0, which is why real token ids start at 1 below.
# It is registered explicitly so sequences containing it can be indexed.
if '.' not in alphabet:
    alphabet.append('.')

token_to_idx = {k: i + 1 for i, k in enumerate(alphabet)}
idx_to_token = {v: k for k, v in token_to_idx.items()}

# Longest molecule, measured in SELFIES symbols; every sequence is padded to this.
max_len = max(len(list(sf.split_selfies(s))) for s in df['SELFIES'])
print("Max SELFIES length:", max_len)
print("Alphabet size:", len(alphabet))

def selfies_to_seq(s):
    """Map a SELFIES string to a list of token ids, right-padded with 0 to max_len.

    Each symbol from sf.split_selfies is looked up in token_to_idx; an unknown
    symbol raises KeyError. Sequences already at least max_len long are
    returned without padding.
    """
    ids = [token_to_idx[symbol] for symbol in sf.split_selfies(s)]
    missing = max_len - len(ids)
    ids.extend([0] * missing)  # a non-positive count adds nothing
    return ids

# Encode every molecule as one fixed-width row of token ids
seqs = np.array(list(map(selfies_to_seq, df['SELFIES'])), dtype=np.int32)

# 3. Next-token pairs: the input drops the last position, the target drops the first
X_gen, y_gen = seqs[:, :-1], seqs[:, 1:]

# 4. Model definition
# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so that
# weight initialisation, training and the later sampling step are repeatable
# after Restart & Run All.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# +1 because index 0 is reserved for padding; real token ids start at 1.
vocab_size = len(token_to_idx) + 1

# Inputs are token-id rows of width max_len - 1 (the sequence minus its last position).
input_layer = tf.keras.Input(shape=(max_len - 1,))
embedding_layer = tf.keras.layers.Embedding(vocab_size, 64)(input_layer)

# return_sequences=True on both LSTMs so there is one prediction per position.
lstm1 = tf.keras.layers.LSTM(128, return_sequences=True)(embedding_layer)
lstm2 = tf.keras.layers.LSTM(128, return_sequences=True)(lstm1)

# Per-position distribution over the vocabulary (padding id included).
output_layer = tf.keras.layers.Dense(vocab_size, activation='softmax')(lstm2)

model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
# categorical_crossentropy expects one-hot targets.
model.compile(optimizer='adam', loss='categorical_crossentropy')

# 5. Batch generator: targets are one-hot encoded one batch at a time
def data_generator(X, y, batch_size=64, num_classes=vocab_size):
    """Yield (inputs, one-hot targets) batches forever, cycling over X and y in order.

    Args:
        X: Input rows, sliced along the first axis.
        y: Integer target rows aligned with X.
        batch_size: Number of rows per batch; the last batch of a pass may be smaller.
        num_classes: Width of the one-hot encoding passed to to_categorical.

    Yields:
        Tuples of (X slice, one-hot encoded y slice).
    """
    while True:
        for start in range(0, len(X), batch_size):
            rows = slice(start, start + batch_size)
            targets = to_categorical(y[rows], num_classes=num_classes)
            yield X[rows], targets

# Single source of truth for the batch size used by the generator and the step count.
BATCH_SIZE = 64
train_gen = data_generator(X_gen, y_gen, batch_size=BATCH_SIZE)

# Ceiling division: the generator also yields the final partial batch, so this
# makes one Keras epoch exactly one full pass over the data. Floor division
# left each epoch one batch short, shifting epoch boundaries through the dataset.
steps_per_epoch = max(1, (len(X_gen) + BATCH_SIZE - 1) // BATCH_SIZE)

# 6. Debug shapes: push one all-padding row through the model to confirm the
# input width (max_len - 1) and the per-position output shape before training.
dummy_input = np.zeros((1, max_len - 1), dtype=np.int32)
print("‚úÖ Dummy input shape:", dummy_input.shape)
print("‚úÖ Model output shape:", model(dummy_input).shape)

# 7. Train: train_gen never terminates, so steps_per_epoch defines where each
# of the 5 epochs ends.
model.fit(train_gen, steps_per_epoch=steps_per_epoch, epochs=5)

# 8. Save the trained model in the native Keras format
model.save("selfies_generator_rnn.keras")
print("‚úÖ Model training complete and saved.")


Max SELFIES length: 80
Alphabet size: 49
‚úÖ Dummy input shape: (1, 79)
‚úÖ Model output shape: (1, 79, 50)
Epoch 1/5
[1m31/31[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m12s[0m 267ms/step - loss: 2.8830
Epoch 2/5
[1m31/31[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m9s[0m 308ms/step - loss: 1.6128
Epoch 3/5
[1m31/31[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m9s[0m 305ms/step - loss: 1.5206
Epoch 4/5
[1m31/31[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m9s[0m 281ms/step - loss: 1.4617
Epoch 5/5
[1m31/31[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m9s[0m 279ms/step - loss: 1.4033
‚úÖ Model training complete and saved.


In [None]:
# Generate new molecules
def _sample_token(probs):
    """Draw one token index from a probability vector; return 0 (padding) if it is degenerate."""
    # Work in float64 and renormalise exactly so np.random.choice's
    # "probabilities sum to 1" check does not depend on float32 tolerance.
    p = np.asarray(probs, dtype=np.float64)
    total = p.sum()
    if not np.isfinite(total) or total <= 0:
        return 0
    return int(np.random.choice(len(p), p=p / total))


def sample_selfies(n=50):
    """Sample up to `n` molecules from the trained model and return their SMILES.

    Each molecule starts from a random vocabulary token (never '.') and is
    extended one token at a time by sampling from the model's prediction at the
    last filled position. Generation stops when the padding id (0) or the '.'
    fragment separator is sampled, or when max_len - 1 tokens have been produced.

    Args:
        n: Number of generation attempts.

    Returns:
        List of SMILES strings for the attempts whose SELFIES decoded to a
        non-empty SMILES that RDKit could parse. May be shorter than `n`.
    """
    # Training pads with 0 (see selfies_to_seq), so 0 is the end-of-sequence
    # signal. Looking the pad id up under '.' gave the separator's id instead,
    # and sampled zeros were appended and generation carried on past the end.
    pad_idx = 0
    dot_idx = token_to_idx.get('.')

    # Loop-invariant: candidate first tokens.
    start_tokens = [idx for token, idx in token_to_idx.items() if token != '.']
    generated_smiles = []
    if not start_tokens:
        print('Valid generated SMILES:', len(generated_smiles))
        return generated_smiles

    window = max_len - 1  # model input width

    for _ in range(n):
        seq = [random.choice(start_tokens)]

        # At most max_len - 2 further tokens, so len(seq) never exceeds `window`.
        for _step in range(max_len - 2):
            padded = np.full((1, window), pad_idx, dtype=np.int32)
            padded[0, :len(seq)] = seq

            # predict returns (batch, position, vocab); take the only batch row.
            probs = model.predict(padded, verbose=0)[0]

            # The prediction at the last filled position is the next-token distribution.
            next_idx = _sample_token(probs[len(seq) - 1])

            # Stop at padding (end of molecule) or at the fragment separator.
            if next_idx == pad_idx or next_idx == dot_idx:
                break
            seq.append(next_idx)

        generated_selfie = ''.join(idx_to_token.get(i, '') for i in seq)
        if not generated_selfie:
            continue

        # Decode and validate once, keeping the SMILES from this single pass.
        try:
            smi = sf.decoder(generated_selfie)
            # An empty SMILES is not a molecule; previously it was counted as valid.
            if smi and Chem.MolFromSmiles(smi) is not None:
                generated_smiles.append(smi)
        except Exception:
            # Best-effort: skip candidates that fail to decode or parse,
            # without swallowing KeyboardInterrupt/SystemExit.
            continue

    print('Valid generated SMILES:', len(generated_smiles))
    return generated_smiles

# Make 100 generation attempts; the returned SMILES list is what the export cell writes to CSV.
generated_smiles = sample_selfies(100)

Valid generated SMILES: 100


In [None]:
import pandas as pd
from google.colab import files

# Name of the CSV that will hold the generated molecules
output_filename = 'generated_smiles.csv'

# One row per generated molecule, in a single SMILES column
smiles_df = pd.DataFrame(generated_smiles, columns=['SMILES'])

# Write without the index column
smiles_df.to_csv(output_filename, index=False)

# Hand the written file to the browser as a download (Colab helper)
files.download(output_filename)

print(f'Generated SMILES saved to {output_filename} and available for download.')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Generated SMILES saved to generated_smiles.csv and available for download.
