# **Business Understanding**
The goal of this project is to optimize the drug discovery process by automating the lead compound step, which decreases the time and cost of producing a drug.

# **Defining the data requirements:**
We need a dataset of small molecules and their properties.

###  Identify potential data sources:
We have two data sources: ChEMBL and PubChem.

# **Data Understanding & Data Preparation**

### 1. Data Understanding:
- Explore the data and get a feel for what it contains.
- Check the shape of dataset.
- Check data types.

In [None]:
!nvidia-smi

In [None]:
import tensorflow as tf
tf.test.gpu_device_name()

# **Chembl dataset analysis**

In [None]:
!pip install pandas-profiling

In [None]:
# Import the required libraries
import pandas as pd 
import numpy as np

In [None]:
from pandas_profiling import ProfileReport

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the ChEMBL export (semicolon-separated) from the mounted Google Drive.
# `on_bad_lines='skip'` replaces `error_bad_lines=False`, which was deprecated
# in pandas 1.3 and removed in pandas 2.0; malformed rows are still skipped.
# For a file uploaded directly to the Colab runtime, use '/content/Chembl.csv'.
Chembl = pd.read_csv('/content/drive/MyDrive/Chembl.csv', sep=';', on_bad_lines='skip', engine='python')

In [None]:
Chembl.shape

Work on a fixed subset

In [None]:


# Size of the fixed working subset (10,000 rows), capped at the number of
# available rows so that sampling cannot request more rows than exist.
subset_size = min(10000, len(Chembl))

# Draw the subset with a fixed seed so every run works on the same rows.
subset = Chembl.sample(n=subset_size, random_state=123)


In [None]:
profil=ProfileReport(subset)

In [None]:
profil.to_file("Analysis.html")

# **Data preprocessing**


1.   Delete unnecessary columns.
2.   Delete molecules with weight ≥ 1050 or weight ≤ 50.
3.   Convert 'None' strings to NaN.
4.   Convert numeric columns stored as text to float.
5.   Impute NaN values.
6.   Standardization.
7.   Calculate descriptors.
8.   Feature selection.
9.   Normalization.
10.  Plots.
11.  Visualise some molecules in 2D or 3D.




In [None]:
subset.head(100)
subset[subset['Molecular Species'] == 'ZWITTERION'].head(1)

In [None]:
#Chem_data= subset.copy()
Chem_data= Chembl.copy()

### 1. Delete unnecessary columns

TODO: add explanations of why these columns are dropped.

In [None]:
Chem_data.drop(['ChEMBL ID',	'Name',	'Synonyms',	'Type',	'Max Phase','Targets','Structure Type',], axis=1, inplace=True)

In [None]:
Chem_data.drop(['Inchi Key',], axis=1, inplace=True)

In [None]:
Chem_data.shape

In [None]:
Chem_data.head(100)

## 2. Delete molecules with weight ≥ 1050 or weight ≤ 50

In [None]:
Chem_data['Molecular Weight'].max()

In [None]:
# Rows whose molecular weight is at or beyond either bound (<= 50 or >= 1050),
# kept as a frame so the removed molecules can be inspected first.
mol_weight = Chem_data['Molecular Weight']
over_weightedd = Chem_data.loc[mol_weight.ge(1050.0) | mol_weight.le(50.0)]

In [None]:
over_weightedd.shape

In [None]:
# Index labels of the rows whose molecular weight is <= 50 or >= 1050;
# these labels are what the next cell passes to `drop`.
mol_weight = Chem_data['Molecular Weight']
is_out_of_range = (mol_weight <= 50.0) | (mol_weight >= 1050.0)
over_weighted = Chem_data.index[is_out_of_range.to_numpy()]

In [None]:
Chem_data.drop( over_weighted , inplace=True)

In [None]:
Chem_data.shape

## 3. Convert 'None' strings to NaN

In [None]:
Chem_data.replace('None',np.nan, inplace=True )

## 4. Convert numeric columns stored as text to float



In [None]:
# Continuous descriptor columns; cast them all to float in a single step.
float_columns = [
    'AlogP',
    'Polar Surface Area',
    'QED Weighted',
    'CX Acidic pKa',
    'CX Basic pKa',
    'CX LogP',
    'CX LogD',
    'Molecular Weight (Monoisotopic)',
]
Chem_data[float_columns] = Chem_data[float_columns].astype(float)


In [None]:
# These count/flag columns still contain NaN, so they cannot be cast to int yet:
# cast them to float now, handle the NaN values, and convert them to int later.
count_columns = [
    'HBA',
    'HBD',
    '#RO5 Violations',
    '#Rotatable Bonds',
    'Aromatic Rings',
    'Inorganic Flag',
    'Heavy Atoms',
    'HBA (Lipinski)',
    'HBD (Lipinski)',
    '#RO5 Violations (Lipinski)',
]
Chem_data[count_columns] = Chem_data[count_columns].astype(float)


In [None]:
Chem_data.dtypes

## 5. Impute Nan values

In [None]:
# Summary table of the columns that contain missing values
def missing_values_table(Chem_data):
    """Summarise the missing values of a DataFrame, column by column.

    Prints the total number of columns and how many of them contain missing
    values, then returns the summary.

    Args:
        Chem_data: the pandas DataFrame to inspect.

    Returns:
        A DataFrame indexed by column name with the columns 'Missing Values'
        (count) and '% of Total Values' (percentage, rounded to one decimal).
        Only columns with at least one missing value are listed, sorted by
        descending percentage.
    """
    # Count the nulls once and derive the percentage from that count.
    missing_count = Chem_data.isnull().sum()
    n_rows = len(Chem_data)
    # A frame without rows has no missing values; avoid the 0/0 division.
    missing_pct = 100 * missing_count / n_rows if n_rows else missing_count * 0.0

    table = pd.DataFrame({'Missing Values': missing_count,
                          '% of Total Values': missing_pct})
    # Filter on the count rather than the percentage so that NaN percentages
    # can never slip through the "!= 0" test.
    table = table[table['Missing Values'] > 0]
    table = table.sort_values('% of Total Values', ascending=False).round(1)

    print(f"Your selected dataframe has {Chem_data.shape[1]} columns.\n"
          f"There are {table.shape[0]} columns that have missing values.")
    return table

In [None]:
missing_values_table(Chem_data)


In [None]:
# Get the columns whose share of missing values exceeds the threshold (in percent)
na_threshold = 50
missing_df = missing_values_table(Chem_data)
missing_columns = list(missing_df[missing_df['% of Total Values'] > na_threshold].index)
# Report the threshold actually used rather than a hard-coded "50%".
print(f'We will remove {len(missing_columns)} columns as we set threshold of {na_threshold}%.')

### 5.1 Distribution

In [None]:
import matplotlib.pyplot as plt

In [None]:
num_cols_ab = Chem_data.select_dtypes("float64")
num_cols = Chem_data.select_dtypes(include=[np.number]).columns


In [None]:
num_cols

In [None]:
# One histogram per numeric column on a 5x5 grid. Text columns are left out:
# a histogram over free-text values is not meaningful and is slow to draw.
numeric_columns = Chem_data.select_dtypes(include=[np.number]).columns
fig, axs = plt.subplots(ncols=5, nrows=5, figsize=(16, 20), squeeze=False)
plt.subplots_adjust(wspace=.4)

# Randomly sample at most 10,000 rows so plotting stays fast and the sample
# size never exceeds the number of rows available.
sampled_data = Chem_data.sample(n=min(10000, len(Chem_data)))

for ax, col in zip(axs.ravel(), numeric_columns):
    ax.hist(sampled_data[col].dropna())
    ax.title.set_text(col)

### 5.2 outliers

In [None]:
# Outliers: one box plot per float column.
# The grid is sized from the number of columns; a fixed 2x5 grid silently
# dropped every column after the tenth.
float_columns = num_cols_ab.columns
n_plot_cols = 5
n_plot_rows = max(1, -(-len(float_columns) // n_plot_cols))  # ceiling division
fig, axs = plt.subplots(ncols=n_plot_cols, nrows=n_plot_rows, figsize=(16, 20), squeeze=False)
plt.subplots_adjust(wspace=.4)

for ax, col in zip(axs.ravel(), float_columns):
    ax.boxplot(Chem_data[col].dropna())
    ax.title.set_text(col)

* Our data is not normally distributed and contains many outliers, so we impute missing values with the median.

In [None]:
median_val = Chem_data[num_cols].median()
median_val

In [None]:
Chem_data[num_cols] = Chem_data[num_cols].fillna(median_val)

In [None]:
missing_values_table(Chem_data)

* For these categorical columns it does not make sense to impute missing values, because the values are unique to each molecule, so we drop the affected rows.

In [None]:
Chem_data.dropna(axis=0, inplace= True)

In [None]:
Chem_data.shape

In [None]:
missing_values_table(Chem_data)

Check whether there are any duplicated rows

In [None]:
Chem_data.duplicated().sum()

In [None]:
# Drop duplicated rows in place, keeping the first occurrence. Without
# `inplace=True` the de-duplicated frame was computed and then discarded.
Chem_data.drop_duplicates(keep='first', inplace=True)

In [None]:
# Convert the count/flag columns back to int now that the NaN values are gone.
integer_like_columns = [
    'HBA',
    'HBD',
    '#RO5 Violations',
    '#Rotatable Bonds',
    'Aromatic Rings',
    'Inorganic Flag',
    'Heavy Atoms',
    'HBA (Lipinski)',
    'HBD (Lipinski)',
    '#RO5 Violations (Lipinski)',
]
Chem_data[integer_like_columns] = Chem_data[integer_like_columns].astype(int)


In [None]:
int_cols=Chem_data[['HBA','HBD','#RO5 Violations','#Rotatable Bonds','Aromatic Rings','Inorganic Flag','Heavy Atoms','HBA (Lipinski)','HBD (Lipinski)','#RO5 Violations (Lipinski)']]

In [None]:
int_cols


In [None]:
num_cols = list(set(num_cols).difference(set(int_cols)))


In [None]:
num_cols

In [None]:
Chem_data.dtypes

## . Calculate topological descriptors (to be removed: the data we have already consists of descriptors)

In [None]:
!pip install rdkit

In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors

In [None]:
import rdkit.Chem.Descriptors as Descriptors

print(dir(Descriptors))


In [None]:
# Define a function to calculate the descriptors for a given molecule
def calculate_descriptors(smiles):
    """Compute RDKit descriptors for one molecule, keyed by ChEMBL column names.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        dict mapping column names (e.g. 'Molecular Weight', 'AlogP',
        '#RO5 Violations') to the values computed with RDKit.

    Raises:
        ValueError: if RDKit cannot parse `smiles`.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for an unparseable string; fail here with
        # a clear message instead of inside the first descriptor call.
        raise ValueError(f"Could not parse SMILES: {smiles!r}")

    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    psa = Descriptors.TPSA(mol)
    hba = Descriptors.NumHAcceptors(mol)
    hbd = Descriptors.NumHDonors(mol)
    rotatable_bonds = Descriptors.NumRotatableBonds(mol)

    # Lipinski's original counting: acceptors = N + O atoms, donors = N-H + O-H.
    hba_lipinski = Descriptors.NOCount(mol)
    hbd_lipinski = Descriptors.NHOHCount(mol)

    # Rule of five: one violation per exceeded limit
    # (MW > 500, logP > 5, donors > 5, acceptors > 10).
    ro5_violations = sum([mw > 500, logp > 5, hbd > 5, hba > 10])
    ro5_violations_lipinski = sum(
        [mw > 500, logp > 5, hbd_lipinski > 5, hba_lipinski > 10])

    # NOTE(review): the heteroatom count is stored under 'Passes Ro3', which
    # is encoded as N/Y elsewhere in this notebook — confirm the intended value.
    passes_ro3 = Descriptors.NumHeteroatoms(mol)
    qed_weighted = Descriptors.qed(mol)
    # NOTE(review): 'CX LogP' and 'CX LogD' both reuse RDKit's MolLogP;
    # presumably placeholders for the ChemAxon values — TODO confirm.
    cx_logp = Descriptors.MolLogP(mol)
    cx_logd = Descriptors.MolLogP(mol)
    aromatic_rings = Descriptors.NumAromaticRings(mol)
    # Flag the molecule when any atom is outside the listed element set.
    inorganic_flag = 1 if any(atom.GetSymbol() not in ['C', 'H', 'O', 'N', 'S', 'P', 'F', 'Cl', 'Br', 'I'] for atom in mol.GetAtoms()) else 0
    heavy_atoms = Descriptors.HeavyAtomCount(mol)
    mw_monoisotopic = Descriptors.ExactMolWt(mol)
    # NOTE(review): this stores an InChIKey under 'Molecular Species', whereas
    # the notebook encodes that column as ACID/BASE/NEUTRAL/ZWITTERION — confirm.
    molecular_species = Chem.rdinchi.InchiToInchiKey(Chem.inchi.MolToInchi(mol))

    descriptors_dict = {'Molecular Weight': mw,
                        'AlogP': logp,
                        'Polar Surface Area': psa,
                        'HBA': hba,
                        'HBD': hbd,
                        '#RO5 Violations': ro5_violations,
                        '#Rotatable Bonds': rotatable_bonds,
                        'Passes Ro3': passes_ro3,
                        'QED Weighted': qed_weighted,
                        'CX LogP': cx_logp,
                        'CX LogD': cx_logd,
                        'Aromatic Rings': aromatic_rings,
                        'Inorganic Flag': inorganic_flag,
                        'Heavy Atoms': heavy_atoms,
                        'HBA (Lipinski)': hba_lipinski,
                        'HBD (Lipinski)': hbd_lipinski,
                        '#RO5 Violations (Lipinski)': ro5_violations_lipinski,
                        'Molecular Weight (Monoisotopic)': mw_monoisotopic,
                        'Molecular Species': molecular_species
                        }

    return descriptors_dict


In [None]:

# Column layout of the descriptor table; columns that calculate_descriptors
# does not provide stay empty (NaN).
descriptor_columns = ['Smiles', 'Molecular Weight', 'AlogP', 'Polar Surface Area', 'HBA', 'HBD', '#RO5 Violations', '#Rotatable Bonds', 'Passes Ro3', 'QED Weighted', 'CX Acidic pKa', 'CX Basic pKa', 'CX LogP', 'CX LogD', 'Aromatic Rings', 'Inorganic Flag', 'Heavy Atoms', 'HBA (Lipinski)', 'HBD (Lipinski)', '#RO5 Violations (Lipinski)', 'Molecular Weight (Monoisotopic)', 'Molecular Species', 'Molecular Formula']

# Compute the descriptors for every SMILES string and build the frame once.
# Appending row by row copies the whole frame on each iteration, and
# `DataFrame.append` no longer exists in pandas 2.0.
descriptor_rows = [
    {**calculate_descriptors(smiles), 'Smiles': smiles}
    for smiles in Chem_data['Smiles']
]
descriptors = pd.DataFrame(descriptor_rows, columns=descriptor_columns)



In [None]:
descriptors

## Encoding 

1.   Passes Ro3 encoding : '0': N ,'1' : Y
2.   Molecular Species encoding : '0':ACID , '1':BASE ,'2':NEUTRAL, '3':ZWITTERION  



In [None]:
Chem_data['Passes Ro3'].unique()

In [None]:
# Inspect the categories of the frame that is actually encoded below
# (Chem_data), not those of the raw exploratory subset.
Chem_data['Molecular Species'].unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# create a label encoder object
label_encoder = LabelEncoder()

# fit the encoder to the categories and transform them into integers
Chem_data['Passes Ro3'] = label_encoder.fit_transform(Chem_data['Passes Ro3'])


#Chem_data.head()

In [None]:
Chem_data['Passes Ro3'].unique()

In [None]:
# create a label encoder object
label_encoder = LabelEncoder()

# fit the encoder to the categories and transform them into integers
Chem_data['Molecular Species'] = label_encoder.fit_transform(Chem_data['Molecular Species'])


#Chem_data.head(100)

In [None]:
row_index = 538971
row = Chem_data.loc[row_index]
row


## . Feature Selection 


* Correlation Matrix and ClusterMap

In [None]:
import seaborn as sns

In [None]:
corr_matrix = Chem_data.select_dtypes(float).corr()
#corr_matrix

In [None]:
#plt.figure(figsize=(5500, 5000))
#sns.clustermap(cor_mat, colors_ratio=0.01, annot=True)

sns.clustermap(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1, figsize=(20, 10),annot=True)

plt.show()

In [None]:
# Collect every pair of features whose absolute correlation exceeds the threshold
threshold = 0.8  # correlation threshold
corr_threshold = np.where(abs(corr_matrix) > threshold)

# Walk the (row, column) positions and keep only the upper triangle, so each
# pair is listed once and the diagonal is skipped.
corr_pairs = []
for row_pos, col_pos in zip(*corr_threshold):
    if row_pos < col_pos:
        corr_pairs.append((corr_matrix.iloc[row_pos, col_pos],
                           corr_matrix.columns[col_pos],
                           corr_matrix.columns[row_pos]))

# Print the highly correlated pairs as (correlation, feature, feature)
for corr in corr_pairs:
    print(corr)

To be tested

In [None]:
# case 1 : 
Chem_data.drop(['Heavy Atoms', 'Molecular Weight','AlogP','Polar Surface Area','HBA','HBD','#RO5 Violations','CX LogD',], axis=1, inplace=True)

In [None]:
Chem_data.shape

In [None]:
# case 2: 
#Chem_data.drop(['Heavy Atoms', 'Molecular Weight (Monoisotopic)','HBA (Lipinski)','HBD (Lipinski)','#RO5 Violations (Lipinski)','CX LogP'], axis=1, inplace=True)

In [None]:
#save cleaned data to new file 
Chem_data.to_csv('cleaned_Chem_dataset_vers1.csv', index=False)

## 7. Standardization : 
1. From the distribution plots we can see that our data is not normally distributed, so we need to standardize the values.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Create the scaler object
scaler = StandardScaler()
num_cols_ab = Chem_data.select_dtypes("float64").columns
#num_cols = Chem_data.select_dtypes(include=[np.number]).columns
# Fit and transform the data
Chem_data[num_cols_ab] = scaler.fit_transform(Chem_data[num_cols_ab])

In [None]:
Chem_data

# **Data Modeling & Model Evaluation**






## Split data 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split into train and test sets. 'Molecular Formula' is removed from
# Chem_data only in the next cell, so it is left out of the split here;
# otherwise train_data keeps that column while the training cells stack
# train_data rows (minus 'Smiles') with float arrays.
train_data, test_data = train_test_split(
    Chem_data.drop(columns=['Molecular Formula'], errors='ignore'),
    test_size=0.3,
    random_state=45,
)

In [None]:
Chem_data.drop(['Molecular Formula'], axis=1, inplace=True)

In [None]:
X_train, X_test,y_train, y_test = train_test_split(Chem_data.drop(['Smiles'],axis=1),Chem_data['Smiles'],test_size=0.3, random_state=45)

In [None]:
y_train.shape

In [None]:
y_test.shape

## New section

In [None]:
!pip -q install rdkit-pypi

In [None]:
!pip install transformers

In [None]:
!pip install --pre deepchem
import deepchem
deepchem.__version__

In [None]:
#from rdkit import Chem
#from rdkit.Chem import AllChem
#from rdkit.Chem import DataStructs
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, LeakyReLU
from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer
#from deepchem.feat.smiles_tokenizer import DeepSMILESTokenizer


In [None]:
#tokenizer = BasicSmilesTokenizer()
#print(tokenizer.tokenize(Chem_data['Smiles']))

#from deepchem.feat.smiles_tokenizer import SmilesTokenizer
# Preprocess the SMILES strings using RDKit
#tokenizer = SmilesTokenizer(Chem_data['Smiles'])
#Chem_data['Smiles'] = Chem_data['Smiles'].apply(tokenizer.Tokenize)

# Load the pretrained tokenizer
#tokenizer = SmilesTokenizer.from_pretrained('smiles')

# Define a function to tokenize a SMILES string
#def tokenize(smiles):
 #   return tokenizer.tokenize(smiles)

# Apply the tokenizer to the SMILES column
#Chem_data['Smiles'] = Chem_data['Smiles'].apply(tokenize)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
smiles_series = Chem_data['Smiles']

smiles_list = smiles_series.tolist()
encoded_smiles = tokenizer(smiles_list, padding=True, truncation=True, max_length=100, return_tensors="pt")

encoded_smiles


In [None]:
# Define the architecture of the GAN

from transformers import AutoTokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU

#tokenize a SMILES string

tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
smiles_series = Chem_data['Smiles']

smiles_list = smiles_series.tolist()
encoded_smiles = tokenizer(smiles_list, padding=True, truncation=True, max_length=100, return_tensors="pt")

vocab_size = len(tokenizer.get_vocab())
# Define the architecture of the GAN
# Generator: 100-dim input -> three LeakyReLU dense layers -> softmax over
# `vocab_size` entries.
generator = Sequential([
    Dense(128, input_dim=100),
    LeakyReLU(alpha=0.2),
    Dense(256),
    LeakyReLU(alpha=0.2),
    Dense(512),
    LeakyReLU(alpha=0.2),
    Dense(vocab_size, activation='softmax'),
])

# Discriminator: `vocab_size`-wide input -> two LeakyReLU dense layers ->
# one sigmoid unit.
discriminator = Sequential([
    Dense(512, input_dim=vocab_size),
    LeakyReLU(alpha=0.2),
    Dense(256),
    LeakyReLU(alpha=0.2),
    Dense(1, activation='sigmoid'),
])



In [None]:
# Compile the GAN model
# The generator output is fed into the discriminator, and the stacked model is
# compiled with binary cross-entropy.
# NOTE(review): `Model`, `num_epochs` and `batch_size` are not defined in any
# earlier cell of this notebook (they first appear in later cells) — confirm
# the intended run order.
gan_input = generator.input
gan_output = discriminator(generator(gan_input))
gan = Model(gan_input, gan_output)
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Train the GAN on the training set
for epoch in range(num_epochs):
    # Train the discriminator
    # Real rows are labelled 1, generated rows are labelled 0.
    real_samples = train_data.sample(batch_size)
    # NOTE(review): 'property1'..'property3' look like placeholder column
    # names — confirm against the actual columns of train_data.
    real_X, real_y = real_samples[['property1', 'property2', 'property3']], np.ones((batch_size, 1))
    fake_samples = generator.predict(np.random.normal(0, 1, (batch_size, 100)))
    fake_X, fake_y = fake_samples, np.zeros((batch_size, 1))
    X, y = np.vstack((real_X, fake_X)), np.vstack((real_y, fake_y))
    discriminator.trainable = True
    discriminator.train_on_batch(X, y)

    # Train the generator
    # The stacked model is trained on noise with all-ones labels.
    noise = np.random.normal(0, 1, (batch_size, 100))
    y = np.ones((batch_size, 1))
    discriminator.trainable = False
    gan.train_on_batch(noise,y)

In [None]:
# Import necessary libraries
from transformers import AutoTokenizer
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LeakyReLU
import numpy as np

# Load the SMILES dataset
smiles_series = Chem_data['Smiles']

# Tokenize the SMILES strings using the PubChem10M SMILES BPE tokenizer
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
smiles_list = smiles_series.tolist()
encoded_smiles = tokenizer(smiles_list, padding=True, truncation=True, max_length=100, return_tensors="pt")
vocab_size = len(tokenizer.get_vocab())


    

In [None]:
# Define the architecture of the generator and discriminator
# Generator: 100-dim input -> three LeakyReLU dense layers -> softmax over
# `vocab_size` entries.
generator = Sequential([
    Dense(128, input_dim=100),
    LeakyReLU(alpha=0.2),
    Dense(256),
    LeakyReLU(alpha=0.2),
    Dense(512),
    LeakyReLU(alpha=0.2),
    Dense(vocab_size, activation='softmax'),
])

# Discriminator: `vocab_size`-wide input -> two LeakyReLU dense layers ->
# one sigmoid unit.
discriminator = Sequential([
    Dense(512, input_dim=vocab_size),
    LeakyReLU(alpha=0.2),
    Dense(256),
    LeakyReLU(alpha=0.2),
    Dense(1, activation='sigmoid'),
])



In [None]:
# Compile the GAN model
gan_input = generator.input
gan_output = discriminator(generator(gan_input))
gan = Model(gan_input, gan_output)
gan.compile(loss='binary_crossentropy', optimizer='adam')


In [None]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
import keras.backend as K
K.set_image_data_format('channels_last')


In [None]:
# Train the GAN on the training set
# This cell only runs the discriminator update; the generator update sits in
# the following cell.

num_epochs = 100
batch_size = 128

for epoch in range(num_epochs):
    # Train the discriminator
    # Real rows: a random batch of train_data without the 'Smiles' column, labelled 1.
    real_samples = train_data.sample(batch_size)
    real_X, real_y = real_samples.drop(['Smiles'], axis=1), np.ones((batch_size, 1))
    fake_samples = generator.predict(np.random.normal(0, 1, (batch_size, 100)))
    # Fake rows: a zero matrix shaped like real_X whose leading columns are
    # overwritten with the leading columns of the generator output, labelled 0.
    fake_X = np.zeros_like(real_X).astype('float32')
    num_cols = min(fake_samples.shape[1], real_X.shape[1])
    fake_X[:, :num_cols] = fake_samples[:, :num_cols]
    fake_y = np.zeros((batch_size, 1))
    # NOTE(review): X has as many columns as real_X, while the discriminator
    # above was declared with input_dim=vocab_size — presumably these widths
    # must match; TODO confirm.
    X, y = np.vstack((real_X, fake_X)), np.vstack((real_y, fake_y))
    discriminator.trainable = True
    # The discriminator is re-compiled on every iteration.
    discriminator.compile(loss='binary_crossentropy', optimizer='adam')
    discriminator.train_on_batch(X, y)


In [None]:
# Train the generator
# This cell sits outside the `for epoch` loop of the previous cell, so it
# performs a single generator update per execution. The statements were
# indented as if they were still inside that loop, which made the cell fail
# with an IndentationError.
noise = np.random.normal(0, 1, (batch_size, 100))
y = np.ones((batch_size, 1))
discriminator.trainable = False
gan.train_on_batch(noise, y)

Evaluation Function

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU
from transformers import AutoTokenizer

import tensorflow as tf

# Load the SMILES dataset
smiles_series = Chem_data['Smiles']
smiles_list = smiles_series.tolist()

# Tokenize the SMILES strings using the PubChem10M SMILES BPE tokenizer
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
encoded_smiles = tokenizer(smiles_list, padding=True, truncation=True, max_length=100, return_tensors="pt")
vocab_size = len(tokenizer.get_vocab())

# Define the architecture of the generator and discriminator
def make_generator():
    """Build the generator: 100-dim input, three LeakyReLU dense layers, softmax over `vocab_size`."""
    layers = [
        Dense(128, input_dim=100),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(512),
        LeakyReLU(alpha=0.2),
        Dense(vocab_size, activation='softmax'),
    ]
    return Sequential(layers)

def make_discriminator():
    """Build the discriminator: `vocab_size`-wide input, two LeakyReLU dense layers, one sigmoid unit."""
    layers = [
        Dense(512, input_dim=vocab_size),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

generator = make_generator()
discriminator = make_discriminator()

# Compile the GAN model
discriminator.compile(loss='binary_crossentropy', optimizer='adam')
gan = Sequential([generator, discriminator])
discriminator.trainable = False
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Train the GAN on the training set
num_epochs = 100
batch_size = 128

# Property columns used as the "real" discriminator input.
property_columns = ['#Rotatable Bonds', 'Passes Ro3']

for epoch in range(num_epochs):
  # Train the discriminator
  # Several columns must be selected with a list; the previous tuple key was
  # looked up as a single column label and raised KeyError.
  real_samples = train_data[property_columns].sample(batch_size)
  # Use the sampled values directly (the DataFrame was being called like a
  # function, which raised TypeError).
  real_X, real_y = real_samples.values.astype('float32'), np.ones((batch_size, 1))
  fake_samples = generator.predict(np.random.normal(0, 1, (batch_size, 100)))
  # Fake rows: zeros shaped like real_X, filled with the leading generator outputs.
  fake_X = np.zeros_like(real_X).astype('float32')
  num_cols = min(fake_samples.shape[1], real_X.shape[1])
  fake_X[:, :num_cols] = fake_samples[:, :num_cols]
  fake_y = np.zeros((batch_size, 1))
  # NOTE(review): X is as wide as real_X, while make_discriminator declares
  # input_dim=vocab_size — presumably these widths must match; TODO confirm.
  X, y = np.vstack((real_X, fake_X)), np.vstack((real_y, fake_y))
  discriminator.trainable = True
  discriminator.compile(loss='binary_crossentropy', optimizer='adam')
  X_tensor = tf.convert_to_tensor(X, dtype=tf.float32)
  y_tensor = tf.convert_to_tensor(y, dtype=tf.float32)
  discriminator.train_on_batch(X_tensor, y_tensor)
  




In [None]:
  # Train the generator
# This cell has no enclosing loop, so it performs one generator update and one
# discriminator update per execution. The statements were indented as if they
# were inside a loop, which made the cell fail with an IndentationError.
noise = np.random.normal(0, 1, (batch_size, 100))
y = np.ones((batch_size, 1))
discriminator.trainable = False
gan.train_on_batch(noise, y)

# Train the discriminator
real_samples = train_data.sample(batch_size)
real_X, real_y = real_samples.drop(['Smiles'], axis=1), np.ones((batch_size, 1))
fake_samples = generator.predict(np.random.normal(0, 1, (batch_size, 100)))
# Fake rows: zeros shaped like real_X, filled with the leading generator outputs.
fake_X = np.zeros_like(real_X).astype('float64')
num_cols = min(fake_samples.shape[1], real_X.shape[1])
fake_X[:, :num_cols] = fake_samples[:, :num_cols]
fake_y = np.zeros((batch_size, 1))
# NOTE(review): X is as wide as real_X, while the discriminator is declared
# with input_dim=vocab_size — presumably these widths must match; TODO confirm.
X, y = np.vstack((real_X, fake_X)), np.vstack((real_y, fake_y))
discriminator.trainable = True
discriminator.train_on_batch(X, y)

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU, Concatenate
from transformers import AutoTokenizer

import tensorflow as tf
import pandas as pd
from rdkit import Chem


# Tokenize the SMILES strings using the PubChem10M SMILES BPE tokenizer
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
smiles_list = Chem_data['Smiles'].tolist()
encoded_smiles = tokenizer(smiles_list, padding=True, truncation=True, max_length=100, return_tensors="pt")
vocab_size = len(tokenizer.get_vocab())

# Preprocess the properties
# 'Molecular Formula' is dropped from Chem_data in place by an earlier cell,
# so dropping it again must not fail when it is already gone.
properties = (
    Chem_data.drop(columns=['Smiles'])
    .drop(columns=['Molecular Formula'], errors='ignore')
    .values.astype('float32')
)
num_properties = properties.shape[1]

# Define the architecture of the generator and discriminator
def make_generator():
    """Build the conditional generator: (100 + num_properties)-dim input, softmax over `vocab_size`."""
    layers = [
        Dense(128, input_dim=100+num_properties),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(512),
        LeakyReLU(alpha=0.2),
        Dense(vocab_size, activation='softmax'),
    ]
    return Sequential(layers)

def make_discriminator():
    """Build the conditional discriminator: (vocab_size + num_properties)-dim input, one sigmoid unit."""
    layers = [
        Dense(512, input_dim=vocab_size+num_properties),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

generator = make_generator()
discriminator = make_discriminator()

# Compile the GAN model
discriminator.compile(loss='binary_crossentropy', optimizer='adam')
gan = Sequential([generator, discriminator])
discriminator.trainable = False
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Train the GAN on the training set
num_epochs = 100
batch_size = 128

for epoch in range(num_epochs):
    # Train the discriminator
    real_samples = Chem_data.sample(batch_size)
    real_smiles = real_samples['Smiles']
    # 'Molecular Formula' is dropped in place by an earlier cell; tolerate its absence.
    real_props = (
        real_samples.drop(columns=['Smiles'])
        .drop(columns=['Molecular Formula'], errors='ignore')
        .values.astype('float32')
    )
    # NOTE(review): `encoded_smiles` is the tokenizer output and is indexed
    # here with a Series of SMILES strings — presumably not a supported key,
    # and the result must be vocab_size wide for the discriminator; TODO confirm.
    real_X, real_y = np.concatenate((encoded_smiles[real_smiles].numpy(), real_props), axis=1), np.ones((batch_size, 1))
    fake_samples = generator.predict(np.hstack((np.random.normal(0, 1, (batch_size, 100)), real_props)))
    fake_X, fake_y = np.concatenate((fake_samples, real_props), axis=1), np.zeros((batch_size, 1))
    X, y = np.vstack((real_X, fake_X)), np.vstack((real_y, fake_y))
    discriminator.trainable = True
    discriminator.compile(loss='binary_crossentropy', optimizer='adam')
    # Keep the returned loss so it can be reported below.
    disc_loss = discriminator.train_on_batch(X, y)

    # Train the generator
    noise = np.random.normal(0, 1, (batch_size, 100))
    gan_X, gan_y = np.hstack((noise, real_props)), np.ones((batch_size, 1))
    discriminator.trainable = False
    gan.compile(loss='binary_crossentropy', optimizer='adam')
    gen_loss = gan.train_on_batch(gan_X, gan_y)

    # Print the progress (both losses were previously undefined -> NameError)
    print(f"Epoch {epoch+1}/{num_epochs}, generator loss: {gen_loss:.4f}, discriminator loss: {disc_loss:.4f}")


In [None]:
import numpy as np
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, LeakyReLU, Concatenate, Input
from transformers import AutoTokenizer

import tensorflow as tf
import pandas as pd
from rdkit import Chem



# Tokenize the SMILES strings using the PubChem10M SMILES BPE tokenizer
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
smiles_list = Chem_data['Smiles'].tolist()
encoded_smiles = tokenizer(smiles_list, padding=True, truncation=True, max_length=100, return_tensors="pt")
vocab_size = len(tokenizer.get_vocab())

# Preprocess the properties
# 'Molecular Formula' is dropped from Chem_data in place by an earlier cell,
# so dropping it again must not fail when it is already gone.
properties = (
    Chem_data.drop(columns=['Smiles'])
    .drop(columns=['Molecular Formula'], errors='ignore')
    .values.astype('float32')
)
num_properties = properties.shape[1]

# Define the architecture of the generator and discriminator
def make_generator():
    """Build the conditional generator: (100 + num_properties)-dim input, softmax over `vocab_size`."""
    layers = [
        Dense(128, input_dim=100+num_properties),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(512),
        LeakyReLU(alpha=0.2),
        Dense(vocab_size, activation='softmax'),
    ]
    return Sequential(layers)

def make_discriminator():
    """Build the conditional discriminator: (vocab_size + num_properties)-dim input, one sigmoid unit."""
    layers = [
        Dense(512, input_dim=vocab_size+num_properties),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(layers)

# Concatenate noise vector and properties
z = Input(shape=(100,))
prop = Input(shape=(num_properties,))
combined_inputs = Concatenate()([z, prop])

generator = make_generator()
discriminator = make_discriminator()

# Generate molecule from noise vector and properties
molecule = generator(combined_inputs)

# The discriminator is declared with input_dim=vocab_size+num_properties, so
# the properties must be appended to the generated molecule before it is scored.
gen_input = Concatenate()([molecule, prop])

# Compile the discriminator model
discriminator.compile(loss='binary_crossentropy', optimizer='adam')

# Set discriminator to non-trainable
discriminator.trainable = False

# Compile the GAN model
gan = Model([z, prop], discriminator(gen_input))
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Train the GAN on the training set
num_epochs = 100
batch_size = 128
latent_dim = 100


for epoch in range(num_epochs):
    # Train the discriminator
    real_samples = Chem_data.sample(batch_size)
    real_smiles, real_props = real_samples['Smiles'], real_samples.drop(['Smiles'], axis=1).values.astype('float32')
    # NOTE(review): `encoded_smiles` is the tokenizer output and is indexed
    # here with a Series of SMILES strings — presumably not a supported key,
    # and the result must be vocab_size wide for the discriminator; TODO confirm.
    real_X, real_y = np.concatenate((encoded_smiles[real_smiles].numpy(), real_props), axis=1), np.ones((batch_size, 1))
    # make_generator returns a Sequential model with a single
    # (100 + num_properties)-wide input, so noise and properties are stacked
    # into one array instead of being passed as a two-element list.
    fake_samples = generator.predict(np.hstack((np.random.normal(0, 1, (batch_size, latent_dim)), real_props)))
    fake_X, fake_y = np.concatenate((fake_samples, real_props), axis=1), np.zeros((batch_size, 1))
    X, y = np.vstack((real_X, fake_X)), np.vstack((real_y, fake_y))
    discriminator.trainable = True
    discriminator.compile(loss='binary_crossentropy', optimizer='adam')
    discriminator.train_on_batch(X, y)

    # Train the generator
    # The stacked `gan` model takes two inputs (noise, properties).
    noise = np.random.normal(0, 1, (batch_size, latent_dim))
    prop = real_props
    gan_X = [noise, prop]
    gan_y = np.ones((batch_size, 1))
    discriminator.trainable = False
    gan.compile(loss='binary_crossentropy', optimizer='adam')
    gan.train_on_batch(gan_X, gan_y)


In [None]:
import numpy as np
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, LeakyReLU, Concatenate, Input
from transformers import AutoTokenizer

import tensorflow as tf
import pandas as pd
from rdkit import Chem

# Tokenize the SMILES strings using the PubChem10M SMILES BPE tokenizer
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
smiles_list = Chem_data['Smiles'].tolist()
encoded_smiles = tokenizer(smiles_list, padding=True, truncation=True, max_length=100, return_tensors="pt")
vocab_size = len(tokenizer.get_vocab())

# Preprocess the properties
properties = Chem_data.drop(['Smiles'], axis=1).values.astype('float32')
num_properties = properties.shape[1]

# Define the architecture of the generator and discriminator
def make_generator():
    """Build the conditional generator: (100 + num_properties)-dim input, softmax over `vocab_size`."""
    layers = [
        Dense(128, input_dim=100+num_properties),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(512),
        LeakyReLU(alpha=0.2),
        Dense(vocab_size, activation='softmax'),
    ]
    return Sequential(layers)

def make_discriminator():
    """Build the discriminator: (molecule vector + properties) -> P(real).

    Input:  one vector of width vocab_size + num_properties.
    Output: a single sigmoid unit.
    """
    stack = [
        Dense(512, input_dim=vocab_size+num_properties),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid'),
    ]
    return Sequential(stack)

# Symbolic inputs of the combined model: a 100-wide noise vector and the
# property vector the molecule is conditioned on. They are concatenated into
# one tensor before being passed to the generator.
z = Input(shape=(100,))
prop = Input(shape=(num_properties,))
combined_inputs = Concatenate()([z, prop])

generator = make_generator()
discriminator = make_discriminator()

# Generate molecule from noise vector and properties
molecule = generator(combined_inputs)

# Concatenate properties to generated molecule so the discriminator judges
# the (molecule, properties) pair
gen_input = Concatenate()([molecule, prop])

# Compile the discriminator model on its own (used by
# discriminator.train_on_batch in the training loop)
discriminator.compile(loss='binary_crossentropy', optimizer='adam')

# Set discriminator to non-trainable before building the combined model.
# NOTE(review): the statement order here (compile, freeze, build + compile
# `gan`) presumably relies on Keras capturing `trainable` at compile time —
# confirm for the installed Keras version.
discriminator.trainable = False

# Compile the GAN model: (noise, properties) -> discriminator verdict
gan = Model([z, prop], discriminator(gen_input))
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Train the GAN model
batch_size = 32
num_epochs = 1000
noise_dim = 100  # must match the noise width the generator was built with
start_index = 0
end_index = start_index + batch_size


def _token_frequency_vectors(token_ids, attention_mask, width):
    """Map padded token-id rows to normalised token-frequency vectors.

    The generator emits a softmax over the vocabulary and the discriminator
    expects `width` molecule features, so real molecules are projected into
    that same space: one row per molecule, one column per vocabulary entry,
    each non-empty row summing to 1. Padding positions (mask == 0) are ignored.
    """
    token_ids = np.asarray(token_ids)
    weights = np.asarray(attention_mask, dtype='float32')
    vectors = np.zeros((token_ids.shape[0], width), dtype='float32')
    row_index = np.repeat(np.arange(token_ids.shape[0]), token_ids.shape[1])
    # np.add.at accumulates repeated (row, token) pairs instead of overwriting
    np.add.at(vectors, (row_index, token_ids.ravel()), weights.ravel())
    totals = vectors.sum(axis=1, keepdims=True)
    return vectors / np.maximum(totals, 1.0)


# np.asarray accepts both NumPy arrays and CPU PyTorch tensors
all_token_ids = np.asarray(encoded_smiles['input_ids'])
all_token_mask = np.asarray(encoded_smiles['attention_mask'])
num_samples = len(properties)
if num_samples == 0:
    raise ValueError("Chem_data is empty; nothing to train on")

for epoch in range(num_epochs):
    # Slide a batch-sized window over the dataset, wrapping around at the end,
    # so SMILES, token ids and properties always refer to the same rows.
    start_index = (epoch * batch_size) % num_samples
    end_index = start_index + batch_size
    real_smiles = Chem_data['Smiles'].iloc[start_index:end_index]
    batch_props = properties[start_index:end_index]
    current_size = len(batch_props)  # the last window may be shorter

    # Train the discriminator
    real_vectors = _token_frequency_vectors(
        all_token_ids[start_index:end_index],
        all_token_mask[start_index:end_index],
        vocab_size,
    )
    noise = np.random.normal(size=(current_size, noise_dim))
    # The Sequential generator has a single input: noise and properties
    # concatenated into one matrix.
    fake_smiles = generator.predict(np.concatenate([noise, batch_props], axis=1), verbose=0)
    real_labels = np.ones((current_size, 1))
    fake_labels = np.zeros((current_size, 1))
    discriminator.train_on_batch(np.concatenate([real_vectors, batch_props], axis=1), real_labels)
    discriminator.train_on_batch(np.concatenate([fake_smiles, batch_props], axis=1), fake_labels)

    # Train the generator: ask the (frozen) discriminator to call fakes real
    gan_labels = np.ones((current_size, 1))
    gan.train_on_batch([np.random.normal(size=(current_size, noise_dim)), batch_props], gan_labels)


In [None]:
import numpy as np
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, LeakyReLU, Concatenate, Input
from transformers import AutoTokenizer

import tensorflow as tf
import pandas as pd
from rdkit import Chem

# Tokenize the SMILES strings using the PubChem10M SMILES BPE tokenizer
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
smiles_list = Chem_data['Smiles'].tolist()
# Ask for NumPy arrays: everything downstream in this cell is Keras/NumPy, so
# PyTorch tensors ("pt") would only add a torch dependency and a conversion.
encoded_smiles = tokenizer(smiles_list, padding=True, truncation=True, max_length=100, return_tensors="np")
# Width of the generator output / discriminator molecule input
vocab_size = len(tokenizer.get_vocab())

# Preprocess the properties: every column except 'Smiles', as a float64 matrix
properties = Chem_data.drop(['Smiles'], axis=1).values.astype('float64')
num_properties = properties.shape[1]

def make_generator():
    """Build the two-input conditional generator (functional API).

    Inputs:  a 100-wide noise vector and a `num_properties`-wide property
             vector, concatenated inside the model.
    Output:  a softmax over the `vocab_size` tokenizer entries.
    """
    noise_in = Input(shape=(100,))
    props_in = Input(shape=(num_properties,))
    hidden = Concatenate()([noise_in, props_in])
    # Three Dense + LeakyReLU stages of growing width
    for width in (128, 256, 512):
        hidden = Dense(width)(hidden)
        hidden = LeakyReLU(alpha=0.2)(hidden)
    token_probs = Dense(vocab_size, activation='softmax')(hidden)
    return Model([noise_in, props_in], token_probs)

def make_discriminator():
    """Build the discriminator: (molecule vector + properties) -> P(real)."""
    model = Sequential()
    stages = (
        Dense(512, input_dim=vocab_size+num_properties),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid'),
    )
    for stage in stages:
        model.add(stage)
    return model

# Symbolic inputs of the combined model: a 100-wide noise vector and the
# property vector the molecule is conditioned on.
z = Input(shape=(100,))
prop = Input(shape=(num_properties,))

# The functional generator takes the two inputs separately
generator = make_generator()
molecule = generator([z, prop])

# The discriminator judges the (molecule, properties) pair
discriminator = make_discriminator()
gen_input = Concatenate()([molecule, prop])

# Compile the discriminator model on its own (used by
# discriminator.train_on_batch in the training loop)
discriminator.compile(loss='binary_crossentropy', optimizer='adam')

# Freeze the discriminator inside the combined model so that training `gan`
# only updates the generator. This must happen before `gan` is compiled.
discriminator.trainable = False

# Compile the GAN model: (noise, properties) -> discriminator verdict
gan = Model([z, prop], discriminator(gen_input))
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Train the GAN model
batch_size = 32
num_epochs = 1000
noise_dim = 100  # must match the noise width the generator was built with
start_index = 0
end_index = start_index + batch_size


def _token_frequency_vectors(token_ids, attention_mask, width):
    """Map padded token-id rows to normalised token-frequency vectors.

    The generator emits a softmax over the vocabulary and the discriminator
    expects `width` molecule features, so real molecules are projected into
    that same space: one row per molecule, one column per vocabulary entry,
    each non-empty row summing to 1. Padding positions (mask == 0) are ignored.
    """
    token_ids = np.asarray(token_ids)
    weights = np.asarray(attention_mask, dtype='float32')
    vectors = np.zeros((token_ids.shape[0], width), dtype='float32')
    row_index = np.repeat(np.arange(token_ids.shape[0]), token_ids.shape[1])
    # np.add.at accumulates repeated (row, token) pairs instead of overwriting
    np.add.at(vectors, (row_index, token_ids.ravel()), weights.ravel())
    totals = vectors.sum(axis=1, keepdims=True)
    return vectors / np.maximum(totals, 1.0)


# np.asarray accepts both NumPy arrays and CPU PyTorch tensors
all_token_ids = np.asarray(encoded_smiles['input_ids'])
all_token_mask = np.asarray(encoded_smiles['attention_mask'])
num_samples = len(properties)
if num_samples == 0:
    raise ValueError("Chem_data is empty; nothing to train on")

for epoch in range(num_epochs):
    # Slide a batch-sized window over the dataset, wrapping around at the end,
    # so SMILES, token ids and properties always refer to the same rows.
    start_index = (epoch * batch_size) % num_samples
    end_index = start_index + batch_size
    real_smiles = Chem_data['Smiles'].iloc[start_index:end_index].values.astype('str')
    batch_props = properties[start_index:end_index]
    current_size = len(batch_props)  # the last window may be shorter

    # Train the discriminator
    real_vectors = _token_frequency_vectors(
        all_token_ids[start_index:end_index],
        all_token_mask[start_index:end_index],
        vocab_size,
    )
    noise = np.random.normal(size=(current_size, noise_dim))
    fake_smiles = generator.predict([noise, batch_props], verbose=0)
    real_labels = np.ones((current_size, 1))
    fake_labels = np.zeros((current_size, 1))
    discriminator.train_on_batch(np.concatenate([real_vectors, batch_props], axis=1), real_labels)
    discriminator.train_on_batch(np.concatenate([fake_smiles, batch_props], axis=1), fake_labels)

    # Train the generator: ask the (frozen) discriminator to call fakes real
    gan_labels = np.ones((current_size, 1))
    gan.train_on_batch([np.random.normal(size=(current_size, noise_dim)), batch_props], gan_labels)


In [None]:
# Display the current value of `real_smiles` (it is assigned inside the
# training loops of the cells above) to inspect what a batch looks like.
real_smiles

In [None]:
#Only smiles
import numpy as np
import pandas as pd
import tensorflow as tf
from rdkit import Chem
from tensorflow.keras.layers import Dense, Input, LeakyReLU
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer

# Load and preprocess the data
smiles_list = Chem_data['Smiles'].tolist()

# Tokenize the SMILES strings using the PubChem10M SMILES BPE tokenizer
tokenizer = AutoTokenizer.from_pretrained("seyonec/PubChem10M_SMILES_BPE_450k")
# Ask for NumPy arrays: everything downstream in this cell is Keras/NumPy, so
# PyTorch tensors ("pt") would only add a torch dependency and a conversion.
encoded_smiles = tokenizer(smiles_list, padding=True, truncation=True, max_length=100, return_tensors="np")
# Width of the generator output / discriminator input
vocab_size = len(tokenizer.get_vocab())

# Define the generator model
def make_generator():
    """Build the unconditional generator: 100-wide noise -> softmax over vocab."""
    noise_in = Input(shape=(100,))
    hidden = noise_in
    # Three Dense + LeakyReLU stages of growing width
    for width in (128, 256, 512):
        hidden = Dense(width)(hidden)
        hidden = LeakyReLU(alpha=0.2)(hidden)
    token_probs = Dense(vocab_size, activation='softmax')(hidden)
    return Model(noise_in, token_probs)

# Define the discriminator model
def make_discriminator():
    """Build the discriminator: `vocab_size`-wide molecule vector -> P(real)."""
    return Sequential([
        Dense(512, input_dim=vocab_size),
        LeakyReLU(alpha=0.2),
        Dense(256),
        LeakyReLU(alpha=0.2),
        Dense(1, activation='sigmoid'),
    ])

# Noise input of the combined model (this variant is unconditional: the
# generator sees only the 100-wide noise vector, no properties)
z = Input(shape=(100,))
generator = make_generator()
molecule = generator(z)

# Compile the discriminator on its own (used by discriminator.train_on_batch).
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by recent Keras releases.
discriminator = make_discriminator()
discriminator.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0002, beta_1=0.5))

# Freeze the discriminator inside the combined model so that training `gan`
# only updates the generator. This must happen before `gan` is compiled.
discriminator.trainable = False

# Compile the GAN model: noise -> generator -> discriminator verdict
gan_input = z
gan_output = discriminator(molecule)
gan = Model(gan_input, gan_output)
gan.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.0002, beta_1=0.5))

# Train the GAN model
num_epochs = 1000
batch_size = 32
noise_dim = 100  # must match the noise width the generator was built with


def _token_frequency_vectors(token_ids, attention_mask, width):
    """Map padded token-id rows to normalised token-frequency vectors.

    The generator emits a softmax over the vocabulary and the discriminator
    expects `width` input features, so real molecules are projected into that
    same space: one row per molecule, one column per vocabulary entry, each
    non-empty row summing to 1. Padding positions (mask == 0) are ignored.
    """
    token_ids = np.asarray(token_ids)
    weights = np.asarray(attention_mask, dtype='float32')
    vectors = np.zeros((token_ids.shape[0], width), dtype='float32')
    row_index = np.repeat(np.arange(token_ids.shape[0]), token_ids.shape[1])
    # np.add.at accumulates repeated (row, token) pairs instead of overwriting
    np.add.at(vectors, (row_index, token_ids.ravel()), weights.ravel())
    totals = vectors.sum(axis=1, keepdims=True)
    return vectors / np.maximum(totals, 1.0)


# np.asarray accepts both NumPy arrays and CPU PyTorch tensors
all_token_ids = np.asarray(encoded_smiles['input_ids'])
all_token_mask = np.asarray(encoded_smiles['attention_mask'])
num_samples = len(smiles_list)
if num_samples == 0:
    raise ValueError("Chem_data is empty; nothing to train on")

for epoch in range(num_epochs):
    # Work on one mini-batch per step instead of the whole dataset: slide a
    # batch-sized window over the data, wrapping around at the end.
    window_start = (epoch * batch_size) % num_samples
    window_end = window_start + batch_size
    real_smiles = np.array(smiles_list[window_start:window_end])
    current_size = len(real_smiles)  # the last window may be shorter

    # Train the discriminator
    real_vectors = _token_frequency_vectors(
        all_token_ids[window_start:window_end],
        all_token_mask[window_start:window_end],
        vocab_size,
    )
    real_labels = np.ones((current_size, 1))
    fake_smiles = generator.predict(np.random.normal(size=(current_size, noise_dim)), verbose=0)
    fake_labels = np.zeros((current_size, 1))
    discriminator.train_on_batch(real_vectors, real_labels)
    discriminator.train_on_batch(fake_smiles, fake_labels)

    # Train the generator: ask the (frozen) discriminator to call fakes real
    gan_labels = np.ones((current_size, 1))
    gan.train_on_batch(np.random.normal(size=(current_size, noise_dim)), gan_labels)

   

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Concatenate

# Concatenate the input tensors
# NOTE(review): `input_tensor_1`, `input_tensor_2` and `sequential_model` are
# not defined anywhere in this cell — this looks like an unfilled template;
# confirm they exist in the session before running.
concatenated = Concatenate()([input_tensor_1, input_tensor_2])

# Pass the concatenated tensor through the sequential model
output = sequential_model(concatenated)


# Define the input layers for the generator and discriminator
# NOTE(review): the property width is hard-coded to 14 here, whereas the
# other cells derive it as `num_properties` — confirm they agree.
z = Input(shape=(100,))
prop = Input(shape=(14,))

# Generate molecule from noise vector and properties
# NOTE(review): this call needs a two-input generator; `generator` and
# `discriminator` are whatever earlier cells left in the session — confirm
# which variants are live.
molecule = generator([z, prop])

# Concatenate properties to generated molecule
gen_input = Concatenate()([molecule, prop])

# Compile the discriminator model
discriminator.compile(loss='binary_crossentropy', optimizer='adam')

# Set discriminator to non-trainable
discriminator.trainable = False

# Compile the GAN model
gan = Model([z, prop], discriminator(gen_input))
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Train the GAN model
for epoch in range(num_epochs):
    # Train the discriminator
    # NOTE(review): `smiles_generator` is not defined in this cell — confirm
    # it yields batches already shaped for the discriminator input.
    real_smiles = next(smiles_generator)
    real_labels = np.ones((batch_size, 1))
    # NOTE(review): the noise has `batch_size` rows but `properties` is passed
    # unsliced — confirm both inputs have the same number of rows.
    fake_smiles = generator.predict([np.random.normal(size=(batch_size, 100)), properties])
    fake_labels = np.zeros((batch_size, 1))
    # NOTE(review): inside `gan` the discriminator receives molecule and
    # properties concatenated (`gen_input`), but here it is trained on the
    # molecule vectors alone — confirm the input widths match.
    discriminator.train_on_batch(real_smiles, real_labels)
    discriminator.train_on_batch(fake_smiles, fake_labels)

    # Train the generator
    gan_labels = np.ones((batch_size, 1))
    gan.train_on_batch([np.random.normal(size=(batch_size, 100)), properties], gan_labels)


In [None]:
# Print the layer-by-layer summary of the combined GAN model (`gan` is
# whichever combined model was built most recently in the session)
gan.summary()

In [None]:
pip install molgen

In [None]:
import numpy as np
from rdkit import Chem
from molgen.graph import GraphGenerator, MolGraph

# Define a function to preprocess your dataset
def _iter_csv_records(csv_path):
    """Yield (smiles, float32 property vector) pairs from a two-column CSV."""
    import csv

    with open(csv_path, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip the header row (tolerates an empty file)
        for row in reader:
            if not row:
                continue  # blank line
            if len(row) != 2:
                raise ValueError(
                    f"line {reader.line_num}: expected 2 fields "
                    f"(Smiles, Properties), got {len(row)}"
                )
            smiles, props = row
            yield smiles.strip(), np.array(props.split(), dtype=np.float32)


def preprocess_dataset(dataset_path):
    """Build a list of property-annotated MolGraph objects from a dataset.

    Args:
        dataset_path: Either a path to a CSV file with a header row and two
            columns, "Smiles" and "Properties" (the latter a
            whitespace-separated list of floats), or an in-memory table with
            a 'Smiles' column whose remaining columns are numeric properties
            (e.g. the `Chem_data` DataFrame).

    Returns:
        A list with one MolGraph per SMILES string that RDKit could parse;
        each has its "properties" set to a float32 vector. Unparseable SMILES
        are skipped.

    Raises:
        ValueError: If a non-blank CSV row does not have exactly two fields.
    """
    import os

    if isinstance(dataset_path, (str, bytes, os.PathLike)):
        records = _iter_csv_records(dataset_path)
    else:
        # In-memory table: pair each SMILES with its row of property values
        records = zip(
            dataset_path['Smiles'].tolist(),
            dataset_path.drop(columns=['Smiles']).to_numpy(dtype=np.float32),
        )

    # Load the dataset into a list of MolGraphs
    molgraphs = []
    for smiles, props in records:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:  # MolFromSmiles returns None for invalid SMILES
            molgraph = MolGraph(mol)
            molgraph.set_property("properties", props)
            molgraphs.append(molgraph)

    return molgraphs

# Preprocess your dataset
# NOTE(review): `Chem_data` is used as a DataFrame elsewhere in this file;
# confirm preprocess_dataset accepts it rather than a file path.

molgraphs = preprocess_dataset(Chem_data)

# Create a GraphGenerator object
# NOTE: this rebinds `generator`, the name the Keras GAN cells above use for
# their generator model.
# NOTE(review): the GraphGenerator/MolGraph API comes from the `molgen`
# package — confirm these classes and arguments exist in the installed version.
generator = GraphGenerator(num_nodes=9, latent_dim=32)

# Train the generator on your dataset
generator.fit(molgraphs, epochs=100, batch_size=32)

# Generate new molecules based on specific properties
# (the target vector below has 14 entries, one per property)
props = np.array([0.2, 0.4, 0.1, 0.7, 0.9, 0.3, 0.5, 0.8, 0.6, 0.2, 0.4, 0.1, 0.7, 0.9], dtype=np.float32)
generated_mol = generator.sample(props)
# NOTE(review): assumes `sample` returns an RDKit Mol — TODO confirm
generated_smiles = Chem.MolToSmiles(generated_mol)

print(generated_smiles)


## New try

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load and preprocess data
# NOTE(review): the column name below is an empty string — looks like an
# unfilled placeholder for the descriptor columns; confirm the intended
# feature selection.
X = Chem_data['']
# NOTE(review): `dataset` is not defined in this cell, and the SMILES column
# is spelled 'Smiles' elsewhere in this file — confirm the intended source.
y = dataset['smiles']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Normalize the input features (z-score). The test split is scaled with the
# training mean/std so no test statistics are used.
X_train_normalized = (X_train - X_train.mean()) / X_train.std()
X_test_normalized = (X_test - X_train.mean()) / X_train.std()

# Train the model
# NOTE(review): the target `y` is a SMILES column (presumably strings), while
# a regressor presumably needs numeric targets — confirm this model choice.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_normalized, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_normalized)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean squared error: {mse}")

# Generate SMILES strings from new input descriptors
# NOTE(review): the literal `...` below is Python's Ellipsis object, not
# elided numbers — replace it with real descriptor values (one per feature
# column) before running. `new_descriptors` is also a plain nested list at
# the point it is combined with the training mean/std.
new_descriptors = [[1.2, 3.4, ..., 0.5]]
new_descriptors_normalized = (new_descriptors - X_train.mean()) / X_train.std()
new_smiles = rf_model.predict(new_descriptors_normalized)
