In [None]:
import pandas as pd
from padelpy import from_smiles

def calculate_molecular_descriptors(descriptor_list, output_filename='all_descriptors.csv'):
    """
    Calculate molecular descriptors using padelpy from a list of SMILES IDs.

    For every entry, padelpy writes the descriptors to an intermediate
    '<index>.csv' file in the current working directory (these files are left
    in place). The intermediate files are read back and stacked row-wise into
    a single table.

    Parameters:
    - descriptor_list (list): List of SMILES IDs.
    - output_filename (str, optional): Output filename to save the descriptors (default is 'all_descriptors.csv').

    Returns:
    - None: Saves the calculated descriptors to a CSV file.
    """
    total = len(descriptor_list)

    # Collect the per-molecule frames and concatenate once at the end;
    # concatenating inside the loop re-copies all previous rows each iteration.
    frames = []

    # Loop through each SMILES ID
    for index, smile_id in enumerate(descriptor_list):
        print(f"Calculating descriptors for SMILES ID {index + 1}/{total}")
        output_csv = f'{index}.csv'
        from_smiles(smile_id, fingerprints=False, output_csv=output_csv, timeout=600)
        frames.append(pd.read_csv(output_csv))

    # pd.concat rejects an empty list, so an empty input yields an empty table
    full_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Save all descriptors to a single CSV file
    full_df.to_csv(output_filename, index=False)
    print(f"Descriptors saved to {output_filename}")

# Example usage:
if __name__ == "__main__":
    # Placeholder entries: swap in your own list of SMILES IDs before running
    descriptor_list = [
        'SMILES_ID_1',
        'SMILES_ID_2',
        'SMILES_ID_3',
    ]

    # Compute descriptors for every entry and write them to a single CSV
    calculate_molecular_descriptors(
        descriptor_list,
        output_filename='all_descriptors.csv',
    )


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis, boxcox, yeojohnson
from sklearn.preprocessing import StandardScaler
from MultiColumnLabelEncoder import MultiColumnLabelEncoder

def preprocess_multi_omics_datasets(transcriptomics_df, proteomics_df, fusion_protein_df, crispr_df, cn_alteration_df, mutation_df):
    """
    Preprocess multiple omics datasets for drug designing and model development.

    Every dataset goes through the same pipeline: missing values are filled
    (column mean, then 0), categorical variables are encoded with
    MultiColumnLabelEncoder, histograms of per-column skewness and kurtosis
    are shown, and the columns from position 12 onward are transformed
    depending on their kurtosis. The input DataFrames are not modified.

    Parameters:
    transcriptomics_df (DataFrame): Transcriptomics dataset.
    proteomics_df (DataFrame): Proteomics dataset.
    fusion_protein_df (DataFrame): Fusion Protein dataset.
    crispr_df (DataFrame): CRISPR KO dataset.
    cn_alteration_df (DataFrame): Copy number alteration dataset.
    mutation_df (DataFrame): Mutation dataset.

    Returns:
    tuple: Preprocessed DataFrames for each omics dataset, in the same order
    as the parameters.
    """

    def preprocess_single_dataset(df, dataset_name):
        """
        Preprocess a single omics dataset.

        Parameters:
        df (DataFrame): Input dataset (left untouched; a copy is processed).
        dataset_name (str): Name of the dataset, used in log messages and plot titles.

        Returns:
        DataFrame: Preprocessed dataset.
        """

        print(f"Preprocessing {dataset_name} dataset...")

        # Work on a copy so the caller's DataFrame is never mutated
        df = df.copy()

        # A frame without columns has nothing to impute, encode or transform,
        # and np.apply_along_axis cannot iterate over zero columns.
        if df.shape[1] == 0:
            print(f"Preprocessing for {dataset_name} dataset complete.")
            return df

        # Handling missing values
        print(f"Handling missing values for {dataset_name} dataset...")
        # numeric_only=True: the mean is undefined for non-numeric columns;
        # whatever is still missing afterwards is filled with 0.
        df = df.fillna(df.mean(numeric_only=True))
        df = df.fillna(0)

        # Encoding categorical variables
        print(f"Encoding categorical variables for {dataset_name} dataset...")
        mcle = MultiColumnLabelEncoder()
        df = mcle.fit_transform(df)

        # Identifying skewness
        print(f"Identifying skewness for {dataset_name} dataset...")
        skewness_values = np.apply_along_axis(skew, axis=0, arr=df)
        plt.hist(x=skewness_values)
        plt.title(f'Skewness Distribution for {dataset_name}')
        plt.show()

        # Identifying kurtosis
        print(f"Identifying kurtosis for {dataset_name} dataset...")
        kurtosis_values = np.apply_along_axis(kurtosis, axis=0, arr=df)
        plt.hist(x=kurtosis_values)
        plt.title(f'Kurtosis Distribution for {dataset_name}')
        plt.show()

        # Transforming columns to reduce skewness and kurtosis
        print(f"Transforming columns to reduce skewness and kurtosis for {dataset_name} dataset...")
        start_column = 12  # Adjust as per your dataset

        # Explicit copy: writing into a bare iloc slice is chained assignment
        X = df.iloc[:, start_column:].copy()
        # NOTE(review): scipy.stats.kurtosis defaults to Fisher's definition
        # (normal distribution -> 0), so this threshold is applied to excess
        # kurtosis; confirm that 3 is the intended cut-off on that scale.
        kurtosis_threshold = 3

        for i, kurt in enumerate(kurtosis_values[start_column:]):
            column = X.iloc[:, i]
            if kurt > kurtosis_threshold:
                try:
                    transformed_column, _ = boxcox(column + 1)
                except ValueError:
                    # boxcox rejected the data; fall back to Yeo-Johnson
                    transformed_column, _ = yeojohnson(column + 1)
                X.iloc[:, i] = transformed_column
            elif kurt < kurtosis_threshold:
                column_min = np.min(column)
                if column_min >= 0:
                    X.iloc[:, i] = np.sqrt(column)
                else:
                    # Shift so the smallest value maps to log(1) = 0
                    X.iloc[:, i] = np.log(column - column_min + 1)
            # kurt == threshold (or NaN): the column is left unchanged

        df.iloc[:, start_column:] = X

        print(f"Preprocessing for {dataset_name} dataset complete.")
        return df

    # Process each dataset
    processed_transcriptomics = preprocess_single_dataset(transcriptomics_df, "Transcriptomics")
    processed_proteomics = preprocess_single_dataset(proteomics_df, "Proteomics")
    processed_fusion_protein = preprocess_single_dataset(fusion_protein_df, "Fusion Protein")
    processed_crispr = preprocess_single_dataset(crispr_df, "CRISPR KO")
    processed_cn_alteration = preprocess_single_dataset(cn_alteration_df, "Copy Number Alteration")
    processed_mutation = preprocess_single_dataset(mutation_df, "Mutation")

    print("Data preprocessing for all datasets complete.")

    return (processed_transcriptomics, processed_proteomics, processed_fusion_protein, processed_crispr, processed_cn_alteration, processed_mutation)

# Example usage
if __name__ == "__main__":
    # Placeholder frames: substitute the real loading code for each omics layer
    transcriptomics_df = pd.DataFrame()
    proteomics_df = pd.DataFrame()
    fusion_protein_df = pd.DataFrame()
    crispr_df = pd.DataFrame()
    cn_alteration_df = pd.DataFrame()
    mutation_df = pd.DataFrame()

    # Run the shared preprocessing pipeline over all six datasets
    (
        processed_transcriptomics,
        processed_proteomics,
        processed_fusion_protein,
        processed_crispr,
        processed_cn_alteration,
        processed_mutation,
    ) = preprocess_multi_omics_datasets(
        transcriptomics_df,
        proteomics_df,
        fusion_protein_df,
        crispr_df,
        cn_alteration_df,
        mutation_df,
    )

    # Peek at one processed result; the other datasets are accessed the same way
    print("Processed Transcriptomics Data:")
    print(processed_transcriptomics.head())
    # Similarly, access other processed datasets as needed


In [None]:
from __future__ import division
import numpy as np
import theano
import theano.tensor as T
import pickle
from collections import OrderedDict

# Small constant added to the denominator of the Adam step
# (T.sqrt(new_v) + epsilon) so the update never divides by zero.
epsilon = 1e-8

def relu(x):
    """Rectified linear unit: entries of x below zero are replaced by 0."""
    is_negative = x < 0
    return T.switch(is_negative, 0, x)

def create_weight(dim_input, dim_output, sigma_init=0.01):
    """Draw a (dim_input, dim_output) matrix from a zero-mean normal with
    standard deviation sigma_init, cast to theano's configured float type."""
    shape = (dim_input, dim_output)
    weights = np.random.normal(0, sigma_init, shape)
    return weights.astype(theano.config.floatX)

def create_bias(dim_output):
    """Return an all-zero bias vector of length dim_output in theano's
    configured float type."""
    bias = np.zeros(dim_output)
    return bias.astype(theano.config.floatX)

def variational_autoencoder(x_train, continuous=True, hu_encoder=500, hu_decoder=500, n_latent=20,
                            b1=0.95, b2=0.999, batch_size=100, learning_rate=0.001, lam=0, L=1):
    """
    Variational Autoencoder for dimensionality reduction of omics datasets.

    Builds the encoder/decoder graph and the compiled theano functions, then
    encodes x_train batch by batch.

    NOTE: no training step is executed inside this function (the compiled
    `update` function is created but never called), so the returned encoding
    comes from the randomly initialised encoder weights.

    Parameters:
    - x_train (numpy.ndarray): Input data matrix of shape (N, features) where N is the number of samples.
    - continuous (bool, optional): Whether the data is continuous (default is True).
    - hu_encoder (int, optional): Number of units in the encoder hidden layer (default is 500).
    - hu_decoder (int, optional): Number of units in the decoder hidden layer (default is 500).
    - n_latent (int, optional): Dimensionality of the latent space (default is 20).
    - b1 (float, optional): Adam optimizer parameter (default is 0.95).
    - b2 (float, optional): Adam optimizer parameter (default is 0.999).
    - batch_size (int, optional): Batch size for training (default is 100).
    - learning_rate (float, optional): Learning rate for training (default is 0.001).
    - lam (float, optional): L2 regularization coefficient (default is 0).
    - L (int, optional): Number of samples z^(i,l) per datapoint (default is 1).

    Returns:
    - numpy.ndarray: Transformed data matrix of shape (N, n_latent).
    """
    [N, features] = x_train.shape

    # Define weights and biases
    W_xh = theano.shared(create_weight(features, hu_encoder), name='W_xh')
    b_xh = theano.shared(create_bias(hu_encoder), name='b_xh')

    W_hmu = theano.shared(create_weight(hu_encoder, n_latent), name='W_hmu')
    b_hmu = theano.shared(create_bias(n_latent), name='b_hmu')

    W_hsigma = theano.shared(create_weight(hu_encoder, n_latent), name='W_hsigma')
    b_hsigma = theano.shared(create_bias(n_latent), name='b_hsigma')

    W_zh = theano.shared(create_weight(n_latent, hu_decoder), name='W_zh')
    b_zh = theano.shared(create_bias(hu_decoder), name='b_zh')

    params = OrderedDict([
        ("W_xh", W_xh), ("b_xh", b_xh), ("W_hmu", W_hmu), ("b_hmu", b_hmu),
        ("W_hsigma", W_hsigma), ("b_hsigma", b_hsigma), ("W_zh", W_zh), ("b_zh", b_zh)
    ])

    if continuous:
        # Gaussian decoder: separate heads for the mean and the log-variance
        W_hxmu = theano.shared(create_weight(hu_decoder, features), name='W_hxmu')
        b_hxmu = theano.shared(create_bias(features), name='b_hxmu')

        W_hxsig = theano.shared(create_weight(hu_decoder, features), name='W_hxsigma')
        b_hxsig = theano.shared(create_bias(features), name='b_hxsigma')

        params.update({'W_hxmu': W_hxmu, 'b_hxmu': b_hxmu, 'W_hxsigma': W_hxsig, 'b_hxsigma': b_hxsig})
    else:
        # Bernoulli decoder: a single sigmoid output head
        W_hx = theano.shared(create_weight(hu_decoder, features), name='W_hx')
        b_hx = theano.shared(create_bias(features), name='b_hx')

        params.update({'W_hx': W_hx, 'b_hx': b_hx})

    # Adam first/second moment accumulators, one per parameter
    m = OrderedDict()
    v = OrderedDict()

    for key, value in params.items():
        m[key] = theano.shared(np.zeros_like(value.get_value()).astype(theano.config.floatX), name='m_' + key)
        v[key] = theano.shared(np.zeros_like(value.get_value()).astype(theano.config.floatX), name='v_' + key)

    # Keep the numeric array for encoding; the shared variable is only used
    # symbolically (inside `givens`) by the compiled update function.
    x_data = x_train.astype(theano.config.floatX)
    x_shared = theano.shared(x_data, name="x_train")

    def encoder(x):
        """Map a batch x to the mean and log-variance of q(z|x)."""
        h_encoder = relu(T.dot(x, params['W_xh']) + params['b_xh'].dimshuffle('x', 0))
        mu = T.dot(h_encoder, params['W_hmu']) + params['b_hmu'].dimshuffle('x', 0)
        log_sigma = T.dot(h_encoder, params['W_hsigma']) + params['b_hsigma'].dimshuffle('x', 0)
        return mu, log_sigma

    def sampler(mu, log_sigma):
        """Reparameterised draw of L latent samples per datapoint; result is (L, batch, n_latent)."""
        seed = 42
        srng = theano.tensor.shared_randomstreams.RandomStreams(seed=seed)
        eps = srng.normal((L, mu.shape[0], n_latent))
        z = mu + T.exp(0.5 * log_sigma) * eps
        return z

    def decoder(x, z):
        """Reconstruct x from z and return (reconstruction, log p(x|z) per datapoint)."""
        h_decoder = relu(T.dot(z, params['W_zh']) + params['b_zh'].dimshuffle('x', 0))
        if continuous:
            reconstructed_x = T.dot(h_decoder, params['W_hxmu']) + params['b_hxmu'].dimshuffle('x', 0)
            log_sigma_decoder = T.dot(h_decoder, params['W_hxsigma']) + params['b_hxsigma']
            # Sum over features (axis 2), average over the L samples (axis 0)
            logpxz = (-(0.5 * np.log(2 * np.pi) + 0.5 * log_sigma_decoder) -
                      0.5 * ((x - reconstructed_x)**2 / T.exp(log_sigma_decoder))).sum(axis=2).mean(axis=0)
        else:
            reconstructed_x = T.nnet.sigmoid(T.dot(h_decoder, params['W_hx']) + params['b_hx'].dimshuffle('x', 0))
            logpxz = - T.nnet.binary_crossentropy(reconstructed_x, x).sum(axis=2).mean(axis=0)
        return reconstructed_x, logpxz

    def get_adam_updates(gradients, epoch):
        """Build the Adam update rules; the step is added (the objective is maximised)."""
        updates = OrderedDict()
        gamma = T.sqrt(1 - b2 ** epoch) / (1 - b1 ** epoch)
        # The loop variables must not be called `m`/`v`: assigning those names
        # would make them local to this function and hide the moment dicts.
        values_iterable = zip(params.keys(), params.values(), gradients, m.values(), v.values())
        for name, parameter, gradient, m_prev, v_prev in values_iterable:
            new_m = b1 * m_prev + (1. - b1) * gradient
            new_v = b2 * v_prev + (1. - b2) * (gradient ** 2)
            updates[parameter] = parameter + learning_rate * gamma * new_m / (T.sqrt(new_v) + epsilon)
            if 'W' in name:
                # L2 penalty on weight matrices only, scaled by the batch fraction
                updates[parameter] -= learning_rate * lam * (parameter * np.float32(batch_size / N))
            updates[m_prev] = new_m
            updates[v_prev] = new_v
        return updates

    def create_gradientfunctions(x_train):
        """Compile the update, likelihood, encode and decode theano functions."""
        x = T.matrix("x")
        epoch = T.scalar("epoch")
        mu, log_sigma = encoder(x)
        z = sampler(mu, log_sigma)
        reconstructed_x, logpxz = decoder(x, z)
        KLD = 0.5 * T.sum(1 + log_sigma - mu**2 - T.exp(log_sigma), axis=1)
        logpx = T.mean(logpxz + KLD)
        gradients = T.grad(logpx, list(params.values()))
        updates = get_adam_updates(gradients, epoch)
        batch = T.iscalar('batch')
        # `batch_size` is the integer argument of the outer function; it must
        # not be redefined from x here, or the substitution for x would
        # depend on x itself.
        givens = {
            x: x_train[batch * batch_size:(batch + 1) * batch_size, :]
        }
        update = theano.function([batch, epoch], logpx, updates=updates, givens=givens)
        likelihood = theano.function([x], logpx)
        encode = theano.function([x], z)
        decode = theano.function([z], reconstructed_x)
        return update, likelihood, encode, decode

    def transform_data(data):
        """Encode a numeric (N, features) array batch by batch into (N, n_latent)."""
        transformed_x = np.zeros((N, n_latent))
        # Step through every row, including a final partial batch
        for start in range(0, N, batch_size):
            stop = start + batch_size
            # encode returns (L, batch, n_latent); average over the L draws
            # (a no-op for the default L=1)
            transformed_x[start:stop, :] = encode(data[start:stop, :]).mean(axis=0)
        return transformed_x

    def save_parameters(path):
        """Pickle the parameters and Adam moments into params.pkl, m.pkl and v.pkl under path."""
        for filename, shared_vars in (("/params.pkl", params), ("/m.pkl", m), ("/v.pkl", v)):
            with open(path + filename, "wb") as handle:
                pickle.dump({name: var.get_value() for name, var in shared_vars.items()}, handle)

    def load_parameters(path):
        """Restore parameters and Adam moments written by save_parameters.

        NOTE: pickle.load executes arbitrary code from the file; only load
        files from a trusted source.
        """
        with open(path + "/params.pkl", "rb") as handle:
            p_list = pickle.load(handle)
        with open(path + "/m.pkl", "rb") as handle:
            m_list = pickle.load(handle)
        with open(path + "/v.pkl", "rb") as handle:
            v_list = pickle.load(handle)
        for name in p_list.keys():
            params[name].set_value(p_list[name].astype(theano.config.floatX))
            m[name].set_value(m_list[name].astype(theano.config.floatX))
            v[name].set_value(v_list[name].astype(theano.config.floatX))

    update, likelihood, encode, decode = create_gradientfunctions(x_shared)

    # Training loop (if needed)

    # Example usage:
    # transformed_data = transform_data(x_data)
    # save_parameters("model_path")
    # load_parameters("model_path")

    return transform_data(x_data)

# Example usage:
if __name__ == "__main__":
    # Random stand-in for an omics matrix (100 samples x 500 features);
    # replace with your data
    x_train = np.random.rand(100, 500)

    # Reduce the feature dimension with the VAE
    transformed_data = variational_autoencoder(x_train)

    # Report the size of the encoded matrix; save or reuse it as needed
    print(f"Transformed data shape: {transformed_data.shape}")

In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import GaussianNoise, BatchNormalization, Dropout
import pandas as pd
import numpy as np

def build_denoising_autoencoder(input_shape, data=None):
    """
    Build and train a denoising autoencoder model for dimensionality reduction.

    The network is input -> GaussianNoise(0.25) -> Dense(2000, relu) ->
    Dropout(0.2) -> Dense(input_shape[0], relu), trained to reconstruct its
    own input with mean squared error.

    Parameters:
    input_shape (tuple): Shape of the input data; input_shape[0] is used as
        the width of the reconstruction layer.
    data (optional): Training matrix. When omitted, the module-level
        X_resampled is used.

    Returns:
    tuple: Trained autoencoder model, encoder model, and encoded data DataFrame.
    """
    if data is None:
        # Fallback for callers that rely on the global training matrix
        data = X_resampled

    # Define the input layer
    input_data = keras.Input(shape=input_shape)

    # Add Gaussian noise to the input data
    noisy_input = GaussianNoise(0.25)(input_data)  # Adjust the standard deviation for more substantial noise

    # Encoder layers
    encoded = layers.Dense(2000, activation='relu')(noisy_input)
    encoded = Dropout(0.2)(encoded)  # Adding Dropout for regularization

    # Decoder layers
    decoded = layers.Dense(input_shape[0], activation='relu')(encoded)

    # Autoencoder model
    autoencoder = keras.Model(input_data, decoded)
    autoencoder.summary()

    # Compile the autoencoder
    optimizer = keras.optimizers.RMSprop(learning_rate=0.001)
    autoencoder.compile(optimizer=optimizer, loss='mean_squared_error')

    # Train the autoencoder: the input is also the reconstruction target
    autoencoder.fit(data, data,
                    epochs=50,
                    batch_size=25,
                    shuffle=True,
                    validation_split=0.3,
                    callbacks=[EarlyStopping('val_loss', patience=10)])

    # Define the encoder model (up to the bottleneck layer); it shares its
    # layers, and therefore its trained weights, with the autoencoder
    encoder = keras.Model(input_data, encoded)

    # Get the encoded data
    encoded_data = encoder.predict(data)
    encoded_df = pd.DataFrame(encoded_data)

    return autoencoder, encoder, encoded_df

# Placeholder input matrix; X_resampled must be replaced with the actual data
X_resampled = pd.DataFrame()

# Train the denoising autoencoder and collect the encoded representation
autoencoder, encoder, encoded_df = build_denoising_autoencoder(
    input_shape=(20532,),
)

# Example: merge the reduced representation with the other omics tables,
# placing all of them side by side as columns
concatenated_df = pd.concat(
    [
        encoded_df,
        transcriptomics_df,
        proteomics_df,
        fusion_protein_df,
        crispr_df,
        cn_alteration_df,
        mutation_df,
    ],
    axis=1,
)

# Example usage
print("Concatenated DataFrame:")
print(concatenated_df.head())


Explanation:
Function build_denoising_autoencoder:

Constructs a denoising autoencoder model using TensorFlow/Keras.
Adds Gaussian noise to the input data, defines encoder and decoder layers, and compiles the model.
Trains the autoencoder on X_resampled data.
Returns the trained autoencoder model, encoder model (up to the bottleneck layer), and DataFrame encoded_df containing encoded data.
Concatenation of Multiomics Datasets:

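After dimensionality reduction using the encoder, concatenate the encoded data (encoded_df) with the other omics datasets (transcriptomics_df, proteomics_df, etc.).
With axis=1, pd.concat merges the datasets by columns and aligns their rows on the index, so all DataFrames should share the same sample index; rows whose index is missing from a dataset are filled with NaN. Change axis according to how you want to merge the datasets (axis=0 stacks them by rows instead).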
Example Usage:

Replace X_resampled with your actual input data.
Use the returned models (autoencoder, encoder) and encoded data (encoded_df) for further analysis or modeling tasks.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam, SGD
import tensorflow as tf
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Assuming you have X_train, Y_train, X_test, Y_test defined

# Combine X_train and Y_train into a single DataFrame
# The target is appended after the features, so it ends up as the last column;
# objective() depends on this layout (features = iloc[:, :-1], target = iloc[:, -1]).
# NOTE(review): with axis=1 pd.concat aligns rows on the index; this presumably
# expects X_train and Y_train to share the same index and Y_train to be a
# single column - confirm against the data loading code.
train_data = pd.concat([pd.DataFrame(X_train), pd.DataFrame(Y_train)], axis=1)

# Define the neural network model
def create_model(params):
    """
    Build and compile the regression network for one hyperparameter setting.

    The network has four hidden Dense layers, each followed by Dropout, and a
    single linear output unit. The input width is taken from the module-level
    X_train.

    Parameters:
    params (dict): Hyperparameters with the keys 'units1'..'units4',
        'dropout1'..'dropout4', 'activation', 'optimizer' ('adam' or 'sgd')
        and 'learning_rate'.

    Returns:
    The compiled model (loss: mean squared error).

    Raises:
    ValueError: If params['optimizer'] is neither 'adam' nor 'sgd'.
    """
    # Validate before building anything, so an unsupported name fails with a
    # clear message instead of an unbound `optimizer` variable later on.
    optimizer_name = params['optimizer']
    if optimizer_name not in ('adam', 'sgd'):
        raise ValueError(
            f"Unsupported optimizer {optimizer_name!r}; expected 'adam' or 'sgd'"
        )

    model = Sequential()
    # Only the first hidden layer declares the input shape
    model.add(Dense(int(params['units1']), input_shape=(X_train.shape[1],), activation=params['activation']))
    model.add(Dropout(params['dropout1']))
    for layer in (2, 3, 4):
        # Unit counts may arrive as floats (hp.quniform), hence the int() cast
        model.add(Dense(int(params[f'units{layer}']), activation=params['activation']))
        model.add(Dropout(params[f'dropout{layer}']))
    model.add(Dense(1, activation='linear'))

    if optimizer_name == 'adam':
        optimizer = Adam(learning_rate=params['learning_rate'])
    else:
        optimizer = SGD(learning_rate=params['learning_rate'])

    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

# Define the objective function for hyperopt
def objective(params):
    """
    Hyperopt objective: mean validation loss over a 10-fold cross-validation.

    For each fold a fresh model is built from `params`, trained for up to 50
    epochs with early stopping on the held-out fold, and evaluated on that
    fold. Features are all columns of train_data except the last; the last
    column is the target.

    Returns:
    dict: {'loss': mean of the fold losses, 'status': STATUS_OK}.
    """
    features = train_data.iloc[:, :-1]
    target = train_data.iloc[:, -1]

    splitter = KFold(n_splits=10, shuffle=True, random_state=42)
    fold_losses = []

    for fit_idx, holdout_idx in splitter.split(train_data):
        Y_fit = target.iloc[fit_idx]
        Y_holdout = target.iloc[holdout_idx]

        # Scaling statistics come from the training part of the fold only
        standardizer = StandardScaler()
        X_fit = standardizer.fit_transform(features.iloc[fit_idx])
        X_holdout = standardizer.transform(features.iloc[holdout_idx])

        # Fresh network for every fold
        net = create_model(params)

        stopper = EarlyStopping(
            monitor='val_loss',
            patience=int(params['early_stopping_patience']),
            restore_best_weights=True,
        )
        net.fit(
            X_fit, Y_fit,
            epochs=50,
            batch_size=int(params['batch_size']),
            validation_data=(X_holdout, Y_holdout),
            callbacks=[stopper],
            verbose=0,
        )

        # Loss of this fold's model on the held-out part
        fold_losses.append(net.evaluate(X_holdout, Y_holdout, verbose=0))

    return {'loss': np.mean(fold_losses), 'status': STATUS_OK}

# Define the hyperparameter space
# quniform values are sampled as floats, which is why the consumers cast
# units, batch size and patience with int().
space = {
    'units1': hp.quniform('units1', 32, 512, 32),
    'dropout1': hp.uniform('dropout1', 0.0, 0.5),
    'units2': hp.quniform('units2', 32, 512, 32),
    'dropout2': hp.uniform('dropout2', 0.0, 0.5),
    'units3': hp.quniform('units3', 32, 512, 32),
    'dropout3': hp.uniform('dropout3', 0.0, 0.5),
    'units4': hp.quniform('units4', 32, 512, 32),
    'dropout4': hp.uniform('dropout4', 0.0, 0.5),
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-5), np.log(1e-2)),
    'batch_size': hp.quniform('batch_size', 16, 128, 16),
    'early_stopping_patience': hp.quniform('early_stopping_patience', 5, 20, 1),
    'activation': hp.choice('activation', ['relu', 'tanh', 'sigmoid']),
    'optimizer': hp.choice('optimizer', ['adam', 'sgd'])
}

# Run the optimization
# Each of the 50 evaluations calls objective(), i.e. trains 10 models (one per fold).
trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=50, trials=trials)

print("Best hyperparameters found: ", best)

# Create the model with the best hyperparameters
# `best` holds the hp.choice entries as positions in their option lists, so
# 'activation' and 'optimizer' are mapped back to names by indexing the same
# lists that were given to hp.choice above (keep the two in sync).
best_params = {
    'units1': int(best['units1']),
    'dropout1': best['dropout1'],
    'units2': int(best['units2']),
    'dropout2': best['dropout2'],
    'units3': int(best['units3']),
    'dropout3': best['dropout3'],
    'units4': int(best['units4']),
    'dropout4': best['dropout4'],
    'learning_rate': best['learning_rate'],
    'batch_size': int(best['batch_size']),
    'early_stopping_patience': int(best['early_stopping_patience']),
    'activation': ['relu', 'tanh', 'sigmoid'][best['activation']],
    'optimizer': ['adam', 'sgd'][best['optimizer']]
}

# Training the final model with the best hyperparameters
# 10% of the training data is held out as the validation set for early stopping.
X_train_final, X_val_final, Y_train_final, Y_val_final = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)

# Standardize the data
# The scaler is fitted on the 90% training part only and reused for the
# validation and test sets.
scaler = StandardScaler()
X_train_final = scaler.fit_transform(X_train_final)
X_val_final = scaler.transform(X_val_final)

# Create and compile the model
final_model = create_model(best_params)

# Train the model
final_model.fit(
    X_train_final, Y_train_final,
    epochs=50,
    batch_size=best_params['batch_size'],
    validation_data=(X_val_final, Y_val_final),
    callbacks=[EarlyStopping(monitor='val_loss', patience=best_params['early_stopping_patience'], restore_best_weights=True)],
    verbose=1
)

# Evaluate the model on the test set
X_test_scaled = scaler.transform(X_test)
Y_test_pred = final_model.predict(X_test_scaled).ravel()  # Reshape to 1D array

# Calculate evaluation metrics
rmse = np.sqrt(mean_squared_error(Y_test, Y_test_pred))
mse = mean_squared_error(Y_test, Y_test_pred)
mae = mean_absolute_error(Y_test, Y_test_pred)
evs = explained_variance_score(Y_test, Y_test_pred)
r_squared = r2_score(Y_test, Y_test_pred)
# NOTE(review): calculate_q_squared is neither defined nor imported in this
# cell; it must already exist in the session - confirm where it comes from.
q_squared = calculate_q_squared(Y_test, Y_test_pred)

print(f"Test RMSE: {rmse}")
print(f"Test MSE: {mse}")
print(f"Test MAE: {mae}")
print(f"Test Explained Variance: {evs}")
print(f"Test R-squared: {r_squared}")
print(f"Test Q squared: {q_squared}")


Explanation of the Code:
Hyperopt Integration: The objective function now evaluates the model using 10-fold cross-validation, calculates the mean validation loss, and returns it as the objective value.
Parameter Space: Defined the hyperparameter space using Hyperopt's hp module.
Optimization: Used fmin to run the optimization process with 50 evaluations.
Best Parameters: After finding the best hyperparameters, the model is retrained with them on 90% of the training set; the remaining 10% is held out as a validation set for early stopping.
Evaluation: The final model is evaluated on the test set using various metrics.

In [None]:
from IPython.display import display
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
import os
import pandas as pd

def process_smiles_excel(excel_file, output_folder, num_conformers=10):
    """
    Generate conformers for every molecule listed in an Excel sheet.

    The sheet must provide the columns 'SMILES ID' and 'Drug Name'. For each
    row whose SMILES can be parsed, up to num_conformers conformers are
    embedded, each conformer is written to its own SDF file in output_folder,
    and a drawing of the molecule is displayed. Rows that cannot be turned
    into a molecule are reported with an error message and skipped.

    Parameters:
    excel_file (str): Path of the Excel file to read.
    output_folder (str): Folder for the SDF files (created if missing).
    num_conformers (int, optional): Number of conformers requested per
        molecule (default is 10).

    Returns:
    None
    """
    def generate_conformers(smiles, num_conformers=10):
        # A blank cell is read as NaN (a float), which RDKit cannot parse;
        # treat any non-string value like an unparsable SMILES.
        if not isinstance(smiles, str):
            return None
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            # NOTE(review): the molecule is embedded without explicit
            # hydrogens; confirm whether Chem.AddHs is wanted before embedding.
            AllChem.EmbedMultipleConfs(mol, numConfs=num_conformers)
        return mol

    def clean_filename(filename):
        # Remove problematic characters from the filename; str() keeps
        # non-string cell values (numbers, NaN) from raising here
        return "".join(c for c in str(filename) if c.isalnum() or c in ('_', '-'))

    def save_sdf(mol, folder, filename_prefix):
        os.makedirs(folder, exist_ok=True)
        clean_drug_name = clean_filename(mol.GetProp('Drug Name'))  # Clean the drug name
        # One SDF file per conformer
        for i, conf in enumerate(mol.GetConformers()):
            filename = os.path.join(folder, f"{clean_drug_name}_{filename_prefix}_{i+1}.sdf")
            writer = Chem.SDWriter(filename)
            try:
                writer.write(mol, confId=conf.GetId())
            finally:
                # Close the file even when writing fails
                writer.close()

    # Read SMILES IDs and drug names from an Excel sheet
    df = pd.read_excel(excel_file)

    # Iterate over SMILES IDs and drug names in the Excel sheet
    for index, row in df.iterrows():
        mol = generate_conformers(row['SMILES ID'], num_conformers)

        if mol is None:
            clean_drug_name = clean_filename(row['Drug Name'])
            print(f"Error: Unable to create molecule for drug '{clean_drug_name}'")
            continue

        mol.SetProp('Drug Name', str(row['Drug Name']))
        save_sdf(mol, output_folder, f"conformers_{index+1}_2d")

        img = Draw.MolsToGridImage([mol], molsPerRow=5, subImgSize=(300, 300))
        display(img)

# Usage
# Input spreadsheet (replace with your Excel file path) and destination
# folder for the generated SDF files
excel_file = "SMILES.xlsx"
output_folder = "conformers_output"
process_smiles_excel(excel_file=excel_file, output_folder=output_folder)


Explanation:
Function Definition: The entire script is wrapped into a single function called process_smiles_excel.
Inner Functions: Helper functions generate_conformers, clean_filename, and save_sdf are defined inside the main function to keep the scope local.
Parameters: The main function process_smiles_excel takes three parameters: excel_file, output_folder, and num_conformers with a default value of 10.
Usage: At the end, the function is called with the required parameters.