In [1]:
pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [6]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Read the inhibitor table and discard rows whose SMILES entry is missing.
# Only null entries are removed here; SMILES that are present but cannot be
# parsed are handled later, during descriptor calculation.
df = pd.read_csv('hdac6_inhibitors.csv').dropna(subset=['smiles'])

# Calculate molecular descriptors with error handling
def calculate_descriptors(smiles):
    """Compute every descriptor listed in ``Descriptors._descList`` for one SMILES.

    Returns a dict keyed by descriptor name. If the SMILES cannot be parsed,
    or any descriptor computation raises, the problem is printed and a dict
    with the same keys but NaN for every value is returned instead.
    """
    try:
        molecule = Chem.MolFromSmiles(smiles)
        if molecule is None:
            # Routed through the except branch so a parse failure is reported
            # and handled exactly like a descriptor failure.
            raise ValueError("Invalid SMILES")
        computed = {}
        for entry in Descriptors._descList:
            # Each entry is indexed as (name, callable).
            computed[entry[0]] = entry[1](molecule)
        return computed
    except Exception as e:
        print(f"Error processing SMILES {smiles}: {e}")
        return dict.fromkeys((entry[0] for entry in Descriptors._descList), np.nan)

# Compute one descriptor dict per molecule.
descriptors = df['smiles'].apply(calculate_descriptors)

# Expand the dicts into a feature matrix that shares df's index.
X = pd.DataFrame(descriptors.tolist(), index=df.index)

# scikit-learn's forests convert features to float32 and reject NaN, +/-inf
# and anything outside the float32 range. Mask such values to NaN so the
# dropna() below removes those rows instead of fit() failing later.
# (NaN compares False, so existing NaNs simply stay NaN.)
X = X.where(X.abs() <= np.finfo(np.float32).max)

# Keep only molecules that have a complete descriptor row AND a known target,
# and make sure X and y end up with exactly the same index.
y = df['standard_value']
X = X.dropna()
y = y.loc[X.index].dropna()
X = X.loc[y.index]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest; the fixed seed makes the fit repeatable.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the model on the held-out split.
# NOTE(review): the recorded run of this cell reports a strongly negative
# R-squared, i.e. worse than predicting the mean. Presumably the raw target
# spans several orders of magnitude; consider modelling a log-transformed
# target - confirm against the data.
y_pred = model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Function to predict IC50 for new compounds
def predict_ic50(smiles):
    """Predict the target value for one compound with the trained ``model``.

    Args:
        smiles: SMILES string of the compound.

    Returns:
        The first (only) element of ``model.predict`` for this compound, on
        the same scale as the target the model was fitted on.

    Raises:
        ValueError: if the descriptor row contains missing values, which is
            what ``calculate_descriptors`` returns for an unparseable SMILES
            or a failed descriptor computation.
    """
    descriptors = calculate_descriptors(smiles)
    X_new = pd.DataFrame([descriptors])

    # The model was fitted on complete rows only. Refuse to score a row with
    # missing descriptors rather than let it fail obscurely inside the model
    # or silently yield a meaningless number.
    if X_new.isna().to_numpy().any():
        raise ValueError(
            f"Cannot predict IC50 for SMILES {smiles!r}: "
            "descriptor calculation failed or produced missing values"
        )

    return model.predict(X_new)[0]

# Demo: score one hand-written SMILES with the trained model and show the result.
new_compound_smiles = "CC1=C(C(=O)NC(=O)N1)C2=CC=C(C=C2)C3=CC=CC=C3C(=O)NCC4=CC=C(C=C4)C(F)(F)F"
predicted_ic50 = predict_ic50(smiles=new_compound_smiles)
print(f"Predicted IC50 for new compound: {predicted_ic50}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Mean Squared Error: 325222710341.653
R-squared: -72.27412216363956
Predicted IC50 for new compound: 13843.674300000002




In [7]:
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen

def calculate_admet(smiles):
    """Compute a small set of physicochemical properties for one SMILES.

    Args:
        smiles: SMILES string of the compound.

    Returns:
        A dict with the keys "Molecular Weight", "LogP", "H-Bond Donors",
        "H-Bond Acceptors" and "Polar Surface Area".

    Raises:
        ValueError: if the SMILES cannot be parsed into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None. Fail here
        # with a clear message instead of inside the first descriptor call.
        raise ValueError(f"Invalid SMILES: {smiles!r}")

    # NOTE(review): ExactMolWt is the monoisotopic mass, while Descriptors.MolWt
    # is the average molecular weight - confirm which one the
    # "Molecular Weight" entry is meant to report.
    mw = Descriptors.ExactMolWt(mol)
    logp = Crippen.MolLogP(mol)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)
    psa = Descriptors.TPSA(mol)

    return {
        "Molecular Weight": mw,
        "LogP": logp,
        "H-Bond Donors": hbd,
        "H-Bond Acceptors": hba,
        "Polar Surface Area": psa
    }

# Demo: compute the property dict for one compound and print it line by line.
compound_smiles = "CC1=C(C(=O)NC(=O)N1)C2=CC=C(C=C2)C3=CC=CC=C3C(=O)NCC4=CC=C(C=C4)C(F)(F)F"
admet_properties = calculate_admet(smiles=compound_smiles)

for prop in admet_properties:
    value = admet_properties[prop]
    print(f"{prop}: {value}")

Molecular Weight: 479.14567616
LogP: 4.654420000000003
H-Bond Donors: 3
H-Bond Acceptors: 3
Polar Surface Area: 94.82
