In [2]:
pip install rdkit


Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.6


In [5]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import csv

"""
  Perform feature selection using a Random Forest Regressor.

  This function fits a Random Forest Regressor to the data, computes the feature importances,
  and returns the indices of features sorted by their importance in descending order.

  Parameters:
  - rf_model (RandomForestRegressor): The Random Forest Regressor model to fit.
  - X (pd.DataFrame): The input features.
  - y (pd.Series): The target variable.

  Returns:
  - indices (np.ndarray): Indices of features sorted by their importance in descending order.
  - importances (np.ndarray): Importance scores of the features.
"""

# Step 1: Load the descriptor table from disk
input_csv = "filtered_molecules222.csv"
df = pd.read_csv(input_csv)

# Step 2: Every column except SMILES and the two inhibition columns is a model input
descriptor_names = df.columns.drop(['SMILES', 'PKM2_inhibition', 'ERK2_inhibition'])
X = df.loc[:, descriptor_names]
y_pkm2, y_erk2 = df['PKM2_inhibition'], df['ERK2_inhibition']

# Step 3: One identically configured, seeded forest per target, used for feature ranking
rf_pkm2, rf_erk2 = (
    RandomForestRegressor(n_estimators=100, random_state=42) for _ in range(2)
)

# Function to perform feature selection
def perform_feature_selection(rf_model, X, y):
    """Fit ``rf_model`` on ``(X, y)`` and rank the features by importance.

    Parameters:
    - rf_model: Estimator exposing ``fit(X, y)`` and, once fitted, a
      ``feature_importances_`` attribute. It is fitted in place.
    - X: The input features.
    - y: The target variable.

    Returns:
    - indices (np.ndarray): Feature positions ordered from most to least important.
    - importances: The model's ``feature_importances_``, in the original feature order.
    """
    rf_model.fit(X, y)
    scores = rf_model.feature_importances_
    # argsort is ascending, so reverse it to put the largest importance first
    ascending_order = np.argsort(scores)
    return ascending_order[::-1], scores

# Fit one forest per target; keep both the importance-ordered indices and the raw scores
indices_pkm2, importances_pkm2 = perform_feature_selection(rf_pkm2, X, y_pkm2)
indices_erk2, importances_erk2 = perform_feature_selection(rf_erk2, X, y_erk2)

# Pair every descriptor name with its score and keep the 30 highest-scoring pairs per target
top_features_pkm2 = sorted(
    zip(descriptor_names, importances_pkm2), key=lambda pair: pair[1], reverse=True
)[:30]
top_features_erk2 = sorted(
    zip(descriptor_names, importances_erk2), key=lambda pair: pair[1], reverse=True
)[:30]

# PKM2 ranking, numbered from 1
print("Feature ranking for PKM2 inhibition:")
for rank, (name, score) in enumerate(top_features_pkm2, start=1):
    print(f"{rank}. {name}: {score}")

print()

# ERK2 ranking, numbered from 1
print("Feature ranking for ERK2 inhibition:")
for rank, (name, score) in enumerate(top_features_erk2, start=1):
    print(f"{rank}. {name}: {score}")

# Optional: Evaluate performance with selected features
#
# The held-out rows must not influence which features are chosen. Ranking features on the
# full dataset and only then splitting leaks test information into the selection step and
# makes the reported MSE look better than it is. So the rows are split first, the ranking
# is recomputed on the training rows only, and the test rows are used solely for scoring.

def _fit_and_score_top_features(features, target, n_selected=10, test_size=0.2, seed=42):
    """Split the rows, select the top features on the training rows, then fit and score.

    Parameters:
    - features (pd.DataFrame): All candidate descriptor columns.
    - target (pd.Series): The target variable.
    - n_selected (int): How many of the highest-ranked features to keep.
    - test_size (float): Fraction of rows held out for testing.
    - seed (int): Random state used for the split and for both forests.

    Returns:
    A tuple ``(train_selected, test_selected, target_train, target_test,
    model, predictions, mse)``, where the two feature frames contain only the
    selected columns and ``mse`` is the mean squared error on the held-out rows.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=test_size, random_state=seed
    )

    # Rank features using the training rows only.
    ranking, _ = perform_feature_selection(
        RandomForestRegressor(n_estimators=100, random_state=seed),
        features_train,
        target_train,
    )
    selected = ranking[:n_selected]
    train_selected = features_train.iloc[:, selected]
    test_selected = features_test.iloc[:, selected]

    # Fit a fresh model on the reduced feature set and score it on the held-out rows.
    model = RandomForestRegressor(n_estimators=100, random_state=seed)
    model.fit(train_selected, target_train)
    predictions = model.predict(test_selected)
    mse = mean_squared_error(target_test, predictions)
    return train_selected, test_selected, target_train, target_test, model, predictions, mse


(X_train_pkm2, X_test_pkm2, y_pkm2_train, y_pkm2_test,
 rf_pkm2_selected, y_pkm2_pred, mse_pkm2) = _fit_and_score_top_features(X, y_pkm2)
(X_train_erk2, X_test_erk2, y_erk2_train, y_erk2_test,
 rf_erk2_selected, y_erk2_pred, mse_erk2) = _fit_and_score_top_features(X, y_erk2)

print(f"\nMean Squared Error (MSE) for PKM2 inhibition prediction: {mse_pkm2:.4f}")
print(f"Mean Squared Error (MSE) for ERK2 inhibition prediction: {mse_erk2:.4f}")

# Extracting top descriptors
# Take the union of both top lists, keeping first-seen order (PKM2 ranking first, then any
# ERK2-only descriptors). Iterating a set of strings gives an order that can change from one
# kernel run to the next, which would shuffle the CSV columns; dict keys keep insertion order.
top_descriptor_columns = list(dict.fromkeys(
    feature for feature, _ in list(top_features_pkm2) + list(top_features_erk2)
))
# Still exposed as a set, as before, for any code that does membership tests on it.
top_descriptors = set(top_descriptor_columns)

# Retain only top descriptors in the dataframe
top_descriptors_df = df[['SMILES', 'PKM2_inhibition', 'ERK2_inhibition'] + top_descriptor_columns]

# Write the descriptors to a CSV file
output_file = 'top_descriptors.csv'
top_descriptors_df.to_csv(output_file, index=False)

print(f"Top descriptors with SMILES and inhibition columns written to {output_file}.")



Feature ranking for PKM2 inhibition:
1. BCUT2D_MRLOW: 0.05765606660939422
2. BalabanJ: 0.055445742237385964
3. AvgIpc: 0.05244760335515452
4. qed: 0.05060592484453866
5. FpDensityMorgan1: 0.04532530009059169
6. BCUT2D_MWLOW: 0.04322159763206442
7. fr_sulfonamd: 0.03279110049469601
8. fr_Ar_N: 0.030436361894959515
9. FpDensityMorgan3: 0.026464740748246868
10. MinEStateIndex: 0.024941015355088348
11. fr_ether: 0.02394110136012251
12. BCUT2D_CHGLO: 0.021941540087607934
13. Chi3v: 0.021921071998921796
14. NumAromaticHeterocycles: 0.021148084353303717
15. FpDensityMorgan2: 0.01921340232702454
16. MinPartialCharge: 0.01888997224238114
17. MaxAbsPartialCharge: 0.01815250854759875
18. MolLogP: 0.017418491835984557
19. Chi4n: 0.01734203275816163
20. BCUT2D_LOGPHI: 0.01647134864186189
21. SPS: 0.01605838680718691
22. MaxAbsEStateIndex: 0.016020682692041478
23. Chi4v: 0.015445777693911257
24. BCUT2D_LOGPLOW: 0.015300615827837045
25. MaxEStateIndex: 0.013965702199081168
26. Chi1v: 0.01315591587666