In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load training descriptors data. 'standard_value' is the regression target;
# every remaining column is used as a model feature.
df = pd.read_csv('training_descriptors_no_outliers.csv')
X = df.drop(columns='standard_value')
y = df['standard_value']

# Initialize model and pipeline.
# No random_state is passed to the forest in this constructor call.
model = RandomForestRegressor(max_depth=20, min_samples_leaf=10)
pipe = Pipeline([('scaler', StandardScaler()), ('model', model)])

# Load new descriptors data and discard every row that has a missing value
# in any column (identifier columns included).
new_descriptors = pd.read_csv('FDA_features.csv').dropna()

# Extract ChEMBL IDs and SMILES; they keep the row index of the filtered
# frame, so they stay aligned row-for-row with the feature matrix below.
chembl_id_column = new_descriptors['ChEMBL ID']
smiles_column = new_descriptors['SMILES']
new_descriptors = new_descriptors.drop(columns=['ChEMBL ID', 'SMILES'])

# Align the prediction features with the training features: fail early with
# a readable message when a training column is absent, and otherwise select
# the training columns in training order so that stray or re-ordered columns
# in the CSV cannot reach the fitted pipeline.
missing_features = X.columns.difference(new_descriptors.columns)
if len(missing_features) > 0:
    raise ValueError(
        "FDA_features.csv is missing training feature columns: "
        + ", ".join(map(str, missing_features))
    )
new_descriptors = new_descriptors.loc[:, X.columns]

# Initialize a DataFrame to store results
loop_results_df = pd.DataFrame(columns=['ChEMBL ID', 'SMILES', 'Predicted Value'])

# Number of iterations
loop_n = 50

# Dictionary to store predicted values for each ChEMBL ID
# (one list per unique ID, filled in by the prediction loop)
predicted_values = {chembl_id: [] for chembl_id in chembl_id_column.unique()}

# Repeatedly refit the pipeline and predict the new descriptors, keeping the
# top-ranked compounds of every iteration and every individual prediction.

# Base seed for the per-iteration forests. Each iteration gets its own
# deterministic seed (base_seed + i): the fits still differ from one another,
# which the top-k frequency below depends on, but a fresh Restart & Run All
# now reproduces the same numbers. A single fixed seed would instead make
# every iteration produce the same forest.
base_seed = 42

# Number of highest-predicted compounds retained from each iteration
top_k = 10

# Per-iteration top-k frames, concatenated once after the loop instead of
# growing a DataFrame with pd.concat on every pass.
top_k_frames = []

for i in range(loop_n):
    pipe.set_params(model__random_state=base_seed + i)
    pipe.fit(X, y)

    loop_predicted_values = pipe.predict(new_descriptors)

    loop_df = pd.DataFrame({
        'ChEMBL ID': chembl_id_column,
        'SMILES': smiles_column,
        'Predicted Value': loop_predicted_values
    }).sort_values(by='Predicted Value', ascending=False)
    top_k_frames.append(loop_df.head(top_k))

    # Store the predicted values for each ChEMBL ID. predicted_values is keyed
    # by the unique values of chembl_id_column, so the membership test only
    # guards against the dict having been altered elsewhere.
    for chembl_id, predicted_value in zip(chembl_id_column, loop_predicted_values):
        if chembl_id in predicted_values:
            predicted_values[chembl_id].append(predicted_value)

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no iteration ran.
if top_k_frames:
    loop_results_df = pd.concat(top_k_frames)
else:
    loop_results_df = pd.DataFrame(columns=['ChEMBL ID', 'SMILES', 'Predicted Value'])

# Frequency of each ChEMBL ID in the top k: number of appearances divided by
# the number of iterations. This stays correct when an iteration yields fewer
# than top_k rows. max(..., 1) avoids dividing by zero when loop_n is 0.
value_counts = loop_results_df['ChEMBL ID'].value_counts() / max(loop_n, 1)

# Get the top 13 ChEMBL IDs (value_counts is ordered by descending frequency)
top_13_chembl_ids = value_counts.head(13).index


In [3]:
# Predictions gathered across all iterations for each selected ChEMBL ID,
# in the same order as top_13_chembl_ids.
top_predictions = [predicted_values[cid] for cid in top_13_chembl_ids]

# Mean and standard deviation of those predictions, keyed by ChEMBL ID.
# Note: std_dev_values is computed here but is not part of summary_df.
mean_values = dict(zip(top_13_chembl_ids, map(np.mean, top_predictions)))
std_dev_values = dict(zip(top_13_chembl_ids, map(np.std, top_predictions)))

# One row per selected ChEMBL ID: its top-10 frequency and average prediction.
frequency_column = value_counts.loc[top_13_chembl_ids].tolist()
average_column = list(mean_values.values())
summary_df = pd.DataFrame(
    {
        'ChEMBL ID': top_13_chembl_ids,
        'Frequency': frequency_column,
        'Avg Predicted Value': average_column,
    }
)

summary_df.head(13)

Unnamed: 0,ChEMBL ID,Frequency,Avg Predicted Value
0,CHEMBL877,0.96,11.417949
1,CHEMBL548,0.8,11.389269
2,CHEMBL256087,0.72,11.39007
3,CHEMBL696,0.66,11.378514
4,CHEMBL504760,0.6,11.374508
5,CHEMBL267894,0.58,11.371353
6,CHEMBL815,0.56,11.361601
7,CHEMBL550,0.46,11.344359
8,CHEMBL488,0.38,11.35102
9,CHEMBL404422,0.34,11.336206


In [4]:
# Persist the summary table as CSV, without writing the row index.
summary_df.to_csv(
    path_or_buf="all_results.csv",
    index=False,
)