In [1]:
import os
import json
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, KFold

# Directory setup

In [2]:
# Root of the project tree. The original absolute path is kept as the default,
# but it can be redirected without editing the notebook by setting the
# DISSERTATION_BASE_DIR environment variable before the kernel starts.
base_dir = os.environ.get('DISSERTATION_BASE_DIR', '/Users/Avi/Dissertation/')

# Folder holding the per-target training CSVs (read as '<target>_Train.csv').
train_data_directory = os.path.join(base_dir, 'Data/Non_Curated/Train_Data')

# Output folders for the tuned hyperparameter JSON files, keyed by task.
hyperparams_dir = {
    "non_curated_rf_classification_hyperparameters": os.path.join(base_dir, 'Models/Random_Forest_Parameters/Non_Curated/Classification'),
    "non_curated_rf_regression_hyperparameters": os.path.join(base_dir, 'Models/Random_Forest_Parameters/Non_Curated/Regression')
}

# Create the output folders up front; exist_ok makes re-running this cell safe.
for directory in hyperparams_dir.values():
    os.makedirs(directory, exist_ok=True)

# Assigning Targets and Fingerprints

In [3]:
# Target identifiers; each one selects a '<target>_Train.csv' training file.
targets = [
    'CHEMBL4078',
    'CHEMBL279',
    'CHEMBL5763',
    'CHEMBL240',
    'CHEMBL4005',
]

# Names of the fingerprint columns expected in every training CSV.
fingerprints = [
    'ECFP',
    'MACCS Keys',
    'AP2',
    'AP3',
    'AP2+AP3',
]

# Defining Main Functions

In [4]:
def load_fingerprinted_data(file_path):
    """Read a CSV file and decode its fingerprint columns.

    Each column named in the module-level ``fingerprints`` list is run
    element-wise through ``DataStructs.CreateFromBitString`` and the decoded
    values replace the original column contents.

    Parameters
    ----------
    file_path : path to the CSV file handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the fingerprint columns replaced by the decoded
        objects; all other columns are left as read.
    """
    frame = pd.read_csv(file_path)

    # Decode every fingerprint column first, then write the results back.
    decoded_columns = {
        column: frame[column].apply(DataStructs.CreateFromBitString)
        for column in fingerprints
    }
    for column, decoded in decoded_columns.items():
        frame[column] = decoded

    return frame

def convert_fp(fp):
    """Convert a single fingerprint object into a NumPy ``int32`` array.

    Parameters
    ----------
    fp : fingerprint object accepted by ``DataStructs.ConvertToNumpyArray``.

    Returns
    -------
    numpy.ndarray
        The array that ``ConvertToNumpyArray`` filled in.
    """
    # Placeholder buffer: the conversion writes into the array it is given
    # (RDKit is expected to resize it to the fingerprint length).
    bit_array = np.zeros(1, dtype=np.int32)
    DataStructs.ConvertToNumpyArray(fp, bit_array)
    return bit_array

def prepare_data(df, fingerprint, target_column):
    """Build the feature matrix and target vector for one fingerprint type.

    Parameters
    ----------
    df : DataFrame whose ``fingerprint`` column holds fingerprint objects.
    fingerprint : name of the fingerprint column used for the features.
    target_column : name of the column used as the target.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` stacks one ``convert_fp`` result per row of
        ``df[fingerprint]`` and ``y`` is ``df[target_column].values``.
    """
    features = np.array(list(map(convert_fp, df[fingerprint])))
    labels = df[target_column].values
    return features, labels

def tune_rf_hyperparameters(X_train, y_train, classification=True,
                            n_iter=200, n_splits=5, random_state=42, n_jobs=-1):
    """Randomised hyperparameter search for a random forest.

    Parameters
    ----------
    X_train : feature matrix passed to ``RandomizedSearchCV.fit``.
    y_train : target vector passed to ``RandomizedSearchCV.fit``.
    classification : bool, default True
        True tunes a ``RandomForestClassifier`` (with ``class_weight`` fixed
        to ``'balanced'``) using ``StratifiedKFold``; False tunes a
        ``RandomForestRegressor`` using ``KFold``.
    n_iter : int, default 200
        Number of parameter settings sampled by the search.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int, default 42
        Seed shared by the estimator, the fold splitter and the search.
    n_jobs : int, default -1
        Parallelism of the search itself (candidates x folds).

    Returns
    -------
    dict
        ``best_params_`` of the fitted search. For classification it also
        contains the ``class_weight`` entry.
    """
    param_dist = {
        'n_estimators': [100, 200, 300, 400, 500],
        'max_depth': [None, 5, 10, 20, 30],
        'min_samples_split': [5, 10, 15, 20, 25],
        'min_samples_leaf': [5, 10, 20],
        'max_features': ['sqrt', 'log2']
    }

    # The search already fans candidate/fold fits out over `n_jobs` workers,
    # so each forest is built single-threaded. Running both levels with
    # n_jobs=-1 can oversubscribe the CPUs. The trees built are unaffected
    # because they are controlled by `random_state`, not by `n_jobs`.
    if classification:
        param_dist['class_weight'] = ['balanced']
        model = RandomForestClassifier(random_state=random_state, n_jobs=1)
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    else:
        model = RandomForestRegressor(random_state=random_state, n_jobs=1)
        cv = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        n_jobs=n_jobs,
        verbose=0,
        random_state=random_state,
        # Only best_params_ is returned, so skip the final refit on the full
        # training set; with a single scoring metric best_params_ is still set.
        refit=False,
    )
    random_search.fit(X_train, y_train)

    return random_search.best_params_

# Main execution

In [5]:
for target in targets:
    # The training file depends only on the target, so read and decode it once
    # here instead of once per fingerprint inside the inner loop.
    train_file_path = os.path.join(train_data_directory, f'{target}_Train.csv')
    df = load_fingerprinted_data(train_file_path)

    for fingerprint in fingerprints:
        # Classification: tune against the 'activity' column.
        X_train, y_train = prepare_data(df, fingerprint, 'activity')
        best_params_classification = tune_rf_hyperparameters(X_train, y_train, classification=True)
        with open(os.path.join(hyperparams_dir['non_curated_rf_classification_hyperparameters'], f'{target}_{fingerprint}_Best_RF_Non_Curated_Classification_Hyperparams.json'), 'w') as f:
            json.dump(best_params_classification, f)

        # Regression: tune against the 'pchembl_value' column.
        X_train, y_train = prepare_data(df, fingerprint, 'pchembl_value')
        best_params_regression = tune_rf_hyperparameters(X_train, y_train, classification=False)
        with open(os.path.join(hyperparams_dir['non_curated_rf_regression_hyperparameters'], f'{target}_{fingerprint}_Best_RF_Non_Curated_Regression_Hyperparams.json'), 'w') as f:
            json.dump(best_params_regression, f)

    print(f"Hyperparameter tuning for {target} completed.")

Hyperparameter tuning for CHEMBL4078 completed.
Hyperparameter tuning for CHEMBL279 completed.
Hyperparameter tuning for CHEMBL5763 completed.
Hyperparameter tuning for CHEMBL240 completed.
Hyperparameter tuning for CHEMBL4005 completed.
