In [90]:
#IMPORT some LIBS
import pandas as pd
import seaborn as sb
import numpy as np
import sys

#IMPORT DATASET
df = pd.read_csv('~/Documents/temp_full_w-bad-remove.csv')
df.head()

Unnamed: 0,PDB_Code,Binding_Type,Binding_Value,Unit,Ligand_Name,Binding_Value_nM,Affinity_Category,MOL2_File,Molecule_Index,SMILES
0,4tmn,Ki,0.068,nM,0PK,0.068,high,4tmn_ligand.mol2,0.0,CC(C)C[C@H](N[P@@](=O)([O-])[C@H](Cc1ccccc1)N[...
1,5tmn,Ki,9.1,nM,0PJ,9.1,high,5tmn_ligand.mol2,0.0,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)N[P@](=O)([O-])...
2,1ydr,Ki,3.0,uM,IQP,3000.0,low,1ydr_ligand.mol2,0.0,C[C@H]1C[NH2+]CCN1S(=O)(=O)c1cccc2cnccc12
3,1ydt,Ki,48.0,nM,IQB,48.0,medium,1ydt_ligand.mol2,0.0,O=S(=O)(NCC[NH2+]C/C=C/c1ccc(Br)cc1)c1cccc2cnc...
4,1bcu,Kd,0.53,mM,PRL,530000.0,low,1bcu_ligand.mol2,0.0,Nc1ccc2cc3ccc(N)cc3[nH+]c2c1


In [121]:
#ONE HOT ENCODING SMILES
def smiles_to_onehot(smiles_list, max_length=None, char_to_idx=None):
    """
    Convert SMILES strings to one-hot encoded vectors.
    
    Parameters:
    smiles_list: List of SMILES strings
    max_length: Maximum length to pad/truncate sequences (if None, uses max length in data)
    char_to_idx: Dictionary mapping characters to indices (if None, creates from data)
    
    Returns:
    onehot_encoded: numpy array of shape (n_samples, max_length, n_unique_chars)
    char_to_idx: character to index mapping
    """
    
    # Get all unique characters if char_to_idx not provided
    if char_to_idx is None:
        all_chars = set()
        for smiles in smiles_list:
            all_chars.update(smiles)
        
        # Create character to index mapping
        unique_chars = sorted(list(all_chars))
        char_to_idx = {char: idx for idx, char in enumerate(unique_chars)}
    
    # Add padding character if not present
    if '<PAD>' not in char_to_idx:
        char_to_idx['<PAD>'] = len(char_to_idx)
    
    # Determine max length
    if max_length is None:
        max_length = max(len(smiles) for smiles in smiles_list)
    
    n_chars = len(char_to_idx)
    n_samples = len(smiles_list)
    
    # Initialize one-hot array
    onehot_encoded = np.zeros((n_samples, max_length, n_chars))
    
    # Fill the one-hot array
    for i, smiles in enumerate(smiles_list):
        for j, char in enumerate(smiles[:max_length]):  # Truncate if longer
            if char in char_to_idx:
                onehot_encoded[i, j, char_to_idx[char]] = 1
        
        # Pad with padding character if shorter
        for j in range(len(smiles), max_length):
            onehot_encoded[i, j, char_to_idx['<PAD>']] = 1
    
    return onehot_encoded, char_to_idx

def flatten_onehot_for_rf(onehot_encoded):
    """
    Flatten one-hot encoded SMILES for use with Random Forest.
    
    Parameters:
    onehot_encoded: numpy array of shape (n_samples, max_length, n_unique_chars)
    
    Returns:
    flattened: numpy array of shape (n_samples, max_length * n_unique_chars)
    """
    n_samples, max_length, n_chars = onehot_encoded.shape
    return onehot_encoded.reshape(n_samples, max_length * n_chars)

def encode_smiles_column(df, smiles_column, method='onehot'):
    """
    Encode SMILES column in DataFrame.
    
    Parameters:
    df: pandas DataFrame
    smiles_column: name of column containing SMILES
    method: 'onehot' or 'fingerprint'
    
    Returns:
    encoded_features: numpy array of encoded features
    """
    
    smiles_list = df[smiles_column].dropna().tolist()
    
    if method == 'onehot':
        # CORRECT way to use one-hot encoding
        onehot_encoded, char_to_idx = smiles_to_onehot(smiles_list)
        encoded_features = flatten_onehot_for_rf(onehot_encoded)
        print(f"One-hot encoded shape: {encoded_features.shape}")
        return encoded_features
    
    elif method == 'fingerprint':
        encoded_features = molecular_fingerprint_alternative(smiles_list)
        print(f"Fingerprint features shape: {encoded_features.shape}")
        return encoded_features
    
    else:
        raise ValueError("Method must be 'onehot' or 'fingerprint'")


def add_encoded_smiles_to_df(df, smiles_column, method='onehot', column_name='encoded_smiles'):
    """
    Add encoded SMILES features as a new column to the DataFrame.
    
    Parameters:
    df: pandas DataFrame
    smiles_column: name of column containing SMILES
    method: 'onehot' or 'fingerprint'
    column_name: name for the new column containing encoded features
    
    Returns:
    df: DataFrame with new encoded column added
    """
    
    # Get the encoded features
    encoded_features = encode_smiles_column(df, smiles_column, method)
    
    # Add as new column - each row contains the full encoded vector
    df[column_name] = [row for row in encoded_features]
    
    print(f"Added column '{column_name}' with shape: {encoded_features.shape}")
    print(f"Each row contains a vector of length: {encoded_features.shape[1]}")
    
    return df


df = add_encoded_smiles_to_df(df, 'SMILES', method='onehot', column_name='onehot_encoded')
df.head()

#np.set_printoptions(threshold=sys.maxsize)
#np.vstack(df.loc[2, 'onehot_encoded'])

One-hot encoded shape: (282, 5712)
Added column 'onehot_encoded' with shape: (282, 5712)
Each row contains a vector of length: 5712


Unnamed: 0,PDB_Code,Binding_Type,Binding_Value,Unit,Ligand_Name,Binding_Value_nM,Affinity_Category,MOL2_File,Molecule_Index,SMILES,onehot_encoded
0,4tmn,Ki,0.068,nM,0PK,0.068,high,4tmn_ligand.mol2,0.0,CC(C)C[C@H](N[P@@](=O)([O-])[C@H](Cc1ccccc1)N[...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,5tmn,Ki,9.1,nM,0PJ,9.1,high,5tmn_ligand.mol2,0.0,CC(C)C[C@H](NC(=O)[C@H](CC(C)C)N[P@](=O)([O-])...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1ydr,Ki,3.0,uM,IQP,3000.0,low,1ydr_ligand.mol2,0.0,C[C@H]1C[NH2+]CCN1S(=O)(=O)c1cccc2cnccc12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1ydt,Ki,48.0,nM,IQB,48.0,medium,1ydt_ligand.mol2,0.0,O=S(=O)(NCC[NH2+]C/C=C/c1ccc(Br)cc1)c1cccc2cnc...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1bcu,Kd,0.53,mM,PRL,530000.0,low,1bcu_ligand.mol2,0.0,Nc1ccc2cc3ccc(N)cc3[nH+]c2c1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [92]:
#DESIGNATING FEATURES(X) and PREDICTED VAR(y)
X = np.vstack(df['onehot_encoded'].values)
y = df['Affinity_Category']

#SPLITTING TESTING AND TRAINING DATA 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [93]:
#Diagnosing issues with model, exmaining class balance in training splits and looking at sparcity
def diagnose_rf_issues(X_train, y_train, X_test, y_test):
    """
    Diagnose potential issues with Random Forest performance.
    """
    print("=== DATASET DIAGNOSTICS ===")
    print(f"Training set size: {X_train.shape}")
    print(f"Test set size: {X_test.shape}")
    print(f"Number of features: {X_train.shape[1]}")
    
    # Check class distribution
    from collections import Counter
    train_distribution = Counter(y_train)
    test_distribution = Counter(y_test)
    
    print(f"\nClass distribution in training set: {train_distribution}")
    print(f"Class distribution in test set: {test_distribution}")
    
    # Check for class imbalance
    class_counts = list(train_distribution.values())
    imbalance_ratio = max(class_counts) / min(class_counts)
    print(f"Class imbalance ratio: {imbalance_ratio:.2f}")
    
    if imbalance_ratio > 3:
        print("⚠️  SEVERE CLASS IMBALANCE detected!")
    
    # Check feature sparsity
    sparsity = np.mean(X_train == 0)
    print(f"Feature sparsity: {sparsity:.2%}")
    
    if sparsity > 0.95:
        print("⚠️  VERY SPARSE features detected!")
    
    # Check for identical features
    unique_features = np.unique(X_train, axis=1)
    print(f"Unique feature columns: {unique_features.shape[1]} out of {X_train.shape[1]}")
    
    return train_distribution, test_distribution


diagnose_rf_issues(X_train, y_train, X_test, y_test)

=== DATASET DIAGNOSTICS ===
Training set size: (225, 5712)
Test set size: (57, 5712)
Number of features: 5712

Class distribution in training set: Counter({'low': 92, 'medium': 74, 'high': 59})
Class distribution in test set: Counter({'low': 25, 'medium': 17, 'high': 15})
Class imbalance ratio: 1.56
Feature sparsity: 97.06%
⚠️  VERY SPARSE features detected!
Unique feature columns: 1432 out of 5712


(Counter({'low': 92, 'medium': 74, 'high': 59}),
 Counter({'low': 25, 'medium': 17, 'high': 15}))

In [119]:
#BUILDING RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

In [120]:
#SIMPLE MODEL SCORE, (IDK WHAT THIS IS MADE WITH)
y_pred = rf.predict(X_test)
rf.score(X_test,y_test)

0.5087719298245614

In [122]:
#ADVANCED METRICS
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

        high       0.45      0.67      0.54        15
         low       0.64      0.56      0.60        25
      medium       0.38      0.29      0.33        17

    accuracy                           0.51        57
   macro avg       0.49      0.51      0.49        57
weighted avg       0.51      0.51      0.50        57



In [99]:
#FEATURE IMPORTANCE !!Does not work currently with one hot encoding!!
features = pd.DataFrame(rf.feature_importances_, index = X.columns)
features.head()

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [117]:
#HYPER PARAMATERS (We should do hyperparameter optimization if possible)
rf2 = RandomForestClassifier(n_estimators = 200,
                             #criterion = 'entropy',
                             max_depth=20,
                             min_samples_split=5,
                             max_features='sqrt',
                             class_weight='balanced'
)
rf2.fit(X_train, y_train)
rf2.score(X_test, y_test)

0.5614035087719298

In [118]:
y_pred2 = rf2.predict(X_test)
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

        high       0.46      0.73      0.56        15
         low       0.81      0.52      0.63        25
      medium       0.47      0.47      0.47        17

    accuracy                           0.56        57
   macro avg       0.58      0.57      0.56        57
weighted avg       0.62      0.56      0.57        57



In [123]:
#Using actual nM affinity value to make a REGRESSOR Random Forest
#DESIGNATING FEATURES(XR) and PREDICTED VAR(yR)
XR = np.vstack(df['onehot_encoded'].values)
yR = df['Binding_Value_nM']

#SPLITTING TESTING AND TRAINING DATA (FIND OUT HOW THIS SPLIT HAPPENS)
XR_train, XR_test, yR_train, yR_test = train_test_split(XR, yR, test_size=0.2)

In [124]:
#CREATING REGRESSOR MODEL
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(XR_train, yR_train)

In [125]:
#METRICS
yR_pred = rfr.predict(XR_test)
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print(mean_absolute_error(yR_pred, yR_test))
print(mean_squared_error(yR_pred, yR_test))
print(r2_score(yR_test, yR_pred))

298951.5993351053
611461937988.148
-0.2941855196515857


In [126]:
#Hyperparam tuning
param_grid = {
    'n_estimators': [100, 200, 300], #number of decision trees
    'max_depth': [10,20,30], #depth of trees
    'min_samples_split':[2,5,10], #min number of samples at split node(WHAT THIS MEAN?)
    'min_samples_leaf': [1,2,3] #min number of samples at leaf node (WHAT THIS MEAN?)
} #all numbers of options multiplied together give CV value (WAT?)

In [127]:
#NEW MODEL FOR HYPERPARAMATIZATION
from sklearn.model_selection import GridSearchCV
rfr_cv = GridSearchCV(estimator=rfr, param_grid=param_grid, cv = 3, scoring='neg_mean_squared_error', n_jobs = -1)
rfr_cv.fit(XR_train, yR_train)

In [128]:
#METRICS FOR NEW MODEL
yR_pred = rfr_cv.predict(XR_test)
print(mean_absolute_error(yR_pred, yR_test))
print(mean_squared_error(yR_pred, yR_test))
print(r2_score(yR_test, yR_pred))

299745.8107726913
441075063595.51746
0.0664456363990632
