In [None]:
!pip uninstall -y rdkit-pypi numpy
!pip install rdkit-pypi

In [None]:
!pip install numpy==1.26.4

In [None]:
import os
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors, AllChem, DataStructs
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')


from google.colab import drive
drive.mount('/content/drive')

# Folder on the mounted shared drive that holds the pre-split train/test CSV files.
DATA_DIR = '/content/drive/Shared drives/MLDD/ORD_Data/Main Data'

train_df = pd.read_csv(os.path.join(DATA_DIR, 'train_04_13_25.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test_04_13_25.csv'))

# Tag every row with the split it came from so both files can be processed as one
# frame and separated again before model fitting.
train_df['data_type'] = 'Train'
test_df['data_type'] = 'Test'

# ignore_index=True gives the combined frame unique row labels. Without it the row
# labels of the two files are simply stacked, so the same label can appear once for
# a train row and once for a test row.
allData = pd.concat([train_df, test_df], axis=0, ignore_index=True)
allData.head(2)

In [None]:
# Sanity check: (rows, columns) of the combined train + test frame.
allData.shape

In [None]:
# Keep only rows whose yield is strictly below 100 and not exactly 0.
yield_in_range = (allData['Yield'] < 100) & (allData['Yield'] != 0)
data = (
    allData.loc[yield_in_range]
    # Literal 'None' strings are treated as missing values.
    .replace('None', np.nan)
    # A row without a yield value cannot serve as a regression target.
    .dropna(subset=['Yield'])
)

In [None]:
# Interaction feature: element-wise product of the Temperature and Time columns,
# created only when both columns are present.
if {'Temperature', 'Time'}.issubset(data.columns):
    data['TempTimeInteraction'] = data['Temperature'].mul(data['Time'])

# RDKit functions
def molFromSmiles(smiles):
    """Parse a SMILES string with RDKit without ever raising.

    Args:
        smiles: SMILES text. Any non-string value (for example None or the NaN
            pandas uses for a missing cell) is treated as unparseable.

    Returns:
        Whatever ``Chem.MolFromSmiles`` returns for the string, or None when the
        input is not a string or the parser raises.
    """
    # Missing cells are answered directly instead of relying on the parser to
    # raise on a non-string argument.
    if not isinstance(smiles, str):
        return None
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        # Deliberately best-effort, but narrower than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return None

def calcDesc(mol):
    """Compute six RDKit descriptors for one molecule.

    Args:
        mol: an RDKit molecule, or None when the SMILES could not be parsed.

    Returns:
        A list of six values in this order: MolWt, MolLogP, TPSA, NumHDonors,
        NumHAcceptors, NumRotatableBonds. For ``mol is None`` the list holds six NaNs.
    """
    if mol is None:
        return [np.nan for _ in range(6)]

    # Order matters: callers label the values positionally.
    descriptor_fns = (
        Descriptors.MolWt,
        Descriptors.MolLogP,
        Descriptors.TPSA,
        Descriptors.NumHDonors,
        Descriptors.NumHAcceptors,
        Descriptors.NumRotatableBonds,
    )
    return [fn(mol) for fn in descriptor_fns]

def morgan_fp(smiles, n_bits=128):
    """Return the radius-2 Morgan bit fingerprint of a SMILES string as an array.

    Args:
        smiles: SMILES text; non-string values are treated as unparseable.
        n_bits: length of the fingerprint (a candidate for tuning).

    Returns:
        A 1-D integer numpy array of length ``n_bits``. It is all zeros when the
        SMILES cannot be parsed, so both outcomes share the same dtype and shape.
    """
    mol = molFromSmiles(smiles) if isinstance(smiles, str) else None
    arr = np.zeros((n_bits,), dtype=int)
    if mol is None:
        return arr
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=n_bits)
    # Copies the bits of ``fp`` into ``arr`` in place.
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

# Feature definitions
# Labels for the six values returned by calcDesc, in the same order.
descNames = 'MolWt LogP TPSA HDonors HAcceptors RotBonds'.split()

# (SMILES column, prefix) pairs; the prefix names the feature columns derived from
# that SMILES column.
smilesRoles = [
    (f'{role} SMILES', role)
    for role in ('COOH', 'Amine', 'Additive', 'Coupling Agent', 'Solvent')
]

# Numeric model inputs. process_smiles_features appends the names of the columns it
# generates to this list, so it must stay a mutable list.
numericalFeatures = [
    'Temperature',
    'Time',
    'TempTimeInteraction',
    'COOH MW',
    'COOH logP',
    'Amine MW',
    'Amine logP',
]

# Text-valued model inputs.
categoricalFeatures = [
    'Solvent',
    'Coupling Agent',
    'COOH',
    'Amine',
    'Additive',
]

# Target column plus the train/test marker; 'data_type' is deliberately not listed
# among the model inputs above.
yield_split_features = ['Yield', 'data_type']

# Feature engineering for GBR, SVR, RF, LGBM
def process_smiles_features(data, fp_bits=128):
    """Expand every SMILES column listed in ``smilesRoles`` into numeric features.

    For each (SMILES column, prefix) pair whose column exists in ``data``, three
    groups of columns are appended, in this order:

    * ``{prefix}_{name}`` for each name in ``descNames`` (values from ``calcDesc``),
    * ``{prefix}_FP_{i}`` for ``i`` in ``range(fp_bits)`` (values from ``morgan_fp``),
    * ``{prefix}_SMILES_{ngram}``: counts of character 2- to 4-grams of the SMILES
      text, limited to 32 n-grams per SMILES column.

    Side effect: the name of every generated column is appended to the module-level
    ``numericalFeatures`` list (names already in the list are skipped).

    Args:
        data: DataFrame containing the SMILES columns. It is not modified.
        fp_bits: fingerprint length per SMILES column.

    Returns:
        A new DataFrame holding the columns of ``data`` followed by the generated
        feature columns.
    """
    # Set copy of the registry so the membership test below is O(1).
    known_features = set(numericalFeatures)

    for smilesCol, prefix in smilesRoles:
        if smilesCol not in data.columns:
            continue

        smiles = data[smilesCol]
        smiles_text = smiles.fillna('')

        # Descriptors: molecules are parsed on the fly and never stored in the frame,
        # so the caller's DataFrame is not touched. Missing SMILES yield NaN rows.
        desc_rows = [calcDesc(molFromSmiles(s)) for s in smiles]
        descDf = pd.DataFrame(
            desc_rows,
            columns=[f'{prefix}_{n}' for n in descNames],
            index=data.index,
            dtype=float,
        )

        # Morgan fingerprint bits; missing SMILES are passed as '' here.
        fp_rows = [morgan_fp(s, n_bits=fp_bits) for s in smiles_text]
        fp_df = pd.DataFrame(
            fp_rows,
            columns=[f'{prefix}_FP_{i}' for i in range(fp_bits)],
            index=data.index,
        )

        # Character n-gram counts of the raw SMILES text. The vocabulary is fitted on
        # all rows passed in.
        # NOTE(review): CountVectorizer lowercases its input by default, so these
        # n-grams do not distinguish upper- from lower-case SMILES characters; pass
        # lowercase=False if that distinction should be kept - confirm intent.
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 4), min_df=1, max_features=32)
        smiles_vec = vectorizer.fit_transform(smiles_text).toarray()
        smiles_vec_names = [f"{prefix}_SMILES_{f}" for f in vectorizer.get_feature_names_out()]
        smiles_vec_df = pd.DataFrame(smiles_vec, columns=smiles_vec_names, index=data.index)

        # One concat per SMILES column; this also rebinds ``data`` to a new frame.
        data = pd.concat([data, descDf, fp_df, smiles_vec_df], axis=1)

        # Register the new columns as numerical model inputs, in creation order.
        for col in [*descDf.columns, *fp_df.columns, *smiles_vec_names]:
            if col not in known_features:
                known_features.add(col)
                numericalFeatures.append(col)

    return data

# Prepare data for models: build the engineered feature table from a copy of the
# cleaned frame, so `data` itself stays as it was after cleaning.
data_gbr_rf_lgbm = process_smiles_features(data.copy(), fp_bits=128)

In [None]:
# Common preprocessing function
def preprocess_data(X, y, scaler_type='standard'):
    """One-hot encode, impute and scale a feature table.

    Steps, in order: object-dtype columns are one-hot encoded, columns without a
    single observed value are dropped, remaining gaps are filled with the column
    median, and all columns are scaled.

    Args:
        X: feature DataFrame.
        y: target data; returned unchanged.
        scaler_type: ``'robust'`` selects ``RobustScaler``; any other value selects
            ``StandardScaler``.

    Returns:
        ``(X_processed, y)``. ``X_processed`` is a DataFrame with the encoded column
        names and a fresh default integer index (the index of ``X`` is not kept).
    """
    object_cols = [col for col in X.columns if X[col].dtype == 'object']
    X = pd.get_dummies(X, columns=object_cols, dummy_na=False)

    # SimpleImputer discards columns that contain no observed value, which would
    # leave fewer data columns than column names below. Dropping them up front keeps
    # names and data in step; it is a no-op when every column has at least one value.
    X = X.dropna(axis=1, how='all')
    feature_names = X.columns

    # NOTE(review): imputer and scaler are fitted on every row passed in. When X
    # holds both train and test rows, their statistics include the test rows.
    values = SimpleImputer(strategy='median').fit_transform(X)
    scaler = RobustScaler() if scaler_type == 'robust' else StandardScaler()
    values = scaler.fit_transform(values)

    return pd.DataFrame(values, columns=feature_names), y


In [None]:
# Keep only the configured feature names that actually exist in the engineered
# table, preserving the order in which they were configured.
existing_columns = set(data_gbr_rf_lgbm.columns)
availableNumericalFeatures = [name for name in numericalFeatures if name in existing_columns]
availableCategoricalFeatures = [name for name in categoricalFeatures if name in existing_columns]

In [None]:
# Train/Test marker for every row of the engineered table; still carries that
# table's row labels at this point.
splitFeature = data_gbr_rf_lgbm['data_type']  # pandas Series

In [None]:
# Model inputs: every available numerical and categorical feature column.
feature_columns = availableNumericalFeatures + availableCategoricalFeatures

# The target frame keeps 'data_type' next to 'Yield' so it can be split into
# train and test rows later.
X_rf, y_rf = preprocess_data(
    data_gbr_rf_lgbm[feature_columns],
    data_gbr_rf_lgbm[yield_split_features],
)

In [None]:
# drop=True: reset_index() without it stores the previous row numbers in a new
# 'index' column, which would then be fed to the model as a feature. X_rf comes out
# of preprocess_data with a default integer index, so nothing is lost by dropping
# it, and the cell can be re-run safely.
X_rf = X_rf.reset_index(drop=True)

In [None]:
# reset_index() turns the Series into a DataFrame with a default 0..n-1 index: the
# old row labels move into an 'index' column and the markers stay in 'data_type'.
splitFeature = splitFeature.reset_index()

In [None]:
# Attach the Train/Test marker to the feature rows. Assignment aligns on the index,
# so this relies on X_rf and splitFeature both having been re-indexed 0..n-1 by the
# reset_index() calls in the preceding cells.
X_rf['data_type'] = splitFeature['data_type']

In [None]:
# X_rf['data_type'] = splitFeature['data_type']
# X_rf = X_rf.drop(columns=['index'])

In [None]:
# X_train = X_rf[X_rf['data_type'] == 'Train']
# y_train = y_rf[y_rf['data_type'] == 'Train']['Yield']
# X_test = X_rf[X_rf['data_type'] == 'Test']
# y_test = y_rf[y_rf['data_type'] == 'Test']['Yield']

# X_train = X_train.drop(columns=['data_type'])
# X_test = X_test.drop(columns=['data_type'])

# rf = RandomForestRegressor(
#     n_estimators=500, max_depth=20, min_samples_leaf=3, n_jobs=-1, random_state=42
# )
# rf.fit(X_train, y_train)
# yPred_rf = rf.predict(X_test)
# r2_rf = r2_score(y_test, yPred_rf)
# # print(f"Random Forest r^2: {r2_rf:.4f}")

In [None]:
# Columns of X_rf that are bookkeeping rather than model inputs. 'index' is dropped
# when present: reset_index() without drop=True stores the previous row numbers in a
# column of that name, and row position must not be used as a feature.
non_feature_cols = ['data_type', 'index']

# X_rf and y_rf hold the same rows in the same order but carry different indexes,
# so each frame is filtered with its own 'data_type' column.
X_train = X_rf[X_rf['data_type'] == 'Train'].drop(columns=non_feature_cols, errors='ignore')
y_train = y_rf[y_rf['data_type'] == 'Train']['Yield']
X_test = X_rf[X_rf['data_type'] == 'Test'].drop(columns=non_feature_cols, errors='ignore')
y_test = y_rf[y_rf['data_type'] == 'Test']['Yield']

# Fixed random_state keeps the forest reproducible between runs.
rf = RandomForestRegressor(
    n_estimators=500, max_depth=20, min_samples_leaf=3, n_jobs=-1, random_state=42
)
rf.fit(X_train, y_train)
yPred_rf = rf.predict(X_test)
r2_rf = r2_score(y_test, yPred_rf)
print(f"Random Forest r^2: {r2_rf:.4f}")

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error

# Mean absolute error of the random-forest predictions on the test rows, in the
# same units as the 'Yield' column.
mae = mean_absolute_error(y_test, yPred_rf)
# Label matches the "Random Forest r^2" line printed for the same model.
print(f"Random Forest MAE: {mae}")


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Kernel density estimate of the training-set yields, drawn on an explicit Axes.
fig, ax = plt.subplots()
y_train.plot.kde(bw_method=0.3, label='KDE Plot', color='blue', ax=ax)

ax.set_title('Kernel Density Estimation Plot')
ax.set_xlabel('Value')
ax.set_ylabel('Density')
ax.legend()
ax.grid(True)

plt.show()

The previous error was caused by a conflict between the numpy version that `rdkit-pypi` pulls in as a dependency and the numpy version installed explicitly afterwards. To resolve this, the first cell uninstalls both `rdkit-pypi` and `numpy` and then reinstalls `rdkit-pypi`, so that pip installs a numpy version compatible with `rdkit-pypi` alongside it.