In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import preprocessing, ensemble
from sklearn.metrics import mean_squared_error

# Load the descriptor table. The 13 columns selected below are the model
# inputs; 'LogS' is the regression target.
Dataset = pd.read_csv("full_descriptors.csv")
X = Dataset[['MW','volume','G_sol','DeltaG_sol','sol_dip',
             'Lsolu_Hsolv','Lsolv_Hsolu','SASA','O_charges',
             'C_charges','Most_neg','Most_pos','Het_charges']]
y = Dataset['LogS']

# Out-of-fold targets and predictions, accumulated over all folds
y_true_all = []
y_pred_all = []

# 10-fold CV with a seeded shuffle, so the fold assignment is repeatable
kf = KFold(n_splits=10, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the scaler on the training fold only, then apply it to both parts,
    # so the held-out rows never influence the scaling statistics.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # The forest is seeded with the same value as the fold split: without a
    # random_state the bootstrap/feature sampling differs on every run and
    # the saved predictions cannot be reproduced.
    model = ensemble.RandomForestRegressor(
        n_estimators=500, n_jobs=-1, random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    y_true_all.extend(y_test)
    y_pred_all.extend(y_pred)

# Save one row per sample: experimental value, out-of-fold prediction and
# their absolute difference.
results_df = pd.DataFrame({
    'Experimental': y_true_all,
    'Predicted': y_pred_all,
    'Absolute Error': np.abs(np.array(y_true_all) - np.array(y_pred_all))
})
results_df.to_csv("RandomForest_predictions.csv", index=False)
print("Saved: RandomForest_predictions.csv")


Saved: RandomForest_predictions.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import preprocessing, ensemble

# Load the descriptor table
Dataset = pd.read_csv("full_descriptors.csv")

# Shuffle the rows once up front. The shuffle is seeded so that a fresh
# kernel reproduces the same row order, and therefore the same folds.
Dataset = Dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Define input features and target
X = Dataset[['MW','volume','G_sol','DeltaG_sol','sol_dip',
             'Lsolu_Hsolv','Lsolv_Hsolu','SASA','O_charges',
             'C_charges','Most_neg','Most_pos','Het_charges']]
y = Dataset['LogS']

# Contiguous 10-fold split: KFold itself is left unshuffled because the
# randomisation already happened in the row shuffle above.
kf = KFold(n_splits=10)

# Out-of-fold targets and predictions, accumulated over all folds
y_true_all = []
y_pred_all = []

# Run 10-fold CV for Random Forest
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Standardize features using statistics from the training fold only
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Train Random Forest. Seeded so that repeated runs write identical
    # predictions; an unseeded forest resamples differently every time.
    model = ensemble.RandomForestRegressor(
        n_estimators=500, n_jobs=-1, random_state=42
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Collect this fold's held-out targets and predictions
    y_true_all.extend(y_test)
    y_pred_all.extend(y_pred)

# Create results DataFrame: one row per sample, in shuffled-row order
results_df = pd.DataFrame({
    'Experimental': y_true_all,
    'Predicted': y_pred_all,
    'Absolute Error': np.abs(np.array(y_true_all) - np.array(y_pred_all))
})

# Save to CSV.
# NOTE(review): this is the same path the first cell writes, so running both
# cells leaves only this cell's results on disk — confirm that is intended.
results_df.to_csv("RandomForest_predictions.csv", index=False)
print("✅ Saved: RandomForest_predictions.csv")


✅ Saved: RandomForest_predictions.csv


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn import svm
from sklearn.cross_decomposition import PLSRegression
from sklearn import ensemble
from scipy.stats import pearsonr

# Load and shuffle dataset.
# The shuffle is seeded: X, y and kf defined here are reused by the model
# cells below, and an unseeded shuffle would give every kernel restart a
# different row order (and different folds), making the saved prediction
# files impossible to reproduce or compare between runs.
Dataset = pd.read_csv("full_descriptors.csv")
Dataset = Dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Features and target
X = Dataset[['MW','volume','G_sol','DeltaG_sol','sol_dip',
             'Lsolu_Hsolv','Lsolv_Hsolu','SASA','O_charges',
             'C_charges','Most_neg','Most_pos','Het_charges']]
y = Dataset['LogS']

# Set up 10-fold CV. Folds are contiguous blocks of the shuffled rows, so the
# split is deterministic once the shuffle above is fixed.
kf = KFold(n_splits=10)


In [4]:
# Multiple linear regression: gather out-of-fold predictions over the folds
# produced by kf, then write them to MLR_predictions.csv.
true_vals, pred_vals = [], []

for fit_rows, holdout_rows in kf.split(X):
    X_train = X.iloc[fit_rows]
    X_test = X.iloc[holdout_rows]
    y_train = y.iloc[fit_rows]
    y_test = y.iloc[holdout_rows]

    # Scaling statistics come from the training fold only
    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = LinearRegression().fit(X_train, y_train)
    preds = model.predict(X_test)

    true_vals += list(y_test)
    pred_vals += list(preds)

# One row per sample: experimental value, prediction, absolute difference
abs_error = np.abs(np.asarray(true_vals) - np.asarray(pred_vals))
mlr_results = pd.DataFrame({
    'Experimental': true_vals,
    'Predicted': pred_vals,
    'Absolute Error': abs_error,
})
mlr_results.to_csv("MLR_predictions.csv", index=False)


In [5]:
# Neural-network regression (MLPRegressor): gather out-of-fold predictions
# over the folds produced by kf, then write them to ANN_predictions.csv.
true_vals = []
pred_vals = []

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Scaling statistics come from the training fold only
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # The regressor is deliberately left unseeded: the retry loop below only
    # helps if each refit can start from a different initialisation.
    mlp = MLPRegressor(hidden_layer_sizes=300, max_iter=800)

    # Reset per fold so a failed fold can never silently reuse the previous
    # fold's predictions (or hit a NameError on the very first fold).
    preds = None
    last_error = None
    for _ in range(100):
        try:
            mlp.fit(X_train, y_train)
            preds = mlp.predict(X_test)
        except Exception as exc:
            # Only real errors are retried; KeyboardInterrupt/SystemExit
            # propagate so the cell can still be interrupted.
            last_error = exc
            continue
        # A zero peak-to-peak range means every prediction is identical,
        # i.e. the network collapsed to a constant; refit in that case.
        if np.ptp(preds) != 0:
            break

    if preds is None:
        raise RuntimeError(
            "MLPRegressor raised on every fit attempt for this fold"
        ) from last_error

    true_vals.extend(y_test)
    pred_vals.extend(preds)

pd.DataFrame({
    'Experimental': true_vals,
    'Predicted': pred_vals,
    'Absolute Error': np.abs(np.array(true_vals) - np.array(pred_vals))
}).to_csv("ANN_predictions.csv", index=False)


In [6]:
# Support-vector regression (RBF kernel): gather out-of-fold predictions over
# the folds produced by kf, then write them to SVM_predictions.csv.
true_vals, pred_vals = [], []

for train_rows, test_rows in kf.split(X):
    y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

    # Fit the scaler on the training rows and reuse it for the held-out rows
    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X.iloc[train_rows])
    X_test = scaler.transform(X.iloc[test_rows])

    model = svm.SVR(kernel='rbf', C=4, gamma=0.03, epsilon=0.01)
    preds = model.fit(X_train, y_train).predict(X_test)

    true_vals.extend(y_test)
    pred_vals.extend(preds)

# One row per sample: experimental value, prediction, absolute difference
svr_abs_error = np.abs(np.asarray(true_vals) - np.asarray(pred_vals))
svr_results = pd.DataFrame({
    'Experimental': true_vals,
    'Predicted': pred_vals,
    'Absolute Error': svr_abs_error,
})
svr_results.to_csv("SVM_predictions.csv", index=False)


In [7]:
# Partial least squares regression (9 components): gather out-of-fold
# predictions over the folds produced by kf, then write PLS_predictions.csv.
true_vals = list()
pred_vals = list()

for fold_train, fold_test in kf.split(X):
    X_train = X.iloc[fold_train]
    y_train = y.iloc[fold_train]
    X_test = X.iloc[fold_test]
    y_test = y.iloc[fold_test]

    # Standardise with training-fold statistics only
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train)
    X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    model = PLSRegression(n_components=9)
    model.fit(X_train, y_train)
    # Collapse the prediction array to 1-D before collecting it
    raw_preds = model.predict(X_test)
    preds = raw_preds.flatten()

    true_vals.extend(y_test)
    pred_vals.extend(preds)

# One row per sample: experimental value, prediction, absolute difference
pls_table = {
    'Experimental': true_vals,
    'Predicted': pred_vals,
    'Absolute Error': np.abs(np.array(true_vals) - np.array(pred_vals)),
}
pd.DataFrame(pls_table).to_csv("PLS_predictions.csv", index=False)


In [8]:
# Extra-Trees regression: gather out-of-fold predictions over the folds
# produced by kf, then write them to ExtraTrees_predictions.csv.
true_vals = []
pred_vals = []

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Scaling statistics come from the training fold only
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Seeded so repeated runs write identical predictions; without a
    # random_state the randomised tree construction differs on every run.
    model = ensemble.ExtraTreesRegressor(
        n_estimators=500, n_jobs=-1, random_state=42
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    true_vals.extend(y_test)
    pred_vals.extend(preds)

# One row per sample: experimental value, prediction, absolute difference
pd.DataFrame({
    'Experimental': true_vals,
    'Predicted': pred_vals,
    'Absolute Error': np.abs(np.array(true_vals) - np.array(pred_vals))
}).to_csv("ExtraTrees_predictions.csv", index=False)


In [9]:
# Bagging regression: gather out-of-fold predictions over the folds produced
# by kf, then write them to Bagging_predictions.csv.
true_vals = []
pred_vals = []

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Scaling statistics come from the training fold only
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Seeded so repeated runs write identical predictions; without a
    # random_state the bootstrap resampling differs on every run.
    model = ensemble.BaggingRegressor(
        n_estimators=500, n_jobs=-1, random_state=42
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    true_vals.extend(y_test)
    pred_vals.extend(preds)

# One row per sample: experimental value, prediction, absolute difference
pd.DataFrame({
    'Experimental': true_vals,
    'Predicted': pred_vals,
    'Absolute Error': np.abs(np.array(true_vals) - np.array(pred_vals))
}).to_csv("Bagging_predictions.csv", index=False)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import preprocessing
import GPy

# Gaussian-process regression (GPy, RBF kernel): gather out-of-fold
# predictions over the folds produced by kf, then write GPR_predictions.csv.
# NOTE(review): relies on kf, X and y already existing in the kernel.
true_vals, pred_vals = [], []

for fit_rows, holdout_rows in kf.split(X):
    X_train = X.iloc[fit_rows]
    X_test = X.iloc[holdout_rows]
    y_train = y.iloc[fit_rows]
    y_test = y.iloc[holdout_rows]

    # Standardise with training-fold statistics only
    scaler = preprocessing.StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # The training targets are passed to GPy as a single-column 2-D array
    y_train_array = np.array(y_train)[:, np.newaxis]

    # One RBF kernel over all input dimensions; optimize() tunes the model
    n_features = X_train.shape[1]
    kernel = GPy.kern.RBF(input_dim=n_features)
    model = GPy.models.GPRegression(X_train, y_train_array, kernel)
    model.optimize()

    # Only the first element of predict()'s return value is kept
    prediction_output = model.predict(X_test)
    preds = prediction_output[0].flatten()

    true_vals.extend(y_test)
    pred_vals.extend(preds)

# One row per sample: experimental value, prediction, absolute difference
gpr_abs_error = np.abs(np.asarray(true_vals) - np.asarray(pred_vals))
gpr_results = pd.DataFrame({
    'Experimental': true_vals,
    'Predicted': pred_vals,
    'Absolute Error': gpr_abs_error,
})
gpr_results.to_csv("GPR_predictions.csv", index=False)