In [3]:
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import (
    KFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# --- Configuration ----------------------------------------------------------
# One seed drives every stochastic step so that the list of dropped rows and
# the reported scores are reproducible under Restart & Run All.
SEED = 42
N_SPLIT_REPEATS = 50   # random train/test splits used to vote for outliers
TEST_SIZE = 0.20       # fraction of rows held out in each split
RESIDUAL_FACTOR = 6    # flag a row when |residual| > factor * median(|residual|)
N_TOP_OUTLIERS = 8     # number of most frequently flagged rows to drop
N_CV_SPLITS = 5        # folds in the final cross-validation

# --- Load data --------------------------------------------------------------
descriptors = pd.read_csv('training_descriptors.csv')
y = descriptors['standard_value']
X = descriptors.drop(columns='standard_value')

# Low-variance feature filter.
# NOTE(review): X_selected is computed here but never used below; every model
# in this cell is fitted on the unfiltered X. Confirm whether the filtered
# matrix was meant to replace X.
selector = VarianceThreshold(threshold=0.1)
X_selected = selector.fit_transform(X)

# --- Find outliers ----------------------------------------------------------
# Repeatedly hold out a random test set, fit a random forest on the remainder,
# and record the index labels of test rows whose absolute residual exceeds
# RESIDUAL_FACTOR times the median absolute residual of that split.
RF_outliers = []
for i in range(N_SPLIT_REPEATS):
    # SEED + i: every repeat gets a different, but reproducible, split/forest.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=TEST_SIZE,
        shuffle=True,
        random_state=SEED + i,
    )

    model = RandomForestRegressor(max_depth=20, min_samples_leaf=10,
                                  random_state=SEED + i)
    pipe = Pipeline([('scaler', StandardScaler()), ('model', model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    residuals = y_test - y_pred
    abs_residuals = np.abs(residuals)
    threshold = RESIDUAL_FACTOR * np.median(abs_residuals)
    outliers_mask = abs_residuals > threshold

    X_outliers_index = X_test[outliers_mask].index.tolist()
    RF_outliers.extend(X_outliers_index)

# Count how often each row index was flagged (single pass over the list).
outlier_frequency = dict(Counter(RF_outliers))

# --- Remove top outliers ----------------------------------------------------
# Most frequently flagged first; ties are broken by index label so the result
# does not depend on set/dict iteration order.
# NOTE: despite its name, this list holds N_TOP_OUTLIERS (= 8) entries; the
# name is kept so that any other cell referring to it keeps working.
top_ten_outliers = sorted(
    outlier_frequency,
    key=lambda idx: (-outlier_frequency[idx], idx),
)[:N_TOP_OUTLIERS]

X_clean = X.drop(index=top_ten_outliers)
y_clean = y.drop(index=top_ten_outliers)

# --- Cross-validated evaluation on the cleaned data -------------------------
# NOTE(review): the rows removed above were chosen using residuals of the same
# model family on the same data, so these scores are likely optimistic.

# custom scorers for R2 and MSE
scorers = {
    'r2': make_scorer(r2_score),
    'mse': make_scorer(mean_squared_error)
}

# 5-fold cross-validated scores using RF. A single cross_validate call with a
# seeded KFold guarantees that R2 and MSE come from the same folds and the
# same fitted models (two separate cross_val_score calls with an unseeded,
# shuffling KFold would each draw their own partition).
model = RandomForestRegressor(max_depth=20, min_samples_leaf=10,
                              random_state=SEED)
pipe = Pipeline([('scaler', StandardScaler()), ('model', model)])
cv = KFold(n_splits=N_CV_SPLITS, shuffle=True, random_state=SEED)
cv_results = cross_validate(pipe, X_clean, y_clean, cv=cv, scoring=scorers)
scores_r2 = cv_results['test_r2']
scores_mse = cv_results['test_mse']

mean_r2 = scores_r2.mean()
mean_mse = scores_mse.mean()

print("Mean R2 Score:", mean_r2)
print("Mean MSE Score:", mean_mse)

Mean R2 Score: 0.6877564341082436
Mean MSE Score: 0.648885784874459


In [4]:
# Re-attach the target column to the outlier-free feature table
# (column-wise concatenation, aligned on the row index), then display it.
new_df = pd.concat(objs=[X_clean, y_clean], axis="columns")
new_df

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,standard_value
0,6.171280,6.171280,0.617806,0.617806,0.834426,11.200000,243.137,231.041,242.037754,80.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.809668
1,6.171280,6.171280,0.617806,0.617806,0.834426,11.200000,243.137,231.041,242.037754,80.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.465974
2,15.021999,15.021999,0.005149,-1.846230,0.018162,23.080000,1396.591,1301.839,1395.709934,544.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,5.920819
3,15.494058,15.494058,0.008604,-1.707951,0.013858,22.266055,1499.798,1395.974,1498.781308,582.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,5.000000
4,15.044508,15.044508,0.005398,-1.719962,0.017958,22.390000,1377.612,1284.876,1376.691744,534.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5.744727
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
900,17.150194,17.150194,0.155398,-0.486473,0.185937,20.740000,679.801,637.465,679.339464,260.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.301030
901,16.669089,16.669089,0.108723,-0.539477,0.212523,21.127660,643.739,604.427,643.308245,246.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.301030
902,16.874213,16.874213,0.077871,-0.738097,0.221835,19.739130,647.174,609.878,646.274692,240.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.301030
903,17.156997,17.156997,0.063242,-0.606465,0.163919,20.490196,694.787,654.467,694.319144,264.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.301030


In [6]:
# Persist the cleaned table. index=False means the row labels are not written
# to the file.
new_df.to_csv(path_or_buf='training_descriptors_no_outliers.csv', index=False)