In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from lolopy.learners import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import ShuffleSplit

import warnings

warnings.filterwarnings('ignore', category=ConvergenceWarning)

Goal:

* For each of the 9 datasets
* For each of the algorithms (Simple Gauss, PCA Gauss, Tuned Gauss; Simple RF, PCA RF, Tuned RF)

Determine the following R2 cross-validation scores and then take the median over all datasets:

In [2]:
data_path = '../data/'

# Adjust in case you want to run this notebook on another dataset
dataset = 'DS_cube_150_2_Strength.csv'
filename = data_path + dataset
full_path = filename

# Read the data set and shuffle its rows once. The shuffle is seeded so that the row
# order is identical on every run; the GridSearchCV(cv=5) cells further down build
# their folds from this row order, so an unseeded shuffle would change their scores
# from one run to the next.
df = (
    pd.read_csv(full_path)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

df

Unnamed: 0,Idx_Sample,SiO2,Al2O3,Fe2O3,CaO,MgO,Na2O,K2O,SO3,TiO2,...,NaOH (Dry),Additional water,Superplasticizer,water -eff,Initial curing time (day),Initial curing temp (C),Initial curing rest time (day),Final curing temp (C),Mixture CO2 (Na2SiO3 as solution),28-d Cubic compressive strength (MPa)
0,1523,47.085,23.265,2.525,18.30,4.570,0.110,0.0,0.625,0.0,...,17.371429,0.0,15.2,125.427143,0,25,1,30,130.204069,59.9800
1,1539,52.295,24.571,3.215,12.58,3.242,0.154,0.0,0.515,0.0,...,22.125714,0.0,17.6,158.698571,0,25,1,30,154.167954,25.8800
2,1473,52.295,24.571,3.215,12.58,3.242,0.154,0.0,0.515,0.0,...,15.634286,0.0,15.2,113.796429,0,25,1,30,118.343726,34.7900
3,1433,47.085,23.265,2.525,18.30,4.570,0.110,0.0,0.625,0.0,...,22.628571,0.0,18.0,162.305357,0,25,1,30,161.732571,48.4967
4,1410,52.295,24.571,3.215,12.58,3.242,0.154,0.0,0.515,0.0,...,24.685714,0.0,18.0,176.078571,0,25,1,30,167.137714,20.5882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269,1711,42.040,33.600,4.000,12.73,0.000,0.000,0.0,0.000,0.0,...,23.912000,0.0,0.0,78.461250,1,60,2,24,117.142053,47.5800
270,1584,47.085,23.265,2.525,18.30,4.570,0.110,0.0,0.625,0.0,...,21.394286,0.0,15.6,152.601429,0,25,1,30,150.187886,47.1300
271,1476,52.295,24.571,3.215,12.58,3.242,0.154,0.0,0.515,0.0,...,16.868571,0.0,16.4,122.780357,0,25,1,30,126.742011,36.2400
272,1492,47.085,23.265,2.525,18.30,4.570,0.110,0.0,0.625,0.0,...,15.222857,0.0,14.8,110.801786,0,25,1,30,119.065577,55.6200


In [None]:
# Adjust target in case you want to run this notebook on another dataset
target_column = '28-d Cubic compressive strength (MPa)'

# Columns that must not be used as model inputs: the target itself plus two more columns
excluded_columns = [target_column, 'Idx_Sample', 'Mixture CO2 (Na2SiO3 as solution)']
X_train = df.drop(columns=excluded_columns)

# StandardScaler expects a 2-D array, hence the reshape of the target into a single column
target_values = df[target_column].to_numpy(copy=True).reshape(-1, 1)
y_train = StandardScaler().fit_transform(target_values)
y_train

In [None]:
# Simple Gauss: standardised features feeding a GP with a tunable amplitude * RBF kernel
default_kernel = ConstantKernel(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
gauss_from_slamd = make_pipeline(
    StandardScaler(),
    # random_state fixes the start points drawn for the 3 optimizer restarts
    GaussianProcessRegressor(kernel=default_kernel, n_restarts_optimizer=3, random_state=42),
)

# 25 random splits with 70 % of the rows used for training. The splitter is seeded so
# that re-running the cell reproduces the same partitions (and therefore the same scores).
simple_gauss_cv = ShuffleSplit(n_splits=25, train_size=0.7, random_state=42)
scores_gauss_simple_r2 = cross_val_score(gauss_from_slamd, X_train, y_train, scoring='r2', cv=simple_gauss_cv)

print('#### SIMPLE GAUSS PREDICTION SCORES ####')
print('# R2 #')
print(f'Scores: {scores_gauss_simple_r2}')
print(f'Mean: {scores_gauss_simple_r2.mean()}')
print(f'Std: {scores_gauss_simple_r2.std()}')

In [None]:
# Gauss with PCA: standardise, reduce with PCA(n_components=0.99), then fit a GP that
# uses scikit-learn's default kernel (no kernel argument is passed)
gauss_pca = make_pipeline(StandardScaler(), PCA(n_components=0.99), GaussianProcessRegressor(n_restarts_optimizer=3))

# 25 random splits with 70 % of the rows used for training. The splitter is seeded so
# that re-running the cell reproduces the same partitions (and therefore the same scores).
pca_gauss_cv = ShuffleSplit(n_splits=25, train_size=0.7, random_state=42)
scores_gauss_pca_r2 = cross_val_score(gauss_pca, X_train, y_train, scoring='r2', cv=pca_gauss_cv)

print('#### PCA GAUSS WITH DEFAULT KERNEL PREDICTION SCORES ####')
print('# R2 #')
print(f'Scores: {scores_gauss_pca_r2}')
print(f'Mean: {scores_gauss_pca_r2.mean()}')
print(f'Std: {scores_gauss_pca_r2.std()}')

In [None]:
# TUNED GAUSS ALGORITHM - setup

# Isotropic candidate kernel: both the amplitude and the single length scale are
# declared "fixed", i.e. excluded from hyperparameter optimisation.
fixed_amplitude = ConstantKernel(1.0, constant_value_bounds="fixed")
fixed_length_scale_rbf = RBF(1.0, length_scale_bounds="fixed")
default_kernel = fixed_amplitude * fixed_length_scale_rbf

def _create_aniso_kernel(n_dims):
    """Return a fixed-amplitude RBF kernel with one length scale per input dimension.

    Args:
        n_dims: number of length scales to create; each one starts at 1.
    """
    initial_length_scales = [1] * n_dims
    amplitude = ConstantKernel(1.0, constant_value_bounds="fixed")
    return amplitude * RBF(length_scale=initial_length_scales)

# Number of features the selector keeps. The anisotropic kernel needs exactly one
# length scale per selected feature, so both grid entries are derived from this value.
n_selected_features = 8

param_grid = {
    'sfs__k_features': [n_selected_features],
    'gp2__kernel': [default_kernel, _create_aniso_kernel(n_selected_features)]
}

# gauss1 scores candidate feature subsets inside the selector; gauss2 is the final
# regressor whose kernel the grid search varies. random_state makes the optimizer
# restarts start from the same points on every run.
gauss1 = GaussianProcessRegressor(normalize_y=True, n_restarts_optimizer=3, random_state=42)
gauss2 = GaussianProcessRegressor(normalize_y=True, n_restarts_optimizer=3, random_state=42)

sfs_gr_testing = SFS(estimator=gauss1,
                     forward=True,
                     floating=False,
                     scoring='r2',
                     cv=None)

# standardise -> sequential forward feature selection -> Gaussian process
pipe = Pipeline([('std', StandardScaler()),
                 ('sfs', sfs_gr_testing),
                 ('gp2', gauss2)])

# refit=False: the search only ranks the parameter combinations; the winning
# configuration is applied to `pipe` and re-evaluated in the next cell.
grid_search_cv = GridSearchCV(estimator=pipe,
                              param_grid=param_grid,
                              scoring='r2',
                              n_jobs=1,
                              cv=5,
                              refit=False)

In [None]:
# TUNED GAUSS ALGORITHM - run gridsearch
grid_search_cv = grid_search_cv.fit(X_train, y_train)

print('#### GAUSS TUNING RESULTS ####')

# Report mean and standard deviation of the R2 score for every parameter combination
cv_results = grid_search_cv.cv_results_
for params, mean_score, std_score in zip(cv_results['params'],
                                         cv_results['mean_test_score'],
                                         cv_results['std_test_score']):
    print(" -------------- NEXT GRID RESULTS -------------------------")
    print(params, 'test acc.:', mean_score)
    print(params, 'test std.:', std_score)

print("BEST PARAMETERS VIA GRIDSEARCH", grid_search_cv.best_params_)

# set_params configures `pipe` in place and returns it, so best_estimator is that same pipeline
best_estimator = pipe.set_params(**grid_search_cv.best_params_)

# 25 random splits with 70 % of the rows used for training. The splitter is seeded so
# that re-running the cell reproduces the same partitions.
tuned_gauss_cv = ShuffleSplit(n_splits=25, train_size=0.7, random_state=42)
scores_gauss_tuned_r2 = cross_val_score(best_estimator, X_train, y_train, scoring='r2', cv=tuned_gauss_cv)

print('#### TUNED GAUSS PREDICTION SCORES ####')
print('# R2 #')
print(f'Scores: {scores_gauss_tuned_r2}')
print(f'Mean: {scores_gauss_tuned_r2.mean()}')
print(f'Std: {scores_gauss_tuned_r2.std()}')

In [None]:
class SlamdLoloRF(RandomForestRegressor):
    """lolopy random forest that replicates very small training sets before fitting."""

    def fit(self, X, y, weigths=None, random_seed=42):
        """Fit the forest, repeating the training data four times when it has fewer than 8 samples.

        Args:
            X: 2-D feature matrix, one row per sample.
            y: targets aligned with the rows of X; either 1-D or 2-D with one row per sample.
            weigths: forwarded unchanged as the third positional argument of the parent
                ``fit`` (the spelling is kept so existing keyword callers keep working).
            random_seed: forwarded to the parent ``fit``; defaults to 42.

        Returns:
            The fitted estimator (``self``), as the scikit-learn estimator API expects.
        """
        if y.shape[0] < 8:
            X = np.tile(X, (4, 1))
            # Repeat along the sample axis only. A fixed (4, 1) pattern would turn a
            # 1-D target of length n into shape (4, n) instead of length 4 * n.
            y = np.tile(y, (4,) + (1,) * (y.ndim - 1))
            # NOTE(review): `weigths` is not replicated here; if it holds per-sample
            # weights its length no longer matches the tiled data - confirm against lolopy.
        super().fit(X, y, weigths, random_seed)
        return self

In [None]:
# Simple RF: standardised features feeding the lolopy random forest wrapper
rf_from_slamd = make_pipeline(StandardScaler(), SlamdLoloRF())

# 25 random splits with 70 % of the rows used for training. The splitter is seeded so
# that re-running the cell reproduces the same partitions (and therefore the same scores).
simple_rf_cv = ShuffleSplit(n_splits=25, train_size=0.7, random_state=42)
scores_rf_simple_r2 = cross_val_score(rf_from_slamd, X_train, y_train, scoring='r2', cv=simple_rf_cv)

# cross_val_score works on clones, so fit the pipeline itself on the full data afterwards
rf_from_slamd.fit(X_train, y_train)

print('#### SIMPLE RF PREDICTION SCORES ####')
print('# R2 #')
print(f'Scores: {scores_rf_simple_r2}')
print(f'Mean: {scores_rf_simple_r2.mean()}')
print(f'Std: {scores_rf_simple_r2.std()}')

In [None]:
# RF with PCA: standardise, reduce with PCA(n_components=0.99), then fit the random forest
rf_pca = make_pipeline(StandardScaler(), PCA(n_components=0.99), SlamdLoloRF())

# 25 random splits with 70 % of the rows used for training. The splitter is seeded so
# that re-running the cell reproduces the same partitions (and therefore the same scores).
pca_rf_cv = ShuffleSplit(n_splits=25, train_size=0.7, random_state=42)
scores_rf_pca_r2 = cross_val_score(rf_pca, X_train, y_train, scoring='r2', cv=pca_rf_cv)

print('#### PCA RF PREDICTION SCORES ####')
print('# R2 #')
print(f'Scores: {scores_rf_pca_r2}')
print(f'Mean: {scores_rf_pca_r2.mean()}')
print(f'Std: {scores_rf_pca_r2.std()}')

In [None]:
# TUNED RF ALGORITHM - setup

# Two independent forests: rf1 scores candidate feature subsets inside the selector,
# rf2 is the final regressor whose depth the grid search varies.
rf1 = SlamdLoloRF()
rf2 = SlamdLoloRF()

# Keys follow the "<pipeline step>__<parameter>" convention of the pipeline built below
param_grid = dict(
    sfs__k_features=[8],
    rf2__max_depth=[1, 5],
)

sfs_rf_testing = SFS(estimator=rf1, forward=True, floating=False, scoring='r2', cv=None)

# standardise -> sequential forward feature selection -> random forest
pipeline_steps = [
    ('std', StandardScaler()),
    ('sfs', sfs_rf_testing),
    ('rf2', rf2),
]
pipe = Pipeline(pipeline_steps)

# refit=False: the search only ranks the parameter combinations; no final model is fitted here
grid_search_cv = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='r2',
    n_jobs=1,
    cv=5,
    refit=False,
)

In [None]:
# TUNED RF ALGORITHM - run gridsearch
grid_search_cv = grid_search_cv.fit(X_train, y_train)

print('#### RF TUNING RESULTS ####')

# Report mean and standard deviation of the R2 score for every parameter combination
cv_results = grid_search_cv.cv_results_
for params, mean_score, std_score in zip(cv_results['params'],
                                         cv_results['mean_test_score'],
                                         cv_results['std_test_score']):
    print(" -------------- NEXT GRID RESULTS -------------------------")
    print(params, 'test acc.:', mean_score)
    print(params, 'test std.:', std_score)

print("BEST PARAMETERS VIA GRIDSEARCH", grid_search_cv.best_params_)

# set_params configures `pipe` in place and returns it, so best_estimator is that same pipeline
best_estimator = pipe.set_params(**grid_search_cv.best_params_)

# 25 random splits with 70 % of the rows used for training. The splitter is seeded so
# that re-running the cell reproduces the same partitions.
tuned_rf_cv = ShuffleSplit(n_splits=25, train_size=0.7, random_state=42)
scores_rf_tuned_r2 = cross_val_score(best_estimator, X_train, y_train, scoring='r2', cv=tuned_rf_cv)

# These are the scores of the tuned forest, so the header says TUNED RF (not PCA RF)
print('#### TUNED RF PREDICTION SCORES ####')
print('# R2 #')
print(f'Scores: {scores_rf_tuned_r2}')
print(f'Mean: {scores_rf_tuned_r2.mean()}')
print(f'Std: {scores_rf_tuned_r2.std()}')

In [None]:
print('All outputs in CSV format (copy them into a file):')

# (column prefix, score array) pairs in output order. The header and the data row are
# both generated from this one list and joined with ',', so they always have the same
# number of fields and no separator can be lost between two values.
csv_score_columns = [
    ('gauss_simple_r2', scores_gauss_simple_r2),
    ('gauss_pca_r2', scores_gauss_pca_r2),
    ('gauss_tuned_r2', scores_gauss_tuned_r2),
    ('rf_simple_r2', scores_rf_simple_r2),
    ('rf_pca_r2', scores_rf_pca_r2),
    ('rf_tuned_r2', scores_rf_tuned_r2),
]

csv_header = ['dataset']
csv_row = [dataset]
for column_prefix, scores in csv_score_columns:
    csv_header += [f'{column_prefix}_mean', f'{column_prefix}_std']
    csv_row += [f'{scores.mean()}', f'{scores.std()}']

print(','.join(csv_header))
print(','.join(csv_row))


In [3]:
# Read the baseline results table and print a summary of its columns
results_path = '../results/baseline/baseline_results_70_30.csv'
results_df = pd.read_csv(results_path)
results_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   dataset               9 non-null      object 
 1   gauss_simple_r2_mean  9 non-null      float64
 2   gauss_simple_r2_std   9 non-null      float64
 3   gauss_pca_r2_mean     9 non-null      float64
 4   gauss_pca_r2_std      9 non-null      float64
 5   gauss_tuned_r2_mean   9 non-null      float64
 6   gauss_tuned_r2_std    9 non-null      float64
 7   rf_simple_r2_mean     9 non-null      float64
 8   rf_simple_r2_std      9 non-null      float64
 9   rf_pca_r2_mean        9 non-null      float64
 10  rf_pca_r2_std         9 non-null      float64
 11  rf_tuned_r2_mean      9 non-null      float64
 12  rf_tuned_r2_std       9 non-null      float64
dtypes: float64(12), object(1)
memory usage: 1.0+ KB


In [4]:
# Keep only the mean R2 column of each algorithm and key the rows by dataset name
mean_r2_columns = [
    'gauss_simple_r2_mean',
    'gauss_pca_r2_mean',
    'gauss_tuned_r2_mean',
    'rf_simple_r2_mean',
    'rf_pca_r2_mean',
    'rf_tuned_r2_mean',
]
r2_results_df = results_df.set_index('dataset')[mean_r2_columns]
r2_results_df

Unnamed: 0_level_0,gauss_simple_r2_mean,gauss_pca_r2_mean,gauss_tuned_r2_mean,rf_simple_r2_mean,rf_pca_r2_mean,rf_tuned_r2_mean
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
DS_cube_100_1_Strength.csv,0.166192,0.345226,0.528039,0.916934,0.92407,0.910466
DS_cube_100_2_Strength.csv,0.051853,0.406616,0.080654,0.693609,0.679875,0.634943
DS_cube_100_3_Strength.csv,0.136376,0.647587,0.14316,0.858953,0.771145,0.779053
DS_cube_100_4_Strength.csv,0.103441,0.454574,0.093934,0.7875,0.703432,0.692283
DS_cube_100_5_Strength.csv,0.053141,0.435745,0.079125,0.805169,0.768302,0.763874
DS_cube_150_2_Strength.csv,0.111745,-4.024819,0.445924,0.920297,0.851566,0.891842
DS_cyl_100x200_1_Strength.csv,-0.096577,-1.073853,-1.046272,0.602808,0.452736,0.500272
DS_cyl_100x200_2_Strength.csv,-0.048253,0.219595,-0.047949,0.358973,0.257855,0.422785
DS_cyl_100x200_3_Strength.csv,-0.01447,0.040151,-0.067393,0.342442,0.203142,0.139566


In [5]:
# Median over all datasets of each algorithm's mean R2 score
for metric_name in r2_results_df.columns:
    metric_median = r2_results_df[metric_name].median()
    print("Median of ", metric_name, metric_median)

Median of  gauss_simple_r2_mean 0.053141209018329
Median of  gauss_pca_r2_mean 0.3452256349462761
Median of  gauss_tuned_r2_mean 0.0806539686244807
Median of  rf_simple_r2_mean 0.7874996191170959
Median of  rf_pca_r2_mean 0.7034315391513977
Median of  rf_tuned_r2_mean 0.6922834424450391
