In [1]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
import os
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed
from sklearn.base import clone
import ast
import time

In [2]:
# Same model grid as in the original experiment; only the split-criterion
# variant (criterion="entropy") was dropped, because for regression the
# criterion is simply MSE.
n_jobs = 1


def _forest(label, **hyperparams):
    """Return a ``(label, estimator)`` pair using the shared seed and ``n_jobs``."""
    return (label, RandomForestRegressor(random_state=123, n_jobs=n_jobs, **hyperparams))


########################################### MODELS ##########################################
# Number of trees.
rf_100 = _forest('RF[100]', n_estimators=100)
rf_200 = _forest('RF[200]', n_estimators=200)
rf_500 = _forest('RF[500]', n_estimators=500)
# Maximum tree depth.
rf_md_10 = _forest('RF[md10]', max_depth=10)
rf_md_15 = _forest('RF[md15]', max_depth=15)
rf_md_20 = _forest('RF[md20]', max_depth=20)
rf_md_25 = _forest('RF[md25]', max_depth=25)
# Minimum number of samples required to split a node.
rf_mss_3 = _forest('RF[mss3]', min_samples_split=3)
rf_mss_4 = _forest('RF[mss4]', min_samples_split=4)
rf_mss_6 = _forest('RF[mss6]', min_samples_split=6)
rf_mss_8 = _forest('RF[mss8]', min_samples_split=8)
# Minimum number of samples required in a leaf.
rf_msl_2 = _forest('RF[msl2]', min_samples_leaf=2)
rf_msl_3 = _forest('RF[msl3]', min_samples_leaf=3)
rf_msl_4 = _forest('RF[msl4]', min_samples_leaf=4)
rf_msl_5 = _forest('RF[msl5]', min_samples_leaf=5)
# Number of features considered at each split.
# NOTE(review): depending on the scikit-learn version, max_features=None may
# coincide with the regressor default, making RF[mfAll] equal to RF[100] - confirm.
rf_mf_log = _forest('RF[mfLog2]', max_features="log2")
rf_mf_all = _forest('RF[mfAll]', max_features=None)
######################################## END OF MODELS ######################################

######################################### PARAMETERS ########################################
# Bootstrap rates to evaluate (for a quick smoke run, shrink this to [0.2]).
max_samples = [0.2, 0.6, 0.8, 1, 1.2, 2, 3, 4, 5]
# All (label, estimator) pairs evaluated for every bootstrap rate.
forests = [
    rf_100, rf_200, rf_500,
    rf_md_10, rf_md_15, rf_md_20, rf_md_25,
    rf_mss_3, rf_mss_4, rf_mss_6, rf_mss_8,
    rf_msl_2, rf_msl_3, rf_msl_4, rf_msl_5,
    rf_mf_log, rf_mf_all,
]

##################################### END OF PARAMETERS #####################################

# Duplicate the training data so that bootstrap rates > 1 become possible.
def make_X_train(X_tra, y_tra, max_sample):
    """Replicate the training set so it can supply ``max_sample`` times its size.

    Parameters
    ----------
    X_tra : array-like, 2-D
        Training features.
    y_tra : array-like, 1-D
        Training targets.
    max_sample : int or float
        Requested bootstrap rate (sample size relative to the training set).

    Returns
    -------
    tuple
        ``(X, y)``. For ``max_sample <= 1`` the inputs are returned unchanged
        (same objects, no copy). Otherwise both are tiled
        ``ceil(max_sample)`` times along the first axis.
    """
    if max_sample <= 1:
        return X_tra, y_tra

    # np.tile needs an integer repetition count; rounding up guarantees the
    # replicated set is at least as large as the requested sample size, and
    # also makes non-integer rates >= 2 (e.g. 2.5) work instead of raising.
    n_copies = int(np.ceil(max_sample))
    return np.tile(X_tra, (n_copies, 1)), np.tile(y_tra, n_copies)

In [3]:
# Collect the datasets from a folder as done here, or fill `dataset_paths`
# manually with the paths to the datasets.

path_to_datasets = '../test_data' #'path/to/folder/with/datasets'

# os.listdir returns entries in arbitrary order and also lists sub-directories
# (e.g. Jupyter's .ipynb_checkpoints), so sort for a reproducible run order and
# keep regular files only.
dataset_paths = []
for filename in sorted(os.listdir(path_to_datasets)):
    full_path = os.path.join(path_to_datasets, filename)
    if os.path.isfile(full_path):
        dataset_paths.append(full_path)

In [4]:
def process_fold(train_index, test_index, X, y, ms, rfr):
    """Fit one forest on a single CV fold and return its test MSE.

    Parameters
    ----------
    train_index, test_index : array-like of int
        Row indices of the training and test parts of the fold.
    X, y : numpy arrays
        Full feature matrix and target vector; indexed with the fold indices.
    ms : int or float
        Bootstrap rate: number of samples drawn per tree, relative to the
        size of the training fold.
    rfr : tuple
        ``(label, estimator)`` pair; only the estimator (``rfr[1]``) is used
        and it is cloned, so the passed object is never fitted.

    Returns
    -------
    float
        Mean squared error of the fitted clone on the test part of the fold.
    """
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    X_tr, y_tr = make_X_train(X_train, y_train, ms)

    # Derive the per-tree sample fraction from how many times the training
    # fold was actually replicated, instead of special-casing ms == 1.2:
    # drawing (ms / n_copies) of the replicated data equals drawing ms times
    # the original fold. A fraction of 1 means "use all rows" (max_samples=None).
    n_train = len(X_train)
    n_copies = max(1, len(X_tr) // n_train) if n_train else 1
    fraction = ms / n_copies

    rfr_fold = clone(rfr[1])
    rfr_fold.set_params(max_samples=None if fraction >= 1 else fraction)
    rfr_fold.fit(X_tr, y_tr)

    y_hat = rfr_fold.predict(X_test)
    # Return a plain Python float so that the scores serialize the same way
    # regardless of how the installed NumPy prints its scalar types.
    return float(mean_squared_error(y_test, y_hat))

In [None]:
# Column-wise accumulator for the result table: one entry per
# (dataset, bootstrap rate, model) combination.
results = {'dataset' : [],
           'bootstrap_rate': [],
           'rf' : [],
           'cv_mse_scores' : [],
           'mean_mse' : [],
           'std_mse' : []}

scaler = StandardScaler()
# 2 splits x 200 repeats = 400 folds per configuration; the fixed seed makes
# every configuration see the same sequence of splits.
cvsplit = RepeatedKFold(n_splits=2, n_repeats=200, random_state=42)

for dataset_path in dataset_paths:
    df = pd.read_csv(dataset_path)

    # The last column is the target, everything before it is a feature.
    X = df.iloc[:, :-1].to_numpy()
    y = df.iloc[:, -1].to_numpy()

    # NOTE(review): the scaler is fitted on the whole dataset before the CV
    # split, i.e. test rows contribute to the scaling statistics - confirm
    # this is acceptable for the experiment.
    X = scaler.fit_transform(X)

    for ms in max_samples:
        for rfr in forests:
            start_time = time.time()
            print(f'Processing {dataset_path} with bootstrap rate {ms} and model {rfr[0]}  ...')

            fold_scores = Parallel(n_jobs=-1)(
                delayed(process_fold)(train_index, test_index, X, y, ms, rfr)
                for train_index, test_index in cvsplit.split(X, y)
            )
            # Store plain Python floats: the list is later written to CSV as
            # text and parsed back with ast.literal_eval, which cannot read
            # NumPy scalar reprs such as "np.float64(4.67)".
            cv_mse_scores = [float(score) for score in fold_scores]

            end_time = time.time()
            elapsed_time = end_time - start_time
            print(f'Completed {rfr[0]} with bootstrap rate {ms}. Time taken: {elapsed_time:.2f} seconds.')

            results['dataset'].append(dataset_path)
            results['bootstrap_rate'].append(ms)
            results['rf'].append(rfr[0])
            results['cv_mse_scores'].append(cv_mse_scores)
            # Mean and std are stored as strings formatted to three decimals.
            results['mean_mse'].append(f'{np.mean(cv_mse_scores):.3f}')
            results['std_mse'].append(f'{np.std(cv_mse_scores):.3f}')

Processing ../test_data\airfoil_self_noise_processed.csv with bootstrap rate 0.2 and model RF[100]  ...
Completed RF[100] with bootstrap rate 0.2. Time taken: 17.14 seconds.
Processing ../test_data\airfoil_self_noise_processed.csv with bootstrap rate 0.2 and model RF[200]  ...
Completed RF[200] with bootstrap rate 0.2. Time taken: 26.58 seconds.
Processing ../test_data\airfoil_self_noise_processed.csv with bootstrap rate 0.2 and model RF[500]  ...
Completed RF[500] with bootstrap rate 0.2. Time taken: 69.79 seconds.
Processing ../test_data\airfoil_self_noise_processed.csv with bootstrap rate 0.2 and model RF[md10]  ...
Completed RF[md10] with bootstrap rate 0.2. Time taken: 13.91 seconds.
Processing ../test_data\airfoil_self_noise_processed.csv with bootstrap rate 0.2 and model RF[md15]  ...
Completed RF[md15] with bootstrap rate 0.2. Time taken: 14.41 seconds.
Processing ../test_data\airfoil_self_noise_processed.csv with bootstrap rate 0.2 and model RF[md20]  ...
Completed RF[md20] wi

In [None]:
# Persist the collected results; create the output folder first so that a
# long experiment run is not followed by a failed write.
results_path = '../results/wyniki_bb_test.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)

wyniki = pd.DataFrame(results)
wyniki.to_csv(results_path, index=False)

In [30]:
# Reload the saved results table from disk and display it.
res = pd.read_csv('../results/wyniki_bb_test.csv')
res

Unnamed: 0,dataset,bootstrap_rate,rf,cv_mse_scores,mean_mse,std_mse
0,../test_data\Abalone_1_preprocessed.csv,0.2,RF[100],"[4.670553470560077, 4.685861925287356, 4.42996...",4.628,0.186
1,../test_data\Abalone_1_preprocessed.csv,0.2,RF[200],"[4.670646110579225, 4.612593390804598, 4.40546...",4.606,0.186
2,../test_data\Abalone_1_preprocessed.csv,0.2,RF[500],"[4.675499676400192, 4.5899594367816094, 4.3954...",4.593,0.185
3,../test_data\Abalone_1_preprocessed.csv,0.2,RF[md10],"[4.666371538623957, 4.648686912704007, 4.42591...",4.627,0.187
4,../test_data\Abalone_1_preprocessed.csv,0.2,RF[md15],"[4.67435557548462, 4.670776546694385, 4.421879...",4.628,0.185
5,../test_data\Abalone_1_preprocessed.csv,0.2,RF[md20],"[4.6696135475706075, 4.690205888037463, 4.4299...",4.628,0.186
6,../test_data\Abalone_1_preprocessed.csv,0.2,RF[md25],"[4.670553470560077, 4.685861925287356, 4.42996...",4.628,0.186
7,../test_data\Abalone_1_preprocessed.csv,0.2,RF[mss3],"[4.695707484442316, 4.667043690120796, 4.41567...",4.626,0.187
8,../test_data\Abalone_1_preprocessed.csv,0.2,RF[mss4],"[4.6774232591107285, 4.675830004843572, 4.4297...",4.625,0.188
9,../test_data\Abalone_1_preprocessed.csv,0.2,RF[mss6],"[4.6773142893504795, 4.664614676142312, 4.4203...",4.623,0.187


In [31]:


# Sanity check: take the first row's 'cv_mse_scores' value, parse it with
# ast.literal_eval into a Python object, and show how many scores it holds.
string = res['cv_mse_scores'].iloc[0]
lista = ast.literal_eval(string)

len(lista)


400