In [78]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
import os
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed
from sklearn.base import clone
import ast
import time

In [79]:
# Same models as in the original experiment; only the split-criterion variant
# was dropped, because for regression the criterion is simply MSE.
n_jobs = 1


def _forest(label, **overrides):
    """Return a ``(label, estimator)`` pair for one random-forest configuration.

    Every forest shares ``random_state=123`` and the notebook-level ``n_jobs``;
    ``overrides`` carries the single hyper-parameter that distinguishes it.
    """
    return (label, RandomForestRegressor(random_state=123, n_jobs=n_jobs, **overrides))


########################################### MODELS ##########################################
# Number of trees.
rf_100 = _forest('RF[100]', n_estimators=100)
rf_200 = _forest('RF[200]', n_estimators=200)
rf_500 = _forest('RF[500]', n_estimators=500)
# Maximum tree depth.
rf_md_10 = _forest('RF[md10]', max_depth=10)
rf_md_15 = _forest('RF[md15]', max_depth=15)
rf_md_20 = _forest('RF[md20]', max_depth=20)
rf_md_25 = _forest('RF[md25]', max_depth=25)
# Minimum number of samples required to split a node.
rf_mss_3 = _forest('RF[mss3]', min_samples_split=3)
rf_mss_4 = _forest('RF[mss4]', min_samples_split=4)
rf_mss_6 = _forest('RF[mss6]', min_samples_split=6)
rf_mss_8 = _forest('RF[mss8]', min_samples_split=8)
# Minimum number of samples required in a leaf.
rf_msl_2 = _forest('RF[msl2]', min_samples_leaf=2)
rf_msl_3 = _forest('RF[msl3]', min_samples_leaf=3)
rf_msl_4 = _forest('RF[msl4]', min_samples_leaf=4)
rf_msl_5 = _forest('RF[msl5]', min_samples_leaf=5)
# Number of features considered at each split.
rf_mf_log = _forest('RF[mfLog2]', max_features="log2")
rf_mf_all = _forest('RF[mfAll]', max_features=None)
######################################## END OF MODELS ######################################

######################################### PARAMETERS ########################################
# Bootstrap rates, expressed relative to the size of the training fold.
max_samples = [0.2, 0.6, 0.8, 1, 1.2, 2, 3, 4, 5]

# Every model configuration that is evaluated for each bootstrap rate.
forests = [
    rf_100, rf_200, rf_500,
    rf_md_10, rf_md_15, rf_md_20, rf_md_25,
    rf_mss_3, rf_mss_4, rf_mss_6, rf_mss_8,
    rf_msl_2, rf_msl_3, rf_msl_4, rf_msl_5,
    rf_mf_log, rf_mf_all,
]

# For a quick smoke test, shrink the grid, e.g.
# max_samples = [0.2] and forests = [rf_100, rf_200].
##################################### END OF PARAMETERS #####################################

# The data is duplicated so that a bootstrap rate above 1 becomes possible.
def make_X_train(X_tra, y_tra, max_sample):
    """Stack copies of the training data so a bootstrap rate above 1 can be drawn.

    Parameters
    ----------
    X_tra : array-like
        Training features; copies are stacked along the first axis.
    y_tra : array-like
        Training targets; repeated the same number of times as ``X_tra``.
    max_sample : int or float
        Requested bootstrap rate relative to the size of the training data.

    Returns
    -------
    tuple
        ``(X, y)``. The inputs are returned unchanged when ``max_sample <= 1``.
        For ``1 < max_sample < 2`` two stacked copies are returned, otherwise
        ``max_sample`` stacked copies.

    Raises
    ------
    ValueError
        If ``max_sample >= 2`` is not a whole number; the data can only be
        stacked a whole number of times.
    """
    if max_sample <= 1:
        return X_tra, y_tra

    if max_sample < 2:
        n_copies = 2
    else:
        # Accept whole-number floats such as 2.0 (np.tile itself rejects float
        # repetition counts), but refuse rates that cannot be met by stacking.
        n_copies = int(max_sample)
        if n_copies != max_sample:
            raise ValueError(
                f'bootstrap rate {max_sample!r} is not supported: '
                'rates >= 2 must be whole numbers'
            )

    return np.tile(X_tra, (n_copies, 1)), np.tile(y_tra, n_copies)

In [80]:
# Collect the dataset files from a folder as done here, or enter the dataset
# paths into `dataset_paths` by hand.

path_to_datasets = '../test_data' #'path/to/folder/with/datasets'

# os.listdir returns entries in arbitrary order and also lists sub-directories
# and hidden entries (e.g. Jupyter's .ipynb_checkpoints), which are not
# datasets. Keep regular, non-hidden files only, in sorted order, so the set
# and order of processed datasets is the same on every run.
dataset_paths = []
for filename in sorted(os.listdir(path_to_datasets)):
    full_path = os.path.join(path_to_datasets, filename)
    if filename.startswith('.') or not os.path.isfile(full_path):
        continue
    dataset_paths.append(full_path)

In [81]:
def process_fold(train_index, test_index, X, y, ms, rfr):
    """Fit a fresh copy of one forest on a CV fold and return its test MSE.

    Parameters
    ----------
    train_index, test_index : index arrays
        Row selectors applied to ``X`` and ``y`` to build the fold.
    X, y : indexable arrays
        Full feature matrix and target vector.
    ms : int or float
        Bootstrap rate relative to the size of the training fold.
    rfr : tuple
        ``(label, estimator)`` pair; only the estimator is used, and it is
        cloned so the shared template is never fitted.

    Returns
    -------
    The mean squared error of the fitted forest on the test part of the fold.
    """
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    X_tr, y_tr = make_X_train(X_train, y_train, ms)

    # The forest's ``max_samples`` is a fraction of the data it is fitted on,
    # which may be several stacked copies of the fold. Convert the requested
    # rate into a fraction of that stacked data instead of special-casing a
    # single rate: 1.2 on two copies -> 0.6, 1.5 on two copies -> 0.75.
    n_copies = max(len(X_tr) // max(len(X_train), 1), 1)
    fraction = ms / n_copies

    rfr_fold = clone(rfr[1])
    # A fraction of 1 (or more) means "draw as many samples as there are
    # rows", which is what ``max_samples=None`` requests.
    rfr_fold.max_samples = fraction if fraction < 1 else None
    rfr_fold.fit(X_tr, y_tr)

    y_hat = rfr_fold.predict(X_test)
    return mean_squared_error(y_test, y_hat)

In [82]:
# One entry per (dataset, bootstrap rate, model) combination.
results = {'dataset' : [],
           'bootstrap_rate': [],
           'rf' : [],
           'cv_mse_scores' : [],
           'mean_mse' : [],
           'std_mse' : []}

scaler = StandardScaler()
# Fixed random_state: every (rate, model) combination is scored on the same splits.
cvsplit = RepeatedKFold(n_splits=2, n_repeats=50, random_state=42)

for dataset_path in dataset_paths:
    df = pd.read_csv(dataset_path)

    # The last column is the target; every column before it is a feature.
    X = df.iloc[:, :-1].to_numpy()
    y = df.iloc[:, -1].to_numpy()

    # NOTE(review): the scaler is fitted on the full dataset before the CV
    # split, so test-fold statistics influence the scaling - confirm this is
    # acceptable for the experiment.
    X = scaler.fit_transform(X)

    for ms in max_samples:
        for rfr in forests:
            start_time = time.perf_counter()
            print(f'Processing {dataset_path} with bootstrap rate {ms} and model {rfr[0]}  ...')

            fold_scores = Parallel(n_jobs=-1)(
                delayed(process_fold)(train_index, test_index, X, y, ms, rfr)
                for train_index, test_index in cvsplit.split(X, y)
            )

            end_time = time.perf_counter()
            elapsed_time = end_time - start_time

            # Store plain Python floats: depending on library versions the
            # fold scores may be NumPy scalars, whose repr ("np.float64(...)"
            # under NumPy >= 2) cannot be parsed back with ast.literal_eval
            # once the list has been written to CSV.
            cv_mse_scores = [float(score) for score in fold_scores]
            mean_mse = float(np.mean(cv_mse_scores))
            std_mse = float(np.std(cv_mse_scores))

            print(f'Completed {rfr[0]} with bootstrap rate {ms}. Time taken: {elapsed_time:.2f} seconds.')
            print(f'{mean_mse:.3f}')

            results['dataset'].append(dataset_path)
            results['bootstrap_rate'].append(ms)
            results['rf'].append(rfr[0])
            results['cv_mse_scores'].append(cv_mse_scores)
            # Keep the summaries numeric (rounded to 3 decimals) rather than
            # formatted strings, so the in-memory table can be aggregated.
            results['mean_mse'].append(round(mean_mse, 3))
            results['std_mse'].append(round(std_mse, 3))

Processing ../test_data\Appliances_Energy_Prediction_374_preprocessed_cut.csv with bootstrap rate 0.2 and model RF[100]  ...
Completed RF[100] with bootstrap rate 0.2. Time taken: 3.75 seconds.
11386.393
Processing ../test_data\Appliances_Energy_Prediction_374_preprocessed_cut.csv with bootstrap rate 0.2 and model RF[200]  ...
Completed RF[200] with bootstrap rate 0.2. Time taken: 7.47 seconds.
11322.580
Processing ../test_data\Appliances_Energy_Prediction_374_preprocessed_cut.csv with bootstrap rate 0.2 and model RF[500]  ...
Completed RF[500] with bootstrap rate 0.2. Time taken: 18.67 seconds.
11293.656
Processing ../test_data\Appliances_Energy_Prediction_374_preprocessed_cut.csv with bootstrap rate 0.2 and model RF[md10]  ...
Completed RF[md10] with bootstrap rate 0.2. Time taken: 3.02 seconds.
11446.892
Processing ../test_data\Appliances_Energy_Prediction_374_preprocessed_cut.csv with bootstrap rate 0.2 and model RF[md15]  ...
Completed RF[md15] with bootstrap rate 0.2. Time taken:

In [83]:
# Write the collected results to CSV. The target folder is created first so a
# missing directory cannot make the write fail once the results are computed.
results_path = '../results/wyniki_bb/Appliances_Energy_Prediction_374.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)

wyniki = pd.DataFrame(results)
wyniki.to_csv(results_path, index=False)

In [84]:
# Load the saved results back from disk and display the table.
saved_results_csv = '../results/wyniki_bb/Appliances_Energy_Prediction_374.csv'
res = pd.read_csv(saved_results_csv)
res

Unnamed: 0,dataset,bootstrap_rate,rf,cv_mse_scores,mean_mse,std_mse
0,../test_data\Appliances_Energy_Prediction_374_...,0.2,RF[100],"[11061.01901722391, 11174.884452332657, 12440....",11386.393,1031.244
1,../test_data\Appliances_Energy_Prediction_374_...,0.2,RF[200],"[10896.59763171226, 11135.153554766734, 12282....",11322.580,1042.512
2,../test_data\Appliances_Energy_Prediction_374_...,0.2,RF[500],"[10922.509849240121, 11237.92927586207, 12411....",11293.656,1064.387
3,../test_data\Appliances_Energy_Prediction_374_...,0.2,RF[md10],"[11124.791883736378, 11218.049757299064, 12533...",11446.892,1036.409
4,../test_data\Appliances_Energy_Prediction_374_...,0.2,RF[md15],"[10979.63511166687, 11123.55403563114, 12414.9...",11398.485,1028.735
...,...,...,...,...,...,...
148,../test_data\Appliances_Energy_Prediction_374_...,5.0,RF[msl3],"[8117.124917473787, 8385.782298507154, 10567.5...",9455.963,1164.750
149,../test_data\Appliances_Energy_Prediction_374_...,5.0,RF[msl4],"[8020.845787837164, 8493.637853974395, 10408.1...",9311.269,1114.902
150,../test_data\Appliances_Energy_Prediction_374_...,5.0,RF[msl5],"[8199.893345575632, 8424.98547580796, 10227.13...",9295.481,1089.217
151,../test_data\Appliances_Energy_Prediction_374_...,5.0,RF[mfLog2],"[7062.1424721377925, 7356.281866125761, 8742.5...",7514.874,1080.156


In [85]:


# Sanity check: the score list of the first row is stored as text in the CSV;
# parse it back into a Python list and show how many scores it holds.
string = res.iloc[0]['cv_mse_scores']
lista = ast.literal_eval(string)

len(lista)


100