UWAGA: poniżej znajduje się również kod do testów na CPU. Do testów lokalnych warto utworzyć tymczasowy katalog z jednym zbiorem danych, na podstawie którego będziemy porównywać czasy działania.

In [1]:
# Directory that holds the dataset CSV files; every file found in it is processed.
# For quick local timing tests, point this at a directory with a single dataset, e.g.:
# path_to_datasets = './test_data/small/'
path_to_datasets = '../test_data/'

In [2]:
import cupy as cp

# Report whether CuPy can see a CUDA device and, if so, which one.
gpu_available = cp.cuda.is_available()
print("Czy GPU działa?", gpu_available)

if gpu_available:
    gpu_name = cp.cuda.runtime.getDeviceProperties(0)['name']
    # The runtime returns the device name as bytes; decode it so the output
    # reads "NVIDIA ..." instead of the bytes repr "b'NVIDIA ...'".
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print("GPU:", gpu_name)
else:
    # Querying device 0 without a usable GPU raises, which would stop a
    # CPU-only run of the notebook right here.
    print("GPU:", None)

Czy GPU działa? True
GPU: b'NVIDIA GeForce RTX 4050 Laptop GPU'


# Training on CPU

In [3]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
import os
from sklearn.metrics import mean_squared_error
import time
from datetime import timedelta


In [4]:
# Random-forest configurations under comparison.
# Each entry is a (label, estimator) pair. All estimators share the same seed
# and n_jobs setting; each one overrides exactly one other hyperparameter.
_rf_shared = dict(random_state=123, n_jobs=-1)

# Number of trees.
rf_100 = ('RF', RandomForestRegressor(n_estimators=100, **_rf_shared))
rf_200 = ('RF[200]', RandomForestRegressor(n_estimators=200, **_rf_shared))
rf_500 = ('RF[500]', RandomForestRegressor(n_estimators=500, **_rf_shared))

# Maximum tree depth.
rf_md_10 = ('RF[md10]', RandomForestRegressor(max_depth=10, **_rf_shared))
rf_md_15 = ('RF[md15]', RandomForestRegressor(max_depth=15, **_rf_shared))
rf_md_20 = ('RF[md20]', RandomForestRegressor(max_depth=20, **_rf_shared))
rf_md_25 = ('RF[md25]', RandomForestRegressor(max_depth=25, **_rf_shared))

# Minimum number of samples required to split a node.
rf_mss_3 = ('RF[mss3]', RandomForestRegressor(min_samples_split=3, **_rf_shared))
rf_mss_4 = ('RF[mss4]', RandomForestRegressor(min_samples_split=4, **_rf_shared))
rf_mss_6 = ('RF[mss6]', RandomForestRegressor(min_samples_split=6, **_rf_shared))
rf_mss_8 = ('RF[mss8]', RandomForestRegressor(min_samples_split=8, **_rf_shared))

# Minimum number of samples required in a leaf.
rf_msl_2 = ('RF[msl2]', RandomForestRegressor(min_samples_leaf=2, **_rf_shared))
rf_msl_3 = ('RF[msl3]', RandomForestRegressor(min_samples_leaf=3, **_rf_shared))
rf_msl_4 = ('RF[msl4]', RandomForestRegressor(min_samples_leaf=4, **_rf_shared))
rf_msl_5 = ('RF[msl5]', RandomForestRegressor(min_samples_leaf=5, **_rf_shared))

# Number of features considered at each split.
rf_mf_log = ('RF[mfLog2]', RandomForestRegressor(max_features="log2", **_rf_shared))
rf_mf_all = ('RF[mfAll]', RandomForestRegressor(max_features=None, **_rf_shared))

In [5]:
# Bootstrap rates to evaluate (reported as "bootstrap_rate" in the results).
max_samples = [5]

# Forest configurations to evaluate, grouped by the hyperparameter they vary.
forests = [
    rf_100, rf_200, rf_500,                    # number of trees
    rf_md_10, rf_md_15, rf_md_20, rf_md_25,    # max_depth
    rf_mss_3, rf_mss_4, rf_mss_6, rf_mss_8,    # min_samples_split
    rf_msl_2, rf_msl_3, rf_msl_4, rf_msl_5,    # min_samples_leaf
    rf_mf_log, rf_mf_all,                      # max_features
]

In [6]:
def make_X_train(X_tra, y_tra, max_sample):
    """Replicate the training data so a bootstrap rate above 1 can be reached.

    The training set is stacked ``ceil(max_sample)`` times. For a fractional
    rate the caller is expected to scale the draw back down through the
    estimator's ``max_samples`` (e.g. rate 1.2 -> 2 copies, max_samples=0.6).

    Parameters
    ----------
    X_tra : array-like
        Training features; tiled along the first axis (assumed 2-D,
        samples x features).
    y_tra : array-like
        Training targets; tiled along its only axis (assumed 1-D).
    max_sample : int or float
        Requested bootstrap rate. Values <= 1 need no replication.

    Returns
    -------
    tuple
        ``(X, y)``: the inputs themselves when ``max_sample <= 1``,
        otherwise new arrays holding ``ceil(max_sample)`` stacked copies.
    """
    if max_sample <= 1:
        return X_tra, y_tra

    # np.tile only accepts integer repetition counts, so round the rate up.
    # This covers integer rates, rates in (1, 2) (-> 2 copies) and non-integer
    # rates >= 2, which previously raised a TypeError inside np.tile.
    n_copies = int(np.ceil(max_sample))
    return np.tile(X_tra, (n_copies, 1)), np.tile(y_tra, n_copies)

In [7]:
# Collect the dataset files to evaluate.
# os.listdir returns entries in arbitrary order and also lists sub-directories
# and unrelated files, so keep only CSV files and sort them: this makes
# dataset_paths[0] (previewed below) and the processing order deterministic.
dataset_paths = sorted(
    os.path.join(path_to_datasets, filename)
    for filename in os.listdir(path_to_datasets)
    if filename.lower().endswith('.csv')
    and os.path.isfile(os.path.join(path_to_datasets, filename))
)
if not dataset_paths:
    raise FileNotFoundError(f"No CSV datasets found in {path_to_datasets!r}")

# Column-oriented accumulator: one entry per (dataset, bootstrap rate, forest).
results = {'dataset' : [],
           'bootstrap_rate': [],
           'rf' : [],
           'cv_mse_scores' : [],
           'mean_mse' : [],
           'std_mse' : []}

standard_scaler = StandardScaler()
cvsplit = RepeatedKFold(n_splits=2, n_repeats=200, random_state=42)

# Totals used only for progress messages.
total_datasets = len(dataset_paths)
total_ms = len(max_samples)
total_forests = len(forests)
# Ask the splitter for its fold count (n_splits * n_repeats) instead of
# repeating the numbers by hand, so the two cannot drift apart.
total_folds = cvsplit.get_n_splits()

# Preview the first dataset (bare last expression -> rendered as cell output).
pd.read_csv(dataset_paths[0]).head()

Unnamed: 0,Length,Diameter,Height,Whole_weight,Shucked_weight,Viscera_weight,Shell_weight,Sex_F,Sex_I,Sex_M,Rings
0,-0.574558,-0.432149,-1.064424,-0.641898,-0.607685,-0.726212,-0.638217,-0.674834,-0.688018,1.316677,15
1,-1.448986,-1.439929,-1.183978,-1.230277,-1.17091,-1.205221,-1.212987,-0.674834,-0.688018,1.316677,7
2,0.050033,0.12213,-0.107991,-0.309469,-0.4635,-0.35669,-0.207139,1.481846,-0.688018,-0.759488,9
3,-0.699476,-0.432149,-0.347099,-0.637819,-0.648238,-0.6076,-0.602294,-0.674834,-0.688018,1.316677,10
4,-1.615544,-1.540707,-1.423087,-1.272086,-1.215968,-1.287337,-1.320757,-0.674834,1.453451,-0.759488,7


In [8]:
# Take the first collected dataset path and show its file name
# (bare last expression -> rendered as the cell output).
dataset_path = dataset_paths[0]
os.path.basename(dataset_path)

'Abalone_1_preprocessed.csv'

In [9]:
# Cross-validated benchmark: for every dataset, bootstrap rate and forest
# configuration, fit on each CV fold, record the test MSE, and finally write
# all collected scores to 'wyniki_bb.csv'.
start_time = time.time()

print(f"Starting training with {total_datasets} datasets, {total_ms} bootstrap rates, {total_forests} forest configurations, and {total_folds} CV folds")

# Start from empty result lists so that re-running this cell (e.g. after an
# interrupted run) does not append duplicate rows to an earlier run's results.
results = {key: [] for key in results}

for dataset_idx, dataset_path in enumerate(dataset_paths, 1):
    dataset_name = os.path.basename(dataset_path)
    print(f"\nProcessing dataset {dataset_idx}/{total_datasets}: {dataset_name}")

    df = pd.read_csv(dataset_path)
    # A CSV written together with its index comes back with an "Unnamed: N"
    # column. Drop such columns so a row counter is never used as a feature.
    # (index_col=False would NOT do this; it only stops pandas from using the
    # first column as the index.)
    df = df[[col for col in df.columns if not str(col).startswith('Unnamed:')]]

    # Convention: the last column is the target, everything before it is a feature.
    X = df.iloc[:, :-1].to_numpy()
    y = df.iloc[:, -1].to_numpy()

    X = standard_scaler.fit_transform(X)  # Purpose unclear, but the reference paper does this

    for ms_idx, ms in enumerate(max_samples, 1):
        print(f"  Bootstrap rate {ms_idx}/{total_ms}: {ms}")

        # Translate the bootstrap rate into the forest's max_samples setting.
        # make_X_train replicates the training set for rates above 1, so:
        #   rate < 1           -> draw that fraction of the (unreplicated) data
        #   integer rate       -> draw as many samples as the replicated data has (None)
        #   fractional rate >1 -> draw rate / ceil(rate) of the replicated data,
        #                         e.g. 1.2 -> 0.6 of 2 copies, 1.5 -> 0.75 of 2 copies
        if ms < 1:
            forest_max_samples = ms
        else:
            n_copies = int(np.ceil(ms))
            forest_max_samples = None if n_copies == ms else ms / n_copies

        for forest_idx, rfr in enumerate(forests, 1):
            forest_name, forest = rfr
            forest_start_time = time.time()
            print(f"    Forest model {forest_idx}/{total_forests}: {forest_name}")
            cv_mse_scores = []

            # Same for every fold, so set it once per forest.
            forest.max_samples = forest_max_samples

            for fold_idx, (train_index, test_index) in enumerate(cvsplit.split(X, y), 1):
                if fold_idx % 10 == 0:  # Print only every 10 folds to avoid too much output
                    print(f"      Processing fold {fold_idx}/{total_folds}")

                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                X_tr, y_tr = make_X_train(X_train, y_train, ms)

                forest.fit(X_tr, y_tr)

                y_hat = forest.predict(X_test)
                mse = mean_squared_error(y_test, y_hat)
                cv_mse_scores.append(mse)

            # End time for training; taken from the same clock as the start
            # times so that the difference is meaningful.
            end_time = time.time()

            forest_time = end_time - forest_start_time
            mean_mse = np.mean(cv_mse_scores)
            print(f"    Completed {forest_name} with bootstrap rate {ms}. Mean MSE: {mean_mse:.3f}")
            print(f"    Time taken: {timedelta(seconds=int(forest_time))}")

            results['dataset'].append(dataset_path)
            results['bootstrap_rate'].append(ms)
            results['rf'].append(forest_name)
            results['cv_mse_scores'].append(cv_mse_scores)
            results['mean_mse'].append(f'{mean_mse:.3f}')
            results['std_mse'].append(f'{np.std(cv_mse_scores):.3f}')

print("\nTraining complete. Saving results...")
wyniki = pd.DataFrame(results)
wyniki.to_csv('wyniki_bb.csv', index=False)
print("Results saved to 'wyniki_bb.csv'")


Starting training with 1 datasets, 1 bootstrap rates, 17 forest configurations, and 400 CV folds

Processing dataset 1/1: Abalone_1_preprocessed.csv
  Bootstrap rate 1/1: 5
    Forest model 1/17: RF
      Processing fold 10/400
      Processing fold 20/400
      Processing fold 30/400
      Processing fold 40/400
      Processing fold 50/400
      Processing fold 60/400
      Processing fold 70/400
      Processing fold 80/400
      Processing fold 90/400
      Processing fold 100/400
      Processing fold 110/400
      Processing fold 120/400
      Processing fold 130/400
      Processing fold 140/400
      Processing fold 150/400
      Processing fold 160/400
      Processing fold 170/400
      Processing fold 180/400
      Processing fold 190/400
      Processing fold 200/400


KeyboardInterrupt: 