UWAGA – poniżej znajduje się również kod do testów na CPU. Do lokalnych testów warto przygotować tymczasowy katalog z jednym zbiorem danych, na którym będziemy porównywać czas działania.

In [10]:
# Directory that the training cells scan for dataset CSV files.
# For a quick local timing comparison, point it at a folder that holds a
# single dataset instead, e.g.:
#     path_to_datasets = './test_data/small/'
path_to_datasets = "../test_data/"

In [11]:
import cupy as cp

# Report whether CuPy can see a CUDA device. The device properties are only
# queried when a device is available, so the cell does not fail on CPU-only hosts.
gpu_available = cp.cuda.is_available()
print("Czy GPU działa?", gpu_available)
if gpu_available:
    gpu_name = cp.cuda.runtime.getDeviceProperties(0)['name']
    # The name is returned as bytes; decode it so it is not printed as b'...'.
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print("GPU:", gpu_name)

Czy GPU działa? True
GPU: b'NVIDIA GeForce RTX 4050 Laptop GPU'


# Training on gpu

UWAGA – trening na GPU jest możliwy tylko wtedy, gdy macie kartę zgodną z [CUDA](https://developer.nvidia.com/cuda-gpus). Następnie trzeba zainstalować biblioteki odpowiadające zainstalowanej wersji CUDA, np. dla CUDA 11.2:

```bash
pip install cupy-cuda112
pip install cuml-cu11
```
Teoretycznie działa to na Google Colab; lokalnie nie zostało to sprawdzone – w razie problemów warto poprosić o pomoc model językowy.

In [12]:
import os
import pandas as pd
import numpy as np
import cupy as cp
from cuml.preprocessing import StandardScaler
from cuml.ensemble import RandomForestRegressor
from cuml.model_selection import train_test_split
from cuml.metrics import mean_squared_error
import time
from datetime import timedelta

In [13]:
# cuML forest configurations as (label, estimator) pairs. Every estimator uses
# the same seed; each entry changes exactly one hyper-parameter relative to
# the cuML defaults.

# Number of trees.
rf_100 =    ('RF', RandomForestRegressor(random_state=123, n_estimators=100))
rf_200 =    ('RF[200]', RandomForestRegressor(random_state=123, n_estimators=200))
rf_500 =    ('RF[500]', RandomForestRegressor(random_state=123, n_estimators=500))

# Maximum tree depth.
rf_md_10 =  ('RF[md10]', RandomForestRegressor(random_state=123, max_depth=10))
rf_md_15 =  ('RF[md15]', RandomForestRegressor(random_state=123, max_depth=15))
rf_md_20 =  ('RF[md20]', RandomForestRegressor(random_state=123, max_depth=20))
rf_md_25 =  ('RF[md25]', RandomForestRegressor(random_state=123, max_depth=25))

# Minimum number of samples required to split a node.
rf_mss_3 =  ('RF[mss3]', RandomForestRegressor(random_state=123, min_samples_split=3))
rf_mss_4 =  ('RF[mss4]', RandomForestRegressor(random_state=123, min_samples_split=4))
rf_mss_6 =  ('RF[mss6]', RandomForestRegressor(random_state=123, min_samples_split=6))
rf_mss_8 =  ('RF[mss8]', RandomForestRegressor(random_state=123, min_samples_split=8))

# Minimum number of samples per leaf.
rf_msl_2 =  ('RF[msl2]', RandomForestRegressor(random_state=123, min_samples_leaf=2))
rf_msl_3 =  ('RF[msl3]', RandomForestRegressor(random_state=123, min_samples_leaf=3))
rf_msl_4 =  ('RF[msl4]', RandomForestRegressor(random_state=123, min_samples_leaf=4))
rf_msl_5 =  ('RF[msl5]', RandomForestRegressor(random_state=123, min_samples_leaf=5))

# Features considered per split. 'log2' lets cuML derive the fraction from the
# actual feature count of each dataset (the previous hard-coded 0.301 did not
# depend on the number of features) and matches the scikit-learn configuration
# used in the CPU section.
rf_mf_log = ('RF[mfLog2]', RandomForestRegressor(random_state=123, max_features='log2'))
rf_mf_all = ('RF[mfAll]', RandomForestRegressor(random_state=123, max_features=1.0))

  return func(**kwargs)


In [14]:
# Bootstrap rates to evaluate; values above 1 are realised by duplicating the
# training data (see make_X_train).
max_samples = [0.2, 0.6, 0.8, 1, 1.2, 2, 3, 4, 5]

# Every (label, estimator) pair evaluated for each bootstrap rate.
forests = [
    rf_100, rf_200, rf_500,
    rf_md_10, rf_md_15, rf_md_20, rf_md_25,
    rf_mss_3, rf_mss_4, rf_mss_6, rf_mss_8,
    rf_msl_2, rf_msl_3, rf_msl_4, rf_msl_5,
    rf_mf_log, rf_mf_all,
]

In [15]:
# Function to duplicate data for bootstrap rates > 1
def make_X_train(X_tra, y_tra, max_sample):
    """Stack copies of the training data for bootstrap rates above 1.

    Args:
        X_tra: Training feature array accepted by ``cp.concatenate``.
        y_tra: Training target array accepted by ``cp.concatenate``.
        max_sample: Requested bootstrap rate.

    Returns:
        ``(X, y)``. The inputs are returned untouched when ``max_sample <= 1``.
        Otherwise the data is concatenated with itself along axis 0: twice for
        rates strictly between 1 and 2, and ``int(max_sample)`` times for
        rates of 2 or more.
    """
    if max_sample <= 1:
        return X_tra, y_tra

    # Rates in (1, 2) still need one extra copy; larger rates use their integer part.
    copies = 2 if max_sample < 2 else int(max_sample)
    stacked_X = cp.concatenate([X_tra for _ in range(copies)], axis=0)
    stacked_y = cp.concatenate([y_tra for _ in range(copies)], axis=0)
    return stacked_X, stacked_y

In [17]:

# Collect the dataset files. Only regular *.csv files are kept (sub-folders and
# stray files would break pd.read_csv) and the list is sorted so the processing
# order does not depend on the file system.
dataset_paths = sorted(
    os.path.join(path_to_datasets, filename)
    for filename in os.listdir(path_to_datasets)
    if filename.lower().endswith('.csv')
    and os.path.isfile(os.path.join(path_to_datasets, filename))
)
if not dataset_paths:
    raise FileNotFoundError(f"No CSV datasets found in {path_to_datasets!r}")

# One row is appended per (dataset, bootstrap rate, forest) combination.
results = {'dataset': [],
           'bootstrap_rate': [],
           'rf': [],
           'cv_mse_scores': [],
           'mean_mse': [],
           'std_mse': []}

standard_scaler = StandardScaler()

n_splits = 2  # Original number of splits in KFold
n_repeats = 200
test_size = 1/n_splits  # 1/2 = 0.5 for test set in train_test_split

# Totals used only for progress messages.
total_datasets = len(dataset_paths)
total_ms = len(max_samples)
total_forests = len(forests)
total_folds = n_splits * n_repeats

# Preview the first dataset.
pd.read_csv(dataset_paths[0]).head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1
0,2.041777,-1.785875,-0.561951,-1.470077,1.0,-1.341641,-1.760447,-1.814575,15.55
1,2.041777,-1.785875,-0.561951,-1.470077,1.0,-0.447214,-1.760447,-1.814575,15.55
2,2.041777,-1.785875,-0.561951,-1.470077,1.0,0.447214,-1.760447,-1.814575,15.55
3,2.041777,-1.785875,-0.561951,-1.470077,1.0,1.341641,-1.760447,-1.814575,15.55
4,1.284979,-1.229239,0.0,-1.198678,1.0,-1.341641,-1.760447,-1.814575,20.84


In [18]:
# GPU experiment: for every dataset, bootstrap rate and forest configuration,
# fit on repeated random train/test splits and record the test MSE of each fit.
start_time = time.time()

print(f"Starting GPU-accelerated training with {total_datasets} datasets, {total_ms} bootstrap rates, {total_forests} forest configurations, and {total_folds} CV folds")

for dataset_idx, dataset_path in enumerate(dataset_paths, 1):
    dataset_name = os.path.basename(dataset_path)
    print(f"\nProcessing dataset {dataset_idx}/{total_datasets}: {dataset_name}")

    # Load data with pandas first
    df = pd.read_csv(dataset_path)

    # The CSV carries the saved row index as an "Unnamed: 0" column; it must not
    # be used as a feature. The last column is the regression target.
    feature_columns = [col for col in df.columns[:-1] if not str(col).startswith('Unnamed')]
    X_pd = df[feature_columns].values
    y_pd = df.iloc[:, -1].values

    # Convert to CuPy arrays for GPU
    X = cp.array(X_pd, dtype=cp.float32)
    y = cp.array(y_pd, dtype=cp.float32)

    # Standardize the features
    # NOTE(review): the scaler is fitted on the full dataset before splitting.
    X = standard_scaler.fit_transform(X)

    dataset_start_time = time.time()

    for ms_idx, ms in enumerate(max_samples, 1):
        print(f"  Bootstrap rate {ms_idx}/{total_ms}: {ms}")

        # Per-tree sampling ratio handed to cuML. cuML expects a float ratio
        # (not None), so "use every row" is expressed as 1.0. Rates above 1 are
        # realised by duplicating the data in make_X_train; 1.2 becomes 0.6 of
        # the doubled data.
        if ms == 1.2:
            max_samples_value = 0.6
        elif ms >= 1:
            max_samples_value = 1.0
        else:
            max_samples_value = ms

        for forest_idx, rfr in enumerate(forests, 1):
            forest_start_time = time.time()
            print(f"    Forest model {forest_idx}/{total_forests}: {rfr[0]}")
            cv_mse_scores = []

            # The estimator object is reused across bootstrap rates, so the
            # ratio has to be (re)set before fitting.
            rfr[1].max_samples = max_samples_value

            # Simulate RepeatedKFold by using train_test_split multiple times with different random states
            # NOTE(review): the splits inside one repeat are independent random
            # draws, not complementary folds, so this only approximates the
            # RepeatedKFold scheme used in the CPU section.
            for repeat in range(n_repeats):
                for split in range(n_splits):
                    fold_num = repeat * n_splits + split + 1

                    if fold_num % 10 == 0:  # Print only every 10 folds to avoid too much output
                        print(f"      Processing fold {fold_num}/{total_folds}")

                    # Use different random state for each split
                    random_state = repeat * 100 + split

                    # Split data into train and test sets
                    X_train, X_test, y_train, y_test = train_test_split(
                        X, y, test_size=test_size, random_state=random_state
                    )

                    # Duplicate data based on bootstrap rate
                    X_tr, y_tr = make_X_train(X_train, y_train, ms)

                    # Fit the model
                    rfr[1].fit(X_tr, y_tr)

                    # Predict and evaluate
                    y_hat = rfr[1].predict(X_test)
                    mse = mean_squared_error(y_test, y_hat)
                    cv_mse_scores.append(float(mse))  # Store as a plain Python float

            # Record results
            results['dataset'].append(dataset_path)
            results['bootstrap_rate'].append(ms)
            results['rf'].append(rfr[0])
            results['cv_mse_scores'].append(cv_mse_scores)
            results['mean_mse'].append(f'{np.mean(cv_mse_scores):.3f}')
            results['std_mse'].append(f'{np.std(cv_mse_scores):.3f}')

            forest_time = time.time() - forest_start_time
            print(f"    Completed {rfr[0]} with bootstrap rate {ms}. Mean MSE: {np.mean(cv_mse_scores):.3f}")
            print(f"    Time taken: {timedelta(seconds=int(forest_time))}")

    dataset_time = time.time() - dataset_start_time
    print(f"  Dataset {dataset_name} completed in {timedelta(seconds=int(dataset_time))}")

total_time = time.time() - start_time
print(f"\nTraining complete in {timedelta(seconds=int(total_time))}. Saving results...")

wyniki = pd.DataFrame(results)
wyniki.to_csv('wyniki_gpu.csv', index=False)
print("Results saved to 'wyniki_gpu.csv'")

Starting GPU-accelerated training with 1 datasets, 9 bootstrap rates, 17 forest configurations, and 400 CV folds

Processing dataset 1/1: Energy_Efficiency_242_preprocessed.csv
  Bootstrap rate 1/9: 0.2
    Forest model 1/17: RF
      Processing fold 10/400
      Processing fold 20/400
      Processing fold 30/400
      Processing fold 40/400
      Processing fold 50/400
      Processing fold 60/400
      Processing fold 70/400
      Processing fold 80/400
      Processing fold 90/400
      Processing fold 100/400
      Processing fold 110/400
      Processing fold 120/400
      Processing fold 130/400
      Processing fold 140/400
      Processing fold 150/400
      Processing fold 160/400
      Processing fold 170/400
      Processing fold 180/400
      Processing fold 190/400
      Processing fold 200/400
      Processing fold 210/400
      Processing fold 220/400
      Processing fold 230/400
      Processing fold 240/400
      Processing fold 250/400
      Processing fold 260/400


KeyboardInterrupt: 

# Training on cpu

In [20]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
import os
from sklearn.metrics import mean_squared_error
import time
from datetime import timedelta


In [21]:
def _cpu_forest(label, **overrides):
    """Pair a label with a scikit-learn forest built with the shared seed and n_jobs=-1."""
    return (label, RandomForestRegressor(random_state=123, n_jobs=-1, **overrides))


# Number of trees.
rf_100 = _cpu_forest('RF', n_estimators=100)
rf_200 = _cpu_forest('RF[200]', n_estimators=200)
rf_500 = _cpu_forest('RF[500]', n_estimators=500)

# Maximum tree depth.
rf_md_10 = _cpu_forest('RF[md10]', max_depth=10)
rf_md_15 = _cpu_forest('RF[md15]', max_depth=15)
rf_md_20 = _cpu_forest('RF[md20]', max_depth=20)
rf_md_25 = _cpu_forest('RF[md25]', max_depth=25)

# Minimum number of samples required to split a node.
rf_mss_3 = _cpu_forest('RF[mss3]', min_samples_split=3)
rf_mss_4 = _cpu_forest('RF[mss4]', min_samples_split=4)
rf_mss_6 = _cpu_forest('RF[mss6]', min_samples_split=6)
rf_mss_8 = _cpu_forest('RF[mss8]', min_samples_split=8)

# Minimum number of samples per leaf.
rf_msl_2 = _cpu_forest('RF[msl2]', min_samples_leaf=2)
rf_msl_3 = _cpu_forest('RF[msl3]', min_samples_leaf=3)
rf_msl_4 = _cpu_forest('RF[msl4]', min_samples_leaf=4)
rf_msl_5 = _cpu_forest('RF[msl5]', min_samples_leaf=5)

# Features considered per split.
rf_mf_log = _cpu_forest('RF[mfLog2]', max_features="log2")
rf_mf_all = _cpu_forest('RF[mfAll]', max_features=None)

In [22]:
# Bootstrap rates to evaluate; values above 1 are realised by duplicating the
# training data (see make_X_train).
max_samples = [0.2, 0.6, 0.8, 1, 1.2, 2, 3, 4, 5]

# Every (label, estimator) pair evaluated for each bootstrap rate.
forests = [
    rf_100, rf_200, rf_500,
    rf_md_10, rf_md_15, rf_md_20, rf_md_25,
    rf_mss_3, rf_mss_4, rf_mss_6, rf_mss_8,
    rf_msl_2, rf_msl_3, rf_msl_4, rf_msl_5,
    rf_mf_log, rf_mf_all,
]

In [23]:
def make_X_train(X_tra, y_tra, max_sample):
    """Tile the training data so bootstrap rates above 1 can be sampled.

    Args:
        X_tra: 2-D NumPy feature array (rows are samples).
        y_tra: 1-D NumPy target array.
        max_sample: Requested bootstrap rate.

    Returns:
        ``(X, y)``. The inputs are returned untouched when ``max_sample <= 1``.
        Otherwise the rows are repeated: twice for rates strictly between 1
        and 2, and ``int(max_sample)`` times for rates of 2 or more.
    """
    if max_sample <= 1:
        return X_tra, y_tra

    # np.tile needs an integer repeat count, so coerce the rate explicitly
    # (a float such as 2.0 would otherwise be rejected).
    repeats = 2 if max_sample < 2 else int(max_sample)
    return np.tile(X_tra, (repeats, 1)), np.tile(y_tra, repeats)

In [24]:
# Collect the dataset files. Only regular *.csv files are kept (sub-folders and
# stray files would break pd.read_csv) and the list is sorted so the processing
# order does not depend on the file system.
dataset_paths = sorted(
    os.path.join(path_to_datasets, filename)
    for filename in os.listdir(path_to_datasets)
    if filename.lower().endswith('.csv')
    and os.path.isfile(os.path.join(path_to_datasets, filename))
)
if not dataset_paths:
    raise FileNotFoundError(f"No CSV datasets found in {path_to_datasets!r}")

# One row is appended per (dataset, bootstrap rate, forest) combination.
results = {'dataset' : [],
           'bootstrap_rate': [],
           'rf' : [],
           'cv_mse_scores' : [],
           'mean_mse' : [],
           'std_mse' : []}

standard_scaler = StandardScaler()
cvsplit = RepeatedKFold(n_splits=2, n_repeats=200, random_state=42)

# Totals used only for progress messages.
total_datasets = len(dataset_paths)
total_ms = len(max_samples)
total_forests = len(forests)
# Ask the splitter for its fold count (n_splits * n_repeats) so the progress
# messages cannot drift from the RepeatedKFold settings above.
total_folds = cvsplit.get_n_splits()

# Preview the first dataset.
pd.read_csv(dataset_paths[0]).head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,Y1
0,2.041777,-1.785875,-0.561951,-1.470077,1.0,-1.341641,-1.760447,-1.814575,15.55
1,2.041777,-1.785875,-0.561951,-1.470077,1.0,-0.447214,-1.760447,-1.814575,15.55
2,2.041777,-1.785875,-0.561951,-1.470077,1.0,0.447214,-1.760447,-1.814575,15.55
3,2.041777,-1.785875,-0.561951,-1.470077,1.0,1.341641,-1.760447,-1.814575,15.55
4,1.284979,-1.229239,0.0,-1.198678,1.0,-1.341641,-1.760447,-1.814575,20.84


In [25]:
# CPU experiment: for every dataset, bootstrap rate and forest configuration,
# run the repeated K-fold splits and record the test MSE of each fold.
start_time = time.time()

print(f"Starting training with {total_datasets} datasets, {total_ms} bootstrap rates, {total_forests} forest configurations, and {total_folds} CV folds")

for dataset_idx, dataset_path in enumerate(dataset_paths, 1):
    dataset_name = os.path.basename(dataset_path)
    print(f"\nProcessing dataset {dataset_idx}/{total_datasets}: {dataset_name}")

    df = pd.read_csv(dataset_path)

    # The CSV carries the saved row index as an "Unnamed: 0" column; it must not
    # be used as a feature. The last column is the regression target.
    feature_columns = [col for col in df.columns[:-1] if not str(col).startswith('Unnamed')]
    X = df[feature_columns].to_numpy()
    y = df.iloc[:, -1].to_numpy()

    # Standardization follows the reference paper.
    # NOTE(review): the scaler is fitted on the full dataset before splitting.
    X = standard_scaler.fit_transform(X)

    for ms_idx, ms in enumerate(max_samples, 1):
        print(f"  Bootstrap rate {ms_idx}/{total_ms}: {ms}")

        for forest_idx, rfr in enumerate(forests, 1):
            forest_start_time = time.time()
            print(f"    Forest model {forest_idx}/{total_forests}: {rfr[0]}")
            cv_mse_scores = []

            # Per-tree sampling ratio. Rates above 1 are realised by duplicating
            # the data in make_X_train, so the estimator then samples the whole
            # (duplicated) set; 1.2 becomes 0.6 of the doubled data. The value
            # depends only on ms, so it is set once per forest, not once per fold.
            rfr[1].max_samples = None if ms >= 1 and ms != 1.2 else (0.6 if ms == 1.2 else ms)

            for fold_idx, (train_index, test_index) in enumerate(cvsplit.split(X, y), 1):
                if fold_idx % 10 == 0:  # Print only every 10 folds to avoid too much output
                    print(f"      Processing fold {fold_idx}/{total_folds}")

                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                # Duplicate the training data for bootstrap rates above 1.
                X_tr, y_tr = make_X_train(X_train, y_train, ms)

                rfr[1].fit(X_tr, y_tr)

                y_hat = rfr[1].predict(X_test)
                mse = mean_squared_error(y_test, y_hat)
                cv_mse_scores.append(mse)

            forest_time = time.time() - forest_start_time
            print(f"    Completed {rfr[0]} with bootstrap rate {ms}. Mean MSE: {np.mean(cv_mse_scores):.3f}")
            print(f"    Time taken: {timedelta(seconds=int(forest_time))}")

            # Record results
            results['dataset'].append(dataset_path)
            results['bootstrap_rate'].append(ms)
            results['rf'].append(rfr[0])
            results['cv_mse_scores'].append(cv_mse_scores)
            results['mean_mse'].append(f'{np.mean(cv_mse_scores):.3f}')
            results['std_mse'].append(f'{np.std(cv_mse_scores):.3f}')

print("\nTraining complete. Saving results...")
wyniki = pd.DataFrame(results)
wyniki.to_csv('wyniki_misiu.csv', index=False)
print("Results saved to 'wyniki_misiu.csv'")


Starting training with 1 datasets, 9 bootstrap rates, 17 forest configurations, and 400 CV folds

Processing dataset 1/1: Energy_Efficiency_242_preprocessed.csv
  Bootstrap rate 1/9: 0.2
    Forest model 1/17: RF
      Processing fold 10/400
      Processing fold 20/400
      Processing fold 30/400
      Processing fold 40/400
      Processing fold 50/400
      Processing fold 60/400
      Processing fold 70/400
      Processing fold 80/400
      Processing fold 90/400
      Processing fold 100/400
      Processing fold 110/400
      Processing fold 120/400
      Processing fold 130/400
      Processing fold 140/400
      Processing fold 150/400
      Processing fold 160/400
      Processing fold 170/400
      Processing fold 180/400
      Processing fold 190/400
      Processing fold 200/400
      Processing fold 210/400
      Processing fold 220/400
      Processing fold 230/400
      Processing fold 240/400
      Processing fold 250/400
      Processing fold 260/400
      Processing

KeyboardInterrupt: 