UWAGA – poniżej znajduje się również kod do testów na CPU. Do lokalnych testów warto utworzyć tymczasowy katalog z jednym zbiorem danych, na podstawie którego będziemy porównywać czas działania.

In [1]:
# Directory that is listed further below to discover the dataset files.
# Alternative location kept for quick local runs on a small subset:
# path_to_datasets = './test_data/small/'
path_to_datasets = '../test_data/'

In [2]:
import cupy as cp

# Sanity check of the CUDA setup before any GPU work is attempted.
gpu_available = cp.cuda.is_available()
print("Czy GPU działa?", gpu_available)

# Only query device 0 when CUDA is usable; otherwise the query itself fails.
if gpu_available:
    gpu_name = cp.cuda.runtime.getDeviceProperties(0)['name']
    # The runtime reports the name as bytes; decode it so the printed line
    # shows plain text instead of a b'...' repr.
    if isinstance(gpu_name, bytes):
        gpu_name = gpu_name.decode()
    print("GPU:", gpu_name)

Czy GPU działa? True
GPU: b'NVIDIA GeForce RTX 4050 Laptop GPU'


# Training on cpu

In [3]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold
import os
from sklearn.metrics import mean_squared_error
import time
from datetime import timedelta


In [15]:
def _make_forest(label, **overrides):
    """Return a ``(label, estimator)`` pair for one random-forest variant.

    All variants share ``random_state=123`` and ``n_jobs=-1``; ``overrides``
    carries the hyper-parameter that distinguishes this variant.
    """
    return label, RandomForestRegressor(random_state=123, n_jobs=-1, **overrides)


# Number of trees.
rf_100 = _make_forest('RF', n_estimators=100)
rf_200 = _make_forest('RF[200]', n_estimators=200)
rf_500 = _make_forest('RF[500]', n_estimators=500)

# Maximum tree depth.
rf_md_10 = _make_forest('RF[md10]', max_depth=10)
rf_md_15 = _make_forest('RF[md15]', max_depth=15)
rf_md_20 = _make_forest('RF[md20]', max_depth=20)
rf_md_25 = _make_forest('RF[md25]', max_depth=25)

# Minimum number of samples required to split a node.
rf_mss_3 = _make_forest('RF[mss3]', min_samples_split=3)
rf_mss_4 = _make_forest('RF[mss4]', min_samples_split=4)
rf_mss_6 = _make_forest('RF[mss6]', min_samples_split=6)
rf_mss_8 = _make_forest('RF[mss8]', min_samples_split=8)

# Minimum number of samples required in a leaf.
rf_msl_2 = _make_forest('RF[msl2]', min_samples_leaf=2)
rf_msl_3 = _make_forest('RF[msl3]', min_samples_leaf=3)
rf_msl_4 = _make_forest('RF[msl4]', min_samples_leaf=4)
rf_msl_5 = _make_forest('RF[msl5]', min_samples_leaf=5)

# Number of features considered at each split.
rf_mf_log = _make_forest('RF[mfLog2]', max_features="log2")
rf_mf_all = _make_forest('RF[mfAll]', max_features=None)

In [16]:
# Bootstrap rates to evaluate; each value is passed on as the ``ms`` /
# ``max_sample`` argument of run_single_fold and make_X_train.
max_samples = [0.2, 0.6, 0.8, 1, 1.2, 2, 3, 4, 5]

# Every forest variant as a (label, estimator) pair, grouped by the
# hyper-parameter that is varied.
forests = [
    rf_100, rf_200, rf_500,
    rf_md_10, rf_md_15, rf_md_20, rf_md_25,
    rf_mss_3, rf_mss_4, rf_mss_6, rf_mss_8,
    rf_msl_2, rf_msl_3, rf_msl_4, rf_msl_5,
    rf_mf_log, rf_mf_all,
]

In [None]:
def make_X_train(X_tra, y_tra, max_sample):
    """Replicate the training data often enough to cover ``max_sample``.

    The data is stacked ``ceil(max_sample)`` times so that a bootstrap rate
    above 1 can be realised by sampling from the enlarged set.

    Args:
        X_tra: 2-D feature array of the training part.
        y_tra: 1-D target array aligned with ``X_tra``.
        max_sample: Requested bootstrap rate relative to the training size.
            May be fractional (e.g. ``1.2`` or ``2.5``).

    Returns:
        ``(X, y)``: the inputs themselves when ``max_sample <= 1``, otherwise
        new arrays holding ``ceil(max_sample)`` stacked copies.
    """
    # Smallest whole number of copies that can supply the requested rate.
    # Rounding up here also keeps np.tile from receiving a float repeat
    # count for non-integer rates of 2 or more.
    n_copies = int(np.ceil(max_sample))
    if n_copies <= 1:
        return X_tra, y_tra
    return np.tile(X_tra, (n_copies, 1)), np.tile(y_tra, n_copies)

In [None]:
# Collect the dataset files.  Only ``.csv`` entries are kept so that other
# directory content (sub-folders, checkpoint directories, notes, ...) is never
# handed to pd.read_csv, and the listing is sorted so the processing order is
# the same on every run.
dataset_paths = []
for filename in sorted(os.listdir(path_to_datasets)):
    if not filename.lower().endswith('.csv'):
        continue
    full_path = os.path.join(path_to_datasets, filename)
    dataset_paths.append(full_path)

# Fail with a clear message instead of an IndexError on the preview below.
if not dataset_paths:
    raise FileNotFoundError(f"No .csv datasets found in {path_to_datasets!r}")

# Column-oriented container for aggregated results.
results = {'dataset' : [],
           'bootstrap_rate': [],
           'rf' : [],
           'cv_mse_scores' : [],
           'mean_mse' : [],
           'std_mse' : []}

standard_scaler = StandardScaler()
cvsplit = RepeatedKFold(n_splits=2, n_repeats=200, random_state=42)

# Sizes of each experiment dimension.
total_datasets = len(dataset_paths)
total_ms = len(max_samples)
total_forests = len(forests)
# Ask the splitter itself (n_splits * n_repeats) so the count cannot drift
# from the configuration above.
total_folds = cvsplit.get_n_splits()

# Preview of the first dataset (shown as the cell output in a notebook).
pd.read_csv(dataset_paths[0]).head()

In [None]:
from joblib import Parallel, delayed
from sklearn.base import clone
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error
import time
from datetime import timedelta
import os

# Flat list of per-fold result records (the dicts returned by run_single_fold).
# This rebinds ``results``, which an earlier cell set to a dict of lists.
results = []

def run_single_fold(model, ms, X, y, train_index, test_index, fold_idx, dataset_name,
                    model_name='RF'):
    """Fit a copy of ``model`` on one cross-validation fold and score it.

    The training part of the fold goes through ``make_X_train`` so that
    bootstrap rates above 1 can be emulated by sampling from a replicated
    training set.  The forest's ``max_samples`` is then chosen so that the
    per-tree sample size corresponds to ``ms`` times the size of the
    *original* training part.

    Args:
        model: Estimator template; it is cloned, and only the clone is
            configured and fitted.
        ms: Requested bootstrap rate relative to the training-part size.
        X: Feature array, indexed with ``train_index`` / ``test_index``.
        y: Target array aligned with ``X``.
        train_index: Indices of the training rows of this fold.
        test_index: Indices of the held-out rows of this fold.
        fold_idx: Fold number stored in the result record.
        dataset_name: Dataset label stored in the result record.
        model_name: Model label stored in the result record.

    Returns:
        dict with the keys ``dataset``, ``model``, ``max_samples``, ``fold``,
        ``mse`` and ``elapsed_time`` (a ``timedelta`` covering data
        preparation, fitting and prediction for this fold).
    """
    start_time = time.time()

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    X_tr, y_tr = make_X_train(X_train, y_train, ms)

    model = clone(model)
    # Express the requested rate relative to the (possibly replicated) training
    # set: e.g. ms=1.2 on a set stacked twice becomes a fraction of 0.6.  A
    # fraction of 1 or more means "use the full set", i.e. max_samples=None
    # (an integer 1 would be read as "one sample" by the forest).
    n_copies = len(X_tr) // len(X_train)
    fraction = ms / n_copies
    model.max_samples = fraction if fraction < 1 else None
    model.fit(X_tr, y_tr)

    y_hat = model.predict(X_test)
    mse = mean_squared_error(y_test, y_hat)

    elapsed_time = timedelta(seconds=(time.time() - start_time))

    return {
        'dataset': dataset_name,
        'model': model_name,
        'max_samples': ms,
        'fold': fold_idx,
        'mse': mse,
        'elapsed_time': elapsed_time
    }

# Wall-clock start of the whole experiment, used for the total-time report.
start_time = time.time()

# The splitter configuration does not depend on the dataset, so build it once.
cvsplit = RepeatedKFold(n_splits=10, n_repeats=10, random_state=123)

for dataset_path in dataset_paths:
    dataset_name = os.path.basename(dataset_path).replace('.csv', '')
    print(f"Processing dataset: {dataset_name}")

    data = pd.read_csv(dataset_path)
    X = data.drop('target', axis=1).values
    y = data['target'].values

    # Records of the current dataset only, so that the per-dataset CSV does
    # not also contain the rows of every dataset processed before it.
    dataset_results = []

    for ms in max_samples:
        print(f"max_samples = {ms}")

        # One task per CV fold.
        # NOTE(review): the forest itself is configured with n_jobs=-1 while
        # the folds are also run with n_jobs=-1 - confirm that this nesting
        # does not oversubscribe the CPU on the target machine.
        fold_results = Parallel(n_jobs=-1)(
            delayed(run_single_fold)(
                rf_100[1], ms, X, y, train_idx, test_idx, fold_idx, dataset_name
            )
            for fold_idx, (train_idx, test_idx) in enumerate(cvsplit.split(X, y), 1)
        )
        dataset_results.extend(fold_results)

    # Keep the cumulative list for later use in the session.
    results.extend(dataset_results)

    # Save after each dataset so finished work survives a later failure.
    results_df = pd.DataFrame(dataset_results)
    results_df.to_csv(f'wyniki_{dataset_name}_bb.csv', index=False)
    print(f"Wyniki zapisane dla {dataset_name}.")

In [None]:
# Report the wall-clock duration of the whole run.  ``start_time`` has to be a
# module-level ``time.time()`` stamp taken before the experiments began; the
# local variable of the same name inside run_single_fold is not visible here.
end_time = time.time()
total_elapsed = timedelta(seconds=end_time - start_time)
print(f'Total elapsed time: {total_elapsed}')