# Random Forest Instrument Classifier - NSynth

This notebook classifies a given instrument using different features from NSynth samples. These models were trained on Google Colab using Nvidia A100 GPUs.

## Install RAPIDS for random forest GPU support

In [2]:
!git clone --depth 1 https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py


Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 18, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 18 (delta 1), reused 11 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (18/18), 26.30 KiB | 2.19 MiB/s, done.
Resolving deltas: 100% (1/1), done.
Installing RAPIDS remaining 25.04 libraries
Using Python 3.11.12 environment at: /usr
Resolved 173 packages in 1.70s
Downloading libcuml-cu12 (404.9MiB)
Downloading cudf-cu12 (1.7MiB)
Downloading cugraph-cu12 (3.0MiB)
Downloading rmm-cu12 (1.5MiB)
Downloading shapely (2.4MiB)
Downloading datashader (17.5MiB)
Downloading dask (1.3MiB)
Downloading bokeh (6.6MiB)
Downloading ucx-py-cu12 (2.2MiB)
Downloading libcuspatial-cu12 (31.1MiB)
Downloading libcuvs-cu12 (1.1GiB)
Downloading pylibcudf-cu12 (26.4MiB)
Downloading librmm-cu12 (2.9MiB)
Downloading libcudf-cu12 (538.8MiB)
Downloading libcugraph-cu12 (1.4GiB)
Downloading cuspatial-cu12 (4.1Mi

## Imports and initial setup

In [4]:
import rmm, cupy as cp
from rmm.allocators.cupy import rmm_cupy_allocator

# ONE-TIME INITIALISATION: To allocate VRAM on GPU
rmm.reinitialize(
    pool_allocator=True,        # turn on pooled allocator
    initial_pool_size="30GB"     # grab about half the card up front
)
cp.cuda.set_allocator(rmm_cupy_allocator)
import cudf
cudf.set_option("spill", True)   # allow GPU→CPU overspill
from cuml.ensemble import RandomForestClassifier as cuRF
from cuml.metrics import accuracy_score
## from cuml.common import dtype_utils
import tensorflow as tf
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import class_weight
import os, itertools, random, gc, time, json, pprint
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

# Ensuring TF does not use the GPU. We use
# cuML to train our model.
tf.config.set_visible_devices([], 'GPU')

Mounted at /content/drive


# Setup TF Record Parsing

In [5]:

# parse single tf record
def parse_tfrecord(file):
    feature_description={
        'note' : tf.io.FixedLenFeature([], tf.int64),
        'note_str' : tf.io.FixedLenFeature([], tf.string),
        'instrument' : tf.io.FixedLenFeature([], tf.int64),
        'instrument_str' : tf.io.FixedLenFeature([], tf.string),
        'pitch': tf.io.FixedLenFeature([], tf.int64),
        'velocity': tf.io.FixedLenFeature([], tf.int64),
        'sample_rate': tf.io.FixedLenFeature([], tf.int64),
        'audio': tf.io.FixedLenFeature([64000], tf.float32),
        'qualities': tf.io.FixedLenFeature([10], tf.int64),
        'qualities_str': tf.io.VarLenFeature(tf.string),
        'instrument_family': tf.io.FixedLenFeature([], tf.int64),
        'instrument_family_str': tf.io.FixedLenFeature([], tf.string),
        'instrument_source': tf.io.FixedLenFeature([], tf.int64),
        'instrument_source_str': tf.io.FixedLenFeature([], tf.string)
    }

    return tf.io.parse_single_example(file, feature_description)


## Parsing TF Records, Saving Datasets

### Training TFRecord

In [6]:
# intake training dataset
trainDS_raw = tf.data.TFRecordDataset("/content/drive/MyDrive/nsynth-train.tfrecord")

# map parsing function to dataset
trainingDS = trainDS_raw.map(parse_tfrecord)

# grab a few examples and examine them

tenExamples = trainingDS.take(10)

for i, example in enumerate(tenExamples):
    print(f"Example {i+1}")
    print(f"Note: {example['note']}")
    instrument_bytes = example['instrument_str'].numpy()
    print(f"Instrument: {instrument_bytes.decode('utf-8')}")
    instrument_fam_bytes = example['instrument_family_str'].numpy()
    print(f"Instrument Family: {instrument_fam_bytes.decode('utf-8')}")
    qualities_str_list_bytes = example['qualities_str'].values
    qualities_str_list = [value.decode('utf-8') for value in qualities_str_list_bytes.numpy()]
    print(f"Qualities: {qualities_str_list}")


Example 1
Note: 217495
Instrument: bass_synthetic_018
Instrument Family: bass
Qualities: ['distortion', 'fast_decay', 'percussive']
Example 2
Note: 266066
Instrument: bass_synthetic_018
Instrument Family: bass
Qualities: ['distortion', 'fast_decay', 'percussive']
Example 3
Note: 163673
Instrument: bass_synthetic_018
Instrument Family: bass
Qualities: ['distortion', 'fast_decay']
Example 4
Note: 22459
Instrument: bass_synthetic_018
Instrument Family: bass
Qualities: ['distortion', 'fast_decay']
Example 5
Note: 231254
Instrument: bass_synthetic_018
Instrument Family: bass
Qualities: ['distortion', 'fast_decay']
Example 6
Note: 209097
Instrument: bass_synthetic_018
Instrument Family: bass
Qualities: ['distortion', 'fast_decay', 'percussive']
Example 7
Note: 272473
Instrument: bass_synthetic_018
Instrument Family: bass
Qualities: ['distortion', 'fast_decay', 'percussive']
Example 8
Note: 231913
Instrument: bass_synthetic_018
Instrument Family: bass
Qualities: ['distortion', 'fast_decay']
E

## Validation TF Record

In [7]:
# intake validation TF Record

validDS_raw = tf.data.TFRecordDataset("/content/drive/MyDrive/nsynth-valid.tfrecord")

# map parsing function to dataset
validDS = validDS_raw.map(parse_tfrecord)

# grab a few examples and examine them

tenValidExamples = validDS.take(10)
for i, example in enumerate(tenValidExamples):
    print(f"Example {i+1}")
    print(f"Note: {example['note']}")
    instrument_bytes = example['instrument_str'].numpy()
    print(f"Instrument: {instrument_bytes.decode('utf-8')}")
    instrument_fam_bytes = example['instrument_family_str'].numpy()
    print(f"Instrument Family: {instrument_fam_bytes.decode('utf-8')}")
    qualities_str_list_bytes = example['qualities_str'].values
    qualities_str_list = [value.decode('utf-8') for value in qualities_str_list_bytes.numpy()]
    print(f"Qualities: {qualities_str_list}")


Example 1
Note: 245705
Instrument: bass_synthetic_033
Instrument Family: bass
Qualities: []
Example 2
Note: 12195
Instrument: bass_synthetic_033
Instrument Family: bass
Qualities: []
Example 3
Note: 150083
Instrument: bass_synthetic_033
Instrument Family: bass
Qualities: []
Example 4
Note: 184598
Instrument: bass_synthetic_033
Instrument Family: bass
Qualities: []
Example 5
Note: 255197
Instrument: bass_synthetic_033
Instrument Family: bass
Qualities: []
Example 6
Note: 254092
Instrument: bass_synthetic_033
Instrument Family: bass
Qualities: []
Example 7
Note: 167817
Instrument: bass_synthetic_033
Instrument Family: bass
Qualities: []
Example 8
Note: 179327
Instrument: bass_synthetic_033
Instrument Family: bass
Qualities: []
Example 9
Note: 140833
Instrument: bass_synthetic_033
Instrument Family: bass
Qualities: []
Example 10
Note: 279694
Instrument: bass_synthetic_033
Instrument Family: bass
Qualities: []


## Establish features and target

In [8]:
# One-off helper: create a mel filterbank for 16 kHz audio
SR        = 16000          # NSynth sample-rate in Hz
FFT_SIZE  = 1024
HOP       = 256
N_MELS    = 128

mel_mat = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins   = N_MELS,
    num_spectrogram_bins = FFT_SIZE // 2 + 1,
    sample_rate    = SR,
    lower_edge_hertz  = 30.0,
    upper_edge_hertz  = SR/2.0
)

def summarise_mel(mel_spec):
    """Average over time axis -> shape (N_MELS,)"""
    # log(1 + x) keeps dynamics but is always finite
    return tf.math.log1p(tf.reduce_mean(mel_spec, axis=0))

def tfrecord_to_vector(example):
    # audio
    audio = tf.cast(example['audio'], tf.float32)
    audio = audio / (tf.reduce_max(tf.abs(audio)) + 1e-7) # simple normalisation

    spec  = tf.signal.stft(audio, frame_length=FFT_SIZE, frame_step=HOP)
    mag   = tf.abs(spec)
    mel   = tf.matmul(mag, mel_mat) # (time, mel)
    mel   = summarise_mel(mel) # (128,)

    # build feature vector
    features = tf.concat([
        tf.cast([example['note'],
                 example['pitch'],
                 example['velocity'],
                 example['sample_rate'],
                 example['instrument_source']], tf.float32),
        mel                                           # 128-D
    ], axis=0)

    return {
        'x': features, # float32 [133]
        'y': example['instrument_family'] # int64   []
    }


## Get Record Counts

In [9]:
totalTrainRecs = 289205
totalValidRecs = 12678

# commented out because it takes FOREVER
# actual totals pulled from n-synth dataset website
#for _ in trainingDS:
#  totalTrainRecs += 1

#for _ in validDS:
#  totalValidRecs += 1

print(f"Total Training Records: {totalTrainRecs}")
print(f"Total Validation Records: {totalValidRecs}")

Total Training Records: 289205
Total Validation Records: 12678


## Preprocessing Training Data

In [10]:
import math
import pyarrow as pa, pyarrow.parquet as pq  # needed by pandas.to_parquet

OUT_DIR = "/content/drive/MyDrive/NSynth/nsynth_parquet"
os.makedirs(OUT_DIR, exist_ok=True)

from tqdm.auto import tqdm


TRAIN_RECS = 289_205   # ← hard-coded counts from the NSynth docs
VALID_RECS = 12_678
BATCH      = 2048

def write_split(ds_raw, split_name, total_records, batch=BATCH):
    """
    Stream TFRecord → TF → Parquet with < 300 MB peak RAM.
    Creates ONE parquet file by appending each batch with pyarrow.
    """
    ds_vec = (ds_raw
              .map(parse_tfrecord, num_parallel_calls=tf.data.AUTOTUNE)
              .map(tfrecord_to_vector, num_parallel_calls=tf.data.AUTOTUNE)
              .batch(batch)
              .prefetch(tf.data.AUTOTUNE))

    total_batches = math.ceil(total_records / batch)
    writer = None # will be initialised on 1st batch

    # Initiallize our progress bar for splitting
    for batch_dict in tqdm(ds_vec, total=total_batches,
                           desc=f"Converting {split_name}"):
        X = batch_dict['x'].numpy()
        y = batch_dict['y'].numpy().astype(np.int16)

        df_batch = pd.DataFrame(
            X, columns=[f"feat_{i}" for i in range(X.shape[1])]
        ).assign(y=y)

        # Convert to Arrow table and append
        table = pa.Table.from_pandas(df_batch, preserve_index=False)
        if writer is None:                            # first batch → create file
            writer = pq.ParquetWriter(f"{OUT_DIR}/{split_name}.parquet",
                                       table.schema,
                                       compression="snappy")
        writer.write_table(table)

        del df_batch, table, X, y; gc.collect()

    if writer is not None:
        writer.close()
    print(f"✓ Wrote {split_name}.parquet  ({total_records:,} rows)")


## Split and Make Into Parquet

In [11]:
write_split(trainDS_raw, "train", TRAIN_RECS)
write_split(validDS_raw, "valid", VALID_RECS)

Converting train:   0%|          | 0/142 [00:00<?, ?it/s]

✓ Wrote train.parquet  (289,205 rows)


Converting valid:   0%|          | 0/7 [00:00<?, ?it/s]

✓ Wrote valid.parquet  (12,678 rows)


## Preparing the training data

In [12]:
# Read in the "train" and "validate" parquets
train_gdf = cudf.read_parquet(f"{OUT_DIR}/train.parquet")
valid_gdf = cudf.read_parquet(f"{OUT_DIR}/valid.parquet")

# cuML expects 0-based ints, so we have to factorize
y_train_codes, y_uniques = cudf.factorize(train_gdf["y"], sort=False)
y_valid_codes = y_uniques.get_indexer(valid_gdf["y"]).astype("int32")  # unseen = -1

# Verify 11 classes in 'instrument_family' for y
print("Verify 11 classes:", y_uniques.size)

# Need to get rid of y from feature sets
X_train = train_gdf.drop(columns="y").astype("float32") # cuML needs float32
X_valid = valid_gdf.drop(columns="y").astype("float32")

X_train_cu = X_train.to_cupy() # (289 205, 133) float32
y_train_cu = cp.asarray(y_train_codes) # (289 205,) int32
X_valid_cu = X_valid.to_cupy()
y_valid_cu = cp.asarray(y_valid_codes)
y_np = cp.asnumpy(y_train_cu)

del train_gdf, valid_gdf, X_train
gc.collect()
cp.cuda.Device(0).synchronize()
cp.get_default_memory_pool().free_all_blocks()

# Indexing for categorical features
cat_idx = list(range(5)) # note, pitch, velocity, sr, inst_source




Verify 11 classes: 11


## Training the base model

In [13]:
# Initialize classifier with basic hyper-params
rf_gpu = cuRF(
    n_estimators = 64,
    max_depth    = 12,
    n_bins       = 8,
    n_streams    = 6, ###1
    rows_sample  = 0.5,
    max_features = 0.5,
    split_algo   = "GLOBAL_QUANTILE",
    bootstrap    = True,
    random_state = 42,
)

# Train our RF classifier
rf_gpu.fit(X_train_cu, y_train_cu)


  return init_func(self, *args, **kwargs)


# Testing and evaluation

In [14]:
from sklearn.metrics import (
    f1_score, precision_recall_fscore_support,
    confusion_matrix, ConfusionMatrixDisplay, top_k_accuracy_score
)

# Test on validation set
y_pred = rf_gpu.predict(X_valid_cu)

# 1) Accuracy
acc = float(accuracy_score(y_valid_cu, y_pred))
print(f"Accuracy : {acc:.3%} {y_uniques.size}")

# 2) Macro-F1  (sklearn works on CPU, so bring arrays back)
macro_f1 = f1_score(cp.asnumpy(y_valid_cu),
                    cp.asnumpy(y_pred),
                    average="macro")
print(f"Macro F1 : {macro_f1:.3f}")

# 3) Top-3 accuracy
all_labels = np.arange(y_uniques.size)
proba = rf_gpu.predict_proba(X_valid_cu)        # GPU array (N, 11)
top3  = top_k_accuracy_score(cp.asnumpy(y_valid_cu),
                             cp.asnumpy(proba),
                             k=3,
                             labels=all_labels)
print(f"Top-3 Acc: {top3:.3%}")

Accuracy : 64.758% 11
Macro F1 : 0.556
Top-3 Acc: 93.193%


## Saving the base model

In [15]:
# Save the trained base model
import joblib
model_path = "/content/drive/MyDrive/NSynth/models/nsynth_cuml_rf_base.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(rf_gpu, model_path)
print("Saved model to:", model_path)

del rf_gpu
gc.collect()
cp.cuda.Device(0).synchronize()
cp.get_default_memory_pool().free_all_blocks()

Saved model to: /content/drive/MyDrive/NSynth/models/nsynth_cuml_rf_base.joblib


## Hyperparameter tuning

In [16]:
def valid_metrics(rf):
    """Return (macro-F1, accuracy, top-3 accuracy) on the held-out set."""
    y_pred   = rf.predict(X_valid_cu)
    proba    = rf.predict_proba(X_valid_cu)

    f1   = f1_score(cp.asnumpy(y_valid_cu),
                    cp.asnumpy(y_pred),
                    average="macro")
    acc  = accuracy_score(cp.asnumpy(y_valid_cu),
                          cp.asnumpy(y_pred))
    top3 = top_k_accuracy_score(cp.asnumpy(y_valid_cu),
                                cp.asnumpy(proba),
                                k=3,
                                labels=np.arange(y_uniques.size))
    return f1, acc, top3

search_space = list(itertools.product(
    [256, 1024], # n_estimators
    [12, 16], # max_depth
    [ 0.6, "sqrt"], # max_features
    [0.6, 0.9], # rows_sample
    [16, 32], # n_bins
    ["GLOBAL_QUANTILE", "HIST"] # split_algo
))

random.seed(42)
candidates = random.sample(search_space, 64)   # pick 64 combos

results = []

for (n_est, depth, feat, row_samp, bins, algo) in tqdm(candidates,
                                                       desc="H-param sweep"):

    rf = cuRF(
        n_estimators = n_est,   max_depth   = depth,
        max_features = feat,    rows_sample = row_samp,
        n_bins       = bins,    split_algo  = algo,
        n_streams    = 6,       bootstrap   = True, ##n=1
        random_state = 42,
    )

    t0 = time.time()
    rf.fit(X_train_cu, y_train_cu)
    f1, acc, top3 = valid_metrics(rf)
    dt = time.time() - t0

    results.append({
        "F1": f1, "Acc": acc, "Top3": top3,
        "trees": n_est, "depth": depth,
        "max_feat": feat, "row_samp": row_samp,
        "bins": bins, "algo": algo,
        "sec": dt
    })

    # free VRAM before next run
    del rf
    cp.cuda.Device(0).synchronize()
    cp.get_default_memory_pool().free_all_blocks()
    gc.collect()


H-param sweep:   0%|          | 0/64 [00:00<?, ?it/s]

[2025-05-17 19:04:57.832] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:04:57.832] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:04:59.877] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:04:59.877] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:05:04.725] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:05:04.725] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:05:12.128] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:05:12.128] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:05:22.690] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:05:22.690] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:05:24.631] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:05:24.631] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:05:35.773] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:05:35.773] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:05:37.635] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:05:37.635] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:05:48.487] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:05:48.487] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:05:53.865] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:05:53.865] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:06:00.828] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:06:00.828] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:06:11.580] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:06:11.580] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:06:31.957] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:06:31.957] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:06:36.845] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:06:36.845] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:06:55.135] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:06:55.136] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:07:00.155] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:07:00.155] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:07:05.465] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:07:05.465] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:07:16.393] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:07:16.393] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:07:49.390] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:07:49.390] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:07:51.225] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:07:51.225] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:08:02.075] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:08:02.075] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:08:20.637] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:08:20.637] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:08:41.257] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:08:41.257] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:09:12.900] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:09:12.900] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:09:33.472] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:09:33.472] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:09:35.284] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:09:35.284] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:10:07.047] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:10:07.047] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:10:12.343] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:10:12.343] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:10:18.894] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:10:18.894] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:10:24.094] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:10:24.094] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:10:34.813] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:10:34.813] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:10:39.671] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:10:39.671] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:10:41.665] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:10:41.665] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:11:13.237] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:11:13.237] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:11:46.138] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:11:46.138] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:11:57.039] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:11:57.039] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:12:01.876] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:12:01.876] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:12:34.786] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:12:34.786] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:12:40.032] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:12:40.032] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:12:45.412] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:12:45.412] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:12:50.761] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:12:50.761] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:13:22.445] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:13:22.445] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:13:43.268] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:13:43.268] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:14:16.060] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:14:16.060] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:14:17.978] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:14:17.978] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:14:28.500] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:14:28.500] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:14:33.834] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:14:33.834] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:14:40.600] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:14:40.600] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:14:59.222] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:14:59.222] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:15:09.829] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:15:09.829] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:15:20.387] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:15:20.387] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:15:25.662] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:15:25.662] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:15:32.718] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:15:32.718] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:15:43.588] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:15:43.588] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:15:50.320] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:15:50.320] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:15:52.142] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:15:52.142] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:15:59.173] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:15:59.173] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:16:10.070] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:16:10.070] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:16:20.992] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:16:20.992] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:16:39.691] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:16:39.691] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:16:46.452] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:16:46.452] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:16:57.326] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:16:57.326] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:17:02.700] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:17:02.700] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


[2025-05-17 19:17:07.898] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:17:07.898] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)


## Evaluating best results

In [17]:
df = (pd.DataFrame(results)
        .sort_values("F1", ascending=False)
        .reset_index(drop=True))

top_f1   = df.head(10)
top_acc  = df.sort_values("Acc",  ascending=False).head(10)
top_top3 = df.sort_values("Top3", ascending=False).head(10)

print("\n=== TOP-10 BY MACRO-F1 ===")
print(top_f1[["F1","Acc","Top3","trees","depth","max_feat",
              "row_samp","bins","algo","sec"]])

print("\n=== TOP-10 BY ACCURACY ===")
print(top_acc[["Acc","F1","Top3","trees","depth","max_feat",
               "row_samp","bins","algo","sec"]])

print("\n=== TOP-10 BY TOP-3 ACCURACY ===")
print(top_top3[["Top3","F1","Acc","trees","depth","max_feat",
                "row_samp","bins","algo","sec"]])

top_f1.to_csv  ("/content/drive/MyDrive/rf_leaderboard_macroF1.csv",  index=False)
top_acc.to_csv ("/content/drive/MyDrive/rf_leaderboard_accuracy.csv", index=False)
top_top3.to_csv("/content/drive/MyDrive/rf_leaderboard_top3.csv",     index=False)


=== TOP-10 BY MACRO-F1 ===
         F1       Acc      Top3  trees  depth max_feat  row_samp  bins  \
0  0.609156  0.708077  0.940290   1024     16      0.6       0.6    32   
1  0.609156  0.708077  0.940290   1024     16      0.6       0.6    32   
2  0.609156  0.708077  0.940290   1024     16      0.6       0.9    32   
3  0.609156  0.708077  0.940290   1024     16      0.6       0.9    32   
4  0.606218  0.706026  0.940606    256     16      0.6       0.6    32   
5  0.606218  0.706026  0.940606    256     16      0.6       0.9    32   
6  0.606218  0.706026  0.940606    256     16      0.6       0.9    32   
7  0.606218  0.706026  0.940606    256     16      0.6       0.6    32   
8  0.603646  0.706657  0.945496    256     16      0.6       0.9    16   
9  0.603646  0.706657  0.945496    256     16      0.6       0.6    16   

              algo        sec  
0             HIST  32.739425  
1  GLOBAL_QUANTILE  32.536111  
2             HIST  32.658407  
3  GLOBAL_QUANTILE  32.655414

In [18]:
rf = cuRF(
    n_estimators = 2048,   max_depth   = 28,
    max_features = 0.6,    rows_sample = 0.6,
    n_bins       = 32,    split_algo  = 'HIST',
    n_streams    = 6,       bootstrap   = True, ##n=1
    random_state = 42,
)

t0 = time.time()
rf.fit(X_train_cu, y_train_cu)
f1, acc, top3 = valid_metrics(rf)
dt = time.time() - t0

results = []

results.append({
    "F1": f1, "Acc": acc, "Top3": top3,
    "trees": 2048, "depth": 28,
    "max_feat": 0.6, "row_samp": 0.6,
    "bins": 32, "algo": 'HIST',
    "sec": dt
})

# free VRAM before next run
del rf
cp.cuda.Device(0).synchronize()
cp.get_default_memory_pool().free_all_blocks()
gc.collect()

df = (pd.DataFrame(results)
        .sort_values("F1", ascending=False)
        .reset_index(drop=True))

top_f1   = df.head(10)
top_acc  = df.sort_values("Acc",  ascending=False).head(10)
top_top3 = df.sort_values("Top3", ascending=False).head(10)

print("\n=== BEST MODEL F1 ===")
print(top_f1[["F1","Acc","Top3","trees","depth","max_feat",
              "row_samp","bins","algo","sec"]])

print("\n=== BEST MODEL ACCURACY ===")
print(top_acc[["Acc","F1","Top3","trees","depth","max_feat",
               "row_samp","bins","algo","sec"]])

print("\n=== BEST MODEL TOP-3 ACCURACY ===")
print(top_top3[["Top3","F1","Acc","trees","depth","max_feat",
                "row_samp","bins","algo","sec"]])

[2025-05-17 19:17:19.507] [CUML] [info] Unused keyword parameter: rows_sample during cuML estimator initialization
[2025-05-17 19:17:19.508] [CUML] [info] Unused keyword parameter: split_algo during cuML estimator initialization


  return init_func(self, *args, **kwargs)



=== BEST MODEL F1 ===
         F1       Acc      Top3  trees  depth  max_feat  row_samp  bins  algo  \
0  0.642672  0.730478  0.935637   2048     28       0.6       0.6    32  HIST   

          sec  
0  115.367586  

=== BEST MODEL ACCURACY ===
        Acc        F1      Top3  trees  depth  max_feat  row_samp  bins  algo  \
0  0.730478  0.642672  0.935637   2048     28       0.6       0.6    32  HIST   

          sec  
0  115.367586  

=== BEST MODEL TOP-3 ACCURACY ===
       Top3        F1       Acc  trees  depth  max_feat  row_samp  bins  algo  \
0  0.935637  0.642672  0.730478   2048     28       0.6       0.6    32  HIST   

          sec  
0  115.367586  
