In [15]:
import tensorflow as tf
import numpy as np
import joblib

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Setup TF Record Parser

In [17]:
# parse single tf record
def parse_tfrecord(file):
    feature_description={
        'note' : tf.io.FixedLenFeature([], tf.int64),
        'note_str' : tf.io.FixedLenFeature([], tf.string),
        'instrument' : tf.io.FixedLenFeature([], tf.int64),
        #'instrument_str' : tf.io.FixedLenFeature([], tf.string),
        'pitch': tf.io.FixedLenFeature([], tf.int64),
        'velocity': tf.io.FixedLenFeature([], tf.int64),
        'sample_rate': tf.io.FixedLenFeature([], tf.int64),
        'audio': tf.io.FixedLenFeature([64000], tf.float32),
        'qualities': tf.io.FixedLenFeature([10], tf.int64),
        #'qualities_str': tf.io.VarLenFeature(tf.string),
        'instrument_family': tf.io.FixedLenFeature([], tf.int64),
        #'instrument_family_str': tf.io.FixedLenFeature([], tf.string),
        'instrument_source': tf.io.FixedLenFeature([], tf.int64),
        #'instrument_source_str': tf.io.FixedLenFeature([], tf.string)
    }

    return tf.io.parse_single_example(file, feature_description)


# Parsing TF Records, Saving Datasets

## Intake Training TFRecord

In [18]:
from collections import defaultdict

# intake training dataset
trainDS_raw = tf.data.TFRecordDataset("/content/drive/MyDrive/nsynth-train.tfrecord")

# map parsing function to dataset
trainingDS = trainDS_raw.map(parse_tfrecord)

from sklearn.decomposition import IncrementalPCA
import numpy as np




## Intake Validation TFRecord

In [19]:
# intake validation TF Record

validDS_raw = tf.data.TFRecordDataset("/content/drive/MyDrive/nsynth-valid.tfrecord")

# map parsing function to dataset
validDS = validDS_raw.map(parse_tfrecord)


# Intake Test TFRecord

In [20]:
# intake test TF Record

testDS_raw = tf.data.TFRecordDataset("/content/drive/MyDrive/nsynth-test.tfrecord")

# map parsing function to dataset
testDS = testDS_raw.map(parse_tfrecord)


## Establish Features and Target

In [21]:
import time
import csv
from tqdm import tqdm
import re
import numpy as np
import pandas as pd

# get features and target
# note target is 'instrument'
# features are select other categories
feat1 = ['note','instrument_source']
feat2 = ['velocity','pitch']
feat3 = ['audio']
target = ['instrument_family']

## Get Record Counts

In [22]:
totalTrainRecs = 289205
totalValidRecs = 12678
totalTestRecs = 4096
print(f"Total Training Records: {totalTrainRecs}")
print(f"Total Validation Records: {totalValidRecs}")
print(f"Total Test Records: {totalTestRecs}")

Total Training Records: 289205
Total Validation Records: 12678
Total Test Records: 4096


# Convert Training Audio Arrays to Mel Spectograms

In [9]:
# preprocess train audio arrays into mel spectograms
import librosa
for example in tqdm(trainingDS,total=totalTrainRecs,desc="Converting Arrays to Spectograms"):
#for sample in trainingDS.take(1):
  audio = example['audio']
  audio_np = audio.numpy()

  #print(f"Audio shape: {audio_np.shape}, dtype: {audio_np.dtype}")
  mel_spec = librosa.feature.melspectrogram(
      y=audio_np,
      sr=16000,
      n_fft=1024,
      hop_length=256,
      n_mels=64
  )
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
  #print(mel_spec_db)
 # Min-max normalization to [0, 1]
  min_val = mel_spec_db.min()
  max_val = mel_spec_db.max()
  mel_spec= (mel_spec_db - min_val) / (max_val - min_val + 1e-6)
  #print(f"Success! Mel shape: {mel_spec.shape}")
  #print(mel_spec)
  mel_spec = mel_spec.flatten()
  #print(f"Success! Mel shape: {mel_spec.shape}")
  example['audio'] = mel_spec

Converting Arrays to Spectograms: 100%|██████████| 289205/289205 [1:00:21<00:00, 79.85it/s] 


# Convert Validation Audio Arrays to Mel Spectograms

In [10]:
# preprocess valid audio arrays into mel spectograms
import librosa

for example in tqdm(validDS,total=totalValidRecs,desc="Converting Arrays to Spectograms"):
#for sample in trainingDS.take(1):
  audio = example['audio']
  audio_np = audio.numpy()

  #print(f"Audio shape: {audio_np.shape}, dtype: {audio_np.dtype}")
  mel_spec = librosa.feature.melspectrogram(
      y=audio_np,
      sr=16000,
      n_fft=1024,
      hop_length=256,
      n_mels=64
  )
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
  #print(mel_spec_db)
 # Min-max normalization to [0, 1]
  min_val = mel_spec_db.min()
  max_val = mel_spec_db.max()
  mel_spec= (mel_spec_db - min_val) / (max_val - min_val + 1e-6)
  #print(f"Success! Mel shape: {mel_spec.shape}")
  #print(mel_spec)
  mel_spec = mel_spec.flatten()
  #print(f"Success! Mel shape: {mel_spec.shape}")
  example['audio'] = mel_spec

Converting Arrays to Spectograms: 100%|██████████| 12678/12678 [02:25<00:00, 87.29it/s] 


In [None]:
# preprocess test audio arrays into mel spectograms

for example in tqdm(testDS,total=totalTestRecs,desc="Converting Arrays to Spectograms"):
#for sample in trainingDS.take(1):
  audio = example['audio']
  audio_np = audio.numpy()

  #print(f"Audio shape: {audio_np.shape}, dtype: {audio_np.dtype}")
  mel_spec = librosa.feature.melspectrogram(
      y=audio_np,
      sr=16000,
      n_fft=1024,
      hop_length=256,
      n_mels=64
  )
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
  #print(mel_spec_db)
 # Min-max normalization to [0, 1]
  min_val = mel_spec_db.min()
  max_val = mel_spec_db.max()
  mel_spec= (mel_spec_db - min_val) / (max_val - min_val + 1e-6)
  #print(f"Success! Mel shape: {mel_spec.shape}")
  #print(mel_spec)
  mel_spec = mel_spec.flatten()
  #print(f"Success! Mel shape: {mel_spec.shape}")
  example['audio'] = mel_spec

Converting Arrays to Spectograms: 100%|██████████| 4096/4096 [00:52<00:00, 77.49it/s] 


# PreProcessing Training Data in Batches

### NOTE: Function creates files in /content/svmBatches/batchfeatures*, structure file directory accordingly or modify as needed

In [12]:
def batch_parquet(parsedDataset, featnum, features=[], batch_size=1000, max_records=None):

  batch_files = []
  total_recs = 0
  batch_idx = 0
  # Convert dataset to batches
  batched_dataset = parsedDataset.batch(batch_size)
  n_batches = 289205 // batch_size
  for batch in tqdm(batched_dataset,total=n_batches,desc="Batches to Parquet"):
    # Process each batch
    batch_dict = {key: [] for key in features}

    # Get batch size (might be smaller for the last batch)
    current_batch_size = tf.shape(next(iter(batch.values())))[0].numpy()

    # Extract features for each record in the batch
    for i in range(current_batch_size):
      for key in features:
        # Handle different types of features
        feature = batch[key][i]
        if isinstance(feature, tf.Tensor):
          value = feature.numpy()
          # Convert bytes to string if applicable
          if isinstance(value, bytes):
            value = value.decode('utf-8')
        else:
            value = feature
        batch_dict[key].append(value)

    # Create DataFrame for this batch and append to list
    df_batch = pd.DataFrame(batch_dict)

    # Write to parquet
    batch_file = f"/content/svmBatches/bf{featnum}/batch_{batch_idx}.parquet"
    batch_idx += 1
    df_batch.to_parquet(batch_file)
    batch_files.append(batch_file)

    # Update record count
    total_recs += current_batch_size

    # Check if we've reached the maximum number of records
    if max_records is not None and total_recs >= max_records:
        break
  return
# Usage:
# NOTE: returns arent used, these need modification at some point
# process features
batch_parquet(trainingDS, 1, feat1, batch_size=1000,max_records=289205)
batch_parquet(trainingDS, 2, feat2, batch_size=1000,max_records=289205)
batch_parquet(trainingDS, 3, feat3, batch_size=1000,max_records=289205)

# process target
batch_parquet(trainingDS, 0, target, batch_size=1000,max_records=289205)


Batches to Parquet: 100%|██████████| 289/289 [17:00<00:00,  3.53s/it]
Batches to Parquet: 100%|██████████| 289/289 [06:03<00:00,  1.26s/it]
Batches to Parquet: 100%|██████████| 289/289 [16:28<00:00,  3.42s/it]
Batches to Parquet: 100%|██████████| 289/289 [06:46<00:00,  1.40s/it]


## Train SVM Model

## Check for Number of Cores available

In [None]:

import multiprocessing
from sklearn.preprocessing import StandardScaler
from cuml import MBSGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from collections import Counter
import joblib
import glob
import gc
numCores = multiprocessing.cpu_count()
print(f"Available Cores: {numCores}")


stdout:



stderr:

Traceback (most recent call last):
  File "<string>", line 4, in <module>
  File "/usr/local/lib/python3.11/dist-packages/numba_cuda/numba/cuda/cudadrv/driver.py", line 314, in __getattr__
    raise CudaSupportError("Error at driver init: \n%s:" %
numba.cuda.cudadrv.error.CudaSupportError: Error at driver init: 

CUDA driver library cannot be found.
If you are sure that a CUDA driver is installed,
try setting environment variable NUMBA_CUDA_DRIVER
with the file path of the CUDA driver shared library.
:


Not patching Numba


Available Cores: 8


## Train Model in Batches

## Helper Functions for Training & FilePaths

In [14]:
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
import glob
import gc
from sklearn.preprocessing import StandardScaler
import ast
from tqdm import tqdm
import pandas as pd
import joblib
import librosa
import librosa.display

featuresDirs = [
    "/content/svmBatches/bf1",
    "/content/svmBatches/bf2",
    "/content/svmBatches/bf3"
]

targetDir = "/content/svmBatches/bf0"

def collect_all_classes(target_dir, num_batches):
    print("Collecting unique classes...")
    all_classes = set()

    for batch_idx in tqdm(range(num_batches), desc="Collecting unique classes"):
        try:
            target_file = f"{target_dir}/batch_{batch_idx}.parquet"
            y_batch = pd.read_parquet(target_file)
            if y_batch.shape[1] == 1:
                y_batch = y_batch.iloc[:, 0].values

            # Add all unique classes from this batch
            batch_classes = np.unique(y_batch)
            all_classes.update(batch_classes)

        except Exception as e:
            print(f"Error loading target {batch_idx}: {e}")
            # If we hit a consistent error, we might have reached the end
            if "No such file or directory" in str(e):
                break

    all_classes = np.array(sorted(list(all_classes)))
    print(f"Found {len(all_classes)} unique classes in the target")
    return all_classes

def load_batch_from_multiple_dirs(batch_idx, feature_dirs, target_dir):
  """Load features from multiple directories and combine them"""
  feature_arrays = []
  audio_data = None  # Separate variable to store audio data

  # Load features from each directory
  for feature_dir in feature_dirs:
    feature_file = f"{feature_dir}/batch_{batch_idx}.parquet"
    try:
      if os.path.exists(feature_file):
        df = pd.read_parquet(feature_file)
        # Check if this is the audio feature
      if 'audio' in df.columns:
        # Reshape audio data to 2D
        audio_data = df['audio'].apply(lambda x: np.array(x).reshape(1, -1)).values
        audio_data = np.concatenate(audio_data, axis=0)
      else:
        # Append other features to list
        feature_arrays.append(df.values)
      del df
      gc.collect()

    except Exception as e:
      print(f"Error loading {feature_file}: {e}")
      #return None, None

    # Combine features horizontally, excluding audio

    try:
        X_batch_other_features = np.concatenate(feature_arrays, axis=1)
    except Exception as e:
        print(f"Error concatenating features for batch {batch_idx}: {e}")

    # Load target
    target_file = f"{target_dir}/batch_{batch_idx}.parquet"
    try:
      y_batch = pd.read_parquet(target_file)
      if y_batch.shape[1] == 1:
        y_batch = y_batch.iloc[:, 0].values
      else:
        y_batch = y_batch.values
    except Exception as e:
        print(f"Error loading target for batch {batch_idx}: {e}")

    # Combine other features and audio
    if audio_data is not None:
      X_batch = np.concatenate([X_batch_other_features, audio_data], axis=1)
    else:
      X_batch = X_batch_other_features

    del X_batch_other_features
    gc.collect()
  return X_batch, y_batch

# Train Model

In [23]:
from sklearn.linear_model import SGDClassifier
from sklearn.utils.class_weight import compute_class_weight
import os

scaler = StandardScaler()

def train_model(features_dir, target_dir, num_batches, batch_size=10):
# Collect all classes
    all_classes = collect_all_classes(target_dir, num_batches)

      # Fit scaler incrementally
    print("Fitting scaler incrementally...")
    n_scaler_batches = min(30, num_batches)  # Use fewer batches for scaler
    scaler_batches = np.random.choice(num_batches, size=n_scaler_batches, replace=False)

    # First batch to initialize scaler
    first_batch_idx = scaler_batches[0]
    X_first, _ = load_batch_from_multiple_dirs(first_batch_idx, features_dir, target_dir)
    if X_first is not None:
        # Sample subset if batch is large
        if X_first.shape[0] > 5000:
            sample_indices = np.random.choice(X_first.shape[0], 5000, replace=False)
            X_first = X_first[sample_indices]
        scaler.partial_fit(X_first)
        del X_first
        gc.collect()
    # Fit on remaining batches
    for batch_idx in tqdm(scaler_batches[1:], desc="Fitting scaler incrementally"):
        X_batch, _ = load_batch_from_multiple_dirs(batch_idx, features_dir, target_dir)
        if X_batch is not None:
            # Sample subset if batch is large
            if X_batch.shape[0] > 5000:
                sample_indices = np.random.choice(X_batch.shape[0], 5000, replace=False)
                X_batch = X_batch[sample_indices]
            scaler.partial_fit(X_batch)
            del X_batch
            gc.collect()

    print("Scaler fitted!")
    # Collect targets for class weights
    print("Computing class weights...")
    train_targets = []
    for batch_idx in tqdm(range(num_batches), desc="Collecting targets for class weights"):
        _, y_batch = load_batch_from_multiple_dirs(batch_idx, features_dir, target_dir)
        if y_batch is not None:
            train_targets.extend(y_batch)

    train_targets = np.array(train_targets)
    class_weights = compute_class_weight('balanced', classes=all_classes, y=train_targets)
    class_weights_dict = dict(zip(all_classes, class_weights))
    class_weights_dict[2] *= 1.5  # Boost weight for class 2
    class_weights_dict[3] *= 1.5  # Boost weight for class 3
    class_weights_dict[10] *= 5.0  # Significantly boost class 10
    print("Class weights calculated!")

    # Initialize model
    model = SGDClassifier(
        loss='modified_huber',
        learning_rate='optimal',
        eta0=0.001,
        alpha=0.1,
        l1_ratio=0.2,
        fit_intercept=True,
        tol=1e-4,
        class_weight=class_weights_dict,
        random_state=42,
        n_jobs=8,
        warm_start=True  # warm start for partial_fit
    )

    # Training
    train_accuracies = []
    n_samples_processed = 0
    n_epochs = 2

    # Store the last chunk for testing
    previous_X_test = None
    previous_y_test = None

    for epoch in range(n_epochs):
        epoch_time = time.time()
        print(f"\n=== Epoch {epoch + 1}/{n_epochs} ===")

        all_batch_indices = list(range(num_batches))

        # Shuffle multiple times and interleave
        shuffled_sets = []
        for _ in range(3):
            shuffled = all_batch_indices.copy()
            np.random.shuffle(shuffled)
            shuffled_sets.append(shuffled)

        # Interleave the shuffles
        batch_indices = []
        for i in range(num_batches):
            batch_indices.append(shuffled_sets[i % 3][i // 3])
        epoch_accuracies = []

        for chunk_start in range(0, num_batches, batch_size):
            chunk_end = min(chunk_start + batch_size, num_batches)
            chunk_indices = batch_indices[chunk_start:chunk_end]

            X_chunk_list = []
            y_chunk_list = []

            # Load batches in this chunk
            for batch_idx in tqdm(chunk_indices, desc=f"Epoch {epoch+1}, Chunk {chunk_start//batch_size}"):
                X_batch, y_batch = load_batch_from_multiple_dirs(batch_idx, features_dir, target_dir)

                if X_batch is not None and y_batch is not None:
                    X_chunk_list.append(X_batch)
                    y_chunk_list.append(y_batch)

            if X_chunk_list:
                # Stack and scale
                X_chunk = np.vstack(X_chunk_list)
                y_chunk = np.concatenate(y_chunk_list)
                X_chunk_scaled = scaler.transform(X_chunk)

                # Train
                print(f"Training on {X_chunk.shape[0]} samples...")
                start_time = time.time()

                if n_samples_processed == 0:
                    model.partial_fit(X_chunk_scaled, y_chunk, classes=all_classes)
                else:
                    model.partial_fit(X_chunk_scaled, y_chunk)

                fit_time = time.time() - start_time
                print(f"Fitting time: {fit_time:.3f} seconds")
                n_samples_processed += X_chunk.shape[0]

                # Test on previous chunk (if available) and test every 5 chunks
                if previous_X_test is not None and chunk_start % 5 == 0:
                    test_pred = model.predict(previous_X_test)
                    test_acc = accuracy_score(previous_y_test, test_pred)
                    train_accuracies.append(test_acc)
                    epoch_accuracies.append(test_acc)
                    print(f"Test accuracy (on previous chunk): {test_acc:.4f}")

                # Save current chunk for next iteration's testing
                test_size = min(2000, X_chunk_scaled.shape[0])
                test_indices = np.random.choice(X_chunk_scaled.shape[0], test_size, replace=False)
                previous_X_test = X_chunk_scaled[test_indices].copy()
                previous_y_test = y_chunk[test_indices].copy()

                del X_chunk, X_chunk_scaled, y_chunk
                gc.collect()

        # Epoch summary
        if epoch_accuracies:
            epoch_mean_acc = np.mean(epoch_accuracies)
            print(f"\nEpoch {epoch + 1} complete")
            print(f"Samples processed: {n_samples_processed}")
            print(f"Mean accuracy: {epoch_mean_acc:.4f}")
            print(f"Recent accuracies: {train_accuracies[-5:]}")
        else:
            print(f"\nEpoch {epoch + 1} complete")
            print(f"Samples processed: {n_samples_processed}")

        print(f"Time: {(time.time() - epoch_time)/60:.2f} minutes")

    print("\nTraining complete")
    print(f"Final accuracies: {train_accuracies[-10:]}")

    return model, scaler

print("Training model...")
begTime = time.time()
model_new = train_model(featuresDirs, targetDir, num_batches=289)
endTime = time.time() - begTime
print(f"Total Model Training Time: {float(endTime/60)} mins")
# Save the model and the scaler
joblib.dump(model_new, '/content/drive/MyDrive/svm3_with_scaler.pkl')
joblib.dump(scaler, '/content/drive/MyDrive/svm3_scaler.pkl')
print("New model saved!")

Training model...
Collecting unique classes...


Collecting unique classes: 100%|██████████| 289/289 [00:00<00:00, 350.19it/s]


Found 11 unique classes in the target
Fitting scaler incrementally...


Fitting scaler incrementally: 100%|██████████| 29/29 [03:31<00:00,  7.31s/it]


Scaler fitted!
Computing class weights...


Collecting targets for class weights: 100%|██████████| 289/289 [24:00<00:00,  4.98s/it]


Class weights calculated!

=== Epoch 1/2 ===


Epoch 1, Chunk 0: 100%|██████████| 10/10 [00:44<00:00,  4.44s/it]


Training on 10000 samples...
Fitting time: 6.719 seconds


Epoch 1, Chunk 1: 100%|██████████| 10/10 [00:45<00:00,  4.57s/it]


Training on 10000 samples...
Fitting time: 3.826 seconds
Test accuracy (on previous chunk): 0.4495


Epoch 1, Chunk 2: 100%|██████████| 10/10 [00:43<00:00,  4.37s/it]


Training on 10000 samples...
Fitting time: 3.639 seconds
Test accuracy (on previous chunk): 0.4945


Epoch 1, Chunk 3: 100%|██████████| 10/10 [00:42<00:00,  4.30s/it]


Training on 10000 samples...
Fitting time: 3.808 seconds
Test accuracy (on previous chunk): 0.3685


Epoch 1, Chunk 4: 100%|██████████| 10/10 [00:48<00:00,  4.87s/it]


Training on 10000 samples...
Fitting time: 4.233 seconds
Test accuracy (on previous chunk): 0.2830


Epoch 1, Chunk 5: 100%|██████████| 10/10 [01:09<00:00,  6.94s/it]


Training on 10000 samples...
Fitting time: 7.587 seconds
Test accuracy (on previous chunk): 0.3945


Epoch 1, Chunk 6: 100%|██████████| 10/10 [00:49<00:00,  4.99s/it]


Training on 10000 samples...
Fitting time: 3.988 seconds
Test accuracy (on previous chunk): 0.3510


Epoch 1, Chunk 7: 100%|██████████| 10/10 [00:48<00:00,  4.88s/it]


Training on 10000 samples...
Fitting time: 3.625 seconds
Test accuracy (on previous chunk): 0.4410


Epoch 1, Chunk 8: 100%|██████████| 10/10 [00:46<00:00,  4.66s/it]


Training on 10000 samples...
Fitting time: 3.717 seconds
Test accuracy (on previous chunk): 0.2900


Epoch 1, Chunk 9: 100%|██████████| 10/10 [00:44<00:00,  4.47s/it]


Training on 10000 samples...
Fitting time: 3.947 seconds
Test accuracy (on previous chunk): 0.1450


Epoch 1, Chunk 10: 100%|██████████| 10/10 [00:44<00:00,  4.48s/it]


Training on 10000 samples...
Fitting time: 3.823 seconds
Test accuracy (on previous chunk): 0.3575


Epoch 1, Chunk 11: 100%|██████████| 10/10 [00:48<00:00,  4.86s/it]


Training on 10000 samples...
Fitting time: 4.328 seconds
Test accuracy (on previous chunk): 0.3010


Epoch 1, Chunk 12: 100%|██████████| 10/10 [01:05<00:00,  6.60s/it]


Training on 10000 samples...
Fitting time: 4.331 seconds
Test accuracy (on previous chunk): 0.2215


Epoch 1, Chunk 13: 100%|██████████| 10/10 [00:47<00:00,  4.73s/it]


Training on 10000 samples...
Fitting time: 4.145 seconds
Test accuracy (on previous chunk): 0.2620


Epoch 1, Chunk 14: 100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


Training on 10000 samples...
Fitting time: 3.872 seconds
Test accuracy (on previous chunk): 0.2835


Epoch 1, Chunk 15: 100%|██████████| 10/10 [01:19<00:00,  7.95s/it]


Training on 10000 samples...
Fitting time: 3.793 seconds
Test accuracy (on previous chunk): 0.1745


Epoch 1, Chunk 16: 100%|██████████| 10/10 [00:49<00:00,  4.90s/it]


Training on 10000 samples...
Fitting time: 3.742 seconds
Test accuracy (on previous chunk): 0.2425


Epoch 1, Chunk 17: 100%|██████████| 10/10 [00:47<00:00,  4.79s/it]


Training on 10000 samples...
Fitting time: 3.924 seconds
Test accuracy (on previous chunk): 0.2345


Epoch 1, Chunk 18: 100%|██████████| 10/10 [00:46<00:00,  4.61s/it]


Training on 10000 samples...
Fitting time: 3.704 seconds
Test accuracy (on previous chunk): 0.2075


Epoch 1, Chunk 19: 100%|██████████| 10/10 [00:43<00:00,  4.30s/it]


Training on 10000 samples...
Fitting time: 3.678 seconds
Test accuracy (on previous chunk): 0.2965


Epoch 1, Chunk 20: 100%|██████████| 10/10 [00:45<00:00,  4.57s/it]


Training on 10000 samples...
Fitting time: 4.093 seconds
Test accuracy (on previous chunk): 0.3035


Epoch 1, Chunk 21: 100%|██████████| 10/10 [00:47<00:00,  4.77s/it]


Training on 10000 samples...
Fitting time: 4.186 seconds
Test accuracy (on previous chunk): 0.2355


Epoch 1, Chunk 22: 100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


Training on 10000 samples...
Fitting time: 3.794 seconds
Test accuracy (on previous chunk): 0.1580


Epoch 1, Chunk 23: 100%|██████████| 10/10 [00:47<00:00,  4.70s/it]


Training on 10000 samples...
Fitting time: 3.820 seconds
Test accuracy (on previous chunk): 0.1820


Epoch 1, Chunk 24: 100%|██████████| 10/10 [00:41<00:00,  4.16s/it]


Training on 10000 samples...
Fitting time: 3.796 seconds
Test accuracy (on previous chunk): 0.2940


Epoch 1, Chunk 25: 100%|██████████| 10/10 [00:47<00:00,  4.78s/it]


Training on 10000 samples...
Fitting time: 3.901 seconds
Test accuracy (on previous chunk): 0.1520


Epoch 1, Chunk 26: 100%|██████████| 10/10 [00:50<00:00,  5.01s/it]


Training on 10000 samples...
Fitting time: 4.047 seconds
Test accuracy (on previous chunk): 0.4250


Epoch 1, Chunk 27: 100%|██████████| 10/10 [00:51<00:00,  5.11s/it]


Training on 10000 samples...
Fitting time: 3.845 seconds
Test accuracy (on previous chunk): 0.2065


Epoch 1, Chunk 28: 100%|██████████| 9/9 [00:43<00:00,  4.89s/it]


Training on 9000 samples...
Fitting time: 3.575 seconds
Test accuracy (on previous chunk): 0.1760

Epoch 1 complete
Samples processed: 289000
Mean accuracy: 0.2832
Recent accuracies: [0.294, 0.152, 0.425, 0.2065, 0.176]
Time: 31.46 minutes

=== Epoch 2/2 ===


Epoch 2, Chunk 0: 100%|██████████| 10/10 [00:47<00:00,  4.73s/it]


Training on 10000 samples...
Fitting time: 3.986 seconds
Test accuracy (on previous chunk): 0.2930


Epoch 2, Chunk 1: 100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


Training on 10000 samples...
Fitting time: 3.866 seconds
Test accuracy (on previous chunk): 0.2130


Epoch 2, Chunk 2: 100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


Training on 10000 samples...
Fitting time: 4.044 seconds
Test accuracy (on previous chunk): 0.2520


Epoch 2, Chunk 3: 100%|██████████| 10/10 [00:43<00:00,  4.35s/it]


Training on 10000 samples...
Fitting time: 3.836 seconds
Test accuracy (on previous chunk): 0.1680


Epoch 2, Chunk 4: 100%|██████████| 10/10 [00:46<00:00,  4.66s/it]


Training on 10000 samples...
Fitting time: 3.654 seconds
Test accuracy (on previous chunk): 0.2815


Epoch 2, Chunk 5: 100%|██████████| 10/10 [00:46<00:00,  4.64s/it]


Training on 10000 samples...
Fitting time: 3.672 seconds
Test accuracy (on previous chunk): 0.1865


Epoch 2, Chunk 6: 100%|██████████| 10/10 [00:45<00:00,  4.52s/it]


Training on 10000 samples...
Fitting time: 3.867 seconds
Test accuracy (on previous chunk): 0.3510


Epoch 2, Chunk 7: 100%|██████████| 10/10 [00:53<00:00,  5.35s/it]


Training on 10000 samples...
Fitting time: 3.771 seconds
Test accuracy (on previous chunk): 0.2725


Epoch 2, Chunk 8: 100%|██████████| 10/10 [00:44<00:00,  4.45s/it]


Training on 10000 samples...
Fitting time: 3.761 seconds
Test accuracy (on previous chunk): 0.2800


Epoch 2, Chunk 9: 100%|██████████| 10/10 [00:43<00:00,  4.32s/it]


Training on 10000 samples...
Fitting time: 3.687 seconds
Test accuracy (on previous chunk): 0.1855


Epoch 2, Chunk 10: 100%|██████████| 10/10 [00:54<00:00,  5.46s/it]


Training on 10000 samples...
Fitting time: 10.391 seconds
Test accuracy (on previous chunk): 0.1535


Epoch 2, Chunk 11: 100%|██████████| 10/10 [00:43<00:00,  4.34s/it]


Training on 10000 samples...
Fitting time: 3.620 seconds
Test accuracy (on previous chunk): 0.3105


Epoch 2, Chunk 12: 100%|██████████| 10/10 [00:44<00:00,  4.45s/it]


Training on 10000 samples...
Fitting time: 3.812 seconds
Test accuracy (on previous chunk): 0.2715


Epoch 2, Chunk 13: 100%|██████████| 10/10 [00:46<00:00,  4.68s/it]


Training on 10000 samples...
Fitting time: 3.848 seconds
Test accuracy (on previous chunk): 0.3745


Epoch 2, Chunk 14: 100%|██████████| 10/10 [00:43<00:00,  4.32s/it]


Training on 10000 samples...
Fitting time: 3.706 seconds
Test accuracy (on previous chunk): 0.1430


Epoch 2, Chunk 15: 100%|██████████| 10/10 [00:42<00:00,  4.26s/it]


Training on 10000 samples...
Fitting time: 3.806 seconds
Test accuracy (on previous chunk): 0.3105


Epoch 2, Chunk 16: 100%|██████████| 10/10 [00:45<00:00,  4.53s/it]


Training on 10000 samples...
Fitting time: 3.627 seconds
Test accuracy (on previous chunk): 0.2895


Epoch 2, Chunk 17: 100%|██████████| 10/10 [00:59<00:00,  5.93s/it]


Training on 10000 samples...
Fitting time: 3.842 seconds
Test accuracy (on previous chunk): 0.2335


Epoch 2, Chunk 18: 100%|██████████| 10/10 [00:45<00:00,  4.52s/it]


Training on 10000 samples...
Fitting time: 3.719 seconds
Test accuracy (on previous chunk): 0.1970


Epoch 2, Chunk 19: 100%|██████████| 10/10 [00:46<00:00,  4.63s/it]


Training on 10000 samples...
Fitting time: 3.666 seconds
Test accuracy (on previous chunk): 0.1895


Epoch 2, Chunk 20: 100%|██████████| 10/10 [00:44<00:00,  4.47s/it]


Training on 10000 samples...
Fitting time: 3.747 seconds
Test accuracy (on previous chunk): 0.1380


Epoch 2, Chunk 21: 100%|██████████| 10/10 [00:45<00:00,  4.52s/it]


Training on 10000 samples...
Fitting time: 3.788 seconds
Test accuracy (on previous chunk): 0.2340


Epoch 2, Chunk 22: 100%|██████████| 10/10 [00:47<00:00,  4.73s/it]


Training on 10000 samples...
Fitting time: 3.830 seconds
Test accuracy (on previous chunk): 0.1715


Epoch 2, Chunk 23: 100%|██████████| 10/10 [00:47<00:00,  4.75s/it]


Training on 10000 samples...
Fitting time: 4.240 seconds
Test accuracy (on previous chunk): 0.2255


Epoch 2, Chunk 24: 100%|██████████| 10/10 [00:47<00:00,  4.75s/it]


Training on 10000 samples...
Fitting time: 3.650 seconds
Test accuracy (on previous chunk): 0.1710


Epoch 2, Chunk 25: 100%|██████████| 10/10 [00:43<00:00,  4.39s/it]


Training on 10000 samples...
Fitting time: 3.977 seconds
Test accuracy (on previous chunk): 0.1680


Epoch 2, Chunk 26: 100%|██████████| 10/10 [00:44<00:00,  4.46s/it]


Training on 10000 samples...
Fitting time: 6.321 seconds
Test accuracy (on previous chunk): 0.3295


Epoch 2, Chunk 27: 100%|██████████| 10/10 [00:52<00:00,  5.23s/it]


Training on 10000 samples...
Fitting time: 3.856 seconds
Test accuracy (on previous chunk): 0.1725


Epoch 2, Chunk 28: 100%|██████████| 9/9 [00:46<00:00,  5.15s/it]


Training on 9000 samples...
Fitting time: 3.284 seconds
Test accuracy (on previous chunk): 0.2100

Epoch 2 complete
Samples processed: 578000
Mean accuracy: 0.2337
Recent accuracies: [0.171, 0.168, 0.3295, 0.1725, 0.21]
Time: 30.11 minutes

Training complete
Final accuracies: [0.1895, 0.138, 0.234, 0.1715, 0.2255, 0.171, 0.168, 0.3295, 0.1725, 0.21]
Total Model Training Time: 89.21959587732951 mins
New model saved!


# PreProcessing Validation Data in Batches

In [24]:
featureValidDirs = [
    "/content/svmBatches/bv1",
    "/content/svmBatches/bv2",
    "/content/svmBatches/bv3"
]

targetValidDir = "/content/svmBatches/bv0"

def batch_parquet(parsedDataset, featnum, features=[], batch_size=1000, max_records=None):

  batch_files = []
  total_recs = 0
  batch_idx = 0
  # Convert dataset to batches
  batched_dataset = parsedDataset.batch(batch_size)
  n_batches = totalValidRecs // batch_size
  for batch in tqdm(batched_dataset,total=n_batches,desc="Batches to Parquet"):
    # Process each batch
    batch_dict = {key: [] for key in features}

    # Get batch size (might be smaller for the last batch)
    current_batch_size = tf.shape(next(iter(batch.values())))[0].numpy()

    # Extract features for each record in the batch
    for i in range(current_batch_size):
      for key in features:
        # Handle different types of features
        feature = batch[key][i]
        if isinstance(feature, tf.Tensor):
          value = feature.numpy()
          # Convert bytes to string if applicable
          if isinstance(value, bytes):
            value = value.decode('utf-8')
        else:
            value = feature
        batch_dict[key].append(value)

    # Create DataFrame for this batch and append to list
    df_batch = pd.DataFrame(batch_dict)

    # Write to parquet
    batch_file = f"/content/svmBatches/bv{featnum}/batch_{batch_idx}.parquet"
    batch_idx += 1
    df_batch.to_parquet(batch_file)
    batch_files.append(batch_file)

    # Update record count
    total_recs += current_batch_size

    # Check if we've reached the maximum number of records
    if max_records is not None and total_recs >= max_records:
        break
  return
# Usage:
# NOTE: returns arent used, these need modification at some point
# process features
batch_parquet(validDS, 1, feat1, batch_size=1000,max_records=totalValidRecs)
batch_parquet(validDS, 2, feat2, batch_size=1000,max_records=totalValidRecs)
batch_parquet(validDS, 3, feat3, batch_size=1000,max_records=totalValidRecs)

# process target
batch_parquet(validDS, 0, target, batch_size=1000,max_records=totalValidRecs)


Batches to Parquet: 100%|██████████| 12/12 [00:34<00:00,  2.85s/it]
Batches to Parquet: 100%|██████████| 12/12 [00:09<00:00,  1.20it/s]
Batches to Parquet: 100%|██████████| 12/12 [00:38<00:00,  3.20s/it]
Batches to Parquet: 100%|██████████| 12/12 [00:06<00:00,  1.97it/s]


## Validate SVM Model

In [25]:
from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score,
                            classification_report, confusion_matrix, matthews_corrcoef,
                            roc_auc_score, roc_curve, auc)
import os
#model = joblib.load('/content/drive/MyDrive/svm2_with_scaler.pkl')

featureValidDirs = [
    "/content/svmBatches/bv1",
    "/content/svmBatches/bv2",
    "/content/svmBatches/bv3"
]

targetValidDir = "/content/svmBatches/bv0"

def validate_model(model_path,scaler_path, features_dir, target_dir, num_batches, batch_size=12):

    model = joblib.load(model_path)
    # Check if it's a tuple and extract the model
    if isinstance(model, tuple):
        # Assume the model is the first element
        model = model[0]
    else:
        model = model
    scaler = joblib.load(scaler_path)
    # Track validation progress
    all_predictions = []
    all_true_labels = []
    n_samples_processed = 0

    #is_scaler_fitted = False
    print(f"\n=== Starting Validation ===")

    for chunk_start in range(0, num_batches, batch_size):
        chunk_end = min(chunk_start + batch_size, num_batches)
        chunk_indices = range(chunk_start, chunk_end)

        X_chunk_list = []
        y_chunk_list = []

        # Load batches in this chunk
        for batch_idx in tqdm(chunk_indices, desc=f"Validation Chunk {chunk_start//batch_size}"):
            X_batch, y_batch = load_batch_from_multiple_dirs(batch_idx, features_dir, target_dir)

            if X_batch is not None and y_batch is not None:
                X_chunk_list.append(X_batch)
                y_chunk_list.append(y_batch)
        gc.collect()

        if X_chunk_list:
            # Stack all batches in chunk
            X_chunk = np.vstack(X_chunk_list)
            y_chunk = np.concatenate(y_chunk_list)

            # Scale features using the provided scaler
            X_chunk_scaled = scaler.transform(X_chunk)

            # Make predictions
            print(f"Predicting on {X_chunk.shape[0]} samples...")
            chunk_predictions = model.predict(X_chunk_scaled)

            # Collect results
            all_predictions.extend(chunk_predictions)
            all_true_labels.extend(y_chunk)
            n_samples_processed += X_chunk.shape[0]

            # Calculate chunk accuracy for progress monitoring
            chunk_acc = accuracy_score(y_chunk, chunk_predictions)
            print(f"Chunk accuracy: {chunk_acc:.4f}")

            del X_chunk, X_chunk_scaled, y_chunk, chunk_predictions
            gc.collect()

    # Convert to numpy arrays for final metrics
    all_predictions = np.array(all_predictions)
    all_true_labels = np.array(all_true_labels)
    metrics = {}

    # Basic metrics
    metrics['accuracy'] = accuracy_score(all_true_labels, all_predictions)

    # F1 scores - multiple averaging methods
    metrics['f1_weighted'] = f1_score(all_true_labels, all_predictions, average='weighted')
    metrics['f1_macro'] = f1_score(all_true_labels, all_predictions, average='macro')
    metrics['f1_micro'] = f1_score(all_true_labels, all_predictions, average='micro')

    # Precision and Recall
    metrics['precision_weighted'] = precision_score(all_true_labels, all_predictions, average='weighted')
    metrics['precision_macro'] = precision_score(all_true_labels, all_predictions, average='macro')


    # Per-class metrics
    unique_classes = np.unique(all_true_labels)
    metrics['per_class_f1'] = f1_score(all_true_labels, all_predictions, average=None)
    metrics['per_class_precision'] = precision_score(all_true_labels, all_predictions, average=None)


    # Print summary

    print(f"\n=== Validation Complete ===")
    print(f"Total samples processed: {n_samples_processed}")
    print(f"Overall accuracy: {metrics['accuracy']:.4f}")
    print(f"F1-score (weighted): {metrics['f1_weighted']:.4f}")
    print(f"F1-score (macro): {metrics['f1_macro']:.4f}")
    print(f"F1-score (micro): {metrics['f1_micro']:.4f}")

    # Precision scores
    print(f"Precision (weighted): {metrics['precision_weighted']:.4f}")
    print(f"Precision (macro): {metrics['precision_macro']:.4f}")

    print("\nPer-class F1 scores:")
    for i, class_label in enumerate(unique_classes):
        print(f"  Class {class_label}: {metrics['per_class_f1'][i]:.4f}")

    # Return comprehensive results
    return {
        'metrics': metrics,
        'predictions': all_predictions,
        'true_labels': all_true_labels,
        'n_samples': n_samples_processed
    }

# Run evaluation
results = validate_model(
    '/content/drive/MyDrive/svm3_with_scaler.pkl',
    '/content/drive/MyDrive/svm3_scaler.pkl',
    featureValidDirs,
    targetValidDir,
    12
)


=== Starting Validation ===


Validation Chunk 0: 100%|██████████| 12/12 [00:48<00:00,  4.01s/it]


Predicting on 12000 samples...
Chunk accuracy: 0.1134

=== Validation Complete ===
Total samples processed: 12000
Overall accuracy: 0.1134
F1-score (weighted): 0.1265
F1-score (macro): 0.1007
F1-score (micro): 0.1134
Precision (weighted): 0.2752
Precision (macro): 0.2369

Per-class F1 scores:
  Class 0: 0.2595
  Class 1: 0.0954
  Class 2: 0.0310
  Class 3: 0.0269
  Class 4: 0.0120
  Class 5: 0.1796
  Class 6: 0.2387
  Class 7: 0.0741
  Class 8: 0.1678
  Class 10: 0.0000


# PreProcessing Test Data In Batches

In [27]:
featureTestDirs = [
    "/content/svmBatches/bt1",
    "/content/svmBatches/bt2",
    "/content/svmBatches/bt3"
]

targetTestDir = "/content/svmBatches/bt0"

def batch_parquet(parsedDataset, featnum, features=[], batch_size=1000, max_records=None):

  batch_files = []
  total_recs = 0
  batch_idx = 0

  # Convert dataset to batches
  batched_dataset = parsedDataset.batch(batch_size)
  n_batches = totalTestRecs // batch_size

  for batch in tqdm(batched_dataset,total=n_batches,desc="Batches to Parquet"):
    # Process each batch
    batch_dict = {key: [] for key in features}

    # Get batch size (might be smaller for the last batch)
    current_batch_size = tf.shape(next(iter(batch.values())))[0].numpy()

    # Extract features for each record in the batch
    for i in range(current_batch_size):
      for key in features:

        # Handle different types of features
        feature = batch[key][i]
        if isinstance(feature, tf.Tensor):
          value = feature.numpy()

          # Convert bytes to string if applicable
          if isinstance(value, bytes):
            value = value.decode('utf-8')
        else:
            value = feature
        batch_dict[key].append(value)

    # Create DataFrame for this batch and append to list
    df_batch = pd.DataFrame(batch_dict)

    # Write to parquet
    batch_file = f"/content/svmBatches/bt{featnum}/batch_{batch_idx}.parquet"
    batch_idx += 1
    df_batch.to_parquet(batch_file)
    batch_files.append(batch_file)

    # Update record count
    total_recs += current_batch_size

    # Check if we've reached the maximum number of records
    if max_records is not None and total_recs >= max_records:
        break
  return

# process features
batch_parquet(testDS, 1, feat1, batch_size=1000,max_records=totalTestRecs)
batch_parquet(testDS, 2, feat2, batch_size=1000,max_records=totalTestRecs)
batch_parquet(testDS, 3, feat3, batch_size=1000,max_records=totalTestRecs)

# process target
batch_parquet(testDS, 0, target, batch_size=1000,max_records=totalTestRecs)


Batches to Parquet: 100%|██████████| 4/4 [00:11<00:00,  2.85s/it]
Batches to Parquet: 100%|██████████| 4/4 [00:03<00:00,  1.13it/s]
Batches to Parquet: 100%|██████████| 4/4 [00:16<00:00,  4.14s/it]
Batches to Parquet: 100%|██████████| 4/4 [00:02<00:00,  1.79it/s]


# Test Model

In [30]:

def test_model(model_path,scaler_path, features_dir, target_dir, num_batches, batch_size=12):

    model = joblib.load(model_path)
    # Check if it's a tuple and extract the model
    if isinstance(model, tuple):
        # Assume the model is the first element
        model = model[0]
    else:
        model = model
    scaler = joblib.load(scaler_path)
    # Track progress
    all_predictions = []
    all_true_labels = []
    n_samples_processed = 0

    print(f"\n=== Starting Test ===")

    for chunk_start in range(0, num_batches, batch_size):
        chunk_end = min(chunk_start + batch_size, num_batches)
        chunk_indices = range(chunk_start, chunk_end)

        X_chunk_list = []
        y_chunk_list = []

        # Load batches in this chunk
        for batch_idx in tqdm(chunk_indices, desc=f"Test Chunk {chunk_start//batch_size}"):
            X_batch, y_batch = load_batch_from_multiple_dirs(batch_idx, features_dir, target_dir)

            if X_batch is not None and y_batch is not None:
                X_chunk_list.append(X_batch)
                y_chunk_list.append(y_batch)
        gc.collect()

        if X_chunk_list:
            # Stack all batches in chunk
            X_chunk = np.vstack(X_chunk_list)
            y_chunk = np.concatenate(y_chunk_list)

            # Scale features using scaler
            X_chunk_scaled = scaler.transform(X_chunk)

            # Make predictions
            print(f"Predicting on {X_chunk.shape[0]} samples...")
            chunk_predictions = model.predict(X_chunk_scaled)

            # Collect results
            all_predictions.extend(chunk_predictions)
            all_true_labels.extend(y_chunk)
            n_samples_processed += X_chunk.shape[0]

            # Calculate chunk accuracy
            chunk_acc = accuracy_score(y_chunk, chunk_predictions)
            print(f"Chunk accuracy: {chunk_acc:.4f}")

            del X_chunk, X_chunk_scaled, y_chunk, chunk_predictions
            gc.collect()

    # Convert to numpy arrays for final metrics
    all_predictions = np.array(all_predictions)
    all_true_labels = np.array(all_true_labels)
    metrics = {}

    # Basic metrics
    metrics['accuracy'] = accuracy_score(all_true_labels, all_predictions)

    # F1 scores
    metrics['f1_weighted'] = f1_score(all_true_labels, all_predictions, average='weighted')
    metrics['f1_macro'] = f1_score(all_true_labels, all_predictions, average='macro')
    metrics['f1_micro'] = f1_score(all_true_labels, all_predictions, average='micro')

    # Precision and Recall
    metrics['precision_weighted'] = precision_score(all_true_labels, all_predictions, average='weighted')
    metrics['recall_weighted'] = recall_score(all_true_labels, all_predictions, average='weighted')
    metrics['precision_macro'] = precision_score(all_true_labels, all_predictions, average='macro')
    metrics['recall_macro'] = recall_score(all_true_labels, all_predictions, average='macro')

    # Per-class metrics
    unique_classes = np.unique(all_true_labels)
    metrics['per_class_f1'] = f1_score(all_true_labels, all_predictions, average=None)
    metrics['per_class_precision'] = precision_score(all_true_labels, all_predictions, average=None)
    metrics['per_class_recall'] = recall_score(all_true_labels, all_predictions, average=None)

    # Print summary

    print(f"\n=== Test Complete ===")
    print(f"Total samples processed: {n_samples_processed}")
    print(f"Overall accuracy: {metrics['accuracy']:.4f}")
    print(f"F1-score (weighted): {metrics['f1_weighted']:.4f}")
    print(f"F1-score (macro): {metrics['f1_macro']:.4f}")
    print(f"F1-score (micro): {metrics['f1_micro']:.4f}")

    # Precision scores
    print(f"Precision (weighted): {metrics['precision_weighted']:.4f}")
    print(f"Precision (macro): {metrics['precision_macro']:.4f}")


    # Recall scores
    print(f"Recall (weighted): {metrics['recall_weighted']:.4f}")
    print(f"Recall (macro): {metrics['recall_macro']:.4f}")

    print("\nPer-class F1 scores:")
    for i, class_label in enumerate(unique_classes):
        print(f"  Class {class_label}: {metrics['per_class_f1'][i]:.4f}")

    # Return comprehensive results
    return {
        'metrics': metrics,
        'predictions': all_predictions,
        'true_labels': all_true_labels,
        'n_samples': n_samples_processed
    }

# Run test evaluation
results = test_model(
    '/content/drive/MyDrive/svm3_with_scaler.pkl',
    '/content/drive/MyDrive/svm3_scaler.pkl',
    featureTestDirs,
    targetTestDir,
    5
)


=== Starting Test ===


Test Chunk 0: 100%|██████████| 5/5 [00:18<00:00,  3.64s/it]


Predicting on 4096 samples...
Chunk accuracy: 0.1086

=== Test Complete ===
Total samples processed: 4096
Overall accuracy: 0.1086
F1-score (weighted): 0.1159
F1-score (macro): 0.0971
F1-score (micro): 0.1086
Precision (weighted): 0.2673
Precision (macro): 0.2425
Recall (weighted): 0.1086
Recall (macro): 0.0995

Per-class F1 scores:
  Class 0: 0.2306
  Class 1: 0.0733
  Class 2: 0.0633
  Class 3: 0.0365
  Class 4: 0.0101
  Class 5: 0.1674
  Class 6: 0.2305
  Class 7: 0.0584
  Class 8: 0.1609
  Class 10: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
