# Iterative Sampling with Augmentation
- In this kernel, we will be performing **Iterative Sampling with Augmentation**, an approach borrowed from a paper entitled "Increasing Data Diversity with Iterative Sampling to Improve Performance".
- For this approach, we require a pool of Augmented Images (for convenience, we will refer to it as Pool). The **Pool** contains only augmented samples and not the base images.
- The research paper has used many different augmentation techniques for creating the **Pool**. However, for starters, we will be using **random-sampling based augmentation**, using both Traditional Augmentation and GAN-based Augmentation techniques. 

### References
- Cavusoglu, Devrim, Ogulcan Eryuksel, and Sinan Altinuc. "Increasing Data Diversity with Iterative Sampling to Improve Performance." arXiv preprint arXiv:2111.03743 (2021).
- [Convert .CSV file to Images](https://medium.com/lifeandtech/convert-csv-file-to-images-309b6fdb8c49)

# 1. Importing the Packages & Boilerplate Code

In [1]:
!pip install fiftyone

Collecting fiftyone
  Downloading fiftyone-0.16.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting aiofiles
  Downloading aiofiles-0.8.0-py3-none-any.whl (13 kB)
Collecting sse-starlette<1,>=0.10.3
  Downloading sse_starlette-0.10.3-py3-none-any.whl (8.0 kB)
Collecting ndjson
  Downloading ndjson-0.3.1-py2.py3-none-any.whl (5.3 kB)
Collecting strawberry-graphql==0.96.0
  Downloading strawberry_graphql-0.96.0-py3-none-any.whl (135 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.0/135.0 KB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fiftyone-brain<0.9,>=0.8
  Downloading fiftyone_brain-0.8.2-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.9/47.9 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)

In [2]:
import os
import sys
import json
import random
import numpy as np
import pandas as pd
import cv2
import seaborn as sns
import matplotlib.pyplot as plt
import fiftyone as fo
import fiftyone.brain as fob

from tqdm import tqdm
from shutil import copyfile
from tabulate import tabulate
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, f1_score

# https://www.kaggle.com/c/ventilator-pressure-prediction/discussion/274717
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

import tensorflow as tf
import tensorflow.keras.layers as tfl
import tensorflow.keras.backend as K

NumExpr defaulting to 4 threads.
Migrating database to v0.16.1


In [3]:
# Setting the seeds
SEED = 0
os.environ['PYTHONHASHSEED']=str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [4]:
# Making sure that Tensorflow is able to detect the GPU
device_name = tf.test.gpu_device_name()
if "GPU" not in device_name:
    print("GPU device not found")
print('Found GPU at: {}'.format(device_name))

GPU device not found
Found GPU at: 


# 2. Importing the Train and Test Sets

In [5]:
# Importing the Labelled Training Dataset
print("For Train Dataset:")
df_train = pd.read_csv("../input/cifar10/train_lab_x.csv")
y_train = pd.read_csv("../input/cifar10/train_lab_y.csv")
df_train = np.array(df_train)
y_train = np.array(y_train)
y_train_oh = tf.one_hot(np.ravel(y_train), depth = 10)
print(df_train.shape, y_train.shape, y_train_oh.shape)

# Reshaping, rescaling and one-hot encoding
df_train = np.reshape(df_train, (-1, 3, 32, 32))
df_train = np.transpose(np.array(df_train), (0, 2, 3, 1))
df_train = df_train / 255
print(df_train.shape)

# Importing the Test Dataset
print("For Test Dataset:")
df_test = pd.read_csv("../input/cifar10/test_x.csv")
y_test = pd.read_csv("../input/cifar10/test_y.csv")
df_test = np.array(df_test)
y_test = np.array(y_test)
print(df_test.shape, y_test.shape)

# Reshaping the dataset
df_test = np.reshape(df_test, (-1, 3, 32, 32))
print(df_test.shape)

# Reshaping, rescaling and one-hot encoding
df_test = np.transpose(np.array(df_test), (0, 2, 3, 1))
df_test = df_test / 255
y_test_oh = tf.one_hot(np.ravel(y_test), depth = 10)
print(df_test.shape, y_test_oh.shape)

For Train Dataset:
(40006, 3072) (40006, 1) (40006, 10)
(40006, 32, 32, 3)
For Test Dataset:
(10000, 3072) (10000, 1)
(10000, 3, 32, 32)
(10000, 32, 32, 3) (10000, 10)


# 3. Performing the Augmentations on the Training Set (Creating the Pool)
## 3.1. GAN-Based Augmentation

In [6]:
df_gan_aug = pd.read_csv("../input/cifar10/df_25per_gan_aug.csv")
y_gan_aug = pd.read_csv("../input/cifar10/y_25per_gan_aug.csv")
df_gan_aug = np.array(df_gan_aug)
y_gan_aug = np.array(y_gan_aug)

# Reshaping, rescaling and one-hot encoding
df_gan_aug = np.reshape(df_gan_aug, (-1, 3, 32, 32))
df_gan_aug = np.transpose(np.array(df_gan_aug), (0, 2, 3, 1))
print(df_gan_aug.shape, y_gan_aug.shape)

(10016, 32, 32, 3) (10016, 1)


## 3.2. Traditional Augmentation

In [7]:
df_trad_aug = pd.read_csv("../input/cifar10/df_25per_trad_aug.csv")
y_trad_aug = pd.read_csv("../input/cifar10/y_25per_trad_aug.csv")
df_trad_aug = np.array(df_trad_aug)
y_trad_aug = np.array(y_trad_aug)

# Reshaping, rescaling and one-hot encoding
df_trad_aug = np.reshape(df_trad_aug, (-1, 32, 32, 3))
print(df_trad_aug.shape, y_trad_aug.shape)

(10045, 32, 32, 3) (10045, 1)


## 3.3. Creating the Pool

In [8]:
# Concatenating the Augmented Datasets
df_pool = np.concatenate([df_gan_aug, df_trad_aug], axis=0)
y_pool = np.concatenate([y_gan_aug, y_trad_aug], axis=0)

# Creating a random permutation & shuffling the dataset
perm = np.random.permutation(df_pool.shape[0])
df_pool = np.array(df_pool[perm, : , : , : ])
y_pool = y_pool[perm]

# One-Hot Encoding
y_pool_oh = tf.one_hot(np.ravel(y_pool), depth = 10)
print(df_pool.shape, y_pool.shape, y_pool_oh.shape)

(20061, 32, 32, 3) (20061, 1) (20061, 10)


In [9]:
# # Code to create a 5*5 grid of images, along with their labels
# fig, ax = plt.subplots(5, 5, figsize = (7, 7))
# fig.tight_layout()

# for ind in range(25):
#     example = df_pool[ind, : , : , : ]
#     axis = ax[ind // 5][ind % 5]
#     axis.get_xaxis().set_visible(False)
#     axis.get_yaxis().set_visible(False)
#     axis.imshow(example)
#     axis.set(title = str(y_pool[ind]))

# 4. Creating Images and Labels (JSON) in a Directory

In [10]:
# TRAIN_IMGS_PATH = r'./train/'
# POOL_IMGS_PATH = r'./pool/'

# Making Directories if they don't exist
# try:
#     os.mkdir(TRAIN_IMGS_PATH)
#     os.mkdir(POOL_IMGS_PATH)
# except:
#     pass

# def create_dir_images(data_csv, path, is_train = True):
#     """A function to create images from CSV file"""
#     for i in tqdm(range(len(data_csv))):
#         img = data_csv[i, : , : , : ] * 255
#         img = img.astype(np.uint8)
#         if is_train:
#             cv2.imwrite(path + str(i) + '_train.png', img)
#         else:
#             cv2.imwrite(path + str(i) + '_pool.png', img)  

# Create Training & Pool Images
# create_dir_images(df_train, TRAIN_IMGS_PATH, True)
# create_dir_images(df_pool, POOL_IMGS_PATH, False)

# Create JSON files for the corresponding labels
# y_train = pd.Series(np.squeeze(y_train))
# y_train.to_json("./y_train.json")
# y_pool = pd.Series(np.squeeze(y_pool))
# y_pool.to_json("./y_pool.json")

In [11]:
# Zipping the files for download
# !tar chvfz data.tar.gz "./"

In [12]:
# Importing the Train Dataset
train_name = "train-dataset"
train_dataset_dir = "../input/cifar10/iter_samp/train_data"

# Create the Voxel training dataset
train_dataset = fo.Dataset.from_dir(
    dataset_dir=train_dataset_dir,
    dataset_type=fo.types.FiftyOneImageClassificationDataset,
    name=train_name,
)

# Importing the Pool Dataset
pool_name = "pool-dataset"
pool_dataset_dir = "../input/cifar10/iter_samp/pool_data"

# Create the Voxel pool dataset
pool_dataset = fo.Dataset.from_dir(
    dataset_dir=pool_dataset_dir,
    dataset_type=fo.types.FiftyOneImageClassificationDataset,
    name=pool_name,
)

 100% |█████████████| 40006/40006 [30.6s elapsed, 0s remaining, 1.1K samples/s]       
 100% |█████████████| 20061/20061 [15.0s elapsed, 0s remaining, 1.4K samples/s]      


# 5. Performing Iterative Sampling
## 5.1. Defining a function to get class-wise samples

In [13]:
def get_class_samples(data, progress = True):
    """A function to get the class-wise indices for a dataset"""
    
    ## Initialization
    num_examples = np.zeros((10,))
    # Creating a list of lists for storing the indices of data-points in the dataset, class-wise
    classes_ind = []
    for i in range(10):
        classes_ind.append([])
    
    ## Iterating over the dataset
    for sample in data.iter_samples(progress=progress):
        label = int(sample['ground_truth']['label'])
        voxel_id = sample['id']
        num_examples[label] += 1
        classes_ind[label].append(voxel_id)
    
    ## Number of examples from each class
    num_exa = num_examples.astype('int32')
    return num_exa, classes_ind

# Example
num_exa, classes_ind = get_class_samples(train_dataset[:50])
print(num_exa, len(classes_ind), len(classes_ind[0]))

 100% |███████████████████| 50/50 [24.5ms elapsed, 0s remaining, 2.0K samples/s] 
[ 5  0 45  0  0  0  0  0  0  0] 10 5


## 5.2. Defining a function to get the embeddings
- We will be using the default embeddings for the images, generated by the FiftyOne's library functions. They are generated using the **MobileNetv2** model.

## 5.3. Defining the Similarity Function
- For this, we will be using the [FiftyOne](https://voxel51.com/docs/fiftyone/) library.

#### References
- [Compute Similarity from data-points and embeddings](https://voxel51.com/docs/fiftyone/api/fiftyone.brain.html#fiftyone.brain.compute_similarity)
- [Find duplicates based on threshold/fraction](https://voxel51.com/docs/fiftyone/api/fiftyone.brain.similarity.html#fiftyone.brain.similarity.SimilarityResults.find_duplicates)
- [Select samples from a dataset](https://voxel51.com/docs/fiftyone/api/fiftyone.core.dataset.html#fiftyone.core.dataset.Dataset.select)
- [Add samples to a dataset](https://voxel51.com/docs/fiftyone/api/fiftyone.core.dataset.html#fiftyone.core.dataset.Dataset.add_samples)
- [Remove samples from a dataset](https://voxel51.com/docs/fiftyone/api/fiftyone.core.dataset.html#fiftyone.core.dataset.Dataset.delete_samples)

## 5.4. Running the Iterative Sampling Algorithm

In [14]:
num_train_samples, train_classes = get_class_samples(train_dataset)
num_pool_samples, pool_classes = get_class_samples(pool_dataset)
print(num_train_samples, num_pool_samples)

# We will consider the `max_sizes` equal to the number of samples for each 
# class with which we started
max_sizes = 5000

 100% |█████████████| 40006/40006 [18.0s elapsed, 0s remaining, 2.4K samples/s]      
 100% |█████████████| 20061/20061 [8.6s elapsed, 0s remaining, 2.4K samples/s]        
[4109 3839 4022 4116 4312 3952 4290 3552 3436 4378] [2073 1917 2026 2021 2177 1975 2184 1783 1691 2214]


In [15]:
classes = np.arange(10)
max_iterations = 5
threshold = 5

- When we delete samples from train_dataset, train_view gets automatically updated. But when we add samples to the dataset, we need to re-initialize the view. For reference, refer to the example mentioned towards the end of this kernel.

In [16]:
for clas in classes:
    print(f"For Class {clas}:")
    num_train_samples, train_classes = get_class_samples(train_dataset, progress = False)
    num_pool_samples, pool_classes = get_class_samples(pool_dataset, progress = False)
    
    train_view = train_dataset.select(train_classes[clas])
    
    # Ensuring that the train dataset has at least 1 sample for this class
    if len(train_view) > 0:
        iteration = 1
        results = fob.compute_similarity(train_view)
        results.find_duplicates(thresh = threshold)
        duplicate_ids = results.duplicate_ids

        while len(duplicate_ids) > 0 or iteration <= max_iterations:
            print(f"For iteration {iteration}, # Duplicate Images = {len(duplicate_ids)}")
            train_dataset.delete_samples(duplicate_ids)
            m = max_sizes - len(train_view)
            
            # Making sure that the pool dataset has enough samples
            if num_pool_samples[clas] >= m:
                pass
            elif num_pool_samples[clas] > 0 and num_pool_samples[clas] < m:
                # Decrease the value of m
                m = num_pool_samples[clas]
            else:
                duplicate_ids = []
                iteration += 1
                continue
                
            m_ids = random.choices(pool_classes[clas], k = m)
            pool_samples = pool_dataset.select(m_ids)
            train_dataset.add_samples(pool_samples)
            pool_dataset.delete_samples(m_ids)

            # Updating the variables
            num_train_samples, train_classes = get_class_samples(train_dataset, progress = False)
            num_pool_samples, pool_classes = get_class_samples(pool_dataset, progress = False)
            
            # Finding duplicates again
            duplicate_ids = []
            train_view = train_dataset.select(train_classes[clas])
            if len(train_view) > 0:
                results = fob.compute_similarity(train_view)
                results.find_duplicates(thresh = threshold)
                duplicate_ids = results.duplicate_ids
            
            iteration += 1
    
    print()

For Class 0:
Downloading model from 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth'...
 100% |████|  108.4Mb/108.4Mb [330.1ms elapsed, 0s remaining, 328.5Mb/s]     
Computing embeddings...
 100% |███████████████| 4109/4109 [2.5m elapsed, 0s remaining, 26.6 samples/s]      
Generating index...
Index complete
Computing duplicate samples...
Duplicates computation complete
For iteration 1, # Duplicate Images = 4
 100% |█████████████████| 722/722 [920.7ms elapsed, 0s remaining, 786.8 samples/s]      
Computing embeddings...
 100% |███████████████| 4827/4827 [3.0m elapsed, 0s remaining, 25.1 samples/s]      
Generating index...
Index complete
Computing duplicate samples...
Duplicates computation complete
For iteration 2, # Duplicate Images = 8
 100% |█████████████████| 165/165 [229.2ms elapsed, 0s remaining, 719.9 samples/s]     
Computing embeddings...
 100% |███████████████| 4984/4984 [3.1m elapsed, 0s remaining, 28.6 samples/s]      
Generating index...
Index complete
Comp

## 5.5. Preparing the Dataset for modelling

In [17]:
filepaths = []
for sample in train_dataset.iter_samples():
    filepaths.append(sample['filepath'])
    
filepaths = pd.Series(filepaths)
filepaths.to_csv("./filepaths.csv", index = False)
filepaths.head()

0    /kaggle/input/cifar10/iter_samp/train_data/dat...
1    /kaggle/input/cifar10/iter_samp/train_data/dat...
2    /kaggle/input/cifar10/iter_samp/train_data/dat...
3    /kaggle/input/cifar10/iter_samp/train_data/dat...
4    /kaggle/input/cifar10/iter_samp/train_data/dat...
dtype: object

In [18]:
# Initializing the variables to store the images and labels
iter_samp_df = []
iter_samp_y = []

# # Loading the JSON files and extracting the labels
# train_json = json.load(open("../input/cifar10/iter_samp/train_data/labels.json"))
# pool_json = json.load(open("../input/cifar10/iter_samp/pool_data/labels.json"))
# train_labels = train_json["labels"]
# pool_labels = pool_json["labels"]

print(df_train.shape, y_train.shape)
print(df_pool.shape, y_pool.shape)
print(type(df_train), type(df_pool))

(40006, 32, 32, 3) (40006, 1)
(20061, 32, 32, 3) (20061, 1)
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [19]:
for file in tqdm(filepaths):
    iden = file.split("/")[-1].split(".")[0]
    is_train = iden.split("_")[-1]
    index = int(iden.split("_")[0])
    # print(file, iden, index, type(index), is_train)
    
    if is_train == "train": 
        img = df_train[index]
        label = y_train[index][0]
    else: 
        img = df_pool[index]
        label = y_pool[index][0]
    img = np.reshape(img, (-1))
    # print(label.shape, img.shape)

    iter_samp_df.append(img)
    iter_samp_y.append(label)
    
iter_samp_df = pd.DataFrame(iter_samp_df)
iter_samp_y = pd.Series(iter_samp_y)
print(iter_samp_df.shape, iter_samp_y.shape)

# Saving the Images and the Labels
iter_samp_df.to_csv("iter_samp_df.csv", index = False)
iter_samp_y.to_csv("iter_samp_y.csv", index = False)

100%|██████████| 50000/50000 [00:12<00:00, 3991.06it/s]


(50000, 3072) (50000,)


In [20]:
# Reshaping the Dataset
iter_samp_df = np.reshape(np.array(iter_samp_df), (-1, 32, 32, 3))

# Creating a random permutation
perm = np.random.permutation(iter_samp_df.shape[0])

# Shuffling the training dataset
iter_samp_df = iter_samp_df[perm, : , : , : ]
iter_samp_y = iter_samp_y[perm]
iter_samp_oh = tf.one_hot(np.ravel(iter_samp_y), depth = 10)
print(iter_samp_df.shape, iter_samp_y.shape, iter_samp_oh.shape)

(50000, 32, 32, 3) (50000,) (50000, 10)


## 5.6. Training the Baseline Model

In [21]:
# Importing the Baseline Model Architecture
copyfile(src = "../input/dcai-rw/baseline_arch.py", dst = "../working/baseline_arch.py")
from baseline_arch import cnn_model

# Creating Batches from the Augmented Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((iter_samp_df, iter_samp_oh)).batch(32)

In [22]:
# If the model has been pre-trained
try:
    conv_model = cnn_model((32, 32, 3))
    conv_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics='accuracy')
    conv_model.load_weights("../input/dcai-rw/itersamp_25per_trad_gan.h")

# If the model hasn't been pre-trained
except:
    num_epochs = [10, 20, 30, 40, 50]
    train_loss, test_loss, train_acc, test_acc = [], [], [], []

    for epochs in num_epochs:
        # Training the Model
        conv_model = cnn_model((32, 32, 3))
        conv_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics='accuracy')
        conv_model.fit(train_dataset, epochs = epochs)

        # Predicting on the Train/Test Datasets
        preds_train = conv_model.predict(iter_samp_df)
        preds_test = conv_model.predict(df_test)

        # Finding the Predicted Classes
        cls_train = np.argmax(preds_train, axis = 1)
        cls_test = np.argmax(preds_test, axis = 1)

        # Finding the Train/Test set Loss
        train_loss.append(log_loss(iter_samp_oh, preds_train))
        test_loss.append(log_loss(y_test_oh, preds_test))
        train_acc.append(accuracy_score(iter_samp_y, cls_train))
        test_acc.append(accuracy_score(y_test, cls_test))

        print("For ", epochs, " Epochs:")
        print("Log-loss for Train Dataset = ", train_loss[-1])
        print("Log-loss for Test Dataset = ", test_loss[-1])
        print("Accuracy for Train Dataset = ", train_acc[-1])
        print("Accuracy for Test Dataset = ", test_acc[-1])
        print()

    # Training the Model with the best hyper-parameter settings
    ind = np.argmax(test_acc)
    best_num_epochs = num_epochs[ind]
    conv_model = cnn_model((32, 32, 3))
    conv_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics='accuracy')
    conv_model.fit(train_dataset, epochs = best_num_epochs)

    # Saving the model along with it's weights
    conv_model.save('itersamp_25per_trad_gan.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
For  10  Epochs:
Log-loss for Train Dataset =  0.5414839492597625
Log-loss for Test Dataset =  0.8328513454330656
Accuracy for Train Dataset =  0.80594
Accuracy for Test Dataset =  0.7303

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
For  20  Epochs:
Log-loss for Train Dataset =  0.32867434461801837
Log-loss for Test Dataset =  0.8123821097538599
Accuracy for Train Dataset =  0.8836
Accuracy for Test Dataset =  0.749

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Ep

## 5.7. Predicting the Performance

In [23]:
# Predicting on the Train/Test Datasets
preds_train = conv_model.predict(iter_samp_df)
preds_test = conv_model.predict(df_test)
print(preds_train.shape)

# Finding the Predicted Classes
cls_train = np.argmax(preds_train, axis = 1)
cls_test = np.argmax(preds_test, axis = 1)

# Finding the Train/Test set Loss
print("Log-loss for Train Dataset = ", log_loss(iter_samp_oh, preds_train))
print("Log-loss for Test Dataset = ", log_loss(y_test_oh, preds_test))
print("Weighted F1 Score for Train Dataset = ", f1_score(iter_samp_y, cls_train, average = 'weighted'))
print("Weighted F1 Score for Test Dataset = ", f1_score(y_test, cls_test, average = 'weighted'))
print("Accuracy for Train Dataset = ", accuracy_score(iter_samp_y, cls_train))
print("Accuracy for Test Dataset = ", accuracy_score(y_test, cls_test))

(50000, 10)
Log-loss for Train Dataset =  0.09436428350136501
Log-loss for Test Dataset =  0.7759590346173721
Weighted F1 Score for Train Dataset =  0.9758555895807672
Weighted F1 Score for Test Dataset =  0.7699744204844067
Accuracy for Train Dataset =  0.97592
Accuracy for Test Dataset =  0.7727


### Reference: The effect of modifying Voxel51 dataset on the Voxel51 dataset's view
- We can't add the samples directly using the list of IDs for the corresponding samples.
- First, we have to select the samples using the list of IDs, and then only the samples will be added.
- Another thing to keep in mind is that once the samples have been added, the indices need to be sorted and listed again, and the view needs to be re-initialized.

#### Output from the below code cell:
` # Samples in train_view before deleting any samples: 4022` <br>
`# Samples in pool_view for the corresponding class: 4083` <br>
`# Samples in train_view after deleting 1 sample: 4021` <br>
`100% |███████████████| 4083/4083 [5.3s elapsed, 0s remaining, 840.8 samples/s]` <br>
`# Samples in train_view after adding n samples: 4021` <br>
`# Samples in train_view after recalculating the IDs and reinitializing the view: 8104` <br>

In [24]:
# _ , train_classes = get_class_samples(train_dataset, progress = False)
# _ , pool_classes = get_class_samples(pool_dataset, progress = False)
# train_view = train_dataset.select(train_classes[2])
# pool_view = pool_dataset.select(pool_classes[2])
# print(f"# Samples in train_view before deleting any samples: {len(train_view)}")
# print(f"# Samples in pool_view for the corresponding class: {len(pool_view)}")

# # Extracting the IDs of samples in train_view
# ids = []
# for sample in train_view.iter_samples():
#     voxel_id = sample['id']
#     ids.append(voxel_id)

# # Deleting a single example from train_dataset
# train_dataset.delete_samples([ids[0]])
# print(f"# Samples in train_view after deleting 1 sample: {len(train_view)}")

# # Adding examples to train_dataset
# train_dataset.add_samples(pool_view)
# print(f"# Samples in train_view after adding `n` samples: {len(train_view)}")

# _ , train_classes = get_class_samples(train_dataset, progress = False)
# train_view = train_dataset.select(train_classes[2])
# print(f"# Samples in train_view after recalculating the IDs and reinitializing the view: {len(train_view)}")